# Source: Hugging Face Space by rahul7star — "Update app.py" (commit dba7c16, verified)
import spaces
import gradio as gr
import torch
import time
import re
from transformers import AutoModelForCausalLM, AutoTokenizer
# ---------------- Configuration ----------------
# Hugging Face repo id of a Qwen3-4B "thinking" variant fine-tuned for coding.
MODEL_NAME = "rahul7star/Qwen3-4B-Thinking-2509-Genius-Coder-AI-Full"
# Hard cap on prompt length in tokens; longer prompts are truncated by the tokenizer.
MAX_INPUT_TOKENS = 4096
# Ceiling on tokens generated per request; the UI slider value is clamped to this.
MAX_NEW_TOKENS = 4096

# ---------------- Model ----------------
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True
)

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # fp16 on GPU for memory/speed; fall back to fp32 on CPU where fp16 is slow/unsupported
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True,
    low_cpu_mem_usage=True
)

# Important for Qwen: some Qwen tokenizers ship without a pad token, which breaks
# padding during tokenization/generation — reuse EOS as the pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

print("βœ… Model loaded successfully")

# System message prepended to every chat request (consumed by generate_answer).
SYSTEM_PROMPT = """You are a professional AI Coding Assistant.
Your responses must be:
- Clear and concise
- Well-structured with headings and bullet points
- Technically accurate
- Written in a formal, professional tone
- Focused on best practices and production-quality code
"""
# ---------------- Helper ----------------
def strip_thinking(text: str) -> str:
    """Remove model "thinking" traces from generated text.

    Handles two cases:
    - complete ``<think>...</think>`` blocks embedded in the output;
    - a dangling ``</think>`` with no opening tag. Qwen3 thinking models
      emit the opening ``<think>`` as part of the chat template / prompt,
      so the *generated* text usually contains only the closing tag — the
      original paired-tag regex missed this and leaked reasoning into the
      displayed answer. Everything up to and including the last unmatched
      ``</think>`` is reasoning and is dropped.

    A truncated trace (opening ``<think>`` with no closing tag) is left
    untouched rather than risk deleting the entire answer.
    """
    # First drop fully delimited reasoning blocks (non-greedy, across newlines).
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    # Then handle an unmatched closing tag: keep only what follows the last one.
    if "</think>" in cleaned:
        cleaned = cleaned.rsplit("</think>", 1)[1]
    return cleaned.strip()
# ---------------- Inference ----------------
@spaces.GPU()
def generate_answer(question, max_tokens):
    """Run one chat-completion round against the loaded Qwen model.

    Args:
        question: User question (str). Blank/whitespace-only input
            short-circuits with a prompt to retry.
        max_tokens: Requested generation budget; clamped to MAX_NEW_TOKENS.

    Returns:
        The decoded answer with <think> reasoning stripped, or a
        human-readable error string — this function never raises, so the
        Gradio UI always receives something to display.
    """
    print("\n================ GENERATE ANSWER START ================")

    if not question or not question.strip():
        return "Please enter a valid question."

    try:
        start_time = time.time()

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": question.strip()},
        ]

        # Render the conversation with the model's own chat template so role
        # markers / special tokens match what the model was trained on.
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_INPUT_TOKENS,
        ).to(model.device)

        input_token_count = inputs.input_ids.shape[-1]
        print(f"Input tokens: {input_token_count}")

        # Clamp the UI-supplied budget to the configured ceiling.
        max_tokens = min(int(max_tokens), MAX_NEW_TOKENS)
        print(f"Final max_new_tokens: {max_tokens}")

        print("πŸš€ Starting generation...")
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                do_sample=False,  # greedy decoding -> deterministic output
                # FIX: `temperature=0.0` was passed here alongside
                # do_sample=False. Under greedy decoding temperature is a
                # no-op that makes transformers emit a warning, and newer
                # versions reject temperature <= 0 outright — so it is removed.
                repetition_penalty=1.05,
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        print("βœ… Generation finished")

        # Slice off the echoed prompt: decode only the newly generated tokens.
        generated_tokens = output[0][input_token_count:]
        response = tokenizer.decode(
            generated_tokens,
            skip_special_tokens=True,
        )

        # πŸ”₯ Remove the model's reasoning trace before display.
        response = strip_thinking(response)

        print(f"Generated tokens: {generated_tokens.shape[-1]}")
        print(f"⏱ Total time: {time.time() - start_time:.2f} sec")
        print("================ GENERATE ANSWER END ==================\n")

        return response if response else "No output generated."

    except Exception as e:
        import traceback
        traceback.print_exc()
        # Free GPU memory after a failed generation (e.g. CUDA OOM) so the
        # next request has a chance to succeed.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return f"Error occurred: {str(e)}"
# ---------------- UI ----------------
# ---------------- UI ----------------
# FIX: `theme` and `css` are gr.Blocks() constructor arguments, not
# Blocks.launch() arguments — passing them to launch() raises a TypeError.
with gr.Blocks(
    theme=gr.themes.Soft(),
    css="""
.gradio-container { max-width: 900px !important; margin: auto; }
textarea { font-size: 14px !important; }
""",
) as demo:
    gr.Markdown(
        """
# πŸ€– Professional Coding Assistant
**Powered by Qwen3-4B Thinking**
Optimized for:
- ⚑ Stable GPU inference
- 🧠 Deterministic responses
- πŸ’» Production-quality code
"""
    )

    # Input / output widgets
    question = gr.Textbox(
        label="Your Question",
        placeholder="Explain Quick Sort with complexity and a Python example",
        value="write a python code using pytorch for a simple neural network demo",
        lines=4,
    )
    answer = gr.Markdown(label="AI Response", elem_id="answer_box")
    max_tokens = gr.Slider(
        64, 4096, value=2048, step=32, label="Max New Tokens"
    )

    with gr.Row():
        submit = gr.Button("Generate Answer", variant="primary")
        copy_btn = gr.Button("πŸ“‹ Copy Response")
        clear = gr.Button("Clear")

    # Event wiring
    submit.click(
        fn=generate_answer,
        inputs=[question, max_tokens],
        outputs=answer,
    )
    clear.click(
        fn=lambda: ("", ""),
        outputs=[question, answer],
    )
    # Client-side copy: runs entirely in the browser, no server round-trip.
    copy_btn.click(
        fn=None,
        js="""
() => {
const el = document.querySelector('#answer_box');
navigator.clipboard.writeText(el.innerText);
}
""",
    )

demo.launch()