import gradio as gr
import torch
from peft import PeftModel
from transformers import AutoModelForImageTextToText, AutoProcessor

# --- CONFIGURATION ---
BASE_MODEL = "unsloth/Qwen3-VL-2B-Instruct-unsloth-bnb-4bit"
LORA_ID = "EthanCastro/qwen3-vl-2b-quickdraw"

print("Loading model and processor...")
model = AutoModelForImageTextToText.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(model, LORA_ID)
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen3-VL-2B-Instruct", trust_remote_code=True
)
print("Model Ready!")


def respond(message, image, history):
    """Generate one assistant reply for *message* (plus an optional PIL image).

    Args:
        message: The user's text prompt.
        image: A PIL image or None.
        history: Prior turns in Gradio "messages" format, i.e. a list of
            dicts like {"role": "user", "content": "hi"}. Multimodal
            history entries (list-valued content) are flattened to text.

    Returns:
        The model's decoded text response.
    """
    messages = []

    # 1. Convert history to Qwen's multimodal chat format (text-only buffer).
    for msg in history:
        content = msg["content"]
        if isinstance(content, list):
            # Multimodal entry: keep just the text item for the history.
            text_content = next(
                (item["text"] for item in content if item["type"] == "text"),
                "",
            )
        else:
            text_content = content
        messages.append({
            "role": msg["role"],
            "content": [{"type": "text", "text": text_content}],
        })

    # 2. Add the current user turn, attaching the new image when present.
    user_content = []
    if image is not None:
        user_content.append({"type": "image", "image": image})
    user_content.append({"type": "text", "text": message})
    messages.append({"role": "user", "content": user_content})

    # 3. Tokenize and generate.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    if image is not None:
        inputs = processor(text=[text], images=[image], return_tensors="pt")
    else:
        inputs = processor(text=[text], return_tensors="pt")
    # Follow the model's actual placement instead of hard-coding "cuda":
    # device_map="auto" may land on CPU on GPU-less hosts, where .to("cuda")
    # would raise.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        # do_sample=True is required for temperature to take effect;
        # transformers ignores the temperature argument in greedy mode.
        outputs = model.generate(
            **inputs,
            max_new_tokens=1500,
            do_sample=True,
            temperature=0.3,
        )

    # Decode only the newly generated tokens. Slicing off the prompt is
    # robust, unlike splitting the decoded text on the literal "assistant"
    # (which breaks whenever that word appears in the prompt or output).
    prompt_len = inputs["input_ids"].shape[1]
    response = processor.batch_decode(
        outputs[:, prompt_len:], skip_special_tokens=True
    )[0].strip()
    return response


# --- GRADIO INTERFACE ---
# Note: 'theme' removed from here per Gradio 6 migration guide
with gr.Blocks() as demo:
    gr.Markdown("# 🎨 QuickDraw → tldraw JSON")

    # Chatbot using default "messages" format (no type argument needed)
    chatbot = gr.Chatbot(height=500)

    with gr.Row():
        img_input = gr.Image(type="pil", label="Upload Sketch", scale=1)
        with gr.Column(scale=3):
            txt_input = gr.Textbox(
                show_label=False,
                placeholder="Convert this sketch to tldraw JSON format...",
                container=False,
            )
            submit_btn = gr.Button("Send", variant="primary")

    def chat_wrapper(message, image, history):
        """Run one chat turn, append it to the history, and reset inputs."""
        # 1. Get response
        bot_res = respond(message, image, history)
        # 2. Update history using DICTIONARIES
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": bot_res})
        # Clear the textbox and image slot; return the updated history.
        return "", None, history

    submit_btn.click(
        chat_wrapper,
        [txt_input, img_input, chatbot],
        [txt_input, img_input, chatbot],
    )
    txt_input.submit(
        chat_wrapper,
        [txt_input, img_input, chatbot],
        [txt_input, img_input, chatbot],
    )

# Theme is now applied here in launch()
# Disable SSR to help prevent 503 errors on resource-constrained Spaces
demo.launch(theme=gr.themes.Soft(), ssr_mode=False)