"""Gradio demo: visual question answering with Qwen2.5-VL-3B-Instruct.

Loads the model once at startup, then serves a simple Blocks UI where the
user uploads an image, asks a question, and reads the model's answer.
"""

import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

print(f"🚀 Loading model on {device} ...")
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else None,
    trust_remote_code=True,
)
# BUG FIX: with device_map="auto", accelerate has already dispatched the
# weights, and calling .to(device) on a dispatched model raises a
# RuntimeError. Only move the model manually in the CPU path (no device_map).
if device == "cpu":
    model.to(device)
model.eval()
print("✅ Model loaded successfully!")


def ask_about_image(image, prompt):
    """Answer *prompt* about *image* with the globally loaded Qwen2.5-VL model.

    Args:
        image: PIL image from the Gradio Image component (None when missing).
        prompt: user question; may be None, empty, or whitespace-only.

    Returns:
        The assistant's answer as a plain string, or a usage hint when either
        input is missing.
    """
    if image is None or not prompt or not prompt.strip():
        return "Please upload an image and enter a question."

    # Build a multimodal turn via the official chat template.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt.strip()},
            ],
        }
    ]

    # Render the chat template as text (injects the correct image
    # placeholder tokens for this checkpoint).
    templated = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,  # adds the assistant prefix
    )

    # Important: pass lists for batched API consistency.
    inputs = processor(
        text=[templated],
        images=[image],
        return_tensors="pt",
    ).to(device)

    # Safety pads: some checkpoints ship without pad/eos in generation_config.
    if model.generation_config.pad_token_id is None and processor.tokenizer.pad_token_id is not None:
        model.generation_config.pad_token_id = processor.tokenizer.pad_token_id
    if model.generation_config.eos_token_id is None and processor.tokenizer.eos_token_id is not None:
        model.generation_config.eos_token_id = processor.tokenizer.eos_token_id

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,  # deterministic; toggle to True if you want more creative outputs
        )

    # BUG FIX: generate() returns prompt + completion. Trim the prompt tokens
    # before decoding, otherwise the chat template and the user's question are
    # echoed back inside the "answer".
    trimmed_ids = [
        out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)
    ]
    answer = processor.batch_decode(trimmed_ids, skip_special_tokens=True)[0]
    return answer.strip()


with gr.Blocks(css="""
#resp_box textarea { min-height: 96px; max-height: 180px; overflow: auto; resize: none; line-height: 1.2; white-space: pre-wrap; }
#resp_box label { margin-bottom: 4px; }
""") as demo:
    gr.Markdown("## 🧠 Qwen2.5-VL-3B — Visual Reasoning Assistant")
    with gr.Row():
        image = gr.Image(type="pil", label="Upload an Image")
        with gr.Column():
            question = gr.Textbox(
                label="Ask about this image",
                placeholder="e.g. What type of document is this? Is there a stamp or signature?",
            )
            ask_button = gr.Button("Ask")
    # Compact area showing only the assistant's answer.
    answer = gr.Textbox(
        label="Response",
        lines=4,
        interactive=False,  # user cannot edit
        elem_id="resp_box",
    )

    # The model's pure output only.
    ask_button.click(fn=ask_about_image, inputs=[image, question], outputs=[answer])

demo.launch()