# Qwen2.5-VL-3B visual question-answering demo (Hugging Face Space).
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq

# Model and runtime configuration.
MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

print(f"🚀 Loading model on {device} ...")

# trust_remote_code is required: Qwen2.5-VL ships custom processor/model code.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else None,
    trust_remote_code=True,
)
# BUG FIX: when device_map="auto" is used, accelerate has already placed the
# weights and calling .to(device) on the dispatched model is unsupported (it
# can raise or silently fight the dispatch hooks). Only move the model
# manually when it was loaded without a device_map (CPU path).
if device != "cuda":
    model.to(device)
model.eval()
print("✅ Model loaded successfully!")
def ask_about_image(image, prompt):
    """Answer a free-form question about an uploaded image.

    Args:
        image: PIL image from the Gradio widget, or ``None`` if none uploaded.
        prompt: the user's question as a string (may be empty/whitespace).

    Returns:
        The assistant's answer as a plain string, or a usage hint when either
        input is missing.
    """
    if image is None or not prompt or not prompt.strip():
        return "Please upload an image and enter a question."

    # Build a multimodal turn via the official chat template.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt.strip()},
            ],
        }
    ]

    # Render the chat template (injects the correct image placeholder tokens).
    templated = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,  # adds the assistant prefix
    )

    # Important: pass lists for batched API consistency.
    inputs = processor(
        text=[templated],
        images=[image],
        return_tensors="pt",
    ).to(device)

    # Safety pads: some checkpoints ship without pad/eos in generation_config.
    if model.generation_config.pad_token_id is None and processor.tokenizer.pad_token_id is not None:
        model.generation_config.pad_token_id = processor.tokenizer.pad_token_id
    if model.generation_config.eos_token_id is None and processor.tokenizer.eos_token_id is not None:
        model.generation_config.eos_token_id = processor.tokenizer.eos_token_id

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,  # deterministic; toggle to True for more creative outputs
        )

    # BUG FIX: generate() returns prompt + completion tokens. Decoding the
    # full sequence echoed the entire templated conversation back to the
    # user, so slice off the prompt tokens and decode only the new ones.
    generated = output_ids[:, inputs["input_ids"].shape[1]:]
    answer = processor.batch_decode(generated, skip_special_tokens=True)[0]
    return answer.strip()
# Styling for the read-only response box: keep it compact and scrollable.
_RESPONSE_CSS = """
#resp_box textarea {
    min-height: 96px;
    max-height: 180px;
    overflow: auto;
    resize: none;
    line-height: 1.2;
    white-space: pre-wrap;
}
#resp_box label { margin-bottom: 4px; }
"""

with gr.Blocks(css=_RESPONSE_CSS) as demo:
    gr.Markdown("## 🧠 Qwen2.5-VL-3B — Visual Reasoning Assistant")

    with gr.Row():
        img_input = gr.Image(type="pil", label="Upload an Image")
        with gr.Column():
            question_box = gr.Textbox(
                label="Ask about this image",
                placeholder="e.g. What type of document is this? Is there a stamp or signature?",
            )
            submit_btn = gr.Button("Ask")
            # Compact, non-editable area that shows only the assistant's answer.
            response_box = gr.Textbox(
                label="Response",
                lines=4,
                interactive=False,
                elem_id="resp_box",
            )

    # Wire the button to the inference function; output is the model's text only.
    submit_btn.click(fn=ask_about_image, inputs=[img_input, question_box], outputs=[response_box])

demo.launch()