import gradio as gr
import torch
from PIL import Image
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
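
# Note: this assumes a transformers release recent enough to ship
# Qwen3VLForConditionalGeneration, plus accelerate (used by device_map="auto"),
# torch, Pillow, and gradio installed in the Space.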

MODEL_PATH = "Qwen/Qwen3-VL-2B-Instruct"
CPU_DEVICE = "cpu"
| | print("Loading model and processor... This will take a few minutes on a CPU.") |
| | processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True) |
| | model = Qwen3VLForConditionalGeneration.from_pretrained( |
| | MODEL_PATH, |
| | trust_remote_code=True, |
| | dtype="auto", |
| | device_map="auto" |
| | ) |
| | print("Model and processor loaded successfully.") |


def process_and_generate(image_input, text_prompt):
    """
    Processes the image and text prompt, and generates a response from the model.
    """
    if image_input is None or not text_prompt.strip():
        return "Please provide both an image and a text prompt."

    # Gradio delivers the uploaded image as a NumPy array; convert it to PIL.
    pil_image = Image.fromarray(image_input)

    # Build the chat-style message that pairs the image with the user's prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": pil_image},
                {"type": "text", "text": text_prompt},
            ],
        }
    ]

    print("Processing inputs and generating response... This will be slow.")
    try:
        # apply_chat_template tokenizes the conversation, inserts the image
        # placeholder tokens, and returns the tensors the model expects.
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        inputs = inputs.to(model.device)

        generated_ids = model.generate(**inputs, max_new_tokens=1024)
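
        # generate() returns the prompt tokens followed by the newly generated
        # tokens, so slice off the prompt before decoding.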
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]

        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        return output_text[0]

    except Exception as e:
        return f"An error occurred during generation: {str(e)}"


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Qwen3-VL-2B-Instruct CPU Demo
        This Space runs the `Qwen/Qwen3-VL-2B-Instruct` model using the standard `transformers` library.
        **Warning:** Running this on a free CPU Space is **very slow**. Duplicate this Space for a private, dedicated instance.
        """
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="numpy", label="Upload Image")
            text_prompt = gr.Textbox(label="Prompt", placeholder="e.g., Describe this image in detail.")
            submit_button = gr.Button("Generate Response")
        with gr.Column():
            output_text = gr.Textbox(label="Model Output", lines=10, interactive=False)

    submit_button.click(
        fn=process_and_generate,
        inputs=[image_input, text_prompt],
        outputs=output_text,
    )

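# Optional: calling demo.queue() before launch() enables Gradio's request queue,
# which helps when several users share this slow CPU demo; the app also runs
# fine without it.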
if __name__ == "__main__":
    demo.launch()