# Hugging Face Space: dmorawiec / Qwen-VL-Object-Detection
# Status shown on the Space page: "Running on Zero"
# File size as published: 12,030 bytes

import base64
import gc
import json
import os
from io import BytesIO
from pathlib import Path

import gradio as gr
import torch
from json_repair import repair_json
from qwen_vl_utils import process_vision_info
from transformers import (
    AutoProcessor,
    Qwen2_5_VLForConditionalGeneration,
    Qwen2VLForConditionalGeneration,
    Qwen3VLForConditionalGeneration,
)

# Restrict this process to the first GPU; set before the CUDA availability
# query below so the restriction is already in place when CUDA is probed.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Directory holding the example images, next to this file.
EXAMPLES_DIR = Path(__file__).parent.joinpath("examples")

# Device that model inputs are moved to before generation.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Selectable checkpoints, grouped by model generation. Each model card lives
# at https://huggingface.co/<model id>.
model_ids = [
    # Qwen2-VL
    "Qwen/Qwen2-VL-2B-Instruct",
    "Qwen/Qwen2-VL-7B-Instruct",
    # Qwen2.5-VL
    "Qwen/Qwen2.5-VL-3B-Instruct",
    "Qwen/Qwen2.5-VL-7B-Instruct",
    "Qwen/Qwen2.5-VL-32B-Instruct",
    "Qwen/Qwen2.5-VL-72B-Instruct",
    # Qwen3-VL
    "Qwen/Qwen3-VL-2B-Instruct",
    "Qwen/Qwen3-VL-4B-Instruct",
    "Qwen/Qwen3-VL-8B-Instruct",
    "Qwen/Qwen3-VL-32B-Instruct",
]


def scale_image(image, target_size=1000):
    """Downscale ``image`` so that its longer side equals ``target_size``.

    Images whose longer side already fits within ``target_size`` are returned
    unchanged (the same object, never upscaled). The aspect ratio is kept;
    the shorter side is truncated to an integer but never drops below one
    pixel, so very elongated images still produce a valid size.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``resize((width, height))`` method (e.g. a PIL image).
        target_size: Maximum length in pixels of the longer side. Float
            values are truncated to an integer.

    Returns:
        ``image`` itself when no scaling is needed, otherwise the result of
        ``image.resize``.

    Raises:
        ValueError: If ``target_size`` is smaller than 1.
    """
    target_size = int(target_size)
    if target_size < 1:
        raise ValueError(f"target_size must be >= 1, got {target_size}")

    width, height = image.size
    if max(width, height) <= target_size:
        return image

    if width >= height:
        new_width = target_size
        # Clamp to 1: int() truncation would give 0 for extreme aspect ratios.
        new_height = max(1, int((target_size / width) * height))
    else:
        new_height = target_size
        new_width = max(1, int((target_size / height) * width))

    return image.resize((new_width, new_height))


def image_to_base64(image):
    """Encode ``image`` as PNG and return the PNG bytes as a base64 string.

    ``image`` only needs a ``save(file_object, format=...)`` method, such as
    the one PIL images provide.
    """
    with BytesIO() as png_buffer:
        image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode("utf-8")


# Build the Gradio UI. Components are declared in display order: a left column
# with the input image and generation settings, a right column with the
# annotated image and the detections text, then a "Run" button.
with gr.Blocks() as demo:
    gr.Markdown("# Qwen-VL Object-Detection")
    gr.Markdown(
        "Compare [Qwen3-VL](https://huggingface.co/collections/Qwen/qwen3-vl), [Qwen2.5-VL](https://huggingface.co/collections/Qwen/qwen25-vl) and [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl) models by [Qwen](https://huggingface.co/Qwen) for object detection."
    )

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Inputs")

            # type="pil": `run` reads `.size`, `.width` and `.height` from
            # the value it receives.
            image_input = gr.Image(
                label="Input Image",
                type="pil",
            )

            gr.Markdown("## Settings")

            # NOTE(review): no `value` is given here; whether a model is
            # pre-selected depends on the Gradio version — confirm.
            input_model_id = gr.Dropdown(
                choices=model_ids,
                label="✨ Select Model ID",
            )
            # Instructs the model to answer with a JSON list of
            # {"bbox_2d": [...], "label": ...} objects, the shape `run` parses.
            default_system_prompt = 'You are a helpful assistant to detect objects in images. When asked to detect elements based on a description, you return a valid JSON object containing bounding boxes for all elements in the form `[{"bbox_2d": [xmin, ymin, xmax, ymax], "label": "placeholder"}, ...]`. For example, a valid response could be: `[{"bbox_2d": [10, 30, 20, 60], "label": "placeholder"}, {"bbox_2d": [40, 15, 52, 27], "label": "placeholder"}]`.'
            system_prompt = gr.Textbox(
                label="System Prompt",
                lines=3,
                value=default_system_prompt,
            )
            default_user_prompt = "detect object"
            user_prompt = gr.Textbox(
                label="User Prompt",
                lines=3,
                value=default_user_prompt,
            )
            # Upper bound on generated tokens. The UI default (256) differs
            # from the default of `run`'s own parameter (1024).
            max_new_tokens = gr.Slider(
                label="Max New Tokens",
                minimum=32,
                maximum=4096,
                value=256,
                step=32,
                interactive=True,
            )
            # Longest image side, in pixels, before the image is handed to
            # the model; larger images are downscaled by `scale_image`.
            image_target_size = gr.Slider(
                label="Image Target Size",
                minimum=256,
                maximum=4096,
                value=1024,
                step=1,
                interactive=True,
            )

        with gr.Column():
            gr.Markdown("## Outputs")

            # Receives the `(image, [(bbox, label), ...])` pair returned by `run`.
            output_annotated_image = gr.AnnotatedImage(
                format="jpeg",
                key="output_annotated_image",
                label="Output Image",
            )

            gr.Markdown("## Detections")

            # Receives the JSON text returned by `run`.
            output_text = gr.Textbox(
                label="Output Text",
                lines=10,
                key="output_text",
            )

    with gr.Row():
        run_button = gr.Button("Run")

    # Module-level cache of the loaded model, mutated by `run` through
    # `global`: only one model/processor pair is kept at a time and it is
    # replaced when a different model ID is requested.
    current_model = None
    current_processor = None
    current_model_id = None

    def _select_model_loader(model_id):
        """Return the transformers class that loads ``model_id``, or ``None``."""
        loaders = (
            ("Qwen/Qwen2-VL", Qwen2VLForConditionalGeneration),
            ("Qwen/Qwen2.5-VL", Qwen2_5_VLForConditionalGeneration),
            ("Qwen/Qwen3-VL", Qwen3VLForConditionalGeneration),
        )
        for prefix, loader in loaders:
            if model_id.startswith(prefix):
                return loader
        return None

    def _bbox_scale_factors(model_id, image, model_image, inputs, processor):
        """Return ``(x_scale, y_scale)`` mapping model coordinates onto ``image``."""
        if not model_id.startswith("Qwen/Qwen2.5-VL"):
            # Convention already used by this app: Qwen2-VL and Qwen3-VL
            # coordinates are treated as lying on a 0-1000 grid.
            return image.width / 1000, image.height / 1000

        # Qwen2.5-VL coordinates are treated as absolute pixels of the image
        # the model actually received, which differs from the original when
        # the image was downscaled or resized during preprocessing.
        try:
            # NOTE(review): grid * patch_size as the model-input size follows
            # the Qwen2.5-VL cookbook — confirm for the installed versions.
            _, grid_h, grid_w = (int(v) for v in inputs["image_grid_thw"][0])
            patch_size = int(processor.image_processor.patch_size)
            input_width = grid_w * patch_size
            input_height = grid_h * patch_size
        except (AttributeError, IndexError, KeyError, TypeError, ValueError):
            # Fall back to the size of the image that was sent to the model.
            input_width, input_height = model_image.size

        if input_width <= 0 or input_height <= 0:
            return 1.0, 1.0
        return image.width / input_width, image.height / input_height

    def _detections_to_bboxes(detections, x_scale, y_scale):
        """Convert parsed model output into ``[(bbox, label), ...]`` pairs.

        Entries that are not dicts, lack ``"label"``, or whose ``"bbox_2d"``
        is not four numbers are skipped instead of raising.
        """
        if isinstance(detections, dict):
            # A single detection returned as a bare object instead of a list.
            detections = [detections]
        if not isinstance(detections, list):
            return []

        bboxes = []
        for detection in detections:
            if not isinstance(detection, dict) or "label" not in detection:
                continue
            coords = detection.get("bbox_2d")
            if not isinstance(coords, (list, tuple)) or len(coords) != 4:
                continue
            try:
                xmin, ymin, xmax, ymax = (float(value) for value in coords)
                bbox = [
                    int(xmin * x_scale),
                    int(ymin * y_scale),
                    int(xmax * x_scale),
                    int(ymax * y_scale),
                ]
            except (TypeError, ValueError, OverflowError):
                continue
            bboxes.append((bbox, str(detection["label"])))
        return bboxes

    def run(
        image,
        model_id: str,
        system_prompt: str,
        user_prompt: str,
        max_new_tokens: int = 1024,
        image_target_size: int | None = None,
    ):
        """Detect objects in ``image`` with the selected Qwen-VL model.

        The model/processor pair is cached in module globals and reloaded only
        when ``model_id`` changes. The image is optionally downscaled, sent to
        the model together with both prompts, and the JSON answer is converted
        into bounding boxes expressed in the pixel space of the original image.

        Args:
            image: Input image exposing ``size``, ``width``, ``height`` and
                ``save`` (the UI supplies a PIL image).
            model_id: Model ID starting with one of the supported
                ``Qwen/Qwen2-VL``, ``Qwen/Qwen2.5-VL`` or ``Qwen/Qwen3-VL``
                prefixes.
            system_prompt: Instruction text describing the expected JSON format.
            user_prompt: Description of what to detect.
            max_new_tokens: Upper bound on generated tokens.
            image_target_size: Longest image side sent to the model; falsy
                values disable downscaling.

        Returns:
            ``[(image, bboxes), text]`` where ``bboxes`` is a list of
            ``([xmin, ymin, xmax, ymax], label)`` pairs and ``text`` is the
            parsed output re-serialised as JSON, or the raw model text when it
            could not be parsed.

        Raises:
            gr.Error: If no image or model is provided, or the model ID is not
                one of the supported families.
        """
        global current_model, current_processor, current_model_id

        if image is None:
            raise gr.Error("Please provide an input image.")
        if not isinstance(model_id, str) or not model_id:
            raise gr.Error("Please select a model ID.")
        # Resolve the loader before touching the cache so that an unsupported
        # ID does not unload a model that is still usable.
        model_loader = _select_model_loader(model_id)
        if model_loader is None:
            raise gr.Error(f"Unsupported model ID: {model_id}")

        # Only load model if it's different from the currently loaded one
        if (
            current_model_id != model_id
            or current_model is None
            or current_processor is None
        ):
            # Drop every reference first, including the ID, so that a failed
            # load can never leave a stale ID paired with the wrong model.
            current_model = None
            current_processor = None
            current_model_id = None

            # Force garbage collection and clear the CUDA cache
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()

            loaded_model = model_loader.from_pretrained(
                model_id,
                torch_dtype="auto",
                device_map="auto",
            ).eval()
            loaded_processor = AutoProcessor.from_pretrained(model_id)

            # Publish the new state only once both loads have succeeded.
            current_model = loaded_model
            current_processor = loaded_processor
            current_model_id = model_id

        model = current_model
        processor = current_processor

        model_image = (
            scale_image(image, int(image_target_size)) if image_target_size else image
        )
        base64_image = image_to_base64(model_image)

        # Both prompts are sent as text parts of a single user turn.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": f"data:image;base64,{base64_image}",
                    },
                    {"type": "text", "text": system_prompt},
                    {"type": "text", "text": user_prompt},
                ],
            }
        ]

        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(DEVICE)

        generated_ids = model.generate(**inputs, max_new_tokens=int(max_new_tokens))
        # Strip the prompt tokens so only the newly generated answer is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids) :]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        decoded = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        raw_text = str(decoded[0])

        try:
            output_json = json.loads(repair_json(raw_text))
            result_text = str(json.dumps(output_json))
        except ValueError:
            # Unrecoverable output: show the raw text and draw no boxes rather
            # than failing the whole request.
            output_json = []
            result_text = raw_text

        x_scale, y_scale = _bbox_scale_factors(
            model_id, image, model_image, inputs, processor
        )
        bboxes = _detections_to_bboxes(output_json, x_scale, y_scale)

        return [(image, bboxes), result_text]

    # Examples section: each example row supplies, in order, the six values
    # listed under `inputs` (image path, model ID, system prompt, user prompt,
    # max new tokens, image target size) and is processed by `run`.
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Examples")

            # NOTE(review): with cache_examples=True and cache_mode="eager"
            # the example outputs are presumably computed by `run` ahead of
            # the first click (which requires loading the model) — confirm
            # against the Gradio documentation.
            gr.Examples(
                fn=run,
                cache_examples=True,
                cache_mode="eager",
                run_on_click=True,
                examples=[
                    [
                        EXAMPLES_DIR
                        / "niklas-ohlrogge-niamoh-de-fDYRfHoRC4k-unsplash.jpg",
                        "Qwen/Qwen3-VL-4B-Instruct",
                        default_system_prompt,
                        "detect sailboat, rowboat, person",
                        512,
                        1920,
                    ],
                    [
                        EXAMPLES_DIR / "elevate-nYgy58eb9aw-unsplash.jpg",
                        "Qwen/Qwen3-VL-4B-Instruct",
                        default_system_prompt,
                        "detect shirt, jeans, jacket, skirt, sunglasses, earring, drink",
                        1024,
                        1920,
                    ],
                    [
                        EXAMPLES_DIR / "markus-spiske-oPDQGXW7i40-unsplash.jpg",
                        "Qwen/Qwen3-VL-4B-Instruct",
                        default_system_prompt,
                        "detect basketball, player with white jersey, player with black jersey",
                        512,
                        1920,
                    ],
                    [
                        EXAMPLES_DIR / "william-hook-9e9PD9blAto-unsplash.jpg",
                        "Qwen/Qwen3-VL-4B-Instruct",
                        default_system_prompt,
                        "detect app to find great places, app to take beautiful photos, app to listen music",
                        512,
                        1920,
                    ],
                    [
                        EXAMPLES_DIR / "tasso-mitsarakis-dw7Y4W6Rhmk-unsplash.jpg",
                        "Qwen/Qwen3-VL-4B-Instruct",
                        default_system_prompt,
                        "detect person, bicycle, netherlands flag",
                        1920,
                        1920,
                    ],
                ],
                # Same component order as the positional parameters of `run`.
                inputs=[
                    image_input,
                    input_model_id,
                    system_prompt,
                    user_prompt,
                    max_new_tokens,
                    image_target_size,
                ],
                # Same order as the two values `run` returns.
                outputs=[
                    output_annotated_image,
                    output_text,
                ],
            )

            # Shown only when no CUDA device was detected at import time.
            if DEVICE != "cuda":
                gr.Markdown(
                    "👉 It's recommended to run this application on a machine with a CUDA-compatible GPU for optimal performance. You can clone this space locally or duplicate this space with a CUDA-enabled runtime."
                )

    # Wire the "Run" button to `run`: the input components are passed in the
    # order of `run`'s positional parameters, and its two return values fill
    # the annotated image and the detections textbox respectively.
    run_button.click(
        fn=run,
        inputs=[
            image_input,
            input_model_id,
            system_prompt,
            user_prompt,
            max_new_tokens,
            image_target_size,
        ],
        outputs=[
            output_annotated_image,
            output_text,
        ],
    )

if __name__ == "__main__":
    # Start the Gradio app when this file is executed as a script.
    # Optional: pass share=True to launch() (currently not enabled).
    demo.launch()