import base64
import gc
import json
import os
from io import BytesIO
from pathlib import Path

import gradio as gr
import torch
from json_repair import repair_json
from qwen_vl_utils import process_vision_info
from transformers import (
    AutoProcessor,
    Qwen2_5_VLForConditionalGeneration,
    Qwen2VLForConditionalGeneration,
    Qwen3VLForConditionalGeneration,
)

from kofi import SCRIPT
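# kofi is a local helper module in this Space; SCRIPT is presumably the
# JavaScript injected via gr.Blocks(js=SCRIPT) that renders the Ko-fi widget
# into the <div id="kofi"> element created near the bottom of the layout.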


if "SPACES_ZERO_GPU" in os.environ:
    import spaces
else:
    # Fallback no-op decorator so the app also runs outside Hugging Face
    # ZeroGPU Spaces (e.g. locally on CPU or a regular GPU machine).
    class spaces:
        @staticmethod
        def GPU(func=None, duration=300):
            def decorator(f):
                def wrapper(*args, **kwargs):
                    return f(*args, **kwargs)

                return wrapper

            if func is None:
                return decorator
            return decorator(func)
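

# On ZeroGPU Spaces, spaces.GPU(duration=300) requests a GPU for up to 300
# seconds per decorated call; with the fallback above it just runs the function.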


HEADLINE = "# Qwen-VL Object-Detection"
SUBLINE = "Compare [Qwen3-VL](https://huggingface.co/collections/Qwen/qwen3-vl), [Qwen2.5-VL](https://huggingface.co/collections/Qwen/qwen25-vl) and [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl) models by [Qwen](https://huggingface.co/Qwen) for object detection."

EXAMPLES_DIR = Path(__file__).parent / "examples"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_IDS = [
    "Qwen/Qwen2-VL-2B-Instruct",
    "Qwen/Qwen2-VL-7B-Instruct",
    "Qwen/Qwen2.5-VL-3B-Instruct",
    "Qwen/Qwen2.5-VL-7B-Instruct",
    "Qwen/Qwen2.5-VL-32B-Instruct",
    "Qwen/Qwen2.5-VL-72B-Instruct",
    "Qwen/Qwen3-VL-2B-Instruct",
    "Qwen/Qwen3-VL-4B-Instruct",
    "Qwen/Qwen3-VL-8B-Instruct",
    "Qwen/Qwen3-VL-32B-Instruct",
]
DEFAULT_SYSTEM_PROMPT = 'You are a helpful assistant that detects objects in images. When asked to detect elements based on a description, you return a valid JSON array of bounding boxes for all elements in the form `[{"bbox_2d": [xmin, ymin, xmax, ymax], "label": "placeholder"}, ...]`. For example, a valid response could be: `[{"bbox_2d": [10, 30, 20, 60], "label": "placeholder"}, {"bbox_2d": [40, 15, 52, 27], "label": "placeholder"}]`.'
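# Each example row follows the gr.Examples `inputs` order defined below:
# [image, model_id, system_prompt, user_prompt, max_new_tokens, image_resize,
# image_target_size].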
EXAMPLES = [
    [
        EXAMPLES_DIR / "niklas-ohlrogge-niamoh-de-fDYRfHoRC4k-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect sailboat, rowboat, person",
        512,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "elevate-nYgy58eb9aw-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect shirt, jeans, jacket, skirt, sunglasses, earring, drink",
        1024,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "markus-spiske-oPDQGXW7i40-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect basketball, player with white jersey, player with black jersey",
        512,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "william-hook-9e9PD9blAto-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect app to find great places, app to take beautiful photos, app to listen to music",
        512,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "tasso-mitsarakis-dw7Y4W6Rhmk-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect person, bicycle, netherlands flag",
        1920,
        "Yes",
        1920,
    ],
]
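

# Cached model/processor state, reused across requests and swapped out in
# load_model() whenever a different model ID is selected.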
current_model = None
current_processor = None
current_model_id = None


class AutoModel:
    """Dispatch from_pretrained() to the matching Qwen-VL model class."""

    @staticmethod
    def from_pretrained(model_id, dtype="auto", device_map="cpu"):
        if model_id.startswith("Qwen/Qwen2-VL"):
            model_loader = Qwen2VLForConditionalGeneration
        elif model_id.startswith("Qwen/Qwen2.5-VL"):
            model_loader = Qwen2_5_VLForConditionalGeneration
        elif model_id.startswith("Qwen/Qwen3-VL"):
            model_loader = Qwen3VLForConditionalGeneration
        else:
            raise ValueError(f"Unsupported model ID: {model_id}")
        return model_loader.from_pretrained(
            model_id, dtype=dtype, device_map=device_map
        )


def resize_image(image, target_size=1000):
    """Downscale image so its longer side equals target_size, keeping aspect ratio."""
    width, height = image.size
    if max(width, height) <= target_size:
        return image

    if width >= height:
        new_width = target_size
        new_height = int((target_size / width) * height)
    else:
        new_height = target_size
        new_width = int((target_size / height) * width)

    return image.resize((new_width, new_height))
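# For example, a 4000x2000 image with target_size=1000 comes back as 1000x500,
# while an 800x600 image is already small enough and is returned unchanged.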


def image_to_base64(image):
    """Serialize a PIL image to a base64-encoded PNG string (no data-URI prefix)."""
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return img_str


with gr.Blocks(js=SCRIPT) as demo:
    gr.Markdown(HEADLINE)
    gr.Markdown(SUBLINE)

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Inputs")

            image_input = gr.Image(
                label="Input Image",
                type="pil",
            )

            gr.Markdown("## Settings")

            input_model_id = gr.Dropdown(
                choices=MODEL_IDS,
                label="✨ Select Model ID",
            )
            system_prompt = gr.Textbox(
                label="System Prompt",
                lines=3,
                value=DEFAULT_SYSTEM_PROMPT,
            )
            default_user_prompt = "detect object"
            user_prompt = gr.Textbox(
                label="User Prompt",
                lines=3,
                value=default_user_prompt,
            )
            max_new_tokens = gr.Slider(
                label="Max New Tokens",
                minimum=32,
                maximum=4096,
                value=256,
                step=32,
                interactive=True,
            )

            image_resize = gr.Radio(
                label="Resize Image",
                choices=["Yes", "No"],
                value="Yes",
                interactive=True,
                scale=2,
            )

            image_target_size = gr.Slider(
                label="Image Target Size",
                minimum=256,
                maximum=4096,
                value=1024,
                step=1,
                interactive=True,
                scale=2,
            )

        with gr.Column():
            gr.Markdown("## Outputs")

            output_annotated_image = gr.AnnotatedImage(
                format="jpeg",
                key="output_annotated_image",
                label="Output Image",
            )

            gr.Markdown("## Detections")

            output_text = gr.Textbox(
                label="Output Text",
                lines=10,
                key="output_text",
            )

    with gr.Row():
        run_button = gr.Button("Run")

    def load_model(
        model_id: str,
    ):
        global current_model, current_processor, current_model_id

        if current_model_id != model_id or current_model is None:
            # Drop the previously loaded model and processor before loading a
            # new one, then release Python and CUDA memory.
            if current_model is not None:
                del current_model
                current_model = None

            if current_processor is not None:
                del current_processor
                current_processor = None

            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()

            gr.Info(
                f"Downloading and loading <strong>{model_id.removeprefix('Qwen/')}</strong> model files ...",
                duration=10,
            )

            current_model = AutoModel.from_pretrained(
                model_id, dtype="auto", device_map="cpu"
            )
            current_processor = AutoProcessor.from_pretrained(model_id)
            current_model_id = model_id

        return current_model, current_processor

    @spaces.GPU(duration=300)
    def generate(
        model,
        processor,
        image,
        model_id: str,
        system_prompt: str,
        user_prompt: str,
        max_new_tokens: int,
        image_resize: str,
        image_target_size: int | None,
    ):
        model = model.to(DEVICE)
        model.eval()

        # Optionally downscale the image before sending it to the model;
        # qwen_vl_utils accepts base64 data URIs for the image field.
        base64_image = image_to_base64(
            resize_image(image, image_target_size)
            if image_resize == "Yes" and image_target_size
            else image
        )
        # The system prompt is passed as the first text part of the user turn.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": f"data:image;base64,{base64_image}",
                    },
                    {"type": "text", "text": system_prompt},
                    {"type": "text", "text": user_prompt},
                ],
            }
        ]

        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(DEVICE)

        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Strip the prompt tokens so only newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids) :]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

        output_text = output_text[0]
        output_text = repair_json(output_text)
        output_json = json.loads(output_text)

        # Qwen2-VL and Qwen3-VL emit box coordinates on a 0-1000 normalized
        # grid, while Qwen2.5-VL emits absolute pixel coordinates, so only the
        # former need rescaling to the image dimensions.
        scale = not model_id.startswith("Qwen/Qwen2.5-VL")
        x_scale = float(image.width / 1000) if scale else 1.0
        y_scale = float(image.height / 1000) if scale else 1.0
        bboxes = []
        for detection in output_json:
            # Skip malformed detections instead of failing the whole request.
            if "bbox_2d" not in detection:
                continue
            if len(detection["bbox_2d"]) != 4:
                continue
            if "label" not in detection:
                continue

            xmin, ymin, xmax, ymax = detection["bbox_2d"]
            label = detection["label"]
            bbox = [
                int(xmin * x_scale),
                int(ymin * y_scale),
                int(xmax * x_scale),
                int(ymax * y_scale),
            ]
            bboxes.append((bbox, label))

        return [(image, bboxes), json.dumps(output_json)]
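    # Example: for a 2000x1000 image, a Qwen2-VL/Qwen3-VL box of
    # [100, 500, 200, 600] on the 0-1000 grid maps to [200, 500, 400, 600]
    # pixels (x_scale=2.0, y_scale=1.0).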

    def run(
        image,
        model_id: str,
        system_prompt: str,
        user_prompt: str,
        max_new_tokens: int = 1024,
        image_resize: str = "Yes",
        image_target_size: int | None = None,
    ):
        # Load (or reuse) the model on CPU first, so GPU time inside the
        # spaces.GPU-decorated generate() is not spent on downloads.
        model, processor = load_model(model_id)

        return generate(
            model,
            processor,
            image,
            model_id,
            system_prompt,
            user_prompt,
            max_new_tokens,
            image_resize,
            image_target_size,
        )

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Examples")

            gr.Examples(
                fn=run,
                cache_examples=True,
                cache_mode="eager",
                run_on_click=False,
                examples=EXAMPLES,
                inputs=[
                    image_input,
                    input_model_id,
                    system_prompt,
                    user_prompt,
                    max_new_tokens,
                    image_resize,
                    image_target_size,
                ],
                outputs=[
                    output_annotated_image,
                    output_text,
                ],
            )
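    # With cache_examples=True and cache_mode="eager", Gradio runs each example
    # through run() once at startup and serves the cached outputs on click.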

    with gr.Row():
        with gr.Column():
            if DEVICE != "cuda":
                gr.Markdown(
                    "👉 For optimal performance, run this application on a machine with a CUDA-compatible GPU: clone this Space locally or duplicate it with a CUDA-enabled runtime."
                )
            gr.HTML('<div id="kofi" style="text-align: center;"></div>')

    run_button.click(
        fn=run,
        inputs=[
            image_input,
            input_model_id,
            system_prompt,
            user_prompt,
            max_new_tokens,
            image_resize,
            image_target_size,
        ],
        outputs=[
            output_annotated_image,
            output_text,
        ],
    )


if __name__ == "__main__":
    demo.launch(
        share=False,
    )