import base64
import gc
import json
import os
from io import BytesIO
from pathlib import Path

import gradio as gr
import torch
from json_repair import repair_json
from qwen_vl_utils import process_vision_info
from transformers import (
    AutoProcessor,
    Qwen2_5_VLForConditionalGeneration,
    Qwen2VLForConditionalGeneration,
    Qwen3VLForConditionalGeneration,
)

from kofi import SCRIPT
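# kofi is a local helper module in this Space; SCRIPT is presumably the
# JavaScript injected via gr.Blocks(js=SCRIPT) that renders the Ko-fi widget
# into the <div id="kofi"> element created near the bottom of the layout.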


if "SPACES_ZERO_GPU" in os.environ:
    import spaces
else:
    # Fallback no-op decorator so the app also runs outside Hugging Face
    # ZeroGPU Spaces (e.g. locally on CPU or a regular GPU machine).
    class spaces:
        @staticmethod
        def GPU(func=None, duration=300):
            def decorator(f):
                def wrapper(*args, **kwargs):
                    return f(*args, **kwargs)

                return wrapper

            if func is None:
                return decorator
            return decorator(func)
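

# On ZeroGPU Spaces, spaces.GPU(duration=300) requests a GPU for up to 300
# seconds per decorated call; with the fallback above it just runs the function.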


HEADLINE = "# Qwen-VL Object-Detection"
SUBLINE = "Compare [Qwen3-VL](https://huggingface.co/collections/Qwen/qwen3-vl), [Qwen2.5-VL](https://huggingface.co/collections/Qwen/qwen25-vl) and [Qwen2-VL](https://huggingface.co/collections/Qwen/qwen2-vl) models by [Qwen](https://huggingface.co/Qwen) for object detection."

EXAMPLES_DIR = Path(__file__).parent / "examples"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_IDS = [
    "Qwen/Qwen2-VL-2B-Instruct",
    "Qwen/Qwen2-VL-7B-Instruct",
    "Qwen/Qwen2.5-VL-3B-Instruct",
    "Qwen/Qwen2.5-VL-7B-Instruct",
    "Qwen/Qwen2.5-VL-32B-Instruct",
    "Qwen/Qwen2.5-VL-72B-Instruct",
    "Qwen/Qwen3-VL-2B-Instruct",
    "Qwen/Qwen3-VL-4B-Instruct",
    "Qwen/Qwen3-VL-8B-Instruct",
    "Qwen/Qwen3-VL-32B-Instruct",
]
DEFAULT_SYSTEM_PROMPT = 'You are a helpful assistant that detects objects in images. When asked to detect elements based on a description, you return a valid JSON array of bounding boxes for all elements in the form `[{"bbox_2d": [xmin, ymin, xmax, ymax], "label": "placeholder"}, ...]`. For example, a valid response could be: `[{"bbox_2d": [10, 30, 20, 60], "label": "placeholder"}, {"bbox_2d": [40, 15, 52, 27], "label": "placeholder"}]`.'
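# Each example row follows the gr.Examples `inputs` order defined below:
# [image, model_id, system_prompt, user_prompt, max_new_tokens, image_resize,
# image_target_size].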
EXAMPLES = [
    [
        EXAMPLES_DIR / "niklas-ohlrogge-niamoh-de-fDYRfHoRC4k-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect sailboat, rowboat, person",
        512,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "elevate-nYgy58eb9aw-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect shirt, jeans, jacket, skirt, sunglasses, earring, drink",
        1024,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "markus-spiske-oPDQGXW7i40-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect basketball, player with white jersey, player with black jersey",
        512,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "william-hook-9e9PD9blAto-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect app to find great places, app to take beautiful photos, app to listen to music",
        512,
        "Yes",
        1920,
    ],
    [
        EXAMPLES_DIR / "tasso-mitsarakis-dw7Y4W6Rhmk-unsplash.jpg",
        "Qwen/Qwen3-VL-4B-Instruct",
        DEFAULT_SYSTEM_PROMPT,
        "detect person, bicycle, netherlands flag",
        1920,
        "Yes",
        1920,
    ],
]
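

# Cached model/processor state, reused across requests and swapped out in
# load_model() whenever a different model ID is selected.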
current_model = None
current_processor = None
current_model_id = None


class AutoModel:
    """Dispatch from_pretrained() to the matching Qwen-VL model class."""

    @staticmethod
    def from_pretrained(model_id, dtype="auto", device_map="cpu"):
        if model_id.startswith("Qwen/Qwen2-VL"):
            model_loader = Qwen2VLForConditionalGeneration
        elif model_id.startswith("Qwen/Qwen2.5-VL"):
            model_loader = Qwen2_5_VLForConditionalGeneration
        elif model_id.startswith("Qwen/Qwen3-VL"):
            model_loader = Qwen3VLForConditionalGeneration
        else:
            raise ValueError(f"Unsupported model ID: {model_id}")
        return model_loader.from_pretrained(
            model_id, dtype=dtype, device_map=device_map
        )


def resize_image(image, target_size=1000):
    """Downscale image so its longer side equals target_size, keeping aspect ratio."""
    width, height = image.size
    if max(width, height) <= target_size:
        return image

    if width >= height:
        new_width = target_size
        new_height = int((target_size / width) * height)
    else:
        new_height = target_size
        new_width = int((target_size / height) * width)

    return image.resize((new_width, new_height))
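# For example, a 4000x2000 image with target_size=1000 comes back as 1000x500,
# while an 800x600 image is already small enough and is returned unchanged.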


def image_to_base64(image):
    """Serialize a PIL image to a base64-encoded PNG string (no data-URI prefix)."""
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return img_str


with gr.Blocks(js=SCRIPT) as demo:
    gr.Markdown(HEADLINE)
    gr.Markdown(SUBLINE)

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Inputs")

            image_input = gr.Image(
                label="Input Image",
                type="pil",
            )

            gr.Markdown("## Settings")

            input_model_id = gr.Dropdown(
                choices=MODEL_IDS,
                label="✨ Select Model ID",
            )
            system_prompt = gr.Textbox(
                label="System Prompt",
                lines=3,
                value=DEFAULT_SYSTEM_PROMPT,
            )
            default_user_prompt = "detect object"
            user_prompt = gr.Textbox(
                label="User Prompt",
                lines=3,
                value=default_user_prompt,
            )
            max_new_tokens = gr.Slider(
                label="Max New Tokens",
                minimum=32,
                maximum=4096,
                value=256,
                step=32,
                interactive=True,
            )

            image_resize = gr.Radio(
                label="Resize Image",
                choices=["Yes", "No"],
                value="Yes",
                interactive=True,
                scale=2,
            )

            image_target_size = gr.Slider(
                label="Image Target Size",
                minimum=256,
                maximum=4096,
                value=1024,
                step=1,
                interactive=True,
                scale=2,
            )

        with gr.Column():
            gr.Markdown("## Outputs")

            output_annotated_image = gr.AnnotatedImage(
                format="jpeg",
                key="output_annotated_image",
                label="Output Image",
            )

            gr.Markdown("## Detections")

            output_text = gr.Textbox(
                label="Output Text",
                lines=10,
                key="output_text",
            )

    with gr.Row():
        run_button = gr.Button("Run")

    def load_model(
        model_id: str,
    ):
        global current_model, current_processor, current_model_id

        if current_model_id != model_id or current_model is None:
            # Drop the previously loaded model and processor before loading a
            # new one, then release Python and CUDA memory.
            if current_model is not None:
                del current_model
                current_model = None

            if current_processor is not None:
                del current_processor
                current_processor = None

            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()

            gr.Info(
                f"Downloading and loading <strong>{model_id.removeprefix('Qwen/')}</strong> model files ...",
                duration=10,
            )

            current_model = AutoModel.from_pretrained(
                model_id, dtype="auto", device_map="cpu"
            )
            current_processor = AutoProcessor.from_pretrained(model_id)
            current_model_id = model_id

        return current_model, current_processor

    @spaces.GPU(duration=300)
    def generate(
        model,
        processor,
        image,
        model_id: str,
        system_prompt: str,
        user_prompt: str,
        max_new_tokens: int,
        image_resize: str,
        image_target_size: int | None,
    ):
        model = model.to(DEVICE)
        model.eval()

        # Optionally downscale the image before sending it to the model;
        # qwen_vl_utils accepts base64 data URIs for the image field.
        base64_image = image_to_base64(
            resize_image(image, image_target_size)
            if image_resize == "Yes" and image_target_size
            else image
        )
        # The system prompt is passed as the first text part of the user turn.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": f"data:image;base64,{base64_image}",
                    },
                    {"type": "text", "text": system_prompt},
                    {"type": "text", "text": user_prompt},
                ],
            }
        ]

        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(DEVICE)

        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Strip the prompt tokens so only newly generated tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids) :]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

        output_text = output_text[0]
        output_text = repair_json(output_text)
        output_json = json.loads(output_text)

        # Qwen2-VL and Qwen3-VL emit box coordinates on a 0-1000 normalized
        # grid, while Qwen2.5-VL emits absolute pixel coordinates, so only the
        # former need rescaling to the image dimensions.
        scale = not model_id.startswith("Qwen/Qwen2.5-VL")
        x_scale = float(image.width / 1000) if scale else 1.0
        y_scale = float(image.height / 1000) if scale else 1.0
        bboxes = []
        for detection in output_json:
            # Skip malformed detections instead of failing the whole request.
            if "bbox_2d" not in detection:
                continue
            if len(detection["bbox_2d"]) != 4:
                continue
            if "label" not in detection:
                continue

            xmin, ymin, xmax, ymax = detection["bbox_2d"]
            label = detection["label"]
            bbox = [
                int(xmin * x_scale),
                int(ymin * y_scale),
                int(xmax * x_scale),
                int(ymax * y_scale),
            ]
            bboxes.append((bbox, label))

        return [(image, bboxes), json.dumps(output_json)]
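    # Example: for a 2000x1000 image, a Qwen2-VL/Qwen3-VL box of
    # [100, 500, 200, 600] on the 0-1000 grid maps to [200, 500, 400, 600]
    # pixels (x_scale=2.0, y_scale=1.0).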

    def run(
        image,
        model_id: str,
        system_prompt: str,
        user_prompt: str,
        max_new_tokens: int = 1024,
        image_resize: str = "Yes",
        image_target_size: int | None = None,
    ):
        # Load (or reuse) the model on CPU first, so GPU time inside the
        # spaces.GPU-decorated generate() is not spent on downloads.
        model, processor = load_model(model_id)

        return generate(
            model,
            processor,
            image,
            model_id,
            system_prompt,
            user_prompt,
            max_new_tokens,
            image_resize,
            image_target_size,
        )

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Examples")

            gr.Examples(
                fn=run,
                cache_examples=True,
                cache_mode="eager",
                run_on_click=False,
                examples=EXAMPLES,
                inputs=[
                    image_input,
                    input_model_id,
                    system_prompt,
                    user_prompt,
                    max_new_tokens,
                    image_resize,
                    image_target_size,
                ],
                outputs=[
                    output_annotated_image,
                    output_text,
                ],
            )
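    # With cache_examples=True and cache_mode="eager", Gradio runs each example
    # through run() once at startup and serves the cached outputs on click.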

    with gr.Row():
        with gr.Column():
            if DEVICE != "cuda":
                gr.Markdown(
                    "👉 For optimal performance, run this application on a machine with a CUDA-compatible GPU: clone this Space locally or duplicate it with a CUDA-enabled runtime."
                )
            gr.HTML('<div id="kofi" style="text-align: center;"></div>')

    run_button.click(
        fn=run,
        inputs=[
            image_input,
            input_model_id,
            system_prompt,
            user_prompt,
            max_new_tokens,
            image_resize,
            image_target_size,
        ],
        outputs=[
            output_annotated_image,
            output_text,
        ],
    )


if __name__ == "__main__":
    demo.launch(
        share=False,
    )