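"""Gradio demo for object localization with Qwen vision-language models.

A selected Qwen VL checkpoint is prompted to detect objects in an uploaded
image and to answer with bounding boxes as JSON, which is repaired, parsed,
rescaled to the image size, and rendered as annotations.
"""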
import base64
import gc
import json
import os
from io import BytesIO
import gradio as gr
import torch
from json_repair import repair_json
from qwen_vl_utils import process_vision_info
from transformers import (
    AutoProcessor,
    Qwen2_5_VLForConditionalGeneration,
    Qwen2VLForConditionalGeneration,
    Qwen3VLForConditionalGeneration,
)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model_ids = [
    "Qwen/Qwen2-VL-2B-Instruct",
    "Qwen/Qwen2-VL-7B-Instruct",
    "Qwen/Qwen2.5-VL-3B-Instruct",
    "Qwen/Qwen2.5-VL-7B-Instruct",
    "Qwen/Qwen2.5-VL-32B-Instruct",
    "Qwen/Qwen2.5-VL-72B-Instruct",
    "Qwen/Qwen3-VL-2B-Instruct",
    "Qwen/Qwen3-VL-4B-Instruct",
    "Qwen/Qwen3-VL-8B-Instruct",
    "Qwen/Qwen3-VL-32B-Instruct",
]
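# Each family maps to a different transformers model class in run() below;
# the larger checkpoints (32B/72B) need correspondingly large GPU memory.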
def image_to_base64(image):
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return img_str
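# The base64 string is embedded as a "data:image;base64,..." URI in the chat
# message; qwen_vl_utils.process_vision_info resolves it back into an image.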
with gr.Blocks() as demo:
    title = gr.Markdown("# Qwen-VL-Localization")
    if DEVICE != "cuda":
        gr.Markdown(
            "⚠️ **CUDA not available.** This application requires a CUDA-compatible GPU to function properly. You can duplicate this space with a CUDA-enabled runtime."
        )
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Inputs")
            image_input = gr.Image(
                label="Input Image",
                type="pil",
            )
            gr.Markdown("## Settings")
            input_model_id = gr.Dropdown(
                choices=model_ids,
                label="Select Model ID",
            )
            default_system_prompt = 'You are a helpful assistant to detect objects in images. When asked to detect elements based on a description, you return a valid JSON array containing bounding boxes for all elements in the form `[{"bbox_2d": [XMIN, YMIN, XMAX, YMAX], "label": "LABEL"}, ...]`. For example, a valid response could be: `[{"bbox_2d": [10, 30, 20, 60], "label": "face"}, {"bbox_2d": [40, 15, 52, 27], "label": "face"}]`.'
            system_prompt = gr.Textbox(
                label="System Prompt:",
                lines=3,
                value=default_system_prompt,
            )
            default_user_prompt = "detect humans"
            user_prompt = gr.Textbox(
                label="User Prompt:",
                lines=3,
                value=default_user_prompt,
            )
            max_new_tokens = gr.Slider(
                label="Max New Tokens:",
                minimum=32,
                maximum=4096,
                value=256,
                step=32,
                interactive=True,
            )
        with gr.Column():
            gr.Markdown("## Outputs")
            output_annotated_image = gr.AnnotatedImage(
                format="jpeg",
                key="output_annotated_image",
                label="Output Image",
            )
            gr.Markdown("## Detections")
            output_text = gr.Textbox(
                label="Output Text",
                lines=3,
                key="output_text",
            )
    with gr.Row():
        run_button = gr.Button("Run")

    # Global variables that cache the loaded model between runs, so repeated
    # clicks with the same model ID skip reloading the weights.
    current_model = None
    current_processor = None
    current_model_id = None
    def run(
        image,
        system_prompt: str,
        user_prompt: str,
        model_id: str,
        max_new_tokens: int = 1024,
    ):
        global current_model, current_processor, current_model_id

        # Only Qwen2.5-VL returns absolute pixel coordinates here; the other
        # families are treated as returning coordinates on a 0-1000 grid and
        # are rescaled to the image size below.
        scale = not model_id.startswith("Qwen/Qwen2.5-VL")

        # Only load the model if it differs from the currently loaded one.
        if current_model_id != model_id or current_model is None:
            # Drop the previous model and processor from memory.
            if current_model is not None:
                del current_model
                current_model = None
            if current_processor is not None:
                del current_processor
                current_processor = None
            # Force garbage collection and clear the CUDA cache.
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
            # Pick the transformers class that matches the model family.
            if model_id.startswith("Qwen/Qwen2-VL"):
                model_loader = Qwen2VLForConditionalGeneration
            elif model_id.startswith("Qwen/Qwen2.5-VL"):
                model_loader = Qwen2_5_VLForConditionalGeneration
            elif model_id.startswith("Qwen/Qwen3-VL"):
                model_loader = Qwen3VLForConditionalGeneration
            else:
                raise ValueError(f"Unsupported model ID: {model_id}")
            current_model = model_loader.from_pretrained(
                model_id,
                torch_dtype="auto",
                device_map="auto",
            ).eval()
            current_processor = AutoProcessor.from_pretrained(model_id)
            current_model_id = model_id

        model = current_model
        processor = current_processor
        # Single-turn message: the image (as a data URI) plus the system and
        # user instructions as text parts.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "image": f"data:image;base64,{image_to_base64(image)}",
                    },
                    {"type": "text", "text": system_prompt},
                    {"type": "text", "text": user_prompt},
                ],
            }
        ]
        text = processor.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # Move the inputs to wherever device_map placed the model, so the
        # app does not hard-fail on CPU-only machines.
        inputs = inputs.to(model.device)
        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
        # Strip the prompt tokens so only the newly generated text is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids) :]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        output_text = str(output_text[0])
        # The model output is not guaranteed to be valid JSON; repair it
        # before parsing.
        output_text = repair_json(output_text)
        output_json = json.loads(output_text)

        # Map 0-1000 normalized coordinates back to pixel coordinates.
        x_scale = float(image.width / 1000) if scale else 1.0
        y_scale = float(image.height / 1000) if scale else 1.0
        bboxes = []
        for detection in output_json:
            if "bbox_2d" not in detection:
                continue
            if len(detection["bbox_2d"]) != 4:
                continue
            if "label" not in detection:
                continue
            xmin, ymin, xmax, ymax = detection["bbox_2d"]
            label = detection.get("label", "")
            bbox = [
                int(xmin * x_scale),
                int(ymin * y_scale),
                int(xmax * x_scale),
                int(ymax * y_scale),
            ]
            bboxes.append((bbox, label))
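        # gr.AnnotatedImage expects (base_image, [(bounding_box, label), ...]).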
        return [(image, bboxes), str(output_text)]
    # Connect the button to the detection function.
    run_button.click(
        fn=run,
        inputs=[
            image_input,
            system_prompt,
            user_prompt,
            input_model_id,
            max_new_tokens,
        ],
        outputs=[
            output_annotated_image,
            output_text,
        ],
    )
if __name__ == "__main__":
    demo.launch(
        # share=True,
    )