Spaces:

omlab
/

VLM-R1-OVD

Runtime error

App Files Files Community

VLM-R1-OVD / app.py

qq-hzlh

change port

53b7de4 9 months ago

raw

history blame

5.64 kB

	import re
	import torch
	import json_repair
	from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
	from PIL import Image, ImageDraw

	def draw_bbox(image, annotation):
	    """Draw one labelled bounding box onto ``image`` in place.

	    Args:
	        image: PIL image to draw on; it is modified directly.
	        annotation: Mapping with a ``"bbox_2d"`` entry holding the four
	            coordinates ``x1, y1, x2, y2`` and, optionally, a ``"label"``.

	    Returns:
	        The same ``image`` object, now carrying the box and its label.
	    """
	    x1, y1, x2, y2 = annotation["bbox_2d"]
	    # Model output may list the corners in either order; recent Pillow
	    # releases reject a rectangle whose second corner precedes the first.
	    x1, x2 = min(x1, x2), max(x1, x2)
	    y1, y2 = min(y1, y2), max(y1, y2)

	    # A missing or non-string label must not abort drawing the box.
	    label = annotation.get("label")
	    label = "" if label is None else str(label)

	    draw = ImageDraw.Draw(image)

	    # Draw the bounding box.
	    draw.rectangle((x1, y1, x2, y2), outline="red", width=5)

	    # Draw the label text: above the box when there is room, otherwise
	    # just below it.
	    font_size = 20
	    text_position = (x1, y1 - font_size - 5) if y1 > font_size + 5 else (x1, y2 + 5)
	    try:
	        draw.text(text_position, label, fill="red", font_size=font_size)
	    except Exception as e:
	        print(f"文本绘制错误: {e}")
	        # If sized drawing fails, fall back to the plain default text call.
	        draw.text(text_position, label, fill="red")

	    return image

	def draw_bboxes(image, annotations):
	    """Return a copy of ``image`` with every annotation drawn on it.

	    The input image is copied first, and each annotation is then passed to
	    ``draw_bbox`` in order, threading the returned image through.
	    """
	    canvas = image.copy()
	    for item in annotations:
	        canvas = draw_bbox(canvas, item)
	    return canvas

	def extract_bbox_answer(content):
	    """Parse the bounding-box annotations out of a model response.

	    The text inside the last ``<answer>...</answer>`` pair is used when one
	    is present, otherwise the whole of ``content``. It is parsed as JSON,
	    strictly first and through ``json_repair`` when strict parsing fails.

	    Args:
	        content: Raw response text.

	    Returns:
	        A list of annotation dicts. Each has a ``"bbox_2d"`` of exactly four
	        numbers and a string ``"label"`` (``""`` when the model gave none).
	        A single JSON object is treated as a one-element list. Entries that
	        are not dicts or lack a usable ``"bbox_2d"`` are dropped. An empty
	        list is returned when nothing usable can be parsed.
	    """
	    import json  # local import: keeps the block self-contained

	    # Extract content between <answer> and </answer> if present,
	    # preferring the last occurrence.
	    answer_matches = re.findall(r'<answer>(.*?)</answer>', content, re.DOTALL)
	    text = answer_matches[-1] if answer_matches else content

	    try:
	        # Well-formed answers need no repair heuristics.
	        data = json.loads(text)
	    except ValueError:
	        # Fall back to json_repair for malformed or decorated JSON.
	        try:
	            data = json_repair.loads(text)
	        except Exception as e:
	            print(f"JSON解析错误: {e}")
	            return []

	    if isinstance(data, dict):
	        # The model sometimes answers with one object instead of a list.
	        data = [data]
	    if not isinstance(data, list):
	        return []

	    annotations = []
	    for item in data:
	        if not isinstance(item, dict):
	            continue
	        bbox = item.get("bbox_2d")
	        if not isinstance(bbox, (list, tuple)) or len(bbox) != 4:
	            continue
	        # bool is an int subclass but is never a valid coordinate.
	        if not all(
	            isinstance(value, (int, float)) and not isinstance(value, bool)
	            for value in bbox
	        ):
	            continue
	        label = item.get("label")
	        annotations.append({**item, "label": "" if label is None else str(label)})

	    return annotations

	import spaces

	@spaces.GPU
	def process_image_and_text(image, text):
	    """Detect the objects named in ``text`` within ``image``.

	    Builds the detection prompt, runs greedy generation with the
	    module-level ``model`` and ``processor``, and parses the response.

	    Args:
	        image: Input image; it is passed to the processor and a copy is
	            annotated with the detected boxes.
	        text: Description of the objects to detect, inserted into the prompt.

	    Returns:
	        A tuple ``(thinking_process, answer_output, result_image)``: the
	        text inside ``<think>``, the text inside the last ``<answer>`` (or
	        placeholder messages when a tag is missing), and a copy of
	        ``image`` with the parsed boxes drawn on it.

	    Raises:
	        ValueError: If ``image`` is ``None``.
	    """
	    if image is None:
	        # Fail early with a clear message instead of deep inside the processor.
	        raise ValueError("An input image is required.")

	    detect_request = f"Please carefully check the image and detect the following objects: [{text}]. "

	    # The second detection request used to hard-code a sample object list
	    # ("equestrian rider's helmet"), so every prompt asked for helmets no
	    # matter what the user typed. It now repeats the user's own object list.
	    question = (
	        detect_request
	        + "First thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. "
	        + detect_request
	        + "Output the bbox coordinates of detected objects in <answer></answer>. The bbox coordinates in Markdown format should be: \n```json\n[{\"bbox_2d\": [x1, y1, x2, y2], \"label\": \"object name\"}]\n```\n If no targets are detected in the image, simply respond with \"None\"."
	    )

	    messages = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "image"},
	                {"type": "text", "text": question},
	            ],
	        }
	    ]

	    # Kept in its own name so the ``text`` argument is not shadowed.
	    chat_prompt = processor.apply_chat_template(
	        messages, tokenize=False, add_generation_prompt=True
	    )

	    inputs = processor(
	        text=[chat_prompt],
	        images=image,
	        return_tensors="pt",
	        padding=True,
	        padding_side="left",
	        add_special_tokens=False,
	    )
	    inputs = inputs.to("cuda")

	    with torch.no_grad():
	        generated_ids = model.generate(
	            **inputs, use_cache=True, max_new_tokens=1024, do_sample=False
	        )

	    # Strip the prompt tokens so that only the newly generated text is decoded.
	    prompt_length = inputs.input_ids.shape[1]
	    generated_ids_trimmed = [out_ids[prompt_length:] for out_ids in generated_ids]

	    output_text = processor.batch_decode(
	        generated_ids_trimmed, skip_special_tokens=True
	    )[0]
	    print("output_text: ", output_text)

	    # Extract thinking process
	    think_match = re.search(r'<think>(.*?)</think>', output_text, re.DOTALL)
	    thinking_process = think_match.group(1).strip() if think_match else "No thinking process found"

	    # Show the last <answer> block, the same one extract_bbox_answer parses,
	    # so the displayed response always matches the drawn boxes.
	    answer_matches = re.findall(r'<answer>(.*?)</answer>', output_text, re.DOTALL)
	    answer_output = answer_matches[-1].strip() if answer_matches else "No answer extracted"

	    # NOTE(review): the boxes are drawn on the original image without any
	    # rescaling; if the processor resizes the input, the coordinates may
	    # refer to the resized image. Confirm against the model documentation.
	    bbox = extract_bbox_answer(output_text)

	    # draw_bboxes works on its own copy, so ``image`` itself is left untouched.
	    result_image = draw_bboxes(image, bbox)

	    return thinking_process, answer_output, result_image

	if __name__ == "__main__":
	    import gradio as gr

	    # Load the checkpoint and its processor once; process_image_and_text
	    # reads both as module-level globals.
	    model_path = "SZhanZ/Qwen2.5VL-VLM-R1-REC-step500"
	    device = "cuda"
	    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
	        model_path,
	        torch_dtype=torch.float16,
	    )
	    model.to(device)
	    processor = AutoProcessor.from_pretrained(model_path)

	    def gradio_interface(image, text):
	        """Forward the UI inputs to the detector and return its three outputs."""
	        reasoning, response, annotated = process_image_and_text(image, text)
	        return reasoning, response, annotated

	    # Input widgets: the picture and the objects to look for.
	    input_components = [
	        gr.Image(type="pil", label="Input Image"),
	        gr.Textbox(label="Description Text"),
	    ]

	    # Output widgets, in the order gradio_interface returns its values.
	    output_components = [
	        gr.Textbox(label="Thinking Process"),
	        gr.Textbox(label="Response"),
	        gr.Image(type="pil", label="Result with Bbox"),
	    ]

	    # Example rows: image path and description text.
	    example_rows = [
	        ["examples/image1.jpg", "person"],
	        ["examples/image2.jpg", "drink, fruit"],
	        ["examples/image3.png", "keyboard, white cup, laptop"],
	    ]

	    demo = gr.Interface(
	        fn=gradio_interface,
	        inputs=input_components,
	        outputs=output_components,
	        title="Open-Vocabulary Object Detection Demo",
	        description="Upload an image and input description text, the system will return the thinking process and region annotation. \n\nOur GitHub: [VLM-R1](https://github.com/om-ai-lab/VLM-R1/tree/main)",
	        examples=example_rows,
	        cache_examples=False,
	        examples_per_page=10,
	    )

	    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)