jiyatai
/

temp

Model card Files Files and versions

xet

Community

jiyatai committed on Oct 18, 2025

Commit

d8618db

verified ·

1 Parent(s): 428367e

Upload qwen25vl.py with huggingface_hub

Browse files

Files changed (1) hide show

qwen25vl.py +250 -0

qwen25vl.py ADDED Viewed

	@@ -0,0 +1,250 @@

+from datasets import load_dataset
+import json
+import random
+import io
+import ast
+from PIL import Image, ImageDraw, ImageFont
+from PIL import ImageColor
+from tqdm import tqdm
+import torch
+import os
+import torch.distributed as dist
+import xml.etree.ElementTree as ET
# Every colour name in Pillow's ImageColor.colormap, in its iteration order.
additional_colors = list(ImageColor.colormap)
def decode_xml_points(text):
    """Parse one XML point element into its coordinates and labels.

    The element is expected to carry numbered coordinate attributes
    (``x1``/``y1``, ``x2``/``y2``, ...), an optional ``alt`` attribute and an
    optional text body, for example
    ``<points x1="1" y1="2" alt="cat">cat</points>``.

    Args:
        text: XML source containing exactly one element.

    Returns:
        A dict with the keys ``"points"`` (list of ``[x, y]`` pairs, kept as
        the raw attribute strings), ``"alt"`` (attribute value or ``None``)
        and ``"phrase"`` (stripped element text or ``None``).  ``None`` is
        returned, after printing the error, when ``text`` cannot be parsed.
    """
    try:
        root = ET.fromstring(text)
    except (ET.ParseError, TypeError, ValueError) as e:
        print(e)
        return None

    attrs = root.attrib
    points = []
    index = 1
    # Walk the numbered pairs directly instead of deriving the count from
    # len(attrs): the latter drops a point when ``alt`` is missing and
    # fabricates [None, None] entries when unrelated attributes are present.
    while f"x{index}" in attrs and f"y{index}" in attrs:
        points.append([attrs[f"x{index}"], attrs[f"y{index}"]])
        index += 1

    return {
        "points": points,
        "alt": attrs.get("alt"),
        "phrase": root.text.strip() if root.text else None,
    }
def _load_boxes(payload):
    """Parse ``payload`` as JSON, falling back to a Python literal."""
    # json.loads handles true/false/null; literal_eval handles single-quoted
    # or otherwise Python-flavoured output.
    for parser in (json.loads, ast.literal_eval):
        try:
            return parser(payload)
        except (ValueError, SyntaxError, TypeError):
            continue
    raise ValueError("response is neither valid JSON nor a Python literal")


def plot_bounding_boxes(im, bounding_boxes, input_width, input_height):
    """Draw the predicted boxes on ``im`` and return the last one.

    Args:
        im: PIL image to draw on.  It is modified in place.
        bounding_boxes: Raw model response: a list of objects (optionally
            wrapped in a ```json fence) that each carry a ``"bbox_2d"`` entry
            in ``[x1, y1, x2, y2]`` order.  A single object is accepted too.
        input_width: Width of the coordinate space the boxes are expressed in.
        input_height: Height of the coordinate space the boxes are expressed
            in.

    Returns:
        ``[x1, y1, x2, y2]`` of the LAST box in the response, rescaled to the
        pixel size of ``im`` and ordered so that ``x1 <= x2`` and
        ``y1 <= y2``.

    Raises:
        ValueError: If the response cannot be parsed even after the
            truncation repair, or if it contains no boxes.
    """
    img = im
    width, height = img.size
    draw = ImageDraw.Draw(img)
    # Outline colours, cycled through by box index.
    colors = [
        'red',
        'green',
        'blue',
        'yellow',
        'orange',
        'pink',
        'purple',
        'brown',
        'gray',
        'beige',
        'turquoise',
        'cyan',
        'magenta',
        'lime',
        'navy',
        'maroon',
        'teal',
        'olive',
        'coral',
        'lavender',
        'violet',
        'gold',
        'silver',
    ] + additional_colors

    # Strip the markdown fencing, then parse.
    payload = parse_json(bounding_boxes)
    try:
        boxes = _load_boxes(payload)
    except ValueError:
        # The generation was probably cut off mid-list: keep everything up to
        # the last complete object and close the list by hand.
        end_idx = payload.rfind('"}') + len('"}')
        boxes = _load_boxes(payload[:end_idx] + "]")

    if isinstance(boxes, dict):
        # A lone object instead of a one-element list.
        boxes = [boxes]
    if not boxes:
        raise ValueError("model response contained no bounding boxes")

    for i, bounding_box in enumerate(boxes):
        color = colors[i % len(colors)]
        x1, y1, x2, y2 = bounding_box["bbox_2d"][:4]
        # Rescale from the model-input coordinate space to the image size.
        abs_x1 = int(x1 / input_width * width)
        abs_y1 = int(y1 / input_height * height)
        abs_x2 = int(x2 / input_width * width)
        abs_y2 = int(y2 / input_height * height)
        # Normalise corner order; the model occasionally swaps them.
        if abs_x1 > abs_x2:
            abs_x1, abs_x2 = abs_x2, abs_x1
        if abs_y1 > abs_y2:
            abs_y1, abs_y2 = abs_y2, abs_y1
        draw.rectangle(
            ((abs_x1, abs_y1), (abs_x2, abs_y2)), outline=color, width=4
        )
    return [abs_x1, abs_y1, abs_x2, abs_y2]
def parse_json(json_output):
    """Return the contents of the first ```json fenced block in a response.

    Args:
        json_output: Raw model response text.

    Returns:
        The text between the first ```json fence line and the next ```
        marker.  When no ```json fence line is present the input is returned
        unchanged.
    """
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        # Tolerate indentation, trailing whitespace and case differences on
        # the fence line; an exact comparison would miss those fences and
        # leave the markdown in the payload.
        if line.strip().lower() == "```json":
            fenced = "\n".join(lines[i + 1:])  # drop everything up to the fence
            return fenced.split("```")[0]  # drop the closing fence and beyond
    return json_output
+import torch
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
# One process per visible GPU; LOCAL_RANK selects both the rank in the process
# group and the CUDA device this process uses.
world_size = torch.cuda.device_count()
rank = int(os.environ.get("LOCAL_RANK", 0))
# Only fall back to the local defaults when no rendezvous address/port has
# been exported already (e.g. by a launcher); overwriting them could point
# this process at the wrong store.
os.environ.setdefault('MASTER_ADDR', 'localhost')
os.environ.setdefault('MASTER_PORT', '12355')
# Initialise the process group.
dist.init_process_group(
    backend="nccl",  # NCCL backend (for GPUs)
    init_method="env://",
    rank=rank,
    world_size=world_size
)
print(f"Rank {rank} initialized")
device = torch.device(f"cuda:{rank}")
model_path = "/lustre/fsw/portfolios/nvr/users/yataij/pretrained/Qwen2.5-VL-7B-Instruct"
# Load the whole model onto this process's GPU.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map={"": device},
)
processor = AutoProcessor.from_pretrained(model_path)
def inference(image, prompt, system_prompt="You are a helpful assistant", max_new_tokens=1024):
    """Run one image + text query through the module-level model.

    Args:
        image: Image passed to the processor as the visual input.
        prompt: User text placed before the image in the chat message.
        system_prompt: Content of the system message.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A tuple ``(text, input_height, input_width)``: the decoded model
        response, and the height and width computed as the processor's
        ``image_grid_thw`` entries times 14.  The two sizes are returned as
        the values the processor produced (tensor scalars with
        ``return_tensors="pt"``).
    """
    # The chat template only needs an image entry to emit the image
    # placeholder tokens; the actual pixels come from ``image`` below.
    img_url_dummy = "/lustre/fsw/portfolios/nvr/users/yataij/data/SPAR-7M-RGBD/example.png"
    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": prompt
                },
                {
                    "image": img_url_dummy
                }
            ]
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt").to(device)
    # Honour the caller's token budget (it used to be hard-coded to 1024).
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Strip the prompt tokens so only the newly generated part is decoded.
    generated_ids = [
        full_ids[len(prompt_ids):]
        for prompt_ids, full_ids in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    # image_grid_thw holds (t, h, w) grid counts; 14 is presumably the vision
    # patch size in pixels -- TODO confirm against the processor config.
    input_height = inputs['image_grid_thw'][0][1]*14
    input_width = inputs['image_grid_thw'][0][2]*14
    return output_text[0], input_height, input_width
# Prepare the model input: load the benchmark and keep only the single-view
# "select" questions.
dataset = load_dataset('/lustre/fs12/portfolios/nvr/projects/nvr_lpr_nvgptvision/users/yataij/data/SPAR_Bench')
print(len(dataset['test']))
print(dataset['test'].features.keys())
data_list = [
    example
    for example in tqdm(dataset['test'])
    if example['img_type'] == 'single_view' and example['format_type'] == 'select'  # select fill
]
print('test', len(data_list))
# Close the file deterministically instead of leaking the handle.
with open('qwen3_visual_prompt_extract.json') as f:
    visual_prompts = json.load(f)
print(len(visual_prompts))
# Shard both lists identically so entry i of one still pairs with entry i of
# the other on every rank.
data_list = data_list[rank::world_size]
visual_prompts = visual_prompts[rank::world_size]
def _locate(image, prompt):
    """Ask the model for a box and return it in ``image`` pixel coordinates."""
    response, input_height, input_width = inference(image, prompt)
    # Draw on a throw-away copy: plot_bounding_boxes paints rectangles onto
    # the image it receives, and ``image`` is reused for the next prompts.
    return plot_bounding_boxes(image.copy(), response, input_width, input_height)


res = []
for i, instance in enumerate(tqdm(data_list)):
    visual_prompt = visual_prompts[i]
    # Explicit check rather than ``assert`` so it is not stripped under -O.
    if instance['id'] != visual_prompt['id']:
        raise ValueError(
            f"id mismatch at index {i}: {instance['id']!r} != {visual_prompt['id']!r}"
        )
    image = instance['image'][0]
    width, height = image.size
    vp_bbox = {}
    for vp, ins in visual_prompt['visual_prompt'].items():
        if 'point' in vp:
            color = vp.split()[0]
            prompt = f"Locate the {color} round point, output its bbox coordinates using JSON format."
        elif 'bbox' in vp:
            anno = f"the {ins} in {vp}"
            prompt = f"Locate {anno}, output its bbox coordinates using JSON format."
        else:
            continue
        try:
            coord = _locate(image, prompt)
        except Exception as e:
            # Best effort: skip this prompt, but let KeyboardInterrupt and
            # SystemExit through and show what went wrong.
            print(i, vp, repr(e))
            continue
        if 'point' in vp:
            # Grow the box around a point by 50 px per side, clamped to the
            # image bounds.
            coord = [
                max(coord[0] - 50, 0),
                max(coord[1] - 50, 0),
                min(coord[2] + 50, width),
                min(coord[3] + 50, height),
            ]
        vp_bbox[vp] = coord
    visual_prompt['visual_prompt_bbox'] = vp_bbox
    res.append(visual_prompt)
    # Rewritten after every instance, presumably so partial results survive
    # an interrupted run.
    with open(f'qwen25vl_sparbench_singleimg_select_bbox_rank{rank}.json', 'w') as f:
        json.dump(res, f, indent=4)