lzy337 committed · verified
Commit dfa7aa6 · Parent(s): 7b4fa0c

Update README.md

Files changed (1)
  1. README.md +48 -67
README.md CHANGED
@@ -9,19 +9,21 @@ For details of how to employ the models, please refer to our [repo](https://gith
 Below is the code of a quick demo:
 
 ```
-# pip install "vllm>=0.4" transformers pillow huggingface_hub
+# pip install "transformers>=4.42" pillow openai
+# Then start your vLLM server, for example:
+# vllm serve osunlp/GUI-Drag-7B --tensor-parallel-size 1 --dtype bfloat16 --port 8000
+
+import base64
+import io
 import json
 import re
 from pathlib import Path
 
-from PIL import Image, ImageDraw
-from huggingface_hub import hf_hub_download
-from transformers import AutoTokenizer
+from openai import OpenAI
+from PIL import Image
 from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize as qwen_smart_resize
-from vllm import LLM, SamplingParams
-
-MODEL_ID = "osunlp/GUI-Drag-7B"
 
+MODEL_NAME = "osunlp/GUI-Drag-7B"  # vLLM serves the model under its --model path by default
+BASE_URL = "http://localhost:8000/v1"  # replace with the address of your vLLM server
 FN_CALL_TEMPLATE = """You are a helpful assistant.
 # Tools
 You may call one or more functions to assist with the user query.
@@ -35,56 +37,42 @@ For each function call, return a json object with function name and arguments wi
 </tool_call>
 """
 
-def parse_drag_coordinates(response: str):
-    """Match the first mouse_move/left_click + left_click_drag pair."""
-    matches = re.findall(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", response, flags=re.DOTALL)
-    if len(matches) < 2:
+def encode_image(path: Path) -> str:
+    # Re-encode the image as PNG in memory and return it base64-encoded.
+    img = Image.open(path)
+    buf = io.BytesIO()
+    img.save(buf, format="PNG")
+    return base64.b64encode(buf.getvalue()).decode("utf-8")
+
+def process_simple_drag_response(parsed_responses):
+    # Expect a mouse_move/left_click tool call followed by a left_click_drag.
+    if len(parsed_responses) < 2:
         return None
-
-    first = json.loads(matches[0])
-    second = json.loads(matches[1])
-
-    first_action = first["arguments"].get("action")
-    second_action = second["arguments"].get("action")
-    if first_action not in ("mouse_move", "left_click"):
+    first = json.loads(parsed_responses[0])
+    second = json.loads(parsed_responses[1])
+    if first["arguments"]["action"] not in ("mouse_move", "left_click"):
         return None
-    if second_action != "left_click_drag":
+    if second["arguments"]["action"] != "left_click_drag":
         return None
-
     start = first["arguments"].get("coordinate")
     end = second["arguments"].get("coordinate")
     if not start or not end:
         return None
-
     return start, end
 
-
-def resize_back(coord, original_size, resized_size):
+def resize_back(coords, original_size, resized_size):
+    # Map a coordinate from the model's resized resolution back to the original image.
     ox, oy = original_size
     rx, ry = resized_size
-    return round(coord[0] * ox / rx), round(coord[1] * oy / ry)
+    return round(coords[0] * ox / rx), round(coords[1] * oy / ry)
 
-
-def annotate_drag(image: Image.Image, start, end, save_path: Path):
-    draw = ImageDraw.Draw(image)
-    draw.ellipse((start[0] - 8, start[1] - 8, start[0] + 8, start[1] + 8), outline="lime", width=2)
-    draw.ellipse((end[0] - 8, end[1] - 8, end[0] + 8, end[1] + 8), outline="red", width=2)
-    draw.line((*start, *end), fill="yellow", width=3)
-    image.save(save_path)
-
-
-def main():
-    image = Image.open("demo_image.png")
+def demo():
+    image_path = Path("demo_image.png")
     instruction = "Drag to select the highlighted paragraph."
+    image = Image.open(image_path)
 
-    resized_h, resized_w = qwen_smart_resize(image.height, image.width, max_pixels=2_116_800, min_pixels=12_544)
-
-    # vLLM + tokenizer initialisation
-    llm = LLM(model=MODEL_ID, trust_remote_code=True, tokenizer_mode="slow", dtype="bfloat16")
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)
-    chat_tpl_path = hf_hub_download(repo_id=MODEL_ID, filename="chat_template.json")
-    tokenizer.chat_template = json.loads(Path(chat_tpl_path).read_text())["chat_template"]
+    resized_h, resized_w = qwen_smart_resize(
+        image.height, image.width,
+        max_pixels=2_116_800,
+        min_pixels=12_544,
+    )
 
     messages = [
         {
@@ -94,39 +82,32 @@ def main():
         {
             "role": "user",
             "content": [
-                {"type": "image"},
+                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(image_path)}"}},
                 {"type": "text", "text": instruction},
             ],
         },
     ]
 
-    prompt_token_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
-    sampling = SamplingParams(temperature=0.01, top_k=1, max_tokens=1024)
-
-    outputs = llm.generate(
-        {
-            "prompt_token_ids": prompt_token_ids,
-            "multi_modal_data": {"image": image},
-        },
-        sampling_params=sampling,
+    client = OpenAI(base_url=BASE_URL, api_key="EMPTY")
+    response = client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        temperature=0.0,
+        max_tokens=1024,
     )
 
-    generated_ids = outputs[0].outputs[0].token_ids
-    response = tokenizer.decode(generated_ids, skip_special_tokens=True)
-    drag = parse_drag_coordinates(response)
+    text = response.choices[0].message.content
+    parsed = re.findall(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", text, re.DOTALL)
+    drag = process_simple_drag_response(parsed)
     if not drag:
-        print("Model did not produce a valid drag action.")
+        print("No drag action detected.")
        return
 
-    # map coordinates back to the original resolution
-    raw_start, raw_end = drag
-    start = resize_back(raw_start, image.size, (resized_w, resized_h))
-    end = resize_back(raw_end, image.size, (resized_w, resized_h))
-
-    print("Predicted drag:", start, "→", end)
-    annotate_drag(image.copy(), start, end, Path("GUI-Drag-7B_demo.png"))
-
+    # Map coordinates back to the original resolution.
+    start, end = drag
+    start = resize_back(start, image.size, (resized_w, resized_h))
+    end = resize_back(end, image.size, (resized_w, resized_h))
+    print("Predicted drag:", start, "->", end)
 
 if __name__ == "__main__":
-    main()
+    demo()
 ```
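
For reference, the parsing helpers above expect the model to emit two consecutive tool calls in the format declared by `FN_CALL_TEMPLATE`. The snippet below is a minimal, hand-written illustration of that flow, not real model output; the tool name `computer_use` and the coordinate values are assumptions, since the actual tool schema lives in an unchanged part of the README outside these hunks:

```python
import json
import re

# Hypothetical model response: a click at the drag origin, then a
# left_click_drag to the destination (invented values, for illustration only).
sample_response = """<tool_call>
{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [412, 318]}}
</tool_call>
<tool_call>
{"name": "computer_use", "arguments": {"action": "left_click_drag", "coordinate": [845, 512]}}
</tool_call>"""

# Same regex as in the demo: capture each JSON object between the XML tags.
calls = re.findall(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", sample_response, re.DOTALL)
first, second = (json.loads(c) for c in calls)
print(first["arguments"]["coordinate"], "->", second["arguments"]["coordinate"])
# [412, 318] -> [845, 512]
```

Note that the parsed coordinates are in the model's resized resolution; `resize_back` rescales each axis by `original / resized`, which is why the demo keeps `resized_w`/`resized_h` from `qwen_smart_resize`. The `annotate_drag` helper removed in this commit can still be pasted back in if you want to draw the predicted start and end points on the screenshot.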