lzy337 committed · verified
Commit dfa7aa6 · Parent(s): 7b4fa0c

Update README.md

Files changed (1)
  1. README.md +48 -67
README.md CHANGED
@@ -9,19 +9,21 @@ For details of how to employ the models, please refer to our [repo](https://gith
 Below is the code of a quick demo:
 
 ```
-# pip install "vllm>=0.4" transformers pillow huggingface_hub
+# pip install "transformers>=4.42" pillow openai
+# Then start your vLLM server, for example:
+# vllm serve osunlp/GUI-Drag-7B --tensor-parallel-size 1 --dtype bfloat16 --port 8000
+
+import base64
+import io
 import json
 import re
 from pathlib import Path
 
-from PIL import Image, ImageDraw
-from huggingface_hub import hf_hub_download
-from transformers import AutoTokenizer
+from openai import OpenAI
+from PIL import Image
 from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize as qwen_smart_resize
-from vllm import LLM, SamplingParams
-
-MODEL_ID = "osunlp/GUI-Drag-7B"
 
+MODEL_NAME = "osunlp/GUI-Drag-7B"  # vLLM serves the model under its --model path by default
+BASE_URL = "http://localhost:8000/v1"  # replace with the address of your vLLM server
 FN_CALL_TEMPLATE = """You are a helpful assistant.
 # Tools
 You may call one or more functions to assist with the user query.
@@ -35,56 +37,42 @@ For each function call, return a json object with function name and arguments wi
 </tool_call>
 """
 
-def parse_drag_coordinates(response: str):
-    """Match the first mouse_move/left_click + left_click_drag pair."""
-    matches = re.findall(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", response, flags=re.DOTALL)
-    if len(matches) < 2:
+def encode_image(path: Path) -> str:
+    # Re-encode the image as PNG in memory and return it base64-encoded.
+    img = Image.open(path)
+    buf = io.BytesIO()
+    img.save(buf, format="PNG")
+    return base64.b64encode(buf.getvalue()).decode("utf-8")
+
+def process_simple_drag_response(parsed_responses):
+    # Expect a mouse_move/left_click tool call followed by a left_click_drag.
+    if len(parsed_responses) < 2:
         return None
-
-    first = json.loads(matches[0])
-    second = json.loads(matches[1])
-
-    first_action = first["arguments"].get("action")
-    second_action = second["arguments"].get("action")
-    if first_action not in ("mouse_move", "left_click"):
+    first = json.loads(parsed_responses[0])
+    second = json.loads(parsed_responses[1])
+    if first["arguments"]["action"] not in ("mouse_move", "left_click"):
         return None
-    if second_action != "left_click_drag":
+    if second["arguments"]["action"] != "left_click_drag":
         return None
-
     start = first["arguments"].get("coordinate")
     end = second["arguments"].get("coordinate")
     if not start or not end:
         return None
-
     return start, end
 
-
-def resize_back(coord, original_size, resized_size):
+def resize_back(coords, original_size, resized_size):
+    # Map a coordinate from the model's resized resolution back to the original image.
     ox, oy = original_size
     rx, ry = resized_size
-    return round(coord[0] * ox / rx), round(coord[1] * oy / ry)
+    return round(coords[0] * ox / rx), round(coords[1] * oy / ry)
 
-
-def annotate_drag(image: Image.Image, start, end, save_path: Path):
-    draw = ImageDraw.Draw(image)
-    draw.ellipse((start[0] - 8, start[1] - 8, start[0] + 8, start[1] + 8), outline="lime", width=2)
-    draw.ellipse((end[0] - 8, end[1] - 8, end[0] + 8, end[1] + 8), outline="red", width=2)
-    draw.line((*start, *end), fill="yellow", width=3)
-    image.save(save_path)
-
-
-def main():
-    image = Image.open("demo_image.png")
+def demo():
+    image_path = Path("demo_image.png")
     instruction = "Drag to select the highlighted paragraph."
+    image = Image.open(image_path)
 
-    resized_h, resized_w = qwen_smart_resize(image.height, image.width, max_pixels=2_116_800, min_pixels=12_544)
-
-    # vLLM + tokenizer initialisation
-    llm = LLM(model=MODEL_ID, trust_remote_code=True, tokenizer_mode="slow", dtype="bfloat16")
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)
-    chat_tpl_path = hf_hub_download(repo_id=MODEL_ID, filename="chat_template.json")
-    tokenizer.chat_template = json.loads(Path(chat_tpl_path).read_text())["chat_template"]
+    resized_h, resized_w = qwen_smart_resize(
+        image.height, image.width,
+        max_pixels=2_116_800,
+        min_pixels=12_544,
+    )
 
     messages = [
         {
@@ -94,39 +82,32 @@ def main():
         {
             "role": "user",
             "content": [
-                {"type": "image"},
+                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encode_image(image_path)}"}},
                 {"type": "text", "text": instruction},
             ],
         },
     ]
 
-    prompt_token_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
-    sampling = SamplingParams(temperature=0.01, top_k=1, max_tokens=1024)
-
-    outputs = llm.generate(
-        {
-            "prompt_token_ids": prompt_token_ids,
-            "multi_modal_data": {"image": image},
-        },
-        sampling_params=sampling,
+    client = OpenAI(base_url=BASE_URL, api_key="EMPTY")
+    response = client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        temperature=0.0,
+        max_tokens=1024,
     )
 
-    generated_ids = outputs[0].outputs[0].token_ids
-    response = tokenizer.decode(generated_ids, skip_special_tokens=True)
-    drag = parse_drag_coordinates(response)
+    text = response.choices[0].message.content
+    parsed = re.findall(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", text, re.DOTALL)
+    drag = process_simple_drag_response(parsed)
     if not drag:
-        print("Model did not produce a valid drag action.")
+        print("No drag action detected.")
        return
 
-    # map coordinates back to the original resolution
-    raw_start, raw_end = drag
-    start = resize_back(raw_start, image.size, (resized_w, resized_h))
-    end = resize_back(raw_end, image.size, (resized_w, resized_h))
-
-    print("Predicted drag:", start, "→", end)
-    annotate_drag(image.copy(), start, end, Path("GUI-Drag-7B_demo.png"))
-
+    # Map coordinates back to the original resolution.
+    start, end = drag
+    start = resize_back(start, image.size, (resized_w, resized_h))
+    end = resize_back(end, image.size, (resized_w, resized_h))
+    print("Predicted drag:", start, "->", end)
 
 if __name__ == "__main__":
-    main()
+    demo()
 ```
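
For reference, the parsing helpers above expect the model to emit two consecutive tool calls in the format declared by `FN_CALL_TEMPLATE`. The snippet below is a minimal, hand-written illustration of that flow, not real model output; the tool name `computer_use` and the coordinate values are assumptions, since the actual tool schema lives in an unchanged part of the README outside these hunks:

```python
import json
import re

# Hypothetical model response: a click at the drag origin, then a
# left_click_drag to the destination (invented values, for illustration only).
sample_response = """<tool_call>
{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [412, 318]}}
</tool_call>
<tool_call>
{"name": "computer_use", "arguments": {"action": "left_click_drag", "coordinate": [845, 512]}}
</tool_call>"""

# Same regex as in the demo: capture each JSON object between the XML tags.
calls = re.findall(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", sample_response, re.DOTALL)
first, second = (json.loads(c) for c in calls)
print(first["arguments"]["coordinate"], "->", second["arguments"]["coordinate"])
# [412, 318] -> [845, 512]
```

Note that the parsed coordinates are in the model's resized resolution; `resize_back` rescales each axis by `original / resized`, which is why the demo keeps `resized_w`/`resized_h` from `qwen_smart_resize`. The `annotate_drag` helper removed in this commit can still be pasted back in if you want to draw the predicted start and end points on the screenshot.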