Abs6187 committed on
Commit
7860e5b
·
1 Parent(s): 0301cd4

Adding Prompt and UI Changes

Browse files
Files changed (1) hide show
  1. app.py +112 -84
app.py CHANGED
@@ -4,123 +4,151 @@ import ast
4
  import torch
5
  from PIL import Image, ImageDraw
6
  import gradio as gr
7
- import base64
8
- from io import BytesIO
9
-
10
  from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
11
- from qwen_vl_utils import process_vision_info # include this file in your repo if not pip-installable
12
 
 
 
13
  _MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
14
  "ByteDance-Seed/UI-TARS-1.5-7B",
15
  device_map="auto",
16
  torch_dtype=torch.float16
17
  )
18
-
19
  _PROCESSOR = AutoProcessor.from_pretrained(
20
  "ByteDance-Seed/UI-TARS-1.5-7B",
21
- size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28}, # sane res
22
  use_fast=True,
23
  )
24
-
25
  model = _MODEL
26
  processor = _PROCESSOR
27
 
28
-
29
- def draw_point(image: Image.Image, point=None, radius: int = 5):
30
- """Overlay a red dot on the screenshot where the model clicked."""
31
  img = image.copy()
32
- if point:
33
  x, y = point[0] * img.width, point[1] * img.height
34
- ImageDraw.Draw(img).ellipse(
35
- (x - radius, y - radius, x + radius, y + radius), fill="red"
 
 
36
  )
37
  return img
38
 
39
-
40
  @spaces.GPU
41
  def navigate(screenshot, task: str):
42
- """Run one inference step on the GUI‑reasoning model.
43
-
44
- Args:
45
- screenshot (PIL.Image): Latest UI screenshot.
46
- task (str): Natural‑language task description
47
- history (list | str | None): Previous messages list. Accepts either an
48
- actual Python list (via gr.JSON) or a JSON/Python‑literal string.
49
- """
50
-
51
-
52
- # ───────────────────── normalise history input ──────────────────────────
53
 
54
- messages=[]
55
-
56
- prompt_header = (
57
- "You are a GUI agent. You are given a task and your action history, with screenshots."
58
- "You need to perform the next action to complete the task. \n\n## Output Format\n```\nThought: ...\nAction: ...\n```\n\n## Action Space\n\nclick(start_box='<|box_start|>(x1, y1)<|box_end|>')\nleft_double(start_box='<|box_start|>(x1, y1)<|box_end|>')\nright_single(start_box='<|box_start|<(x1, y1)>|box_end|>')\ndrag(start_box='<|box_start|>(x1, y1)<|box_end|>', end_box='<|box_start|>(x3, y3)<|box_end|>')\nhotkey(key='')\ntype(content='') #If you want to submit your input, use \"\\n\" at the end of `content`.\nscroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='down or up or right or left')\nwait() #Sleep for 5s and take a screenshot to check for any changes.\nfinished(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format.\n\n\n## Note\n- Use English in `Thought` part.\n- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part. Always use 'win' instead of 'meta' key\n\n"
59
- f"## User Instruction\n{task}"
60
- )
61
- current = {"role":"user","content":[{"type":"text","text":prompt_header},{"type": "image_url", "image_url":screenshot}]}
62
-
63
- messages.append(current)
64
 
65
- #New Comment 1
66
- # ─────────────────────────── model forward ─────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
 
 
 
 
 
 
68
  images, videos = process_vision_info(messages)
69
- text = processor.apply_chat_template(
70
- messages, tokenize=False, add_generation_prompt=True
71
- )
72
  inputs = processor(
73
- text=[text],
74
- images=images,
75
- videos=videos,
76
- padding=True,
77
- return_tensors="pt",
78
  ).to("cuda")
79
 
80
- generated = model.generate(**inputs, max_new_tokens=128)
81
- trimmed = [
82
- out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated)
83
- ]
84
- raw_out = processor.batch_decode(
85
- trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
86
- )[0]
87
 
88
- # ─────── draw predicted click for quick visual verification (optional) ──────
89
  try:
90
- actions = ast.literal_eval(raw_out)
91
- for act in actions if isinstance(actions, list) else [actions]:
92
- pos = act.get("position")
93
- if pos and isinstance(pos, list) and len(pos) == 2:
 
 
 
 
 
 
 
 
 
94
  screenshot = draw_point(screenshot, pos)
95
- except Exception:
96
- # decoding failed → just return original screenshot
97
  pass
98
 
99
  return screenshot, raw_out, messages
100
 
101
- # ────────────────────────── Gradio interface ───────────────────────────────
102
-
103
- demo = gr.Interface(
104
- fn=navigate,
105
- inputs=[
106
- gr.Image(type="pil", label="Screenshot"),
107
- gr.Textbox(
108
- lines=1,
109
- placeholder="e.g. Search the weather for New York",
110
- label="Task",
111
- )
112
- ],
113
- outputs=[
114
- gr.Image(label="With Click Point"),
115
- gr.Textbox(label="Raw Action JSON"),
116
- gr.JSON(label="Updated Conversation History")
117
- ],
118
- title="UI-Tars Navigation Demo",
119
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
- demo.launch(
122
- server_name="0.0.0.0",
123
- server_port=7860,
124
- share=False, # or True if you need a public link
125
- ssr_mode=False, # turn off experimental SSR so the process blocks
126
- )
 
 
4
  import torch
5
  from PIL import Image, ImageDraw
6
  import gradio as gr
 
 
 
7
  from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
8
+ from qwen_vl_utils import process_vision_info # Make sure this file is in your repository
9
 
10
+ # --- Model and Processor Initialization ---
11
+ # This setup is standard and remains unchanged.
12
  _MODEL = Qwen2_5_VLForConditionalGeneration.from_pretrained(
13
  "ByteDance-Seed/UI-TARS-1.5-7B",
14
  device_map="auto",
15
  torch_dtype=torch.float16
16
  )
 
17
  _PROCESSOR = AutoProcessor.from_pretrained(
18
  "ByteDance-Seed/UI-TARS-1.5-7B",
19
+ size={"shortest_edge": 100 * 28 * 28, "longest_edge": 16384 * 28 * 28},
20
  use_fast=True,
21
  )
 
22
  model = _MODEL
23
  processor = _PROCESSOR
24
 
25
+ def draw_point(image: Image.Image, point=None, radius: int = 15):
26
+ """Overlays a larger, more visible red dot on the screenshot."""
 
27
  img = image.copy()
28
+ if point and isinstance(point, list) and len(point) == 2:
29
  x, y = point[0] * img.width, point[1] * img.height
30
+ draw = ImageDraw.Draw(img)
31
+ # Draw a larger ellipse for better visibility on high-res screens
32
+ draw.ellipse(
33
+ (x - radius, y - radius, x + radius, y + radius), fill="rgba(255, 0, 0, 180)", outline="white", width=2
34
  )
35
  return img
36
 
 
37
  @spaces.GPU
38
  def navigate(screenshot, task: str):
39
+ """Runs a single inference step of the GUI reasoning model."""
40
+ if not screenshot or not task:
41
+ # Added basic validation to prevent errors with empty inputs
42
+ return None, "Please provide both a screenshot and a task.", []
 
 
 
 
 
 
 
43
 
44
+ messages = []
 
 
 
 
 
 
 
 
 
45
 
46
+ # --- KEY CHANGE: Refined Prompt for Concise Reasoning ---
47
+ # The 'Note' section is updated to guide the model towards a shorter, more direct "Thought" process.
48
+ prompt_header = (
49
+ "You are a GUI agent. You are given a task and a screenshot. Your goal is to determine the next action.\n\n"
50
+ "## Output Format\n```\nThought: ...\nAction: ...\n```\n\n"
51
+ "## Action Space\n"
52
+ "click(start_box='<|box_start|>(x1, y1)<|box_end|>')\n"
53
+ "type(content='...')\n"
54
+ "scroll(start_box='<|box_start|>(x1, y1)<|box_end|>', direction='...')\n"
55
+ "finished(content='...')\n\n"
56
+ "## Note\n"
57
+ "- In the `Thought` part, briefly state your reasoning in a single, direct sentence.\n"
58
+ "- Always use 'win' instead of 'meta' for hotkeys.\n\n"
59
+ f"## User Instruction\n{task}"
60
+ )
61
 
62
+ content = [
63
+ {"type": "text", "text": prompt_header},
64
+ {"type": "image_url", "image_url": screenshot}
65
+ ]
66
+ messages.append({"role": "user", "content": content})
67
+
68
  images, videos = process_vision_info(messages)
69
+ text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
70
  inputs = processor(
71
+ text=[text], images=images, videos=videos, padding=True, return_tensors="pt"
 
 
 
 
72
  ).to("cuda")
73
 
74
+ generated = model.generate(**inputs, max_new_tokens=256)
75
+ trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated)]
76
+ raw_out = processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 
 
 
 
77
 
 
78
  try:
79
+ if "Action:" in raw_out:
80
+ action_part = raw_out.split("Action:")[1].strip()
81
+ # The model sometimes wraps its output in ```, so we remove it.
82
+ if action_part.startswith("```") and action_part.endswith("```"):
83
+ action_part = action_part[3:-3].strip()
84
+
85
+ action_dict = ast.literal_eval(action_part)
86
+
87
+ box_str = action_dict.get("start_box")
88
+ if box_str and isinstance(box_str, str) and "( " in box_str:
89
+ coords_part = box_str.split('( ')[1].split(' )')[0]
90
+ x_str, y_str = coords_part.split(', ')
91
+ pos = [float(x_str), float(y_str)]
92
  screenshot = draw_point(screenshot, pos)
93
+ except (Exception, SyntaxError) as e:
94
+ print(f"Could not parse action or draw point: {e}")
95
  pass
96
 
97
  return screenshot, raw_out, messages
98
 
99
+ # --- KEY CHANGE: Enhanced Gradio UI ---
100
+ # The interface is rebuilt using gr.Blocks for a cleaner layout and better user guidance.
101
+ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 90% !important;}") as demo:
102
+ gr.Markdown(
103
+ """
104
+ # ✨ Enhanced UI-Tars Navigation Demo
105
+ **Upload a screenshot and provide a task to see how the AI plans its next action.**
106
+ The model will analyze the image and your instruction, then output its thought process and the specific action it would take. A red dot will indicate the target location for clicks or scrolls.
107
+ """
108
+ )
109
+ with gr.Row():
110
+ with gr.Column(scale=1):
111
+ screenshot_in = gr.Image(type="pil", label="Screenshot")
112
+ task_in = gr.Textbox(
113
+ lines=2,
114
+ placeholder="e.g., Click on the 'Sign In' button.",
115
+ label="Task Instruction",
116
+ )
117
+ submit_btn = gr.Button("Analyze Action", variant="primary")
118
+
119
+ gr.Examples(
120
+ examples=[
121
+ ["examples/google.png", "Search for 'latest AI news'"],
122
+ ["examples/github.png", "Find the search bar and type 'Qwen'"],
123
+ ["examples/figma.png", "Select the blue rectangle on the canvas"],
124
+ ],
125
+ inputs=[screenshot_in, task_in],
126
+ label="Example Use Cases"
127
+ )
128
+
129
+ with gr.Column(scale=2):
130
+ screenshot_out = gr.Image(label="Result: Screenshot with Click Point", interactive=False)
131
+ with gr.Accordion("Model Output Details", open=False):
132
+ raw_out = gr.Textbox(label="Full Model Output (Thought & Action)", interactive=False)
133
+ history_out = gr.JSON(label="Conversation History for Debugging", interactive=False)
134
+
135
+ submit_btn.click(
136
+ fn=navigate,
137
+ inputs=[screenshot_in, task_in],
138
+ outputs=[screenshot_out, raw_out, history_out],
139
+ )
140
+
141
+ gr.Markdown(
142
+ """
143
+ ---
144
+ *Model: [ByteDance-Seed/UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B)*
145
+ """
146
+ )
147
 
148
+ if __name__ == "__main__":
149
+ # To run this, you'll need to create an 'examples' directory with the sample images.
150
+ demo.launch(
151
+ server_name="0.0.0.0",
152
+ server_port=7860,
153
+ share=False,
154
+ )