prithivMLmods committed (verified)
Commit 865d1bc · 1 Parent(s): c4905cb

Update app.py

Files changed (1)
  1. app.py +226 -162
app.py CHANGED
@@ -1,57 +1,98 @@
- import spaces
  import re
- from typing import Tuple, Optional, List, Dict, Any

  import gradio as gr
  import numpy as np
  import torch
  from PIL import Image, ImageDraw, ImageFont

- # Transformers imports for Fara Model
- from transformers import (
- Qwen2_5_VLForConditionalGeneration,
- AutoProcessor,
- )
  from qwen_vl_utils import process_vision_info

- # --- Configuration ---
- MODEL_ID = "microsoft/Fara-7B"
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-
  # -----------------------------------------------------------------------------
- # PROMPT DEFINITIONS (from prompt.py)
  # -----------------------------------------------------------------------------

  OS_ACTIONS = """
- def click(x: float, y: float) -> str:
  \"\"\"
- Performs a left-click at the specified normalized coordinates.
  Args:
- x: The x coordinate (0.0 to 1.0).
- y: The y coordinate (0.0 to 1.0).
  \"\"\"

- def double_click(x: float, y: float) -> str:
  \"\"\"
- Performs a double-click at the specified normalized coordinates.
  Args:
- x: The x coordinate (0.0 to 1.0).
- y: The y coordinate (0.0 to 1.0).
  \"\"\"

  def type(text: str) -> str:
  \"\"\"
- Types the specified text.
  Args:
- text: The text to type.
  \"\"\"

  def drag(from_coord: list[float], to_coord: list[float]) -> str:
  \"\"\"
- Drags from [x1, y1] to [x2, y2].
  Args:
- from_coord: The starting normalized coordinates [x1, y1].
- to_coord: The ending normalized coordinates [x2, y2].
  \"\"\"
  """

@@ -59,71 +100,72 @@ OS_SYSTEM_PROMPT = f"""You are a helpful GUI agent. You’ll be given a task and

  For each step:
  • First, <think></think> to express the thought process guiding your next action and the reasoning behind it.
- • Then, use <code></code> to perform the action. It will be executed in a stateful environment.

  The following functions are exposed to the Python interpreter:
  <code>
  {OS_ACTIONS}
  </code>

- The state persists between code executions.
  """

  # -----------------------------------------------------------------------------
- # FARA MODEL WRAPPER (adapted from smolvlm_inference.py)
  # -----------------------------------------------------------------------------

- class FaraModelWrapper:
- def __init__(self, model_id: str, to_device: str):
- print(f"Loading {model_id} on {to_device}...")
  self.model_id = model_id

- try:
- self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
- self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
- model_id,
- trust_remote_code=True,
- torch_dtype=torch.bfloat16 if to_device == "cuda" else torch.float32,
- device_map="auto" if to_device == "cuda" else None,
- )
- if to_device == "cpu":
- self.model.to("cpu")
- self.model.eval()
- print("Fara Model loaded successfully.")
- except Exception as e:
- print(f"Failed to load Fara, falling back to Qwen2.5-VL-7B. Error: {e}")
- fallback_id = "Qwen/Qwen2.5-VL-7B-Instruct"
- self.processor = AutoProcessor.from_pretrained(fallback_id, trust_remote_code=True)
- self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
- fallback_id,
- trust_remote_code=True,
- torch_dtype=torch.bfloat16 if to_device == "cuda" else torch.float32,
- device_map="auto",
- )
- print("Fallback model loaded.")

- def generate(self, messages: list[dict], max_new_tokens=512, **kwargs):
- """
- Generate a response from the Fara/QwenVL model.
- """
  text = self.processor.apply_chat_template(
  messages, tokenize=False, add_generation_prompt=True
  )
- image_inputs, _ = process_vision_info(messages)

  inputs = self.processor(
  text=[text],
  images=image_inputs,
  padding=True,
  return_tensors="pt",
- ).to(self.model.device)

  with torch.no_grad():
- generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens, **kwargs)

- # Trim input tokens to get only the generated part
  generated_ids_trimmed = [
- out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]

  output_text = self.processor.batch_decode(
@@ -132,20 +174,26 @@ class FaraModelWrapper:

  return output_text

- # --- Initialize Global Model ---
- model = FaraModelWrapper(
- model_id=MODEL_ID,
- to_device=DEVICE,
- )

  # -----------------------------------------------------------------------------
- # HELPER FUNCTIONS (from app.py logic)
  # -----------------------------------------------------------------------------

- def get_navigation_prompt(task, image, previous_actions="None"):
- """
- Constructs the prompt for the model.
- """
  return [
  {
  "role": "system",
@@ -155,30 +203,31 @@ def get_navigation_prompt(task, image, previous_actions="None"):
  "role": "user",
  "content": [
  {"type": "image", "image": image},
- {"type": "text", "text": f"Please generate the next move according to the UI screenshot, instruction and previous actions.\n\nInstruction: {task}\n\nPrevious actions:\n{previous_actions}"},
  ],
  },
  ]

- def array_to_image(image_array: np.ndarray) -> Image.Image:
- if image_array is None:
- raise ValueError("No image provided.")
- return Image.fromarray(np.uint8(image_array))
-
  def parse_actions_from_response(response: str) -> list[str]:
- """Parse actions from model response using <code>...</code> pattern."""
- pattern = r"<code>\s*(.*?)\s*</code>"
  matches = re.findall(pattern, response, re.DOTALL)
  return matches

  def extract_coordinates_from_action(action_code: str) -> list[dict]:
- """Extract normalized (0-1) coordinates from action code for visualization."""
  localization_actions = []

- # Patterns for different action types expecting normalized floats
  patterns = {
  'click': r'click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
  'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
  'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)'
  }

@@ -186,17 +235,25 @@ def extract_coordinates_from_action(action_code: str) -> list[dict]:
  matches = re.finditer(pattern, action_code)
  for match in matches:
  if action_type == 'drag':
- from_x, from_y, to_x, to_y = map(float, match.groups())
- localization_actions.append({'type': 'drag_from', 'x': from_x, 'y': from_y, 'action': action_type})
- localization_actions.append({'type': 'drag_to', 'x': to_x, 'y': to_y, 'action': action_type})
  else:
- x_val, y_val = map(float, match.groups())
- localization_actions.append({'type': action_type, 'x': x_val, 'y': y_val, 'action': action_type})

  return localization_actions

  def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
- """Draw markers on the image to visualize the predicted action."""
  if not coordinates:
  return None

@@ -205,91 +262,100 @@ def create_localized_image(original_image: Image.Image, coordinates: list[dict])
  width, height = img_copy.size

  try:
- font = ImageFont.truetype("Arial.ttf", 15)
- except IOError:
  font = ImageFont.load_default()
-
- colors = {'click': 'red', 'double_click': 'blue', 'drag_from': 'orange', 'drag_to': 'purple'}

  for i, coord in enumerate(coordinates):
- pixel_x = int(coord['x'] * width)
- pixel_y = int(coord['y'] * height)
  color = colors.get(coord['type'], 'red')

- radius = 8
- draw.ellipse([pixel_x - radius, pixel_y - radius, pixel_x + radius, pixel_y + radius], fill=color, outline='white', width=2)

- label = f"{coord['type']}({coord['x']:.2f},{coord['y']:.2f})"
- draw.text((pixel_x + 12, pixel_y - 12), label, fill=color, font=font, stroke_width=1, stroke_fill="white")

  if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
  next_coord = coordinates[i + 1]
- end_x = int(next_coord['x'] * width)
- end_y = int(next_coord['y'] * height)
  draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)
-
  return img_copy

  # -----------------------------------------------------------------------------
- # GRADIO CORE FUNCTION
  # -----------------------------------------------------------------------------

  @spaces.GPU(duration=60)
- def predict_action(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
- """
- Main Gradio function: takes image and task, returns model output and visualized image.
- """
- if model is None:
- raise ValueError("Model not loaded")
-
  input_pil_image = array_to_image(input_numpy_image)

- # Generate prompt and get model prediction
  prompt = get_navigation_prompt(task, input_pil_image)
- model_response = model.generate(prompt, max_new_tokens=500)
- print(f"Model Response: {model_response}")

- # Parse the response to find action code
- action_codes = parse_actions_from_response(model_response)

- # Extract coordinates from all found actions for visualization
  all_coordinates = []
- for code in action_codes:
- coordinates = extract_coordinates_from_action(code)
  all_coordinates.extend(coordinates)

- # Create the visualized image if coordinates were found
- visualized_image = None
  if all_coordinates:
- visualized_image = create_localized_image(input_pil_image, all_coordinates)
- print(f"Found {len(all_coordinates)} localization actions. Visualizing.")
- else:
- print("No localization actions found in the response.")
-
- # Return the raw model response and the (possibly updated) image
- return model_response, visualized_image if visualized_image else input_pil_image

  # -----------------------------------------------------------------------------
- # GRADIO UI LAYOUT
  # -----------------------------------------------------------------------------

- title = "Fara GUI Operator"
  description = """
- This is a demo of the **Fara Model** acting as a GUI Operator.
- Provide a screenshot of a user interface and a task you want to perform. The model will output the thought process and the corresponding action code, visualizing clicks and drags directly on the image.
- This version does not execute the actions; it only predicts and visualizes them.
  """

- # Load Example Data
- try:
- example_1_image = Image.open("./assets/google.png")
- example_1_task = "Search for the name of the current UK Prime Minister."
- example_2_image = Image.open("./assets/huggingface.png")
- example_2_task = "Find the most trending model."
- examples = [[example_1_image, example_1_task], [example_2_image, example_2_task]]
- except FileNotFoundError:
- print("Warning: Example assets not found. The demo will run without examples.")
- examples = []


  with gr.Blocks(theme=gr.themes.Soft()) as demo:
@@ -297,35 +363,33 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
  gr.Markdown(description)

  with gr.Row():
- with gr.Column(scale=1):
- input_image_component = gr.Image(label="UI Screenshot", type="numpy", height=500)
  task_component = gr.Textbox(
- label="Task",
- placeholder="e.g., Search for 'Fara Model'",
- info="Type the task you want the model to perform on this UI.",
  )
- submit_button = gr.Button("Predict Action", variant="primary")
-
- with gr.Column(scale=1):
- output_text_component = gr.Textbox(label="Model Full Output", lines=10, interactive=False)
- # The input image component will be updated with the visualized output
- gr.Markdown("### Visualized Action")
- gr.Markdown("The image on the left will update with markers for clicks/drags.")

  submit_button.click(
- predict_action,
- [input_image_component, task_component],
- [output_text_component, input_image_component]
  )

- if examples:
  gr.Examples(
- examples=examples,
  inputs=[input_image_component, task_component],
- outputs=[output_text_component, input_image_component],
- fn=predict_action,
  cache_examples=True,
  )

  if __name__ == "__main__":
- demo.queue().launch(debug=True, share=True)

+ import os
  import re
+ import time
+ from typing import Tuple, Optional, List, Dict

  import gradio as gr
  import numpy as np
  import torch
+ import spaces
  from PIL import Image, ImageDraw, ImageFont

+ # Transformers & Qwen Utils
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
  from qwen_vl_utils import process_vision_info

  # -----------------------------------------------------------------------------
+ # 1. PROMPT DEFINITIONS (from prompt.py)
  # -----------------------------------------------------------------------------

  OS_ACTIONS = """
+ def final_answer(answer: any) -> any:
+ \"\"\"
+ Provides a final answer to the given problem.
+ Args:
+ answer: The final answer to the problem
+ \"\"\"
+
+ def move_mouse(self, x: float, y: float) -> str:
  \"\"\"
+ Moves the mouse cursor to the specified coordinates
  Args:
+ x: The x coordinate (horizontal position)
+ y: The y coordinate (vertical position)
  \"\"\"

+ def click(x: Optional[float] = None, y: Optional[float] = None) -> str:
  \"\"\"
+ Performs a left-click at the specified normalized coordinates
  Args:
+ x: The x coordinate (horizontal position)
+ y: The y coordinate (vertical position)
+ \"\"\"
+
+ def double_click(x: Optional[float] = None, y: Optional[float] = None) -> str:
+ \"\"\"
+ Performs a double-click at the specified normalized coordinates
+ Args:
+ x: The x coordinate (horizontal position)
+ y: The y coordinate (vertical position)
  \"\"\"

  def type(text: str) -> str:
  \"\"\"
+ Types the specified text at the current cursor position.
+ Args:
+ text: The text to type
+ \"\"\"
+
+ def press(keys: str | list[str]) -> str:
+ \"\"\"
+ Presses a keyboard key
  Args:
+ keys: The key or list of keys to press (e.g. "enter", "space", "backspace", "ctrl", etc.).
+ \"\"\"
+
+ def navigate_back() -> str:
+ \"\"\"
+ Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
  \"\"\"

  def drag(from_coord: list[float], to_coord: list[float]) -> str:
  \"\"\"
+ Clicks [x1, y1], drags mouse to [x2, y2], then release click.
+ Args:
+ x1: origin x coordinate
+ y1: origin y coordinate
+ x2: end x coordinate
+ y2: end y coordinate
+ \"\"\"
+
+ def scroll(direction: Literal["up", "down"] = "down", amount: int = 1) -> str:
+ \"\"\"
+ Moves the mouse to selected coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
+ Args:
+ x: The x coordinate (horizontal position) of the element to scroll/zoom, defaults to None to not focus on specific coordinates
+ y: The y coordinate (vertical position) of the element to scroll/zoom, defaults to None to not focus on specific coordinates
+ direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
+ amount: The amount to scroll. A good amount is 1 or 2.
+ \"\"\"
+
+ def wait(seconds: float) -> str:
+ \"\"\"
+ Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
  Args:
+ seconds: Number of seconds to wait, generally 2 is enough.
  \"\"\"
  """


  For each step:
  • First, <think></think> to express the thought process guiding your next action and the reasoning behind it.
+ • Then, use <code></code> to perform the action. it will be executed in a stateful environment.

  The following functions are exposed to the Python interpreter:
  <code>
  {OS_ACTIONS}
  </code>

+ The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
  """

  # -----------------------------------------------------------------------------
+ # 2. MODEL DEFINITION (Adapted for Fara-7B / Qwen2.5-VL)
  # -----------------------------------------------------------------------------

+ MODEL_ID = "microsoft/Fara-7B"
+
+ class FaraTransformersModel:
+ def __init__(self, model_id: str, to_device: str = "cuda"):
+ print(f"Loading {model_id}...")
  self.model_id = model_id

+ # Load Processor
+ self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+ # Load Model
+ # Fara is based on Qwen2.5-VL architecture
+ self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+ model_id,
+ trust_remote_code=True,
+ torch_dtype=torch.bfloat16,
+ device_map="auto" if to_device == "cuda" else None
+ )
+
+ if to_device == "cpu":
+ self.model.to("cpu")
+
+ self.model.eval()
+ print("Model loaded successfully.")

+ def generate(self, messages: list[dict], **kwargs):
+ # 1. Prepare Text Prompts
  text = self.processor.apply_chat_template(
  messages, tokenize=False, add_generation_prompt=True
  )

+ # 2. Process Images (Qwen-VL specific utility)
+ image_inputs, video_inputs = process_vision_info(messages)
+
+ # 3. Create Inputs
  inputs = self.processor(
  text=[text],
  images=image_inputs,
+ videos=video_inputs,
  padding=True,
  return_tensors="pt",
+ )
+
+ inputs = inputs.to(self.model.device)

+ # 4. Generate
  with torch.no_grad():
+ generated_ids = self.model.generate(**inputs, **kwargs)

+ # 5. Decode
  generated_ids_trimmed = [
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]

  output_text = self.processor.batch_decode(

  return output_text

+ # Initialize Model Globally (Lazy loading handled by Gradio usually, but here we init for Spaces)
+ # We use a global variable that is loaded on first run or at startup
+ print(f"Initializing model class for {MODEL_ID}...")
+ # Actual loading happens on GPU decorator or first call usually,
+ # but for the class structure we initialize it here.
+ # Note: Actual torch.load happens inside the class init.
+ fara_model = FaraTransformersModel(MODEL_ID, to_device="cuda" if torch.cuda.is_available() else "cpu")
+

  # -----------------------------------------------------------------------------
+ # 3. HELPER FUNCTIONS (Parsing & Visualization)
  # -----------------------------------------------------------------------------

+ def array_to_image(image_array: np.ndarray) -> Image.Image:
+ if image_array is None:
+ raise ValueError("No image provided. Please upload an image before submitting.")
+ return Image.fromarray(np.uint8(image_array))
+
+ def get_navigation_prompt(task, image):
+ """Constructs the chat messages for Fara."""
  return [
  {
  "role": "system",
  "role": "user",
  "content": [
  {"type": "image", "image": image},
+ {"type": "text", "text": f"Instruction: {task}\n\nPrevious actions:\nNone"},
  ],
  },
  ]

  def parse_actions_from_response(response: str) -> list[str]:
+ """Parse actions from model response using regex pattern."""
+ # Look for code blocks
+ pattern = r"<code>(.*?)</code>"
  matches = re.findall(pattern, response, re.DOTALL)
+ if not matches:
+ # Fallback: if model forgets code tags but writes function calls
+ if "click(" in response or "type(" in response:
+ return [response]
  return matches

  def extract_coordinates_from_action(action_code: str) -> list[dict]:
+ """Extract coordinates from action code for localization actions."""
  localization_actions = []

+ # Patterns for different action types
  patterns = {
  'click': r'click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
  'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
+ 'move_mouse': r'move_mouse\((?:self,\s*)?(?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))\)',
  'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)'
  }

  matches = re.finditer(pattern, action_code)
  for match in matches:
  if action_type == 'drag':
+ from_x, from_y, to_x, to_y = match.groups()
+ localization_actions.append({
+ 'type': 'drag_from', 'x': float(from_x), 'y': float(from_y), 'action': action_type
+ })
+ localization_actions.append({
+ 'type': 'drag_to', 'x': float(to_x), 'y': float(to_y), 'action': action_type
+ })
  else:
+ x_val = match.group(1)
+ y_val = match.group(2) if match.group(2) else x_val
+ if x_val and y_val:
+ localization_actions.append({
+ 'type': action_type, 'x': float(x_val), 'y': float(y_val), 'action': action_type
+ })

  return localization_actions

  def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
+ """Create an image with localization markers drawn on it."""
  if not coordinates:
  return None

  width, height = img_copy.size

  try:
  font = ImageFont.load_default()
+ except:
+ font = None
+
+ colors = {'click': 'red', 'double_click': 'blue', 'move_mouse': 'green', 'drag_from': 'orange', 'drag_to': 'purple'}

  for i, coord in enumerate(coordinates):
+ # Normalize if model outputs 0-1 range (Fara usually does)
+ # If model outputs pixels, we need to handle that.
+ # Fara/SmolVLM usually output normalized coordinates 0-1000 or 0-1.
+ # Assuming Fara outputs 0-1 floats based on the System Prompt definition.
+
+ pixel_x = int(coord['x'] * width) if coord['x'] <= 1.0 else int(coord['x'])
+ pixel_y = int(coord['y'] * height) if coord['y'] <= 1.0 else int(coord['y'])
+
  color = colors.get(coord['type'], 'red')

+ r = 10
+ draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=3)

+ label = f"{coord['type']}"
+ draw.text((pixel_x + 12, pixel_y - 10), label, fill=color, font=font)

+ # Draw drag arrows
  if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
  next_coord = coordinates[i + 1]
+ end_x = int(next_coord['x'] * width) if next_coord['x'] <= 1.0 else int(next_coord['x'])
+ end_y = int(next_coord['y'] * height) if next_coord['y'] <= 1.0 else int(next_coord['y'])
  draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)
+
  return img_copy

  # -----------------------------------------------------------------------------
+ # 4. APP LOGIC (ZeroGPU)
  # -----------------------------------------------------------------------------

  @spaces.GPU(duration=60)
+ def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
+ if input_numpy_image is None:
+ return "Please upload an image.", None
+
  input_pil_image = array_to_image(input_numpy_image)

+ # 1. Build Prompt
  prompt = get_navigation_prompt(task, input_pil_image)
+
+ # 2. Generate
+ if fara_model is None:
+ raise ValueError("Model not loaded")

+ navigation_str = fara_model.generate(prompt, max_new_tokens=500)
+ print(f"Raw Output: {navigation_str}")
+
+ # 3. Parse
+ navigation_str = navigation_str.strip()
+ actions = parse_actions_from_response(navigation_str)

  all_coordinates = []
+ for action_code in actions:
+ coordinates = extract_coordinates_from_action(action_code)
  all_coordinates.extend(coordinates)

+ # 4. Visualize
+ localized_image = input_pil_image
  if all_coordinates:
+ visualized = create_localized_image(input_pil_image, all_coordinates)
+ if visualized:
+ localized_image = visualized
+
+ return navigation_str, localized_image

  # -----------------------------------------------------------------------------
+ # 5. GRADIO UI
  # -----------------------------------------------------------------------------

+ title = "Fara-7B GUI Operator 🖥️"
  description = """
+ This demo uses **microsoft/Fara-7B** to understand GUI screenshots and generate navigation actions.
+ Upload a screenshot, define a task, and see the model's planned actions.
  """

+ # Load examples safely
+ examples = []
+ example_paths = [
+ ("Search for UK Prime Minister", "./assets/google.png"),
+ ("Find trending models", "./assets/huggingface.png")
+ ]
+
+ # We skip checking file existence to allow script to run,
+ # but in a real space, ensure ./assets/ folder exists or remove examples
+ safe_examples = []
+ for label, path in example_paths:
+ if os.path.exists(path):
+ safe_examples.append([path, label])

  with gr.Blocks(theme=gr.themes.Soft()) as demo:
  gr.Markdown(description)

  with gr.Row():
+ with gr.Column():
+ input_image_component = gr.Image(label="Upload Interface Screenshot", height=500)
  task_component = gr.Textbox(
+ label="Task Instruction",
+ placeholder="e.g., Click the Search bar and type 'Hello World'",
+ lines=2
  )
+ submit_button = gr.Button("Generate Action", variant="primary")
+
+ with gr.Column():
+ output_image_component = gr.Image(label="Visualized Action", height=500)
+ output_code_component = gr.Textbox(label="Model Output (Code)", lines=10, show_copy_button=True)

  submit_button.click(
+ fn=navigate,
+ inputs=[input_image_component, task_component],
+ outputs=[output_code_component, output_image_component]
  )

+ if safe_examples:
  gr.Examples(
+ examples=safe_examples,
  inputs=[input_image_component, task_component],
+ outputs=[output_code_component, output_image_component],
+ fn=navigate,
  cache_examples=True,
  )

  if __name__ == "__main__":
+ demo.queue().launch()
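
For reference, a minimal, self-contained sketch (not part of the commit) of how the new parsing and coordinate-extraction path is expected to behave. The sample model response below is made up for illustration; the regexes are copied from the diff above.

import re

# Hypothetical model response, for illustration only.
sample_response = (
    "<think>I should click the search box first.</think>\n"
    '<code>click(x=0.42, y=0.17)\ntype("Fara-7B")</code>'
)

# Same <code> block pattern as parse_actions_from_response in the new app.py.
code_blocks = re.findall(r"<code>(.*?)</code>", sample_response, re.DOTALL)

# Same click pattern as extract_coordinates_from_action in the new app.py.
click_pattern = r'click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)'
for block in code_blocks:
    for x, y in re.findall(click_pattern, block):
        # Normalized (0-1) coordinates; create_localized_image scales them
        # by the screenshot's width and height before drawing markers.
        print(f"click at normalized ({float(x)}, {float(y)})")
# Prints: click at normalized (0.42, 0.17)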