prithivMLmods committed
Commit aa8fa9a · verified · 1 Parent(s): 089b6b1

update app: add ActIO-UI-7B pipeline

Files changed (1)
  1. app.py +109 -49
app.py CHANGED
@@ -16,7 +16,9 @@ from PIL import Image, ImageDraw, ImageFont
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
-    AutoModelForImageTextToText
+    AutoModelForImageTextToText,
+    AutoModelForVision2Seq,
+    AutoTokenizer
 )
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 from qwen_vl_utils import process_vision_info
@@ -24,7 +26,6 @@ from qwen_vl_utils import process_vision_info
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 
-# --- Theme Configuration ---
 colors.orange_red = colors.Color(
     name="orange_red",
     c50="#FFF0E5",
@@ -96,8 +97,6 @@ orange_red_theme = OrangeRedTheme()
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Running on device: {device}")
 
-# --- Model Loading ---
-
 print("🔄 Loading Fara-7B...")
 MODEL_ID_V = "microsoft/Fara-7B"
 try:
@@ -140,9 +139,22 @@ except Exception as e:
     model_h = None
     processor_h = None
 
-print(" Models loading sequence complete.")
+print("🔄 Loading ActIO-UI-7B...")
+MODEL_ID_A = "Uniphore/actio-ui-7b-rlvr"
+try:
+    processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
+    model_a = AutoModelForVision2Seq.from_pretrained(
+        MODEL_ID_A,
+        trust_remote_code=True,
+        torch_dtype="auto",
+        device_map=device
+    ).eval()
+except Exception as e:
+    print(f"Failed to load ActIO: {e}")
+    model_a = None
+    processor_a = None
 
-# --- Helper Functions ---
+print("✅ Models loading sequence complete.")
 
 def array_to_image(image_array: np.ndarray) -> Image.Image:
     if image_array is None: raise ValueError("No image provided.")
@@ -159,13 +171,13 @@ def get_image_proc_params(processor) -> Dict[str, int]:
     min_pixels = getattr(ip, "min_pixels", default_min)
     max_pixels = getattr(ip, "max_pixels", default_max)
 
-    # Holo2/Qwen specific sizing sometimes in 'size' dict
+    # Some configs hide size in a dict
     size_config = getattr(ip, "size", {})
     if isinstance(size_config, dict):
         if "shortest_edge" in size_config:
-            min_pixels = size_config["shortest_edge"]
+            min_pixels = size_config.get("shortest_edge", default_min)
         if "longest_edge" in size_config:
-            max_pixels = size_config["longest_edge"]
+            max_pixels = size_config.get("longest_edge", default_max)
 
     if min_pixels is None: min_pixels = default_min
     if max_pixels is None: max_pixels = default_max
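
For intuition on the parameters this helper returns: smart_resize snaps a screenshot to the vision patch grid while keeping the total pixel count inside the min/max budget. A minimal sketch, assuming typical Qwen2-VL values (patch 14, merge 2) and illustrative pixel budgets; the app's actual defaults are not shown in this diff:

from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

patch_size, merge_size = 14, 2            # assumed Qwen2-VL values
factor = patch_size * merge_size          # output dims snap to multiples of 28
min_pixels, max_pixels = 256 * 28 * 28, 1280 * 28 * 28  # assumed budgets

h, w = smart_resize(1080, 1920, factor=factor,
                    min_pixels=min_pixels, max_pixels=max_pixels)
assert h % factor == 0 and w % factor == 0  # grid-aligned
assert h * w <= max_pixels                  # within the pixel budget
print(h, w)  # roughly (728, 1316) with these budgets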
@@ -178,12 +190,11 @@ def get_image_proc_params(processor) -> Dict[str, int]:
     }
 
 def apply_chat_template_compat(processor, messages: List[Dict[str, Any]], thinking: bool = True) -> str:
-    # Holo2 specific: allows turning thinking off in template
+    # Handles compat for models that support/don't support the 'thinking' arg
     if hasattr(processor, "apply_chat_template"):
         try:
             return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, thinking=thinking)
         except TypeError:
-            # Fallback for processors that don't support 'thinking' kwarg
             return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
     tok = getattr(processor, "tokenizer", None)
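
The try/except TypeError here is plain capability detection: pass the extra kwarg and fall back when the underlying template does not accept it. A self-contained sketch of the pattern with a stub processor (no model required):

class LegacyProcessor:  # stub: template without a 'thinking' parameter
    def apply_chat_template(self, messages, tokenize=False, add_generation_prompt=True):
        return "<prompt without thinking flag>"

def render(processor, messages, thinking=True):
    try:
        return processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, thinking=thinking)
    except TypeError:  # kwarg not supported -> retry without it
        return processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True)

print(render(LegacyProcessor(), []))  # falls back cleanly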
@@ -200,8 +211,6 @@ def trim_generated(generated_ids, inputs):
         return generated_ids
     return [out_ids[len(in_seq):] for in_seq, out_ids in zip(in_ids, generated_ids)]
 
-# --- Prompts ---
-
 def get_fara_prompt(task, image):
     OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
 You need to generate the next action to complete the task.
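
trim_generated exists because generate() returns prompt plus completion; slicing each output sequence at its prompt length leaves only the newly generated tokens. A toy version with plain lists (ids are made up):

in_ids = [[101, 7592, 102]]                       # prompt token ids
generated = [[101, 7592, 102, 2023, 2003, 102]]   # generate() echoes the prompt
trimmed = [out[len(inp):] for inp, out in zip(in_ids, generated)]
print(trimmed)  # [[2023, 2003, 102]]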
@@ -233,7 +242,6 @@ def get_localization_prompt(task, image):
     ]
 
 def get_holo2_prompt(task, image):
-    # JSON Schema representation for prompt
    schema_str = '{"properties": {"x": {"description": "The x coordinate, normalized between 0 and 1000.", "ge": 0, "le": 1000, "title": "X", "type": "integer"}, "y": {"description": "The y coordinate, normalized between 0 and 1000.", "ge": 0, "le": 1000, "title": "Y", "type": "integer"}}, "required": ["x", "y"], "title": "ClickCoordinates", "type": "object"}'
 
     prompt = f"""Localize an element on the GUI image according to the provided target and output a click position.
@@ -250,13 +258,32 @@ def get_holo2_prompt(task, image):
         },
     ]
 
-# --- Parsing ---
+def get_actio_prompt(task, image):
+    system_prompt = (
+        "You are a GUI agent. You are given a task and a screenshot of the screen. "
+        "You need to perform a series of pyautogui actions to complete the task."
+    )
+    # ActIO specific format request
+    user_text = (
+        "Please perform the following task by providing the action and the coordinates "
+        "in the format of <action>(x, y): " + task
+    )
+
+    return [
+        {"role": "system", "content": [{"type": "text", "text": system_prompt}]},
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": user_text},
+                {"type": "image", "image": image},
+            ],
+        },
+    ]
 
 def parse_click_response(text: str) -> List[Dict]:
     actions = []
     text = text.strip()
 
-    # Generic Point parsing
     matches_click = re.findall(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE)
     for m in matches_click:
         actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": "", "norm": False})
@@ -269,7 +296,6 @@ def parse_click_response(text: str) -> List[Dict]:
     for m in matches_box:
         actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": "", "norm": False})
 
-    # Fallback tuple
     if not actions:
         matches_tuple = re.findall(r"(?:^|\s)\(\s*(\d+)\s*,\s*(\d+)\s*\)(?:$|\s|,)", text)
         for m in matches_tuple:
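
The parser is a cascade: explicit Click(x, y) matches first, then box-style coordinates, and the bare (x, y) tuple pattern only fires when nothing else matched. A quick check of two of the patterns on illustrative strings (not captured model output):

import re

click_re = r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)"
tuple_re = r"(?:^|\s)\(\s*(\d+)\s*,\s*(\d+)\s*\)(?:$|\s|,)"

for s in ["Click(512, 384)", "click ( 40 , 900 )", "target at (250, 125)"]:
    hits = re.findall(click_re, s, re.IGNORECASE) or re.findall(tuple_re, s)
    print(s, [(int(x), int(y)) for x, y in hits])
# Click(512, 384) [(512, 384)]
# click ( 40 , 900 ) [(40, 900)]
# target at (250, 125) [(250, 125)]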
@@ -298,11 +324,6 @@ def parse_fara_response(response: str) -> List[Dict]:
 
 def parse_holo2_response(response: str) -> List[Dict]:
     actions = []
-    # Attempt to find JSON object structure { "x": ..., "y": ... }
-    # Holo2 may output thinking blocks, but we set thinking=False.
-    # Just in case, regex search for the json pattern.
-
-    # Look for pure JSON first
     try:
         data = json.loads(response.strip())
         if 'x' in data and 'y' in data:
@@ -311,7 +332,6 @@ def parse_holo2_response(response: str) -> List[Dict]:
     except:
         pass
 
-    # Regex search if embedded in text
     match = re.search(r"\{\s*['\"]x['\"]\s*:\s*(\d+)\s*,\s*['\"]y['\"]\s*:\s*(\d+)\s*\}", response)
     if match:
         actions.append({
@@ -319,13 +339,27 @@ def parse_holo2_response(response: str) -> List[Dict]:
             "x": int(match.group(1)),
             "y": int(match.group(2)),
             "text": "Holo2",
-            "norm": True # Flag indicating 0-1000 scale
+            "norm": True # 0-1000 scale
         })
-        return actions
-
     return actions
 
-# --- Visualization ---
+def parse_actio_response(text: str) -> List[Dict]:
+    actions = []
+    text = text.strip()
+    # Pattern for <action>(x, y) e.g., click(500, 300) or type(200, 200)
+    # Also handles optional text inside or loosely formatted
+    pattern = r"([a-zA-Z_]+)\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)"
+    matches = re.findall(pattern, text)
+
+    for m in matches:
+        actions.append({
+            "type": m[0],
+            "x": int(m[1]),
+            "y": int(m[2]),
+            "text": text,
+            "norm": False # ActIO usually outputs absolute pixels relative to input image
+        })
+    return actions
 
 def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
     if not actions: return None
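
The new ActIO parser is the loosest of the set: any <name>(x, y) call is captured and the callable name becomes the action type. A round-trip sketch on an assumed response string (real model output may differ):

import re

response = "pyautogui.click(640, 360)"  # assumed ActIO-style reply
pattern = r"([a-zA-Z_]+)\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)"

actions = [{"type": m[0], "x": int(m[1]), "y": int(m[2]), "norm": False}
           for m in re.findall(pattern, response)]
print(actions)  # [{'type': 'click', 'x': 640, 'y': 360, 'norm': False}]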
@@ -345,36 +379,32 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
 
         color = 'red' if 'click' in act['type'].lower() else 'blue'
 
-        # Draw Crosshair
+        # Crosshair
         line_len = 15
         width = 4
-        # Horizontal
         draw.line((pixel_x - line_len, pixel_y, pixel_x + line_len, pixel_y), fill=color, width=width)
-        # Vertical
         draw.line((pixel_x, pixel_y - line_len, pixel_x, pixel_y + line_len), fill=color, width=width)
 
-        # Outer Circle
+        # Circle
         r = 20
         draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=3)
 
         label = f"{act['type'].capitalize()}"
-        if act.get('text'): label += f": \"{act['text']}\""
+        if act.get('text') and len(act['text']) < 20:
+            label += f": \"{act['text']}\""
 
         text_pos = (pixel_x + 25, pixel_y - 15)
 
-        # Label with background
         try:
             bbox = draw.textbbox(text_pos, label, font=font)
             padded_bbox = (bbox[0]-4, bbox[1]-2, bbox[2]+4, bbox[3]+2)
             draw.rectangle(padded_bbox, fill="yellow", outline=color)
             draw.text(text_pos, label, fill="black", font=font)
-        except Exception as e:
+        except Exception:
             draw.text(text_pos, label, fill="white")
 
     return img_copy
 
-# --- Main Logic ---
-
 @spaces.GPU
 def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
     if input_numpy_image is None: return "⚠️ Please upload an image.", None
@@ -385,9 +415,8 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
     actions = []
     raw_response = ""
 
-    # --- Fara-7B Logic ---
     if model_choice == "Fara-7B":
-        if model_v is None: return "Error: Fara model failed to load on startup.", None
+        if model_v is None: return "Error: Fara model failed to load.", None
         print("Using Fara Pipeline...")
 
         messages = get_fara_prompt(task, input_pil_image)
@@ -411,7 +440,45 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
 
         actions = parse_fara_response(raw_response)
 
-    # --- Holo2-4B Logic ---
+    elif model_choice == "ActIO-UI-7B":
+        if model_a is None: return "Error: ActIO model failed to load.", None
+        print("Using ActIO-UI Pipeline...")
+
+        model, processor = model_a, processor_a
+        ip_params = get_image_proc_params(processor)
+
+        # Resize for performance and standard input compliance
+        resized_h, resized_w = smart_resize(
+            input_pil_image.height, input_pil_image.width,
+            factor=ip_params["patch_size"] * ip_params["merge_size"],
+            min_pixels=ip_params["min_pixels"],
+            max_pixels=ip_params["max_pixels"],
+        )
+        proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
+
+        messages = get_actio_prompt(task, proc_image)
+        text_prompt = apply_chat_template_compat(processor, messages)
+
+        # ActIO/Qwen processors usually handle image list via processor call
+        inputs = processor(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        with torch.no_grad():
+            generated_ids = model.generate(**inputs, max_new_tokens=512, do_sample=False)
+
+        generated_ids = trim_generated(generated_ids, inputs)
+        raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        actions = parse_actio_response(raw_response)
+
+        # Scale coordinates (Resized -> Original)
+        if resized_w > 0 and resized_h > 0:
+            scale_x = orig_w / resized_w
+            scale_y = orig_h / resized_h
+            for a in actions:
+                a['x'] = int(a['x'] * scale_x)
+                a['y'] = int(a['y'] * scale_y)
+
     elif model_choice == "Holo2-4B":
         if model_h is None: return "Error: Holo2 model failed to load.", None
         print("Using Holo2-4B Pipeline...")
@@ -419,7 +486,6 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
         model, processor = model_h, processor_h
         ip_params = get_image_proc_params(processor)
 
-        # Holo2 specific resize logic
         resized_h, resized_w = smart_resize(
             input_pil_image.height, input_pil_image.width,
             factor=ip_params["patch_size"] * ip_params["merge_size"],
@@ -429,8 +495,6 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
         proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
 
         messages = get_holo2_prompt(task, proc_image)
-
-        # Apply chat template with thinking=False for localization
         text_prompt = apply_chat_template_compat(processor, messages, thinking=False)
 
         inputs = processor(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
@@ -444,13 +508,11 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
 
         actions = parse_holo2_response(raw_response)
 
-        # Scale Holo2 coordinates (Normalized 0-1000 -> Original Pixel)
         for a in actions:
             if a.get('norm', False):
                 a['x'] = (a['x'] / 1000.0) * orig_w
                 a['y'] = (a['y'] / 1000.0) * orig_h
 
-    # --- UI-TARS Logic ---
     elif model_choice == "UI-TARS-1.5-7B":
         if model_x is None: return "Error: UI-TARS model failed to load.", None
         print("Using UI-TARS Pipeline...")
@@ -480,12 +542,10 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
 
         actions = parse_click_response(raw_response)
 
-        # Scale UI-TARS coordinates (Resized Pixel -> Original Pixel)
         if resized_w > 0 and resized_h > 0:
             scale_x = orig_w / resized_w
             scale_y = orig_h / resized_h
             for a in actions:
-                # UI-TARS output is in resized pixel coords
                 a['x'] = int(a['x'] * scale_x)
                 a['y'] = int(a['y'] * scale_y)
 
@@ -502,7 +562,6 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
 
     return raw_response, output_image
 
-# --- Gradio App ---
 css="""
 #col-container {
     margin: 0 auto;
@@ -512,7 +571,7 @@ css="""
 """
 with gr.Blocks() as demo:
     gr.Markdown("# **CUA GUI Operator 🖥️**", elem_id="main-title")
-    gr.Markdown("Perform Computer Use Agent tasks with the models: [Fara-7B](https://huggingface.co/microsoft/Fara-7B), [UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B), and [Holo2-4B](https://huggingface.co/Hcompany/Holo2-4B).")
+    gr.Markdown("Perform Computer Use Agent tasks with the models: [Fara-7B](https://huggingface.co/microsoft/Fara-7B), [UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B), [Holo2-4B](https://huggingface.co/Hcompany/Holo2-4B) and [ActIO-UI-7B](https://huggingface.co/Uniphore/actio-ui-7b-rlvr).")
 
     with gr.Row():
         with gr.Column(scale=2):
@@ -520,7 +579,7 @@ with gr.Blocks() as demo:
 
         with gr.Row():
             model_choice = gr.Radio(
-                choices=["Fara-7B", "UI-TARS-1.5-7B", "Holo2-4B"],
+                choices=["Fara-7B", "UI-TARS-1.5-7B", "Holo2-4B", "ActIO-UI-7B"],
                 label="Select Model",
                 value="Fara-7B",
                 interactive=True
@@ -548,6 +607,7 @@ with gr.Blocks() as demo:
             ["examples/1.png", "Click on the Fara-7B model.", "Fara-7B"],
             ["examples/2.png", "Click on the VLMs Collection", "UI-TARS-1.5-7B"],
             ["examples/3.png", "Click on the 'Real-time vision models' collection.", "Holo2-4B"],
+            ["examples/2.png", "Search for 'transformers'", "ActIO-UI-7B"],
         ],
         inputs=[input_image, task_input, model_choice],
         label="Quick Examples"