Spaces:

prithivMLmods
/

CUA-GUI-Operator

Running on Zero

App Files Community

prithivMLmods committed 6 days ago

Commit

aa8fa9a

verified ·

1 Parent(s): 089b6b1

update app..

Browse files

Files changed (1) hide show

app.py +109 -49

app.py CHANGED Viewed

@@ -16,7 +16,9 @@ from PIL import Image, ImageDraw, ImageFont
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
-    AutoModelForImageTextToText
 )
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 from qwen_vl_utils import process_vision_info
@@ -24,7 +26,6 @@ from qwen_vl_utils import process_vision_info
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
-# --- Theme Configuration ---
 colors.orange_red = colors.Color(
     name="orange_red",
     c50="#FFF0E5",
@@ -96,8 +97,6 @@ orange_red_theme = OrangeRedTheme()
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Running on device: {device}")
-# --- Model Loading ---
 print("🔄 Loading Fara-7B...")
 MODEL_ID_V = "microsoft/Fara-7B"
 try:
@@ -140,9 +139,22 @@ except Exception as e:
     model_h = None
     processor_h = None
-print("✅ Models loading sequence complete.")
-# --- Helper Functions ---
 def array_to_image(image_array: np.ndarray) -> Image.Image:
     if image_array is None: raise ValueError("No image provided.")
@@ -159,13 +171,13 @@ def get_image_proc_params(processor) -> Dict[str, int]:
     min_pixels = getattr(ip, "min_pixels", default_min)
     max_pixels = getattr(ip, "max_pixels", default_max)
-    # Holo2/Qwen specific sizing sometimes in 'size' dict
     size_config = getattr(ip, "size", {})
     if isinstance(size_config, dict):
         if "shortest_edge" in size_config:
-            min_pixels = size_config["shortest_edge"]
         if "longest_edge" in size_config:
-            max_pixels = size_config["longest_edge"]
     if min_pixels is None: min_pixels = default_min
     if max_pixels is None: max_pixels = default_max
@@ -178,12 +190,11 @@ def get_image_proc_params(processor) -> Dict[str, int]:
     }
 def apply_chat_template_compat(processor, messages: List[Dict[str, Any]], thinking: bool = True) -> str:
-    # Holo2 specific: allows turning thinking off in template
     if hasattr(processor, "apply_chat_template"):
         try:
             return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, thinking=thinking)
         except TypeError:
-            # Fallback for processors that don't support 'thinking' kwarg
             return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     tok = getattr(processor, "tokenizer", None)
@@ -200,8 +211,6 @@ def trim_generated(generated_ids, inputs):
         return generated_ids
     return [out_ids[len(in_seq):] for in_seq, out_ids in zip(in_ids, generated_ids)]
-# --- Prompts ---
 def get_fara_prompt(task, image):
     OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
     You need to generate the next action to complete the task.
@@ -233,7 +242,6 @@ def get_localization_prompt(task, image):
     ]
 def get_holo2_prompt(task, image):
-    # JSON Schema representation for prompt
     schema_str = '{"properties": {"x": {"description": "The x coordinate, normalized between 0 and 1000.", "ge": 0, "le": 1000, "title": "X", "type": "integer"}, "y": {"description": "The y coordinate, normalized between 0 and 1000.", "ge": 0, "le": 1000, "title": "Y", "type": "integer"}}, "required": ["x", "y"], "title": "ClickCoordinates", "type": "object"}'
     prompt = f"""Localize an element on the GUI image according to the provided target and output a click position.
@@ -250,13 +258,32 @@ def get_holo2_prompt(task, image):
         },
     ]
-# --- Parsing ---
 def parse_click_response(text: str) -> List[Dict]:
     actions = []
     text = text.strip()
-    # Generic Point parsing
     matches_click = re.findall(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE)
     for m in matches_click:
         actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": "", "norm": False})
@@ -269,7 +296,6 @@ def parse_click_response(text: str) -> List[Dict]:
     for m in matches_box:
         actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": "", "norm": False})
-    # Fallback tuple
     if not actions:
         matches_tuple = re.findall(r"(?:^|\s)\(\s*(\d+)\s*,\s*(\d+)\s*\)(?:$|\s|,)", text)
         for m in matches_tuple:
@@ -298,11 +324,6 @@ def parse_fara_response(response: str) -> List[Dict]:
 def parse_holo2_response(response: str) -> List[Dict]:
     actions = []
-    # Attempt to find JSON object structure { "x": ..., "y": ... }
-    # Holo2 may output thinking blocks, but we set thinking=False.
-    # Just in case, regex search for the json pattern.
-    # Look for pure JSON first
     try:
         data = json.loads(response.strip())
         if 'x' in data and 'y' in data:
@@ -311,7 +332,6 @@ def parse_holo2_response(response: str) -> List[Dict]:
     except:
         pass
-    # Regex search if embedded in text
     match = re.search(r"\{\s*['\"]x['\"]\s*:\s*(\d+)\s*,\s*['\"]y['\"]\s*:\s*(\d+)\s*\}", response)
     if match:
         actions.append({
@@ -319,13 +339,27 @@ def parse_holo2_response(response: str) -> List[Dict]:
             "x": int(match.group(1)),
             "y": int(match.group(2)),
             "text": "Holo2",
-            "norm": True # Flag indicating 0-1000 scale
         })
-        return actions
     return actions
-# --- Visualization ---
 def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
     if not actions: return None
@@ -345,36 +379,32 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
         color = 'red' if 'click' in act['type'].lower() else 'blue'
-        # Draw Crosshair
         line_len = 15
         width = 4
-        # Horizontal
         draw.line((pixel_x - line_len, pixel_y, pixel_x + line_len, pixel_y), fill=color, width=width)
-        # Vertical
         draw.line((pixel_x, pixel_y - line_len, pixel_x, pixel_y + line_len), fill=color, width=width)
-        # Outer Circle
         r = 20
         draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=3)
         label = f"{act['type'].capitalize()}"
-        if act.get('text'): label += f": \"{act['text']}\""
         text_pos = (pixel_x + 25, pixel_y - 15)
-        # Label with background
         try:
             bbox = draw.textbbox(text_pos, label, font=font)
             padded_bbox = (bbox[0]-4, bbox[1]-2, bbox[2]+4, bbox[3]+2)
             draw.rectangle(padded_bbox, fill="yellow", outline=color)
             draw.text(text_pos, label, fill="black", font=font)
-        except Exception as e:
             draw.text(text_pos, label, fill="white")
     return img_copy
-# --- Main Logic ---
 @spaces.GPU
 def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
     if input_numpy_image is None: return "⚠️ Please upload an image.", None
@@ -385,9 +415,8 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
     actions = []
     raw_response = ""
-    # --- Fara-7B Logic ---
     if model_choice == "Fara-7B":
-        if model_v is None: return "Error: Fara model failed to load on startup.", None
         print("Using Fara Pipeline...")
         messages = get_fara_prompt(task, input_pil_image)
@@ -411,7 +440,45 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
         actions = parse_fara_response(raw_response)
-    # --- Holo2-4B Logic ---
     elif model_choice == "Holo2-4B":
         if model_h is None: return "Error: Holo2 model failed to load.", None
         print("Using Holo2-4B Pipeline...")
@@ -419,7 +486,6 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
         model, processor = model_h, processor_h
         ip_params = get_image_proc_params(processor)
-        # Holo2 specific resize logic
         resized_h, resized_w = smart_resize(
             input_pil_image.height, input_pil_image.width,
             factor=ip_params["patch_size"] * ip_params["merge_size"],
@@ -429,8 +495,6 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
         proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
         messages = get_holo2_prompt(task, proc_image)
-        # Apply chat template with thinking=False for localization
         text_prompt = apply_chat_template_compat(processor, messages, thinking=False)
         inputs = processor(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
@@ -444,13 +508,11 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
         actions = parse_holo2_response(raw_response)
-        # Scale Holo2 coordinates (Normalized 0-1000 -> Original Pixel)
         for a in actions:
             if a.get('norm', False):
                 a['x'] = (a['x'] / 1000.0) * orig_w
                 a['y'] = (a['y'] / 1000.0) * orig_h
-    # --- UI-TARS Logic ---
     elif model_choice == "UI-TARS-1.5-7B":
         if model_x is None: return "Error: UI-TARS model failed to load.", None
         print("Using UI-TARS Pipeline...")
@@ -480,12 +542,10 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
         actions = parse_click_response(raw_response)
-        # Scale UI-TARS coordinates (Resized Pixel -> Original Pixel)
         if resized_w > 0 and resized_h > 0:
             scale_x = orig_w / resized_w
             scale_y = orig_h / resized_h
             for a in actions:
-                # UI-TARS output is in resized pixel coords
                 a['x'] = int(a['x'] * scale_x)
                 a['y'] = int(a['y'] * scale_y)
@@ -502,7 +562,6 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
     return raw_response, output_image
-# --- Gradio App ---
 css="""
 #col-container {
     margin: 0 auto;
@@ -512,7 +571,7 @@ css="""
 """
 with gr.Blocks() as demo:
     gr.Markdown("# **CUA GUI Operator 🖥️**", elem_id="main-title")
-    gr.Markdown("Perform Computer Use Agent tasks with the models: [Fara-7B](https://huggingface.co/microsoft/Fara-7B), [UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B), and [Holo2-4B](https://huggingface.co/Hcompany/Holo2-4B).")
     with gr.Row():
         with gr.Column(scale=2):
@@ -520,7 +579,7 @@ with gr.Blocks() as demo:
             with gr.Row():
                 model_choice = gr.Radio(
-                    choices=["Fara-7B", "UI-TARS-1.5-7B", "Holo2-4B"],
                     label="Select Model",
                     value="Fara-7B",
                     interactive=True
@@ -548,6 +607,7 @@ with gr.Blocks() as demo:
             ["examples/1.png", "Click on the Fara-7B model.", "Fara-7B"],
             ["examples/2.png", "Click on the VLMs Collection", "UI-TARS-1.5-7B"],
             ["examples/3.png", "Click on the 'Real-time vision models' collection.", "Holo2-4B"],
         ],
         inputs=[input_image, task_input, model_choice],
         label="Quick Examples"

 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
+    AutoModelForImageTextToText,
+    AutoModelForVision2Seq,
+    AutoTokenizer
 )
 from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 from qwen_vl_utils import process_vision_info
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 colors.orange_red = colors.Color(
     name="orange_red",
     c50="#FFF0E5",
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Running on device: {device}")
 print("🔄 Loading Fara-7B...")
 MODEL_ID_V = "microsoft/Fara-7B"
 try:
     model_h = None
     processor_h = None
+# Load the ActIO-UI-7B processor and model when the module is imported.
+# If anything in the try block raises, the error is printed and both
+# model_a and processor_a are set to None; process_screenshot checks
+# model_a for None before running the ActIO pipeline.
+print("🔄 Loading ActIO-UI-7B...")
+MODEL_ID_A = "Uniphore/actio-ui-7b-rlvr"
+try:
+    processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
+    model_a = AutoModelForVision2Seq.from_pretrained(
+        MODEL_ID_A,
+        trust_remote_code=True,
+        torch_dtype="auto",
+        device_map=device
+    ).eval()
+except Exception as e:
+    # Covers a failure in either from_pretrained call (or .eval()); both
+    # names are cleared so a half-loaded processor/model pair is never used.
+    print(f"Failed to load ActIO: {e}")
+    model_a = None
+    processor_a = None
+# Printed whether or not the load above succeeded.
+print("✅ Models loading sequence complete.")
 def array_to_image(image_array: np.ndarray) -> Image.Image:
     if image_array is None: raise ValueError("No image provided.")
     min_pixels = getattr(ip, "min_pixels", default_min)
     max_pixels = getattr(ip, "max_pixels", default_max)
+    # Some configs hide size in a dict
     size_config = getattr(ip, "size", {})
     if isinstance(size_config, dict):
         if "shortest_edge" in size_config:
+            min_pixels = size_config.get("shortest_edge", default_min)
         if "longest_edge" in size_config:
+            max_pixels = size_config.get("longest_edge", default_max)
     if min_pixels is None: min_pixels = default_min
     if max_pixels is None: max_pixels = default_max
     }
 def apply_chat_template_compat(processor, messages: List[Dict[str, Any]], thinking: bool = True) -> str:
+    # Handles compat for models that support/don't support the 'thinking' arg
     if hasattr(processor, "apply_chat_template"):
         try:
             return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, thinking=thinking)
         except TypeError:
             return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     tok = getattr(processor, "tokenizer", None)
         return generated_ids
     return [out_ids[len(in_seq):] for in_seq, out_ids in zip(in_ids, generated_ids)]
 def get_fara_prompt(task, image):
     OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
     You need to generate the next action to complete the task.
     ]
 def get_holo2_prompt(task, image):
     schema_str = '{"properties": {"x": {"description": "The x coordinate, normalized between 0 and 1000.", "ge": 0, "le": 1000, "title": "X", "type": "integer"}, "y": {"description": "The y coordinate, normalized between 0 and 1000.", "ge": 0, "le": 1000, "title": "Y", "type": "integer"}}, "required": ["x", "y"], "title": "ClickCoordinates", "type": "object"}'
     prompt = f"""Localize an element on the GUI image according to the provided target and output a click position.
         },
     ]
+def get_actio_prompt(task, image):
+    """Build the chat messages sent to ActIO-UI for one task.
+
+    Returns a two-element list: a system message describing the GUI-agent
+    role, and a user message whose content is the format request with
+    ``task`` appended, followed by the ``image`` entry.
+    """
+    agent_role = (
+        "You are a GUI agent. You are given a task and a screenshot of the screen. "
+        "You need to perform a series of pyautogui actions to complete the task."
+    )
+    # The request asks for "<action>(x, y)" output; the task text is appended
+    # directly after the colon.
+    request = (
+        "Please perform the following task by providing the action and the coordinates "
+        "in the format of <action>(x, y): "
+    ) + task
+    system_message = {
+        "role": "system",
+        "content": [{"type": "text", "text": agent_role}],
+    }
+    user_message = {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": request},
+            {"type": "image", "image": image},
+        ],
+    }
+    return [system_message, user_message]
 def parse_click_response(text: str) -> List[Dict]:
     actions = []
     text = text.strip()
     matches_click = re.findall(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE)
     for m in matches_click:
         actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": "", "norm": False})
     for m in matches_box:
         actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": "", "norm": False})
     if not actions:
         matches_tuple = re.findall(r"(?:^|\s)\(\s*(\d+)\s*,\s*(\d+)\s*\)(?:$|\s|,)", text)
         for m in matches_tuple:
 def parse_holo2_response(response: str) -> List[Dict]:
     actions = []
     try:
         data = json.loads(response.strip())
         if 'x' in data and 'y' in data:
     except:
         pass
     match = re.search(r"\{\s*['\"]x['\"]\s*:\s*(\d+)\s*,\s*['\"]y['\"]\s*:\s*(\d+)\s*\}", response)
     if match:
         actions.append({
             "x": int(match.group(1)),
             "y": int(match.group(2)),
             "text": "Holo2",
+            "norm": True # 0-1000 scale
         })
     return actions
+def parse_actio_response(text: str) -> List[Dict]:
+    """Extract ``name(x, y)`` actions from an ActIO response.
+
+    Each occurrence of an identifier made of letters/underscores followed by
+    two comma-separated unsigned integers in parentheses (for example
+    ``click(500, 300)``) yields one action dict. Whitespace around the
+    parentheses, numbers and comma is tolerated; anything else (decimals,
+    keyword arguments, extra arguments) is not matched.
+
+    Every returned dict stores the whole stripped response under "text" and
+    has "norm" set to False. An input with no match returns an empty list.
+    """
+    response = text.strip()
+    action_re = r"([a-zA-Z_]+)\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)"
+    return [
+        {
+            "type": found.group(1),
+            "x": int(found.group(2)),
+            "y": int(found.group(3)),
+            "text": response,
+            # Not a 0-1000 normalised value; presumably absolute pixels of
+            # the image given to the model - confirm against model output.
+            "norm": False,
+        }
+        for found in re.finditer(action_re, response)
+    ]
 def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
     if not actions: return None
         color = 'red' if 'click' in act['type'].lower() else 'blue'
+        # Crosshair
         line_len = 15
         width = 4
         draw.line((pixel_x - line_len, pixel_y, pixel_x + line_len, pixel_y), fill=color, width=width)
         draw.line((pixel_x, pixel_y - line_len, pixel_x, pixel_y + line_len), fill=color, width=width)
+        # Circle
         r = 20
         draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=3)
         label = f"{act['type'].capitalize()}"
+        if act.get('text') and len(act['text']) < 20:
+            label += f": \"{act['text']}\""
         text_pos = (pixel_x + 25, pixel_y - 15)
         try:
             bbox = draw.textbbox(text_pos, label, font=font)
             padded_bbox = (bbox[0]-4, bbox[1]-2, bbox[2]+4, bbox[3]+2)
             draw.rectangle(padded_bbox, fill="yellow", outline=color)
             draw.text(text_pos, label, fill="black", font=font)
+        except Exception:
             draw.text(text_pos, label, fill="white")
     return img_copy
 @spaces.GPU
 def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
     if input_numpy_image is None: return "⚠️ Please upload an image.", None
     actions = []
     raw_response = ""
     if model_choice == "Fara-7B":
+        if model_v is None: return "Error: Fara model failed to load.", None
         print("Using Fara Pipeline...")
         messages = get_fara_prompt(task, input_pil_image)
         actions = parse_fara_response(raw_response)
+    elif model_choice == "ActIO-UI-7B":
+        if model_a is None: return "Error: ActIO model failed to load.", None
+        print("Using ActIO-UI Pipeline...")
+        model, processor = model_a, processor_a
+        ip_params = get_image_proc_params(processor)
+        # Resize for performance and standard input compliance
+        resized_h, resized_w = smart_resize(
+            input_pil_image.height, input_pil_image.width,
+            factor=ip_params["patch_size"] * ip_params["merge_size"],
+            min_pixels=ip_params["min_pixels"],
+            max_pixels=ip_params["max_pixels"],
+        )
+        proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
+        messages = get_actio_prompt(task, proc_image)
+        text_prompt = apply_chat_template_compat(processor, messages)
+        # ActIO/Qwen processors usually handle image list via processor call
+        inputs = processor(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        with torch.no_grad():
+            generated_ids = model.generate(**inputs, max_new_tokens=512, do_sample=False)
+        generated_ids = trim_generated(generated_ids, inputs)
+        raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        actions = parse_actio_response(raw_response)
+        # Scale coordinates (Resized -> Original)
+        if resized_w > 0 and resized_h > 0:
+            scale_x = orig_w / resized_w
+            scale_y = orig_h / resized_h
+            for a in actions:
+                a['x'] = int(a['x'] * scale_x)
+                a['y'] = int(a['y'] * scale_y)
     elif model_choice == "Holo2-4B":
         if model_h is None: return "Error: Holo2 model failed to load.", None
         print("Using Holo2-4B Pipeline...")
         model, processor = model_h, processor_h
         ip_params = get_image_proc_params(processor)
         resized_h, resized_w = smart_resize(
             input_pil_image.height, input_pil_image.width,
             factor=ip_params["patch_size"] * ip_params["merge_size"],
         proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
         messages = get_holo2_prompt(task, proc_image)
         text_prompt = apply_chat_template_compat(processor, messages, thinking=False)
         inputs = processor(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
         actions = parse_holo2_response(raw_response)
         for a in actions:
             if a.get('norm', False):
                 a['x'] = (a['x'] / 1000.0) * orig_w
                 a['y'] = (a['y'] / 1000.0) * orig_h
     elif model_choice == "UI-TARS-1.5-7B":
         if model_x is None: return "Error: UI-TARS model failed to load.", None
         print("Using UI-TARS Pipeline...")
         actions = parse_click_response(raw_response)
         if resized_w > 0 and resized_h > 0:
             scale_x = orig_w / resized_w
             scale_y = orig_h / resized_h
             for a in actions:
                 a['x'] = int(a['x'] * scale_x)
                 a['y'] = int(a['y'] * scale_y)
     return raw_response, output_image
 css="""
 #col-container {
     margin: 0 auto;
 """
 with gr.Blocks() as demo:
     gr.Markdown("# **CUA GUI Operator 🖥️**", elem_id="main-title")
+    gr.Markdown("Perform Computer Use Agent tasks with the models: [Fara-7B](https://huggingface.co/microsoft/Fara-7B), [UI-TARS-1.5-7B](https://huggingface.co/ByteDance-Seed/UI-TARS-1.5-7B), [Holo2-4B](https://huggingface.co/Hcompany/Holo2-4B) and [ActIO-UI-7B](https://huggingface.co/Uniphore/actio-ui-7b-rlvr).")
     with gr.Row():
         with gr.Column(scale=2):
             with gr.Row():
                 model_choice = gr.Radio(
+                    choices=["Fara-7B", "UI-TARS-1.5-7B", "Holo2-4B", "ActIO-UI-7B"],
                     label="Select Model",
                     value="Fara-7B",
                     interactive=True
             ["examples/1.png", "Click on the Fara-7B model.", "Fara-7B"],
             ["examples/2.png", "Click on the VLMs Collection", "UI-TARS-1.5-7B"],
             ["examples/3.png", "Click on the 'Real-time vision models' collection.", "Holo2-4B"],
+            ["examples/2.png", "Search for 'transformers'", "ActIO-UI-7B"],
         ],
         inputs=[input_image, task_input, model_choice],
         label="Quick Examples"