Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -5,8 +5,7 @@ import time
 import unicodedata
 import gc
 from io import BytesIO
-from typing import Iterable
-from typing import Tuple, Optional, List, Dict, Any
+from typing import Iterable, Tuple, Optional, List, Dict, Any
 
 import gradio as gr
 import numpy as np
@@ -99,7 +98,8 @@ print(f"Running on device: {device}")
 
 # --- Load Fara-7B ---
 print("🚀 Loading Fara-7B...")
-MODEL_ID_V = "microsoft/Fara-7B"
+MODEL_ID_V = "microsoft/Fara-7B"
+# Note: Ensure this ID is accessible. If private, use "Qwen/Qwen2.5-VL-7B-Instruct" as fallback for testing.
 try:
     processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
     model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -116,7 +116,6 @@ except Exception as e:
 print("🚀 Loading UI-TARS-1.5-7B...")
 MODEL_ID_X = "ByteDance-Seed/UI-TARS-1.5-7B"
 try:
-    # Important: use_fast=False is often required for custom tokenizers
     processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True, use_fast=False)
     model_x = AutoModelForImageTextToText.from_pretrained(
         MODEL_ID_X,
@@ -128,9 +127,9 @@ except Exception as e:
     model_x = None
     processor_x = None
 
-# --- Load Holo2-8B ---
+# --- Load Holo2-8B (Using Holo1-3B ID as per previous context) ---
 print("🚀 Loading Holo2-8B...")
-MODEL_ID_H = "Hcompany/Holo2-8B"
+MODEL_ID_H = "Hcompany/Holo1-3B"
 try:
     processor_h = AutoProcessor.from_pretrained(MODEL_ID_H, trust_remote_code=True)
     model_h = AutoModelForImageTextToText.from_pretrained(
@@ -139,7 +138,7 @@ try:
         torch_dtype=torch.float16
     ).to(device).eval()
 except Exception as e:
-    print(f"Failed to load Holo2-8B: {e}")
+    print(f"Failed to load Holo: {e}")
     model_h = None
     processor_h = None
 
@@ -147,70 +146,58 @@ print("✅ Models loading sequence complete.")
 
 
 # -----------------------------------------------------------------------------
-# 3. UTILS &
+# 3. UTILS & HELPERS
 # -----------------------------------------------------------------------------
 
 def array_to_image(image_array: np.ndarray) -> Image.Image:
     if image_array is None: raise ValueError("No image provided.")
     return Image.fromarray(np.uint8(image_array))
 
-
-def get_fara_prompt(task, image):
-    OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
-You need to generate the next action to complete the task.
-Output your action inside a <tool_call> block using JSON format.
-Include "coordinate": [x, y] in pixels for interactions.
-Examples:
-<tool_call>{"name": "User", "arguments": {"action": "click", "coordinate": [400, 300]}}</tool_call>
-<tool_call>{"name": "User", "arguments": {"action": "type", "coordinate": [100, 200], "text": "hello"}}</tool_call>
-"""
-    return [
-        {"role": "system", "content": [{"type": "text", "text": OS_SYSTEM_PROMPT}]},
-        {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": f"Instruction: {task}"}]},
-    ]
-
-
-def get_localization_prompt(task, image):
-    guidelines = (
-        "Localize an element on the GUI image according to my instructions and "
-        "output a click position as Click(x, y) with x num pixels from the left edge "
-        "and y num pixels from the top edge."
-    )
-    return [
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": image},
-                {"type": "text", "text": f"{guidelines}\n{task}"}
-            ],
-        }
-    ]
+def get_image_proc_params(processor) -> Dict[str, int]:
+    """
+    Robustly retrieve image processing parameters, handling NoneTypes.
+    This fixes the 'TypeError: > not supported between int and NoneType' error.
+    """
+    ip = getattr(processor, "image_processor", None)
+
+    # Default fallback values for Qwen2-VL architecture
+    default_min = 256 * 256
+    default_max = 1280 * 1280
+
+    patch_size = getattr(ip, "patch_size", 14)
+    merge_size = getattr(ip, "merge_size", 2)
+    min_pixels = getattr(ip, "min_pixels", default_min)
+    max_pixels = getattr(ip, "max_pixels", default_max)
+
+    # Explicit check because sometimes getattr returns None if the config key exists but is null
+    if min_pixels is None: min_pixels = default_min
+    if max_pixels is None: max_pixels = default_max
 
-def get_image_proc_params(processor) -> Dict[str, int]:
-    ip = getattr(processor, "image_processor", None)
     return {
-        "patch_size":
-        "merge_size":
-        "min_pixels":
-        "max_pixels":
+        "patch_size": patch_size,
+        "merge_size": merge_size,
+        "min_pixels": min_pixels,
+        "max_pixels": max_pixels,
     }
 
-# --- Chat/template helpers ---
 def apply_chat_template_compat(processor, messages: List[Dict[str, Any]]) -> str:
+    """Helper to apply chat templates across different model types/versions."""
     tok = getattr(processor, "tokenizer", None)
     if hasattr(processor, "apply_chat_template"):
         return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     if tok is not None and hasattr(tok, "apply_chat_template"):
         return tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
+
+    # Fallback manual construction
    texts = []
     for m in messages:
-        …
-        …
+        content = m.get("content", "")
+        if isinstance(content, list):
+            for c in content:
                 if isinstance(c, dict) and c.get("type") == "text":
                     texts.append(c.get("text", ""))
-        elif isinstance(
-            texts.append(
+        elif isinstance(content, str):
+            texts.append(content)
     return "\n".join(texts)
 
 def batch_decode_compat(processor, token_id_batches, **kw):
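A quick illustration of the pitfall the new get_image_proc_params guards against: getattr only falls back to its default when the attribute is missing, not when it exists and is set to None. A minimal sketch, with types.SimpleNamespace standing in for the real processor config object:

from types import SimpleNamespace

# The attribute exists but is null, as when a config key is present but unset.
ip = SimpleNamespace(min_pixels=None)

min_pixels = getattr(ip, "min_pixels", 256 * 256)
print(min_pixels)  # None -- the default was NOT applied, so `h * w > min_pixels` would raise TypeError

# The explicit guard from the patch restores a usable value:
if min_pixels is None:
    min_pixels = 256 * 256
print(min_pixels)  # 65536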
@@ -229,9 +216,44 @@ def trim_generated(generated_ids, inputs):
         return generated_ids
     return [out_ids[len(in_seq):] for in_seq, out_ids in zip(in_ids, generated_ids)]
 
+# -----------------------------------------------------------------------------
+# 4. PROMPTS
+# -----------------------------------------------------------------------------
+
+# --- Fara Prompt ---
+def get_fara_prompt(task, image):
+    OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
+You need to generate the next action to complete the task.
+Output your action inside a <tool_call> block using JSON format.
+Include "coordinate": [x, y] in pixels for interactions.
+Examples:
+<tool_call>{"name": "User", "arguments": {"action": "click", "coordinate": [400, 300]}}</tool_call>
+<tool_call>{"name": "User", "arguments": {"action": "type", "coordinate": [100, 200], "text": "hello"}}</tool_call>
+"""
+    return [
+        {"role": "system", "content": [{"type": "text", "text": OS_SYSTEM_PROMPT}]},
+        {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": f"Instruction: {task}"}]},
+    ]
+
+# --- UI-TARS & Holo Prompt ---
+def get_localization_prompt(task, image):
+    guidelines = (
+        "Localize an element on the GUI image according to my instructions and "
+        "output a click position as Click(x, y) with x num pixels from the left edge "
+        "and y num pixels from the top edge."
+    )
+    return [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": f"{guidelines}\n{task}"}
+            ],
+        }
+    ]
 
 # -----------------------------------------------------------------------------
-#
+# 5. PARSING & VISUALIZATION
 # -----------------------------------------------------------------------------
 
 def parse_click_response(text: str) -> List[Dict]:
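As a sanity check of the two prompt builders above, the message structure can be inspected without loading any model. A minimal sketch: the placeholder string stands in for a PIL.Image, and the bare SimpleNamespace stands in for a processor with neither apply_chat_template nor a tokenizer, which exercises the manual fallback in apply_chat_template_compat:

from types import SimpleNamespace

messages = get_localization_prompt("Click the Submit button", image="<screenshot>")
# [{'role': 'user',
#   'content': [{'type': 'image', 'image': '<screenshot>'},
#               {'type': 'text', 'text': 'Localize an element on the GUI image ...\nClick the Submit button'}]}]

# With no chat template available, the compat helper joins only the text parts:
print(apply_chat_template_compat(SimpleNamespace(tokenizer=None), messages))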
@@ -255,6 +277,12 @@ def parse_click_response(text: str) -> List[Dict]:
     matches_box = re.findall(r"start_box=['\"]?\(\s*(\d+)\s*,\s*(\d+)\s*\)['\"]?", text, re.IGNORECASE)
     for m in matches_box:
         actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": ""})
+
+    # Regex 4: Simple tuple (x,y) - Often seen in your error logs
+    # We look for a standalone tuple pattern
+    matches_tuple = re.findall(r"(?:^|\s)\(\s*(\d+)\s*,\s*(\d+)\s*\)(?:$|\s|,)", text)
+    for m in matches_tuple:
+        actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": ""})
 
     # Remove duplicates
     unique_actions = []
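The added tuple pattern can be exercised in isolation. A minimal check of two formats the parser now covers (the sample strings are illustrative, not captured model output):

import re

box_re = r"start_box=['\"]?\(\s*(\d+)\s*,\s*(\d+)\s*\)['\"]?"
tuple_re = r"(?:^|\s)\(\s*(\d+)\s*,\s*(\d+)\s*\)(?:$|\s|,)"

print(re.findall(box_re, 'click(start_box="(112, 340)")', re.IGNORECASE))  # [('112', '340')]
print(re.findall(tuple_re, "I would click (512, 384) to open it."))        # [('512', '384')]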
@@ -291,10 +319,9 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
     if not actions: return None
     img_copy = original_image.copy()
     draw = ImageDraw.Draw(img_copy)
-    width, height = img_copy.size
 
     try:
-        font = ImageFont.load_default(size=
+        font = ImageFont.load_default(size=18)
     except IOError:
         font = ImageFont.load_default()
 
@@ -322,20 +349,21 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
 
         text_pos = (pixel_x + 25, pixel_y - 15)
 
-        # Draw text with background
+        # Draw text with background (Bounding Box) to make it readable
         try:
             bbox = draw.textbbox(text_pos, label, font=font)
-            draw.rectangle(bbox, fill="black")
+            # Add padding to bbox
+            padded_bbox = (bbox[0]-4, bbox[1]-2, bbox[2]+4, bbox[3]+2)
+            draw.rectangle(padded_bbox, fill="black", outline=color)
             draw.text(text_pos, label, fill="white", font=font)
         except Exception as e:
-
-            # Fallback if font loading/drawing fails
+            # Fallback
             draw.text(text_pos, label, fill="white")
 
     return img_copy
 
 # -----------------------------------------------------------------------------
-#
+# 6. CORE LOGIC
 # -----------------------------------------------------------------------------
 
 @spaces.GPU(duration=120)
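The padded-textbbox technique added above works standalone; a minimal Pillow-only sketch (label text, position, and colors are arbitrary):

from PIL import Image, ImageDraw, ImageFont

img = Image.new("RGB", (320, 120), "gray")
draw = ImageDraw.Draw(img)
font = ImageFont.load_default()

text_pos, label, color = (40, 50), "1. click", "red"
bbox = draw.textbbox(text_pos, label, font=font)                    # tight box around the text
padded_bbox = (bbox[0] - 4, bbox[1] - 2, bbox[2] + 4, bbox[3] + 2)  # breathing room
draw.rectangle(padded_bbox, fill="black", outline=color)            # background first
draw.text(text_pos, label, fill="white", font=font)                 # label on top
img.save("label_demo.png")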
@@ -352,6 +380,8 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str
     if model_choice == "Fara-7B":
         if model_v is None: return "Error: Fara model failed to load on startup.", None
         print("Using Fara Pipeline...")
+
+        # Fara pipeline uses process_vision_info
         messages = get_fara_prompt(task, input_pil_image)
         text_prompt = processor_v.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         image_inputs, video_inputs = process_vision_info(messages)
@@ -386,12 +416,15 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str
     else:
         return f"Error: Unknown model '{model_choice}'", None
 
-    # 1. Smart Resize
+    # 1. Smart Resize
+    # We call our robust get_image_proc_params here to avoid the TypeError
     ip_params = get_image_proc_params(processor)
+
     resized_h, resized_w = smart_resize(
         input_pil_image.height, input_pil_image.width,
         factor=ip_params["patch_size"] * ip_params["merge_size"],
-        min_pixels=ip_params["min_pixels"],
+        min_pixels=ip_params["min_pixels"],
+        max_pixels=ip_params["max_pixels"]
     )
     proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
 
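For context on what the call above computes: qwen_vl_utils.smart_resize keeps the aspect ratio, snaps both sides to multiples of factor (patch_size * merge_size), and scales the area into the [min_pixels, max_pixels] budget. A simplified sketch of that contract, written from its documented behavior rather than copied from the library:

import math

def smart_resize_sketch(height, width, factor=28, min_pixels=256 * 256, max_pixels=1280 * 1280):
    # Snap each side to the nearest multiple of `factor`.
    h = round(height / factor) * factor
    w = round(width / factor) * factor
    if h * w > max_pixels:
        # Too large: shrink so the area fits, rounding down to the grid.
        beta = math.sqrt((height * width) / max_pixels)
        h = math.floor(height / beta / factor) * factor
        w = math.floor(width / beta / factor) * factor
    elif h * w < min_pixels:
        # Too small: enlarge to meet the minimum, rounding up to the grid.
        beta = math.sqrt(min_pixels / (height * width))
        h = math.ceil(height * beta / factor) * factor
        w = math.ceil(width * beta / factor) * factor
    return h, w

print(smart_resize_sketch(1080, 1920))  # (952, 1680): multiples of 28 within the pixel budget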
@@ -414,6 +447,7 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str
     actions = parse_click_response(raw_response)
 
     # 6. Rescale Coordinates back to Original Image Size
+    # The model saw 'resized_w' x 'resized_h', coordinates are likely in that scale
     if resized_w > 0 and resized_h > 0:
         scale_x = orig_w / resized_w
         scale_y = orig_h / resized_h
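The rescaling itself is plain proportional mapping; a worked example with assumed sizes (a 1920x1080 screenshot the model saw downscaled to 1680x952):

orig_w, orig_h = 1920, 1080
resized_w, resized_h = 1680, 952

scale_x = orig_w / resized_w   # ~1.143
scale_y = orig_h / resized_h   # ~1.134

# A click the model reports at (840, 476) on the resized image
# maps back onto the original screenshot at:
print(int(840 * scale_x), int(476 * scale_y))  # 960 540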
|
| 433 |
return raw_response, output_image
|
| 434 |
|
| 435 |
# -----------------------------------------------------------------------------
|
| 436 |
-
#
|
| 437 |
# -----------------------------------------------------------------------------
|
| 438 |
|
| 439 |
with gr.Blocks(theme=steel_blue_theme, css=css) as demo:
|
|
|