prithivMLmods committed on
Commit
9c44f17
·
verified ·
1 Parent(s): 6cd9363

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -156
app.py CHANGED
@@ -5,7 +5,8 @@ import time
5
  import unicodedata
6
  import gc
7
  from io import BytesIO
8
- from typing import Iterable, Tuple, Optional, List, Dict, Any
 
9
 
10
  import gradio as gr
11
  import numpy as np
@@ -115,6 +116,7 @@ except Exception as e:
115
  print("🔄 Loading UI-TARS-1.5-7B...")
116
  MODEL_ID_X = "ByteDance-Seed/UI-TARS-1.5-7B"
117
  try:
 
118
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True, use_fast=False)
119
  model_x = AutoModelForImageTextToText.from_pretrained(
120
  MODEL_ID_X,
@@ -126,80 +128,16 @@ except Exception as e:
126
  model_x = None
127
  processor_x = None
128
 
129
- # --- Load Holo2-8B ---
130
- print("🔄 Loading Holo2-8B...")
131
- MODEL_ID_H = "Hcompany/Holo2-8B"
132
- try:
133
- processor_h = AutoProcessor.from_pretrained(MODEL_ID_H, trust_remote_code=True)
134
- model_h = AutoModelForImageTextToText.from_pretrained(
135
- MODEL_ID_H,
136
- trust_remote_code=True,
137
- torch_dtype=torch.float16
138
- ).to(device).eval()
139
- except Exception as e:
140
- print(f"Failed to load Holo2: {e}")
141
- model_h = None
142
- processor_h = None
143
-
144
  print("✅ Models loading sequence complete.")
145
 
146
  # -----------------------------------------------------------------------------
147
- # 3. UTILS & HELPERS
148
  # -----------------------------------------------------------------------------
149
 
150
  def array_to_image(image_array: np.ndarray) -> Image.Image:
151
  if image_array is None: raise ValueError("No image provided.")
152
  return Image.fromarray(np.uint8(image_array))
153
 
154
- # --- Compatibility Helpers ---
155
- def apply_chat_template_compat(processor, messages: List[Dict[str, Any]]) -> str:
156
- """Helper to handle chat template application across different processors"""
157
- tok = getattr(processor, "tokenizer", None)
158
- if hasattr(processor, "apply_chat_template"):
159
- return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
160
- if tok is not None and hasattr(tok, "apply_chat_template"):
161
- return tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
162
-
163
- # Fallback if no template method found
164
- texts = []
165
- for m in messages:
166
- for c in m.get("content", []):
167
- if isinstance(c, dict) and c.get("type") == "text":
168
- texts.append(c.get("text", ""))
169
- return "\n".join(texts)
170
-
171
- def batch_decode_compat(processor, token_id_batches, **kw):
172
- """Helper to handle batch decoding"""
173
- tok = getattr(processor, "tokenizer", None)
174
- if tok is not None and hasattr(tok, "batch_decode"):
175
- return tok.batch_decode(token_id_batches, **kw)
176
- if hasattr(processor, "batch_decode"):
177
- return processor.batch_decode(token_id_batches, **kw)
178
- raise AttributeError("No batch_decode available on processor or tokenizer.")
179
-
180
- def trim_generated(generated_ids, inputs):
181
- """Removes input tokens from output if necessary"""
182
- in_ids = getattr(inputs, "input_ids", None)
183
- if in_ids is None and isinstance(inputs, dict):
184
- in_ids = inputs.get("input_ids", None)
185
- if in_ids is None:
186
- return [out_ids for out_ids in generated_ids]
187
- return [out_ids[len(in_seq):] for in_seq, out_ids in zip(in_ids, generated_ids)]
188
-
189
- def get_image_proc_params(processor) -> Dict[str, int]:
190
- """Extracts resizing parameters from the processor configuration"""
191
- ip = getattr(processor, "image_processor", None)
192
- return {
193
- "patch_size": getattr(ip, "patch_size", 14),
194
- "merge_size": getattr(ip, "merge_size", 2), # Default to 2, Holo2 might differ
195
- "min_pixels": getattr(ip, "min_pixels", 256 * 256),
196
- "max_pixels": getattr(ip, "max_pixels", 1280 * 1280),
197
- }
198
-
199
- # -----------------------------------------------------------------------------
200
- # 4. PROMPTS
201
- # -----------------------------------------------------------------------------
202
-
203
  # --- Fara Prompt ---
204
  def get_fara_prompt(task, image):
205
  OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
@@ -217,6 +155,7 @@ def get_fara_prompt(task, image):
217
 
218
  # --- UI-TARS Prompt ---
219
  def get_uitars_prompt(task, image):
 
220
  guidelines = (
221
  "Localize an element on the GUI image according to my instructions and "
222
  "output a click position as Click(x, y) with x num pixels from the left edge "
@@ -232,38 +171,29 @@ def get_uitars_prompt(task, image):
232
  }
233
  ]
234
 
235
- # --- Holo2 Prompt ---
236
- def get_holo_prompt(pil_image: Image.Image, instruction: str) -> List[dict]:
237
- guidelines: str = (
238
- "Localize an element on the GUI image according to my instructions and "
239
- "output a click position as Click(x, y) with x num pixels from the left edge "
240
- "and y num pixels from the top edge."
241
- )
242
- return [
243
- {
244
- "role": "user",
245
- "content": [
246
- {"type": "image", "image": pil_image},
247
- {"type": "text", "text": f"{guidelines}\n{instruction}"}
248
- ],
249
- }
250
- ]
251
 
252
  # -----------------------------------------------------------------------------
253
- # 5. PARSING LOGIC
254
  # -----------------------------------------------------------------------------
255
 
256
- def parse_coordinate_response(text: str) -> List[Dict]:
257
- """
258
- Parses UI-TARS and Holo2 output formats.
259
- Targets formats like: Click(x, y), point=[x, y], etc.
260
- """
261
  actions = []
262
  text = text.strip()
263
 
264
- print(f"Parsing Coordinate output: {text}")
 
265
 
266
- # Regex 1: Click(x, y) - Standard prompt output for UI-TARS & Holo2
 
267
  matches_click = re.findall(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE)
268
  for m in matches_click:
269
  actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": ""})
@@ -278,7 +208,7 @@ def parse_coordinate_response(text: str) -> List[Dict]:
278
  for m in matches_box:
279
  actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": ""})
280
 
281
- # Remove duplicates
282
  unique_actions = []
283
  seen = set()
284
  for a in actions:
@@ -311,13 +241,23 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
311
  if not actions: return None
312
  img_copy = original_image.copy()
313
  draw = ImageDraw.Draw(img_copy)
 
314
 
315
  try: font = ImageFont.load_default()
316
  except: font = None
317
 
318
  for act in actions:
319
- pixel_x = int(act['x'])
320
- pixel_y = int(act['y'])
 
 
 
 
 
 
 
 
 
321
 
322
  color = 'red' if 'click' in act['type'].lower() else 'blue'
323
 
@@ -343,7 +283,7 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
343
  return img_copy
344
 
345
  # -----------------------------------------------------------------------------
346
- # 6. CORE LOGIC
347
  # -----------------------------------------------------------------------------
348
 
349
  @spaces.GPU(duration=120)
@@ -352,18 +292,14 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
352
 
353
  input_pil_image = array_to_image(input_numpy_image)
354
  orig_w, orig_h = input_pil_image.size
355
-
356
- actions = []
357
- raw_response = ""
358
 
359
- # -----------------------
360
- # MODEL: UI-TARS-1.5-7B
361
- # -----------------------
362
  if model_choice == "UI-TARS-1.5-7B":
363
  if model_x is None: return "Error: UI-TARS model failed to load on startup.", None
364
  print("Using UI-TARS Pipeline...")
365
 
366
- # 1. Smart Resize
 
367
  ip_params = get_image_proc_params(processor_x)
368
  resized_h, resized_w = smart_resize(
369
  input_pil_image.height, input_pil_image.width,
@@ -372,78 +308,36 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
372
  )
373
  proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
374
 
375
- # 2. Prompt & Inputs
376
  messages = get_uitars_prompt(task, proc_image)
377
  text_prompt = processor_x.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
378
  inputs = processor_x(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
379
  inputs = {k: v.to(device) for k, v in inputs.items()}
380
 
381
- # 3. Generate
382
  with torch.no_grad():
383
  generated_ids = model_x.generate(**inputs, max_new_tokens=128)
384
 
 
385
  generated_ids = [out_ids[len(in_seq):] for in_seq, out_ids in zip(inputs.get("input_ids"), generated_ids)]
386
  raw_response = processor_x.batch_decode(generated_ids, skip_special_tokens=True)[0]
387
 
388
- # 4. Parse & Rescale
389
- actions = parse_coordinate_response(raw_response)
390
 
391
- # Map coordinates from resized space back to original space
 
 
392
  scale_x = orig_w / resized_w
393
  scale_y = orig_h / resized_h
394
- for a in actions:
395
- a['x'] = int(a['x'] * scale_x)
396
- a['y'] = int(a['y'] * scale_y)
397
-
398
- # -----------------------
399
- # MODEL: Holo2-8B
400
- # -----------------------
401
- elif model_choice == "Holo2-8B":
402
- if model_h is None: return "Error: Holo2 model failed to load on startup.", None
403
- print("Using Holo2 Pipeline...")
404
-
405
- # 1. Smart Resize (Holo2 typically uses merge_size=1 or similar logic)
406
- ip_params = get_image_proc_params(processor_h)
407
- # Force merge_size to 1 if not detected (as per common practice for this model architecture variant)
408
- ms = ip_params.get("merge_size", 1)
409
 
410
- resized_h, resized_w = smart_resize(
411
- input_pil_image.height, input_pil_image.width,
412
- factor=ip_params["patch_size"] * ms,
413
- min_pixels=ip_params["min_pixels"], max_pixels=ip_params["max_pixels"]
414
- )
415
- proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
416
-
417
- # 2. Prompt & Inputs
418
- messages = get_holo_prompt(proc_image, task)
419
- text_prompt = apply_chat_template_compat(processor_h, messages)
420
-
421
- # Holo2 / Qwen2-VL based inputs
422
- inputs = processor_h(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
423
- inputs = {k: v.to(device) for k, v in inputs.items()}
424
-
425
- # 3. Generate
426
- with torch.no_grad():
427
- generated_ids = model_h.generate(**inputs, max_new_tokens=128)
428
-
429
- # Trim input tokens
430
- generated_ids_trimmed = trim_generated(generated_ids, inputs)
431
- raw_response = batch_decode_compat(processor_h, generated_ids_trimmed, skip_special_tokens=True)[0]
432
-
433
- # 4. Parse & Rescale
434
- # Holo2 prompt asks for Click(x,y) similar to UI-TARS
435
- actions = parse_coordinate_response(raw_response)
436
-
437
- # Map coordinates from resized space back to original space
438
- scale_x = orig_w / resized_w
439
- scale_y = orig_h / resized_h
440
  for a in actions:
441
  a['x'] = int(a['x'] * scale_x)
442
  a['y'] = int(a['y'] * scale_y)
443
 
444
- # -----------------------
445
- # MODEL: Fara-7B
446
- # -----------------------
447
  else:
448
  if model_v is None: return "Error: Fara model failed to load on startup.", None
449
  print("Using Fara Pipeline...")
@@ -481,7 +375,7 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: s
481
  return raw_response, output_image
482
 
483
  # -----------------------------------------------------------------------------
484
- # 7. UI SETUP
485
  # -----------------------------------------------------------------------------
486
 
487
  with gr.Blocks(theme=steel_blue_theme, css=css) as demo:
@@ -494,7 +388,7 @@ with gr.Blocks(theme=steel_blue_theme, css=css) as demo:
494
 
495
  with gr.Row():
496
  model_choice = gr.Radio(
497
- choices=["Fara-7B", "UI-TARS-1.5-7B", "Holo2-8B"],
498
  label="Select Model",
499
  value="Fara-7B",
500
  interactive=True
 
5
  import unicodedata
6
  import gc
7
  from io import BytesIO
8
+ from typing import Iterable
9
+ from typing import Tuple, Optional, List, Dict, Any
10
 
11
  import gradio as gr
12
  import numpy as np
 
116
  print("🔄 Loading UI-TARS-1.5-7B...")
117
  MODEL_ID_X = "ByteDance-Seed/UI-TARS-1.5-7B"
118
  try:
119
+ # Important: use_fast=False is often required for custom tokenizers
120
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True, use_fast=False)
121
  model_x = AutoModelForImageTextToText.from_pretrained(
122
  MODEL_ID_X,
 
128
  model_x = None
129
  processor_x = None
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  print("✅ Models loading sequence complete.")
132
 
133
  # -----------------------------------------------------------------------------
134
+ # 3. UTILS & PROMPTS
135
  # -----------------------------------------------------------------------------
136
 
137
  def array_to_image(image_array: np.ndarray) -> Image.Image:
138
  if image_array is None: raise ValueError("No image provided.")
139
  return Image.fromarray(np.uint8(image_array))
140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  # --- Fara Prompt ---
142
  def get_fara_prompt(task, image):
143
  OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
 
155
 
156
  # --- UI-TARS Prompt ---
157
  def get_uitars_prompt(task, image):
158
+ # UI-TARS generally responds better to a simpler instruction when finetuned
159
  guidelines = (
160
  "Localize an element on the GUI image according to my instructions and "
161
  "output a click position as Click(x, y) with x num pixels from the left edge "
 
171
  }
172
  ]
173
 
174
+ def get_image_proc_params(processor) -> Dict[str, int]:
175
+ ip = getattr(processor, "image_processor", None)
176
+ return {
177
+ "patch_size": getattr(ip, "patch_size", 14),
178
+ "merge_size": getattr(ip, "merge_size", 2),
179
+ "min_pixels": getattr(ip, "min_pixels", 256 * 256),
180
+ "max_pixels": getattr(ip, "max_pixels", 1280 * 1280),
181
+ }
 
 
 
 
 
 
 
 
182
 
183
  # -----------------------------------------------------------------------------
184
+ # 4. PARSING LOGIC
185
  # -----------------------------------------------------------------------------
186
 
187
+ def parse_uitars_response(text: str) -> List[Dict]:
188
+ """Parse various UI-TARS output formats"""
 
 
 
189
  actions = []
190
  text = text.strip()
191
 
192
+ # Debug print
193
+ print(f"Parsing UI-TARS output: {text}")
194
 
195
+ # Regex 1: Click(x, y) - Standard prompt output
196
+ # Matches: Click(123, 456) or Click(123,456)
197
  matches_click = re.findall(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE)
198
  for m in matches_click:
199
  actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": ""})
 
208
  for m in matches_box:
209
  actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": ""})
210
 
211
+ # Remove duplicates if any logic matched multiple times
212
  unique_actions = []
213
  seen = set()
214
  for a in actions:
 
241
  if not actions: return None
242
  img_copy = original_image.copy()
243
  draw = ImageDraw.Draw(img_copy)
244
+ width, height = img_copy.size
245
 
246
  try: font = ImageFont.load_default()
247
  except: font = None
248
 
249
  for act in actions:
250
+ x = act['x']
251
+ y = act['y']
252
+
253
+ # Determine if we need to scale normalized coords (0-1) or use absolute
254
+ # UI-TARS usually outputs absolute pixels relative to the image size it saw.
255
+ # But we already scaled them in the main loop.
256
+ # Double check sanity:
257
+ if x < 1.0 and y < 1.0:
258
+ pixel_x, pixel_y = int(x * width), int(y * height)
259
+ else:
260
+ pixel_x, pixel_y = int(x), int(y)
261
 
262
  color = 'red' if 'click' in act['type'].lower() else 'blue'
263
 
 
283
  return img_copy
284
 
285
  # -----------------------------------------------------------------------------
286
+ # 5. CORE LOGIC
287
  # -----------------------------------------------------------------------------
288
 
289
  @spaces.GPU(duration=120)
 
292
 
293
  input_pil_image = array_to_image(input_numpy_image)
294
  orig_w, orig_h = input_pil_image.size
 
 
 
295
 
296
+ # --- UI-TARS Logic ---
 
 
297
  if model_choice == "UI-TARS-1.5-7B":
298
  if model_x is None: return "Error: UI-TARS model failed to load on startup.", None
299
  print("Using UI-TARS Pipeline...")
300
 
301
+ # 1. Smart Resize (Crucial for UI-TARS accuracy)
302
+ # We must resize the image to the resolution the model expects/handles best
303
  ip_params = get_image_proc_params(processor_x)
304
  resized_h, resized_w = smart_resize(
305
  input_pil_image.height, input_pil_image.width,
 
308
  )
309
  proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
310
 
311
+ # 2. Prompting
312
  messages = get_uitars_prompt(task, proc_image)
313
  text_prompt = processor_x.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
314
+
315
+ # 3. Inputs
316
  inputs = processor_x(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
317
  inputs = {k: v.to(device) for k, v in inputs.items()}
318
 
319
+ # 4. Generate
320
  with torch.no_grad():
321
  generated_ids = model_x.generate(**inputs, max_new_tokens=128)
322
 
323
+ # Decode
324
  generated_ids = [out_ids[len(in_seq):] for in_seq, out_ids in zip(inputs.get("input_ids"), generated_ids)]
325
  raw_response = processor_x.batch_decode(generated_ids, skip_special_tokens=True)[0]
326
 
327
+ # 5. Parse
328
+ actions = parse_uitars_response(raw_response)
329
 
330
+ # 6. Rescale Coordinates back to Original Image Size
331
+ # The model saw 'resized_w' x 'resized_h', so coordinates are in that space.
332
+ # We need to map them back to 'orig_w' x 'orig_h' for the visualizer.
333
  scale_x = orig_w / resized_w
334
  scale_y = orig_h / resized_h
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
336
  for a in actions:
337
  a['x'] = int(a['x'] * scale_x)
338
  a['y'] = int(a['y'] * scale_y)
339
 
340
+ # --- Fara Logic ---
 
 
341
  else:
342
  if model_v is None: return "Error: Fara model failed to load on startup.", None
343
  print("Using Fara Pipeline...")
 
375
  return raw_response, output_image
376
 
377
  # -----------------------------------------------------------------------------
378
+ # 6. UI SETUP
379
  # -----------------------------------------------------------------------------
380
 
381
  with gr.Blocks(theme=steel_blue_theme, css=css) as demo:
 
388
 
389
  with gr.Row():
390
  model_choice = gr.Radio(
391
+ choices=["Fara-7B", "UI-TARS-1.5-7B"],
392
  label="Select Model",
393
  value="Fara-7B",
394
  interactive=True