Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -5,7 +5,6 @@ import time
 import unicodedata
 import gc
 from io import BytesIO
-from typing import Iterable
 from typing import Tuple, Optional, List, Dict, Any
 
 import gradio as gr
@@ -114,9 +113,8 @@ except Exception as e:
 
 # --- Load UI-TARS-1.5-7B ---
 print("🔄 Loading UI-TARS-1.5-7B...")
-MODEL_ID_X = "
+MODEL_ID_X = "bytedance/UI-TARS-7B-SFT"
 try:
-    # Important: use_fast=False is often required for custom tokenizers
     processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True, use_fast=False)
     model_x = AutoModelForImageTextToText.from_pretrained(
         MODEL_ID_X,
@@ -128,7 +126,22 @@ except Exception as e:
     model_x = None
     processor_x = None
 
-
+# --- Load Holo2-8B ---
+print("🔄 Loading Holo2-8B...")
+MODEL_ID_H = "Hcompany/Holo2-8B"
+try:
+    processor_h = AutoProcessor.from_pretrained(MODEL_ID_H, trust_remote_code=True)
+    model_h = AutoModelForImageTextToText.from_pretrained(
+        MODEL_ID_H,
+        trust_remote_code=True,
+        torch_dtype=torch.float16
+    ).to(device).eval()
+except Exception as e:
+    print(f"Failed to load Holo2: {e}")
+    model_h = None
+    processor_h = None
+
+print("✅ All Models Loaded Sequence Complete.")
 
 # -----------------------------------------------------------------------------
 # 3. UTILS & PROMPTS
@@ -155,7 +168,6 @@ def get_fara_prompt(task, image):
 
 # --- UI-TARS Prompt ---
 def get_uitars_prompt(task, image):
-    # UI-TARS generally responds better to a simpler instruction when finetuned
     guidelines = (
         "Localize an element on the GUI image according to my instructions and "
         "output a click position as Click(x, y) with x num pixels from the left edge "
@@ -171,6 +183,19 @@ def get_uitars_prompt(task, image):
         }
     ]
 
+# --- Holo2 Prompt ---
+def get_holo2_prompt(task, image):
+    # Holo2 typically uses standard chat formatting
+    return [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": task}
+            ]
+        }
+    ]
+
 def get_image_proc_params(processor) -> Dict[str, int]:
     ip = getattr(processor, "image_processor", None)
     return {
@@ -185,39 +210,20 @@ def get_image_proc_params(processor) -> Dict[str, int]:
 # -----------------------------------------------------------------------------
 
 def parse_uitars_response(text: str) -> List[Dict]:
-    """Parse
+    """Parse UI-TARS specific output formats"""
     actions = []
     text = text.strip()
 
-
-
-
-
-
-
-
-
-
-
-    matches_point = re.findall(r"point=\[\s*(\d+)\s*,\s*(\d+)\s*\]", text, re.IGNORECASE)
-    for m in matches_point:
-        actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": ""})
-
-    # Regex 3: start_box='(x, y)' - Another variant
-    matches_box = re.findall(r"start_box=['\"]?\(\s*(\d+)\s*,\s*(\d+)\s*\)['\"]?", text, re.IGNORECASE)
-    for m in matches_box:
-        actions.append({"type": "click", "x": int(m[0]), "y": int(m[1]), "text": ""})
-
-    # Remove duplicates if any logic matched multiple times
-    unique_actions = []
-    seen = set()
-    for a in actions:
-        key = (a['type'], a['x'], a['y'])
-        if key not in seen:
-            seen.add(key)
-            unique_actions.append(a)
-
-    return unique_actions
+    m = re.search(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE)
+    if m: actions.append({"type": "click", "x": int(m[1]), "y": int(m[2]), "text": ""})
+
+    m = re.findall(r"point=\[\s*(\d+)\s*,\s*(\d+)\s*\]", text, re.IGNORECASE)
+    for p in m: actions.append({"type": "click", "x": int(p[0]), "y": int(p[1]), "text": ""})
+
+    m = re.search(r"start_box=['\"]?\(\s*(\d+)\s*,\s*(\d+)\s*\)['\"]?", text, re.IGNORECASE)
+    if m: actions.append({"type": "click", "x": int(m[1]), "y": int(m[2]), "text": ""})
+
+    return actions
 
 def parse_fara_response(response: str) -> List[Dict]:
     """Parse Fara <tool_call> JSON format"""
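Note: re.search match groups are 1-indexed (group 0 is the whole match), so the Click and start_box branches read x and y from groups 1 and 2. A standalone sketch of the Click branch, using a made-up response string (the helper name is hypothetical, not part of the commit):

import re

def _parse_click(text):
    # Mirrors the rewritten Click(x, y) branch of parse_uitars_response.
    m = re.search(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text, re.IGNORECASE)
    return {"type": "click", "x": int(m[1]), "y": int(m[2]), "text": ""} if m else None

print(_parse_click("Action: Click(132, 468)"))  # {'type': 'click', 'x': 132, 'y': 468, 'text': ''}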
@@ -237,6 +243,74 @@ def parse_fara_response(response: str) -> List[Dict]:
         except: pass
     return actions
 
+def parse_holo2_response(generated_ids, processor, input_len) -> Tuple[str, str, List[Dict]]:
+    """Parse Holo2 reasoning tokens and JSON content"""
+    all_ids = generated_ids[0].tolist()
+
+    # Token IDs for <|thought_start|> and <|thought_end|> (Qwen/Holo specific)
+    THOUGHT_START = 151667
+    THOUGHT_END = 151668
+
+    thinking_content = ""
+    content = ""
+
+    try:
+        if THOUGHT_START in all_ids:
+            start_idx = all_ids.index(THOUGHT_START)
+            try:
+                end_idx = all_ids.index(THOUGHT_END)
+            except ValueError:
+                end_idx = len(all_ids)
+
+            thinking_ids = all_ids[start_idx+1:end_idx]
+            thinking_content = processor.decode(thinking_ids, skip_special_tokens=True).strip()
+
+            # Content is everything after thought_end
+            content_ids = all_ids[end_idx+1:]
+            content = processor.decode(content_ids, skip_special_tokens=True).strip()
+        else:
+            # Fallback if no reasoning tokens found (just raw output)
+            # Slice off input tokens first
+            output_ids = all_ids[input_len:]
+            content = processor.decode(output_ids, skip_special_tokens=True).strip()
+    except Exception as e:
+        print(f"Holo Parsing Error: {e}")
+        content = processor.decode(all_ids[input_len:], skip_special_tokens=True).strip()
+
+    # Parse JSON Content
+    actions = []
+    try:
+        # Holo2 outputs strictly valid JSON usually
+        # E.g. {"x": 500, "y": 300, "description": "search bar"}
+        # Or {"action": "click", "point": [100, 200]}
+        # Flattening to common format
+        if "{" in content and "}" in content:
+            # Find JSON block if surrounded by text
+            json_str = re.search(r"(\{.*\})", content, re.DOTALL).group(1)
+            data = json.loads(json_str)
+
+            x, y = 0, 0
+            if "x" in data and "y" in data:
+                x, y = data["x"], data["y"]
+            elif "point" in data:
+                x, y = data["point"][0], data["point"][1]
+            elif "coordinate" in data:
+                x, y = data["coordinate"][0], data["coordinate"][1]
+
+            if x or y:
+                # Holo2 output is 0-1000 scale
+                actions.append({
+                    "type": "click",
+                    "x": float(x),
+                    "y": float(y),
+                    "text": data.get("description", "") or data.get("text", ""),
+                    "scale_base": 1000  # Flag to indicate this needs normalization from 1000
+                })
+    except Exception as e:
+        print(f"Holo JSON Parse Failed: {e}")
+
+    return content, thinking_content, actions
+
 def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
     if not actions: return None
     img_copy = original_image.copy()
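For reference, a minimal sketch of the reasoning/content split performed above; the token IDs are the ones hard-coded in the commit, and the example ID list is made up:

def split_thought(all_ids, input_len, start_id=151667, end_id=151668):
    # Returns (thinking token ids, content token ids); falls back to everything after the prompt.
    if start_id in all_ids:
        start = all_ids.index(start_id)
        end = all_ids.index(end_id) if end_id in all_ids else len(all_ids)
        return all_ids[start + 1:end], all_ids[end + 1:]
    return [], all_ids[input_len:]

print(split_thought([1, 2, 151667, 7, 8, 151668, 9], input_len=2))  # ([7, 8], [9])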
@@ -250,32 +324,35 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
         x = act['x']
         y = act['y']
 
-        #
-
-
-
-
-
+        # Holo2 Special Case (0-1000 scaling)
+        if act.get('scale_base') == 1000:
+            pixel_x = int((x / 1000) * width)
+            pixel_y = int((y / 1000) * height)
+        # Normalized (0-1)
+        elif x <= 1.0 and y <= 1.0 and x > 0:
+            pixel_x = int(x * width)
+            pixel_y = int(y * height)
+        # Absolute Pixels
         else:
-            pixel_x
+            pixel_x = int(x)
+            pixel_y = int(y)
 
         color = 'red' if 'click' in act['type'].lower() else 'blue'
 
-        # Draw
+        # Draw Visuals
         r = 15
-
-
-        # Circle
-        draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=line_width)
-        # Center dot
+        draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=4)
         draw.ellipse([pixel_x - 3, pixel_y - 3, pixel_x + 3, pixel_y + 3], fill=color)
 
+        # Draw Cross
+        draw.line([pixel_x - 10, pixel_y, pixel_x + 10, pixel_y], fill=color, width=2)
+        draw.line([pixel_x, pixel_y - 10, pixel_x, pixel_y + 10], fill=color, width=2)
+
         # Label
         label = f"{act['type']}"
        if act['text']: label += f": {act['text']}"
 
         text_pos = (pixel_x + 20, pixel_y - 10)
-        # Draw text background
         bbox = draw.textbbox(text_pos, label, font=font)
         draw.rectangle((bbox[0]-4, bbox[1]-2, bbox[2]+4, bbox[3]+2), fill="black")
         draw.text(text_pos, label, fill="white", font=font)
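The drawing loop now accepts three coordinate conventions; a small sketch of the mapping with illustrative numbers:

def to_pixels(x, y, width, height, scale_base=None):
    if scale_base == 1000:                 # Holo2-style 0-1000 grid
        return int(x / 1000 * width), int(y / 1000 * height)
    if 0 < x <= 1.0 and y <= 1.0:          # normalized 0-1 coordinates
        return int(x * width), int(y * height)
    return int(x), int(y)                  # absolute pixels

print(to_pixels(500, 300, 1920, 1080, scale_base=1000))  # (960, 324)
print(to_pixels(0.25, 0.5, 1920, 1080))                  # (480, 540)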
@@ -288,18 +365,19 @@
 
 @spaces.GPU(duration=120)
 def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
-    if input_numpy_image is None: return "⚠️ Please upload an image.", None
+    if input_numpy_image is None: return "⚠️ Please upload an image.", None, None
 
     input_pil_image = array_to_image(input_numpy_image)
     orig_w, orig_h = input_pil_image.size
+    actions = []
+    raw_response = ""
+    reasoning_text = None
 
     # --- UI-TARS Logic ---
     if model_choice == "UI-TARS-1.5-7B":
-        if model_x is None: return "Error: UI-TARS model failed to load
-        print("
+        if model_x is None: return "Error: UI-TARS model failed to load.", None, None
+        print("Running UI-TARS...")
 
-        # 1. Smart Resize (Crucial for UI-TARS accuracy)
-        # We must resize the image to the resolution the model expects/handles best
         ip_params = get_image_proc_params(processor_x)
         resized_h, resized_w = smart_resize(
             input_pil_image.height, input_pil_image.width,
@@ -308,50 +386,56 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
         )
         proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
 
-        # 2. Prompting
         messages = get_uitars_prompt(task, proc_image)
         text_prompt = processor_x.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-        # 3. Inputs
         inputs = processor_x(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
         inputs = {k: v.to(device) for k, v in inputs.items()}
 
-        # 4. Generate
         with torch.no_grad():
             generated_ids = model_x.generate(**inputs, max_new_tokens=128)
 
-        # Decode
         generated_ids = [out_ids[len(in_seq):] for in_seq, out_ids in zip(inputs.get("input_ids"), generated_ids)]
         raw_response = processor_x.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        # 5. Parse
         actions = parse_uitars_response(raw_response)
 
-        #
-        # The model saw 'resized_w' x 'resized_h', so coordinates are in that space.
-        # We need to map them back to 'orig_w' x 'orig_h' for the visualizer.
+        # Rescale
         scale_x = orig_w / resized_w
         scale_y = orig_h / resized_h
-
         for a in actions:
             a['x'] = int(a['x'] * scale_x)
             a['y'] = int(a['y'] * scale_y)
 
+    # --- Holo2 Logic ---
+    elif model_choice == "Holo2-8B":
+        if model_h is None: return "Error: Holo2 model failed to load.", None, None
+        print("Running Holo2...")
+
+        messages = get_holo2_prompt(task, input_pil_image)
+        text_prompt = processor_h.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = processor_h(text=[text_prompt], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+        inputs = inputs.to(device)
+
+        with torch.no_grad():
+            generated_ids = model_h.generate(**inputs, max_new_tokens=512)
+
+        # Parse Reasoning + Content
+        input_len = len(inputs.input_ids[0])
+        content, thinking, parsed_actions = parse_holo2_response(generated_ids, processor_h, input_len)
+
+        raw_response = content
+        reasoning_text = thinking
+        actions = parsed_actions
+
     # --- Fara Logic ---
     else:
-        if model_v is None: return "Error: Fara model failed to load
-        print("
+        if model_v is None: return "Error: Fara model failed to load.", None, None
+        print("Running Fara...")
         messages = get_fara_prompt(task, input_pil_image)
         text_prompt = processor_v.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
         image_inputs, video_inputs = process_vision_info(messages)
-
-        inputs = processor_v(
-            text=[text_prompt],
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt"
-        )
+        inputs = processor_v(text=[text_prompt], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
        inputs = inputs.to(device)
 
        with torch.no_grad():
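The UI-TARS branch answers in the smart-resized image space, so the rescale step maps points back to the original resolution; a worked example with hypothetical sizes:

orig_w, orig_h = 2560, 1440
resized_w, resized_h = 1288, 728             # hypothetical smart_resize output (multiples of 28)
scale_x, scale_y = orig_w / resized_w, orig_h / resized_h

x_model, y_model = 644, 364                  # click predicted on the resized image
print(int(x_model * scale_x), int(y_model * scale_y))   # 1280 720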
@@ -359,20 +443,22 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
 
         generated_ids = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
         raw_response = processor_v.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-        # Fara usually outputs exact pixels based on original image
         actions = parse_fara_response(raw_response)
 
-    print(f"Raw
-    print(f"
+    print(f"Raw: {raw_response}")
+    if reasoning_text: print(f"Thinking: {reasoning_text}")
 
-    #
+    # Visualize
     output_image = input_pil_image
     if actions:
         vis = create_localized_image(input_pil_image, actions)
         if vis: output_image = vis
 
-
+    final_text_output = f"▶️ OUTPUT:\n{raw_response}"
+    if reasoning_text:
+        final_text_output = f"🧠 THINKING PROCESS:\n{reasoning_text}\n\n" + final_text_output
+
+    return final_text_output, output_image
 
 # -----------------------------------------------------------------------------
 # 6. UI SETUP
@@ -388,7 +474,7 @@ with gr.Blocks(theme=steel_blue_theme, css=css) as demo:
 
     with gr.Row():
         model_choice = gr.Radio(
-            choices=["Fara-7B", "UI-TARS-1.5-7B"],
+            choices=["Fara-7B", "UI-TARS-1.5-7B", "Holo2-8B"],
            label="Select Model",
            value="Fara-7B",
            interactive=True
@@ -403,7 +489,7 @@ with gr.Blocks(theme=steel_blue_theme, css=css) as demo:
 
     with gr.Column(scale=3):
         output_image = gr.Image(label="Visualized Action Points", elem_id="out_img", height=500)
-        output_text = gr.Textbox(label="
+        output_text = gr.Textbox(label="Model Output & Reasoning", lines=12, show_copy_button=True)
 
     submit_btn.click(
         fn=process_screenshot,