Spaces: Running on Zero
update app
app.py CHANGED
@@ -1,13 +1,13 @@
 import os
 import re
 import json
+import gc
 import time
-import shutil
-import uuid
-import tempfile
 import unicodedata
+import traceback
+import contextlib
 from io import BytesIO
-from typing import Tuple, Optional, List,
+from typing import Tuple, Optional, List, Dict, Any

 import gradio as gr
 import numpy as np

@@ -17,15 +17,21 @@ from PIL import Image, ImageDraw, ImageFont

 # Transformers & Qwen Utils
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
+    AutoModelForImageTextToText
 )
+from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 from qwen_vl_utils import process_vision_info

 # Gradio Theme
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes

+# -----------------------------------------------------------------------------
+# 1. THEME CONFIGURATION
+# -----------------------------------------------------------------------------
+
 colors.steel_blue = colors.Color(
     name="steel_blue",
     c50="#EBF3F8",

@@ -83,240 +89,255 @@ class SteelBlueTheme(Soft):
 )

 steel_blue_theme = SteelBlueTheme()
-
-css = """
-#main-title h1 { font-size: 2.3em !important; }
-#out_img { height: 600px; object-fit: contain; }
-"""
+css = "#main-title h1 { font-size: 2.3em !important; } #out_img { height: 600px; object-fit: contain; }"

 # -----------------------------------------------------------------------------
-# 2. MODEL
+# 2. MODEL MANAGEMENT
 # -----------------------------------------------------------------------------

+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+current_model_state = {"model": None, "processor": None, "name": None}
+
+def load_fara_model():
+    print("🔄 Loading Fara-7B...")
+    MODEL_ID_V = "microsoft/Fara-7B"
+    processor = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        MODEL_ID_V, trust_remote_code=True, torch_dtype=torch.float16
+    ).to(DEVICE).eval()
+    return model, processor
+
+def load_uitars_model():
+    print("🔄 Loading UI-TARS-1.5-7B...")
+    MODEL_ID_X = "ByteDance-Seed/UI-TARS-1.5-7B"  # Updated to official HF ID
+    try:
+        model = AutoModelForImageTextToText.from_pretrained(
+            MODEL_ID_X, torch_dtype=torch.float16, trust_remote_code=True
+        ).to(DEVICE).eval()
+        # Important: use_fast=False for UI-TARS compat
+        processor = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True, use_fast=False)
+        return model, processor
+    except Exception as e:
+        print(f"Error loading UI-TARS: {e}")
+        raise e
+
+def get_model_pipeline(model_choice: str):
+    global current_model_state
+    if current_model_state["name"] == model_choice and current_model_state["model"] is not None:
+        return current_model_state["model"], current_model_state["processor"]
+
+    if current_model_state["model"] is not None:
+        del current_model_state["model"]
+        del current_model_state["processor"]
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    if model_choice == "Fara-7B":
+        model, processor = load_fara_model()
+    else:
+        model, processor = load_uitars_model()
+
+    current_model_state["model"] = model
+    current_model_state["processor"] = processor
+    current_model_state["name"] = model_choice
+    return model, processor
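
For reference, get_model_pipeline above is a single-slot cache: at most one model stays resident, and switching choices frees the old weights before loading the next. A minimal, self-contained sketch of the same pattern (the names here are illustrative, not part of the app):

    # Single-slot cache: holds one loaded object at a time, keyed by name.
    _slot = {"name": None, "obj": None}

    def get_cached(name, loader):
        if _slot["name"] == name and _slot["obj"] is not None:
            return _slot["obj"]      # cache hit: reuse the resident object
        _slot["obj"] = None          # drop the old reference before loading anew
        _slot["obj"] = loader(name)
        _slot["name"] = name
        return _slot["obj"]

    get_cached("Fara-7B", lambda n: f"<{n} weights>")          # loads
    get_cached("Fara-7B", lambda n: f"<{n} weights>")          # reuses
    get_cached("UI-TARS-1.5-7B", lambda n: f"<{n} weights>")   # swaps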

 # -----------------------------------------------------------------------------
-# 3. UTILS
+# 3. UTILS & PROMPTS
 # -----------------------------------------------------------------------------

 def array_to_image(image_array: np.ndarray) -> Image.Image:
-    if image_array is None:
-        raise ValueError("No image provided. Please upload an image.")
+    if image_array is None: raise ValueError("No image provided.")
     return Image.fromarray(np.uint8(image_array))

+# Fara Prompt
+def get_fara_prompt(task, image):
+    OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
+You need to generate the next action to complete the task.
+Output your action inside a <tool_call> block using JSON format.
+Include "coordinate": [x, y] in pixels for interactions.
+Examples:
+<tool_call>{"name": "User", "arguments": {"action": "click", "coordinate": [400, 300]}}</tool_call>
+<tool_call>{"name": "User", "arguments": {"action": "type", "coordinate": [100, 200], "text": "hello"}}</tool_call>
+"""
     return [
         {"role": "system", "content": [{"type": "text", "text": OS_SYSTEM_PROMPT}]},
-        {"role": "user", "content": [
-            {"type": "image", "image": image},
-            {"type": "text", "text": f"Instruction: {task}"},
-        ]},
+        {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": f"Instruction: {task}"}]},
     ]

+# UI-TARS Prompt
+def get_uitars_prompt(task, image):
+    guidelines = (
+        "Localize an element on the GUI image according to my instructions and "
+        "output a click position as Click(x, y) with x num pixels from the left edge "
+        "and y num pixels from the top edge."
+    )
+    return [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": image},
+                {"type": "text", "text": f"{guidelines}\n{task}"}
+            ],
+        }
+    ]
+
+def get_image_proc_params(processor) -> Dict[str, int]:
+    ip = getattr(processor, "image_processor", None)
+    return {
+        "patch_size": getattr(ip, "patch_size", 14),
+        "merge_size": getattr(ip, "merge_size", 2),  # Adjusted for typical TARS
+        "min_pixels": getattr(ip, "min_pixels", 256 * 256),
+        "max_pixels": getattr(ip, "max_pixels", 1280 * 1280),
+    }
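
smart_resize, used in the UI-TARS branch below, snaps an image to multiples of factor = patch_size * merge_size while keeping the pixel count between min_pixels and max_pixels. A simplified sketch of that rounding logic (the transformers implementation adds further validation, such as an aspect-ratio limit):

    import math

    def smart_resize_sketch(height, width, factor=28, min_pixels=256*256, max_pixels=1280*1280):
        # Round each side to the nearest multiple of `factor`.
        h_bar = round(height / factor) * factor
        w_bar = round(width / factor) * factor
        # Rescale uniformly if the area falls outside the allowed band.
        if h_bar * w_bar > max_pixels:
            beta = math.sqrt((height * width) / max_pixels)
            h_bar = math.floor(height / beta / factor) * factor
            w_bar = math.floor(width / beta / factor) * factor
        elif h_bar * w_bar < min_pixels:
            beta = math.sqrt(min_pixels / (height * width))
            h_bar = math.ceil(height * beta / factor) * factor
            w_bar = math.ceil(width * beta / factor) * factor
        return h_bar, w_bar  # (height, width), matching smart_resize's return order

    print(smart_resize_sketch(1080, 1920))  # full-HD screenshot -> (952, 1680)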
+
+# -----------------------------------------------------------------------------
+# 4. PARSING LOGIC
+# -----------------------------------------------------------------------------
+
+def parse_uitars_response(text: str, img_w: int, img_h: int) -> List[Dict]:
+    """Parse UI-TARS specific output formats"""
     actions = []
+    # 1. Click(x,y)
+    m = re.search(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", text)
+    if m:
+        x, y = int(m.group(1)), int(m.group(2))
+        actions.append({"type": "click", "x": x, "y": y, "text": ""})
+        return actions

+    # 2. start_box='(x,y)'
+    m = re.search(r"start_box=['\"]\(\s*(\d+)\s*,\s*(\d+)\s*\)['\"]", text)
+    if m:
+        x, y = int(m.group(1)), int(m.group(2))
+        actions.append({"type": "click", "x": x, "y": y, "text": ""})
+        return actions
+
+    return actions
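
A quick sanity check of the two patterns above on synthetic outputs (the sample strings are illustrative, not captured from UI-TARS):

    import re

    samples = ["I will Click(412, 230) on the search button.",
               "click(start_box='(100,200)')"]
    for s in samples:
        m = re.search(r"Click\s*\(\s*(\d+)\s*,\s*(\d+)\s*\)", s) \
            or re.search(r"start_box=['\"]\(\s*(\d+)\s*,\s*(\d+)\s*\)['\"]", s)
        print(s, "->", (int(m.group(1)), int(m.group(2))) if m else None)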
+
+def parse_fara_response(response: str) -> List[Dict]:
+    """Parse Fara <tool_call> JSON format"""
+    actions = []
+    matches = re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
     for match in matches:
         try:
-            data = json.loads(json_str)
+            data = json.loads(match.strip())
             args = data.get("arguments", {})
             coords = args.get("coordinate", [])
             action_type = args.get("action", "unknown")
             text_content = args.get("text", "")
-            if coords and isinstance(coords, list) and len(coords) == 2:
-                actions.append({
-                    "type": action_type,
-                    "x": float(coords[0]),
-                    "y": float(coords[1]),
-                    "text": text_content,
-                    "raw_json": data
-                })
-                print(f"Parsed Action: {action_type} at {coords}")
-            else:
-                # Handle actions without coordinates (like pressing enter generally)
-                actions.append({
-                    "type": action_type,
-                    "text": text_content,
-                    "raw_json": data
-                })
-        except json.JSONDecodeError:
-            print(f"Failed to parse JSON: {match}")
+            if coords and len(coords) == 2:
+                actions.append({
+                    "type": action_type, "x": float(coords[0]), "y": float(coords[1]), "text": text_content
+                })
+        except: pass
     return actions
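
Fara emits one JSON object per <tool_call> block, so findall with re.DOTALL plus json.loads covers it. A worked example on a synthetic response:

    import json, re

    raw = '<tool_call>{"name": "User", "arguments": {"action": "click", "coordinate": [400, 300]}}</tool_call>'
    for block in re.findall(r"<tool_call>(.*?)</tool_call>", raw, re.DOTALL):
        args = json.loads(block.strip()).get("arguments", {})
        print(args.get("action"), args.get("coordinate"))  # click [400, 300]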

 def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
-    if not actions:
-        return None
+    if not actions: return None
     img_copy = original_image.copy()
     draw = ImageDraw.Draw(img_copy)
     width, height = img_copy.size

-    try:
-    except:
-        font = None
-
-    colors = {
-        'type': 'blue',
-        'click': 'red',
-        'left_click': 'red',
-        'right_click': 'purple',
-        'double_click': 'orange',
-        'unknown': 'green'
-    }
+    try: font = ImageFont.load_default()
+    except: font = None

     for act in actions:
-        # Only draw if coordinates exist
-        if 'x' not in act or 'y' not in act:
-            continue
         x = act['x']
         y = act['y']

+        # Normalize check
         if x <= 1.0 and y <= 1.0 and x > 0:
-            pixel_x = int(x * width)
-            pixel_y = int(y * height)
+            pixel_x, pixel_y = int(x * width), int(y * height)
         else:
-            pixel_x = int(x)
-            pixel_y = int(y)
+            pixel_x, pixel_y = int(x), int(y)

-        color = colors.get(action_type, 'green')
+        color = 'red' if 'click' in act['type'] else 'blue'

-        # Draw
-        r =
+        # Draw Target
+        r = 15
         draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=4)
         draw.ellipse([pixel_x - 3, pixel_y - 3, pixel_x + 3, pixel_y + 3], fill=color)

         # Draw Label
-        text_pos =
-        bbox = draw.textbbox(text_pos, label_text, font=font)
-        draw.rectangle(bbox, fill="black")
-        draw.text(text_pos, label_text, fill="white", font=font)
+        label = f"{act['type']}: {act['text']}" if act['text'] else act['type']
+        text_pos = (pixel_x + 18, pixel_y - 12)
+        bbox = draw.textbbox(text_pos, label, font=font)
+        draw.rectangle((bbox[0]-2, bbox[1]-2, bbox[2]+2, bbox[3]+2), fill="black")
+        draw.text(text_pos, label, fill="white", font=font)

     return img_copy
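
A minimal way to exercise the drawing routine on a synthetic screenshot; it hits both the normalized-coordinate branch (x <= 1.0) and the pixel branch:

    from PIL import Image

    canvas = Image.new("RGB", (800, 600), "white")
    marked = create_localized_image(canvas, [
        {"type": "click", "x": 0.5, "y": 0.5, "text": ""},     # normalized -> (400, 300)
        {"type": "type", "x": 120, "y": 80, "text": "hello"},  # already in pixels
    ])
    marked.save("marked.png")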

 # -----------------------------------------------------------------------------
+# 5. CORE LOGIC
 # -----------------------------------------------------------------------------

-@spaces.GPU
-def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str)
-    if input_numpy_image is None:
-        return "⚠️ Please upload an image first.", None
+@spaces.GPU(duration=120)
+def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str):
+    if input_numpy_image is None: return "⚠️ Please upload an image.", None

-    if model_choice == "Fara-7B":
-        model = model_v
-        processor = processor_v
-    elif model_choice == "UI-TARS-1.5-7B":
-        model = model_x
-        processor = processor_x
-    else:
-        return "Invalid model selection", None
-
-    # 2. Prepare Data
+    # 1. Load Model
+    model, processor = get_model_pipeline(model_choice)
     input_pil_image = array_to_image(input_numpy_image)
-
-    with torch.no_grad():
-        generated_ids = model.generate(**inputs, max_new_tokens=512)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    raw_response = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )[0]
-    print(f"Raw Output:\n{raw_response}")
-
-    # 4. Parse & Visualize
-    actions = parse_tool_calls(raw_response)
+    orig_w, orig_h = input_pil_image.size
+
+    # 2. Preprocess & Generate
+    if model_choice == "UI-TARS-1.5-7B":
+        # Specific UI-TARS resizing logic
+        ip_params = get_image_proc_params(processor)
+        resized_h, resized_w = smart_resize(
+            input_pil_image.height, input_pil_image.width,
+            factor=ip_params["patch_size"] * ip_params["merge_size"],
+            min_pixels=ip_params["min_pixels"], max_pixels=ip_params["max_pixels"]
+        )
+        proc_image = input_pil_image.resize((resized_w, resized_h), Image.Resampling.LANCZOS)
+        messages = get_uitars_prompt(task, proc_image)
+
+        # UI-TARS uses apply_chat_template but often requires manual text construction internally;
+        # we rely on the standard processor flow, which handles this when trust_remote_code=True.
+        text_prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=[text_prompt], images=[proc_image], padding=True, return_tensors="pt")
+        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+
+        with torch.no_grad():
+            generated_ids = model.generate(**inputs, max_new_tokens=128)
+
+        # Decode only the newly generated tokens
+        generated_ids = [out_ids[len(in_seq):] for in_seq, out_ids in zip(inputs.get("input_ids"), generated_ids)]
+        raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+        # Parse, then scale coordinates from the resized frame back to the original
+        actions = parse_uitars_response(raw_response, resized_w, resized_h)
+        scale_x, scale_y = orig_w / resized_w, orig_h / resized_h
+        for a in actions:
+            a['x'] = int(a['x'] * scale_x)
+            a['y'] = int(a['y'] * scale_y)
+
+    else:  # Fara-7B
+        messages = get_fara_prompt(task, input_pil_image)
+        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+        inputs = inputs.to(DEVICE)
+
+        with torch.no_grad():
+            generated_ids = model.generate(**inputs, max_new_tokens=512)
+
+        generated_ids = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
+        raw_response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        actions = parse_fara_response(raw_response)

+    # 3. Visualize
     output_image = input_pil_image
     if actions:
-        output_image = visualized
+        vis = create_localized_image(input_pil_image, actions)
+        if vis: output_image = vis

     return raw_response, output_image
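
Because UI-TARS sees the smart-resized image, its click coordinates live in the resized frame and must be scaled back before drawing on the original, which is what the loop over actions above does. The mapping is a plain ratio (example numbers, not model output):

    orig_w, orig_h = 1920, 1080       # original screenshot
    resized_w, resized_h = 1680, 952  # what the resize step produced
    x_resized, y_resized = 840, 476   # click reported in the resized frame

    x_orig = int(x_resized * (orig_w / resized_w))  # -> 960
    y_orig = int(y_resized * (orig_h / resized_h))  # -> 540
    print(x_orig, y_orig)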

 # -----------------------------------------------------------------------------
+# 6. UI SETUP
 # -----------------------------------------------------------------------------

 with gr.Blocks(theme=steel_blue_theme, css=css) as demo:

@@ -344,23 +365,19 @@ with gr.Blocks(theme=steel_blue_theme, css=css) as demo:

     with gr.Column(scale=3):
         output_image = gr.Image(label="Visualized Action Points", elem_id="out_img", height=500)
-        output_text = gr.Textbox(label="Raw Model Output
+        output_text = gr.Textbox(label="Raw Model Output", lines=8, show_copy_button=True)

-    # Wire up the button
     submit_btn.click(
         fn=process_screenshot,
         inputs=[input_image, task_input, model_choice],
         outputs=[output_text, output_image]
     )

-    # Examples
     gr.Examples(
-        examples=[
-            ["./assets/google.png", "Search for 'Hugging Face'", "Fara-7B"],
-        ],
+        examples=[["./assets/google.png", "Search for 'Hugging Face'", "Fara-7B"]],
         inputs=[input_image, task_input, model_choice],
         label="Quick Examples"
     )

 if __name__ == "__main__":
-    demo.queue(
+    demo.queue().launch()
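
For a quick smoke test outside the Gradio UI, the handler can be called directly with a NumPy screenshot; a sketch assuming the Space's assets and model weights are reachable:

    import numpy as np
    from PIL import Image

    screenshot = np.asarray(Image.open("./assets/google.png").convert("RGB"))
    raw, vis = process_screenshot(screenshot, "Search for 'Hugging Face'", "Fara-7B")
    print(raw)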
|