prithivMLmods committed (verified)
Commit 73b010f · Parent: 6aaf210

Update app.py

Files changed (1): app.py (+149, -127)
app.py CHANGED
@@ -1,11 +1,13 @@
 import os
 import re
 import json
-import gc
 import time
+import shutil
+import uuid
+import tempfile
 import unicodedata
 from io import BytesIO
-from typing import Tuple, Optional, List, Dict, Any
+from typing import Tuple, Optional, List, Iterable

 import gradio as gr
 import numpy as np
@@ -14,17 +16,87 @@ import spaces
 from PIL import Image, ImageDraw, ImageFont

 # Transformers & Qwen Utils
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from transformers import (
+    Qwen2_5_VLForConditionalGeneration,
+    AutoProcessor,
+)
 from qwen_vl_utils import process_vision_info

+# Gradio Theme Utils
+from gradio.themes import Soft
+from gradio.themes.utils import colors, fonts, sizes
+
+colors.steel_blue = colors.Color(
+    name="steel_blue",
+    c50="#EBF3F8",
+    c100="#D3E5F0",
+    c200="#A8CCE1",
+    c300="#7DB3D2",
+    c400="#529AC3",
+    c500="#4682B4",
+    c600="#3E72A0",
+    c700="#36638C",
+    c800="#2E5378",
+    c900="#264364",
+    c950="#1E3450",
+)
+
+class SteelBlueTheme(Soft):
+    def __init__(
+        self,
+        *,
+        primary_hue: colors.Color | str = colors.gray,
+        secondary_hue: colors.Color | str = colors.steel_blue,
+        neutral_hue: colors.Color | str = colors.slate,
+        text_size: sizes.Size | str = sizes.text_lg,
+        font: fonts.Font | str | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
+        ),
+        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
+            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
+        ),
+    ):
+        super().__init__(
+            primary_hue=primary_hue,
+            secondary_hue=secondary_hue,
+            neutral_hue=neutral_hue,
+            text_size=text_size,
+            font=font,
+            font_mono=font_mono,
+        )
+        super().set(
+            background_fill_primary="*primary_50",
+            background_fill_primary_dark="*primary_900",
+            body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
+            body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
+            button_primary_text_color="white",
+            button_primary_text_color_hover="white",
+            button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
+            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
+            button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_800)",
+            button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_500)",
+            block_title_text_weight="600",
+            block_border_width="3px",
+            block_shadow="*shadow_drop_lg",
+            button_primary_shadow="*shadow_drop_lg",
+            button_large_padding="11px",
+        )
+
+steel_blue_theme = SteelBlueTheme()
+
+css = """
+#main-title h1 { font-size: 2.3em !important; }
+#out_img { height: 600px; object-fit: contain; }
+"""
+
 # -----------------------------------------------------------------------------
-# 1. CONSTANTS & PROMPTS
+# 2. MODEL LOADING (Global Setup)
 # -----------------------------------------------------------------------------

-# Device Configuration
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")

-# System Prompt (Forces models to output parseable JSON)
+# System Prompt
 OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
 You need to generate the next action to complete the task.

@@ -41,81 +113,31 @@ Examples:
 </tool_call>
 """

-# -----------------------------------------------------------------------------
-# 2. GLOBAL STATE & MODEL MANAGEMENT
-# -----------------------------------------------------------------------------
-
-# We use a global dictionary to hold the currently loaded model to avoid reloading if not changed
-current_model_state = {
-    "model": None,
-    "processor": None,
-    "name": None
-}
-
-def load_fara_model():
-    """Loads Microsoft Fara-7B to CUDA"""
-    print("🔄 Loading Fara-7B...")
-    MODEL_ID_V = "microsoft/Fara-7B"
-
-    processor = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
-    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-        MODEL_ID_V,
-        trust_remote_code=True,
-        torch_dtype=torch.float16  # As requested
-    ).to(DEVICE).eval()
-
-    return model, processor
-
-def load_uitars_model():
-    """Loads UI-TARS-1.5-7B to CUDA"""
-    print("🔄 Loading UI-TARS...")
-    # Note: Using the official SFT repo as the specific ID provided in snippet might be private/incorrect
-    MODEL_ID_X = "bytedance/UI-TARS-7B-SFT"
-
-    processor = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
-    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-        MODEL_ID_X,
-        trust_remote_code=True,
-        torch_dtype=torch.bfloat16,  # As requested
-    ).to(DEVICE).eval()
-
-    return model, processor
-
-def get_model_pipeline(model_choice: str):
-    """
-    Manages VRAM: Unloads old model, loads new model based on user choice.
-    """
-    global current_model_state
-
-    # If the requested model is already loaded, return it
-    if current_model_state["name"] == model_choice and current_model_state["model"] is not None:
-        return current_model_state["model"], current_model_state["processor"]
-
-    # Otherwise, clear VRAM first
-    if current_model_state["model"] is not None:
-        print("🗑️ Unloading previous model to free VRAM...")
-        del current_model_state["model"]
-        del current_model_state["processor"]
-        current_model_state["model"] = None
-        current_model_state["processor"] = None
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    # Load the requested model
-    if model_choice == "Fara-7B":
-        model, processor = load_fara_model()
-    else:
-        model, processor = load_uitars_model()
-
-    # Update state
-    current_model_state["model"] = model
-    current_model_state["processor"] = processor
-    current_model_state["name"] = model_choice
-
-    return model, processor
+# Load Fara-7B
+print("Loading Fara-7B...")
+MODEL_ID_V = "microsoft/Fara-7B"
+processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
+model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_V,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16
+).to(device).eval()
+
+# Load UI-TARS-1.5-7B
+print("Loading UI-TARS-1.5-7B...")
+# Note: Using the official SFT repo. Adjust if you have a specific private repo.
+MODEL_ID_X = "bytedance/UI-TARS-7B"
+processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
+model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_X,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+).to(device).eval()
+
+print("✅ All Models Loaded Successfully")

 # -----------------------------------------------------------------------------
-# 3. UTILS: IMAGE & PARSING
+# 3. UTILS: IMAGE, PARSING, VISUALIZATION
 # -----------------------------------------------------------------------------

 def array_to_image(image_array: np.ndarray) -> Image.Image:
@@ -149,7 +171,6 @@ def parse_tool_calls(response: str) -> list[dict]:
             action_type = args.get("action", "unknown")
             text_content = args.get("text", "")

-            # Basic validation
             if coords and isinstance(coords, list) and len(coords) == 2:
                 actions.append({
                     "type": action_type,
@@ -160,8 +181,12 @@ def parse_tool_calls(response: str) -> list[dict]:
                 })
                 print(f"Parsed Action: {action_type} at {coords}")
             else:
-                # Some actions (like key press) might not have coords
-                print(f"Action parsed without coords: {action_type}")
+                # Handle actions without coordinates (like pressing enter generally)
+                actions.append({
+                    "type": action_type,
+                    "text": text_content,
+                    "raw_json": data
+                })

         except json.JSONDecodeError:
             print(f"Failed to parse JSON: {match}")
@@ -169,7 +194,7 @@ def parse_tool_calls(response: str) -> list[dict]:
     return actions

 def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
-    """Draws visual markers on the screenshot."""
+    """Draws markers on the image based on parsed pixel coordinates."""
     if not actions:
         return None

@@ -183,18 +208,23 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
         font = None

     colors = {
-        'click': 'red', 'left_click': 'red',
         'type': 'blue',
+        'click': 'red',
+        'left_click': 'red',
         'right_click': 'purple',
         'double_click': 'orange',
         'unknown': 'green'
     }

     for act in actions:
+        # Only draw if coordinates exist
+        if 'x' not in act or 'y' not in act:
+            continue
+
         x = act['x']
         y = act['y']

-        # Handle normalized (0-1) vs pixel coordinates
+        # Check if Normalized (0.0 - 1.0) or Absolute (Pixels > 1.0)
         if x <= 1.0 and y <= 1.0 and x > 0:
             pixel_x = int(x * width)
             pixel_y = int(y * height)
@@ -205,46 +235,51 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
         action_type = act['type']
         color = colors.get(action_type, 'green')

-        # Draw Target
-        r = 15
+        # Draw Circle Target
+        r = 12
         draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=4)
-        draw.ellipse([pixel_x - 4, pixel_y - 4, pixel_x + 4, pixel_y + 4], fill=color)
+        draw.ellipse([pixel_x - 3, pixel_y - 3, pixel_x + 3, pixel_y + 3], fill=color)

-        # Draw Label
+        # Draw Label text
         label_text = f"{action_type}"
         if act['text']:
             label_text += f": '{act['text']}'"

-        text_pos = (pixel_x + 18, pixel_y - 12)
+        text_pos = (pixel_x + 15, pixel_y - 10)
         bbox = draw.textbbox(text_pos, label_text, font=font)
-        draw.rectangle((bbox[0]-2, bbox[1]-2, bbox[2]+2, bbox[3]+2), fill="black")
+        draw.rectangle(bbox, fill="black")
         draw.text(text_pos, label_text, fill="white", font=font)

     return img_copy

 # -----------------------------------------------------------------------------
-# 4. GRADIO LOGIC (ZERO-GPU ENABLED)
+# 4. PROCESSING LOGIC
 # -----------------------------------------------------------------------------

-@spaces.GPU(duration=120)
+@spaces.GPU
 def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str) -> Tuple[str, Optional[Image.Image]]:
     if input_numpy_image is None:
         return "⚠️ Please upload an image first.", None

-    # 1. Load the specific model requested (Fara or UI-TARS) to CUDA
-    model, processor = get_model_pipeline(model_choice)
+    # 1. Select Model
+    if model_choice == "Fara-7B":
+        model = model_v
+        processor = processor_v
+    elif model_choice == "UI-TARS-1.5-7B":
+        model = model_x
+        processor = processor_x
+    else:
+        return "Invalid model selection", None

     # 2. Prepare Data
     input_pil_image = array_to_image(input_numpy_image)
-    prompt_messages = get_navigation_prompt(task, input_pil_image)
+    prompt = get_navigation_prompt(task, input_pil_image)

     # 3. Generate
-    print(f"Generating response using {model_choice}...")
-
     text_prompts = processor.apply_chat_template(
-        prompt_messages, tokenize=False, add_generation_prompt=True
+        prompt, tokenize=False, add_generation_prompt=True
     )
-    image_inputs, video_inputs = process_vision_info(prompt_messages)
+    image_inputs, video_inputs = process_vision_info(prompt)

     inputs = processor(
         text=[text_prompts],
@@ -253,8 +288,9 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str
         padding=True,
         return_tensors="pt",
     )
-    inputs = inputs.to(model.device)
+    inputs = inputs.to(device)

+    print(f"Generating with {model_choice}...")
     with torch.no_grad():
         generated_ids = model.generate(**inputs, max_new_tokens=512)

@@ -280,35 +316,21 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str
     return raw_response, output_image

 # -----------------------------------------------------------------------------
-# 5. GRADIO UI SETUP
+# 5. GRADIO UI
 # -----------------------------------------------------------------------------

-title = "CUA GUI Agent 🖥️"
-description = """
-**Computer Use Agent (CUA)** Demo.
-Upload a screenshot and provide a task instruction. The model will analyze the UI and output the precise coordinates and actions required.
-
-**Models Supported:**
-* **Fara-7B**: Microsoft's GUI agent model.
-* **UI-TARS-1.5-7B**: ByteDance's GUI agent model.
-"""
-
-custom_css = """
-#out_img { height: 600px; object-fit: contain; }
-"""
-
-with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
-    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
-    gr.Markdown(description)
+with gr.Blocks(theme=steel_blue_theme, css=css) as demo:
+    gr.Markdown("# **CUA GUI Agent 🖥️**", elem_id="main-title")
+    gr.Markdown("Upload a screenshot, select a model, and provide a task. The model will determine the precise UI coordinates and actions.")

     with gr.Row():
-        with gr.Column():
+        with gr.Column(scale=2):
             input_image = gr.Image(label="Upload Screenshot", height=500)

             with gr.Row():
-                model_choice = gr.Dropdown(
-                    label="Choose CUA Model",
+                model_choice = gr.Radio(
                     choices=["Fara-7B", "UI-TARS-1.5-7B"],
+                    label="Select Model",
                     value="Fara-7B",
                     interactive=True
                 )
@@ -320,9 +342,9 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
             )
             submit_btn = gr.Button("Analyze UI & Generate Action", variant="primary")

-        with gr.Column():
+        with gr.Column(scale=3):
             output_image = gr.Image(label="Visualized Action Points", elem_id="out_img", height=500)
-            output_text = gr.Textbox(label="Raw Model Output", lines=8, show_copy_button=True)
+            output_text = gr.Textbox(label="Raw Model Output (JSON)", lines=8, show_copy_button=True)

     # Wire up the button
     submit_btn.click(
@@ -331,7 +353,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
         outputs=[output_text, output_image]
     )

-    # Example for quick testing
+    # Examples
     gr.Examples(
         examples=[
             ["./assets/google.png", "Search for 'Hugging Face'", "Fara-7B"],
@@ -341,4 +363,4 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
     )

 if __name__ == "__main__":
-    demo.queue().launch()
+    demo.queue(max_size=20).launch(show_error=True)
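A minimal standalone sketch of the parser change above, for readers skimming the diff: parse_tool_calls now keeps actions that arrive without a coordinate (previously they were only logged and dropped). The <tool_call> regex and the arguments/coordinate key names are assumptions here, since those lines sit outside the hunks shown; only the two branches of the "if coords ..." block follow the diff verbatim.

import json
import re

# Assumed extraction pattern; the actual regex is outside the shown hunks.
TOOL_CALL_RE = re.compile(r"<tool_call>\s*(\{.*?\})\s*</tool_call>", re.DOTALL)

def parse_tool_calls(response: str) -> list[dict]:
    """Extract action dicts from <tool_call>...</tool_call> blocks."""
    actions = []
    for match in TOOL_CALL_RE.findall(response):
        try:
            data = json.loads(match)
            args = data.get("arguments", {})  # key name assumed
            coords = args.get("coordinate")   # key name assumed
            action_type = args.get("action", "unknown")
            text_content = args.get("text", "")

            if coords and isinstance(coords, list) and len(coords) == 2:
                actions.append({
                    "type": action_type,
                    "x": coords[0],
                    "y": coords[1],
                    "text": text_content,
                    "raw_json": data,
                })
            else:
                # New behaviour: keep coordinate-less actions (e.g. a bare key press)
                actions.append({
                    "type": action_type,
                    "text": text_content,
                    "raw_json": data,
                })
        except json.JSONDecodeError:
            print(f"Failed to parse JSON: {match}")
    return actions

sample = '<tool_call>{"name": "computer_use", "arguments": {"action": "key", "text": "Enter"}}</tool_call>'
print(parse_tool_calls(sample))  # -> [{'type': 'key', 'text': 'Enter', 'raw_json': {...}}]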
 
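The lines between model.generate and the final "return raw_response, output_image" are unchanged by this commit and therefore elided from the diff. For context, a typical Qwen2.5-VL decoding step looks like the sketch below: trim the prompt tokens from each sequence, then batch-decode only the newly generated part. This is the standard pattern from the Qwen2.5-VL examples, not necessarily this file's exact code.

# Sketch of the elided decoding step; `inputs`, `generated_ids`, and
# `processor` are the variables already in scope in process_screenshot.
generated_ids_trimmed = [
    out_ids[len(in_ids):]
    for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
raw_response = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]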