Spaces:

prithivMLmods
/

CUA-GUI-Operator

Sleeping

App Files Community

prithivMLmods committed on Dec 6, 2025

Commit

6f12eee

verified ·

1 Parent(s): fcb0f85

Update app.py

Browse files

Files changed (1) hide show

app.py +107 -79

app.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import os
 import re
 import json
 import time
 import shutil
 import uuid
-import gc
 import tempfile
 import unicodedata
 from io import BytesIO
@@ -21,18 +21,24 @@ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 # -----------------------------------------------------------------------------
-# 1. CONSTANTS & CONFIGURATION
 # -----------------------------------------------------------------------------
-# Map display names to Hugging Face Repo IDs
 MODEL_MAP = {
     "Fara-7B": "microsoft/Fara-7B",
-    "UI-TARS-1.5-7B": "ByteDance-Seed/UI-TARS-1.5-7B"
 }
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# System Prompt
 OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
 You need to generate the next action to complete the task.
@@ -50,49 +56,38 @@ Examples:
 """
 # -----------------------------------------------------------------------------
-# 2. GLOBAL MODEL STATE MANAGEMENT
 # -----------------------------------------------------------------------------
-# Global variables to track the currently loaded model
-CURRENT_MODEL = None
-CURRENT_PROCESSOR = None
-CURRENT_MODEL_ID = None
-def load_model(model_key: str):
     """
-    Dynamically loads the requested model.
-    Unloads the previous model to free up GPU memory if a switch occurs.
     """
-    global CURRENT_MODEL, CURRENT_PROCESSOR, CURRENT_MODEL_ID
-    target_repo_id = MODEL_MAP[model_key]
-    # If the requested model is already loaded, do nothing
-    if CURRENT_MODEL is not None and CURRENT_MODEL_ID == target_repo_id:
-        print(f"Model {model_key} is already loaded.")
-        return
-    print(f"--- Switching Model to {model_key} ({target_repo_id}) ---")
-    # 1. Unload existing model to free GPU memory
     if CURRENT_MODEL is not None:
-        print("Unloading current model...")
         del CURRENT_MODEL
         del CURRENT_PROCESSOR
         CURRENT_MODEL = None
         CURRENT_PROCESSOR = None
         gc.collect()
         torch.cuda.empty_cache()
-        print("Memory cleared.")
-    # 2. Load new model
     try:
-        print(f"Loading processor for {target_repo_id}...")
-        processor = AutoProcessor.from_pretrained(target_repo_id, trust_remote_code=True)
-        print(f"Loading model weights for {target_repo_id}...")
         model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-            target_repo_id,
             trust_remote_code=True,
             torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
             device_map="auto" if DEVICE == "cuda" else None,
@@ -103,47 +98,51 @@ def load_model(model_key: str):
         model.eval()
-        # Update global state
         CURRENT_MODEL = model
         CURRENT_PROCESSOR = processor
-        CURRENT_MODEL_ID = target_repo_id
-        print(f"Successfully loaded {model_key}.")
     except Exception as e:
-        print(f"Error loading model {target_repo_id}: {e}")
         raise e
-def generate_response(messages: list[dict], max_new_tokens=512):
-    """
-    Runs generation using the currently loaded global model.
-    """
-    if CURRENT_MODEL is None or CURRENT_PROCESSOR is None:
-        raise ValueError("No model loaded.")
-    text = CURRENT_PROCESSOR.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
     image_inputs, video_inputs = process_vision_info(messages)
-    inputs = CURRENT_PROCESSOR(
         text=[text],
         images=image_inputs,
         videos=video_inputs,
         padding=True,
         return_tensors="pt",
     )
-    inputs = inputs.to(CURRENT_MODEL.device)
     with torch.no_grad():
-        generated_ids = CURRENT_MODEL.generate(**inputs, max_new_tokens=max_new_tokens)
     generated_ids_trimmed = [
         out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
     ]
-    return CURRENT_PROCESSOR.batch_decode(
         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )[0]
 # -----------------------------------------------------------------------------
 # 3. PARSING & VISUALIZATION LOGIC
@@ -164,18 +163,25 @@ def get_navigation_prompt(task, image):
     ]
 def parse_tool_calls(response: str) -> list[dict]:
     actions = []
     matches = re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
     for match in matches:
         try:
             json_str = match.strip()
             data = json.loads(json_str)
             args = data.get("arguments", {})
             coords = args.get("coordinate", [])
             action_type = args.get("action", "unknown")
             text_content = args.get("text", "")
             if coords and isinstance(coords, list) and len(coords) == 2:
                 actions.append({
                     "type": action_type,
@@ -184,12 +190,18 @@ def parse_tool_calls(response: str) -> list[dict]:
                     "text": text_content,
                     "raw_json": data
                 })
-        except Exception as e:
-            print(f"Error parsing tool call: {e}")
     return actions
 def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
     if not actions:
         return None
@@ -211,11 +223,11 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
         'unknown': 'green'
     }
-    for act in actions:
         x = act['x']
         y = act['y']
-        # Determine if coords are normalized or absolute
         if x <= 1.0 and y <= 1.0 and x > 0:
             pixel_x = int(x * width)
             pixel_y = int(y * height)
@@ -226,42 +238,54 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
         action_type = act['type']
         color = colors.get(action_type, 'green')
-        # Draw Target
-        r = 12
-        draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=4)
-        draw.ellipse([pixel_x - 3, pixel_y - 3, pixel_x + 3, pixel_y + 3], fill=color)
-        # Label
         label_text = f"{action_type}"
         if act['text']:
             label_text += f": '{act['text']}'"
-        text_pos = (pixel_x + 15, pixel_y - 10)
         bbox = draw.textbbox(text_pos, label_text, font=font)
         draw.rectangle(bbox, fill="black")
         draw.text(text_pos, label_text, fill="white", font=font)
     return img_copy
 # -----------------------------------------------------------------------------
-# 4. GRADIO PROCESSING LOGIC
 # -----------------------------------------------------------------------------
 @spaces.GPU(duration=120)
-def process_screenshot(input_numpy_image: np.ndarray, task: str, selected_model_key: str) -> Tuple[str, Optional[Image.Image]]:
     if input_numpy_image is None:
         return "⚠️ Please upload an image first.", None
-    # 1. Ensure correct model is loaded
-    load_model(selected_model_key)
     # 2. Prepare Data
     input_pil_image = array_to_image(input_numpy_image)
     prompt = get_navigation_prompt(task, input_pil_image)
     # 3. Generate
-    print(f"Generating with {selected_model_key}...")
-    raw_response = generate_response(prompt, max_new_tokens=500)
     print(f"Raw Output:\n{raw_response}")
     # 4. Parse & Visualize
@@ -276,13 +300,17 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str, selected_model_
     return raw_response, output_image
 # -----------------------------------------------------------------------------
-# 5. UI SETUP
 # -----------------------------------------------------------------------------
-title = "Computer Use Agent (CUA) Playground 🖥️"
 description = """
-Analyze GUI screenshots and generate action coordinates using State-of-the-Art Vision Language Models.
-Supported Models: **Microsoft Fara-7B** and **ByteDance UI-TARS-1.5-7B**.
 """
 custom_css = """
@@ -297,13 +325,13 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
         with gr.Column():
             input_image = gr.Image(label="Upload Screenshot", height=500)
-            # Model Selector
-            model_selector = gr.Dropdown(
-                label="Choose CUA Model",
-                choices=["Fara-7B", "UI-TARS-1.5-7B"],
-                value="Fara-7B",
-                interactive=True
-            )
             task_input = gr.Textbox(
                 label="Task Instruction",
@@ -319,20 +347,20 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
     # Wire up the button
     submit_btn.click(
         fn=process_screenshot,
-        inputs=[input_image, task_input, model_selector],
         outputs=[output_text, output_image]
     )
     gr.Examples(
         examples=[
-            ["./assets/google.png", "Search for 'Hugging Face'", "Fara-7B"],
-            ["./assets/google.png", "Click the Sign In button", "UI-TARS-1.5-7B"],
         ],
-        inputs=[input_image, task_input, model_selector],
         label="Quick Examples"
     )
 if __name__ == "__main__":
-    # Pre-load the default model on startup to speed up first inference (optional)
-    # load_model("Fara-7B")
     demo.queue().launch()

 import os
 import re
 import json
+import gc
 import time
 import shutil
 import uuid
 import tempfile
 import unicodedata
 from io import BytesIO
 from qwen_vl_utils import process_vision_info
 # -----------------------------------------------------------------------------
+# 1. CONSTANTS & SYSTEM PROMPT
 # -----------------------------------------------------------------------------
+# Mapping UI labels to Hugging Face Model IDs
 MODEL_MAP = {
     "Fara-7B": "microsoft/Fara-7B",
+    # NOTE: the "UI-TARS-1.5-7B" UI label is kept, but it now points at the UI-TARS-7B-SFT checkpoint repo
+    "UI-TARS-1.5-7B": "bytedance/UI-TARS-7B-SFT"
 }
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Global model state
+CURRENT_MODEL = None
+CURRENT_PROCESSOR = None
+CURRENT_MODEL_NAME = None
+# Updated System Prompt to encourage the JSON format
 OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
 You need to generate the next action to complete the task.
 """
 # -----------------------------------------------------------------------------
+# 2. MODEL LOADING LOGIC
 # -----------------------------------------------------------------------------
+def load_model_to_device(model_name: str):
     """
+    Loads the specified model to GPU, unloading previous models to save VRAM.
     """
+    global CURRENT_MODEL, CURRENT_PROCESSOR, CURRENT_MODEL_NAME
+    target_id = MODEL_MAP.get(model_name, model_name)
+    # If already loaded, skip
+    if CURRENT_MODEL_NAME == model_name and CURRENT_MODEL is not None:
+        return CURRENT_MODEL, CURRENT_PROCESSOR
+    print(f"🔄 Switching model to: {model_name} ({target_id})...")
+    # 1. Cleanup previous model
     if CURRENT_MODEL is not None:
         del CURRENT_MODEL
         del CURRENT_PROCESSOR
         CURRENT_MODEL = None
         CURRENT_PROCESSOR = None
         gc.collect()
         torch.cuda.empty_cache()
+        print("🗑️ Previous model unloaded.")
+    # 2. Load New Model
     try:
+        processor = AutoProcessor.from_pretrained(target_id, trust_remote_code=True)
         model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            target_id,
             trust_remote_code=True,
             torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
             device_map="auto" if DEVICE == "cuda" else None,
         model.eval()
         CURRENT_MODEL = model
         CURRENT_PROCESSOR = processor
+        CURRENT_MODEL_NAME = model_name
+        print(f"✅ {model_name} loaded successfully.")
+        return model, processor
     except Exception as e:
+        print(f"❌ Error loading {model_name}: {e}")
         raise e
+def generate_response(model, processor, messages, max_new_tokens=512):
+    """Generic generation function for Qwen2.5-VL based models"""
+    # Apply Chat Template
+    text = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
+    # Process Images
     image_inputs, video_inputs = process_vision_info(messages)
+    # Prepare Inputs
+    inputs = processor(
         text=[text],
         images=image_inputs,
         videos=video_inputs,
         padding=True,
         return_tensors="pt",
     )
+    inputs = inputs.to(model.device)
+    # Generate
     with torch.no_grad():
+        generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
+    # Decode
     generated_ids_trimmed = [
         out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
     ]
+    output_text = processor.batch_decode(
         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )[0]
+    return output_text
 # -----------------------------------------------------------------------------
 # 3. PARSING & VISUALIZATION LOGIC
     ]
 def parse_tool_calls(response: str) -> list[dict]:
+    """
+    Parses the <tool_call>{JSON}</tool_call> format.
+    """
     actions = []
+    # Regex to find content between <tool_call> tags
     matches = re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
     for match in matches:
         try:
             json_str = match.strip()
             data = json.loads(json_str)
             args = data.get("arguments", {})
             coords = args.get("coordinate", [])
             action_type = args.get("action", "unknown")
             text_content = args.get("text", "")
+            # Check if coords exist and are a list of length 2
             if coords and isinstance(coords, list) and len(coords) == 2:
                 actions.append({
                     "type": action_type,
                     "text": text_content,
                     "raw_json": data
                 })
+                print(f"Parsed Action: {action_type} at {coords}")
+            else:
+                # Some actions like 'scroll' might not have coordinates in some models
+                print(f"Non-coordinate action or invalid: {json_str}")
+        except json.JSONDecodeError as e:
+            print(f"Failed to parse JSON: {e}")
     return actions
 def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
+    """Draws markers on the image based on parsed pixel coordinates."""
     if not actions:
         return None
         'unknown': 'green'
     }
+    for i, act in enumerate(actions):
         x = act['x']
         y = act['y']
+        # Check if Normalized (0.0 - 1.0) or Absolute (Pixels > 1.0)
         if x <= 1.0 and y <= 1.0 and x > 0:
             pixel_x = int(x * width)
             pixel_y = int(y * height)
         action_type = act['type']
         color = colors.get(action_type, 'green')
+        # Draw Circle Target
+        r = 15 # Radius
+        draw.ellipse(
+            [pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r],
+            outline=color,
+            width=4
+        )
+        # Draw Center Dot
+        draw.ellipse(
+            [pixel_x - 4, pixel_y - 4, pixel_x + 4, pixel_y + 4],
+            fill=color
+        )
+        # Label Text
         label_text = f"{action_type}"
         if act['text']:
             label_text += f": '{act['text']}'"
+        # Text Background
+        text_pos = (pixel_x + 18, pixel_y - 12)
         bbox = draw.textbbox(text_pos, label_text, font=font)
+        # Add padding to bbox
+        bbox = (bbox[0]-2, bbox[1]-2, bbox[2]+2, bbox[3]+2)
         draw.rectangle(bbox, fill="black")
         draw.text(text_pos, label_text, fill="white", font=font)
     return img_copy
 # -----------------------------------------------------------------------------
+# 4. GRADIO LOGIC
 # -----------------------------------------------------------------------------
 @spaces.GPU(duration=120)
+def process_screenshot(input_numpy_image: np.ndarray, task: str, model_choice: str) -> Tuple[str, Optional[Image.Image]]:
     if input_numpy_image is None:
         return "⚠️ Please upload an image first.", None
+    # 1. Load Requested Model (Switching if necessary)
+    model, processor = load_model_to_device(model_choice)
     # 2. Prepare Data
     input_pil_image = array_to_image(input_numpy_image)
     prompt = get_navigation_prompt(task, input_pil_image)
     # 3. Generate
+    print(f"Generating response using {model_choice}...")
+    raw_response = generate_response(model, processor, prompt, max_new_tokens=512)
     print(f"Raw Output:\n{raw_response}")
     # 4. Parse & Visualize
     return raw_response, output_image
 # -----------------------------------------------------------------------------
+# 5. GRADIO UI SETUP
 # -----------------------------------------------------------------------------
+title = "CUA GUI Agent 🖥️"
 description = """
+**Computer Use Agent (CUA)** Demo.
+Upload a screenshot and provide a task instruction. The model will analyze the UI and output the precise coordinates and actions required.
+**Models Supported:**
+* **Fara-7B**: Microsoft's GUI agent model.
+* **UI-TARS-1.5-7B**: ByteDance's GUI agent model.
 """
 custom_css = """
         with gr.Column():
             input_image = gr.Image(label="Upload Screenshot", height=500)
+            with gr.Row():
+                model_choice = gr.Dropdown(
+                    label="Choose CUA Model",
+                    choices=list(MODEL_MAP.keys()),
+                    value="Fara-7B",
+                    interactive=True
+                )
             task_input = gr.Textbox(
                 label="Task Instruction",
     # Wire up the button
     submit_btn.click(
         fn=process_screenshot,
+        inputs=[input_image, task_input, model_choice],
         outputs=[output_text, output_image]
     )
+    # Example for quick testing
     gr.Examples(
         examples=[
+            ["./assets/google.png", "Search for 'Hugging Face'", "Fara-7B"],
         ],
+        inputs=[input_image, task_input, model_choice],
         label="Quick Examples"
     )
 if __name__ == "__main__":
+    # The default model is deliberately not pre-loaded at startup: it is loaded
+    # lazily inside the GPU-decorated request handler, which is safer for Spaces.
     demo.queue().launch()