Spaces:

prithivMLmods
/

CUA-GUI-Operator

Sleeping

App Files Community

prithivMLmods committed on Dec 6, 2025

Commit

fcb0f85

verified ·

1 Parent(s): 5c21f23

Update app.py

Browse files

Files changed (1) hide show

app.py +125 -142

app.py CHANGED Viewed

@@ -4,9 +4,9 @@ import json
 import time
 import shutil
 import uuid
 import tempfile
 import unicodedata
-import gc
 from io import BytesIO
 from typing import Tuple, Optional, List, Dict, Any
@@ -21,18 +21,18 @@ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 # -----------------------------------------------------------------------------
-# 1. CONSTANTS & SYSTEM PROMPT
 # -----------------------------------------------------------------------------
-# Available Models
-MODELS = {
     "Fara-7B": "microsoft/Fara-7B",
     "UI-TARS-1.5-7B": "ByteDance-Seed/UI-TARS-1.5-7B"
 }
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# System Prompt asking for JSON format
 OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
 You need to generate the next action to complete the task.
@@ -50,87 +50,100 @@ Examples:
 """
 # -----------------------------------------------------------------------------
-# 2. MODEL MANAGEMENT
 # -----------------------------------------------------------------------------
-class ModelManager:
-    def __init__(self):
-        self.current_model_id = None
-        self.model = None
-        self.processor = None
-    def load_model(self, model_key):
-        model_id = MODELS.get(model_key)
-        if not model_id:
-            raise ValueError(f"Unknown model: {model_key}")
-        # If already loaded, skip
-        if self.current_model_id == model_id and self.model is not None:
-            return
-        print(f"--- Swapping model to {model_key} ({model_id}) ---")
-        # Unload previous model to save VRAM
-        if self.model is not None:
-            del self.model
-            del self.processor
-            self.model = None
-            self.processor = None
-            gc.collect()
-            torch.cuda.empty_cache()
-            print("Previous model unloaded.")
-        print(f"Loading {model_id}...")
-        try:
-            self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
-            self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-                model_id,
-                trust_remote_code=True,
-                torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
-                device_map="auto" if DEVICE == "cuda" else None,
-            )
-            if DEVICE == "cpu":
-                self.model.to("cpu")
-            self.model.eval()
-            self.current_model_id = model_id
-            print(f"Successfully loaded {model_key}")
-        except Exception as e:
-            print(f"Error loading model {model_id}: {e}")
-            raise e
-    def generate(self, model_key, messages, max_new_tokens=512):
-        # Ensure correct model is loaded
-        self.load_model(model_key)
-        # Prepare inputs
-        text = self.processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
         )
-        image_inputs, video_inputs = process_vision_info(messages)
-        inputs = self.processor(
-            text=[text],
-            images=image_inputs,
-            videos=video_inputs,
-            padding=True,
-            return_tensors="pt",
-        )
-        inputs = inputs.to(self.model.device)
-        # Generate
-        with torch.no_grad():
-            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
-        generated_ids_trimmed = [
-            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-        ]
-        return self.processor.batch_decode(
-            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-        )[0]
-# Global instance
-model_manager = ModelManager()
 # -----------------------------------------------------------------------------
 # 3. PARSING & VISUALIZATION LOGIC
@@ -151,17 +164,13 @@ def get_navigation_prompt(task, image):
     ]
 def parse_tool_calls(response: str) -> list[dict]:
-    """
-    Parses <tool_call>{JSON}</tool_call> tags.
-    Also attempts fallback regex for plain coordinate output just in case.
-    """
     actions = []
-    # 1. Try Specific JSON Tool Call
-    json_matches = re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
-    for match in json_matches:
         try:
-            data = json.loads(match.strip())
             args = data.get("arguments", {})
             coords = args.get("coordinate", [])
             action_type = args.get("action", "unknown")
@@ -173,28 +182,14 @@ def parse_tool_calls(response: str) -> list[dict]:
                     "x": float(coords[0]),
                     "y": float(coords[1]),
                     "text": text_content,
-                    "source": "json"
                 })
-        except:
-            pass
-    # 2. Fallback: Search for any [x, y] or (x, y) pattern if JSON parsing yielded nothing
-    if not actions:
-        # Regex for [123, 456] or (123, 456)
-        coord_matches = re.findall(r"[\[\(](\d+(?:\.\d+)?),\s*(\d+(?:\.\d+)?)[\]\)]", response)
-        for x, y in coord_matches:
-            actions.append({
-                "type": "click", # Assume click for raw coords
-                "x": float(x),
-                "y": float(y),
-                "text": "",
-                "source": "regex"
-            })
     return actions
 def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
-    """Draws markers on the image based on parsed pixel coordinates."""
     if not actions:
         return None
@@ -220,7 +215,7 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
         x = act['x']
         y = act['y']
-        # Coordinate Normalization check
         if x <= 1.0 and y <= 1.0 and x > 0:
             pixel_x = int(x * width)
             pixel_y = int(y * height)
@@ -232,16 +227,9 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
         color = colors.get(action_type, 'green')
         # Draw Target
-        r = 12
-        draw.ellipse(
-            [pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r],
-            outline=color,
-            width=4
-        )
-        draw.ellipse(
-            [pixel_x - 3, pixel_y - 3, pixel_x + 3, pixel_y + 3],
-            fill=color
-        )
         # Label
         label_text = f"{action_type}"
@@ -249,44 +237,36 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
             label_text += f": '{act['text']}'"
         text_pos = (pixel_x + 15, pixel_y - 10)
-        # Bounding box for text background
-        if font:
-            bbox = draw.textbbox(text_pos, label_text, font=font)
-            draw.rectangle(bbox, fill="black")
-            draw.text(text_pos, label_text, fill="white", font=font)
-        else:
-            draw.text(text_pos, label_text, fill="black") # fallback
     return img_copy
 # -----------------------------------------------------------------------------
-# 4. GRADIO LOGIC
 # -----------------------------------------------------------------------------
 @spaces.GPU(duration=120)
-def process_screenshot(model_choice: str, input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
     if input_numpy_image is None:
         return "⚠️ Please upload an image first.", None
-    # Convert to PIL
     input_pil_image = array_to_image(input_numpy_image)
-    # Build Prompt
     prompt = get_navigation_prompt(task, input_pil_image)
-    # Generate Response
-    print(f"Generating response with {model_choice}...")
-    try:
-        raw_response = model_manager.generate(model_choice, prompt, max_new_tokens=500)
-    except Exception as e:
-        return f"Error generating response: {str(e)}", None
     print(f"Raw Output:\n{raw_response}")
-    # Parse Actions
     actions = parse_tool_calls(raw_response)
-    # Visualize
     output_image = input_pil_image
     if actions:
         visualized = create_localized_image(input_pil_image, actions)
@@ -296,13 +276,13 @@ def process_screenshot(model_choice: str, input_numpy_image: np.ndarray, task: s
     return raw_response, output_image
 # -----------------------------------------------------------------------------
-# 5. GRADIO UI SETUP
 # -----------------------------------------------------------------------------
-title = "CUA GUI Operator 🖥️"
 description = """
-This demo uses **Vision Language Models** to understand GUI screenshots and generate actions.
-Select a model, upload a screenshot, and define a task.
 """
 custom_css = """
@@ -315,6 +295,8 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
     with gr.Row():
         with gr.Column():
             # Model Selector
             model_selector = gr.Dropdown(
                 label="Choose CUA Model",
@@ -323,7 +305,6 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
                 interactive=True
             )
-            input_image = gr.Image(label="Upload Screenshot", height=500)
             task_input = gr.Textbox(
                 label="Task Instruction",
                 placeholder="e.g. Input the server address readyforquantum.com...",
@@ -338,18 +319,20 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
     # Wire up the button
     submit_btn.click(
         fn=process_screenshot,
-        inputs=[model_selector, input_image, task_input],
         outputs=[output_text, output_image]
     )
-    # Example for quick testing
     gr.Examples(
         examples=[
-            ["Fara-7B", "./assets/google.png", "Search for 'Hugging Face'"],
         ],
-        inputs=[model_selector, input_image, task_input],
         label="Quick Examples"
     )
 if __name__ == "__main__":
     demo.queue().launch()

 import time
 import shutil
 import uuid
+import gc
 import tempfile
 import unicodedata
 from io import BytesIO
 from typing import Tuple, Optional, List, Dict, Any
 from qwen_vl_utils import process_vision_info
 # -----------------------------------------------------------------------------
+# 1. CONSTANTS & CONFIGURATION
 # -----------------------------------------------------------------------------
+# Map display names to Hugging Face Repo IDs
+MODEL_MAP = {
     "Fara-7B": "microsoft/Fara-7B",
     "UI-TARS-1.5-7B": "ByteDance-Seed/UI-TARS-1.5-7B"
 }
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# System Prompt
 OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
 You need to generate the next action to complete the task.
 """
 # -----------------------------------------------------------------------------
+# 2. GLOBAL MODEL STATE MANAGEMENT
 # -----------------------------------------------------------------------------
+# Global variables to track the currently loaded model
+CURRENT_MODEL = None
+CURRENT_PROCESSOR = None
+CURRENT_MODEL_ID = None
+def load_model(model_key: str):
+    """
+    Dynamically loads the requested model.
+    Unloads the previous model to free up GPU memory if a switch occurs.
+    """
+    global CURRENT_MODEL, CURRENT_PROCESSOR, CURRENT_MODEL_ID
+    target_repo_id = MODEL_MAP[model_key]
+    # If the requested model is already loaded, do nothing
+    if CURRENT_MODEL is not None and CURRENT_MODEL_ID == target_repo_id:
+        print(f"Model {model_key} is already loaded.")
+        return
+    print(f"--- Switching Model to {model_key} ({target_repo_id}) ---")
+    # 1. Unload existing model to free GPU memory
+    if CURRENT_MODEL is not None:
+        print("Unloading current model...")
+        del CURRENT_MODEL
+        del CURRENT_PROCESSOR
+        CURRENT_MODEL = None
+        CURRENT_PROCESSOR = None
+        gc.collect()
+        torch.cuda.empty_cache()
+        print("Memory cleared.")
+    # 2. Load new model
+    try:
+        print(f"Loading processor for {target_repo_id}...")
+        processor = AutoProcessor.from_pretrained(target_repo_id, trust_remote_code=True)
+        print(f"Loading model weights for {target_repo_id}...")
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            target_repo_id,
+            trust_remote_code=True,
+            torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
+            device_map="auto" if DEVICE == "cuda" else None,
         )
+        if DEVICE == "cpu":
+            model.to("cpu")
+        model.eval()
+        # Update global state
+        CURRENT_MODEL = model
+        CURRENT_PROCESSOR = processor
+        CURRENT_MODEL_ID = target_repo_id
+        print(f"Successfully loaded {model_key}.")
+    except Exception as e:
+        print(f"Error loading model {target_repo_id}: {e}")
+        raise e
+def generate_response(messages: list[dict], max_new_tokens=512):
+    """
+    Runs generation using the currently loaded global model.
+    """
+    if CURRENT_MODEL is None or CURRENT_PROCESSOR is None:
+        raise ValueError("No model loaded.")
+    text = CURRENT_PROCESSOR.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = CURRENT_PROCESSOR(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+    inputs = inputs.to(CURRENT_MODEL.device)
+    with torch.no_grad():
+        generated_ids = CURRENT_MODEL.generate(**inputs, max_new_tokens=max_new_tokens)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    return CURRENT_PROCESSOR.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
 # -----------------------------------------------------------------------------
 # 3. PARSING & VISUALIZATION LOGIC
     ]
 def parse_tool_calls(response: str) -> list[dict]:
     actions = []
+    matches = re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
+    for match in matches:
         try:
+            json_str = match.strip()
+            data = json.loads(json_str)
             args = data.get("arguments", {})
             coords = args.get("coordinate", [])
             action_type = args.get("action", "unknown")
                     "x": float(coords[0]),
                     "y": float(coords[1]),
                     "text": text_content,
+                    "raw_json": data
                 })
+        except Exception as e:
+            print(f"Error parsing tool call: {e}")
     return actions
 def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
     if not actions:
         return None
         x = act['x']
         y = act['y']
+        # Determine if coords are normalized or absolute
         if x <= 1.0 and y <= 1.0 and x > 0:
             pixel_x = int(x * width)
             pixel_y = int(y * height)
         color = colors.get(action_type, 'green')
         # Draw Target
+        r = 12
+        draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=4)
+        draw.ellipse([pixel_x - 3, pixel_y - 3, pixel_x + 3, pixel_y + 3], fill=color)
         # Label
         label_text = f"{action_type}"
             label_text += f": '{act['text']}'"
         text_pos = (pixel_x + 15, pixel_y - 10)
+        bbox = draw.textbbox(text_pos, label_text, font=font)
+        draw.rectangle(bbox, fill="black")
+        draw.text(text_pos, label_text, fill="white", font=font)
     return img_copy
 # -----------------------------------------------------------------------------
+# 4. GRADIO PROCESSING LOGIC
 # -----------------------------------------------------------------------------
 @spaces.GPU(duration=120)
+def process_screenshot(input_numpy_image: np.ndarray, task: str, selected_model_key: str) -> Tuple[str, Optional[Image.Image]]:
     if input_numpy_image is None:
         return "⚠️ Please upload an image first.", None
+    # 1. Ensure correct model is loaded
+    load_model(selected_model_key)
+    # 2. Prepare Data
     input_pil_image = array_to_image(input_numpy_image)
     prompt = get_navigation_prompt(task, input_pil_image)
+    # 3. Generate
+    print(f"Generating with {selected_model_key}...")
+    raw_response = generate_response(prompt, max_new_tokens=500)
     print(f"Raw Output:\n{raw_response}")
+    # 4. Parse & Visualize
     actions = parse_tool_calls(raw_response)
     output_image = input_pil_image
     if actions:
         visualized = create_localized_image(input_pil_image, actions)
     return raw_response, output_image
 # -----------------------------------------------------------------------------
+# 5. UI SETUP
 # -----------------------------------------------------------------------------
+title = "Computer Use Agent (CUA) Playground 🖥️"
 description = """
+Analyze GUI screenshots and generate action coordinates using State-of-the-Art Vision Language Models.
+Supported Models: **Microsoft Fara-7B** and **ByteDance UI-TARS-1.5-7B**.
 """
 custom_css = """
     with gr.Row():
         with gr.Column():
+            input_image = gr.Image(label="Upload Screenshot", height=500)
             # Model Selector
             model_selector = gr.Dropdown(
                 label="Choose CUA Model",
                 interactive=True
             )
             task_input = gr.Textbox(
                 label="Task Instruction",
                 placeholder="e.g. Input the server address readyforquantum.com...",
     # Wire up the button
     submit_btn.click(
         fn=process_screenshot,
+        inputs=[input_image, task_input, model_selector],
         outputs=[output_text, output_image]
     )
     gr.Examples(
         examples=[
+            ["./assets/google.png", "Search for 'Hugging Face'", "Fara-7B"],
+            ["./assets/google.png", "Click the Sign In button", "UI-TARS-1.5-7B"],
         ],
+        inputs=[input_image, task_input, model_selector],
         label="Quick Examples"
     )
 if __name__ == "__main__":
+    # Pre-load the default model on startup to speed up first inference (optional)
+    # load_model("Fara-7B")
     demo.queue().launch()