prithivMLmods committed
Commit 5c21f23 · verified · 1 Parent(s): 7d0d550

Update app.py

Files changed (1):
  1. app.py +111 -73
app.py CHANGED
@@ -6,6 +6,7 @@ import shutil
 import uuid
 import tempfile
 import unicodedata
+import gc
 from io import BytesIO
 from typing import Tuple, Optional, List, Dict, Any
 
@@ -23,10 +24,15 @@ from qwen_vl_utils import process_vision_info
 # 1. CONSTANTS & SYSTEM PROMPT
 # -----------------------------------------------------------------------------
 
-MODEL_ID = "microsoft/Fara-7B"
+# Available Models
+MODELS = {
+    "Fara-7B": "microsoft/Fara-7B",
+    "UI-TARS-1.5-7B": "ByteDance-Seed/UI-TARS-1.5-7B"
+}
+
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Updated System Prompt to encourage the JSON format the model prefers
+# System Prompt asking for JSON format
 OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
 You need to generate the next action to complete the task.
 
@@ -44,31 +50,59 @@ Examples:
 """
 
 # -----------------------------------------------------------------------------
-# 2. MODEL DEFINITION
+# 2. MODEL MANAGEMENT
 # -----------------------------------------------------------------------------
 
-class FaraTransformersModel:
-    def __init__(self, model_id: str, to_device: str = "cuda"):
-        print(f"Loading {model_id} on {to_device}...")
-        self.model_id = model_id
+class ModelManager:
+    def __init__(self):
+        self.current_model_id = None
+        self.model = None
+        self.processor = None
+
+    def load_model(self, model_key):
+        model_id = MODELS.get(model_key)
+        if not model_id:
+            raise ValueError(f"Unknown model: {model_key}")
+
+        # If already loaded, skip
+        if self.current_model_id == model_id and self.model is not None:
+            return
+
+        print(f"--- Swapping model to {model_key} ({model_id}) ---")
 
+        # Unload previous model to save VRAM
+        if self.model is not None:
+            del self.model
+            del self.processor
+            self.model = None
+            self.processor = None
+            gc.collect()
+            torch.cuda.empty_cache()
+            print("Previous model unloaded.")
+
+        print(f"Loading {model_id}...")
         try:
             self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
             self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                 model_id,
                 trust_remote_code=True,
-                torch_dtype=torch.bfloat16 if to_device == "cuda" else torch.float32,
-                device_map="auto" if to_device == "cuda" else None,
+                torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
+                device_map="auto" if DEVICE == "cuda" else None,
             )
-            if to_device == "cpu":
+            if DEVICE == "cpu":
                 self.model.to("cpu")
             self.model.eval()
-            print("Model loaded successfully.")
+            self.current_model_id = model_id
+            print(f"Successfully loaded {model_key}")
         except Exception as e:
-            print(f"Error loading Fara: {e}")
+            print(f"Error loading model {model_id}: {e}")
             raise e
 
-    def generate(self, messages: list[dict], max_new_tokens=512):
+    def generate(self, model_key, messages, max_new_tokens=512):
+        # Ensure correct model is loaded
+        self.load_model(model_key)
+
+        # Prepare inputs
         text = self.processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
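The heart of this change is the unload-then-load pattern in load_model: drop every reference to the old weights, force a garbage-collection pass, and release cached CUDA blocks before fetching the next checkpoint. A minimal standalone sketch of the same pattern (the class name and the injected loader callback are illustrative, not part of the Space):

import gc
import torch

class SwappableModel:
    """Keeps at most one model resident; frees VRAM before swapping."""

    def __init__(self, registry: dict):
        self.registry = registry      # e.g. {"Fara-7B": "microsoft/Fara-7B"}
        self.current_id = None
        self.model = None

    def load(self, key: str, loader):
        model_id = self.registry[key]
        if model_id == self.current_id and self.model is not None:
            return self.model         # already resident: nothing to do
        if self.model is not None:
            del self.model            # drop the only reference to the weights
            self.model = None
            gc.collect()              # collect the tensors' Python wrappers
            if torch.cuda.is_available():
                torch.cuda.empty_cache()  # hand cached blocks back to the driver
        self.model = loader(model_id)    # e.g. a from_pretrained wrapper
        self.current_id = model_id
        return self.model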
@@ -83,6 +117,7 @@ class FaraTransformersModel:
         )
         inputs = inputs.to(self.model.device)
 
+        # Generate
         with torch.no_grad():
            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
 
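The lines between these two hunks (extracting the vision inputs and building the processor batch) are unchanged and therefore collapsed in the diff; they follow the usual Qwen2.5-VL preprocessing recipe. Roughly, assuming processor, model, and a messages chat list are already in scope (a sketch, not the verbatim file):

import torch
from qwen_vl_utils import process_vision_info

# Render the chat template, then split out the image/video inputs.
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text], images=image_inputs, videos=video_inputs,
    padding=True, return_tensors="pt",
).to(model.device)

with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=512)

# Trim the prompt tokens so only the newly generated text is decoded.
generated_ids_trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
print(processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)[0])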
@@ -94,12 +129,11 @@ class FaraTransformersModel:
         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )[0]
 
-# Initialize Model
-print(f"Initializing model class for {MODEL_ID}...")
-fara_model = FaraTransformersModel(MODEL_ID, to_device=DEVICE)
+# Global instance
+model_manager = ModelManager()
 
 # -----------------------------------------------------------------------------
-# 3. PARSING & VISUALIZATION LOGIC (UPDATED)
+# 3. PARSING & VISUALIZATION LOGIC
 # -----------------------------------------------------------------------------
 
 def array_to_image(image_array: np.ndarray) -> Image.Image:
@@ -118,25 +152,17 @@ def get_navigation_prompt(task, image):
 
 def parse_tool_calls(response: str) -> list[dict]:
     """
-    Parses the <tool_call>{JSON}</tool_call> format specifically.
-    Extracts coordinates and action types.
+    Parses <tool_call>{JSON}</tool_call> tags.
+    Also attempts fallback regex for plain coordinate output just in case.
     """
     actions = []
 
-    # Regex to find content between <tool_call> tags
-    # re.DOTALL allows matching across newlines
-    matches = re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
-
-    for match in matches:
+    # 1. Try Specific JSON Tool Call
+    json_matches = re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
+    for match in json_matches:
         try:
-            # Clean up the string just in case
-            json_str = match.strip()
-            data = json.loads(json_str)
-
-            # Access the 'arguments' dictionary
+            data = json.loads(match.strip())
             args = data.get("arguments", {})
-
-            # Extract coordinates: Expecting list like [399, 496]
             coords = args.get("coordinate", [])
             action_type = args.get("action", "unknown")
             text_content = args.get("text", "")
@@ -147,16 +173,23 @@ def parse_tool_calls(response: str) -> list[dict]:
                 "x": float(coords[0]),
                 "y": float(coords[1]),
                 "text": text_content,
-                "raw_json": data
+                "source": "json"
             })
-            print(f"Parsed Action: {action_type} at {coords}")
-        else:
-            print(f"No valid coordinates found in tool call: {json_str}")
-
-        except json.JSONDecodeError as e:
-            print(f"Failed to parse JSON in tool call: {e}\nString was: {match}")
-        except Exception as e:
-            print(f"Unexpected error parsing tool call: {e}")
+        except:
+            pass
+
+    # 2. Fallback: Search for any [x, y] or (x, y) pattern if JSON parsing yielded nothing
+    if not actions:
+        # Regex for [123, 456] or (123, 456)
+        coord_matches = re.findall(r"[\[\(](\d+(?:\.\d+)?),\s*(\d+(?:\.\d+)?)[\]\)]", response)
+        for x, y in coord_matches:
+            actions.append({
+                "type": "click",  # Assume click for raw coords
+                "x": float(x),
+                "y": float(y),
+                "text": "",
+                "source": "regex"
+            })
 
     return actions
 
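Condensed and run standalone, the two-stage parser behaves like this (the sample strings are illustrative, not real model output, and the length guard on coords is implied by the surrounding context rather than shown in the hunk):

import json
import re

def parse_tool_calls(response: str) -> list[dict]:
    """Prefer <tool_call>{JSON}</tool_call> blocks; fall back to bare (x, y) pairs."""
    actions = []
    for match in re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL):
        try:
            args = json.loads(match.strip()).get("arguments", {})
            coords = args.get("coordinate", [])
            if len(coords) >= 2:
                actions.append({"type": args.get("action", "unknown"),
                                "x": float(coords[0]), "y": float(coords[1]),
                                "text": args.get("text", ""), "source": "json"})
        except Exception:
            pass
    if not actions:  # fallback: any [x, y] or (x, y) pair in plain text
        for x, y in re.findall(r"[\[\(](\d+(?:\.\d+)?),\s*(\d+(?:\.\d+)?)[\]\)]", response):
            actions.append({"type": "click", "x": float(x), "y": float(y),
                            "text": "", "source": "regex"})
    return actions

print(parse_tool_calls('<tool_call>{"arguments": {"action": "click", "coordinate": [399, 496]}}</tool_call>'))
print(parse_tool_calls("I would click at (120, 340)."))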
@@ -169,7 +202,6 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
     draw = ImageDraw.Draw(img_copy)
     width, height = img_copy.size
 
-    # Try loading font
     try:
         font = ImageFont.load_default()
     except:
@@ -184,49 +216,46 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
         'unknown': 'green'
     }
 
-    for i, act in enumerate(actions):
+    for act in actions:
         x = act['x']
         y = act['y']
 
-        # Check if Normalized (0.0 - 1.0) or Absolute (Pixels > 1.0)
-        # The logs showed [399, 496], so these are pixels.
-        # However, to be safe, we check.
+        # Coordinate Normalization check
         if x <= 1.0 and y <= 1.0 and x > 0:
-            # It's normalized, convert to pixels
             pixel_x = int(x * width)
             pixel_y = int(y * height)
         else:
-            # It's absolute pixels
             pixel_x = int(x)
             pixel_y = int(y)
 
         action_type = act['type']
         color = colors.get(action_type, 'green')
 
-        # Draw Circle Target
-        r = 12 # Radius
+        # Draw Target
+        r = 12
         draw.ellipse(
             [pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r],
             outline=color,
             width=4
         )
-
-        # Draw Center Dot
         draw.ellipse(
             [pixel_x - 3, pixel_y - 3, pixel_x + 3, pixel_y + 3],
             fill=color
         )
 
-        # Draw Label text
+        # Label
         label_text = f"{action_type}"
         if act['text']:
             label_text += f": '{act['text']}'"
 
-        # Draw text background for readability
         text_pos = (pixel_x + 15, pixel_y - 10)
-        bbox = draw.textbbox(text_pos, label_text, font=font)
-        draw.rectangle(bbox, fill="black")
-        draw.text(text_pos, label_text, fill="white", font=font)
+        # Bounding box for text background
+        if font:
+            bbox = draw.textbbox(text_pos, label_text, font=font)
+            draw.rectangle(bbox, fill="black")
+            draw.text(text_pos, label_text, fill="white", font=font)
+        else:
+            draw.text(text_pos, label_text, fill="black")  # fallback
 
     return img_copy
 
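The coordinate check in the loop above treats values in (0, 1] as normalized fractions of the image and anything larger as raw pixels. Isolated into a hypothetical helper for clarity:

def to_pixels(x: float, y: float, width: int, height: int) -> tuple[int, int]:
    """Map model coordinates to pixels: (0, 1] means normalized, otherwise absolute."""
    if x <= 1.0 and y <= 1.0 and x > 0:
        return int(x * width), int(y * height)
    return int(x), int(y)

print(to_pixels(0.5, 0.25, 1280, 720))  # normalized -> (640, 180)
print(to_pixels(399, 496, 1280, 720))   # absolute   -> (399, 496)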
@@ -234,29 +263,30 @@ def create_localized_image(original_image: Image.Image, actions: list[dict]) ->
 # 4. GRADIO LOGIC
 # -----------------------------------------------------------------------------
 
-@spaces.GPU(duration=60)
-def process_screenshot(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
+@spaces.GPU(duration=120)
+def process_screenshot(model_choice: str, input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
     if input_numpy_image is None:
         return "⚠️ Please upload an image first.", None
 
     # Convert to PIL
     input_pil_image = array_to_image(input_numpy_image)
 
-    # 1. Build Prompt
+    # Build Prompt
     prompt = get_navigation_prompt(task, input_pil_image)
 
-    # 2. Generate Response
-    if fara_model is None:
-        raise ValueError("Model not loaded")
-
-    print("Generating response...")
-    raw_response = fara_model.generate(prompt, max_new_tokens=500)
+    # Generate Response
+    print(f"Generating response with {model_choice}...")
+    try:
+        raw_response = model_manager.generate(model_choice, prompt, max_new_tokens=500)
+    except Exception as e:
+        return f"Error generating response: {str(e)}", None
+
     print(f"Raw Output:\n{raw_response}")
 
-    # 3. Parse Actions
+    # Parse Actions
     actions = parse_tool_calls(raw_response)
 
-    # 4. Visualize
+    # Visualize
     output_image = input_pil_image
     if actions:
         visualized = create_localized_image(input_pil_image, actions)
@@ -269,10 +299,10 @@ def process_screenshot(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
 # 5. GRADIO UI SETUP
 # -----------------------------------------------------------------------------
 
-title = "Fara-7B GUI Operator 🖥️"
+title = "CUA GUI Operator 🖥️"
 description = """
-This demo uses **microsoft/Fara-7B** to understand GUI screenshots.
-It generates action coordinates which are then parsed and plotted on the image.
+This demo uses **Vision Language Models** to understand GUI screenshots and generate actions.
+Select a model, upload a screenshot, and define a task.
 """
 
 custom_css = """
@@ -285,6 +315,14 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
 
     with gr.Row():
         with gr.Column():
+            # Model Selector
+            model_selector = gr.Dropdown(
+                label="Choose CUA Model",
+                choices=["Fara-7B", "UI-TARS-1.5-7B"],
+                value="Fara-7B",
+                interactive=True
+            )
+
             input_image = gr.Image(label="Upload Screenshot", height=500)
             task_input = gr.Textbox(
                 label="Task Instruction",
@@ -300,16 +338,16 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
     # Wire up the button
     submit_btn.click(
         fn=process_screenshot,
-        inputs=[input_image, task_input],
+        inputs=[model_selector, input_image, task_input],
         outputs=[output_text, output_image]
     )
 
     # Example for quick testing
     gr.Examples(
         examples=[
-            ["./assets/google.png", "Search for 'Hugging Face'"],
+            ["Fara-7B", "./assets/google.png", "Search for 'Hugging Face'"],
         ],
-        inputs=[input_image, task_input],
+        inputs=[model_selector, input_image, task_input],
         label="Quick Examples"
     )
 
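Because the dropdown is simply prepended to the inputs list, Gradio passes its value as the handler's first positional argument. A stripped-down sketch of the same wiring, with a stub handler standing in for the real pipeline:

import gradio as gr

def run(model_choice: str, task: str) -> str:
    # Stub: the Space routes this through model_manager.generate(...) instead
    return f"Would run {model_choice!r} on task: {task!r}"

with gr.Blocks() as demo:
    model_selector = gr.Dropdown(
        label="Choose CUA Model",
        choices=["Fara-7B", "UI-TARS-1.5-7B"],
        value="Fara-7B",
    )
    task_input = gr.Textbox(label="Task Instruction")
    output = gr.Textbox(label="Output")
    # Dropdown first in inputs -> first argument of run()
    gr.Button("Run").click(fn=run, inputs=[model_selector, task_input], outputs=[output])

demo.launch()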
 
353