Spaces: Running on Zero
Update app.py
app.py CHANGED
import os
import re
import time
from typing import Tuple, Optional, List, Dict

import gradio as gr
import numpy as np
import torch
import spaces
from PIL import Image, ImageDraw, ImageFont

# Transformers & Qwen Utils
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# -----------------------------------------------------------------------------
# 1. PROMPT DEFINITIONS (from prompt.py)
# -----------------------------------------------------------------------------

OS_ACTIONS = """
def final_answer(answer: any) -> any:
    \"\"\"
    Provides a final answer to the given problem.
    Args:
        answer: The final answer to the problem
    \"\"\"

def move_mouse(self, x: float, y: float) -> str:
    \"\"\"
    Moves the mouse cursor to the specified coordinates.
    Args:
        x: The x coordinate (horizontal position)
        y: The y coordinate (vertical position)
    \"\"\"

def click(x: Optional[float] = None, y: Optional[float] = None) -> str:
    \"\"\"
    Performs a left-click at the specified normalized coordinates.
    Args:
        x: The x coordinate (horizontal position)
        y: The y coordinate (vertical position)
    \"\"\"

def double_click(x: Optional[float] = None, y: Optional[float] = None) -> str:
    \"\"\"
    Performs a double-click at the specified normalized coordinates.
    Args:
        x: The x coordinate (horizontal position)
        y: The y coordinate (vertical position)
    \"\"\"

def type(text: str) -> str:
    \"\"\"
    Types the specified text at the current cursor position.
    Args:
        text: The text to type
    \"\"\"

def press(keys: str | list[str]) -> str:
    \"\"\"
    Presses a keyboard key or key combination.
    Args:
        keys: The key or list of keys to press (e.g. "enter", "space", "backspace", "ctrl", etc.).
    \"\"\"

def navigate_back() -> str:
    \"\"\"
    Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
    \"\"\"

def drag(from_coord: list[float], to_coord: list[float]) -> str:
    \"\"\"
    Clicks at [x1, y1], drags the mouse to [x2, y2], then releases the click.
    Args:
        from_coord: The starting normalized coordinates [x1, y1].
        to_coord: The ending normalized coordinates [x2, y2].
    \"\"\"

def scroll(x: Optional[float] = None, y: Optional[float] = None, direction: Literal["up", "down"] = "down", amount: int = 1) -> str:
    \"\"\"
    Moves the mouse to the selected coordinates, then uses the scroll wheel: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through Linux desktop menus.
    Args:
        x: The x coordinate (horizontal position) of the element to scroll/zoom, defaults to None to not focus on specific coordinates
        y: The y coordinate (vertical position) of the element to scroll/zoom, defaults to None to not focus on specific coordinates
        direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
        amount: The amount to scroll. A good amount is 1 or 2.
    \"\"\"

def wait(seconds: float) -> str:
    \"\"\"
    Waits for the specified number of seconds. Very useful in case the prior action is still executing (for example when starting very heavy applications like browsers or office apps).
    Args:
        seconds: Number of seconds to wait, generally 2 is enough.
    \"\"\"
"""

OS_SYSTEM_PROMPT = f"""You are a helpful GUI agent. You’ll be given a task and ...

For each step:
• First, <think></think> to express the thought process guiding your next action and the reasoning behind it.
• Then, use <code></code> to perform the action. It will be executed in a stateful environment.

The following functions are exposed to the Python interpreter:
<code>
{OS_ACTIONS}
</code>

The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
"""

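# For reference, a model turn under this prompt is expected to look like the
# following (hypothetical illustration, not captured output):
#   <think>The search bar sits near the top of the page, so I will click it.</think>
#   <code>click(x=0.42, y=0.08)</code>
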
# -----------------------------------------------------------------------------
|
| 114 |
+
# 2. MODEL DEFINITION (Adapted for Fara-7B / Qwen2.5-VL)
|
| 115 |
# -----------------------------------------------------------------------------
|
| 116 |
|
| 117 |
+
MODEL_ID = "microsoft/Fara-7B"
|
| 118 |
+
|
| 119 |
+
class FaraTransformersModel:
|
| 120 |
+
def __init__(self, model_id: str, to_device: str = "cuda"):
|
| 121 |
+
print(f"Loading {model_id}...")
|
| 122 |
self.model_id = model_id
|
| 123 |
|
| 124 |
+
# Load Processor
|
| 125 |
+
self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
|
| 126 |
+
|
| 127 |
+
# Load Model
|
| 128 |
+
# Fara is based on Qwen2.5-VL architecture
|
| 129 |
+
self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 130 |
+
model_id,
|
| 131 |
+
trust_remote_code=True,
|
| 132 |
+
torch_dtype=torch.bfloat16,
|
| 133 |
+
device_map="auto" if to_device == "cuda" else None
|
| 134 |
+
)
|
| 135 |
+
|
| 136 |
+
if to_device == "cpu":
|
| 137 |
+
self.model.to("cpu")
|
| 138 |
+
|
| 139 |
+
self.model.eval()
|
| 140 |
+
print("Model loaded successfully.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
+
def generate(self, messages: list[dict], **kwargs):
|
| 143 |
+
# 1. Prepare Text Prompts
|
|
|
|
|
|
|
| 144 |
text = self.processor.apply_chat_template(
|
| 145 |
messages, tokenize=False, add_generation_prompt=True
|
| 146 |
)
|
|
|
|
| 147 |
|
| 148 |
+
# 2. Process Images (Qwen-VL specific utility)
|
| 149 |
+
image_inputs, video_inputs = process_vision_info(messages)
|
| 150 |
+
|
| 151 |
+
# 3. Create Inputs
|
| 152 |
inputs = self.processor(
|
| 153 |
text=[text],
|
| 154 |
images=image_inputs,
|
| 155 |
+
videos=video_inputs,
|
| 156 |
padding=True,
|
| 157 |
return_tensors="pt",
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
inputs = inputs.to(self.model.device)
|
| 161 |
|
| 162 |
+
# 4. Generate
|
| 163 |
with torch.no_grad():
|
| 164 |
+
generated_ids = self.model.generate(**inputs, **kwargs)
|
| 165 |
|
| 166 |
+
# 5. Decode
|
| 167 |
generated_ids_trimmed = [
|
| 168 |
+
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
|
| 169 |
]
|
| 170 |
|
| 171 |
output_text = self.processor.batch_decode(
|
|
|
|
| 174 |
|
| 175 |
return output_text
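
# Minimal usage sketch for the wrapper (illustrative; assumes `img` is a PIL.Image
# and uses get_navigation_prompt, defined below, to build the chat messages):
#   msgs = get_navigation_prompt("Open the settings menu", img)
#   raw = fara_model.generate(msgs, max_new_tokens=500)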

# Initialize the model globally so the weights are ready when the Space starts.
# Note: the actual loading happens inside the class __init__ above.
print(f"Initializing model class for {MODEL_ID}...")
fara_model = FaraTransformersModel(MODEL_ID, to_device="cuda" if torch.cuda.is_available() else "cpu")


# -----------------------------------------------------------------------------
# 3. HELPER FUNCTIONS (Parsing & Visualization)
# -----------------------------------------------------------------------------

def array_to_image(image_array: np.ndarray) -> Image.Image:
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
    return Image.fromarray(np.uint8(image_array))


def get_navigation_prompt(task, image):
    """Constructs the chat messages for Fara."""
    return [
        {
            "role": "system",
            "content": OS_SYSTEM_PROMPT,
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": f"Instruction: {task}\n\nPrevious actions:\nNone"},
            ],
        },
    ]


def parse_actions_from_response(response: str) -> list[str]:
    """Parse actions from the model response using a regex pattern."""
    # Look for <code>...</code> blocks
    pattern = r"<code>(.*?)</code>"
    matches = re.findall(pattern, response, re.DOTALL)
    if not matches:
        # Fallback: if the model forgets the code tags but still writes function calls
        if "click(" in response or "type(" in response:
            return [response]
    return matches
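
# Example (hypothetical response, for illustration):
#   parse_actions_from_response(
#       "<think>I should click the search bar.</think><code>click(x=0.42, y=0.08)</code>"
#   )  # -> ["click(x=0.42, y=0.08)"]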

def extract_coordinates_from_action(action_code: str) -> list[dict]:
    """Extract coordinates from action code for localization actions."""
    localization_actions = []

    # Patterns for the different action types
    patterns = {
        'click': r'click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'double_click': r'double_click\((?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))?\)',
        'move_mouse': r'move_mouse\((?:self,\s*)?(?:x=)?([0-9.]+)(?:,\s*(?:y=)?([0-9.]+))\)',
        'drag': r'drag\(\[([0-9.]+),\s*([0-9.]+)\],\s*\[([0-9.]+),\s*([0-9.]+)\]\)'
    }

    for action_type, pattern in patterns.items():
        matches = re.finditer(pattern, action_code)
        for match in matches:
            if action_type == 'drag':
                from_x, from_y, to_x, to_y = match.groups()
                localization_actions.append({
                    'type': 'drag_from', 'x': float(from_x), 'y': float(from_y), 'action': action_type
                })
                localization_actions.append({
                    'type': 'drag_to', 'x': float(to_x), 'y': float(to_y), 'action': action_type
                })
            else:
                x_val = match.group(1)
                y_val = match.group(2) if match.group(2) else x_val
                if x_val and y_val:
                    localization_actions.append({
                        'type': action_type, 'x': float(x_val), 'y': float(y_val), 'action': action_type
                    })

    return localization_actions
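
# Example (hypothetical): extract_coordinates_from_action("click(x=0.42, y=0.08)")
# returns [{'type': 'click', 'x': 0.42, 'y': 0.08, 'action': 'click'}].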

def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
    """Create an image with localization markers drawn on it."""
    if not coordinates:
        return None

    img_copy = original_image.copy()
    draw = ImageDraw.Draw(img_copy)
    width, height = img_copy.size

    try:
        font = ImageFont.load_default()
    except Exception:
        font = None

    colors = {'click': 'red', 'double_click': 'blue', 'move_mouse': 'green', 'drag_from': 'orange', 'drag_to': 'purple'}

    for i, coord in enumerate(coordinates):
        # Fara outputs normalized 0-1 floats per the system prompt definition;
        # treat anything larger as a raw pixel coordinate.
        pixel_x = int(coord['x'] * width) if coord['x'] <= 1.0 else int(coord['x'])
        pixel_y = int(coord['y'] * height) if coord['y'] <= 1.0 else int(coord['y'])

        color = colors.get(coord['type'], 'red')

        r = 10
        draw.ellipse([pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r], outline=color, width=3)

        label = f"{coord['type']}"
        draw.text((pixel_x + 12, pixel_y - 10), label, fill=color, font=font)

        # Draw drag arrows
        if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
            next_coord = coordinates[i + 1]
            end_x = int(next_coord['x'] * width) if next_coord['x'] <= 1.0 else int(next_coord['x'])
            end_y = int(next_coord['y'] * height) if next_coord['y'] <= 1.0 else int(next_coord['y'])
            draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)

    return img_copy
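
# Scaling note: a normalized click at (0.42, 0.08) on a 1280x720 screenshot is
# drawn at pixel (537, 57), i.e. int(0.42 * 1280) and int(0.08 * 720).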

# -----------------------------------------------------------------------------
# 4. APP LOGIC (ZeroGPU)
# -----------------------------------------------------------------------------

@spaces.GPU(duration=60)
def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
    if input_numpy_image is None:
        return "Please upload an image.", None

    input_pil_image = array_to_image(input_numpy_image)

    # 1. Build the prompt
    prompt = get_navigation_prompt(task, input_pil_image)

    # 2. Generate
    if fara_model is None:
        raise ValueError("Model not loaded")

    navigation_str = fara_model.generate(prompt, max_new_tokens=500)
    print(f"Raw Output: {navigation_str}")

    # 3. Parse
    navigation_str = navigation_str.strip()
    actions = parse_actions_from_response(navigation_str)

    all_coordinates = []
    for action_code in actions:
        coordinates = extract_coordinates_from_action(action_code)
        all_coordinates.extend(coordinates)

    # 4. Visualize
    localized_image = input_pil_image
    if all_coordinates:
        visualized = create_localized_image(input_pil_image, all_coordinates)
        if visualized:
            localized_image = visualized

    return navigation_str, localized_image

# -----------------------------------------------------------------------------
# 5. GRADIO UI
# -----------------------------------------------------------------------------

title = "Fara-7B GUI Operator 🖥️"
description = """
This demo uses **microsoft/Fara-7B** to understand GUI screenshots and generate navigation actions.
Upload a screenshot, define a task, and see the model's planned actions.
"""

# Only keep examples whose image files actually exist, so the script still runs
# if the ./assets/ folder is missing.
example_paths = [
    ("Search for UK Prime Minister", "./assets/google.png"),
    ("Find trending models", "./assets/huggingface.png")
]

safe_examples = []
for label, path in example_paths:
    if os.path.exists(path):
        safe_examples.append([path, label])


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(description)

    with gr.Row():
        with gr.Column():
            input_image_component = gr.Image(label="Upload Interface Screenshot", height=500)
            task_component = gr.Textbox(
                label="Task Instruction",
                placeholder="e.g., Click the Search bar and type 'Hello World'",
                lines=2
            )
            submit_button = gr.Button("Generate Action", variant="primary")

        with gr.Column():
            output_image_component = gr.Image(label="Visualized Action", height=500)
            output_code_component = gr.Textbox(label="Model Output (Code)", lines=10, show_copy_button=True)

    submit_button.click(
        fn=navigate,
        inputs=[input_image_component, task_component],
        outputs=[output_code_component, output_image_component]
    )

    if safe_examples:
        gr.Examples(
            examples=safe_examples,
            inputs=[input_image_component, task_component],
            outputs=[output_code_component, output_image_component],
            fn=navigate,
            cache_examples=True,
        )

if __name__ == "__main__":
    demo.queue().launch()
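
# To run locally (assumed dependency set; a CUDA GPU is realistically required
# for 7B-parameter bf16 inference):
#   pip install gradio spaces torch transformers qwen-vl-utils pillow
#   python app.py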