Spaces: Running on Zero

Update app.py

app.py CHANGED

@@ -1,369 +1,317 @@
-import os
-import re
-import traceback
-from typing import Any, Dict, List
-
-import gradio as gr
-import torch
-from transformers import AutoModelForImageTextToText, AutoProcessor
-from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
-
-# --- Configuration ---
-MODEL_ID = "microsoft/Fara-7B"
-
-# ---------------- Device / DType helpers ----------------
-
-def pick_device() -> str:
-    """
-    On HF Spaces (ZeroGPU), CUDA is only available inside @spaces.GPU calls.
-    We still honor FORCE_DEVICE for local testing.
-    """
-    forced = os.getenv("FORCE_DEVICE", "").lower().strip()
-    if forced in {"cpu", "cuda", "mps"}:
-        return forced
-    if torch.cuda.is_available():
-        return "cuda"
-    if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
-        return "mps"
-    return "cpu"
-
-def pick_dtype(device: str) -> torch.dtype:
-    if device == "cuda":
-        major, _ = torch.cuda.get_device_capability()
-        return torch.bfloat16 if major >= 8 else torch.float16  # Ampere+ -> bf16
-    if device == "mps":
-        return torch.float16
-    return torch.float32  # CPU: FP32 is usually fastest & most stable
-
-def move_to_device(batch, device: str):
-    if isinstance(batch, dict):
-        return {k: (v.to(device, non_blocking=True) if hasattr(v, "to") else v) for k, v in batch.items()}
-    if hasattr(batch, "to"):
-        return batch.to(device, non_blocking=True)
-    return batch
-
-# --- Chat/template helpers ---
-def apply_chat_template_compat(processor, messages: List[Dict[str, Any]]) -> str:
-    tok = getattr(processor, "tokenizer", None)
-    if hasattr(processor, "apply_chat_template"):
-        return processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    if tok is not None and hasattr(tok, "apply_chat_template"):
-        return tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    texts = []
-    for m in messages:
-        for c in m.get("content", []):
-            if isinstance(c, dict) and c.get("type") == "text":
-                texts.append(c.get("text", ""))
-    return "\n".join(texts)
-
-def batch_decode_compat(processor, token_id_batches, **kw):
-    tok = getattr(processor, "tokenizer", None)
-    if tok is not None and hasattr(tok, "batch_decode"):
-        return tok.batch_decode(token_id_batches, **kw)
-    if hasattr(processor, "batch_decode"):
-        return processor.batch_decode(token_id_batches, **kw)
-    raise AttributeError("No batch_decode available on processor or tokenizer.")
-
-def get_image_proc_params(processor) -> Dict[str, int]:
-    ip = getattr(processor, "image_processor", None)
-    return {
-        "patch_size": getattr(ip, "patch_size", 14),
-        "merge_size": getattr(ip, "merge_size", 1),
-        "min_pixels": getattr(ip, "min_pixels", 256 * 256),
-        "max_pixels": getattr(ip, "max_pixels", 1280 * 1280),
-    }
-
-def trim_generated(generated_ids, inputs):
-    in_ids = getattr(inputs, "input_ids", None)
-    if in_ids is None and isinstance(inputs, dict):
-        in_ids = inputs.get("input_ids", None)
-    if in_ids is None:
-        return [out_ids for out_ids in generated_ids]
-    return [out_ids[len(in_seq):] for in_seq, out_ids in zip(in_ids, generated_ids)]
-
-# --- Parsing helper: normalize various UI-TARS click formats to (x, y) ---
-def parse_click_coordinates(text: str, img_w: int, img_h: int):
-    """
-    Returns (x, y) in image coordinates, clamped to bounds, or None.
-    Handles:
-      - Click(start_box='(x,y)') / Click(end_box='(x,y)')
-      - Click(box='(x1,y1,x2,y2)') -> center
-      - Click(x, y)
-      - Click({'x':..., 'y':...}) / Click({"x":...,"y":...})
-    Preference: start_box > end_box when both exist.
-    """
-    s = str(text)
-
-    # start_box / end_box, e.g. Click(start_box='(x,y)')
-    pairs = re.findall(r"(start_box|end_box)='\((\d+),\s*(\d+)\)'", s)
-    start = next(((int(x), int(y)) for k, x, y in pairs if k == "start_box"), None)
-    if start:
-        x, y = start
-        return max(0, min(x, img_w - 1)), max(0, min(y, img_h - 1))
-    end = next(((int(x), int(y)) for k, x, y in pairs if k == "end_box"), None)
-    if end:
-        x, y = end
-        return max(0, min(x, img_w - 1)), max(0, min(y, img_h - 1))
-
-    # box='(x1,y1,x2,y2)' -> center
-    m = re.search(r"box='\((\d+),\s*(\d+),\s*(\d+),\s*(\d+)\)'", s)
-    if m:
-        x1, y1, x2, y2 = map(int, m.groups())
-        cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
-        return max(0, min(cx, img_w - 1)), max(0, min(cy, img_h - 1))
-
-    # Click(x, y)
-    m = re.search(r"Click\((\d+),\s*(\d+)\)", s)
-    if m:
-        x, y = int(m.group(1)), int(m.group(2))
-        return max(0, min(x, img_w - 1)), max(0, min(y, img_h - 1))
-
-    # Click({'x':..., 'y':...}) / Click({"x":...,"y":...})
-    m = re.search(r"['\"]x['\"]\s*:\s*(\d+)\s*,\s*['\"]y['\"]\s*:\s*(\d+)", s)
-    if m:
-        x, y = int(m.group(1)), int(m.group(2))
-        return max(0, min(x, img_w - 1)), max(0, min(y, img_h - 1))
-
-    return None
-
-model_loaded = False
-load_error_message = ""
-
-try:
-    model = AutoModelForImageTextToText.from_pretrained(
-        MODEL_ID,
-        trust_remote_code=True,
-    )
-    # IMPORTANT: use_fast=False to avoid the breaking change error you hit
-    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)
-    model.eval()
-    model_loaded = True
-    print("Model and processor loaded on CPU.")
-except Exception as e:
-    load_error_message = (
-        f"Error loading model/processor: {e}\n"
-        "This might be due to network/model ID/library versions.\n"
-        "Check the full traceback in the logs."
-    )
-    print(load_error_message)
-    traceback.print_exc()
-
-def
-    dtype = pick_dtype(device)
-
-    # Optional perf knobs for CUDA
-    if device == "cuda":
-        torch.backends.cuda.matmul.allow_tf32 = True
-        torch.set_float32_matmul_precision("high")
-
-    # If needed, move model now that GPU is available
-    try:
-        p = next(model.parameters())
-        cur_dev = p.device.type
-        cur_dtype = p.dtype
-    except StopIteration:
-        cur_dev, cur_dtype = "cpu", torch.float32
-
-    if cur_dev != device or cur_dtype != dtype:
-        model.to(device=device, dtype=dtype)
-        model.eval()
-
-    # 1) Resize according to image processor params (safe defaults if missing)
-
-if not model_loaded:
-    gr.Markdown(f"<center>{load_error_message}</center>")
-    gr.Markdown("<center>See logs for the full traceback.</center>")
-else:
-    with gr.Blocks(theme=gr.themes.Soft()) as demo:
-        gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
-        gr.Markdown(article)
-
-        with gr.Row():
-            with gr.Column(scale=1):
-                input_image_component = gr.Image(type="pil", label="Input UI Image", height=400)
-                instruction_component = gr.Textbox(
-                    label="Instruction",
-                    placeholder="e.g., Click the 'Login' button",
-                    info="Type the action you want the model to localize on the image."
-                )
-                submit_button = gr.Button("Localize Click", variant="primary")
-
-            with gr.Column(scale=1):
-                output_coords_component = gr.Textbox(
-                    label="Predicted Coordinates / Action",
-                    interactive=False
-                )
-                output_image_component = gr.Image(
-                    type="pil",
-                    label="Image with Predicted Click Point",
-                    height=400,
-                    interactive=False
-                )
-                runtime_info = gr.Textbox(
-                    label="Runtime Info",
-                    value="device: n/a | dtype: n/a",
-                    interactive=False
-                )
-
-if __name__ == "__main__":
-    demo.launch(
+import os
+import re
+import json
+import time
+import shutil
+import uuid
+import tempfile
+import unicodedata
+from io import BytesIO
+from typing import Tuple, Optional, List, Dict, Any
+
+import gradio as gr
+import numpy as np
+import torch
+import spaces
+from PIL import Image, ImageDraw, ImageFont
+
+# Transformers & Qwen Utils
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+
+# -----------------------------------------------------------------------------
+# 1. CONSTANTS & SYSTEM PROMPT
+# -----------------------------------------------------------------------------
+
+MODEL_ID = "microsoft/Fara-7B"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+# Updated System Prompt to encourage the JSON format the model prefers
+OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
+You need to generate the next action to complete the task.
+
+Output your action inside a <tool_call> block using JSON format.
+Include "coordinate": [x, y] in pixels for interactions.
+
+Examples:
+<tool_call>
+{"name": "User", "arguments": {"action": "click", "coordinate": [400, 300]}}
+</tool_call>
+
+<tool_call>
+{"name": "User", "arguments": {"action": "type", "coordinate": [100, 200], "text": "hello"}}
+</tool_call>
+"""
+
+# -----------------------------------------------------------------------------
+# 2. MODEL DEFINITION
+# -----------------------------------------------------------------------------
+
+class FaraTransformersModel:
+    def __init__(self, model_id: str, to_device: str = "cuda"):
+        print(f"Loading {model_id} on {to_device}...")
+        self.model_id = model_id
+
+        try:
+            self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+            self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+                model_id,
+                trust_remote_code=True,
+                torch_dtype=torch.bfloat16 if to_device == "cuda" else torch.float32,
+                device_map="auto" if to_device == "cuda" else None,
+            )
+            if to_device == "cpu":
+                self.model.to("cpu")
+            self.model.eval()
+            print("Model loaded successfully.")
+        except Exception as e:
+            print(f"Error loading Fara: {e}")
+            raise e
+
+    def generate(self, messages: list[dict], max_new_tokens=512):
+        text = self.processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+
+        inputs = self.processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(self.model.device)
+
+        with torch.no_grad():
+            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
+
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+
+        return self.processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+
+# Initialize Model
+print(f"Initializing model class for {MODEL_ID}...")
+fara_model = FaraTransformersModel(MODEL_ID, to_device=DEVICE)
+
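For reference, the wrapper above can be driven outside Gradio. A minimal sketch, assuming a local screenshot.png and enough memory for the 7B checkpoint (the path and instruction are placeholders, not part of this Space):

    # Hypothetical smoke test; reuses OS_SYSTEM_PROMPT and fara_model from above.
    from PIL import Image

    img = Image.open("screenshot.png")  # placeholder path
    messages = [
        {"role": "system", "content": [{"type": "text", "text": OS_SYSTEM_PROMPT}]},
        {"role": "user", "content": [
            {"type": "image", "image": img},
            {"type": "text", "text": "Instruction: Click the 'Login' button"},
        ]},
    ]
    print(fara_model.generate(messages, max_new_tokens=256))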
+# -----------------------------------------------------------------------------
+# 3. PARSING & VISUALIZATION LOGIC (UPDATED)
+# -----------------------------------------------------------------------------
+
+def array_to_image(image_array: np.ndarray) -> Image.Image:
+    if image_array is None:
+        raise ValueError("No image provided. Please upload an image.")
+    return Image.fromarray(np.uint8(image_array))
+
+def get_navigation_prompt(task, image):
+    return [
+        {"role": "system", "content": [{"type": "text", "text": OS_SYSTEM_PROMPT}]},
+        {"role": "user", "content": [
+            {"type": "image", "image": image},
+            {"type": "text", "text": f"Instruction: {task}"},
+        ]},
+    ]
+
+def parse_tool_calls(response: str) -> list[dict]:
+    """
+    Parses the <tool_call>{JSON}</tool_call> format specifically.
+    Extracts coordinates and action types.
+    """
+    actions = []
+
+    # Regex to find content between <tool_call> tags;
+    # re.DOTALL allows matching across newlines
+    matches = re.findall(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
+
+    for match in matches:
+        try:
+            # Clean up the string just in case
+            json_str = match.strip()
+            data = json.loads(json_str)
+
+            # Access the 'arguments' dictionary
+            args = data.get("arguments", {})
+
+            # Extract coordinates: expecting a list like [399, 496]
+            coords = args.get("coordinate", [])
+            action_type = args.get("action", "unknown")
+            text_content = args.get("text", "")
+
+            if coords and isinstance(coords, list) and len(coords) == 2:
+                actions.append({
+                    "type": action_type,
+                    "x": float(coords[0]),
+                    "y": float(coords[1]),
+                    "text": text_content,
+                    "raw_json": data
+                })
+                print(f"Parsed Action: {action_type} at {coords}")
+            else:
+                print(f"No valid coordinates found in tool call: {json_str}")
+
+        except json.JSONDecodeError as e:
+            print(f"Failed to parse JSON in tool call: {e}\nString was: {match}")
+        except Exception as e:
+            print(f"Unexpected error parsing tool call: {e}")
+
+    return actions
+
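Since parse_tool_calls depends only on re and json, it can be sanity-checked without loading the model. A quick sketch against the format shown in the system prompt (the sample response string is made up):

    sample = """I will click the search box.
    <tool_call>
    {"name": "User", "arguments": {"action": "click", "coordinate": [399, 496]}}
    </tool_call>"""

    acts = parse_tool_calls(sample)
    assert acts[0]["type"] == "click"
    assert (acts[0]["x"], acts[0]["y"]) == (399.0, 496.0)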
+def create_localized_image(original_image: Image.Image, actions: list[dict]) -> Optional[Image.Image]:
+    """Draws markers on the image based on parsed pixel coordinates."""
+    if not actions:
+        return None
+
+    img_copy = original_image.copy()
+    draw = ImageDraw.Draw(img_copy)
+    width, height = img_copy.size
+
+    # Try loading font
+    try:
+        font = ImageFont.load_default()
+    except Exception:
+        font = None
+
+    colors = {
+        'type': 'blue',
+        'click': 'red',
+        'left_click': 'red',
+        'right_click': 'purple',
+        'double_click': 'orange',
+        'unknown': 'green'
+    }
+
+    for i, act in enumerate(actions):
+        x = act['x']
+        y = act['y']
+
+        # Check if normalized (0.0 - 1.0) or absolute (pixels > 1.0).
+        # The logs showed [399, 496], so these are pixels; we check to be safe.
+        if x <= 1.0 and y <= 1.0 and x > 0:
+            # It's normalized, convert to pixels
+            pixel_x = int(x * width)
+            pixel_y = int(y * height)
+        else:
+            # It's absolute pixels
+            pixel_x = int(x)
+            pixel_y = int(y)
+
+        action_type = act['type']
+        color = colors.get(action_type, 'green')
+
+        # Draw Circle Target
+        r = 12  # Radius
+        draw.ellipse(
+            [pixel_x - r, pixel_y - r, pixel_x + r, pixel_y + r],
+            outline=color,
+            width=4
+        )
+
+        # Draw Center Dot
+        draw.ellipse(
+            [pixel_x - 3, pixel_y - 3, pixel_x + 3, pixel_y + 3],
+            fill=color
+        )
+
+        # Draw Label text
+        label_text = f"{action_type}"
+        if act['text']:
+            label_text += f": '{act['text']}'"
+
+        # Draw text background for readability
+        text_pos = (pixel_x + 15, pixel_y - 10)
+        bbox = draw.textbbox(text_pos, label_text, font=font)
+        draw.rectangle(bbox, fill="black")
+        draw.text(text_pos, label_text, fill="white", font=font)
+
+    return img_copy
+
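create_localized_image is likewise model-independent, so the normalized-vs-pixel branch can be eyeballed on a synthetic canvas. A small sketch (the output path is a placeholder):

    # Hypothetical visual check: one absolute-pixel click and one normalized "type" action.
    canvas = Image.new("RGB", (800, 600), "white")
    marked = create_localized_image(canvas, [
        {"type": "click", "x": 400.0, "y": 300.0, "text": ""},
        {"type": "type", "x": 0.25, "y": 0.5, "text": "hello"},
    ])
    marked.save("marked_canvas.png")  # placeholder output path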
+# -----------------------------------------------------------------------------
+# 4. GRADIO LOGIC
+# -----------------------------------------------------------------------------
+
+@spaces.GPU(duration=60)
+def process_screenshot(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
+    if input_numpy_image is None:
+        return "⚠️ Please upload an image first.", None
+
+    # Convert to PIL
+    input_pil_image = array_to_image(input_numpy_image)
+
+    # 1. Build Prompt
+    prompt = get_navigation_prompt(task, input_pil_image)
+
+    # 2. Generate Response
+    if fara_model is None:
+        raise ValueError("Model not loaded")
+
+    print("Generating response...")
+    raw_response = fara_model.generate(prompt, max_new_tokens=500)
+    print(f"Raw Output:\n{raw_response}")
+
+    # 3. Parse Actions
+    actions = parse_tool_calls(raw_response)
+
+    # 4. Visualize
+    output_image = input_pil_image
+    if actions:
+        visualized = create_localized_image(input_pil_image, actions)
+        if visualized:
+            output_image = visualized
+
+    return raw_response, output_image
+
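process_screenshot can also be invoked directly for a quick end-to-end check; on ZeroGPU the @spaces.GPU decorator attaches a GPU for the duration of the call. A minimal sketch (the screenshot path is a placeholder):

    # Hypothetical driver; mirrors what the Gradio button does below.
    arr = np.asarray(Image.open("screenshot.png").convert("RGB"))  # placeholder path
    text_out, img_out = process_screenshot(arr, "Click the 'Login' button")
    print(text_out)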
+# -----------------------------------------------------------------------------
+# 5. GRADIO UI SETUP
+# -----------------------------------------------------------------------------
+
+title = "Fara-7B GUI Operator 🖥️"
+description = """
+This demo uses **microsoft/Fara-7B** to understand GUI screenshots.
+It generates action coordinates which are then parsed and plotted on the image.
+"""
+
+custom_css = """
+#out_img { height: 600px; object-fit: contain; }
+"""
+
+with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
+    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
+    gr.Markdown(description)
+
+    with gr.Row():
+        with gr.Column():
+            input_image = gr.Image(label="Upload Screenshot", height=500)
+            task_input = gr.Textbox(
+                label="Task Instruction",
+                placeholder="e.g. Input the server address readyforquantum.com...",
+                lines=2
+            )
+            submit_btn = gr.Button("Analyze UI & Generate Action", variant="primary")
+
+        with gr.Column():
+            output_image = gr.Image(label="Visualized Action Points", elem_id="out_img", height=500)
+            output_text = gr.Textbox(label="Raw Model Output", lines=8, show_copy_button=True)
+
+    # Wire up the button
+    submit_btn.click(
+        fn=process_screenshot,
+        inputs=[input_image, task_input],
+        outputs=[output_text, output_image]
+    )
+
+    # Example for quick testing
+    gr.Examples(
+        examples=[
+            ["./assets/google.png", "Search for 'Hugging Face'"],
+        ],
+        inputs=[input_image, task_input],
+        label="Quick Examples"
+    )
+
+if __name__ == "__main__":
+    demo.queue().launch()