Spaces:

prithivMLmods
/

CUA-GUI-Operator

Sleeping

App Files Files Community

prithivMLmods committed on Dec 6, 2025

Commit

c875d85

verified ·

1 Parent(s): 36aecd2

Update app.py

Browse files

Files changed (1) hide show

app.py +320 -359

app.py CHANGED Viewed

@@ -1,106 +1,159 @@
 import os
 import re
 import json
-import time
-import shutil
-import uuid
-import tempfile
-import unicodedata
-from io import BytesIO
-from typing import Tuple, Optional, List, Dict, Any
-import gradio as gr
 import numpy as np
 import torch
 import spaces
 from PIL import Image, ImageDraw, ImageFont
-# Transformers imports
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
 )
 from qwen_vl_utils import process_vision_info
-# Selenium Imports
-from selenium import webdriver
-from selenium.webdriver.chrome.service import Service as ChromeService
-from selenium.webdriver.chrome.options import Options as ChromeOptions
-from selenium.webdriver.common.action_chains import ActionChains
-from selenium.webdriver.common.by import By
-from selenium.webdriver.common.keys import Keys
-from webdriver_manager.chrome import ChromeDriverManager
 # -----------------------------------------------------------------------------
-# CONSTANTS & CONFIG
 # -----------------------------------------------------------------------------
-MODEL_ID = "microsoft/Fara-7B"
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-WIDTH = 1024
-HEIGHT = 768
-TMP_DIR = "./tmp"
-if not os.path.exists(TMP_DIR):
-    os.makedirs(TMP_DIR)
-# System Prompt
-# We ask for Python code, but we will also handle the JSON format the model seems to prefer.
-OS_SYSTEM_PROMPT = f"""You are a GUI automation agent controlling a Chrome browser.
-Current Screen Resolution: {WIDTH}x{HEIGHT}.
-You will receive a screenshot and a task. Generate the next action to complete the task.
-Supported Actions (Python Format):
-1. `click(x=200, y=300)`: Left click at specific pixel coordinates.
-2. `type_text(text="hello")`: Type text.
-3. `press_key(key="enter")`: Press a key.
-4. `scroll(amount=2, direction="down")`: Scroll.
-5. `open_url(url="https://google.com")`: Open a URL.
-Important:
-- Use precise PIXEL coordinates from the screenshot.
-- Wrap your action in <code> tags.
-- If you need to search, click the search bar first, then type, then press enter.
 """
 # -----------------------------------------------------------------------------
-# MODEL WRAPPER
 # -----------------------------------------------------------------------------
-class FaraModelWrapper:
     def __init__(self, model_id: str, to_device: str = "cuda"):
-        print(f"Loading {model_id} on {to_device}...")
         self.model_id = model_id
         try:
             self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
             self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                 model_id,
                 trust_remote_code=True,
-                torch_dtype=torch.bfloat16 if to_device == "cuda" else torch.float32,
                 device_map="auto" if to_device == "cuda" else None,
             )
             if to_device == "cpu":
                 self.model.to("cpu")
-            self.model.eval()
             print("Model loaded successfully.")
         except Exception as e:
-            print(f"Failed to load Fara, falling back to Qwen2.5-VL-7B. Error: {e}")
-            fallback_id = "Qwen/Qwen2.5-VL-7B-Instruct"
-            self.processor = AutoProcessor.from_pretrained(fallback_id, trust_remote_code=True)
-            self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-                fallback_id,
-                trust_remote_code=True,
-                torch_dtype=torch.bfloat16 if to_device == "cuda" else torch.float32,
-                device_map="auto",
-            )
-    def generate(self, messages: list[dict], max_new_tokens=512):
         text = self.processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
         image_inputs, video_inputs = process_vision_info(messages)
         inputs = self.processor(
             text=[text],
             images=image_inputs,
@@ -110,12 +163,10 @@ class FaraModelWrapper:
         )
         inputs = inputs.to(self.model.device)
-        with torch.no_grad():
-            generated_ids = self.model.generate(
-                **inputs,
-                max_new_tokens=max_new_tokens
-            )
         generated_ids_trimmed = [
             out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
@@ -126,330 +177,240 @@ class FaraModelWrapper:
         return output_text
-# Initialize global model
-model = FaraModelWrapper(MODEL_ID, DEVICE)
 # -----------------------------------------------------------------------------
-# SELENIUM SANDBOX
 # -----------------------------------------------------------------------------
-def get_system_chrome_path():
-    paths = ["/usr/bin/chromium", "/usr/bin/chromium-browser", "/usr/bin/google-chrome"]
-    for p in paths:
-        if os.path.exists(p): return p
-    return None
-class SeleniumSandbox:
-    def __init__(self, width=1024, height=768):
-        self.width = width
-        self.height = height
-        self.tmp_dir = tempfile.mkdtemp(prefix="chrome_sandbox_")
-        chrome_opts = ChromeOptions()
-        binary_path = get_system_chrome_path()
-        if binary_path: chrome_opts.binary_location = binary_path
-        chrome_opts.add_argument("--headless=new")
-        chrome_opts.add_argument(f"--user-data-dir={self.tmp_dir}")
-        chrome_opts.add_argument(f"--window-size={width},{height}")
-        chrome_opts.add_argument("--no-sandbox")
-        chrome_opts.add_argument("--disable-dev-shm-usage")
-        chrome_opts.add_argument("--disable-gpu")
-        try:
-            system_driver_path = "/usr/bin/chromedriver"
-            if os.path.exists(system_driver_path):
-                service = ChromeService(executable_path=system_driver_path)
-            else:
-                service = ChromeService(ChromeDriverManager().install())
-            self.driver = webdriver.Chrome(service=service, options=chrome_opts)
-            self.driver.set_window_size(width, height)
-            print("Selenium started.")
-        except Exception as e:
-            print(f"Selenium init failed: {e}")
-            shutil.rmtree(self.tmp_dir, ignore_errors=True)
-            raise e
-    def get_screenshot(self):
-        return Image.open(BytesIO(self.driver.get_screenshot_as_png()))
-    def execute_action(self, action_data: dict):
-        """Execute parsed action on the browser"""
-        action_type = action_data.get('type')
-        print(f"Executing action: {action_data}")
-        try:
-            actions = ActionChains(self.driver)
-            body = self.driver.find_element(By.TAG_NAME, "body")
-            # Helper: Handle both normalized (0-1) and pixel coordinates
-            def get_coords(data):
-                x, y = data.get('x', 0), data.get('y', 0)
-                if x <= 1.0 and y <= 1.0 and x > 0: # Likely normalized
-                    x = int(x * self.width)
-                    y = int(y * self.height)
-                else: # Likely pixels
-                    x = int(x)
-                    y = int(y)
-                return x, y
-            if action_type in ['click', 'left_click', 'right_click', 'double_click']:
-                x_px, y_px = get_coords(action_data)
-                # Reset pointer to top-left then move
-                actions.move_to_element_with_offset(body, 0, 0)
-                actions.move_by_offset(x_px, y_px)
-                if action_type in ['click', 'left_click']: actions.click()
-                elif action_type == 'right_click': actions.context_click()
-                elif action_type == 'double_click': actions.double_click()
-                actions.perform()
-                return f"Clicked at {x_px}, {y_px}"
-            elif action_type == 'type_text':
-                text = action_data.get('text', '')
-                press_enter = action_data.get('press_enter', False)
-                # Check if this type action came with coordinates (from JSON log)
-                if 'x' in action_data and 'y' in action_data:
-                     x_px, y_px = get_coords(action_data)
-                     actions.move_to_element_with_offset(body, 0, 0)
-                     actions.move_by_offset(x_px, y_px)
-                     actions.click()
-                actions.send_keys(text)
-                if press_enter:
-                    actions.send_keys(Keys.ENTER)
-                actions.perform()
-                return f"Typed '{text}'"
-            elif action_type == 'press_key':
-                key_name = action_data.get('key', '').lower()
-                k = getattr(Keys, key_name.upper(), None)
-                if not k:
-                    if key_name == "enter": k = Keys.ENTER
-                    elif key_name == "space": k = Keys.SPACE
-                if k:
-                    actions.send_keys(k)
-                    actions.perform()
-                    return f"Pressed {key_name}"
-            elif action_type == 'scroll':
-                amount = action_data.get('amount', 2)
-                scroll_y = amount * 100
-                self.driver.execute_script(f"window.scrollBy(0, {scroll_y});")
-                return "Scrolled"
-            elif action_type == 'open_url':
-                url = action_data.get('url', '')
-                if not url.startswith('http'): url = 'https://' + url
-                self.driver.get(url)
-                time.sleep(2) # Wait for load
-                return f"Opened {url}"
-            return f"Unknown action {action_type}"
-        except Exception as e:
-            return f"Action failed: {e}"
-    def cleanup(self):
-        try: self.driver.quit()
-        except: pass
-        shutil.rmtree(self.tmp_dir, ignore_errors=True)
 # -----------------------------------------------------------------------------
-# PARSING LOGIC (Fixed to handle JSON logs)
 # -----------------------------------------------------------------------------
-def parse_model_response(response: str) -> dict:
-    """
-    Parses both:
-    1. <code>click(x=...)</code> (Python style)
-    2. <tool_call>{...}</tool_call> (JSON style seen in logs)
-    """
-    # Check for JSON Tool Call first (Priority based on logs)
-    tool_match = re.search(r"<tool_call>(.*?)</tool_call>", response, re.DOTALL)
-    if tool_match:
-        try:
-            tool_data = json.loads(tool_match.group(1))
-            name = tool_data.get("name")
-            args = tool_data.get("arguments", {})
-            # Map JSON schema to our internal schema
-            if name == "Navigate":
-                if "url" in args:
-                    return {"type": "open_url", "url": args["url"]}
-                elif "action" in args:
-                    action_sub = args["action"]
-                    coords = args.get("coordinate", [0, 0])
-                    x, y = coords[0], coords[1]
-                    if action_sub == "left_click":
-                        return {"type": "click", "x": x, "y": y}
-                    elif action_sub == "type":
-                        text = args.get("text", "")
-                        enter = args.get("press_enter", False)
-                        return {"type": "type_text", "text": text, "x": x, "y": y, "press_enter": enter}
-            elif name == "Type":
-                return {
-                    "type": "type_text",
-                    "text": args.get("text", ""),
-                    "press_enter": args.get("press_enter", False)
-                }
-        except Exception as e:
-            print(f"JSON Parse Error: {e}")
-    # Fallback to Python Code Block
-    code_match = re.search(r"<code>\s*(.*?)\s*</code>", response, re.DOTALL)
-    if code_match:
-        action_str = code_match.group(1)
-        # Regex for Python style
-        coord_match = re.match(r"(\w+)\s*\(\s*x\s*=\s*([0-9.]+)\s*,\s*y\s*=\s*([0-9.]+)\s*\)", action_str)
-        if coord_match:
-            return {"type": coord_match.group(1), "x": float(coord_match.group(2)), "y": float(coord_match.group(3))}
-        url_match = re.match(r"open_url\s*\(\s*url\s*=\s*[\"'](.*?)[\"']\s*\)", action_str)
-        if url_match: return {"type": "open_url", "url": url_match.group(1)}
-        text_match = re.match(r"type_text\s*\(\s*text\s*=\s*[\"'](.*?)[\"']\s*\)", action_str)
-        if text_match: return {"type": "type_text", "text": text_match.group(1)}
-    return {}
 # -----------------------------------------------------------------------------
-# MAIN LOOP
 # -----------------------------------------------------------------------------
-@spaces.GPU(duration=180)
-def agent_step(task_instruction: str, history: list, sandbox_state: dict):
-    # Retrieve or Create Sandbox
-    if 'uuid' not in sandbox_state:
-        sandbox_state['uuid'] = str(uuid.uuid4())
-    sid = sandbox_state['uuid']
-    if sid not in SANDBOX_REGISTRY:
-        SANDBOX_REGISTRY[sid] = SeleniumSandbox(WIDTH, HEIGHT)
-    sandbox = SANDBOX_REGISTRY[sid]
-    # 1. Get Screenshot
-    screenshot = sandbox.get_screenshot()
-    # 2. Construct Prompt
-    # We append the history of actions to help the model know state
-    history_text = "\n".join([h.split('\nAction:')[1].strip() if 'Action:' in h else '' for h in history[-3:]])
-    messages = [
-        {"role": "system", "content": [{"type": "text", "text": OS_SYSTEM_PROMPT}]},
-        {"role": "user", "content": [
-            {"type": "image", "image": screenshot},
-            {"type": "text", "text": f"Task: {task_instruction}\nPrevious Actions Summary: {history_text}"}
-        ]}
-    ]
-    # 3. Model Inference
-    response = model.generate(messages)
-    # 4. Parse Action
-    action_data = parse_model_response(response)
-    log_entry = f"Step: {len(history)+1}\nThought: {response}\nParsed: {action_data}"
-    # 5. Execute Action
-    if action_data:
-        execution_result = sandbox.execute_action(action_data)
-        # Visual Marker
-        if 'x' in action_data and 'y' in action_data:
-            draw = ImageDraw.Draw(screenshot)
-            # Handle mixed coord types for drawing
-            x_px = action_data['x']
-            y_px = action_data['y']
-            if x_px <= 1.0: x_px *= WIDTH
-            if y_px <= 1.0: y_px *= HEIGHT
-            r = 10
-            draw.ellipse((x_px-r, y_px-r, x_px+r, y_px+r), outline="red", width=3)
-    else:
-        execution_result = "No valid action parsed."
-    log_entry += f"\nResult: {execution_result}"
-    history.append(log_entry)
-    return screenshot, history, sandbox_state
-# Global registry
-SANDBOX_REGISTRY = {}
-def cleanup_sandbox(sandbox_state):
-    sid = sandbox_state.get('uuid')
-    if sid and sid in SANDBOX_REGISTRY:
-        SANDBOX_REGISTRY[sid].cleanup()
-        del SANDBOX_REGISTRY[sid]
-    return [], {}
-# -----------------------------------------------------------------------------
-# GRADIO UI
-# -----------------------------------------------------------------------------
-def run_task_loop(task, history, state):
-    max_steps = 15
-    for i in range(max_steps):
-        try:
-            screenshot, new_history, new_state = agent_step(task, history, state)
-            history = new_history
-            logs_text = "\n\n" + "="*40 + "\n\n".join(history)
-            yield screenshot, logs_text, state
-            if "Done" in history[-1] or "finished" in history[-1].lower():
-                break
-            time.sleep(1)
-        except Exception as e:
-            error_msg = f"Error in loop: {e}"
-            history.append(error_msg)
-            yield None, "\n".join(history), state
-            break
-custom_css = "#view_img { height: 600px; object-fit: contain; }"
-with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
-    state = gr.State({})
-    history = gr.State([])
-    gr.Markdown("# 🤖 Fara CUA - Chrome Agent")
     with gr.Row():
         with gr.Column(scale=1):
-            task_input = gr.Textbox(label="Task Instruction", value="Go to google.com and search for 'SpaceX'")
-            run_btn = gr.Button("Run Agent", variant="primary")
-            clear_btn = gr.Button("Reset / Clear")
-        with gr.Column(scale=2):
-            browser_view = gr.Image(label="Live Browser View", elem_id="view_img", interactive=False)
-    logs_output = gr.Textbox(label="Agent Logs", lines=15, interactive=False)
-    run_btn.click(
-        fn=run_task_loop,
-        inputs=[task_input, history, state],
-        outputs=[browser_view, logs_output, state]
-    )
-    clear_btn.click(
-        fn=cleanup_sandbox,
-        inputs=[state],
-        outputs=[history, state]
-    ).then(
-        lambda: (None, ""),
-        outputs=[browser_view, logs_output]
     )
 if __name__ == "__main__":
-    demo.launch(share=True)

 import os
 import re
 import json
 import numpy as np
 import torch
 import spaces
+import gradio as gr
 from PIL import Image, ImageDraw, ImageFont
+from typing import Tuple, Optional, List, Dict, Any
+# Transformers & Qwen Utils
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
 )
 from qwen_vl_utils import process_vision_info
 # -----------------------------------------------------------------------------
+# 1. PROMPTS (from prompt.py)
 # -----------------------------------------------------------------------------
+OS_ACTIONS = """
+def final_answer(answer: any) -> any:
+    \"\"\"
+    Provides a final answer to the given problem.
+    Args:
+        answer: The final answer to the problem
+    \"\"\"
+def move_mouse(self, x: float, y: float) -> str:
+    \"\"\"
+    Moves the mouse cursor to the specified coordinates
+    Args:
+        x: The x coordinate (horizontal position)
+        y: The y coordinate (vertical position)
+    \"\"\"
+def click(x: Optional[float] = None, y: Optional[float] = None) -> str:
+    \"\"\"
+    Performs a left-click at the specified normalized coordinates
+    Args:
+        x: The x coordinate (horizontal position)
+        y: The y coordinate (vertical position)
+    \"\"\"
+def double_click(x: Optional[float] = None, y: Optional[float] = None) -> str:
+    \"\"\"
+    Performs a double-click at the specified normalized coordinates
+    Args:
+        x: The x coordinate (horizontal position)
+        y: The y coordinate (vertical position)
+    \"\"\"
+def type(text: str) -> str:
+    \"\"\"
+    Types the specified text at the current cursor position.
+    Args:
+        text: The text to type
+    \"\"\"
+def press(keys: str | list[str]) -> str:
+    \"\"\"
+    Presses a keyboard key
+    Args:
+        keys: The key or list of keys to press (e.g. "enter", "space", "backspace", "ctrl", etc.).
+    \"\"\"
+def navigate_back() -> str:
+    \"\"\"
+    Goes back to the previous page in the browser. If using this tool doesn't work, just click the button directly.
+    \"\"\"
+def drag(from_coord: list[float], to_coord: list[float]) -> str:
+    \"\"\"
+    Clicks [x1, y1], drags mouse to [x2, y2], then release click.
+    Args:
+        x1: origin x coordinate
+        y1: origin y coordinate
+        x2: end x coordinate
+        y2: end y coordinate
+    \"\"\"
+def scroll(direction: Literal["up", "down"] = "down", amount: int = 1) -> str:
+    \"\"\"
+    Moves the mouse to selected coordinates, then uses the scroll button: this could scroll the page or zoom, depending on the app. DO NOT use scroll to move through linux desktop menus.
+    Args:
+        x: The x coordinate (horizontal position) of the element to scroll/zoom, defaults to None to not focus on specific coordinates
+        y: The y coordinate (vertical position) of the element to scroll/zoom, defaults to None to not focus on specific coordinates
+        direction: The direction to scroll ("up" or "down"), defaults to "down". For zoom, "up" zooms in, "down" zooms out.
+        amount: The amount to scroll. A good amount is 1 or 2.
+    \"\"\"
+def wait(seconds: float) -> str:
+    \"\"\"
+    Waits for the specified number of seconds. Very useful in case the prior order is still executing (for example starting very heavy applications like browsers or office apps)
+    Args:
+        seconds: Number of seconds to wait, generally 2 is enough.
+    \"\"\"
+"""
+OS_SYSTEM_PROMPT = f"""You are a helpful GUI agent. You’ll be given a task and a screenshot of the screen. Complete the task using Python function calls.
+For each step:
+	•	First, <think></think> to express the thought process guiding your next action and the reasoning behind it.
+	•	Then, use <code></code> to perform the action. it will be executed in a stateful environment.
+The following functions are exposed to the Python interpreter:
+<code>
+{OS_ACTIONS}
+</code>
+The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
 """
 # -----------------------------------------------------------------------------
+# 2. MODEL WRAPPER (Modified for Fara/QwenVL)
 # -----------------------------------------------------------------------------
+class TransformersModel:
     def __init__(self, model_id: str, to_device: str = "cuda"):
+        print(f"Loading model: {model_id}...")
         self.model_id = model_id
+        # Load Processor
         try:
             self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+        except Exception as e:
+            print(f"Error loading processor: {e}")
+            raise e
+        # Load Model
+        try:
             self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                 model_id,
                 trust_remote_code=True,
+                torch_dtype=torch.bfloat16,
                 device_map="auto" if to_device == "cuda" else None,
             )
             if to_device == "cpu":
                 self.model.to("cpu")
             print("Model loaded successfully.")
         except Exception as e:
+            print(f"Error loading Fara/Qwen model: {e}. Ensure you have access/internet.")
+            raise e
+    def generate(self, messages: list[dict], **kwargs):
+        # 1. Prepare text prompt using chat template
         text = self.processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
+        # 2. Process images/videos
         image_inputs, video_inputs = process_vision_info(messages)
+        # 3. Create model inputs
         inputs = self.processor(
             text=[text],
             images=image_inputs,
         )
         inputs = inputs.to(self.model.device)
+        # 4. Generate
+        generated_ids = self.model.generate(**inputs, **kwargs)
+        # 5. Decode (trimming input tokens)
         generated_ids_trimmed = [
             out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
         return output_text
 # -----------------------------------------------------------------------------
+# 3. HELPER FUNCTIONS
 # -----------------------------------------------------------------------------
+def array_to_image(image_array: np.ndarray) -> Image.Image:
+    """Convert a numpy image array into a PIL image.
+
+    Args:
+        image_array: Image data as a numpy array; cast to uint8 before conversion.
+
+    Returns:
+        The PIL image built from the array.
+
+    Raises:
+        ValueError: If ``image_array`` is None (nothing was uploaded).
+    """
+    if image_array is not None:
+        as_uint8 = np.uint8(image_array)
+        return Image.fromarray(as_uint8)
+    raise ValueError("No image provided. Please upload an image before submitting.")
+def get_navigation_prompt(task, image):
+    """Build the two-message chat (system + user) sent to the model.
+
+    Args:
+        task: The user's instruction text.
+        image: The screenshot placed in the user message's image slot.
+
+    Returns:
+        A list with the system message (OS_SYSTEM_PROMPT) followed by the
+        user message holding the image and the instruction text.
+    """
+    system_message = {
+        "role": "system",
+        "content": [{"type": "text", "text": OS_SYSTEM_PROMPT}],
+    }
+    # No action history is tracked, so "Previous actions" is always "None".
+    instruction_text = f"Instruction: {task}\n\nPrevious actions:\nNone"
+    user_message = {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": image},
+            {"type": "text", "text": instruction_text},
+        ],
+    }
+    return [system_message, user_message]
+def parse_actions_from_response(response: str) -> list[str]:
+    """Extract action code snippets from a model response.
+
+    The contents of every ``<code>...</code>`` block are returned when at
+    least one block is present. Otherwise the response is scanned line by
+    line for bare calls to one of the functions advertised in the system
+    prompt, in case the model forgot the tags.
+
+    Args:
+        response: Raw text generated by the model.
+
+    Returns:
+        The action snippets in order of appearance; empty when none are found.
+    """
+    tagged = re.findall(r"<code>\s*(.*?)\s*</code>", response, re.DOTALL)
+    if tagged:
+        return tagged
+    # Fallback covers every function exposed in the prompt. A line only counts
+    # when the name is directly followed by "(", so prose such as
+    # "click on the search button" is not mistaken for an action.
+    known_calls = (
+        "final_answer", "move_mouse", "double_click", "click", "type",
+        "press", "navigate_back", "drag", "scroll", "wait",
+    )
+    call_start = re.compile(r"(?:%s)\s*\(" % "|".join(known_calls))
+    stripped_lines = (line.strip() for line in response.split('\n'))
+    return [line for line in stripped_lines if call_start.match(line)]
+def extract_coordinates_from_action(action_code: str) -> list[dict]:
+    """Extract the coordinates targeted by localization actions.
+
+    Recognizes ``click``, ``double_click``, ``move_mouse`` and ``drag`` calls
+    with positional or keyword arguments. Results are grouped by action type
+    in that order, not by position in ``action_code``.
+
+    Args:
+        action_code: Action source text, e.g. ``"click(x=0.5, y=0.3)"``.
+
+    Returns:
+        One dict per coordinate with keys ``type``, ``x``, ``y`` and
+        ``action``. A drag yields a ``drag_from`` entry followed by a
+        ``drag_to`` entry. Values are returned as written by the model
+        (normalized or pixel); no conversion happens here.
+    """
+    # A well-formed non-negative number; unlike "[0-9.]+" this cannot match
+    # "..." or "1.2.3", which would make float() raise.
+    num = r'(\d+(?:\.\d*)?|\.\d+)'
+    point = rf'\s*(?:x\s*=\s*)?{num}(?:\s*,\s*(?:y\s*=\s*)?{num})?\s*'
+    pair = rf'\[\s*{num}\s*,\s*{num}\s*\]'
+    # (?<!\w) stops "click(" from also matching inside "double_click(".
+    patterns = {
+        'click': rf'(?<!\w)click\({point}\)',
+        'double_click': rf'(?<!\w)double_click\({point}\)',
+        'move_mouse': rf'(?<!\w)move_mouse\(\s*(?:self\s*,\s*)?(?:x\s*=\s*)?{num}\s*,\s*(?:y\s*=\s*)?{num}\s*\)',
+        'drag': rf'(?<!\w)drag\(\s*(?:from_coord\s*=\s*)?{pair}\s*,\s*(?:to_coord\s*=\s*)?{pair}\s*\)',
+    }
+    localization_actions = []
+    for action_type, pattern in patterns.items():
+        for match in re.finditer(pattern, action_code):
+            if action_type == 'drag':
+                # Drag has from and to coordinates
+                from_x, from_y, to_x, to_y = map(float, match.groups())
+                localization_actions.append({
+                    'type': 'drag_from', 'x': from_x, 'y': from_y, 'action': action_type
+                })
+                localization_actions.append({
+                    'type': 'drag_to', 'x': to_x, 'y': to_y, 'action': action_type
+                })
+                continue
+            x_val = float(match.group(1))
+            # Kept from the original behavior: a call that gives only one
+            # number reuses it for y.
+            y_val = float(match.group(2)) if match.group(2) else x_val
+            localization_actions.append({
+                'type': action_type,
+                'x': x_val,
+                'y': y_val,
+                'action': action_type
+            })
+    return localization_actions
+def _to_pixel_coords(x: float, y: float, width: int, height: int) -> Tuple[int, int]:
+    """Map a coordinate pair to pixels; pairs with both values <= 1.0 are treated as normalized."""
+    if x <= 1.0 and y <= 1.0:
+        return int(x * width), int(y * height)
+    return int(x), int(y)
+def create_localized_image(original_image: Image.Image, coordinates: list[dict]) -> Optional[Image.Image]:
+    """Create a copy of the image with localization markers drawn on it.
+
+    Args:
+        original_image: The screenshot to annotate; it is not modified.
+        coordinates: Dicts with ``type``, ``x`` and ``y`` keys. Coordinates
+            may be normalized or in pixels.
+
+    Returns:
+        An annotated copy of ``original_image``, or None when ``coordinates``
+        is empty.
+    """
+    if not coordinates:
+        return None
+    img_copy = original_image.copy()
+    draw = ImageDraw.Draw(img_copy)
+    width, height = img_copy.size
+    try:
+        font = ImageFont.load_default()
+    except Exception:
+        # Best effort: labels are drawn without an explicit font when the
+        # default font cannot be loaded.
+        font = None
+    colors = {
+        'click': 'red', 'double_click': 'blue', 'move_mouse': 'green',
+        'drag_from': 'orange', 'drag_to': 'purple'
+    }
+    radius = 8
+    for i, coord in enumerate(coordinates):
+        pixel_x, pixel_y = _to_pixel_coords(coord['x'], coord['y'], width, height)
+        color = colors.get(coord['type'], 'red')
+        # Draw Circle
+        draw.ellipse([pixel_x - radius, pixel_y - radius, pixel_x + radius, pixel_y + radius],
+                     fill=color, outline='white', width=2)
+        # Draw Label
+        label = f"{coord['type']}"
+        text_pos = (pixel_x + 10, pixel_y - 10)
+        if font:
+            draw.text(text_pos, label, fill=color, font=font)
+        else:
+            draw.text(text_pos, label, fill=color)
+        # Connect a drag origin to the drag target that immediately follows it
+        if coord['type'] == 'drag_from' and i + 1 < len(coordinates) and coordinates[i + 1]['type'] == 'drag_to':
+            next_coord = coordinates[i + 1]
+            end_x, end_y = _to_pixel_coords(next_coord['x'], next_coord['y'], width, height)
+            draw.line([pixel_x, pixel_y, end_x, end_y], fill='orange', width=3)
+    return img_copy
 # -----------------------------------------------------------------------------
+# 4. INITIALIZATION
 # -----------------------------------------------------------------------------
+# Using Fara-7B, with Qwen2.5-VL-7B-Instruct as the fallback
+MODEL_ID = "microsoft/Fara-7B"
+print(f"Initializing {MODEL_ID}...")
+# Global model instance, created once at import time.
+# For Gradio Spaces, global init is standard.
+_model_device = "cuda" if torch.cuda.is_available() else "cpu"
+try:
+    model = TransformersModel(model_id=MODEL_ID, to_device=_model_device)
+except Exception as e:
+    # Report why the primary load failed instead of discarding the error;
+    # a failure of the fallback itself still propagates.
+    print(f"Failed to load Fara ({e}). Trying fallback Qwen...")
+    model = TransformersModel(model_id="Qwen/Qwen2.5-VL-7B-Instruct", to_device=_model_device)
 # -----------------------------------------------------------------------------
+# 5. GRADIO APP
 # -----------------------------------------------------------------------------
+@spaces.GPU
+def navigate(input_numpy_image: np.ndarray, task: str) -> Tuple[str, Optional[Image.Image]]:
+    """Run one inference step and visualize the predicted action targets.
+
+    Args:
+        input_numpy_image: The uploaded screenshot as a numpy array, or None.
+        task: The user's instruction.
+
+    Returns:
+        The raw model response and the screenshot as a PIL image, annotated
+        with markers when coordinates were found. When no image was uploaded,
+        a prompt message and None.
+    """
+    if input_numpy_image is None:
+        return "Please upload an image.", None
+    screenshot = array_to_image(input_numpy_image)
+    messages = get_navigation_prompt(task, screenshot)
+    print("Generating response...")
+    response_str = model.generate(messages, max_new_tokens=500)
+    print(f"Model Response: {response_str}")
+    # Collect the coordinates of every parsed action, in order
+    all_coordinates = [
+        coord
+        for action_code in parse_actions_from_response(response_str)
+        for coord in extract_coordinates_from_action(action_code)
+    ]
+    # Fall back to the unannotated screenshot when nothing was localized
+    if not all_coordinates:
+        return response_str, screenshot
+    return response_str, create_localized_image(screenshot, all_coordinates)
+title = "Fara-7B GUI Operator 🤖"
+description = """
+### Fara GUI Agent Demo
+Upload a screenshot and give an instruction. The model will analyze the UI and output the Python code to execute the action.
+This demo visualizes where the model wants to click or drag.
+"""
+# UI layout: screenshot upload on top, then the instruction box (left) and
+# the generated code (right). The button wires both inputs into `navigate`.
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")
+    gr.Markdown(description)
+    with gr.Row():
+        input_image = gr.Image(label="Upload Screenshot", height=500, type="numpy")
     with gr.Row():
         with gr.Column(scale=1):
+            task_input = gr.Textbox(
+                label="Instruction",
+                placeholder="e.g. Click on the Search button...",
+                lines=2
+            )
+            submit_btn = gr.Button("Generate Action", variant="primary")
+        with gr.Column(scale=1):
+            output_code = gr.Textbox(label="Generated Python Code", lines=10)
+    # Output image gets updated with markers
+    # NOTE(review): `input_image` is both an input and an output here, so the
+    # annotated image replaces the uploaded screenshot; a second click would
+    # presumably send the already-annotated image to the model. Confirm this
+    # is intended.
+    submit_btn.click(
+        fn=navigate,
+        inputs=[input_image, task_input],
+        outputs=[output_code, input_image]
     )
+    # Optional: Examples
+    # gr.Examples(...)
 if __name__ == "__main__":
+    demo.launch()