prithivMLmods commited on
Commit
7e47e30
·
verified ·
1 Parent(s): 640c489

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +326 -641
app.py CHANGED
@@ -1,21 +1,25 @@
1
- import json
2
  import os
3
- import shutil
4
  import time
 
5
  import uuid
6
  import tempfile
7
- import atexit
8
  import unicodedata
9
  from io import BytesIO
10
- from threading import Timer
11
- from typing import Any, Dict, List, Optional
12
- from datetime import datetime
13
 
14
  import gradio as gr
 
15
  import torch
16
  import spaces
17
- from dotenv import load_dotenv
18
- from PIL import Image, ImageDraw
 
 
 
 
 
 
19
 
20
  # Selenium Imports
21
  from selenium import webdriver
@@ -26,190 +30,116 @@ from selenium.webdriver.common.by import By
26
  from selenium.webdriver.common.keys import Keys
27
  from webdriver_manager.chrome import ChromeDriverManager
28
 
29
- # Smolagents imports
30
- from smolagents import CodeAgent, tool, AgentImage
31
- from smolagents.memory import ActionStep, TaskStep
32
- from smolagents.models import ChatMessage, Model, MessageRole
33
- from smolagents.gradio_ui import GradioUI, stream_to_gradio
34
- from smolagents.monitoring import LogLevel
35
-
36
- # Transformers for Fara Model
37
- from transformers import (
38
- Qwen2_5_VLForConditionalGeneration,
39
- AutoProcessor,
40
- )
41
- from qwen_vl_utils import process_vision_info
42
-
43
- load_dotenv(override=True)
44
-
45
  # -----------------------------------------------------------------------------
46
- # CONFIGURATION & CONSTANTS
47
  # -----------------------------------------------------------------------------
48
 
49
- HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
50
- if HF_TOKEN:
51
- from huggingface_hub import login
52
- login(token=HF_TOKEN)
53
-
54
- # Browser Sandbox Config
55
  WIDTH = 1024
56
  HEIGHT = 768
57
- TMP_DIR = "./tmp/"
58
  if not os.path.exists(TMP_DIR):
59
  os.makedirs(TMP_DIR)
60
 
61
- # -----------------------------------------------------------------------------
62
- # MODEL INITIALIZATION (Fara-7B / Qwen2.5-VL)
63
- # -----------------------------------------------------------------------------
64
-
65
- print("Loading Fara Model... This may take a moment.")
66
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
67
- MODEL_ID_F = "microsoft/Fara-7B"
68
-
69
- # Global model variables
70
- model_f = None
71
- processor_f = None
72
-
73
- try:
74
- processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
75
- model_f = Qwen2_5_VLForConditionalGeneration.from_pretrained(
76
- MODEL_ID_F,
77
- trust_remote_code=True,
78
- torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
79
- device_map="auto",
80
- )
81
- print(f"Fara Model loaded successfully on {DEVICE}")
82
- except Exception as e:
83
- print(f"Error loading Fara Model: {e}")
84
- print("Falling back to Qwen/Qwen2.5-VL-7B-Instruct...")
85
- try:
86
- MODEL_ID_F = "Qwen/Qwen2.5-VL-7B-Instruct"
87
- processor_f = AutoProcessor.from_pretrained(MODEL_ID_F, trust_remote_code=True)
88
- model_f = Qwen2_5_VLForConditionalGeneration.from_pretrained(
89
- MODEL_ID_F,
90
- trust_remote_code=True,
91
- torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
92
- device_map="auto",
93
- )
94
- print(f"Fallback Model ({MODEL_ID_F}) loaded successfully.")
95
- except Exception as inner_e:
96
- print(f"Critical error loading model: {inner_e}")
97
-
98
 
99
  # -----------------------------------------------------------------------------
100
- # GPU ISOLATED INFERENCE FUNCTION
101
  # -----------------------------------------------------------------------------
102
 
103
- @spaces.GPU(duration=120)
104
- def run_model_inference(formatted_messages, max_tokens=1024, stop_sequences=None):
105
- """
106
- Runs inference on the GPU worker.
107
- """
108
- global model_f, processor_f
109
-
110
- if model_f is None:
111
- raise ValueError("Model is not loaded.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
- text = processor_f.apply_chat_template(
114
- formatted_messages, tokenize=False, add_generation_prompt=True
115
- )
116
-
117
- image_inputs, video_inputs = process_vision_info(formatted_messages)
118
-
119
- inputs = processor_f(
120
- text=[text],
121
- images=image_inputs,
122
- videos=video_inputs,
123
- padding=True,
124
- return_tensors="pt",
125
- )
126
-
127
- inputs = inputs.to(model_f.device)
128
-
129
- with torch.no_grad():
130
- generated_ids = model_f.generate(
131
- **inputs,
132
- max_new_tokens=max_tokens,
133
- stop_strings=stop_sequences,
134
- tokenizer=processor_f.tokenizer,
135
  )
136
-
137
- generated_ids_trimmed = [
138
- out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
139
- ]
140
- output_text = processor_f.batch_decode(
141
- generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
142
- )[0]
143
-
144
- return output_text
145
-
146
-
147
- class FaraLocalModel(Model):
148
- """
149
- Wrapper for the local Fara (Qwen2.5-VL) model to work with SmolAgents.
150
- """
151
- def __init__(self, **kwargs):
152
- super().__init__(**kwargs)
153
-
154
- def __call__(
155
- self,
156
- messages: List[ChatMessage],
157
- stop_sequences: Optional[List[str]] = None,
158
- grammar: Optional[str] = None,
159
- **kwargs,
160
- ) -> ChatMessage:
161
 
162
- formatted_messages = []
163
 
164
- for msg in messages:
165
- # Safely access role and content from ChatMessage object using attributes
166
- role = msg.role if hasattr(msg, "role") else "user"
167
- content = msg.content if hasattr(msg, "content") else ""
168
-
169
- new_content = []
170
-
171
- if isinstance(content, str):
172
- new_content.append({"type": "text", "text": content})
173
- elif isinstance(content, list):
174
- for item in content:
175
- if isinstance(item, str):
176
- new_content.append({"type": "text", "text": item})
177
- elif isinstance(item, dict):
178
- if "type" in item:
179
- if item["type"] == "image":
180
- # Handle path or url - extract value to ensure serializability
181
- val = item.get("image") or item.get("url") or item.get("path")
182
- new_content.append({"type": "image", "image": val})
183
- else:
184
- new_content.append(item)
185
-
186
- formatted_messages.append({"role": role, "content": new_content})
187
-
188
- output_text = run_model_inference(
189
- formatted_messages=formatted_messages,
190
- max_tokens=kwargs.get("max_tokens", 1024),
191
- stop_sequences=stop_sequences
192
  )
 
193
 
194
- return ChatMessage(
195
- role=MessageRole.ASSISTANT,
196
- content=output_text,
197
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
  # -----------------------------------------------------------------------------
200
- # SELENIUM CHROME SANDBOX
201
  # -----------------------------------------------------------------------------
202
 
203
  def get_system_chrome_path():
204
- # Common paths for chromium in Linux/HF Spaces
205
- paths = [
206
- "/usr/bin/chromium",
207
- "/usr/bin/chromium-browser",
208
- "/usr/bin/google-chrome",
209
- ]
210
  for p in paths:
211
- if os.path.exists(p):
212
- return p
213
  return None
214
 
215
  class SeleniumSandbox:
@@ -218,535 +148,290 @@ class SeleniumSandbox:
218
  self.height = height
219
  self.tmp_dir = tempfile.mkdtemp(prefix="chrome_sandbox_")
220
 
221
- # Setup Chrome Options
222
  chrome_opts = ChromeOptions()
223
-
224
- # Use system binary if available (fixes status 127 in HF Spaces)
225
  binary_path = get_system_chrome_path()
226
- if binary_path:
227
- print(f"Using system Chrome binary at: {binary_path}")
228
- chrome_opts.binary_location = binary_path
229
 
230
  chrome_opts.add_argument("--headless=new")
231
  chrome_opts.add_argument(f"--user-data-dir={self.tmp_dir}")
232
  chrome_opts.add_argument(f"--window-size={width},{height}")
233
- chrome_opts.add_argument("--no-sandbox") # Crucial for containers
234
- chrome_opts.add_argument("--disable-dev-shm-usage") # Crucial for containers
235
  chrome_opts.add_argument("--disable-gpu")
236
- chrome_opts.add_argument("--disable-extensions")
237
 
238
- # Initialize Driver
239
  try:
240
- # Check for system driver first
241
  system_driver_path = "/usr/bin/chromedriver"
242
  if os.path.exists(system_driver_path):
243
- print(f"Using system ChromeDriver at: {system_driver_path}")
244
  service = ChromeService(executable_path=system_driver_path)
245
  else:
246
- print("Using webdriver_manager to install ChromeDriver...")
247
  service = ChromeService(ChromeDriverManager().install())
248
 
249
  self.driver = webdriver.Chrome(service=service, options=chrome_opts)
250
  self.driver.set_window_size(width, height)
251
- self.driver.get("about:blank")
252
- print(f"Selenium Chrome Driver started successfully.")
253
-
254
  except Exception as e:
255
- print(f"Failed to initialize Selenium: {e}")
256
- self.cleanup()
257
  raise e
258
 
259
  def get_screenshot(self):
260
- """Returns screenshot as PIL Image"""
261
- png_data = self.driver.get_screenshot_as_png()
262
- return Image.open(BytesIO(png_data))
263
 
264
- def move_mouse_and_click(self, x, y, click_type="left"):
265
- try:
266
- body = self.driver.find_element(By.TAG_NAME, "body")
267
- actions = ActionChains(self.driver)
268
- actions.move_to_element_with_offset(body, 0, 0)
269
- actions.move_by_offset(x, y)
270
- if click_type == "left":
271
- actions.click()
272
- elif click_type == "right":
273
- actions.context_click()
274
- elif click_type == "double":
275
- actions.double_click()
276
- actions.perform()
277
- except Exception as e:
278
- print(f"Error in move_mouse_and_click: {e}")
279
-
280
- def drag_and_drop(self, x1, y1, x2, y2):
281
- try:
282
- body = self.driver.find_element(By.TAG_NAME, "body")
283
- actions = ActionChains(self.driver)
284
- actions.move_to_element_with_offset(body, 0, 0)
285
- actions.move_by_offset(x1, y1)
286
- actions.click_and_hold()
287
- actions.move_by_offset(x2 - x1, y2 - y1)
288
- actions.release()
289
- actions.perform()
290
- except Exception as e:
291
- print(f"Error in drag_and_drop: {e}")
292
-
293
- def type_text(self, text):
294
- actions = ActionChains(self.driver)
295
- actions.send_keys(text)
296
- actions.perform()
297
 
298
- def press_key(self, key_name):
299
  try:
300
- k = getattr(Keys, key_name.upper(), None)
301
- if not k:
302
- if key_name.lower() == "enter": k = Keys.ENTER
303
- elif key_name.lower() == "space": k = Keys.SPACE
304
- elif key_name.lower() == "backspace": k = Keys.BACK_SPACE
305
- elif key_name.lower() == "esc": k = Keys.ESCAPE
306
- else: k = key_name
307
  actions = ActionChains(self.driver)
308
- actions.send_keys(k)
309
- actions.perform()
310
- except Exception as e:
311
- print(f"Error pressing key: {e}")
312
-
313
- def scroll(self, amount, direction="down"):
314
- try:
315
- scroll_y = amount * 100
316
- if direction == "up":
317
- scroll_y = -scroll_y
318
- self.driver.execute_script(f"window.scrollBy(0, {scroll_y});")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
  except Exception as e:
320
- print(f"Error scrolling: {e}")
321
 
322
  def cleanup(self):
323
- try:
324
- if hasattr(self, 'driver'):
325
- self.driver.quit()
326
- except:
327
- pass
328
  shutil.rmtree(self.tmp_dir, ignore_errors=True)
329
 
330
  # -----------------------------------------------------------------------------
331
- # AGENT SETUP
332
  # -----------------------------------------------------------------------------
333
 
334
- SYSTEM_PROMPT_TEMPLATE = """You are a browser automation assistant controlling a Google Chrome web browser. The current date is <<current_date>>.
335
-
336
- <action process>
337
- You will be given a task to solve in several steps. At each step you will perform an action.
338
- After each action, you'll receive an updated screenshot of the browser.
339
- Then you will proceed as follows, with these sections: don't skip any!
340
-
341
- Short term goal: ...
342
- What I see: ...
343
- Reflection: ...
344
- Action:
345
- ```python
346
- click(254, 308)
347
- ```<end_code>
348
-
349
- Always format your action ('Action:' part) as Python code blocks as shown above.
350
- </action_process>
351
-
352
- <tools>
353
- On top of performing computations in the Python code snippets that you create, you only have access to these tools to interact with the browser:
354
- {%- for tool in tools.values() %}
355
- - {{ tool.name }}: {{ tool.description }}
356
- Takes inputs: {{tool.inputs}}
357
- Returns an output of type: {{tool.output_type}}
358
- {%- endfor %}
359
- </tools>
360
-
361
- <click_guidelines>
362
- The browser has a resolution of <<resolution_x>>x<<resolution_y>> pixels.
363
- NEVER USE HYPOTHETIC OR ASSUMED COORDINATES, USE TRUE COORDINATES that you can see from the screenshot.
364
- Use precise coordinates based on the current screenshot.
365
- Whenever you click, MAKE SURE to click in the middle of the button, text, link or any other clickable element.
366
- In the screenshot you will see a green crosshair displayed over the position of your last click.
367
- </click_guidelines>
368
-
369
- <general_guidelines>
370
- Execute one action at a time.
371
- Use `open_url` to navigate to websites.
372
- Use `click` to navigate links and interface elements.
373
- Use `type_text` to input into forms.
374
- Use `scroll` to see more content.
375
- If you get stuck, try using `open_url` to search on Google.
376
- </general_guidelines>
377
- """.replace("<<current_date>>", datetime.now().strftime("%A, %d-%B-%Y"))
378
-
379
- def draw_marker_on_image(image_copy, click_coordinates):
380
- x, y = click_coordinates
381
- draw = ImageDraw.Draw(image_copy)
382
- cross_size, linewidth = 10, 3
383
- # Draw cross
384
- draw.line((x - cross_size, y, x + cross_size, y), fill="green", width=linewidth)
385
- draw.line((x, y - cross_size, x, y + cross_size), fill="green", width=linewidth)
386
- draw.ellipse(
387
- (x - cross_size * 2, y - cross_size * 2, x + cross_size * 2, y + cross_size * 2),
388
- outline="green",
389
- width=linewidth,
390
- )
391
- return image_copy
392
-
393
- class SeleniumVisionAgent(CodeAgent):
394
- """Agent for Browser automation with Selenium and Vision"""
395
-
396
- def __init__(
397
- self,
398
- model: Model,
399
- data_dir: str,
400
- sandbox: SeleniumSandbox,
401
- max_steps: int = 20,
402
- verbosity_level: LogLevel = 2,
403
- **kwargs,
404
- ):
405
- self.sandbox = sandbox
406
- self.data_dir = data_dir
407
-
408
- # Initialize
409
- print(f"Browser size: {self.sandbox.width}x{self.sandbox.height}")
410
- os.makedirs(self.data_dir, exist_ok=True)
411
-
412
- # Build tools list
413
- tools_list = self.build_tools()
414
-
415
- super().__init__(
416
- tools=tools_list,
417
- model=model,
418
- max_steps=max_steps,
419
- verbosity_level=verbosity_level,
420
- step_callbacks=[self.take_screenshot_callback],
421
- **kwargs,
422
- )
423
 
424
- self.prompt_templates["system_prompt"] = SYSTEM_PROMPT_TEMPLATE.replace(
425
- "<<resolution_x>>", str(self.sandbox.width)
426
- ).replace("<<resolution_y>>", str(self.sandbox.height))
427
-
428
- def build_tools(self):
429
- """Define and return the list of tools for this agent"""
 
 
430
 
431
- @tool
432
- def click(x: int, y: int) -> str:
433
- """
434
- Performs a left-click at the specified coordinates.
435
- Args:
436
- x: The x coordinate (horizontal position).
437
- y: The y coordinate (vertical position).
438
- """
439
- self.sandbox.move_mouse_and_click(x, y, "left")
440
- self.click_coordinates = [x, y]
441
- return f"Clicked at ({x}, {y})"
442
-
443
- @tool
444
- def right_click(x: int, y: int) -> str:
445
- """
446
- Performs a right-click at the specified coordinates.
447
- Args:
448
- x: The x coordinate.
449
- y: The y coordinate.
450
- """
451
- self.sandbox.move_mouse_and_click(x, y, "right")
452
- self.click_coordinates = [x, y]
453
- return f"Right-clicked at ({x}, {y})"
454
-
455
- @tool
456
- def double_click(x: int, y: int) -> str:
457
- """
458
- Performs a double-click at the specified coordinates.
459
- Args:
460
- x: The x coordinate.
461
- y: The y coordinate.
462
- """
463
- self.sandbox.move_mouse_and_click(x, y, "double")
464
- self.click_coordinates = [x, y]
465
- return f"Double-clicked at ({x}, {y})"
466
-
467
- @tool
468
- def type_text(text: str) -> str:
469
- """
470
- Types the specified text.
471
- Args:
472
- text: The text to type.
473
- """
474
- clean_text = unicodedata.normalize("NFD", text)
475
- self.sandbox.type_text(clean_text)
476
- return f"Typed text: '{clean_text}'"
477
-
478
- @tool
479
- def press_key(key: str) -> str:
480
- """
481
- Presses a keyboard key (e.g., 'enter', 'backspace', 'esc').
482
- Args:
483
- key: The key name.
484
- """
485
- self.sandbox.press_key(key)
486
- return f"Pressed key: {key}"
487
-
488
- @tool
489
- def drag_and_drop(x1: int, y1: int, x2: int, y2: int) -> str:
490
- """
491
- Drags from (x1, y1) and drops at (x2, y2).
492
- Args:
493
- x1: Start x coordinate.
494
- y1: Start y coordinate.
495
- x2: End x coordinate.
496
- y2: End y coordinate.
497
- """
498
- self.sandbox.drag_and_drop(x1, y1, x2, y2)
499
- return f"Dragged from [{x1}, {y1}] to [{x2}, {y2}]"
500
-
501
- @tool
502
- def scroll(amount: int, direction: str = "down") -> str:
503
- """
504
- Scrolls the page.
505
- Args:
506
- amount: The amount to scroll (1-10).
507
- direction: "up" or "down".
508
- """
509
- self.sandbox.scroll(amount, direction)
510
- return f"Scrolled {direction} by {amount}"
511
-
512
- @tool
513
- def wait(seconds: float) -> str:
514
- """
515
- Waits for the specified number of seconds.
516
- Args:
517
- seconds: The duration to wait.
518
- """
519
- time.sleep(seconds)
520
- return f"Waited for {seconds} seconds"
521
-
522
- @tool
523
- def open_url(url: str) -> str:
524
- """
525
- Navigates the browser to the specified URL.
526
- Args:
527
- url: The URL to open.
528
- """
529
- if not url.startswith(("http://", "https://")):
530
- url = "https://" + url
531
- try:
532
- self.sandbox.driver.get(url)
533
- time.sleep(2)
534
- title = self.sandbox.driver.title
535
- return f"Opened URL: {url}. Page Title: {title}"
536
- except Exception as e:
537
- return f"Failed to open URL: {e}"
538
-
539
- @tool
540
- def go_back() -> str:
541
- """
542
- Goes back to the previous page in history.
543
- """
544
- self.sandbox.driver.back()
545
- return "Went back one page"
546
-
547
- return [click, right_click, double_click, type_text, press_key, drag_and_drop, scroll, wait, open_url, go_back]
548
-
549
-
550
- def take_screenshot_callback(self, memory_step: ActionStep, agent=None) -> None:
551
- """Takes a screenshot and saves it to memory"""
552
- current_step = memory_step.step_number
553
- time.sleep(1.0) # Wait for renders
554
-
555
- image = self.sandbox.get_screenshot()
556
 
557
- # Save to disk
558
- screenshot_path = os.path.join(self.data_dir, f"step_{current_step:03d}.png")
559
- image.save(screenshot_path)
560
-
561
- image_copy = image.copy()
562
- if getattr(self, "click_coordinates", None):
563
- image_copy = draw_marker_on_image(image_copy, self.click_coordinates)
564
 
565
- self.last_marked_screenshot = AgentImage(screenshot_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
566
 
567
- # Cleanup old images in memory to save RAM
568
- for previous_memory_step in agent.memory.steps:
569
- if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 1:
570
- previous_memory_step.observations_images = None
571
- elif isinstance(previous_memory_step, TaskStep):
572
- previous_memory_step.task_images = None
573
-
574
- memory_step.observations_images = [image_copy]
575
- self.click_coordinates = None
576
-
577
-
578
- def create_agent(data_dir, sandbox):
579
- model = FaraLocalModel()
580
- return SeleniumVisionAgent(
581
- model=model,
582
- data_dir=data_dir,
583
- sandbox=sandbox,
584
- max_steps=30,
585
- verbosity_level=2
586
- )
587
 
588
- def generate_interaction_id(session_uuid):
589
- return f"{session_uuid}_{int(time.time())}"
590
-
591
- def get_agent_summary_erase_images(agent):
592
- for memory_step in agent.memory.steps:
593
- if hasattr(memory_step, "observations_images"):
594
- memory_step.observations_images = None
595
- if hasattr(memory_step, "task_images"):
596
- memory_step.task_images = None
597
- return agent.write_memory_to_messages()
598
-
599
- def save_final_status(folder, status: str, summary, error_message=None) -> None:
600
- try:
601
- with open(os.path.join(folder, "metadata.json"), "w") as output_file:
602
- output_file.write(
603
- json.dumps(
604
- {"status": status, "summary": summary, "error_message": error_message},
605
- default=str
606
- )
607
- )
608
- except Exception as e:
609
- print(f"Failed to save metadata: {e}")
610
 
611
  # -----------------------------------------------------------------------------
612
- # UI & APP
613
  # -----------------------------------------------------------------------------
614
 
615
- custom_css = """
616
- .modal-container { margin: var(--size-16) auto!important; }
617
- .browser-container { position: relative; width: 100%; height: 600px; border: 1px solid #444; background: #222; display: flex; align-items: center; justify-content: center; overflow: hidden; }
618
- .browser-image { max-width: 100%; max-height: 100%; object-fit: contain; }
619
- #chatbot { height: 800px!important; }
620
- """
621
-
622
- class EnrichedGradioUI(GradioUI):
623
- def interact_with_agent(
624
- self,
625
- task_input,
626
- stored_messages,
627
- session_state,
628
- session_uuid,
629
- consent_storage,
630
- request: gr.Request,
631
- ):
632
- interaction_id = generate_interaction_id(session_uuid)
633
- data_dir = os.path.join(TMP_DIR, interaction_id)
634
-
635
- sandbox = SeleniumSandbox(width=WIDTH, height=HEIGHT)
636
- agent = create_agent(data_dir=data_dir, sandbox=sandbox)
637
- session_state["agent"] = agent
638
-
639
  try:
640
- stored_messages.append(gr.ChatMessage(role="user", content=task_input))
641
- yield stored_messages, None
642
-
643
- screenshot = sandbox.get_screenshot()
644
 
645
- for msg in stream_to_gradio(
646
- agent,
647
- task=task_input,
648
- task_images=[screenshot],
649
- reset_agent_memory=False,
650
- ):
651
- if hasattr(agent, "last_marked_screenshot") and msg.content == "-----":
652
- stored_messages.append(
653
- gr.ChatMessage(
654
- role="assistant",
655
- content={
656
- "path": agent.last_marked_screenshot.to_string(),
657
- "mime_type": "image/png",
658
- },
659
- )
660
- )
661
- yield stored_messages, agent.last_marked_screenshot.to_string()
662
- else:
663
- stored_messages.append(msg)
664
- yield stored_messages, None
665
-
666
- if consent_storage:
667
- summary = get_agent_summary_erase_images(agent)
668
- save_final_status(data_dir, "completed", summary=summary)
669
 
670
- yield stored_messages, None
671
-
672
  except Exception as e:
673
- error_message = f"Error in interaction: {str(e)}"
674
- print(error_message)
675
- stored_messages.append(
676
- gr.ChatMessage(role="assistant", content="Run failed:\n" + error_message)
677
- )
678
- yield stored_messages, None
679
- finally:
680
- sandbox.cleanup()
681
 
682
- theme = gr.themes.Default(
683
- font=["Oxanium", "sans-serif"], primary_hue="amber", secondary_hue="blue"
684
- )
 
685
 
686
- with gr.Blocks(theme=theme, css=custom_css) as demo:
687
- session_uuid_state = gr.State(lambda: str(uuid.uuid4()))
688
- session_state = gr.State({})
689
- stored_messages = gr.State([])
 
690
 
691
  with gr.Row():
692
  with gr.Column(scale=1):
693
- gr.Markdown("### Fara CUA - Chrome Agent 🌐")
 
 
694
 
695
- task_input = gr.Textbox(
696
- value="Go to google.com and search for 'Hugging Face'",
697
- label="Task",
698
- lines=3
699
- )
700
- run_btn = gr.Button("Start Task", variant="primary")
701
- stop_btn = gr.Button("Stop", variant="secondary")
702
- consent_storage = gr.Checkbox(label="Save logs locally?", value=True)
703
 
704
- gr.Examples(
705
- examples=[
706
- "Go to google.com and search for 'Hugging Face', then click the first link.",
707
- "Go to wikipedia.org, type 'Python' in search, and click the search button.",
708
- ],
709
- inputs=task_input
710
- )
711
 
712
- with gr.Column(scale=3):
713
- with gr.Row():
714
- with gr.Column(scale=1):
715
- chatbot_display = gr.Chatbot(
716
- label="Agent Trace",
717
- type="messages",
718
- height=800,
719
- avatar_images=(None, "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png"),
720
- )
721
- with gr.Column(scale=1):
722
- gr.Markdown("### Latest Browser View")
723
- live_browser_view = gr.Image(
724
- label="Browser View",
725
- type="filepath",
726
- interactive=False,
727
- height=600
728
- )
729
-
730
- agent_ui = EnrichedGradioUI(CodeAgent(tools=[], model=Model(), name="init"))
731
-
732
- def interrupt_agent(session_state):
733
- if "agent" in session_state and hasattr(session_state["agent"], "interrupt_switch"):
734
- session_state["agent"].interrupt_switch = True
735
- return "Interrupted"
736
-
737
- run_event = run_btn.click(
738
- fn=agent_ui.interact_with_agent,
739
- inputs=[
740
- task_input,
741
- stored_messages,
742
- session_state,
743
- session_uuid_state,
744
- consent_storage,
745
- ],
746
- outputs=[chatbot_display, live_browser_view]
747
  )
748
 
749
- stop_btn.click(fn=interrupt_agent, inputs=[session_state], outputs=[])
 
 
 
 
 
 
 
750
 
751
  if __name__ == "__main__":
752
- demo.launch(share=True)
 
 
1
  import os
2
+ import re
3
  import time
4
+ import shutil
5
  import uuid
6
  import tempfile
 
7
  import unicodedata
8
  from io import BytesIO
9
+ from typing import Tuple, Optional, List, Dict, Any
 
 
10
 
11
  import gradio as gr
12
+ import numpy as np
13
  import torch
14
  import spaces
15
+ from PIL import Image, ImageDraw, ImageFont
16
+
17
+ # Transformers imports
18
+ from transformers import (
19
+ Qwen2_5_VLForConditionalGeneration,
20
+ AutoProcessor,
21
+ )
22
+ from qwen_vl_utils import process_vision_info
23
 
24
  # Selenium Imports
25
  from selenium import webdriver
 
30
  from selenium.webdriver.common.keys import Keys
31
  from webdriver_manager.chrome import ChromeDriverManager
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  # -----------------------------------------------------------------------------
34
+ # CONSTANTS & CONFIG
35
  # -----------------------------------------------------------------------------
36
 
37
+ MODEL_ID = "microsoft/Fara-7B" # Or your specific Fara model repo
38
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
39
  WIDTH = 1024
40
  HEIGHT = 768
41
+ TMP_DIR = "./tmp"
42
  if not os.path.exists(TMP_DIR):
43
  os.makedirs(TMP_DIR)
44
 
45
+ # System Prompt adapted for Fara/GUI agents
46
+ OS_SYSTEM_PROMPT = """You are a GUI agent. You are given a task and a screenshot of the current status.
47
+ You need to generate the next action to complete the task.
48
+
49
+ Supported actions:
50
+ 1. `click(x=0.5, y=0.5)`: Click at the specific location.
51
+ 2. `right_click(x=0.5, y=0.5)`: Right click at the specific location.
52
+ 3. `double_click(x=0.5, y=0.5)`: Double click at the specific location.
53
+ 4. `type_text(text="hello")`: Type the text.
54
+ 5. `scroll(amount=2, direction="down")`: Scroll the page.
55
+ 6. `press_key(key="enter")`: Press a specific key.
56
+ 7. `open_url(url="https://google.com")`: Open a specific URL.
57
+
58
+ Output format:
59
+ Please wrap the action code in <code> </code> tags.
60
+ Example:
61
+ <code>
62
+ click(x=0.23, y=0.45)
63
+ </code>
64
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
  # -----------------------------------------------------------------------------
67
+ # MODEL WRAPPER (Replacing smolagents)
68
  # -----------------------------------------------------------------------------
69
 
70
+ class FaraModelWrapper:
71
+ def __init__(self, model_id: str, to_device: str = "cuda"):
72
+ print(f"Loading {model_id} on {to_device}...")
73
+ self.model_id = model_id
74
+
75
+ try:
76
+ self.processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
77
+ self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
78
+ model_id,
79
+ trust_remote_code=True,
80
+ torch_dtype=torch.bfloat16 if to_device == "cuda" else torch.float32,
81
+ device_map="auto" if to_device == "cuda" else None,
82
+ )
83
+ if to_device == "cpu":
84
+ self.model.to("cpu")
85
+ self.model.eval()
86
+ print("Model loaded successfully.")
87
+ except Exception as e:
88
+ print(f"Failed to load Fara, falling back to Qwen2.5-VL-7B for demo compatibility. Error: {e}")
89
+ fallback_id = "Qwen/Qwen2.5-VL-7B-Instruct"
90
+ self.processor = AutoProcessor.from_pretrained(fallback_id, trust_remote_code=True)
91
+ self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
92
+ fallback_id,
93
+ trust_remote_code=True,
94
+ torch_dtype=torch.bfloat16 if to_device == "cuda" else torch.float32,
95
+ device_map="auto",
96
+ )
97
 
98
+ def generate(self, messages: list[dict], max_new_tokens=512):
99
+ # Prepare inputs for Fara/QwenVL
100
+ text = self.processor.apply_chat_template(
101
+ messages, tokenize=False, add_generation_prompt=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
+ image_inputs, video_inputs = process_vision_info(messages)
105
 
106
+ inputs = self.processor(
107
+ text=[text],
108
+ images=image_inputs,
109
+ videos=video_inputs,
110
+ padding=True,
111
+ return_tensors="pt",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  )
113
+ inputs = inputs.to(self.model.device)
114
 
115
+ with torch.no_grad():
116
+ generated_ids = self.model.generate(
117
+ **inputs,
118
+ max_new_tokens=max_new_tokens
119
+ )
120
+
121
+ # Trim input tokens
122
+ generated_ids_trimmed = [
123
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
124
+ ]
125
+
126
+ output_text = self.processor.batch_decode(
127
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
128
+ )[0]
129
+
130
+ return output_text
131
+
132
+ # Initialize global model
133
+ model = FaraModelWrapper(MODEL_ID, DEVICE)
134
 
135
  # -----------------------------------------------------------------------------
136
+ # SELENIUM SANDBOX
137
  # -----------------------------------------------------------------------------
138
 
139
  def get_system_chrome_path():
140
+ paths = ["/usr/bin/chromium", "/usr/bin/chromium-browser", "/usr/bin/google-chrome"]
 
 
 
 
 
141
  for p in paths:
142
+ if os.path.exists(p): return p
 
143
  return None
144
 
145
class SeleniumSandbox:
    """Headless Chrome sandbox driven by Selenium.

    Owns a dedicated temporary user-data directory and a single headless
    Chrome instance sized to ``width`` x ``height``. Callers must invoke
    :meth:`cleanup` when finished to release the browser process and the
    temporary profile directory.
    """

    def __init__(self, width, height):
        # NOTE(review): the __init__ signature was reconstructed from the
        # call site SeleniumSandbox(WIDTH, HEIGHT) and the self.width /
        # self.height reads below -- confirm against the original source.
        self.width = width
        self.height = height
        # Fresh profile dir per sandbox so concurrent sessions don't collide.
        self.tmp_dir = tempfile.mkdtemp(prefix="chrome_sandbox_")

        chrome_opts = ChromeOptions()
        binary_path = get_system_chrome_path()
        if binary_path:
            chrome_opts.binary_location = binary_path

        chrome_opts.add_argument("--headless=new")
        chrome_opts.add_argument(f"--user-data-dir={self.tmp_dir}")
        chrome_opts.add_argument(f"--window-size={width},{height}")
        # Required when running as root in containers (e.g. HF Spaces).
        chrome_opts.add_argument("--no-sandbox")
        chrome_opts.add_argument("--disable-dev-shm-usage")
        chrome_opts.add_argument("--disable-gpu")

        try:
            # Prefer the distro-provided chromedriver; fall back to a
            # webdriver-manager download when it is absent.
            system_driver_path = "/usr/bin/chromedriver"
            if os.path.exists(system_driver_path):
                service = ChromeService(executable_path=system_driver_path)
            else:
                service = ChromeService(ChromeDriverManager().install())

            self.driver = webdriver.Chrome(service=service, options=chrome_opts)
            self.driver.set_window_size(width, height)
            print("Selenium started.")
        except Exception as e:
            print(f"Selenium init failed: {e}")
            # Don't leak the temp profile if the browser never came up.
            shutil.rmtree(self.tmp_dir, ignore_errors=True)
            raise e

    def get_screenshot(self):
        """Capture the current viewport as a PIL Image."""
        return Image.open(BytesIO(self.driver.get_screenshot_as_png()))

    def execute_action(self, action_data: dict):
        """Execute a parsed action on the browser.

        ``action_data`` is a dict produced by ``parse_action_string`` with a
        ``type`` key plus per-action fields: normalized 0-1 ``x``/``y`` for
        pointer actions, ``text``, ``key``, ``url``, or
        ``amount``/``direction`` for scrolling.

        Returns a short human-readable status string; never raises (all
        exceptions are folded into the returned message).
        """
        action_type = action_data.get('type')

        try:
            actions = ActionChains(self.driver)
            body = self.driver.find_element(By.TAG_NAME, "body")

            # Helper to move to coordinates
            def move_to(x_norm, y_norm):
                # Convert normalized (0-1) to pixel coordinates.
                # NOTE(review): Selenium 4 measures move_to_element_with_offset
                # from the element's CENTER, not its top-left corner -- if
                # clicks land off-target, this origin needs adjusting.
                x_px = int(x_norm * self.width)
                y_px = int(y_norm * self.height)
                actions.move_to_element_with_offset(body, 0, 0)
                actions.move_by_offset(x_px, y_px)

            if action_type in ['click', 'right_click', 'double_click']:
                move_to(action_data['x'], action_data['y'])
                if action_type == 'click': actions.click()
                elif action_type == 'right_click': actions.context_click()
                elif action_type == 'double_click': actions.double_click()
                actions.perform()

            elif action_type == 'type_text':
                text = action_data.get('text', '')
                actions.send_keys(text)
                actions.perform()

            elif action_type == 'press_key':
                key_name = action_data.get('key', '').lower()
                # Look the key up on selenium's Keys first, then map the
                # few aliases whose names differ (e.g. backspace).
                k = getattr(Keys, key_name.upper(), None)
                if not k:
                    if key_name == "enter": k = Keys.ENTER
                    elif key_name == "space": k = Keys.SPACE
                    elif key_name == "backspace": k = Keys.BACK_SPACE
                if k:
                    actions.send_keys(k)
                    actions.perform()

            elif action_type == 'scroll':
                amount = action_data.get('amount', 2)
                direction = action_data.get('direction', 'down')
                scroll_y = amount * 100  # ~100px per scroll "tick"
                if direction == 'up': scroll_y = -scroll_y
                self.driver.execute_script(f"window.scrollBy(0, {scroll_y});")

            elif action_type == 'open_url':
                url = action_data.get('url', '')
                if not url.startswith('http'): url = 'https://' + url
                self.driver.get(url)
                time.sleep(2)  # crude wait for the page to settle

            return f"Executed {action_type}"
        except Exception as e:
            return f"Action failed: {e}"

    def cleanup(self):
        """Quit the browser (best-effort) and remove the temp profile dir."""
        # BUG FIX: the original used a bare `except:`, which also swallows
        # KeyboardInterrupt/SystemExit; only driver.quit() failures should
        # be ignored here.
        try:
            self.driver.quit()
        except Exception:
            pass
        shutil.rmtree(self.tmp_dir, ignore_errors=True)
240
 
241
  # -----------------------------------------------------------------------------
242
+ # PARSING LOGIC
243
  # -----------------------------------------------------------------------------
244
 
245
def parse_code_block(response: str) -> str:
    """Extract the contents of the final <code>...</code> block in *response*.

    Returns an empty string when no code block is present.
    """
    found = re.findall(r"<code>\s*(.*?)\s*</code>", response, re.DOTALL)
    return found[-1].strip() if found else ""
251
+
252
def parse_action_string(action_str: str) -> dict:
    """Parse an action call string like ``click(x=0.5, y=0.5)`` into a dict.

    Supported forms (first match wins):
      * pointer actions  ``<name>(x=<float>, y=<float>)`` -> {"type", "x", "y"}
      * ``open_url(url="...")``                           -> {"type", "url"}
      * ``type_text(text="...")``                         -> {"type", "text"}
      * ``press_key(key="...")``                          -> {"type", "key"}
      * ``scroll(...)`` with optional ``amount=``/``direction=`` keyword
        arguments -> {"type", "amount", "direction"} (defaults: 2, "down")

    Returns an empty dict for anything unrecognized.
    """
    # 1. Coordinate actions: name(x=..., y=...)
    coord_match = re.match(r"(\w+)\s*\(\s*x\s*=\s*([0-9.]+)\s*,\s*y\s*=\s*([0-9.]+)\s*\)", action_str)
    if coord_match:
        return {
            "type": coord_match.group(1),
            "x": float(coord_match.group(2)),
            "y": float(coord_match.group(3))
        }

    # 2. Open URL: open_url(url="...")
    url_match = re.match(r"open_url\s*\(\s*url\s*=\s*[\"'](.*?)[\"']\s*\)", action_str)
    if url_match:
        return {"type": "open_url", "url": url_match.group(1)}

    # 3. Type text: type_text(text="...")
    text_match = re.match(r"type_text\s*\(\s*text\s*=\s*[\"'](.*?)[\"']\s*\)", action_str)
    if text_match:
        return {"type": "type_text", "text": text_match.group(1)}

    # 4. Press key: press_key(key="...")
    key_match = re.match(r"press_key\s*\(\s*key\s*=\s*[\"'](.*?)[\"']\s*\)", action_str)
    if key_match:
        return {"type": "press_key", "key": key_match.group(1)}

    # 5. Scroll. BUG FIX: honor amount=/direction= keyword arguments instead
    # of always returning the hard-coded defaults (the previous version
    # discarded the model's requested scroll parameters).
    if "scroll" in action_str:
        amount_match = re.search(r"amount\s*=\s*(\d+)", action_str)
        direction_match = re.search(r"direction\s*=\s*[\"'](up|down)[\"']", action_str)
        return {
            "type": "scroll",
            "amount": int(amount_match.group(1)) if amount_match else 2,
            "direction": direction_match.group(1) if direction_match else "down",
        }

    return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
+ # -----------------------------------------------------------------------------
288
+ # MAIN LOOP
289
+ # -----------------------------------------------------------------------------
 
 
 
 
290
 
291
@spaces.GPU(duration=120)
def agent_step(task_instruction: str, history: list, sandbox_state: dict):
    """Run one perceive -> think -> act step of the browser agent.

    Ensures a SeleniumSandbox exists for this session (keyed by a UUID kept
    in the Gradio state dict and resolved through the module-level
    SANDBOX_REGISTRY, because live browser objects cannot be stored in
    Gradio state directly), screenshots the page, asks the model for the
    next action, executes it, and returns the (possibly annotated)
    screenshot plus the updated history and state.
    """
    # Assign a session id on first use. BUG FIX: the previous version also
    # constructed a throwaway SeleniumSandbox in this branch that was never
    # registered or cleaned up -- a leaked headless Chrome process per new
    # session. The registry below is the single owner of sandbox instances.
    if 'uuid' not in sandbox_state:
        sandbox_state['uuid'] = str(uuid.uuid4())

    sandbox_id = sandbox_state['uuid']
    if sandbox_id not in SANDBOX_REGISTRY:
        SANDBOX_REGISTRY[sandbox_id] = SeleniumSandbox(WIDTH, HEIGHT)

    sandbox = SANDBOX_REGISTRY[sandbox_id]

    # 1. Get Screenshot
    screenshot = sandbox.get_screenshot()

    # 2. Construct Prompt (only the most recent log entry is fed back as
    # "Previous Actions" context to keep the prompt short)
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": OS_SYSTEM_PROMPT}]
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": screenshot},
                {"type": "text", "text": f"Instruction: {task_instruction}\nPrevious Actions: {history[-1] if history else 'None'}"}
            ]
        }
    ]

    # 3. Model Inference
    response = model.generate(messages)

    # 4. Parse Action
    action_code = parse_code_block(response)
    action_data = parse_action_string(action_code)

    log_entry = f"Step: {len(history)+1}\nModel Thought: {response}\nAction: {action_code}"

    # 5. Execute Action
    execution_result = "No valid action found"
    if action_data:
        execution_result = sandbox.execute_action(action_data)

        # Draw a red marker on the screenshot for coordinate actions so the
        # UI shows where the agent clicked.
        if 'x' in action_data:
            draw = ImageDraw.Draw(screenshot)
            x_px = action_data['x'] * WIDTH
            y_px = action_data['y'] * HEIGHT
            r = 10
            draw.ellipse((x_px-r, y_px-r, x_px+r, y_px+r), outline="red", width=3)

    log_entry += f"\nResult: {execution_result}"
    history.append(log_entry)

    # Return updated screenshot and history
    return screenshot, history, sandbox_state
 
 
 
 
 
 
 
356
 
357
# Global registry mapping session UUID -> live SeleniumSandbox instance
# (browser objects cannot be held in Gradio session state directly).
SANDBOX_REGISTRY = {}

def cleanup_sandbox(sandbox_state):
    """Tear down this session's sandbox, if any, and reset history/state."""
    session_id = sandbox_state.get('uuid')
    if session_id:
        live = SANDBOX_REGISTRY.pop(session_id, None)
        if live is not None:
            live.cleanup()
    # Fresh (history, state) pair for the UI.
    return [], {}
 
 
 
 
 
 
 
 
 
 
 
 
 
366
 
367
  # -----------------------------------------------------------------------------
368
+ # GRADIO UI
369
  # -----------------------------------------------------------------------------
370
 
371
def run_task_loop(task, history, state):
    """Generator driving the agent for up to ``max_steps`` steps.

    Yields ``(screenshot, logs_text, state)`` after every step so the
    Gradio UI can stream progress. Stops early when the last log entry
    signals completion, or on the first exception from agent_step.
    """
    max_steps = 10

    for i in range(max_steps):
        try:
            # Run one step
            screenshot, new_history, new_state = agent_step(task, history, state)
            history = new_history
            state = new_state

            # Yield updates to UI. BUG FIX: the divider string must be the
            # (parenthesized) separator passed to .join(); the previous
            # version's `"\n\n" + "-"*40 + "\n\n".join(history)` prefixed a
            # single divider instead of separating the entries, due to
            # operator precedence.
            logs_text = ("\n\n" + "-" * 40 + "\n\n").join(history)
            yield screenshot, logs_text, state

            # Check for termination (simplistic keyword match on last entry)
            if "Done" in history[-1] or "finished" in history[-1].lower():
                break

            time.sleep(1)  # Pause for visual effect

        except Exception as e:
            error_msg = f"Error in loop: {e}"
            history.append(error_msg)
            yield None, "\n".join(history), state
            break
 
 
 
 
397
 
398
# UI Layout
custom_css = """
#view_img { height: 600px; object-fit: contain; }
"""

with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:
    # Per-session state: sandbox descriptor dict + list of log entries.
    state = gr.State({})
    history = gr.State([])

    gr.Markdown("# 🤖 Fara CUA - Chrome Agent")

    with gr.Row():
        # Left column: task entry and controls.
        with gr.Column(scale=1):
            task_input = gr.Textbox(
                label="Task Instruction",
                value="Go to google.com and search for 'SpaceX'",
            )
            run_btn = gr.Button("Run Agent", variant="primary")
            clear_btn = gr.Button("Reset / Clear")

        # Right column: live screenshot stream.
        with gr.Column(scale=2):
            browser_view = gr.Image(
                label="Live Browser View",
                elem_id="view_img",
                interactive=False,
            )

    logs_output = gr.Textbox(label="Agent Logs", lines=15, interactive=False)

    # Running streams (screenshot, logs, state) updates from the generator.
    run_btn.click(
        fn=run_task_loop,
        inputs=[task_input, history, state],
        outputs=[browser_view, logs_output, state],
    )

    # Reset tears down the browser session, then blanks the view and logs.
    clear_btn.click(
        fn=cleanup_sandbox,
        inputs=[state],
        outputs=[history, state],
    ).then(
        lambda: (None, ""),
        outputs=[browser_view, logs_output],
    )

if __name__ == "__main__":
    demo.launch()