# Source: HuggingFace Spaces - likhonsheikh/open-computer-use-agent
# Space status: Running
# File size: 5,943 Bytes
# Revision: b55e0e7

"""
Gradio UI for Open Computer Use Agent - HuggingFace Spaces
"""

import asyncio
import base64
import gradio as gr
from PIL import Image
from io import BytesIO
from computer_tool import ComputerTool

# Shared desktop controller used by every action handler in this module.
# The 1280x800 geometry matches the size advertised in the UI labels.
computer = ComputerTool(
    display_width=1280,
    display_height=800,
    display_num=99,
)


def decode_screenshot(base64_str: str) -> Image.Image:
    """Turn a base64-encoded screenshot into a PIL image.

    Args:
        base64_str: Base64 text holding the encoded image bytes.

    Returns:
        The image opened by PIL from the decoded bytes.
    """
    buffer = BytesIO(base64.b64decode(base64_str))
    return Image.open(buffer)


async def take_screenshot():
    """Capture the desktop and return it with a status message.

    Returns:
        ``(image, "Screenshot taken")`` when the capture produced image data,
        otherwise ``(None, "Error: <error>")`` carrying the capture's error.
    """
    shot = await computer.screenshot()
    if not shot.base64_image:
        return None, f"Error: {shot.error}"
    return decode_screenshot(shot.base64_image), "Screenshot taken"


async def do_click(x: int, y: int, button: str):
    """Click at the given coordinates, then capture the desktop.

    Args:
        x: Horizontal coordinate forwarded to ``computer.click``.
        y: Vertical coordinate forwarded to ``computer.click``.
        button: Button name forwarded to ``computer.click``. The special
            value ``"double"`` is sent as two clicks of the left button.

    Returns:
        An ``(image, status)`` tuple. ``image`` is the post-click screenshot,
        or ``None`` when the capture failed. ``status`` is the click's output
        or error text; when the capture failed, the screenshot error is
        included as well so the blank image is explained.
    """
    is_double = button == "double"
    clicks = 2 if is_double else 1
    btn = "left" if is_double else button
    result = await computer.click(x, y, btn, clicks)

    # Re-capture the desktop so the caller sees the effect of the click.
    ss = await computer.screenshot()
    status = result.output or result.error
    if ss.base64_image:
        return decode_screenshot(ss.base64_image), status

    # Report the failed capture instead of silently returning a blank image.
    shot_error = f"Error: {ss.error}"
    return None, (f"{status} | {shot_error}" if status else shot_error)


async def do_type(text: str):
    """Type text on the desktop, then capture the desktop.

    Args:
        text: The text forwarded to ``computer.type_text``.

    Returns:
        An ``(image, status)`` tuple. ``image`` is the screenshot taken after
        typing, or ``None`` when the capture failed. ``status`` is the typing
        action's output or error text; when the capture failed, the
        screenshot error is included as well.
    """
    result = await computer.type_text(text)

    # Re-capture the desktop so the caller sees the typed text.
    ss = await computer.screenshot()
    status = result.output or result.error
    if ss.base64_image:
        return decode_screenshot(ss.base64_image), status

    # Report the failed capture instead of silently returning a blank image.
    shot_error = f"Error: {ss.error}"
    return None, (f"{status} | {shot_error}" if status else shot_error)


async def do_key(key: str):
    """Press a key (or key combination), then capture the desktop.

    Args:
        key: Key description forwarded unchanged to ``computer.press_key``
            (the UI suggests values such as ``enter`` or ``ctrl+c``).

    Returns:
        An ``(image, status)`` tuple. ``image`` is the screenshot taken after
        the key press, or ``None`` when the capture failed. ``status`` is the
        key action's output or error text; when the capture failed, the
        screenshot error is included as well.
    """
    result = await computer.press_key(key)

    # Re-capture the desktop so the caller sees the effect of the key press.
    ss = await computer.screenshot()
    status = result.output or result.error
    if ss.base64_image:
        return decode_screenshot(ss.base64_image), status

    # Report the failed capture instead of silently returning a blank image.
    shot_error = f"Error: {ss.error}"
    return None, (f"{status} | {shot_error}" if status else shot_error)


async def do_scroll(direction: str, amount: int):
    """Scroll the desktop, then capture it.

    Args:
        direction: Scroll direction forwarded to ``computer.scroll`` (the UI
            offers ``up``, ``down``, ``left`` and ``right``).
        amount: Scroll amount forwarded unchanged to ``computer.scroll``.

    Returns:
        An ``(image, status)`` tuple. ``image`` is the screenshot taken after
        scrolling, or ``None`` when the capture failed. ``status`` is the
        scroll action's output or error text; when the capture failed, the
        screenshot error is included as well.
    """
    result = await computer.scroll(direction, amount)

    # Re-capture the desktop so the caller sees the scrolled view.
    ss = await computer.screenshot()
    status = result.output or result.error
    if ss.base64_image:
        return decode_screenshot(ss.base64_image), status

    # Report the failed capture instead of silently returning a blank image.
    shot_error = f"Error: {ss.error}"
    return None, (f"{status} | {shot_error}" if status else shot_error)


# Sync wrappers for Gradio
def screenshot_sync():
    """Run ``take_screenshot`` to completion and return its result."""
    pending = take_screenshot()
    return asyncio.run(pending)

def click_sync(x, y, button):
    """Run ``do_click`` to completion with the coordinates cast to ``int``."""
    pending = do_click(int(x), int(y), button)
    return asyncio.run(pending)

def type_sync(text):
    """Run ``do_type`` to completion and return its result."""
    pending = do_type(text)
    return asyncio.run(pending)

def key_sync(key):
    """Run ``do_key`` to completion and return its result."""
    pending = do_key(key)
    return asyncio.run(pending)

def scroll_sync(direction, amount):
    """Run ``do_scroll`` to completion with the amount cast to ``int``."""
    pending = do_scroll(direction, int(amount))
    return asyncio.run(pending)


# Gradio UI
# Build the whole page inside one Blocks context. The layout is a two-column
# row (desktop view on the left, action controls on the right), followed by a
# tips/links row. Every event handler below writes to the same two outputs:
# the screenshot image and the status textbox.
with gr.Blocks(title="Open Computer Use Agent", theme=gr.themes.Soft()) as demo:
    # Page header and short explanation of what the Space does.
    gr.Markdown("""
    # 🖥️ Open Computer Use Agent

    Control a virtual Linux desktop through AI. This is an open-source alternative to OpenAI Operator.

    **How it works:** A virtual Xfce desktop runs inside this Space. You can control it using the actions below.
    """)

    with gr.Row():
        # Left column (twice the width of the controls column): desktop view.
        with gr.Column(scale=2):
            # Screenshot display; handlers return PIL images, hence type="pil".
            screenshot_img = gr.Image(
                label="Desktop View (1280x800)",
                type="pil",
                height=500
            )
            # Read-only field showing the status text returned by each handler.
            status_text = gr.Textbox(label="Status", interactive=False)

            screenshot_btn = gr.Button("📷 Take Screenshot", variant="primary")

        # Right column: one accordion per action type.
        with gr.Column(scale=1):
            gr.Markdown("### Actions")

            # Click controls
            with gr.Accordion("🖱️ Mouse Click", open=True):
                with gr.Row():
                    # Defaults (640, 400) are the centre of the 1280x800 display.
                    click_x = gr.Number(label="X", value=640)
                    click_y = gr.Number(label="Y", value=400)
                # "double" is not a physical button: do_click sends it as two
                # left-button clicks.
                click_button = gr.Radio(
                    ["left", "right", "double"],
                    label="Button",
                    value="left"
                )
                click_btn = gr.Button("Click")

            # Type controls
            with gr.Accordion("⌨️ Type Text", open=True):
                type_text = gr.Textbox(label="Text to type", placeholder="Hello World")
                type_btn = gr.Button("Type")

            # Key controls
            with gr.Accordion("🔤 Press Key", open=True):
                key_input = gr.Textbox(
                    label="Key (e.g., enter, ctrl+c, alt+tab)",
                    placeholder="enter"
                )
                key_btn = gr.Button("Press Key")

            # Scroll controls (collapsed by default, unlike the other accordions)
            with gr.Accordion("📜 Scroll", open=False):
                scroll_dir = gr.Radio(
                    ["up", "down", "left", "right"],
                    label="Direction",
                    value="down"
                )
                # Whole-number amount from 1 to 10, starting at 3.
                scroll_amount = gr.Slider(1, 10, value=3, step=1, label="Amount")
                scroll_btn = gr.Button("Scroll")

    # Footer row with usage tips and external links.
    with gr.Row():
        gr.Markdown("""
        ### 💡 Tips
        - Click "Take Screenshot" first to see the current desktop
        - Click coordinates are relative to the 1280x800 display
        - Use `ctrl+alt+t` to open terminal, `super` for menu
        - The desktop has Firefox ESR pre-installed

        ### 🔗 Links
        - [View noVNC Desktop](/proxy/6080) (direct VNC access)
        - [GitHub](https://github.com) | [HuggingFace](https://huggingface.co)
        """)

    # Event handlers: each button calls its synchronous wrapper, which returns
    # an (image, status) pair shown in the screenshot view and status box.
    screenshot_btn.click(
        screenshot_sync,
        outputs=[screenshot_img, status_text]
    )

    click_btn.click(
        click_sync,
        inputs=[click_x, click_y, click_button],
        outputs=[screenshot_img, status_text]
    )

    type_btn.click(
        type_sync,
        inputs=[type_text],
        outputs=[screenshot_img, status_text]
    )

    key_btn.click(
        key_sync,
        inputs=[key_input],
        outputs=[screenshot_img, status_text]
    )

    scroll_btn.click(
        scroll_sync,
        inputs=[scroll_dir, scroll_amount],
        outputs=[screenshot_img, status_text]
    )

    # Auto-screenshot on load so the desktop view is populated without a click.
    demo.load(screenshot_sync, outputs=[screenshot_img, status_text])


if __name__ == "__main__":
    # Serve the UI on all network interfaces, port 7860.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )