|
|
""" |
|
|
Gradio UI for Open Computer Use Agent - HuggingFace Spaces |
|
|
""" |
|
|
|
|
|
import asyncio |
|
|
import base64 |
|
|
import gradio as gr |
|
|
from PIL import Image |
|
|
from io import BytesIO |
|
|
from computer_tool import ComputerTool |
|
|
|
|
|
|
|
|
# Single shared ComputerTool instance driving the virtual display
# (display_num=99, 1280x800) — every UI action below goes through it.
computer = ComputerTool(display_width=1280, display_height=800, display_num=99)
|
|
|
|
|
|
|
|
def decode_screenshot(base64_str: str) -> Image.Image:
    """Turn a base64-encoded screenshot string into a PIL Image."""
    raw = base64.b64decode(base64_str)
    buffer = BytesIO(raw)
    return Image.open(buffer)
|
|
|
|
|
|
|
|
async def take_screenshot():
    """Capture the current desktop.

    Returns a ``(PIL.Image | None, status message)`` tuple suitable for
    the Gradio ``(screenshot_img, status_text)`` outputs.
    """
    result = await computer.screenshot()
    if not result.base64_image:
        return None, f"Error: {result.error}"
    return decode_screenshot(result.base64_image), "Screenshot taken"
|
|
|
|
|
|
|
|
async def do_click(x: int, y: int, button: str):
    """Click at (x, y), then return a fresh screenshot and a status message.

    ``button`` is "left", "right", or the pseudo-button "double",
    which is translated into a left button pressed twice.
    """
    if button == "double":
        btn, clicks = "left", 2
    else:
        btn, clicks = button, 1

    result = await computer.click(x, y, btn, clicks)

    # Re-capture the desktop so the UI reflects the click's effect.
    shot = await computer.screenshot()
    img = None
    if shot.base64_image:
        img = decode_screenshot(shot.base64_image)
    return img, result.output or result.error
|
|
|
|
|
|
|
|
async def do_type(text: str):
    """Type ``text`` into the desktop, then return a fresh screenshot and status."""
    result = await computer.type_text(text)

    # Re-capture the desktop so the UI reflects the typed text.
    shot = await computer.screenshot()
    img = None
    if shot.base64_image:
        img = decode_screenshot(shot.base64_image)
    return img, result.output or result.error
|
|
|
|
|
|
|
|
async def do_key(key: str):
    """Press a key or combo (e.g. enter, ctrl+c), then return screenshot + status."""
    result = await computer.press_key(key)

    # Re-capture the desktop so the UI reflects the key press.
    shot = await computer.screenshot()
    img = None
    if shot.base64_image:
        img = decode_screenshot(shot.base64_image)
    return img, result.output or result.error
|
|
|
|
|
|
|
|
async def do_scroll(direction: str, amount: int):
    """Scroll the desktop ``amount`` steps in ``direction``, then screenshot."""
    result = await computer.scroll(direction, amount)

    # Re-capture the desktop so the UI reflects the scroll.
    shot = await computer.screenshot()
    img = None
    if shot.base64_image:
        img = decode_screenshot(shot.base64_image)
    return img, result.output or result.error
|
|
|
|
|
|
|
|
|
|
|
def screenshot_sync():
    """Blocking bridge: run the async screenshot helper for a Gradio callback."""
    outcome = asyncio.run(take_screenshot())
    return outcome
|
|
|
|
|
def click_sync(x, y, button):
    """Blocking bridge: coerce coordinates to int and run the async click."""
    xi, yi = int(x), int(y)  # gr.Number delivers floats
    return asyncio.run(do_click(xi, yi, button))
|
|
|
|
|
def type_sync(text):
    """Blocking bridge: run the async typing action for a Gradio callback."""
    outcome = asyncio.run(do_type(text))
    return outcome
|
|
|
|
|
def key_sync(key):
    """Blocking bridge: run the async key-press action for a Gradio callback."""
    outcome = asyncio.run(do_key(key))
    return outcome
|
|
|
|
|
def scroll_sync(direction, amount):
    """Blocking bridge: coerce the step count to int and run the async scroll."""
    steps = int(amount)  # gr.Slider delivers floats
    return asyncio.run(do_scroll(direction, steps))
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# UI layout and event wiring.
# Left pane: live desktop view + status line. Right pane: one accordion per
# manual action. Every handler returns (image, status) and writes back into
# the same two components (screenshot_img, status_text).
# ---------------------------------------------------------------------------
with gr.Blocks(title="Open Computer Use Agent", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # π₯οΈ Open Computer Use Agent

    Control a virtual Linux desktop through AI. This is an open-source alternative to OpenAI Operator.

    **How it works:** A virtual Xfce desktop runs inside this Space. You can control it using the actions below.
    """)

    with gr.Row():
        with gr.Column(scale=2):
            # Live view of the 1280x800 virtual display; refreshed after
            # every action handler runs.
            screenshot_img = gr.Image(
                label="Desktop View (1280x800)",
                type="pil",
                height=500
            )
            status_text = gr.Textbox(label="Status", interactive=False)

            screenshot_btn = gr.Button("π· Take Screenshot", variant="primary")

        with gr.Column(scale=1):
            gr.Markdown("### Actions")

            # Mouse: coordinates are absolute pixels on the virtual display.
            with gr.Accordion("π±οΈ Mouse Click", open=True):
                with gr.Row():
                    click_x = gr.Number(label="X", value=640)
                    click_y = gr.Number(label="Y", value=400)
                click_button = gr.Radio(
                    ["left", "right", "double"],
                    label="Button",
                    value="left"
                )
                click_btn = gr.Button("Click")

            # Keyboard: free-form text entry.
            with gr.Accordion("β¨οΈ Type Text", open=True):
                type_text = gr.Textbox(label="Text to type", placeholder="Hello World")
                type_btn = gr.Button("Type")

            # Keyboard: single keys or combos (passed through to press_key).
            with gr.Accordion("π€ Press Key", open=True):
                key_input = gr.Textbox(
                    label="Key (e.g., enter, ctrl+c, alt+tab)",
                    placeholder="enter"
                )
                key_btn = gr.Button("Press Key")

            # Scroll: direction + step count (slider value is coerced to int
            # in scroll_sync).
            with gr.Accordion("π Scroll", open=False):
                scroll_dir = gr.Radio(
                    ["up", "down", "left", "right"],
                    label="Direction",
                    value="down"
                )
                scroll_amount = gr.Slider(1, 10, value=3, step=1, label="Amount")
                scroll_btn = gr.Button("Scroll")

    with gr.Row():
        gr.Markdown("""
        ### π‘ Tips
        - Click "Take Screenshot" first to see the current desktop
        - Click coordinates are relative to the 1280x800 display
        - Use `ctrl+alt+t` to open terminal, `super` for menu
        - The desktop has Firefox ESR pre-installed

        ### π Links
        - [View noVNC Desktop](/proxy/6080) (direct VNC access)
        - [GitHub](https://github.com) | [HuggingFace](https://huggingface.co)
        """)

    # ---- Event wiring: every action updates the same (image, status) pair ----
    screenshot_btn.click(
        screenshot_sync,
        outputs=[screenshot_img, status_text]
    )

    click_btn.click(
        click_sync,
        inputs=[click_x, click_y, click_button],
        outputs=[screenshot_img, status_text]
    )

    type_btn.click(
        type_sync,
        inputs=[type_text],
        outputs=[screenshot_img, status_text]
    )

    key_btn.click(
        key_sync,
        inputs=[key_input],
        outputs=[screenshot_img, status_text]
    )

    scroll_btn.click(
        scroll_sync,
        inputs=[scroll_dir, scroll_amount],
        outputs=[screenshot_img, status_text]
    )

    # Populate the desktop view as soon as the page loads.
    demo.load(screenshot_sync, outputs=[screenshot_img, status_text])
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Bind to all interfaces on 7860 — the address/port HuggingFace Spaces
    # expects a hosted app to listen on.
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|