""" Gradio UI for Open Computer Use Agent - HuggingFace Spaces """ import asyncio import base64 import gradio as gr from PIL import Image from io import BytesIO from computer_tool import ComputerTool # Initialize computer tool computer = ComputerTool(display_width=1280, display_height=800, display_num=99) def decode_screenshot(base64_str: str) -> Image.Image: """Decode base64 screenshot to PIL Image""" img_bytes = base64.b64decode(base64_str) return Image.open(BytesIO(img_bytes)) async def take_screenshot(): """Take a screenshot and return as PIL Image""" result = await computer.screenshot() if result.base64_image: return decode_screenshot(result.base64_image), "Screenshot taken" return None, f"Error: {result.error}" async def do_click(x: int, y: int, button: str): """Click at coordinates""" clicks = 2 if button == "double" else 1 btn = "left" if button == "double" else button result = await computer.click(x, y, btn, clicks) # Take screenshot after action ss = await computer.screenshot() img = decode_screenshot(ss.base64_image) if ss.base64_image else None return img, result.output or result.error async def do_type(text: str): """Type text""" result = await computer.type_text(text) ss = await computer.screenshot() img = decode_screenshot(ss.base64_image) if ss.base64_image else None return img, result.output or result.error async def do_key(key: str): """Press key""" result = await computer.press_key(key) ss = await computer.screenshot() img = decode_screenshot(ss.base64_image) if ss.base64_image else None return img, result.output or result.error async def do_scroll(direction: str, amount: int): """Scroll""" result = await computer.scroll(direction, amount) ss = await computer.screenshot() img = decode_screenshot(ss.base64_image) if ss.base64_image else None return img, result.output or result.error # Sync wrappers for Gradio def screenshot_sync(): return asyncio.run(take_screenshot()) def click_sync(x, y, button): return asyncio.run(do_click(int(x), int(y), button)) def type_sync(text): return asyncio.run(do_type(text)) def key_sync(key): return asyncio.run(do_key(key)) def scroll_sync(direction, amount): return asyncio.run(do_scroll(direction, int(amount))) # Gradio UI with gr.Blocks(title="Open Computer Use Agent", theme=gr.themes.Soft()) as demo: gr.Markdown(""" # 🖥️ Open Computer Use Agent Control a virtual Linux desktop through AI. This is an open-source alternative to OpenAI Operator. **How it works:** A virtual Xfce desktop runs inside this Space. You can control it using the actions below. """) with gr.Row(): with gr.Column(scale=2): # Screenshot display screenshot_img = gr.Image( label="Desktop View (1280x800)", type="pil", height=500 ) status_text = gr.Textbox(label="Status", interactive=False) screenshot_btn = gr.Button("📷 Take Screenshot", variant="primary") with gr.Column(scale=1): gr.Markdown("### Actions") # Click controls with gr.Accordion("🖱️ Mouse Click", open=True): with gr.Row(): click_x = gr.Number(label="X", value=640) click_y = gr.Number(label="Y", value=400) click_button = gr.Radio( ["left", "right", "double"], label="Button", value="left" ) click_btn = gr.Button("Click") # Type controls with gr.Accordion("⌨️ Type Text", open=True): type_text = gr.Textbox(label="Text to type", placeholder="Hello World") type_btn = gr.Button("Type") # Key controls with gr.Accordion("🔤 Press Key", open=True): key_input = gr.Textbox( label="Key (e.g., enter, ctrl+c, alt+tab)", placeholder="enter" ) key_btn = gr.Button("Press Key") # Scroll controls with gr.Accordion("📜 Scroll", open=False): scroll_dir = gr.Radio( ["up", "down", "left", "right"], label="Direction", value="down" ) scroll_amount = gr.Slider(1, 10, value=3, step=1, label="Amount") scroll_btn = gr.Button("Scroll") with gr.Row(): gr.Markdown(""" ### 💡 Tips - Click "Take Screenshot" first to see the current desktop - Click coordinates are relative to the 1280x800 display - Use `ctrl+alt+t` to open terminal, `super` for menu - The desktop has Firefox ESR pre-installed ### 🔗 Links - [View noVNC Desktop](/proxy/6080) (direct VNC access) - [GitHub](https://github.com) | [HuggingFace](https://huggingface.co) """) # Event handlers screenshot_btn.click( screenshot_sync, outputs=[screenshot_img, status_text] ) click_btn.click( click_sync, inputs=[click_x, click_y, click_button], outputs=[screenshot_img, status_text] ) type_btn.click( type_sync, inputs=[type_text], outputs=[screenshot_img, status_text] ) key_btn.click( key_sync, inputs=[key_input], outputs=[screenshot_img, status_text] ) scroll_btn.click( scroll_sync, inputs=[scroll_dir, scroll_amount], outputs=[screenshot_img, status_text] ) # Auto-screenshot on load demo.load(screenshot_sync, outputs=[screenshot_img, status_text]) if __name__ == "__main__": demo.launch(server_name="0.0.0.0", server_port=7860)