Spaces:

likhonsheikh
/

open-computer-use-agent

Paused

App Files Files Community

likhonsheikh committed on Dec 9, 2025

Commit

b55e0e7

verified ·

1 Parent(s): 6bf3f1f

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +191 -0

app.py ADDED Viewed

	@@ -0,0 +1,191 @@

+"""
+Gradio UI for Open Computer Use Agent - HuggingFace Spaces
+"""
+import asyncio
+import base64
+import gradio as gr
+from PIL import Image
+from io import BytesIO
+from computer_tool import ComputerTool
+# Initialize computer tool
+computer = ComputerTool(display_width=1280, display_height=800, display_num=99)
+def decode_screenshot(base64_str: str) -> Image.Image:
+    """Decode base64 screenshot to PIL Image"""
+    img_bytes = base64.b64decode(base64_str)
+    return Image.open(BytesIO(img_bytes))
+async def take_screenshot():
+    """Take a screenshot and return as PIL Image"""
+    result = await computer.screenshot()
+    if result.base64_image:
+        return decode_screenshot(result.base64_image), "Screenshot taken"
+    return None, f"Error: {result.error}"
+async def do_click(x: int, y: int, button: str):
+    """Click at coordinates"""
+    clicks = 2 if button == "double" else 1
+    btn = "left" if button == "double" else button
+    result = await computer.click(x, y, btn, clicks)
+    # Take screenshot after action
+    ss = await computer.screenshot()
+    img = decode_screenshot(ss.base64_image) if ss.base64_image else None
+    return img, result.output or result.error
+async def do_type(text: str):
+    """Type text"""
+    result = await computer.type_text(text)
+    ss = await computer.screenshot()
+    img = decode_screenshot(ss.base64_image) if ss.base64_image else None
+    return img, result.output or result.error
+async def do_key(key: str):
+    """Press key"""
+    result = await computer.press_key(key)
+    ss = await computer.screenshot()
+    img = decode_screenshot(ss.base64_image) if ss.base64_image else None
+    return img, result.output or result.error
+async def do_scroll(direction: str, amount: int):
+    """Scroll"""
+    result = await computer.scroll(direction, amount)
+    ss = await computer.screenshot()
+    img = decode_screenshot(ss.base64_image) if ss.base64_image else None
+    return img, result.output or result.error
+# Sync wrappers for Gradio
+def screenshot_sync():
+    return asyncio.run(take_screenshot())
+def click_sync(x, y, button):
+    return asyncio.run(do_click(int(x), int(y), button))
+def type_sync(text):
+    return asyncio.run(do_type(text))
+def key_sync(key):
+    return asyncio.run(do_key(key))
+def scroll_sync(direction, amount):
+    return asyncio.run(do_scroll(direction, int(amount)))
+# Gradio UI: desktop view plus action controls, wired to the sync wrappers above
+with gr.Blocks(title="Open Computer Use Agent", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🖥️ Open Computer Use Agent
+    Control a virtual Linux desktop through AI. This is an open-source alternative to OpenAI Operator.
+    **How it works:** A virtual Xfce desktop runs inside this Space. You can control it using the actions below.
+    """)
+    with gr.Row():
+        with gr.Column(scale=2):
+            # Screenshot display: every event handler below writes its image result here
+            screenshot_img = gr.Image(
+                label="Desktop View (1280x800)",
+                type="pil",
+                height=500
+            )
+            status_text = gr.Textbox(label="Status", interactive=False)  # second output of every handler
+            screenshot_btn = gr.Button("📷 Take Screenshot", variant="primary")
+        with gr.Column(scale=1):
+            gr.Markdown("### Actions")
+            # Click controls: X/Y inputs and button choice are passed to click_sync
+            with gr.Accordion("🖱️ Mouse Click", open=True):
+                with gr.Row():
+                    click_x = gr.Number(label="X", value=640)
+                    click_y = gr.Number(label="Y", value=400)
+                click_button = gr.Radio(
+                    ["left", "right", "double"],
+                    label="Button",
+                    value="left"
+                )
+                click_btn = gr.Button("Click")
+            # Type controls: the textbox content is passed to type_sync
+            with gr.Accordion("⌨️ Type Text", open=True):
+                type_text = gr.Textbox(label="Text to type", placeholder="Hello World")
+                type_btn = gr.Button("Type")
+            # Key controls: the key string is passed to key_sync unchanged
+            with gr.Accordion("🔤 Press Key", open=True):
+                key_input = gr.Textbox(
+                    label="Key (e.g., enter, ctrl+c, alt+tab)",
+                    placeholder="enter"
+                )
+                key_btn = gr.Button("Press Key")
+            # Scroll controls: direction and amount are passed to scroll_sync (accordion starts collapsed)
+            with gr.Accordion("📜 Scroll", open=False):
+                scroll_dir = gr.Radio(
+                    ["up", "down", "left", "right"],
+                    label="Direction",
+                    value="down"
+                )
+                scroll_amount = gr.Slider(1, 10, value=3, step=1, label="Amount")
+                scroll_btn = gr.Button("Scroll")
+    with gr.Row():
+        gr.Markdown("""
+        ### 💡 Tips
+        - Click "Take Screenshot" first to see the current desktop
+        - Click coordinates are relative to the 1280x800 display
+        - Use `ctrl+alt+t` to open terminal, `super` for menu
+        - The desktop has Firefox ESR pre-installed
+        ### 🔗 Links
+        - [View noVNC Desktop](/proxy/6080) (direct VNC access)
+        - [GitHub](https://github.com) | [HuggingFace](https://huggingface.co)
+        """)
+    # Event handlers: each button calls its sync wrapper and updates the image and status box
+    screenshot_btn.click(
+        screenshot_sync,
+        outputs=[screenshot_img, status_text]
+    )
+    click_btn.click(
+        click_sync,
+        inputs=[click_x, click_y, click_button],
+        outputs=[screenshot_img, status_text]
+    )
+    type_btn.click(
+        type_sync,
+        inputs=[type_text],
+        outputs=[screenshot_img, status_text]
+    )
+    key_btn.click(
+        key_sync,
+        inputs=[key_input],
+        outputs=[screenshot_img, status_text]
+    )
+    scroll_btn.click(
+        scroll_sync,
+        inputs=[scroll_dir, scroll_amount],
+        outputs=[screenshot_img, status_text]
+    )
+    # Auto-screenshot on load: registers screenshot_sync on the app's load event to fill the view
+    demo.load(screenshot_sync, outputs=[screenshot_img, status_text])
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)