# Snapshot of revision b55e0e7 (file size: 5,943 bytes).
"""
Gradio UI for Open Computer Use Agent - HuggingFace Spaces
"""
import asyncio
import base64
import gradio as gr
from PIL import Image
from io import BytesIO
from computer_tool import ComputerTool
# One module-level ComputerTool instance; the action coroutines in this file
# all drive the desktop through it.
computer = ComputerTool(
    display_width=1280,
    display_height=800,
    display_num=99,
)
def decode_screenshot(base64_str: str) -> Image.Image:
    """Convert a base64-encoded screenshot into a PIL image.

    The string is base64-decoded and the resulting bytes are opened with
    ``Image.open`` through an in-memory buffer.
    """
    raw = base64.b64decode(base64_str)
    buffer = BytesIO(raw)
    return Image.open(buffer)
async def take_screenshot():
    """Capture the desktop and return an ``(image, status)`` pair.

    Returns:
        ``(PIL image, "Screenshot taken")`` when the capture result carries a
        base64 image, otherwise ``(None, "Error: <result.error>")``.
    """
    shot = await computer.screenshot()
    # Guard clause: no image payload means the capture failed.
    if not shot.base64_image:
        return None, f"Error: {shot.error}"
    return decode_screenshot(shot.base64_image), "Screenshot taken"
async def do_click(x: int, y: int, button: str):
    """Click at ``(x, y)`` and return a fresh screenshot plus a status string.

    Args:
        x: Horizontal click coordinate.
        y: Vertical click coordinate.
        button: ``"double"`` is sent as two clicks of the left button; any
            other value is forwarded as the button name with a single click.

    Returns:
        ``(image, status)`` where ``image`` is the post-click screenshot (or
        ``None`` if the capture carried no image) and ``status`` is the click
        result's ``output``, falling back to its ``error``.
    """
    if button == "double":
        mouse_button, click_count = "left", 2
    else:
        mouse_button, click_count = button, 1

    outcome = await computer.click(x, y, mouse_button, click_count)

    # Re-capture the desktop so the caller sees the effect of the click.
    shot = await computer.screenshot()
    image = None
    if shot.base64_image:
        image = decode_screenshot(shot.base64_image)
    return image, outcome.output or outcome.error
async def do_type(text: str):
    """Type ``text`` on the desktop and return a fresh screenshot plus status.

    Returns:
        ``(image, status)`` where ``image`` is the screenshot taken after
        typing (``None`` if the capture carried no image) and ``status`` is
        the typing result's ``output``, falling back to its ``error``.
    """
    outcome = await computer.type_text(text)

    # Re-capture the desktop so the caller sees the typed text.
    shot = await computer.screenshot()
    image = None
    if shot.base64_image:
        image = decode_screenshot(shot.base64_image)
    return image, outcome.output or outcome.error
async def do_key(key: str):
    """Press ``key`` on the desktop and return a fresh screenshot plus status.

    ``key`` is forwarded to ``computer.press_key`` unchanged.

    Returns:
        ``(image, status)`` where ``image`` is the screenshot taken after the
        key press (``None`` if the capture carried no image) and ``status`` is
        the key-press result's ``output``, falling back to its ``error``.
    """
    outcome = await computer.press_key(key)

    # Re-capture the desktop so the caller sees the effect of the key press.
    shot = await computer.screenshot()
    image = None
    if shot.base64_image:
        image = decode_screenshot(shot.base64_image)
    return image, outcome.output or outcome.error
async def do_scroll(direction: str, amount: int):
    """Scroll the desktop and return a fresh screenshot plus status.

    ``direction`` and ``amount`` are forwarded to ``computer.scroll``
    unchanged.

    Returns:
        ``(image, status)`` where ``image`` is the screenshot taken after
        scrolling (``None`` if the capture carried no image) and ``status`` is
        the scroll result's ``output``, falling back to its ``error``.
    """
    outcome = await computer.scroll(direction, amount)

    # Re-capture the desktop so the caller sees the scrolled view.
    shot = await computer.screenshot()
    image = None
    if shot.base64_image:
        image = decode_screenshot(shot.base64_image)
    return image, outcome.output or outcome.error
# Sync wrappers for Gradio
def screenshot_sync():
    """Blocking wrapper: run ``take_screenshot`` to completion and return its result."""
    pending = take_screenshot()
    return asyncio.run(pending)
def click_sync(x, y, button):
    """Blocking wrapper around ``do_click``.

    Args:
        x: Horizontal coordinate; any value ``int()`` accepts.
        y: Vertical coordinate; any value ``int()`` accepts.
        button: Forwarded to ``do_click`` unchanged.

    Returns:
        The ``(image, status)`` pair produced by ``do_click``, or
        ``(None, "Error: ...")`` when either coordinate is missing or cannot
        be converted to an integer.
    """
    # A coordinate can arrive as None (presumably when the number field is
    # cleared -- confirm against the UI) or as a non-finite float. Report that
    # through the status text, like the screenshot error path, instead of
    # letting int() raise.
    try:
        px, py = int(x), int(y)
    except (TypeError, ValueError, OverflowError):
        return None, "Error: X and Y must be numeric screen coordinates"
    return asyncio.run(do_click(px, py, button))
def type_sync(text):
    """Blocking wrapper: run ``do_type(text)`` to completion and return its result."""
    pending = do_type(text)
    return asyncio.run(pending)
def key_sync(key):
    """Blocking wrapper: run ``do_key(key)`` to completion and return its result."""
    pending = do_key(key)
    return asyncio.run(pending)
def scroll_sync(direction, amount):
    """Blocking wrapper around ``do_scroll``; ``amount`` is converted with ``int()`` first."""
    steps = int(amount)
    pending = do_scroll(direction, steps)
    return asyncio.run(pending)
# Gradio UI
# Layout: a header, then a row with the desktop view (wide column) next to the
# action controls (narrow column), then a tips/links footer. Every action
# callback writes to the same two outputs: the screenshot image and the status
# text box.
with gr.Blocks(title="Open Computer Use Agent", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🖥️ Open Computer Use Agent
    Control a virtual Linux desktop through AI. This is an open-source alternative to OpenAI Operator.
    **How it works:** A virtual Xfce desktop runs inside this Space. You can control it using the actions below.
    """)

    with gr.Row():
        with gr.Column(scale=2):
            # Screenshot display
            screenshot_img = gr.Image(
                label="Desktop View (1280x800)",
                type="pil",
                height=500
            )
            status_text = gr.Textbox(label="Status", interactive=False)
            screenshot_btn = gr.Button("📷 Take Screenshot", variant="primary")

        with gr.Column(scale=1):
            gr.Markdown("### Actions")

            # Click controls
            with gr.Accordion("🖱️ Mouse Click", open=True):
                with gr.Row():
                    click_x = gr.Number(label="X", value=640)
                    click_y = gr.Number(label="Y", value=400)
                click_button = gr.Radio(
                    ["left", "right", "double"],
                    label="Button",
                    value="left"
                )
                click_btn = gr.Button("Click")

            # Type controls
            with gr.Accordion("⌨️ Type Text", open=True):
                type_text = gr.Textbox(label="Text to type", placeholder="Hello World")
                type_btn = gr.Button("Type")

            # Key controls
            with gr.Accordion("🔤 Press Key", open=True):
                key_input = gr.Textbox(
                    label="Key (e.g., enter, ctrl+c, alt+tab)",
                    placeholder="enter"
                )
                key_btn = gr.Button("Press Key")

            # Scroll controls
            with gr.Accordion("📜 Scroll", open=False):
                scroll_dir = gr.Radio(
                    ["up", "down", "left", "right"],
                    label="Direction",
                    value="down"
                )
                scroll_amount = gr.Slider(1, 10, value=3, step=1, label="Amount")
                scroll_btn = gr.Button("Scroll")

    with gr.Row():
        gr.Markdown("""
        ### 💡 Tips
        - Click "Take Screenshot" first to see the current desktop
        - Click coordinates are relative to the 1280x800 display
        - Use `ctrl+alt+t` to open terminal, `super` for menu
        - The desktop has Firefox ESR pre-installed
        ### 🔗 Links
        - [View noVNC Desktop](/proxy/6080) (direct VNC access)
        - [GitHub](https://github.com) | [HuggingFace](https://huggingface.co)
        """)

    # Event handlers
    screenshot_btn.click(
        screenshot_sync,
        outputs=[screenshot_img, status_text]
    )
    click_btn.click(
        click_sync,
        inputs=[click_x, click_y, click_button],
        outputs=[screenshot_img, status_text]
    )
    type_btn.click(
        type_sync,
        inputs=[type_text],
        outputs=[screenshot_img, status_text]
    )
    key_btn.click(
        key_sync,
        inputs=[key_input],
        outputs=[screenshot_img, status_text]
    )
    scroll_btn.click(
        scroll_sync,
        inputs=[scroll_dir, scroll_amount],
        outputs=[screenshot_img, status_text]
    )

    # Auto-screenshot on load
    demo.load(screenshot_sync, outputs=[screenshot_img, status_text])
if __name__ == "__main__":
    # Only start the server when run as a script; listen on all interfaces
    # ("0.0.0.0") on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)