# likhonsheikh's picture
# Upload app.py with huggingface_hub
# b55e0e7 verified
# raw
# history blame
# 5.94 kB
"""
Gradio UI for Open Computer Use Agent - HuggingFace Spaces
"""
import asyncio
import base64
import gradio as gr
from PIL import Image
from io import BytesIO
from computer_tool import ComputerTool
# Single shared ComputerTool used by every handler in this module:
# a 1280x800 screen addressed as display number 99.
computer = ComputerTool(
    display_width=1280,
    display_height=800,
    display_num=99,
)
def decode_screenshot(base64_str: str) -> Image.Image:
    """Turn a base64-encoded screenshot into a PIL image.

    Args:
        base64_str: Base64 text holding the encoded image bytes.

    Returns:
        The image opened by PIL from an in-memory buffer.
    """
    buffer = BytesIO(base64.b64decode(base64_str))
    return Image.open(buffer)
async def take_screenshot():
    """Capture the desktop and return ``(image, status)``.

    Returns:
        ``(PIL image, "Screenshot taken")`` when the capture carries image
        data, otherwise ``(None, "Error: <capture error>")``.
    """
    shot = await computer.screenshot()
    # Guard clause: no image payload means the capture failed.
    if not shot.base64_image:
        return None, f"Error: {shot.error}"
    return decode_screenshot(shot.base64_image), "Screenshot taken"
async def _capture_after_action(result):
    """Take a fresh screenshot and pair it with the status of a finished action.

    Args:
        result: Object returned by a ``computer`` action; its ``output`` and
            ``error`` attributes are read to build the status message.

    Returns:
        ``(image, status)``. ``image`` is the decoded screenshot, or ``None``
        when the capture produced no image data; in that case the capture
        error is appended to the status instead of being silently dropped.
    """
    shot = await computer.screenshot()
    status = result.output or result.error
    if shot.base64_image:
        return decode_screenshot(shot.base64_image), status
    # Without this note the user would only see a blank image and no reason.
    capture_note = f"Screenshot failed: {shot.error}"
    return None, (f"{status} ({capture_note})" if status else capture_note)


async def do_click(x: int, y: int, button: str):
    """Click at the given coordinates, then return ``(image, status)``.

    Args:
        x: Horizontal click position.
        y: Vertical click position.
        button: ``"left"``, ``"right"`` or ``"double"``. ``"double"`` is not
            passed through as a button name: it is sent as two left clicks.

    Returns:
        The post-action screenshot (or ``None``) and the action's status text.
    """
    is_double = button == "double"
    clicks = 2 if is_double else 1
    btn = "left" if is_double else button
    result = await computer.click(x, y, btn, clicks)
    return await _capture_after_action(result)


async def do_type(text: str):
    """Type ``text`` on the desktop, then return ``(image, status)``."""
    result = await computer.type_text(text)
    return await _capture_after_action(result)


async def do_key(key: str):
    """Press ``key`` on the desktop, then return ``(image, status)``."""
    result = await computer.press_key(key)
    return await _capture_after_action(result)


async def do_scroll(direction: str, amount: int):
    """Scroll ``amount`` steps in ``direction``, then return ``(image, status)``."""
    result = await computer.scroll(direction, amount)
    return await _capture_after_action(result)
# Sync wrappers for Gradio: each one drives its coroutine to completion
# with asyncio.run and hands back the coroutine's result.
def screenshot_sync():
    """Blocking wrapper around ``take_screenshot``; returns its result."""
    pending = take_screenshot()
    return asyncio.run(pending)
def click_sync(x, y, button):
    """Blocking wrapper around ``do_click`` with coordinate validation.

    Args:
        x: Horizontal position as delivered by the UI (may be a float, or
            ``None`` when the number field has been cleared).
        y: Vertical position, same conventions as ``x``.
        button: Button name forwarded unchanged to ``do_click``.

    Returns:
        The ``(image, status)`` pair from ``do_click``. If either coordinate
        cannot be converted to an integer, ``(None, "Error: ...")`` is
        returned instead of raising, mirroring the error shape used by the
        screenshot handler.
    """
    try:
        col, row = int(x), int(y)
    except (TypeError, ValueError, OverflowError):
        # None (empty field), NaN, infinity or non-numeric text land here.
        return None, f"Error: invalid click coordinates ({x!r}, {y!r})"
    return asyncio.run(do_click(col, row, button))
def type_sync(text):
    """Blocking wrapper around ``do_type``; returns its result."""
    pending = do_type(text)
    return asyncio.run(pending)
def key_sync(key):
    """Blocking wrapper around ``do_key``; returns its result."""
    pending = do_key(key)
    return asyncio.run(pending)
def scroll_sync(direction, amount):
    """Blocking wrapper around ``do_scroll``; returns its result.

    ``amount`` is converted with ``int`` before being passed on.
    """
    steps = int(amount)
    return asyncio.run(do_scroll(direction, steps))
# Gradio UI
# Layout: a wide left column with the desktop view, status line and screenshot
# button; a narrow right column with one accordion per action; a tips/links
# footer. The emoji in labels and headings are written as real Unicode
# characters (they were previously mis-decoded byte sequences).
with gr.Blocks(title="Open Computer Use Agent", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "\n"
        "# 🖥️ Open Computer Use Agent\n"
        "Control a virtual Linux desktop through AI. This is an open-source alternative to OpenAI Operator.\n"
        "**How it works:** A virtual Xfce desktop runs inside this Space. You can control it using the actions below.\n"
    )

    with gr.Row():
        with gr.Column(scale=2):
            # Screenshot display
            screenshot_img = gr.Image(
                label="Desktop View (1280x800)",
                type="pil",
                height=500,
            )
            status_text = gr.Textbox(label="Status", interactive=False)
            screenshot_btn = gr.Button("📷 Take Screenshot", variant="primary")

        with gr.Column(scale=1):
            gr.Markdown("### Actions")

            # Click controls
            with gr.Accordion("🖱️ Mouse Click", open=True):
                with gr.Row():
                    click_x = gr.Number(label="X", value=640)
                    click_y = gr.Number(label="Y", value=400)
                click_button = gr.Radio(
                    ["left", "right", "double"],
                    label="Button",
                    value="left",
                )
                click_btn = gr.Button("Click")

            # Type controls
            with gr.Accordion("⌨️ Type Text", open=True):
                type_text = gr.Textbox(label="Text to type", placeholder="Hello World")
                type_btn = gr.Button("Type")

            # Key controls
            with gr.Accordion("🔤 Press Key", open=True):
                key_input = gr.Textbox(
                    label="Key (e.g., enter, ctrl+c, alt+tab)",
                    placeholder="enter",
                )
                key_btn = gr.Button("Press Key")

            # Scroll controls
            with gr.Accordion("📜 Scroll", open=False):
                scroll_dir = gr.Radio(
                    ["up", "down", "left", "right"],
                    label="Direction",
                    value="down",
                )
                scroll_amount = gr.Slider(1, 10, value=3, step=1, label="Amount")
                scroll_btn = gr.Button("Scroll")

    with gr.Row():
        gr.Markdown(
            "\n"
            "### 💡 Tips\n"
            '- Click "Take Screenshot" first to see the current desktop\n'
            "- Click coordinates are relative to the 1280x800 display\n"
            "- Use `ctrl+alt+t` to open terminal, `super` for menu\n"
            "- The desktop has Firefox ESR pre-installed\n"
            "### 🔗 Links\n"
            "- [View noVNC Desktop](/proxy/6080) (direct VNC access)\n"
            "- [GitHub](https://github.com) | [HuggingFace](https://huggingface.co)\n"
        )

    # Event handlers
    # Every handler returns (image, status), so they all share these outputs.
    screenshot_btn.click(
        screenshot_sync,
        outputs=[screenshot_img, status_text],
    )
    click_btn.click(
        click_sync,
        inputs=[click_x, click_y, click_button],
        outputs=[screenshot_img, status_text],
    )
    type_btn.click(
        type_sync,
        inputs=[type_text],
        outputs=[screenshot_img, status_text],
    )
    key_btn.click(
        key_sync,
        inputs=[key_input],
        outputs=[screenshot_img, status_text],
    )
    scroll_btn.click(
        scroll_sync,
        inputs=[scroll_dir, scroll_amount],
        outputs=[screenshot_img, status_text],
    )

    # Auto-screenshot on load
    demo.load(screenshot_sync, outputs=[screenshot_img, status_text])
if __name__ == "__main__":
    # Only start the server when run as a script: listen on every network
    # interface ("0.0.0.0") on port 7860.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.launch(**launch_options)