|
|
""" |
|
|
Gradio UI for Open Computer Use Agent - HuggingFace Spaces |
|
|
""" |
|
|
|
|
|
import asyncio |
|
|
import base64 |
|
|
import gradio as gr |
|
|
from PIL import Image |
|
|
from io import BytesIO |
|
|
from computer_tool import ComputerTool |
|
|
|
|
|
|
|
|
# Single shared ComputerTool instance driving the virtual display
# (display_num=99, 1280x800) — every UI action below goes through it.
computer = ComputerTool(display_width=1280, display_height=800, display_num=99)
|
|
|
|
|
|
|
|
def decode_screenshot(base64_str: str) -> Image.Image:
    """Turn a base64-encoded screenshot string into a PIL Image."""
    raw = base64.b64decode(base64_str)
    buffer = BytesIO(raw)
    return Image.open(buffer)
|
|
|
|
|
|
|
|
async def take_screenshot():
    """Capture the current desktop.

    Returns a ``(PIL.Image | None, status message)`` tuple suitable for
    the Gradio ``(screenshot_img, status_text)`` outputs.
    """
    result = await computer.screenshot()
    if not result.base64_image:
        return None, f"Error: {result.error}"
    return decode_screenshot(result.base64_image), "Screenshot taken"
|
|
|
|
|
|
|
|
async def do_click(x: int, y: int, button: str):
    """Click at (x, y), then return a fresh screenshot and a status message.

    ``button`` is "left", "right", or the pseudo-button "double",
    which is translated into a left button pressed twice.
    """
    if button == "double":
        btn, clicks = "left", 2
    else:
        btn, clicks = button, 1

    result = await computer.click(x, y, btn, clicks)

    # Re-capture the desktop so the UI reflects the click's effect.
    shot = await computer.screenshot()
    img = None
    if shot.base64_image:
        img = decode_screenshot(shot.base64_image)
    return img, result.output or result.error
|
|
|
|
|
|
|
|
async def do_type(text: str):
    """Type ``text`` into the desktop, then return a fresh screenshot and status."""
    result = await computer.type_text(text)

    # Re-capture the desktop so the UI reflects the typed text.
    shot = await computer.screenshot()
    img = None
    if shot.base64_image:
        img = decode_screenshot(shot.base64_image)
    return img, result.output or result.error
|
|
|
|
|
|
|
|
async def do_key(key: str):
    """Press a key or combo (e.g. enter, ctrl+c), then return screenshot + status."""
    result = await computer.press_key(key)

    # Re-capture the desktop so the UI reflects the key press.
    shot = await computer.screenshot()
    img = None
    if shot.base64_image:
        img = decode_screenshot(shot.base64_image)
    return img, result.output or result.error
|
|
|
|
|
|
|
|
async def do_scroll(direction: str, amount: int):
    """Scroll the desktop ``amount`` steps in ``direction``, then screenshot."""
    result = await computer.scroll(direction, amount)

    # Re-capture the desktop so the UI reflects the scroll.
    shot = await computer.screenshot()
    img = None
    if shot.base64_image:
        img = decode_screenshot(shot.base64_image)
    return img, result.output or result.error
|
|
|
|
|
|
|
|
|
|
|
def screenshot_sync():
    """Blocking bridge: run the async screenshot helper for a Gradio callback."""
    outcome = asyncio.run(take_screenshot())
    return outcome
|
|
|
|
|
def click_sync(x, y, button):
    """Blocking bridge: coerce coordinates to int and run the async click."""
    xi, yi = int(x), int(y)  # gr.Number delivers floats
    return asyncio.run(do_click(xi, yi, button))
|
|
|
|
|
def type_sync(text):
    """Blocking bridge: run the async typing action for a Gradio callback."""
    outcome = asyncio.run(do_type(text))
    return outcome
|
|
|
|
|
def key_sync(key):
    """Blocking bridge: run the async key-press action for a Gradio callback."""
    outcome = asyncio.run(do_key(key))
    return outcome
|
|
|
|
|
def scroll_sync(direction, amount):
    """Blocking bridge: coerce the step count to int and run the async scroll."""
    steps = int(amount)  # gr.Slider delivers floats
    return asyncio.run(do_scroll(direction, steps))
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# UI layout and event wiring.
# Left pane: live desktop view + status line. Right pane: one accordion per
# manual action. Every handler returns (image, status) and writes back into
# the same two components (screenshot_img, status_text).
# ---------------------------------------------------------------------------
with gr.Blocks(title="Open Computer Use Agent", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # π₯οΈ Open Computer Use Agent

    Control a virtual Linux desktop through AI. This is an open-source alternative to OpenAI Operator.

    **How it works:** A virtual Xfce desktop runs inside this Space. You can control it using the actions below.
    """)

    with gr.Row():
        with gr.Column(scale=2):
            # Live view of the 1280x800 virtual display; refreshed after
            # every action handler runs.
            screenshot_img = gr.Image(
                label="Desktop View (1280x800)",
                type="pil",
                height=500
            )
            status_text = gr.Textbox(label="Status", interactive=False)

            screenshot_btn = gr.Button("π· Take Screenshot", variant="primary")

        with gr.Column(scale=1):
            gr.Markdown("### Actions")

            # Mouse: coordinates are absolute pixels on the virtual display.
            with gr.Accordion("π±οΈ Mouse Click", open=True):
                with gr.Row():
                    click_x = gr.Number(label="X", value=640)
                    click_y = gr.Number(label="Y", value=400)
                click_button = gr.Radio(
                    ["left", "right", "double"],
                    label="Button",
                    value="left"
                )
                click_btn = gr.Button("Click")

            # Keyboard: free-form text entry.
            with gr.Accordion("β¨οΈ Type Text", open=True):
                type_text = gr.Textbox(label="Text to type", placeholder="Hello World")
                type_btn = gr.Button("Type")

            # Keyboard: single keys or combos (passed through to press_key).
            with gr.Accordion("π€ Press Key", open=True):
                key_input = gr.Textbox(
                    label="Key (e.g., enter, ctrl+c, alt+tab)",
                    placeholder="enter"
                )
                key_btn = gr.Button("Press Key")

            # Scroll: direction + step count (slider value is coerced to int
            # in scroll_sync).
            with gr.Accordion("π Scroll", open=False):
                scroll_dir = gr.Radio(
                    ["up", "down", "left", "right"],
                    label="Direction",
                    value="down"
                )
                scroll_amount = gr.Slider(1, 10, value=3, step=1, label="Amount")
                scroll_btn = gr.Button("Scroll")

    with gr.Row():
        gr.Markdown("""
        ### π‘ Tips
        - Click "Take Screenshot" first to see the current desktop
        - Click coordinates are relative to the 1280x800 display
        - Use `ctrl+alt+t` to open terminal, `super` for menu
        - The desktop has Firefox ESR pre-installed

        ### π Links
        - [View noVNC Desktop](/proxy/6080) (direct VNC access)
        - [GitHub](https://github.com) | [HuggingFace](https://huggingface.co)
        """)

    # ---- Event wiring: every action updates the same (image, status) pair ----
    screenshot_btn.click(
        screenshot_sync,
        outputs=[screenshot_img, status_text]
    )

    click_btn.click(
        click_sync,
        inputs=[click_x, click_y, click_button],
        outputs=[screenshot_img, status_text]
    )

    type_btn.click(
        type_sync,
        inputs=[type_text],
        outputs=[screenshot_img, status_text]
    )

    key_btn.click(
        key_sync,
        inputs=[key_input],
        outputs=[screenshot_img, status_text]
    )

    scroll_btn.click(
        scroll_sync,
        inputs=[scroll_dir, scroll_amount],
        outputs=[screenshot_img, status_text]
    )

    # Populate the desktop view as soon as the page loads.
    demo.load(screenshot_sync, outputs=[screenshot_img, status_text])
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Bind to all interfaces on 7860 — the address/port HuggingFace Spaces
    # expects a hosted app to listen on.
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|