File size: 5,943 Bytes
b55e0e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
"""
Gradio UI for Open Computer Use Agent - HuggingFace Spaces
"""

import asyncio
import base64
import gradio as gr
from PIL import Image
from io import BytesIO
from computer_tool import ComputerTool

# Single ComputerTool instance shared by every action handler in this module.
# The 1280x800 size matches the resolution advertised in the UI labels below.
computer = ComputerTool(
    display_width=1280,
    display_height=800,
    display_num=99,
)


def decode_screenshot(base64_str: str) -> Image.Image:
    """Convert a base64-encoded screenshot into a PIL image.

    Args:
        base64_str: Base64 text holding the encoded image file bytes.

    Returns:
        The image opened by ``PIL.Image.open`` from an in-memory buffer.
    """
    buffer = BytesIO(base64.b64decode(base64_str))
    return Image.open(buffer)


async def take_screenshot():
    """Capture the desktop and report the outcome.

    Returns:
        A ``(image, status)`` pair. On success ``image`` is the decoded
        screenshot and ``status`` is ``"Screenshot taken"``; when the tool
        returns no image data, ``image`` is ``None`` and ``status`` carries
        the tool's error text.
    """
    shot = await computer.screenshot()
    if not shot.base64_image:
        # No image payload: surface whatever error the tool reported.
        return None, f"Error: {shot.error}"
    return decode_screenshot(shot.base64_image), "Screenshot taken"


async def do_click(x: int, y: int, button: str):
    """Perform a mouse click, then capture the resulting desktop.

    Args:
        x: Horizontal click coordinate.
        y: Vertical click coordinate.
        button: ``"double"`` for a double left-click; any other value is
            forwarded to the tool as the button name for a single click.

    Returns:
        A ``(image, status)`` pair: the post-click screenshot (``None`` if
        the capture produced no image) and the click's output, falling back
        to its error when the output is empty.
    """
    # "double" is a UI convenience: it maps to two clicks of the left button.
    if button == "double":
        btn, clicks = "left", 2
    else:
        btn, clicks = button, 1
    outcome = await computer.click(x, y, btn, clicks)

    # Refresh the view so the caller sees the effect of the click.
    shot = await computer.screenshot()
    if shot.base64_image:
        image = decode_screenshot(shot.base64_image)
    else:
        image = None
    return image, outcome.output or outcome.error


async def do_type(text: str):
    """Type ``text`` on the desktop, then capture the resulting screen.

    Returns:
        A ``(image, status)`` pair: the follow-up screenshot (``None`` if the
        capture produced no image) and the action's output, falling back to
        its error when the output is empty.
    """
    outcome = await computer.type_text(text)

    # Refresh the view so the caller sees the typed text.
    shot = await computer.screenshot()
    if shot.base64_image:
        image = decode_screenshot(shot.base64_image)
    else:
        image = None
    return image, outcome.output or outcome.error


async def do_key(key: str):
    """Press ``key`` on the desktop, then capture the resulting screen.

    Returns:
        A ``(image, status)`` pair: the follow-up screenshot (``None`` if the
        capture produced no image) and the action's output, falling back to
        its error when the output is empty.
    """
    outcome = await computer.press_key(key)

    # Refresh the view so the caller sees the effect of the key press.
    shot = await computer.screenshot()
    if shot.base64_image:
        image = decode_screenshot(shot.base64_image)
    else:
        image = None
    return image, outcome.output or outcome.error


async def do_scroll(direction: str, amount: int):
    """Scroll the desktop, then capture the resulting screen.

    Args:
        direction: Scroll direction, passed through to the tool unchanged.
        amount: Scroll amount, passed through to the tool unchanged.

    Returns:
        A ``(image, status)`` pair: the follow-up screenshot (``None`` if the
        capture produced no image) and the action's output, falling back to
        its error when the output is empty.
    """
    outcome = await computer.scroll(direction, amount)

    # Refresh the view so the caller sees the scrolled content.
    shot = await computer.screenshot()
    if shot.base64_image:
        image = decode_screenshot(shot.base64_image)
    else:
        image = None
    return image, outcome.output or outcome.error


# Sync wrappers for Gradio
def screenshot_sync():
    """Blocking wrapper: run ``take_screenshot`` to completion and return its result."""
    pending = take_screenshot()
    return asyncio.run(pending)

def click_sync(x, y, button):
    """Blocking wrapper: validate the coordinates, then run ``do_click``.

    Args:
        x: X coordinate from the UI; may be ``None`` when the number field
            has been cleared.
        y: Y coordinate from the UI; may be ``None`` when the number field
            has been cleared.
        button: Button choice forwarded to ``do_click``.

    Returns:
        The ``(image, status)`` pair produced by ``do_click``.

    Raises:
        gr.Error: If either coordinate is missing, so the user sees a clear
            message instead of an opaque ``TypeError`` from ``int(None)``.
    """
    if x is None or y is None:
        raise gr.Error("Both X and Y coordinates are required.")
    # Coordinates arrive as floats from gr.Number; the tool is given ints.
    return asyncio.run(do_click(int(x), int(y), button))

def type_sync(text):
    """Blocking wrapper: run ``do_type`` to completion and return its result."""
    pending = do_type(text)
    return asyncio.run(pending)

def key_sync(key):
    """Blocking wrapper: run ``do_key`` to completion and return its result."""
    pending = do_key(key)
    return asyncio.run(pending)

def scroll_sync(direction, amount):
    """Blocking wrapper: run ``do_scroll`` with ``amount`` coerced to ``int``."""
    steps = int(amount)
    return asyncio.run(do_scroll(direction, steps))


# Gradio UI
# Layout: a wide left column showing the latest screenshot and status line,
# and a narrower right column with one accordion per action. Every action
# handler returns (image, status), so all of them write to the same outputs.
with gr.Blocks(title="Open Computer Use Agent", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🖥️ Open Computer Use Agent

    Control a virtual Linux desktop through AI. This is an open-source alternative to OpenAI Operator.

    **How it works:** A virtual Xfce desktop runs inside this Space. You can control it using the actions below.
    """)

    with gr.Row():
        with gr.Column(scale=2):
            # Screenshot display
            screenshot_img = gr.Image(
                label="Desktop View (1280x800)",
                type="pil",
                height=500
            )
            status_text = gr.Textbox(label="Status", interactive=False)

            screenshot_btn = gr.Button("📷 Take Screenshot", variant="primary")

        with gr.Column(scale=1):
            gr.Markdown("### Actions")

            # Click controls
            with gr.Accordion("🖱️ Mouse Click", open=True):
                with gr.Row():
                    click_x = gr.Number(label="X", value=640)
                    click_y = gr.Number(label="Y", value=400)
                click_button = gr.Radio(
                    ["left", "right", "double"],
                    label="Button",
                    value="left"
                )
                click_btn = gr.Button("Click")

            # Type controls
            with gr.Accordion("⌨️ Type Text", open=True):
                type_text = gr.Textbox(label="Text to type", placeholder="Hello World")
                type_btn = gr.Button("Type")

            # Key controls
            with gr.Accordion("🔤 Press Key", open=True):
                key_input = gr.Textbox(
                    label="Key (e.g., enter, ctrl+c, alt+tab)",
                    placeholder="enter"
                )
                key_btn = gr.Button("Press Key")

            # Scroll controls
            with gr.Accordion("📜 Scroll", open=False):
                scroll_dir = gr.Radio(
                    ["up", "down", "left", "right"],
                    label="Direction",
                    value="down"
                )
                scroll_amount = gr.Slider(1, 10, value=3, step=1, label="Amount")
                scroll_btn = gr.Button("Scroll")

    with gr.Row():
        gr.Markdown("""
        ### 💡 Tips
        - Click "Take Screenshot" first to see the current desktop
        - Click coordinates are relative to the 1280x800 display
        - Use `ctrl+alt+t` to open terminal, `super` for menu
        - The desktop has Firefox ESR pre-installed

        ### 🔗 Links
        - [View noVNC Desktop](/proxy/6080) (direct VNC access)
        - [GitHub](https://github.com) | [HuggingFace](https://huggingface.co)
        """)

    # Event handlers
    screenshot_btn.click(
        screenshot_sync,
        outputs=[screenshot_img, status_text]
    )

    click_btn.click(
        click_sync,
        inputs=[click_x, click_y, click_button],
        outputs=[screenshot_img, status_text]
    )

    type_btn.click(
        type_sync,
        inputs=[type_text],
        outputs=[screenshot_img, status_text]
    )

    key_btn.click(
        key_sync,
        inputs=[key_input],
        outputs=[screenshot_img, status_text]
    )

    scroll_btn.click(
        scroll_sync,
        inputs=[scroll_dir, scroll_amount],
        outputs=[screenshot_img, status_text]
    )

    # Auto-screenshot on load
    demo.load(screenshot_sync, outputs=[screenshot_img, status_text])


if __name__ == "__main__":
    # Listen on all interfaces on port 7860 when run as a script.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )