likhonsheikh committed on
Commit
b55e0e7
·
verified ·
1 Parent(s): 6bf3f1f

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +191 -0
app.py ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio UI for Open Computer Use Agent - HuggingFace Spaces
3
+ """
4
+
5
+ import asyncio
6
+ import base64
7
+ import gradio as gr
8
+ from PIL import Image
9
+ from io import BytesIO
10
+ from computer_tool import ComputerTool
11
+
12
# Single module-level ComputerTool instance used by every action handler
# in this file; it is configured for a 1280x800 display, display number 99.
computer = ComputerTool(
    display_width=1280,
    display_height=800,
    display_num=99,
)
14
+
15
+
16
def decode_screenshot(base64_str: str) -> Image.Image:
    """Convert a base64-encoded screenshot string into a PIL image.

    The string is base64-decoded and the resulting bytes are opened with
    ``Image.open`` through an in-memory ``BytesIO`` buffer.
    """
    buffer = BytesIO(base64.b64decode(base64_str))
    return Image.open(buffer)
20
+
21
+
22
async def take_screenshot():
    """Capture the desktop and return an ``(image, status)`` pair.

    When the screenshot result carries base64 image data, the decoded PIL
    image is returned together with "Screenshot taken". Otherwise the image
    slot is ``None`` and the status is built from the result's ``error``.
    """
    shot = await computer.screenshot()
    if not shot.base64_image:
        return None, f"Error: {shot.error}"
    return decode_screenshot(shot.base64_image), "Screenshot taken"
28
+
29
+
30
async def do_click(x: int, y: int, button: str):
    """Click at ``(x, y)`` and return ``(screenshot, status)``.

    ``button`` equal to "double" is translated into two clicks of the left
    button; any other value is passed through as the button with one click.
    The status is the click result's ``output`` if truthy, else its ``error``.
    """
    if button == "double":
        btn, clicks = "left", 2
    else:
        btn, clicks = button, 1
    outcome = await computer.click(x, y, btn, clicks)

    # Capture the desktop again so the UI reflects the effect of the click.
    after = await computer.screenshot()
    if after.base64_image:
        image = decode_screenshot(after.base64_image)
    else:
        image = None
    return image, outcome.output or outcome.error
40
+
41
+
42
async def do_type(text: str):
    """Type ``text`` on the desktop and return ``(screenshot, status)``.

    The status is the typing result's ``output`` if truthy, else its ``error``.
    The image is ``None`` when the follow-up screenshot has no image data.
    """
    outcome = await computer.type_text(text)

    # Refresh the view after the keystrokes have been sent.
    after = await computer.screenshot()
    image = None
    if after.base64_image:
        image = decode_screenshot(after.base64_image)
    return image, outcome.output or outcome.error
48
+
49
+
50
async def do_key(key: str):
    """Press ``key`` on the desktop and return ``(screenshot, status)``.

    The status is the key-press result's ``output`` if truthy, else its
    ``error``. The image is ``None`` when the follow-up screenshot has no
    image data.
    """
    outcome = await computer.press_key(key)

    # Refresh the view after the key press.
    after = await computer.screenshot()
    encoded = after.base64_image
    image = decode_screenshot(after.base64_image) if encoded else None
    return image, outcome.output or outcome.error
56
+
57
+
58
async def do_scroll(direction: str, amount: int):
    """Scroll the desktop and return ``(screenshot, status)``.

    ``direction`` and ``amount`` are forwarded unchanged to the computer
    tool. The status is the scroll result's ``output`` if truthy, else its
    ``error``.
    """
    outcome = await computer.scroll(direction, amount)

    # Refresh the view so the scrolled content is visible in the UI.
    after = await computer.screenshot()
    if not after.base64_image:
        image = None
    else:
        image = decode_screenshot(after.base64_image)
    return image, outcome.output or outcome.error
64
+
65
+
66
+ # Sync wrappers for Gradio
67
def screenshot_sync():
    """Blocking wrapper: run ``take_screenshot`` to completion and return its result."""
    outcome = asyncio.run(take_screenshot())
    return outcome
69
+
70
def click_sync(x, y, button):
    """Blocking wrapper around ``do_click`` for use as a Gradio handler.

    Args:
        x: X coordinate from the UI; converted with ``int()``.
        y: Y coordinate from the UI; converted with ``int()``.
        button: Button choice forwarded unchanged to ``do_click``.

    Returns:
        Whatever ``do_click`` returns (an ``(image, status)`` pair).

    Raises:
        gr.Error: If either coordinate is ``None``.
    """
    # A cleared gr.Number field is delivered as None; int(None) would raise
    # an opaque TypeError, so report the problem with a readable message.
    if x is None or y is None:
        raise gr.Error("Both X and Y coordinates are required to click.")
    return asyncio.run(do_click(int(x), int(y), button))
72
+
73
def type_sync(text):
    """Blocking wrapper: run ``do_type(text)`` to completion and return its result."""
    outcome = asyncio.run(do_type(text))
    return outcome
75
+
76
def key_sync(key):
    """Blocking wrapper: run ``do_key(key)`` to completion and return its result."""
    outcome = asyncio.run(do_key(key))
    return outcome
78
+
79
def scroll_sync(direction, amount):
    """Blocking wrapper around ``do_scroll``; ``amount`` is converted with ``int()``."""
    steps = int(amount)
    return asyncio.run(do_scroll(direction, steps))
81
+
82
+
83
+ # Gradio UI
84
# NOTE(review): the nesting below was reconstructed from a flattened diff
# view that had lost its indentation -- confirm against the real app.py.
with gr.Blocks(title="Open Computer Use Agent", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown("""
    # 🖥️ Open Computer Use Agent

    Control a virtual Linux desktop through AI. This is an open-source alternative to OpenAI Operator.

    **How it works:** A virtual Xfce desktop runs inside this Space. You can control it using the actions below.
    """)

    with gr.Row():
        # Wider column: desktop image, status line and the screenshot button.
        with gr.Column(scale=2):
            screenshot_img = gr.Image(label="Desktop View (1280x800)", type="pil", height=500)
            status_text = gr.Textbox(label="Status", interactive=False)

            screenshot_btn = gr.Button("📷 Take Screenshot", variant="primary")

        # Narrower column: one accordion per action type.
        with gr.Column(scale=1):
            gr.Markdown("### Actions")

            # Mouse click: coordinates plus button choice.
            with gr.Accordion("🖱️ Mouse Click", open=True):
                with gr.Row():
                    click_x = gr.Number(label="X", value=640)
                    click_y = gr.Number(label="Y", value=400)
                click_button = gr.Radio(
                    choices=["left", "right", "double"],
                    label="Button",
                    value="left",
                )
                click_btn = gr.Button("Click")

            # Free-text typing.
            with gr.Accordion("⌨️ Type Text", open=True):
                type_text = gr.Textbox(label="Text to type", placeholder="Hello World")
                type_btn = gr.Button("Type")

            # Single key or key combination.
            with gr.Accordion("🔤 Press Key", open=True):
                key_input = gr.Textbox(
                    label="Key (e.g., enter, ctrl+c, alt+tab)",
                    placeholder="enter",
                )
                key_btn = gr.Button("Press Key")

            # Scrolling; this accordion starts collapsed.
            with gr.Accordion("📜 Scroll", open=False):
                scroll_dir = gr.Radio(
                    choices=["up", "down", "left", "right"],
                    label="Direction",
                    value="down",
                )
                scroll_amount = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Amount")
                scroll_btn = gr.Button("Scroll")

    with gr.Row():
        gr.Markdown("""
        ### 💡 Tips
        - Click "Take Screenshot" first to see the current desktop
        - Click coordinates are relative to the 1280x800 display
        - Use `ctrl+alt+t` to open terminal, `super` for menu
        - The desktop has Firefox ESR pre-installed

        ### 🔗 Links
        - [View noVNC Desktop](/proxy/6080) (direct VNC access)
        - [GitHub](https://github.com) | [HuggingFace](https://huggingface.co)
        """)

    # Every handler writes to the same two components: the desktop image
    # and the status line.
    _view_outputs = [screenshot_img, status_text]

    # Event handlers
    screenshot_btn.click(fn=screenshot_sync, outputs=_view_outputs)
    click_btn.click(
        fn=click_sync,
        inputs=[click_x, click_y, click_button],
        outputs=_view_outputs,
    )
    type_btn.click(fn=type_sync, inputs=[type_text], outputs=_view_outputs)
    key_btn.click(fn=key_sync, inputs=[key_input], outputs=_view_outputs)
    scroll_btn.click(
        fn=scroll_sync,
        inputs=[scroll_dir, scroll_amount],
        outputs=_view_outputs,
    )

    # Auto-screenshot on load
    demo.load(fn=screenshot_sync, outputs=_view_outputs)
188
+
189
+
190
if __name__ == "__main__":
    # Start the Gradio server only when this file is run as a script;
    # it binds to all interfaces on port 7860.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )