AbdulElahGwaith committed on
Commit
2d483c2
·
verified ·
1 Parent(s): bd2d604

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .envrc +5 -0
  2. .gitattributes +24 -0
  3. .gitignore +214 -0
  4. .mise.toml +5 -0
  5. CONTRIBUTING_AR.md +44 -0
  6. LICENSE +201 -0
  7. README.md +275 -0
  8. SETUP_GUIDELINE.md +452 -0
  9. assets/authorization.png +3 -0
  10. assets/creategcp.png +3 -0
  11. assets/desktopapp.png +3 -0
  12. assets/developer.png +3 -0
  13. assets/enableapi.png +3 -0
  14. assets/googleidentity.png +0 -0
  15. assets/googlephonecode.png +0 -0
  16. assets/googleshutoff.png +0 -0
  17. assets/netsetting1.png +0 -0
  18. assets/netsetting2.png +0 -0
  19. assets/netsetting3.png +0 -0
  20. assets/netsetting4.png +0 -0
  21. assets/oauth2.0.png +3 -0
  22. assets/oauthapp.png +3 -0
  23. assets/proxysetup-zh.png +3 -0
  24. assets/proxysetup.png +3 -0
  25. assets/pubeval1.png +0 -0
  26. assets/pubeval2.png +3 -0
  27. assets/pubeval3.png +3 -0
  28. assets/pubeval4.png +0 -0
  29. assets/pubeval5.png +0 -0
  30. assets/pubeval_gdrive_auth.jpg +3 -0
  31. assets/pubeval_monitor1.jpg +3 -0
  32. assets/pubeval_monitor2.jpg +3 -0
  33. assets/pubeval_subnet.png +3 -0
  34. assets/publishapp.png +3 -0
  35. assets/testusers.png +3 -0
  36. assets/unsafemode.png +3 -0
  37. assets/usertype.png +3 -0
  38. assets/winnetsetting1.png +3 -0
  39. assets/winnetsetting2.png +0 -0
  40. assets/winnetsetting3.png +3 -0
  41. assets/winnetsetting4.png +3 -0
  42. desktop_env/__init__.py +1 -0
  43. desktop_env/actions.py +203 -0
  44. desktop_env/controllers/__init__.py +0 -0
  45. desktop_env/controllers/python.py +584 -0
  46. desktop_env/controllers/setup.py +920 -0
  47. desktop_env/desktop_env.py +497 -0
  48. desktop_env/desktop_env_os_symphony.py +499 -0
  49. desktop_env/evaluators/README.md +224 -0
  50. desktop_env/evaluators/__init__.py +5 -0
.envrc ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ watch_file .mise.toml
2
+ [[ -e ~/.local/bin/mise ]] || (curl -sf https://mise.run | MISE_QUIET=1 sh)
3
+ ~/.local/bin/mise trust 2> /dev/null
4
+ ~/.local/bin/mise install -qy
5
+ direnv_load ~/.local/bin/mise direnv exec
.gitattributes CHANGED
@@ -33,3 +33,27 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/authorization.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/creategcp.png filter=lfs diff=lfs merge=lfs -text
38
+ assets/desktopapp.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/developer.png filter=lfs diff=lfs merge=lfs -text
40
+ assets/enableapi.png filter=lfs diff=lfs merge=lfs -text
41
+ assets/oauth2.0.png filter=lfs diff=lfs merge=lfs -text
42
+ assets/oauthapp.png filter=lfs diff=lfs merge=lfs -text
43
+ assets/proxysetup-zh.png filter=lfs diff=lfs merge=lfs -text
44
+ assets/proxysetup.png filter=lfs diff=lfs merge=lfs -text
45
+ assets/pubeval2.png filter=lfs diff=lfs merge=lfs -text
46
+ assets/pubeval3.png filter=lfs diff=lfs merge=lfs -text
47
+ assets/pubeval_gdrive_auth.jpg filter=lfs diff=lfs merge=lfs -text
48
+ assets/pubeval_monitor1.jpg filter=lfs diff=lfs merge=lfs -text
49
+ assets/pubeval_monitor2.jpg filter=lfs diff=lfs merge=lfs -text
50
+ assets/pubeval_subnet.png filter=lfs diff=lfs merge=lfs -text
51
+ assets/publishapp.png filter=lfs diff=lfs merge=lfs -text
52
+ assets/testusers.png filter=lfs diff=lfs merge=lfs -text
53
+ assets/unsafemode.png filter=lfs diff=lfs merge=lfs -text
54
+ assets/usertype.png filter=lfs diff=lfs merge=lfs -text
55
+ assets/winnetsetting1.png filter=lfs diff=lfs merge=lfs -text
56
+ assets/winnetsetting3.png filter=lfs diff=lfs merge=lfs -text
57
+ assets/winnetsetting4.png filter=lfs diff=lfs merge=lfs -text
58
+ mm_agents/uipath/imgs/element_predictions.png filter=lfs diff=lfs merge=lfs -text
59
+ mm_agents/uipath/imgs/schema.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model checkpoints
2
+ *.pth
3
+ *.pt
4
+
5
+ # Credential files
6
+ evaluation_examples/settings/google/settings.json
7
+ evaluation_examples/settings/googledrive/credentials.json
8
+ evaluation_examples/settings/googledrive/client_secrets.json
9
+
10
+ # Byte-compiled / optimized / DLL files
11
+ __pycache__/
12
+ *.py[cod]
13
+ *$py.class
14
+
15
+ # C extensions
16
+ *.so
17
+
18
+ # Distribution / packaging
19
+ .Python
20
+ build/
21
+ develop-eggs/
22
+ dist/
23
+ downloads/
24
+ eggs/
25
+ .eggs/
26
+ lib/
27
+ lib64/
28
+ parts/
29
+ sdist/
30
+ var/
31
+ wheels/
32
+ pip-wheel-metadata/
33
+ share/python-wheels/
34
+ *.egg-info/
35
+ .installed.cfg
36
+ *.egg
37
+ MANIFEST
38
+
39
+ # PyInstaller
40
+ # Usually these files are written by a python script from a template
41
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
42
+ *.manifest
43
+ *.spec
44
+
45
+ # Installer logs
46
+ pip-log.txt
47
+ pip-delete-this-directory.txt
48
+
49
+ # Unit test / coverage reports
50
+ htmlcov/
51
+ .tox/
52
+ .nox/
53
+ .coverage
54
+ .coverage.*
55
+ .cache
56
+ nosetests.xml
57
+ coverage.xml
58
+ *.cover
59
+ *.py,cover
60
+ .hypothesis/
61
+ .pytest_cache/
62
+
63
+ # Translations
64
+ *.mo
65
+ *.pot
66
+
67
+ # Django stuff:
68
+ *.log
69
+ local_settings.py
70
+ db.sqlite3
71
+ db.sqlite3-journal
72
+
73
+ # Flask stuff:
74
+ instance/
75
+ .webassets-cache
76
+
77
+ # Scrapy stuff:
78
+ .scrapy
79
+
80
+ # Sphinx documentation
81
+ docs/_build/
82
+
83
+ # PyBuilder
84
+ target/
85
+
86
+ # Jupyter Notebook
87
+ .ipynb_checkpoints
88
+
89
+ # IPython
90
+ profile_default/
91
+ ipython_config.py
92
+
93
+ # pyenv
94
+ .python-version
95
+
96
+ # pipenv
97
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
99
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
100
+ # install all needed dependencies.
101
+ #Pipfile.lock
102
+
103
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
104
+ __pypackages__/
105
+
106
+ # Celery stuff
107
+ celerybeat-schedule
108
+ celerybeat.pid
109
+
110
+ # SageMath parsed files
111
+ *.sage.py
112
+
113
+ # Environments
114
+ .env
115
+ .venv
116
+ env/
117
+ venv/
118
+ ENV/
119
+ env.bak/
120
+ venv.bak/
121
+
122
+ # Spyder project settings
123
+ .spyderproject
124
+ .spyproject
125
+
126
+ # Rope project settings
127
+ .ropeproject
128
+
129
+ # mkdocs documentation
130
+ /site
131
+
132
+ # mypy
133
+ .mypy_cache/
134
+ .dmypy.json
135
+ dmypy.json
136
+
137
+ # Pyre type checker
138
+ .pyre/
139
+
140
+ # PyCharm
141
+ **/.idea/**/*
142
+
143
+ # Mac OS
144
+ .DS_Store
145
+
146
+ # data
147
+ **/data/**/*
148
+ !**/utils/data/**/*
149
+
150
+ # tmp files
151
+ **/tmp/**/*
152
+ api_key.py
153
+ tmp.*
154
+
155
+ ## Server logging
156
+ **/.logging/**/*
157
+
158
+ # DB cache
159
+ **/.db_cache/**/*
160
+
161
+ **/debugging/**/*
162
+
163
+ # embedding repo
164
+ instructor-embedding
165
+
166
+ # plugin cache
167
+ **/static/**/*
168
+
169
+ # frontend cache
170
+ frontend/node_modules/
171
+ frontend/.next/
172
+ frontend/.idea
173
+
174
+ tags
175
+ tags-opts
176
+ snapshots
177
+ branch_flag
178
+ branch-config
179
+ *.syncthing.*.tmp
180
+ cache
181
+ version.folder
182
+ at_processing
183
+
184
+ test.xlsx
185
+ test2.xlsx
186
+
187
+ # vm info
188
+ .vms
189
+ /vm_data
190
+ docker_vm_data
191
+ vmware_vm_data
192
+ .vmware*
193
+ .aws*
194
+
195
+ # result
196
+ **/result*/**/*
197
+
198
+ .vscode
199
+
200
+ dataimpulse_proxy_config.json
201
+
202
+ ## reference and draft and debug
203
+ reference/
204
+ draft/
205
+ manual_examine.py
206
+ run_human_examine.sh
207
+ quick_start.py
208
+ result_multi_apps_pengxiang_transformers12
209
+ evaluation_examples/settings/proxy/dataimpulse.json
210
+
211
+ # Local test configurations (not for public repo)
212
+ evaluation_examples/spiderman.json
213
+ evaluation_examples/test_50_random_proportional.json
214
+ evaluation_examples/test_chrome.json
.mise.toml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ [tools]
2
+ python = "3.12"
3
+
4
+ [env]
5
+ _.python.venv = { path = ".venv", create = true }
CONTRIBUTING_AR.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # دليل المساهمة في OSWorld 🌍
2
+
3
+ شكراً لاهتمامك بالمساهمة في **OSWorld**! نحن نرحب بجميع أنواع المساهمات، سواء كانت تحسينات في الكود، تحديثات للوثائق، أو إضافة مهام تقييم جديدة.
4
+
5
+ ## 🚀 كيف تبدأ؟
6
+
7
+ 1. **إنشاء Fork:** قم بإنشاء نسخة خاصة بك من المستودع على GitHub.
8
+ 2. **الاستنساخ:** قم باستنساخ المستودع محلياً:
9
+ ```bash
10
+ git clone https://github.com/YOUR_USERNAME/OSWorld.git
11
+ ```
12
+ 3. **إعداد البيئة:** اتبع التعليمات الموجودة في `README.md` لتثبيت المتطلبات.
13
+
14
+ ## 🛠 مجالات المساهمة
15
+
16
+ ### 1. تحسين البيئة (Environment)
17
+ يمكنك المساهمة في تحسين دعم المنصات المختلفة مثل:
18
+ * VMware / VirtualBox
19
+ * Docker (KVM)
20
+ * Cloud Providers (AWS, Azure, Aliyun)
21
+
22
+ ### 2. إضافة مهام تقييم (Evaluation Tasks)
23
+ يمكنك إضافة سيناريوهات جديدة في مجالات:
24
+ * تطبيقات الأوفيس (LibreOffice, Microsoft Office)
25
+ * تصفح الويب والمهام اليومية.
26
+ * البرمجيات المهنية (GIMP, VS Code, etc.)
27
+
28
+ ### 3. تحسين الوثائق
29
+ نحن نقدر جداً تحسين ملفات الـ README، إضافة أمثلة توضيحية، أو ترجمة الوثائق للغات أخرى.
30
+
31
+ ## 📝 قواعد الكود
32
+
33
+ * يرجى اتباع معايير **PEP 8** لكود Python.
34
+ * تأكد من إضافة تعليقات توضيحية للكود الجديد.
35
+ * قم بتحديث ملف `requirements.txt` إذا قمت بإضافة مكتبات جديدة.
36
+
37
+ ## 📬 إرسال التعديلات
38
+
39
+ 1. قم بإنشاء فرع جديد (Branch) لوصف تعديلك: `git checkout -b feature/my-new-feature`.
40
+ 2. قم بعمل Commit لتعديلاتك مع رسالة واضحة.
41
+ 3. قم برفع التعديلات (Push) إلى مستودعك.
42
+ 4. افتح **Pull Request** في المستودع الأصلي.
43
+
44
+ نحن نتطلع لرؤية مساهماتكم! 🚀
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright 2024 XLANG NLP Lab
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <p align="center">
2
+ <img src="https://huggingface.co/datasets/xlangai/assets/resolve/main/github_banner_v2.png" alt="Banner">
3
+ </p>
4
+
5
+ <p align="center">
6
+ <a href="https://os-world.github.io/">Website</a> •
7
+ <a href="https://arxiv.org/abs/2404.07972">Paper</a> •
8
+ <a href="https://timothyxxx.github.io/OSWorld/">Doc</a> •
9
+ <a href="https://github.com/xlang-ai/OSWorld/tree/main/evaluation_examples">Data</a> •
10
+ <a href="https://os-world.github.io/explorer.html">Data Viewer</a> •
11
+ <a href="https://discord.gg/4Gnw7eTEZR">Discord</a> •
12
+ <a href="CONTRIBUTING_AR.md">دليل المساهمة (AR)</a> •
13
+ <a href="https://drive.google.com/file/d/1XlEy49otYDyBlA3O9NbR0BpPfr2TXgaD/view?usp=drive_link">Cache</a>
14
+ </p>
15
+
16
+ <p align="center">
17
+ <a href="https://img.shields.io/badge/PRs-Welcome-red">
18
+ <img src="https://img.shields.io/badge/PRs-Welcome-red">
19
+ </a>
20
+ <a href="https://img.shields.io/github/last-commit/xlang-ai/OSWorld?color=green">
21
+ <img src="https://img.shields.io/github/last-commit/xlang-ai/OSWorld?color=green">
22
+ </a>
23
+ <a href="https://opensource.org/licenses/Apache-2.0">
24
+ <img src="https://img.shields.io/badge/License-Apache%202.0-blue.svg">
25
+ </a>
26
+ <a href="https://badge.fury.io/py/desktop-env">
27
+ <img src="https://badge.fury.io/py/desktop-env.svg">
28
+ </a>
29
+ <a href="https://pepy.tech/project/desktop-env">
30
+ <img src="https://static.pepy.tech/badge/desktop-env">
31
+ </a>
32
+ <br/>
33
+ </p>
34
+
35
+
36
+ ## 📢 Updates
37
+ - 2025-07-28: Introducing **OSWorld-Verified**! We have made major updates: we fixed several issues reported by the community, added more support for AWS (evaluation time can be reduced to within 1 hour through parallelization!), and made the benchmark signals more effective. Check out more in the [report](https://xlang.ai/blog/osworld-verified). We have run new model results in the latest version and updated them on the [official website](https://os-world.github.io/). Please compare your OSWorld results with the new benchmark results when running the latest version.
38
+ - 2025-05-01: If you need pre-downloaded files for init state setup, we have made them available for download [here](https://drive.google.com/file/d/1XlEy49otYDyBlA3O9NbR0BpPfr2TXgaD/view?usp=drive_link).
39
+ - 2024-10-22: We supported Docker🐳 for hosting virtual machines on virtualized platforms. Check below for detailed instructions!
40
+ - 2024-06-15: We refactored the environment code to decouple the VMware integration, and started to support other platforms such as VirtualBox, AWS, Azure, etc. Hold tight!
41
+ - 2024-04-11: We released our [paper](https://arxiv.org/abs/2404.07972), [environment and benchmark](https://github.com/xlang-ai/OSWorld), and [project page](https://os-world.github.io/). Check it out!
42
+
43
+ ## 💾 Installation
44
+ ### VMware/VirtualBox (Desktop, Laptop, Bare Metal Machine)
45
+ Suppose you are operating on a system that has not been virtualized (e.g. your desktop, laptop, bare metal machine), meaning you are not utilizing a virtualized environment like AWS, Azure, or k8s.
46
+ If this is the case, proceed with the instructions below. However, if you are on a virtualized platform, please refer to the [Docker](#docker-server-with-kvm-support-for-better-performance) section.
47
+
48
+ 1. First, clone this repository and `cd` into it. Then, install the dependencies listed in `requirements.txt`. It is recommended that you use the latest version of Conda to manage the environment, but you can also choose to manually install the dependencies. Please ensure that the version of Python is >= 3.10.
49
+ ```bash
50
+ # Clone the OSWorld repository
51
+ git clone https://github.com/xlang-ai/OSWorld
52
+
53
+ # Change directory into the cloned repository
54
+ cd OSWorld
55
+
56
+ # Optional: Create a Conda environment for OSWorld
57
+ # conda create -n osworld python=3.10
58
+ # conda activate osworld
59
+
60
+ # Install required dependencies
61
+ pip install -r requirements.txt
62
+ ```
63
+
64
+ Alternatively, you can install the environment without any benchmark tasks:
65
+ ```bash
66
+ pip install desktop-env
67
+ ```
68
+
69
+ 2. Install [VMware Workstation Pro](https://www.vmware.com/products/workstation-pro/workstation-pro-evaluation.html) (for systems with Apple Chips, you should install [VMware Fusion](https://support.broadcom.com/group/ecx/productdownloads?subfamily=VMware+Fusion)) and configure the `vmrun` command. For the installation process, refer to [How to install VMware Workstation Pro](desktop_env/providers/vmware/INSTALL_VMWARE.md). Verify the successful installation by running the following:
70
+ ```bash
71
+ vmrun -T ws list
72
+ ```
73
+ If the installation and the environment variable setup are successful, you will see a message listing the currently running virtual machines.
74
+ > **Note:** We also support using [VirtualBox](https://www.virtualbox.org/) if you have issues with VMware Pro. However, features such as parallelism and macOS on Apple chips might not be well-supported.
75
+
76
+ All set! Our setup script will automatically download the necessary virtual machines and configure the environment for you.
77
+
78
+ ### Docker (Server with KVM Support for Better Performance)
79
+ If you are running on a non-bare metal server, or prefer not to use VMware and VirtualBox platforms, we recommend using our Docker support.
80
+
81
+ #### Prerequisite: Check if your machine supports KVM
82
+ We recommend running the VM with KVM support. To check if your hosting platform supports KVM, run
83
+ ```
84
+ egrep -c '(vmx|svm)' /proc/cpuinfo
85
+ ```
86
+ on Linux. If the return value is greater than zero, the processor should be able to support KVM.
87
+ > **Note**: macOS hosts generally do not support KVM. You are advised to use VMware if you would like to run OSWorld on macOS.
88
+
89
+ #### Install Docker
90
+ If your hosting platform supports a graphical user interface (GUI), you may refer to [Install Docker Desktop on Linux](https://docs.docker.com/desktop/install/linux/) or [Install Docker Desktop on Windows](https://docs.docker.com/desktop/install/windows-install/) based on your OS. Otherwise, you may [Install Docker Engine](https://docs.docker.com/engine/install/).
91
+
92
+ #### Running Experiments
93
+ Add the following arguments when initializing `DesktopEnv`:
94
+ - `provider_name`: `docker`
95
+ - `os_type`: `Ubuntu` or `Windows`, depending on the OS of the VM
96
+ > **Note**: If the experiment is interrupted abnormally (e.g., by interrupting signals), there may be residual docker containers which could affect system performance over time. Please run `docker stop $(docker ps -q) && docker rm $(docker ps -a -q)` to clean up.
97
+
98
+ ### AWS
99
+ Using cloud services for parallel evaluation can significantly speed up evaluation (reducing evaluation time to within 1 hour through parallelization!) and can even serve as infrastructure for training.
100
+ We provide comprehensive AWS support with a Host-Client architecture that enables large-scale parallel evaluation of OSWorld tasks.
101
+ For detailed setup instructions, see [Setup Guideline](SETUP_GUIDELINE.md) and [AWS Configuration Guide](https://github.com/xlang-ai/OSWorld/blob/main/desktop_env/providers/aws/AWS_GUIDELINE.md).
102
+
103
+ ### Others
104
+ We are working on supporting more platforms 👷. Please hold tight!
105
+
106
+
107
+ ## 🚀 Quick Start
108
+ Run the following minimal example to interact with the environment:
109
+
110
+ ```bash
111
+ # Basic usage with default settings
112
+ python quickstart.py
113
+
114
+ # Customize provider and VM path
115
+ python quickstart.py --provider_name vmware --path_to_vm "path/to/your/vm.vmx"
116
+ ```
117
+
118
+ You will see all the logs of the system running normally, including the successful creation of the environment, completion of setup, and successful execution of actions. In the end, you will observe a successful right-click on the screen, which means you are ready to go.
119
+
120
+ ## 🧪 Experiments
121
+ ### Agent Baselines
122
+
123
+ > **⚠️ Important Configuration Requirements:**
124
+ >
125
+ > * **Google Account Tasks**: Some tasks require Google account access and OAuth2.0 configuration. Please refer to [Setup Guideline - Google Account Setup](SETUP_GUIDELINE.md#1-google-account-setup) for detailed setup instructions.
126
+ > * **Proxy Configuration**: Some tasks may require proxy settings to function properly (this depends on the strength of website defenses against your network location). Please refer to [Setup Guideline - Proxy Configuration](SETUP_GUIDELINE.md#2-proxy-configuration).
127
+ > * **Impact of Missing Configuration**: If these configurations are not properly set up, the corresponding tasks will fail to execute correctly, leading to lower evaluation scores.
128
+
129
+
130
+ If you wish to run the baseline agent used in our paper, you can execute the following command as an example under the GPT-4o pure-screenshot setting:
131
+
132
+ Set **OPENAI_API_KEY** environment variable with your API key
133
+ ```bash
134
+ export OPENAI_API_KEY='changeme'
135
+ ```
136
+
137
+ Optionally, set **OPENAI_BASE_URL** to use a custom OpenAI-compatible API endpoint
138
+ ```bash
139
+ export OPENAI_BASE_URL='http://your-custom-endpoint.com/v1' # Optional: defaults to https://api.openai.com
140
+ ```
141
+
142
+ Single-threaded execution (deprecated, using `vmware` provider as example)
143
+ ```bash
144
+ python run.py \
145
+ --provider_name vmware \
146
+ --path_to_vm Ubuntu/Ubuntu.vmx \
147
+ --headless \
148
+ --observation_type screenshot \
149
+ --model gpt-4o \
150
+ --sleep_after_execution 3 \
151
+ --max_steps 15 \
152
+ --result_dir ./results \
153
+ --client_password password
154
+ ```
155
+
156
+ Parallel execution (example showing switching provider to `docker`)
157
+ ```bash
158
+ python scripts/python/run_multienv.py \
159
+ --provider_name docker \
160
+ --headless \
161
+ --observation_type screenshot \
162
+ --model gpt-4o \
163
+ --sleep_after_execution 3 \
164
+ --max_steps 15 \
165
+ --num_envs 10 \
166
+ --client_password password
167
+ ```
168
+
169
+ The results, which include screenshots, actions, and video recordings of the agent's task completion, will be saved in the `./results` (or other `result_dir` you specified) directory in this case.
170
+ You can then run the following command to obtain the result:
171
+
172
+ ```bash
173
+ # Basic usage with default parameters
174
+ python show_result.py
175
+
176
+ # Specify custom parameters
177
+ python show_result.py \
178
+ --action_space pyautogui \
179
+ --model gpt-4o \
180
+ --observation_type screenshot \
181
+ --result_dir ./results
182
+
183
+ # Show detailed scores per domain (format: score/total)
184
+ python show_result.py --detailed
185
+ ```
186
+
187
+ The script will display:
188
+ - Per-domain success rates
189
+ - Category-level statistics (Office, Daily, Professional)
190
+ - Overall success rate and total score
191
+ - With `--detailed` flag: compact format showing "score/total" for each domain
192
+
193
+ ### Manual Task Examination
194
+ For manual verification and examination of specific benchmark tasks, you can use the manual examination tool:
195
+
196
+ ```bash
197
+ python scripts/python/manual_examine.py \
198
+ --headless \
199
+ --observation_type screenshot \
200
+ --result_dir ./results_human_examine \
201
+ --test_all_meta_path evaluation_examples/test_all.json \
202
+ --domain libreoffice_impress \
203
+ --example_id a669ef01-ded5-4099-9ea9-25e99b569840 \
204
+ --max_steps 3
205
+ ```
206
+
207
+ This tool allows you to:
208
+ - Manually execute tasks in the environment
209
+ - Verify task correctness and evaluation metrics
210
+ - Record the execution process with screenshots and videos
211
+ - Examine specific problematic tasks
212
+
213
+ See `scripts/bash/run_manual_examine.sh` for example task IDs across different domains.
214
+
215
+ ## Evaluation
216
+ ### Local Evaluation
217
+ Please start by reading through the [agent interface](https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/README.md) and the [environment interface](https://github.com/xlang-ai/OSWorld/blob/main/desktop_env/README.md).
218
+ Correctly implement the agent interface and import your customized version in the `run.py` (for single-threaded execution) or `scripts/python/run_multienv.py` / `scripts/python/run_multienv_xxx.py` (for parallel execution) file.
219
+ Afterward, you can execute a command similar to the one in the previous section to run the benchmark on your agent.
220
+
221
+ ### Public Evaluation
222
+ If you want your results to be verified and displayed on the verified leaderboard, you need to schedule a meeting with us (current maintainers: tianbaoxiexxx@gmail.com, yuanmengqi732@gmail.com) to run your agent code on our side and have us report the results.
223
+ You need to upload and allow us to disclose your agent implementation under the OSWorld framework (you may choose not to expose your model API to the public), along with a report that allows the public to understand what's happening behind the scenes.
224
+ Alternatively, if you are from a trusted institution, you can share your monitoring data and trajectories with us.
225
+ Please carefully follow the [Setup Guideline - Public Evaluation Platform](SETUP_GUIDELINE.md#3-public-evaluation-platform) to get results.
226
+
227
+
228
+ ## ❓ FAQ
229
+ ### What is the username and password for the virtual machines?
230
+ The username and password for the virtual machines are as follows (for provider `vmware`, `virtualbox` and `docker`): we set the account credentials for Ubuntu as `user` / `password`.
231
+ For cloud service providers like `aws`, the default password is `osworld-public-evaluation` instead, to prevent attacks that exploit weak passwords.
232
+ If you make further modifications, remember to set the client_password variable and pass it to DesktopEnv and Agent (if supported) when running experiments.
233
+ Some features like setting up proxy require the environment to have the client VM password to obtain sudo privileges, and for some OSWorld tasks, the agent needs the password to obtain sudo privileges to complete them.
234
+
235
+ ### How to setup the account and credentials for Google and Google Drive?
236
+
237
+ See [Setup Guideline - Google Account Setup](SETUP_GUIDELINE.md#1-google-account-setup).
238
+
239
+ ### How can I configure a proxy for the VM (if I'm behind the GFW, or I don't want some of my tasks to be identified as bot and get lower scores)?
240
+
241
+ See [Setup Guideline - Proxy Configuration](SETUP_GUIDELINE.md#2-proxy-configuration).
242
+ We also provide a pre-configured solution based on DataImpulse, please refer to the [proxy setup section](SETUP_GUIDELINE.md#23-proxy-for-specific-tasks-recommended).
243
+
244
+ ### Open Source Contributors
245
+
246
+ Thanks to all the contributors!
247
+
248
+ <a href="https://github.com/xlang-ai/OSWorld/graphs/contributors">
249
+ <img src="https://stg.contrib.rocks/image?repo=xlang-ai/OSWorld" />
250
+ </a>
251
+
252
+
253
+ ## 📄 Citation
254
+ If you find this environment useful, please consider citing our work:
255
+ ```
256
+ @misc{OSWorld,
257
+ title={OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments},
258
+ author={Tianbao Xie and Danyang Zhang and Jixuan Chen and Xiaochuan Li and Siheng Zhao and Ruisheng Cao and Toh Jing Hua and Zhoujun Cheng and Dongchan Shin and Fangyu Lei and Yitao Liu and Yiheng Xu and Shuyan Zhou and Silvio Savarese and Caiming Xiong and Victor Zhong and Tao Yu},
259
+ year={2024},
260
+ eprint={2404.07972},
261
+ archivePrefix={arXiv},
262
+ primaryClass={cs.AI}
263
+ }
264
+ ```
265
+
266
+ ## Acknowledgement for OSWorld-Verified
267
+ Special thanks to the following institutions that provided feedback and participated in the fixes (as well as institutions that provided feedback during the process): [MoonShot AI, a.k.a. Kimi](https://www.moonshot.ai/), [Human Data](https://www.hud.so/), [OpenAI](https://openai.com/), [ByteDance Seed TARS](https://seed-tars.com/), [Anthropic](https://www.anthropic.com/), [Simular](https://www.simular.ai/), [HKU Data Intelligence Lab](https://sites.google.com/view/chaoh).
268
+
269
+ Special thanks to the following students who participated in the specific fixes: [Mengqi Yuan](https://yuanmengqi.github.io/), [Danyang Zhang](https://zdy023.github.io/), [Xinzhuang Xiong](https://thisisxxz.com/), [Zhennan Shen](https://scholar.google.com/citations?user=JPwg5MwAAAAJ&hl=en), [Zilong Zhou](https://github.com/adlsdztony), Yanxu Chen, [Jiaqi Deng](https://millank0817.github.io/), [Tianbao Xie](https://tianbaoxie.com/), Junda Chen, [Jixuan Chen](https://chenjix.github.io/), [Haoyuan Wu](https://www.linkedin.com/in/haoyuan-wu-240878291/).
270
+
271
+ Special thanks to the following students who participated in running the re-evaluation: [Mengqi Yuan](https://yuanmengqi.github.io/), [Zilong Zhou](https://github.com/adlsdztony), [Xinyuan Wang](https://xinyuanwangcs.github.io/), [Bowen Wang](https://bowenbryanwang.github.io/).
272
+
273
+ ## You might also be interested
274
+
275
+ - **OSWorld-MCP**: Benchmarking MCP Tool Invocation in Computer-Use Agents. [Website](https://osworld-mcp.github.io/)
SETUP_GUIDELINE.md ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OSWorld Setup and Evaluation Guide
2
+
3
+ This comprehensive guide covers all aspects of setting up and running OSWorld evaluations, including account configuration, proxy setup, and public evaluation platform deployment.
4
+
5
+ ## Table of Contents
6
+
7
+ 1. [Google Account Setup](#1-google-account-setup)
8
+ 2. [Proxy Configuration](#2-proxy-configuration)
9
+ 3. [Public Evaluation Platform](#3-public-evaluation-platform)
10
+
11
+ ---
12
+
13
+ ## 1. Google Account Setup
14
+
15
+ For tasks involving Google or Google Drive, you need a real Google account with configured OAuth2.0 secrets.
16
+
17
+ > **Attention**: To prevent environment reset and result evaluation conflicts caused by multiple people using the same Google account simultaneously, please register a private Google account rather than using a shared one.
18
+
19
+ ### 1.1 Register A Blank Google Account
20
+
21
+ 1. Go to the Google website and register a new, blank account
22
+ - You do not need to provide any recovery email or phone for testing purposes
23
+ - **IGNORE** any security recommendations
24
+ - Turn **OFF** the [2-Step Verification](https://support.google.com/accounts/answer/1064203?hl=en&co=GENIE.Platform%3DDesktop#:~:text=Open%20your%20Google%20Account.,Select%20Turn%20off.) to avoid failure in environment setup
25
+
26
+ <p align="center">
27
+ <img src="assets/googleshutoff.png" width="40%" alt="Shut Off 2-Step Verification">
28
+ </p>
29
+
30
+ > **Attention**: We strongly recommend registering a new blank account instead of using an existing one to avoid messing up your personal workspace.
31
+
32
+ 2. Copy and rename `settings.json.template` to `settings.json` under `evaluation_examples/settings/google/`. Replace the two fields:
33
+
34
+ ```json
35
+ {
36
+ "email": "your_google_account@gmail.com",
37
+ "password": "your_google_account_password"
38
+ }
39
+ ```
40
+
41
+ ### 1.2 Create A Google Cloud Project
42
+
43
+ 1. Navigate to [Google Cloud Project Creation](https://console.cloud.google.com/projectcreate) and create a new GCP project (see [Create a Google Cloud Project](https://developers.google.com/workspace/guides/create-project) for detailed steps)
44
+
45
+ 2. Go to the [Google Drive API console](https://console.cloud.google.com/apis/library/drive.googleapis.com?) and enable the Google Drive API for the created project (see [Enable and disable APIs](https://support.google.com/googleapi/answer/6158841?hl=en))
46
+
47
+ <p align="center">
48
+ <img src="assets/creategcp.png" width="45%" style="margin-right: 5%;" alt="Create GCP">
49
+ <img src="assets/enableapi.png" width="45%" alt="Google Drive API">
50
+ </p>
51
+
52
+ ### 1.3 Configure OAuth Consent Screen
53
+
54
+ Go to [OAuth consent screen](https://console.cloud.google.com/apis/credentials/consent):
55
+
56
+ 1. Select **External** as the User Type and click **CREATE**
57
+
58
+ <p align="center">
59
+ <img src="assets/external.png" width="80%" alt="External User Type">
60
+ </p>
61
+
62
+ 2. Fill in the required fields:
63
+ - **App name**: Any name you prefer
64
+ - **User support email**: Your Google account email
65
+ - **Developer contact information**: Your Google account email
66
+ - Click **SAVE AND CONTINUE**
67
+
68
+ <p align="center">
69
+ <img src="assets/appinfo.png" width="80%" alt="App Information">
70
+ </p>
71
+
72
+ 3. Add scopes:
73
+ - Click **ADD OR REMOVE SCOPES**
74
+ - Filter and select: `https://www.googleapis.com/auth/drive`
75
+ - Click **UPDATE** and **SAVE AND CONTINUE**
76
+
77
+ <p align="center">
78
+ <img src="assets/addscope.png" width="80%" alt="Add Scopes">
79
+ </p>
80
+
81
+ 4. Add test users:
82
+ - Click **ADD USERS**
83
+ - Add your Google account email
84
+ - Click **SAVE AND CONTINUE**
85
+
86
+ <p align="center">
87
+ <img src="assets/adduser.png" width="80%" alt="Add Test Users">
88
+ </p>
89
+
90
+ ### 1.4 Create OAuth2.0 Credentials
91
+
92
+ 1. Go to [Credentials](https://console.cloud.google.com/apis/credentials) page
93
+ 2. Click **CREATE CREDENTIALS** → **OAuth client ID**
94
+ 3. Select **Desktop app** as Application type
95
+ 4. Name it (e.g., "OSWorld Desktop Client")
96
+ 5. Click **CREATE**
97
+
98
+ <p align="center">
99
+ <img src="assets/createcredential.png" width="80%" alt="Create Credentials">
100
+ </p>
101
+
102
+ 6. Download the JSON file and rename it to `credentials.json`
103
+ 7. Place it in `evaluation_examples/settings/google/`
104
+
105
+ <p align="center">
106
+ <img src="assets/downloadjson.png" width="80%" alt="Download JSON">
107
+ </p>
108
+
109
+ ### 1.5 Potential Issues
110
+
111
+ #### Issue 1: Access Blocked During OAuth Flow
112
+
113
+ **Symptom**: "Access blocked: OSWorld's request is invalid" error
114
+
115
+ **Solution**: Ensure you've added your Google account as a test user in the OAuth consent screen configuration.
116
+
117
+ #### Issue 2: Scope Not Granted
118
+
119
+ **Symptom**: Application doesn't have necessary permissions
120
+
121
+ **Solution**: Verify that `https://www.googleapis.com/auth/drive` scope is added in the OAuth consent screen.
122
+
123
+ ---
124
+
125
+ ## 2. Proxy Configuration
126
+
127
+ If you're using OSWorld behind a firewall or need proxy configuration, follow these steps.
128
+
129
+ ### 2.1 Configure Proxy on Host Machine
130
+
131
+ By default, proxy software usually listens only on localhost (`127.0.0.1`), which cannot be reached from the virtual machine. You need to make your proxy software listen on the VMware network card IP or on `0.0.0.0`.
132
+
133
+ #### Find VM and Host IP Addresses
134
+
135
+ After launching the VM:
136
+
137
+ ```bash
138
+ # Run this command on host
139
+ # Change ws to fusion if you use VMware Fusion
140
+ vmrun -T ws getGuestIPAddress /path/to/vmx/file
141
+ ```
142
+
143
+ **On Linux (Ubuntu)**:
144
+ ```bash
145
+ ip a # Check IP addresses of each network card
146
+ ```
147
+
148
+ **On Windows**:
149
+ ```cmd
150
+ ipconfig # Check IP addresses of each network card
151
+ ```
152
+
153
+ Look for the VMware network card (usually named `vmnetX` like `vmnet8`). Make sure to use an IP address within the same network segment as the VM.
154
+
155
+ #### Configure Proxy Software
156
+
157
+ Configure your proxy software to listen on the VMware network card IP:
158
+
159
+ <p align="center">
160
+ <img src="assets/proxysetup.png" width="80%" alt="Proxy Setup">
161
+ </p>
162
+
163
+ #### Alternative: Port Forwarding
164
+
165
+ If you cannot change the listening address, set up port forwarding.
166
+
167
+ **On Linux (Ubuntu)**:
168
+ ```bash
169
+ # Forward 192.168.108.1:1080 to 127.0.0.1:1080
170
+ socat TCP-LISTEN:1080,bind=192.168.108.1,fork TCP:127.0.0.1:1080
171
+ ```
172
+
173
+ **On Windows** (with admin privileges):
174
+ ```cmd
175
+ netsh interface portproxy add v4tov4 listenport=1080 listenaddress=192.168.108.1 connectport=1080 connectaddress=127.0.0.1
176
+ ```
177
+
178
+ ### 2.2 Configure Proxy in Virtual Machine
179
+
180
+ #### For VMware/VirtualBox
181
+
182
+ 1. Start the VM and log in
183
+ 2. Open terminal and edit proxy settings:
184
+
185
+ ```bash
186
+ # Edit environment variables
187
+ sudo nano /etc/environment
188
+ ```
189
+
190
+ 3. Add the following lines (replace with your host IP and port):
191
+
192
+ ```bash
193
+ http_proxy="http://192.168.108.1:1080"
194
+ https_proxy="http://192.168.108.1:1080"
195
+ no_proxy="localhost,127.0.0.1"
196
+ ```
197
+
198
+ 4. For APT package manager:
199
+
200
+ ```bash
201
+ sudo nano /etc/apt/apt.conf.d/proxy.conf
202
+ ```
203
+
204
+ Add:
205
+ ```
206
+ Acquire::http::Proxy "http://192.168.108.1:1080";
207
+ Acquire::https::Proxy "http://192.168.108.1:1080";
208
+ ```
209
+
210
+ 5. Reboot the VM or reload environment:
211
+
212
+ ```bash
213
+ source /etc/environment
214
+ ```
215
+
216
+ #### For Docker
217
+
218
+ When using Docker provider, you can set proxy environment variables:
219
+
220
+ ```python
221
+ env = DesktopEnv(
222
+ provider_name="docker",
223
+ # ... other parameters
224
+ )
225
+ ```
226
+
227
+ Set environment variables before running:
228
+ ```bash
229
+ export HTTP_PROXY=http://your-proxy:port
230
+ export HTTPS_PROXY=http://your-proxy:port
231
+ ```
232
+
233
+ ### 2.3 Proxy for Specific Tasks (Recommended)
234
+
235
+ OSWorld provides built-in proxy support using DataImpulse or similar services:
236
+
237
+ 1. Register at [DataImpulse](https://dataimpulse.com/)
238
+ 2. Purchase a US residential IP package (approximately $1 per 1GB)
239
+ 3. Configure credentials in `evaluation_examples/settings/proxy/dataimpulse.json`:
240
+
241
+ ```json
242
+ [
243
+ {
244
+ "host": "gw.dataimpulse.com",
245
+ "port": 823,
246
+ "username": "your_username",
247
+ "password": "your_password",
248
+ "protocol": "http",
249
+ "provider": "dataimpulse",
250
+ "type": "residential",
251
+ "country": "US",
252
+ "note": "Dataimpulse Residential Proxy"
253
+ }
254
+ ]
255
+ ```
256
+
257
+ OSWorld will automatically use the proxy for tasks that need it when `enable_proxy=True` is set in `DesktopEnv`.
258
+
259
+ ---
260
+
261
+ ## 3. Public Evaluation Platform
262
+
263
+ We provide an AWS-based platform for large-scale parallel evaluation of OSWorld tasks.
264
+
265
+ ### 3.1 Architecture Overview
266
+
267
+ - **Host Instance**: Central controller that stores code, configurations, and manages task execution
268
+ - **Client Instances**: Worker nodes automatically launched to perform tasks in parallel
269
+
270
+ ### 3.2 Platform Deployment
271
+
272
+ #### Step 1: Launch the Host Instance
273
+
274
+ 1. Create an EC2 instance in AWS console
275
+ 2. **Instance type recommendations**:
276
+ - `t3.medium`: For < 5 parallel environments
277
+ - `t3.large`: For < 15 parallel environments
278
+ - `c4.8xlarge`: For 15+ parallel environments
279
+ 3. **AMI**: Ubuntu Server 24.04 LTS (HVM), SSD Volume Type
280
+ 4. **Storage**: At least 50GB
281
+ 5. **Security group**: Open port 8080 for monitor service
282
+ 6. **VPC**: Use default (note the VPC ID for later)
283
+
284
+ #### Step 2: Connect to Host Instance
285
+
286
+ 1. Download the `.pem` key file when creating the instance
287
+ 2. Set permissions:
288
+ ```bash
289
+ chmod 400 <your_key_file_path>
290
+ ```
291
+ 3. Connect via SSH:
292
+ ```bash
293
+ ssh -i <your_key_path> ubuntu@<your_public_dns>
294
+ ```
295
+
296
+ #### Step 3: Set Up Host Machine
297
+
298
+ ```bash
299
+ # Clone OSWorld repository
300
+ git clone https://github.com/xlang-ai/OSWorld
301
+ cd OSWorld
302
+
303
+ # Optional: Create Conda environment
304
+ # conda create -n osworld python=3.10
305
+ # conda activate osworld
306
+
307
+ # Install dependencies
308
+ pip install -r requirements.txt
309
+ ```
310
+
311
+ #### Step 4: Configure AWS Client Machines
312
+
313
+ ##### Security Group Configuration
314
+
315
+ Create a security group with the following rules:
316
+
317
+ **Inbound Rules** (8 rules required):
318
+
319
+ | Type | Protocol | Port Range | Source | Description |
320
+ |------------|----------|------------|----------------|----------------------------|
321
+ | SSH | TCP | 22 | 0.0.0.0/0 | SSH access |
322
+ | HTTP | TCP | 80 | 172.31.0.0/16 | HTTP traffic |
323
+ | Custom TCP | TCP | 5000 | 172.31.0.0/16 | OSWorld backend service |
324
+ | Custom TCP | TCP | 5910 | 0.0.0.0/0 | NoVNC visualization port |
325
+ | Custom TCP | TCP | 8006 | 172.31.0.0/16 | VNC service port |
326
+ | Custom TCP | TCP | 8080 | 172.31.0.0/16 | VLC service port |
327
+ | Custom TCP | TCP | 8081 | 172.31.0.0/16 | Additional service port |
328
+ | Custom TCP | TCP | 9222 | 172.31.0.0/16 | Chrome control port |
329
+
330
+ **Outbound Rules** (1 rule required):
331
+
332
+ | Type | Protocol | Port Range | Destination | Description |
333
+ |-------------|----------|------------|-------------|----------------------------|
334
+ | All traffic | All | All | 0.0.0.0/0 | Allow all outbound traffic |
335
+
336
+ Record the `AWS_SECURITY_GROUP_ID`.
337
+
338
+ ##### VPC and Subnet Configuration
339
+
340
+ 1. Note the **VPC ID** and **Subnet ID** from your host instance
341
+ 2. Record the **Subnet ID** as `AWS_SUBNET_ID`
342
+
343
+ ##### AWS Access Keys
344
+
345
+ 1. Go to AWS Console → Security Credentials
346
+ 2. Create access key
347
+ 3. Record `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`
348
+
349
+ ### 3.3 Environment Setup
350
+
351
+ #### Google Drive Integration (Optional)
352
+
353
+ Follow [Section 1: Google Account Setup](#1-google-account-setup) above.
354
+
355
+ **Note**: OSWorld includes 8 Google Drive tasks out of 369 total tasks. You can:
356
+ - Complete setup for all 369 tasks, or
357
+ - Skip Google Drive tasks and evaluate 361 tasks (officially supported)
358
+
359
+ #### Set Environment Variables
360
+
361
+ ```bash
362
+ # API Keys (if using)
363
+ # export OPENAI_API_KEY="your_openai_api_key"
364
+ # export ANTHROPIC_API_KEY="your_anthropic_api_key"
365
+
366
+ # AWS Configuration
367
+ export AWS_ACCESS_KEY_ID="your_access_key"
368
+ export AWS_SECRET_ACCESS_KEY="your_security_access_key"
369
+ export AWS_REGION="us-east-1" # or your preferred region
370
+ export AWS_SECURITY_GROUP_ID="sg-xxxx"
371
+ export AWS_SUBNET_ID="subnet-xxxx"
372
+ ```
373
+
374
+ ### 3.4 Running Evaluations
375
+
376
+ ```bash
377
+ # Example: Run OpenAI CUA
378
+ python scripts/python/run_multienv_openaicua.py \
379
+ --headless \
380
+ --observation_type screenshot \
381
+ --model computer-use-preview \
382
+ --result_dir ./results_operator \
383
+ --test_all_meta_path evaluation_examples/test_all.json \
384
+ --region us-east-1 \
385
+ --max_steps 50 \
386
+ --num_envs 5 \
387
+ --client_password osworld-public-evaluation
388
+
389
+ # Example: Run Claude (via AWS Bedrock)
390
+ python scripts/python/run_multienv_claude.py \
391
+ --headless \
392
+ --observation_type screenshot \
393
+ --action_space claude_computer_use \
394
+ --model claude-4-sonnet-20250514 \
395
+ --result_dir ./results_claude \
396
+ --test_all_meta_path evaluation_examples/test_all.json \
397
+ --max_steps 50 \
398
+ --num_envs 5 \
399
+ --provider_name aws \
400
+ --client_password osworld-public-evaluation
401
+ ```
402
+
403
+ **Key Parameters**:
404
+ - `--num_envs`: Number of parallel environments
405
+ - `--max_steps`: Maximum steps per task
406
+ - `--result_dir`: Output directory for results
407
+ - `--test_all_meta_path`: Path to test set metadata
408
+ - `--region`: AWS region
409
+
410
+ ### 3.5 Monitoring and Results
411
+
412
+ #### Web Monitoring Tool
413
+
414
+ ```bash
415
+ cd monitor
416
+ pip install -r requirements.txt
417
+ python main.py
418
+ ```
419
+
420
+ Access at: `http://<host-public-ip>:8080`
421
+
422
+ #### VNC Remote Desktop Access
423
+
424
+ Access VMs via VNC at: `http://<client-public-ip>:5910/vnc.html`
425
+
426
+ Default password: `osworld-public-evaluation`
427
+
428
+ ### 3.6 Submitting Results
429
+
430
+ For leaderboard submission, contact:
431
+ - tianbaoxiexxx@gmail.com
432
+ - yuanmengqi732@gmail.com
433
+
434
+ **Options**:
435
+ 1. **Self-reported**: Submit results with monitor data and trajectories
436
+ 2. **Verified**: Schedule a meeting to run your agent code on our infrastructure
437
+
438
+ ---
439
+
440
+ ## Additional Resources
441
+
442
+ - [Main README](README.md) - Project overview and quick start
443
+ - [Installation Guide](README.md#-installation) - Detailed installation instructions
444
+ - [FAQ](README.md#-faq) - Frequently asked questions
445
+ - [Scripts Documentation](scripts/README.md) - Information about run scripts
446
+
447
+ ## Support
448
+
449
+ If you encounter issues or have questions:
450
+ - Open an issue on [GitHub](https://github.com/xlang-ai/OSWorld/issues)
451
+ - Join our [Discord](https://discord.gg/4Gnw7eTEZR)
452
+ - Email the maintainers (see contact information above)
assets/authorization.png ADDED

Git LFS Details

  • SHA256: dd2c1e15672a7a473a3fe59e2d00fc2441c732cb391f8ca9bbd917d12f4eee16
  • Pointer size: 131 Bytes
  • Size of remote file: 821 kB
assets/creategcp.png ADDED

Git LFS Details

  • SHA256: 49ced4afcfcb7cbe6180777fdc43416c4beaadfab9977cbaa65f21d45cffcd31
  • Pointer size: 131 Bytes
  • Size of remote file: 192 kB
assets/desktopapp.png ADDED

Git LFS Details

  • SHA256: 03af749d338e64d4d5ec7db9913847adf9a3101f171114c12ced08911823fd2e
  • Pointer size: 131 Bytes
  • Size of remote file: 224 kB
assets/developer.png ADDED

Git LFS Details

  • SHA256: ab7292cfcc5b523a66ca55bcb4e39792f8800a7d708664b101805602be46d6f8
  • Pointer size: 131 Bytes
  • Size of remote file: 190 kB
assets/enableapi.png ADDED

Git LFS Details

  • SHA256: 6f6aed97e6c8df6f4856a14a5e28440ae77ed5cff606323bd4ae1273ee933dae
  • Pointer size: 131 Bytes
  • Size of remote file: 188 kB
assets/googleidentity.png ADDED
assets/googlephonecode.png ADDED
assets/googleshutoff.png ADDED
assets/netsetting1.png ADDED
assets/netsetting2.png ADDED
assets/netsetting3.png ADDED
assets/netsetting4.png ADDED
assets/oauth2.0.png ADDED

Git LFS Details

  • SHA256: 144f68b6e625d5712ff85910ce583d1cb593e51c7f17dcbcc56b8b9dd4083d35
  • Pointer size: 131 Bytes
  • Size of remote file: 151 kB
assets/oauthapp.png ADDED

Git LFS Details

  • SHA256: 44e1549a798924dcda2411e0060cc1a8e43dea3326b2d443f4886beec712db36
  • Pointer size: 131 Bytes
  • Size of remote file: 219 kB
assets/proxysetup-zh.png ADDED

Git LFS Details

  • SHA256: a66c94b1d2518c397485e54c30356830f4104a1a32320c784fa39e4b8fe215fb
  • Pointer size: 131 Bytes
  • Size of remote file: 110 kB
assets/proxysetup.png ADDED

Git LFS Details

  • SHA256: 7821f4eca280d08c64627834e2ef3cd3a8bdab9340daefd69aaa41cc632157d5
  • Pointer size: 131 Bytes
  • Size of remote file: 115 kB
assets/pubeval1.png ADDED
assets/pubeval2.png ADDED

Git LFS Details

  • SHA256: 156bb2cf3192c05eed6207b9530c5a6aba66fe3e569cd57f0eb4286d204c2aaa
  • Pointer size: 131 Bytes
  • Size of remote file: 178 kB
assets/pubeval3.png ADDED

Git LFS Details

  • SHA256: cc2693e712ddc548f588c206fb834658cce285c0fd85e9a772bcb5ebedb158fd
  • Pointer size: 131 Bytes
  • Size of remote file: 316 kB
assets/pubeval4.png ADDED
assets/pubeval5.png ADDED
assets/pubeval_gdrive_auth.jpg ADDED

Git LFS Details

  • SHA256: b6f4c435173be710c9a61625e27c8770379c2a9308cec192d5ad2d116dabe977
  • Pointer size: 131 Bytes
  • Size of remote file: 172 kB
assets/pubeval_monitor1.jpg ADDED

Git LFS Details

  • SHA256: 3fe75f55e0037b9ef90495e1878cb9609434afd9369038b283ef81b448a2b385
  • Pointer size: 132 Bytes
  • Size of remote file: 1.04 MB
assets/pubeval_monitor2.jpg ADDED

Git LFS Details

  • SHA256: eba2246d4094f2975553b4bdbc5f65f7b8312e3f7976b284b100771db5579125
  • Pointer size: 131 Bytes
  • Size of remote file: 755 kB
assets/pubeval_subnet.png ADDED

Git LFS Details

  • SHA256: eeabe4188dee3770f93458519358bb89c0321af0cdbc335892ad1ae8ca9609ac
  • Pointer size: 131 Bytes
  • Size of remote file: 467 kB
assets/publishapp.png ADDED

Git LFS Details

  • SHA256: af2186dac796b7c3715ba7398015a67f5c947cda07af7fcf40902f65797d8f05
  • Pointer size: 131 Bytes
  • Size of remote file: 128 kB
assets/testusers.png ADDED

Git LFS Details

  • SHA256: bb861503030f57f94a1e0b4ab751abee559196a9e138dde4638849b5c45c5b2e
  • Pointer size: 131 Bytes
  • Size of remote file: 210 kB
assets/unsafemode.png ADDED

Git LFS Details

  • SHA256: 02ed11a510b0869539dacd46f18bab82b3bf86695b4c720c29c322f4797bfc1a
  • Pointer size: 131 Bytes
  • Size of remote file: 856 kB
assets/usertype.png ADDED

Git LFS Details

  • SHA256: 6f84d8da769bb21ef587ed6e27fc704f6da60ee223319c38891bd3e4f572c4ee
  • Pointer size: 131 Bytes
  • Size of remote file: 307 kB
assets/winnetsetting1.png ADDED

Git LFS Details

  • SHA256: 60bb6e2d288e7a658b9be4ced22267c5711dc867665dcd32a8b26ec425056642
  • Pointer size: 131 Bytes
  • Size of remote file: 131 kB
assets/winnetsetting2.png ADDED
assets/winnetsetting3.png ADDED

Git LFS Details

  • SHA256: 05517faec92982d5a9bcf3628530861cb167c4780ccfec4f5c6cf4f1a3c327b8
  • Pointer size: 131 Bytes
  • Size of remote file: 244 kB
assets/winnetsetting4.png ADDED

Git LFS Details

  • SHA256: 533c4409a2e799e0583054de98305bcabce2da833cd857f7b8f38bcbd662cfa0
  • Pointer size: 131 Bytes
  • Size of remote file: 220 kB
desktop_env/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+
desktop_env/actions.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Upper bounds used for the coordinate ranges in the action schema below.
X_MAX = 1920  # TODO: get the screen resolution
Y_MAX = 1080

# Key names accepted by the keyboard actions (PRESS / KEY_DOWN / KEY_UP / HOTKEY).
KEYBOARD_KEYS = [
    '\t', '\n', '\r', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
    ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '{', '|', '}', '~',
    'accept', 'add', 'alt', 'altleft', 'altright', 'apps', 'backspace',
    'browserback', 'browserfavorites', 'browserforward', 'browserhome',
    'browserrefresh', 'browsersearch', 'browserstop', 'capslock', 'clear',
    'convert', 'ctrl', 'ctrlleft', 'ctrlright', 'decimal', 'del', 'delete',
    'divide', 'down', 'end', 'enter', 'esc', 'escape', 'execute',
    'f1', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19',
    'f2', 'f20', 'f21', 'f22', 'f23', 'f24', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9',
    'final', 'fn', 'hanguel', 'hangul', 'hanja', 'help', 'home', 'insert',
    'junja', 'kana', 'kanji', 'launchapp1', 'launchapp2', 'launchmail',
    'launchmediaselect', 'left', 'modechange', 'multiply', 'nexttrack',
    'nonconvert',
    'num0', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6', 'num7', 'num8', 'num9',
    'numlock', 'pagedown', 'pageup', 'pause', 'pgdn', 'pgup', 'playpause',
    'prevtrack', 'print', 'printscreen', 'prntscrn', 'prtsc', 'prtscr',
    'return', 'right', 'scrolllock', 'select', 'separator', 'shift',
    'shiftleft', 'shiftright', 'sleep', 'stop', 'subtract', 'tab', 'up',
    'volumedown', 'volumemute', 'volumeup', 'win', 'winleft', 'winright', 'yen',
    'command', 'option', 'optionleft', 'optionright',
]


def _spec(kind, allowed, optional):
    """Build one parameter description: its type, allowed range and optionality."""
    return {"type": kind, "range": allowed, "optional": optional}


def _coords(optional):
    """Build the x/y screen-coordinate parameter pair bounded by X_MAX / Y_MAX."""
    return {
        "x": _spec(float, [0, X_MAX], optional),
        "y": _spec(float, [0, Y_MAX], optional),
    }


def _button():
    """Build the optional mouse-button parameter."""
    return {"button": _spec(str, ["left", "right", "middle"], True)}


def _key():
    """Build the mandatory single-key parameter restricted to KEYBOARD_KEYS."""
    return {"key": _spec(str, KEYBOARD_KEYS, False)}


def _action(name, note, parameters=None):
    """Build one ACTION_SPACE entry; the "parameters" key is omitted when none are given."""
    entry = {"action_type": name, "note": note}
    if parameters is not None:
        entry["parameters"] = parameters
    return entry


# Schema of every action an agent may emit: mouse actions, keyboard actions,
# then the three parameterless control actions (WAIT / FAIL / DONE).
ACTION_SPACE = [
    _action(
        "MOVE_TO",
        "move the cursor to the specified position",
        _coords(False),
    ),
    _action(
        "CLICK",
        "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
        {
            **_button(),
            **_coords(True),
            "num_clicks": _spec(int, [1, 2, 3], True),
        },
    ),
    _action(
        "MOUSE_DOWN",
        "press the left button if the button not specified, otherwise press the specified button",
        _button(),
    ),
    _action(
        "MOUSE_UP",
        "release the left button if the button not specified, otherwise release the specified button",
        _button(),
    ),
    _action(
        "RIGHT_CLICK",
        "right click at the current position if x and y are not specified, otherwise right click at the specified position",
        _coords(True),
    ),
    _action(
        "DOUBLE_CLICK",
        "double click at the current position if x and y are not specified, otherwise double click at the specified position",
        _coords(True),
    ),
    _action(
        "DRAG_TO",
        "drag the cursor to the specified position with the left button pressed",
        _coords(False),
    ),
    _action(
        "SCROLL",
        "scroll the mouse wheel up or down",
        {
            "dx": _spec(int, None, False),
            "dy": _spec(int, None, False),
        },
    ),
    _action(
        "TYPING",
        "type the specified text",
        {"text": _spec(str, None, False)},
    ),
    _action("PRESS", "press the specified key and release it", _key()),
    _action("KEY_DOWN", "press the specified key", _key()),
    _action("KEY_UP", "release the specified key", _key()),
    _action(
        "HOTKEY",
        "press the specified key combination",
        {"keys": _spec(list, [KEYBOARD_KEYS], False)},
    ),
    ############################################################################################################
    _action("WAIT", "wait until the next action"),
    _action("FAIL", "decide the task can not be performed"),
    _action("DONE", "decide the task is done"),
]
desktop_env/controllers/__init__.py ADDED
File without changes
desktop_env/controllers/python.py ADDED
@@ -0,0 +1,584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import random
4
+ from typing import Any, Dict, Optional
5
+ import time
6
+ import traceback
7
+ import requests
8
+
9
+ from desktop_env.actions import KEYBOARD_KEYS
10
+
11
+ logger = logging.getLogger("desktopenv.pycontroller")
12
+
13
+
14
class PythonController:
    """HTTP client for the control server reachable at ``http://{vm_ip}:{server_port}``.

    Every request method retries up to ``retry_times`` times and sleeps
    ``retry_interval`` seconds after each failed attempt. Failures are logged
    rather than raised; most getters return ``None`` once retries are exhausted.
    """

    def __init__(self, vm_ip: str,
                 server_port: int,
                 pkgs_prefix: str = "import pyautogui; import time; pyautogui.FAILSAFE = False; {command}"):
        """
        :param vm_ip: address of the machine running the control server
        :param server_port: port of the control server
        :param pkgs_prefix: template wrapped around every command passed to
            ``execute_python_command``; must contain a ``{command}`` placeholder
        """
        self.vm_ip = vm_ip
        self.http_server = f"http://{vm_ip}:{server_port}"
        self.pkgs_prefix = pkgs_prefix  # fixme: this is a hacky way to execute python commands. fix it and combine it with installation of packages
        self.retry_times = 3
        self.retry_interval = 5

    @staticmethod
    def _is_valid_image_response(content_type: str, data: Optional[bytes]) -> bool:
        """Quick validation of a screenshot payload.

        Returns True when the bytes start with the PNG or JPEG magic number, or,
        as a weak fallback, when the payload is non-empty and the Content-Type
        header names a PNG/JPEG image. Empty or non-bytes payloads are rejected.
        """
        if not isinstance(data, (bytes, bytearray)) or not data:
            return False
        # PNG magic
        if len(data) >= 8 and data[:8] == b"\x89PNG\r\n\x1a\n":
            return True
        # JPEG magic
        if len(data) >= 3 and data[:3] == b"\xff\xd8\xff":
            return True
        # If server explicitly marks as image, accept as a weak fallback (some environments strip magic)
        if content_type and ("image/png" in content_type or "image/jpeg" in content_type or "image/jpg" in content_type):
            return True
        return False

    def get_screenshot(self) -> Optional[bytes]:
        """
        Gets a screenshot from the server. With the cursor. None -> no screenshot or unexpected error.
        """

        for attempt_idx in range(self.retry_times):
            try:
                response = requests.get(self.http_server + "/screenshot", timeout=10)
                if response.status_code == 200:
                    content_type = response.headers.get("Content-Type", "")
                    content = response.content
                    if self._is_valid_image_response(content_type, content):
                        logger.info("Got screenshot successfully")
                        return content
                    else:
                        logger.error("Invalid screenshot payload (attempt %d/%d).", attempt_idx + 1, self.retry_times)
                        logger.info("Retrying to get screenshot.")
                else:
                    logger.error("Failed to get screenshot. Status code: %d", response.status_code)
                    logger.info("Retrying to get screenshot.")
            except Exception as e:
                logger.error("An error occurred while trying to get the screenshot: %s", e)
                logger.info("Retrying to get screenshot.")
            time.sleep(self.retry_interval)

        logger.error("Failed to get screenshot.")
        return None

    def get_accessibility_tree(self) -> Optional[str]:
        """
        Gets the accessibility tree from the server (the "AT" field of the JSON
        reply). None -> no accessibility tree or unexpected error.
        """

        for _ in range(self.retry_times):
            try:
                response: requests.Response = requests.get(self.http_server + "/accessibility")
                if response.status_code == 200:
                    logger.info("Got accessibility tree successfully")
                    return response.json()["AT"]
                else:
                    logger.error("Failed to get accessibility tree. Status code: %d", response.status_code)
                    logger.info("Retrying to get accessibility tree.")
            except Exception as e:
                logger.error("An error occurred while trying to get the accessibility tree: %s", e)
                logger.info("Retrying to get accessibility tree.")
            time.sleep(self.retry_interval)

        logger.error("Failed to get accessibility tree.")
        return None

    def get_terminal_output(self) -> Optional[str]:
        """
        Gets the terminal output from the server (the "output" field of the JSON
        reply). None -> no terminal output or unexpected error.
        """

        for _ in range(self.retry_times):
            try:
                response = requests.get(self.http_server + "/terminal")
                if response.status_code == 200:
                    logger.info("Got terminal output successfully")
                    return response.json()["output"]
                else:
                    logger.error("Failed to get terminal output. Status code: %d", response.status_code)
                    logger.info("Retrying to get terminal output.")
            except Exception as e:
                logger.error("An error occurred while trying to get the terminal output: %s", e)
                logger.info("Retrying to get terminal output.")
            time.sleep(self.retry_interval)

        logger.error("Failed to get terminal output.")
        return None

    def get_file(self, file_path: str) -> Optional[bytes]:
        """
        Gets a file from the server.

        :param file_path: path of the file, sent to the server as a form field
        :return: raw file content, or None if every attempt failed
        """

        for _ in range(self.retry_times):
            try:
                response = requests.post(self.http_server + "/file", data={"file_path": file_path})
                if response.status_code == 200:
                    logger.info("File downloaded successfully")
                    return response.content
                else:
                    logger.error("Failed to get file. Status code: %d", response.status_code)
                    logger.info("Retrying to get file.")
            except Exception as e:
                logger.error("An error occurred while trying to get the file: %s", e)
                logger.info("Retrying to get file.")
            time.sleep(self.retry_interval)

        logger.error("Failed to get file.")
        return None

    def execute_python_command(self, command: str) -> Optional[Dict[str, Any]]:
        """
        Executes a python command on the server.
        It can be used to execute the pyautogui commands, or... any other python command. who knows?

        The command is substituted into ``pkgs_prefix`` and run as ``python -c``.

        :return: the parsed JSON reply on HTTP 200 (presumably a dict; callers in
            this class read its "output" key), or None when the request timed
            out or every attempt failed
        """
        command_list = ["python", "-c", self.pkgs_prefix.format(command=command)]
        payload = json.dumps({"command": command_list, "shell": False})

        for _ in range(self.retry_times):
            try:
                response = requests.post(self.http_server + "/execute", headers={'Content-Type': 'application/json'},
                                         data=payload, timeout=90)
                if response.status_code == 200:
                    logger.info("Command executed successfully: %s", response.text)
                    return response.json()
                else:
                    logger.error("Failed to execute command. Status code: %d", response.status_code)
                    logger.info("Retrying to execute command.")
            except requests.exceptions.ReadTimeout:
                # A read timeout is not retried.
                break
            except Exception as e:
                logger.error("An error occurred while trying to execute the command: %s", e)
                logger.info("Retrying to execute command.")
            time.sleep(self.retry_interval)

        logger.error("Failed to execute command.")
        return None

    def run_python_script(self, script: str) -> Optional[Dict[str, Any]]:
        """
        Executes a python script on the server.

        :return: the parsed JSON reply on HTTP 200; otherwise a dict with
            "status" == "error" describing the failure
        """
        payload = json.dumps({"code": script})

        for _ in range(self.retry_times):
            try:
                response = requests.post(self.http_server + "/run_python", headers={'Content-Type': 'application/json'},
                                         data=payload, timeout=90)
                if response.status_code == 200:
                    return response.json()
                else:
                    return {"status": "error", "message": "Failed to execute command.", "output": None, "error": response.json()["error"]}
            except requests.exceptions.ReadTimeout:
                # A read timeout is not retried.
                break
            except Exception:
                logger.error("An error occurred while trying to execute the command: %s", traceback.format_exc())
                logger.info("Retrying to execute command.")
            time.sleep(self.retry_interval)

        logger.error("Failed to execute command.")
        return {"status": "error", "message": "Failed to execute command.", "output": "", "error": "Retry limit reached."}

    def run_bash_script(self, script: str, timeout: int = 30, working_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
        """
        Executes a bash script on the server.

        :param script: The bash script content (can be multi-line)
        :param timeout: Execution timeout in seconds (default: 30)
        :param working_dir: Working directory for script execution (optional)
        :return: Dictionary with status, output, error, and returncode; an
            error dictionary is returned on timeout or when retries run out
        """
        payload = json.dumps({
            "script": script,
            "timeout": timeout,
            "working_dir": working_dir
        })

        for _ in range(self.retry_times):
            try:
                response = requests.post(
                    self.http_server + "/run_bash_script",
                    headers={'Content-Type': 'application/json'},
                    data=payload,
                    timeout=timeout + 100  # Add buffer to HTTP timeout
                )
                if response.status_code == 200:
                    result = response.json()
                    logger.info("Bash script executed successfully with return code: %d", result.get("returncode", -1))
                    return result
                else:
                    logger.error("Failed to execute bash script. Status code: %d, response: %s",
                                 response.status_code, response.text)
                    logger.info("Retrying to execute bash script.")
            except requests.exceptions.ReadTimeout:
                logger.error("Bash script execution timed out")
                return {
                    "status": "error",
                    "output": "",
                    "error": f"Script execution timed out after {timeout} seconds",
                    "returncode": -1
                }
            except Exception as e:
                logger.error("An error occurred while trying to execute the bash script: %s", e)
                logger.info("Retrying to execute bash script.")
            time.sleep(self.retry_interval)

        logger.error("Failed to execute bash script after %d retries.", self.retry_times)
        return {
            "status": "error",
            "output": "",
            "error": f"Failed to execute bash script after {self.retry_times} retries",
            "returncode": -1
        }

    def execute_action(self, action) -> None:
        """
        Executes an action on the server computer.

        :param action: either one of the strings 'WAIT' / 'FAIL' / 'DONE' (no-op),
            or a dict with an "action_type" key. Parameters are read from
            action["parameters"] when present, otherwise from every other key
            of the dict. Missing or None parameters are treated as empty.
        :raises Exception: for an unknown action type, an unsupported parameter
            combination, or a key that is not in KEYBOARD_KEYS
        """
        # Handle string actions
        if action in ['WAIT', 'FAIL', 'DONE']:
            return

        # Handle dictionary actions
        if isinstance(action, dict) and action.get('action_type') in ['WAIT', 'FAIL', 'DONE']:
            return

        action_type = action["action_type"]
        parameters = action["parameters"] if "parameters" in action else {param: action[param] for param in action if param != 'action_type'}
        if parameters is None:
            # An explicit "parameters": None means "no parameters".
            parameters = {}
        move_mode = random.choice(
            ["pyautogui.easeInQuad", "pyautogui.easeOutQuad", "pyautogui.easeInOutQuad", "pyautogui.easeInBounce",
             "pyautogui.easeInElastic"])
        duration = random.uniform(0.5, 1)

        if action_type == "MOVE_TO":
            if not parameters:
                self.execute_python_command("pyautogui.moveTo()")
            elif "x" in parameters and "y" in parameters:
                x = parameters["x"]
                y = parameters["y"]
                self.execute_python_command(f"pyautogui.moveTo({x}, {y}, {duration}, {move_mode})")
            else:
                raise Exception(f"Unknown parameters: {parameters}")

        elif action_type == "CLICK":
            has_x = "x" in parameters
            has_y = "y" in parameters
            # Coordinates must be given as a pair or not at all.
            if has_x != has_y:
                raise Exception(f"Unknown parameters: {parameters}")
            call_args = []
            if "button" in parameters:
                button = parameters["button"]
                call_args.append(f"button={button!r}")
            if has_x:
                x = parameters["x"]
                y = parameters["y"]
                call_args.append(f"x={x}")
                call_args.append(f"y={y}")
            if "num_clicks" in parameters:
                num_clicks = parameters["num_clicks"]
                call_args.append(f"clicks={num_clicks}")
            # Non-empty parameters containing none of the known keys are rejected.
            if parameters and not call_args:
                raise Exception(f"Unknown parameters: {parameters}")
            self.execute_python_command(f"pyautogui.click({', '.join(call_args)})")

        elif action_type == "MOUSE_DOWN":
            if not parameters:
                self.execute_python_command("pyautogui.mouseDown()")
            elif "button" in parameters:
                button = parameters["button"]
                self.execute_python_command(f"pyautogui.mouseDown(button={button!r})")
            else:
                raise Exception(f"Unknown parameters: {parameters}")

        elif action_type == "MOUSE_UP":
            if not parameters:
                self.execute_python_command("pyautogui.mouseUp()")
            elif "button" in parameters:
                button = parameters["button"]
                self.execute_python_command(f"pyautogui.mouseUp(button={button!r})")
            else:
                raise Exception(f"Unknown parameters: {parameters}")

        elif action_type == "RIGHT_CLICK":
            if not parameters:
                self.execute_python_command("pyautogui.rightClick()")
            elif "x" in parameters and "y" in parameters:
                x = parameters["x"]
                y = parameters["y"]
                self.execute_python_command(f"pyautogui.rightClick(x={x}, y={y})")
            else:
                raise Exception(f"Unknown parameters: {parameters}")

        elif action_type == "DOUBLE_CLICK":
            if not parameters:
                self.execute_python_command("pyautogui.doubleClick()")
            elif "x" in parameters and "y" in parameters:
                x = parameters["x"]
                y = parameters["y"]
                self.execute_python_command(f"pyautogui.doubleClick(x={x}, y={y})")
            else:
                raise Exception(f"Unknown parameters: {parameters}")

        elif action_type == "DRAG_TO":
            # NOTE: unlike the other branches, a DRAG_TO without both x and y is
            # silently ignored; kept as-is so existing callers are unaffected.
            if "x" in parameters and "y" in parameters:
                x = parameters["x"]
                y = parameters["y"]
                self.execute_python_command(
                    f"pyautogui.dragTo({x}, {y}, duration=1.0, button='left', mouseDownUp=True)")

        elif action_type == "SCROLL":
            # todo: check if it is related to the operating system, as https://github.com/TheDuckAI/DuckTrack/blob/main/ducktrack/playback.py pointed out
            if "dx" in parameters and "dy" in parameters:
                dx = parameters["dx"]
                dy = parameters["dy"]
                self.execute_python_command(f"pyautogui.hscroll({dx})")
                self.execute_python_command(f"pyautogui.vscroll({dy})")
            elif "dx" in parameters and "dy" not in parameters:
                dx = parameters["dx"]
                self.execute_python_command(f"pyautogui.hscroll({dx})")
            elif "dx" not in parameters and "dy" in parameters:
                dy = parameters["dy"]
                self.execute_python_command(f"pyautogui.vscroll({dy})")
            else:
                raise Exception(f"Unknown parameters: {parameters}")

        elif action_type == "TYPING":
            if "text" not in parameters:
                raise Exception(f"Unknown parameters: {parameters}")
            # repr() yields a valid Python literal, so quotes and backslashes
            # in the text survive the trip through `python -c`.
            text = parameters["text"]
            self.execute_python_command("pyautogui.typewrite({:})".format(repr(text)))

        elif action_type == "PRESS":
            if "key" not in parameters:
                raise Exception(f"Unknown parameters: {parameters}")
            key = parameters["key"]
            if key.lower() not in KEYBOARD_KEYS:
                raise Exception(f"Key must be one of {KEYBOARD_KEYS}")
            # {key!r} rather than '{key}': KEYBOARD_KEYS contains "'" and "\\".
            self.execute_python_command(f"pyautogui.press({key!r})")

        elif action_type == "KEY_DOWN":
            if "key" not in parameters:
                raise Exception(f"Unknown parameters: {parameters}")
            key = parameters["key"]
            if key.lower() not in KEYBOARD_KEYS:
                raise Exception(f"Key must be one of {KEYBOARD_KEYS}")
            self.execute_python_command(f"pyautogui.keyDown({key!r})")

        elif action_type == "KEY_UP":
            if "key" not in parameters:
                raise Exception(f"Unknown parameters: {parameters}")
            key = parameters["key"]
            if key.lower() not in KEYBOARD_KEYS:
                raise Exception(f"Key must be one of {KEYBOARD_KEYS}")
            self.execute_python_command(f"pyautogui.keyUp({key!r})")

        elif action_type == "HOTKEY":
            if "keys" not in parameters:
                raise Exception(f"Unknown parameters: {parameters}")
            keys = parameters["keys"]
            if not isinstance(keys, list):
                raise Exception("Keys must be a list of keys")
            for key in keys:
                if key.lower() not in KEYBOARD_KEYS:
                    raise Exception(f"Key must be one of {KEYBOARD_KEYS}")

            keys_para_rep = ", ".join(repr(key) for key in keys)
            self.execute_python_command(f"pyautogui.hotkey({keys_para_rep})")

        elif action_type in ['WAIT', 'FAIL', 'DONE']:
            pass

        else:
            raise Exception(f"Unknown action type: {action_type}")

    # Record video
    def start_recording(self) -> None:
        """
        Starts recording the screen. Failures are only logged.
        """

        for _ in range(self.retry_times):
            try:
                response = requests.post(self.http_server + "/start_recording")
                if response.status_code == 200:
                    logger.info("Recording started successfully")
                    return
                else:
                    logger.error("Failed to start recording. Status code: %d", response.status_code)
                    logger.info("Retrying to start recording.")
            except Exception as e:
                logger.error("An error occurred while trying to start recording: %s", e)
                logger.info("Retrying to start recording.")
            time.sleep(self.retry_interval)

        logger.error("Failed to start recording.")

    def end_recording(self, dest: str) -> None:
        """
        Ends recording the screen and writes the response body to ``dest``.
        Failures are only logged.

        :param dest: local path the recording is written to
        """

        for _ in range(self.retry_times):
            try:
                response = requests.post(self.http_server + "/end_recording")
                if response.status_code == 200:
                    logger.info("Recording stopped successfully")
                    with open(dest, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)
                    return
                else:
                    logger.error("Failed to stop recording. Status code: %d", response.status_code)
                    logger.info("Retrying to stop recording.")
            except Exception as e:
                logger.error("An error occurred while trying to stop recording: %s", e)
                logger.info("Retrying to stop recording.")
            time.sleep(self.retry_interval)

        logger.error("Failed to stop recording.")

    # Additional info
    def get_vm_platform(self):
        """
        Gets the platform of the vm, i.e. what ``platform.system()`` prints there.

        :raises TypeError: if the command could not be executed
            (``execute_python_command`` returned None)
        """
        return self.execute_python_command("import platform; print(platform.system())")['output'].strip()

    def get_vm_machine(self):
        """
        Gets the machine of the vm, i.e. what ``platform.machine()`` prints there.

        :raises TypeError: if the command could not be executed
            (``execute_python_command`` returned None)
        """
        return self.execute_python_command("import platform; print(platform.machine())")['output'].strip()

    def get_vm_screen_size(self):
        """
        Gets the size of the vm screen.

        :return: the parsed JSON reply, or None if every attempt failed
        """

        for _ in range(self.retry_times):
            try:
                response = requests.post(self.http_server + "/screen_size")
                if response.status_code == 200:
                    logger.info("Got screen size successfully")
                    return response.json()
                else:
                    logger.error("Failed to get screen size. Status code: %d", response.status_code)
                    logger.info("Retrying to get screen size.")
            except Exception as e:
                logger.error("An error occurred while trying to get the screen size: %s", e)
                logger.info("Retrying to get screen size.")
            time.sleep(self.retry_interval)

        logger.error("Failed to get screen size.")
        return None

    def get_vm_window_size(self, app_class_name: str):
        """
        Gets the size of the vm app window.

        :param app_class_name: sent to the server as the "app_class_name" form field
        :return: the parsed JSON reply, or None if every attempt failed
        """

        for _ in range(self.retry_times):
            try:
                response = requests.post(self.http_server + "/window_size", data={"app_class_name": app_class_name})
                if response.status_code == 200:
                    logger.info("Got window size successfully")
                    return response.json()
                else:
                    logger.error("Failed to get window size. Status code: %d", response.status_code)
                    logger.info("Retrying to get window size.")
            except Exception as e:
                logger.error("An error occurred while trying to get the window size: %s", e)
                logger.info("Retrying to get window size.")
            time.sleep(self.retry_interval)

        logger.error("Failed to get window size.")
        return None

    def get_vm_wallpaper(self):
        """
        Gets the wallpaper of the vm.

        :return: the raw response body, or None if every attempt failed
        """

        for _ in range(self.retry_times):
            try:
                response = requests.post(self.http_server + "/wallpaper")
                if response.status_code == 200:
                    logger.info("Got wallpaper successfully")
                    return response.content
                else:
                    logger.error("Failed to get wallpaper. Status code: %d", response.status_code)
                    logger.info("Retrying to get wallpaper.")
            except Exception as e:
                logger.error("An error occurred while trying to get the wallpaper: %s", e)
                logger.info("Retrying to get wallpaper.")
            time.sleep(self.retry_interval)

        logger.error("Failed to get wallpaper.")
        return None

    def get_vm_desktop_path(self) -> Optional[str]:
        """
        Gets the desktop path of the vm (the "desktop_path" field of the JSON
        reply). None if every attempt failed.
        """

        for _ in range(self.retry_times):
            try:
                response = requests.post(self.http_server + "/desktop_path")
                if response.status_code == 200:
                    logger.info("Got desktop path successfully")
                    return response.json()["desktop_path"]
                else:
                    logger.error("Failed to get desktop path. Status code: %d", response.status_code)
                    logger.info("Retrying to get desktop path.")
            except Exception as e:
                logger.error("An error occurred while trying to get the desktop path: %s", e)
                logger.info("Retrying to get desktop path.")
            time.sleep(self.retry_interval)

        logger.error("Failed to get desktop path.")
        return None

    def get_vm_directory_tree(self, path) -> Optional[Dict[str, Any]]:
        """
        Gets the directory tree of the vm (the "directory_tree" field of the
        JSON reply). None if every attempt failed.

        :param path: sent to the server as the "path" field of the JSON body
        """
        payload = json.dumps({"path": path})

        for _ in range(self.retry_times):
            try:
                response = requests.post(self.http_server + "/list_directory", headers={'Content-Type': 'application/json'}, data=payload)
                if response.status_code == 200:
                    logger.info("Got directory tree successfully")
                    return response.json()["directory_tree"]
                else:
                    logger.error("Failed to get directory tree. Status code: %d", response.status_code)
                    logger.info("Retrying to get directory tree.")
            except Exception as e:
                logger.error("An error occurred while trying to get directory tree: %s", e)
                logger.info("Retrying to get directory tree.")
            time.sleep(self.retry_interval)

        logger.error("Failed to get directory tree.")
        return None
desktop_env/controllers/setup.py ADDED
@@ -0,0 +1,920 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ import os.path
5
+ import platform
6
+ import shutil
7
+ import sqlite3
8
+ import tempfile
9
+ import time
10
+ import traceback
11
+ import uuid
12
+ from datetime import datetime, timedelta
13
+ from typing import Any, Union, Optional
14
+ from typing import Dict, List
15
+
16
+ import requests
17
+ from playwright.sync_api import sync_playwright, TimeoutError
18
+ from pydrive.auth import GoogleAuth
19
+ from pydrive.drive import GoogleDrive, GoogleDriveFile, GoogleDriveFileList
20
+ from requests_toolbelt.multipart.encoder import MultipartEncoder
21
+
22
+ from desktop_env.controllers.python import PythonController
23
+ from desktop_env.evaluators.metrics.utils import compare_urls
24
+ from desktop_env.providers.aws.proxy_pool import get_global_proxy_pool, init_proxy_pool, ProxyInfo
25
+
26
import dotenv
# Load environment variables from a .env file (if present) before any os.getenv() lookup below.
dotenv.load_dotenv()


# Path of the proxy configuration file; overridable through the PROXY_CONFIG_FILE environment variable.
PROXY_CONFIG_FILE = os.getenv("PROXY_CONFIG_FILE", "evaluation_examples/settings/proxy/dataimpulse.json")  # Default proxy config file

# Module-level logger shared by every setup step in this file.
logger = logging.getLogger("desktopenv.setup")

# Directory containing this source file.
FILE_PATH = os.path.dirname(os.path.abspath(__file__))

# NOTE: this runs at import time, so importing the module initializes the global proxy pool.
init_proxy_pool(PROXY_CONFIG_FILE)  # initialize the global proxy pool

# Maximum number of connection probes (5 seconds apart) before a setup routine gives up on the VM server.
MAX_RETRIES = 20
40
+
41
+ class SetupController:
42
+ def __init__(self, vm_ip: str, server_port: int = 5000, chromium_port: int = 9222, vlc_port: int = 8080, cache_dir: str = "cache", client_password: str = "", screen_width: int = 1920, screen_height: int = 1080):
43
+ self.vm_ip: str = vm_ip
44
+ self.server_port: int = server_port
45
+ self.chromium_port: int = chromium_port
46
+ self.vlc_port: int = vlc_port
47
+ self.http_server: str = f"http://{vm_ip}:{server_port}"
48
+ self.http_server_setup_root: str = f"http://{vm_ip}:{server_port}/setup"
49
+ self.cache_dir: str = cache_dir
50
+ self.use_proxy: bool = False
51
+ self.client_password: str = client_password
52
+ self.screen_width: int = screen_width
53
+ self.screen_height: int = screen_height
54
+
55
+ def reset_cache_dir(self, cache_dir: str):
56
+ self.cache_dir = cache_dir
57
+
58
def setup(self, config: List[Dict[str, Any]], use_proxy: bool = False) -> bool:
    """Run a list of setup steps against the VM.

    Args:
        config (List[Dict[str, Any]]): list of dict like {str: Any}. each
          config dict has the structure like
            {
                "type": str, corresponding to the `_{:}_setup` methods of
                  this class
                "parameters": dict like {str, Any} providing the keyword
                  parameters
            }
        use_proxy (bool): stored on the instance; read by `_launch_setup`.

    Returns:
        bool: True when every step ran, False when the VM server could not
          be reached within MAX_RETRIES probes.

    Raises:
        AssertionError: a step names a `type` with no matching method.
        Exception: a step raised; the original error is chained as the cause.
    """
    self.use_proxy = use_proxy
    # make sure connection can be established
    logger.info(f"try to connect {self.http_server}")
    retry = 0
    while retry < MAX_RETRIES:
        try:
            _ = requests.get(self.http_server + "/terminal")
            break
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C / SystemExit
            # can still interrupt the wait loop.
            time.sleep(5)
            retry += 1
            logger.info(f"retry: {retry}/{MAX_RETRIES}")

    if retry == MAX_RETRIES:
        logger.error(f"Could not connect to {self.http_server} after {MAX_RETRIES} retries")
        return False

    for i, cfg in enumerate(config):
        config_type: str = cfg["type"]
        parameters: Dict[str, Any] = cfg["parameters"]

        # Assumes all the setup the functions should follow this name
        # protocol
        setup_function: str = "_{:}_setup".format(config_type)
        if not hasattr(self, setup_function):
            # Raised explicitly (same exception type as before) so the check
            # is not stripped when Python runs with -O.
            raise AssertionError(f'Setup controller cannot find init function {setup_function}')

        try:
            logger.info(f"Executing setup step {i+1}/{len(config)}: {setup_function}")
            logger.debug(f"Setup parameters: {parameters}")
            getattr(self, setup_function)(**parameters)
            logger.info(f"SETUP COMPLETED: {setup_function}({str(parameters)})")
        except Exception as e:
            logger.error(f"SETUP FAILED at step {i+1}/{len(config)}: {setup_function}({str(parameters)})")
            logger.error(f"Error details: {e}")
            logger.error(f"Traceback: {traceback.format_exc()}")
            raise Exception(f"Setup step {i+1} failed: {setup_function} - {e}") from e

    return True
108
+
109
+ def _download_setup(self, files: List[Dict[str, str]]):
110
+ """
111
+ Args:
112
+ files (List[Dict[str, str]]): files to download. lisf of dict like
113
+ {
114
+ "url": str, the url to download
115
+ "path": str, the path on the VM to store the downloaded file
116
+ }
117
+ """
118
+ for f in files:
119
+ url: str = f["url"]
120
+ path: str = f["path"]
121
+ cache_path: str = os.path.join(self.cache_dir, "{:}_{:}".format(
122
+ uuid.uuid5(uuid.NAMESPACE_URL, url),
123
+ os.path.basename(path)))
124
+ if not url or not path:
125
+ raise Exception(f"Setup Download - Invalid URL ({url}) or path ({path}).")
126
+
127
+ if not os.path.exists(cache_path):
128
+ logger.info(f"Cache file not found, downloading from {url} to {cache_path}")
129
+ max_retries = 3
130
+ downloaded = False
131
+ e = None
132
+ for i in range(max_retries):
133
+ try:
134
+ logger.info(f"Download attempt {i+1}/{max_retries} for {url}")
135
+ response = requests.get(url, stream=True, timeout=300) # Add 5 minute timeout
136
+ response.raise_for_status()
137
+
138
+ # Get file size if available
139
+ total_size = int(response.headers.get('content-length', 0))
140
+ if total_size > 0:
141
+ logger.info(f"File size: {total_size / (1024*1024):.2f} MB")
142
+
143
+ downloaded_size = 0
144
+ with open(cache_path, 'wb') as f:
145
+ for chunk in response.iter_content(chunk_size=8192):
146
+ if chunk:
147
+ f.write(chunk)
148
+ downloaded_size += len(chunk)
149
+ if total_size > 0 and downloaded_size % (1024*1024) == 0: # Log every MB
150
+ progress = (downloaded_size / total_size) * 100
151
+ logger.info(f"Download progress: {progress:.1f}%")
152
+
153
+ logger.info(f"File downloaded successfully to {cache_path} ({downloaded_size / (1024*1024):.2f} MB)")
154
+ downloaded = True
155
+ break
156
+
157
+ except requests.RequestException as e:
158
+ logger.error(
159
+ f"Failed to download {url} caused by {e}. Retrying... ({max_retries - i - 1} attempts left)")
160
+ # Clean up partial download
161
+ if os.path.exists(cache_path):
162
+ os.remove(cache_path)
163
+ if not downloaded:
164
+ raise requests.RequestException(f"Failed to download {url}. No retries left.")
165
+
166
+ form = MultipartEncoder({
167
+ "file_path": path,
168
+ "file_data": (os.path.basename(path), open(cache_path, "rb"))
169
+ })
170
+ headers = {"Content-Type": form.content_type}
171
+ logger.debug(form.content_type)
172
+
173
+ # send request to server to upload file
174
+ try:
175
+ logger.info(f"Uploading {os.path.basename(path)} to VM at {path}")
176
+ logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload")
177
+ response = requests.post(self.http_server + "/setup" + "/upload", headers=headers, data=form, timeout=600) # 10 minute timeout for upload
178
+ if response.status_code == 200:
179
+ logger.info(f"File uploaded successfully: {path}")
180
+ logger.debug("Upload response: %s", response.text)
181
+ else:
182
+ logger.error(f"Failed to upload file {path}. Status code: {response.status_code}, Response: {response.text}")
183
+ raise requests.RequestException(f"Upload failed with status {response.status_code}")
184
+ except requests.exceptions.RequestException as e:
185
+ logger.error(f"An error occurred while trying to upload {path}: {e}")
186
+ raise
187
+
188
+ def _upload_file_setup(self, files: List[Dict[str, str]]):
189
+ """
190
+ Args:
191
+ files (List[Dict[str, str]]): files to download. lisf of dict like
192
+ {
193
+ "local_path": str, the local path to the file to upload
194
+ "path": str, the path on the VM to store the downloaded file
195
+ }
196
+ """
197
+ for f in files:
198
+ local_path: str = f["local_path"]
199
+ path: str = f["path"]
200
+
201
+ if not os.path.exists(local_path):
202
+ raise Exception(f"Setup Upload - Invalid local path ({local_path}).")
203
+
204
+ file_size = None
205
+ try:
206
+ file_size = os.path.getsize(local_path)
207
+ except Exception:
208
+ pass
209
+
210
+ max_retries = 3
211
+ last_error: Optional[Exception] = None
212
+
213
+ for attempt in range(max_retries):
214
+ try:
215
+ logger.info(
216
+ f"Uploading {os.path.basename(local_path)}{f' ({file_size} bytes)' if file_size is not None else ''} "
217
+ f"to VM at {path} (attempt {attempt + 1}/{max_retries})"
218
+ )
219
+ logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload")
220
+
221
+ # Open the file inside each attempt to ensure fresh stream position
222
+ with open(local_path, "rb") as fp:
223
+ form = MultipartEncoder({
224
+ "file_path": path,
225
+ "file_data": (os.path.basename(path), fp)
226
+ })
227
+ headers = {"Content-Type": form.content_type}
228
+ logger.debug(form.content_type)
229
+
230
+ # Explicit connect/read timeout to avoid hanging forever
231
+ response = requests.post(
232
+ self.http_server + "/setup" + "/upload",
233
+ headers=headers,
234
+ data=form,
235
+ timeout=(10, 600)
236
+ )
237
+
238
+ if response.status_code == 200:
239
+ logger.info(f"File uploaded successfully: {path}")
240
+ logger.debug("Upload response: %s", response.text)
241
+ last_error = None
242
+ break
243
+ else:
244
+ msg = f"Failed to upload file {path}. Status code: {response.status_code}, Response: {response.text}"
245
+ logger.error(msg)
246
+ last_error = requests.RequestException(msg)
247
+
248
+ except requests.exceptions.RequestException as e:
249
+ last_error = e
250
+ logger.error(f"Upload attempt {attempt + 1} failed for {path}: {e}")
251
+
252
+ # Exponential backoff between retries
253
+ if attempt < max_retries - 1:
254
+ time.sleep(2 ** attempt)
255
+
256
+ if last_error is not None:
257
+ raise last_error
258
+
259
+ def _change_wallpaper_setup(self, path: str):
260
+ if not path:
261
+ raise Exception(f"Setup Wallpaper - Invalid path ({path}).")
262
+
263
+ payload = json.dumps({"path": path})
264
+ headers = {
265
+ 'Content-Type': 'application/json'
266
+ }
267
+
268
+ # send request to server to change wallpaper
269
+ try:
270
+ response = requests.post(self.http_server + "/setup" + "/change_wallpaper", headers=headers, data=payload)
271
+ if response.status_code == 200:
272
+ logger.info("Command executed successfully: %s", response.text)
273
+ else:
274
+ logger.error("Failed to change wallpaper. Status code: %s", response.text)
275
+ except requests.exceptions.RequestException as e:
276
+ logger.error("An error occurred while trying to send the request: %s", e)
277
+
278
+ def _tidy_desktop_setup(self, **config):
279
+ raise NotImplementedError()
280
+
281
+ def _open_setup(self, path: str):
282
+ if not path:
283
+ raise Exception(f"Setup Open - Invalid path ({path}).")
284
+
285
+ payload = json.dumps({"path": path})
286
+ headers = {
287
+ 'Content-Type': 'application/json'
288
+ }
289
+
290
+ # send request to server to open file
291
+ try:
292
+ # The server-side call is now blocking and can take time.
293
+ # We set a timeout that is slightly longer than the server's timeout (1800s).
294
+ response = requests.post(self.http_server + "/setup" + "/open_file", headers=headers, data=payload, timeout=1810)
295
+ response.raise_for_status() # This will raise an exception for 4xx and 5xx status codes
296
+ logger.info("Command executed successfully: %s", response.text)
297
+ except requests.exceptions.RequestException as e:
298
+ logger.error(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}")
299
+ raise Exception(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}") from e
300
+
301
+ def _launch_setup(self, command: Union[str, List[str]], shell: bool = False):
302
+ if not command:
303
+ raise Exception("Empty command to launch.")
304
+
305
+ if not shell and isinstance(command, str) and len(command.split()) > 1:
306
+ logger.warning("Command should be a list of strings. Now it is a string. Will split it by space.")
307
+ command = command.split()
308
+
309
+ if command[0] == "google-chrome" and self.use_proxy:
310
+ command.append("--proxy-server=http://127.0.0.1:18888") # Use the proxy server set up by _proxy_setup
311
+
312
+ payload = json.dumps({"command": command, "shell": shell})
313
+ headers = {"Content-Type": "application/json"}
314
+
315
+ try:
316
+ logger.info("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/launch")
317
+ response = requests.post(self.http_server + "/setup" + "/launch", headers=headers, data=payload)
318
+ if response.status_code == 200:
319
+ logger.info("Command executed successfully: %s", response.text)
320
+ else:
321
+ logger.error("Failed to launch application. Status code: %s", response.text)
322
+ except requests.exceptions.RequestException as e:
323
+ logger.error("An error occurred while trying to send the request: %s", e)
324
+
325
+ def _execute_setup(
326
+ self,
327
+ command: List[str],
328
+ stdout: str = "",
329
+ stderr: str = "",
330
+ shell: bool = False,
331
+ until: Optional[Dict[str, Any]] = None
332
+ ):
333
+ if not command:
334
+ raise Exception("Empty command to launch.")
335
+
336
+ until: Dict[str, Any] = until or {}
337
+ terminates: bool = False
338
+ nb_failings = 0
339
+
340
+ def replace_screen_env_in_command(command):
341
+ password = self.client_password
342
+ width = self.screen_width
343
+ height = self.screen_height
344
+ width_half = str(width // 2)
345
+ height_half = str(height // 2)
346
+ new_command_list = []
347
+ new_command = ""
348
+ if isinstance(command, str):
349
+ new_command = command.replace("{CLIENT_PASSWORD}", password)
350
+ new_command = new_command.replace("{SCREEN_WIDTH_HALF}", width_half)
351
+ new_command = new_command.replace("{SCREEN_HEIGHT_HALF}", height_half)
352
+ new_command = new_command.replace("{SCREEN_WIDTH}", str(width))
353
+ new_command = new_command.replace("{SCREEN_HEIGHT}", str(height))
354
+ return new_command
355
+ else:
356
+ for item in command:
357
+ item = item.replace("{CLIENT_PASSWORD}", password)
358
+ item = item.replace("{SCREEN_WIDTH_HALF}", width_half)
359
+ item = item.replace("{SCREEN_HEIGHT_HALF}", height_half)
360
+ item = item.replace("{SCREEN_WIDTH}", str(width))
361
+ item = item.replace("{SCREEN_HEIGHT}", str(height))
362
+ new_command_list.append(item)
363
+ return new_command_list
364
+ command = replace_screen_env_in_command(command)
365
+ payload = json.dumps({"command": command, "shell": shell})
366
+ headers = {"Content-Type": "application/json"}
367
+
368
+ while not terminates:
369
+ try:
370
+ response = requests.post(self.http_server + "/setup" + "/execute", headers=headers, data=payload)
371
+ if response.status_code == 200:
372
+ results: Dict[str, str] = response.json()
373
+ if stdout:
374
+ with open(os.path.join(self.cache_dir, stdout), "w") as f:
375
+ f.write(results["output"])
376
+ if stderr:
377
+ with open(os.path.join(self.cache_dir, stderr), "w") as f:
378
+ f.write(results["error"])
379
+ logger.info("Command executed successfully: %s -> %s"
380
+ , " ".join(command) if isinstance(command, list) else command
381
+ , response.text
382
+ )
383
+ else:
384
+ logger.error("Failed to launch application. Status code: %s", response.text)
385
+ results = None
386
+ nb_failings += 1
387
+ except requests.exceptions.RequestException as e:
388
+ logger.error("An error occurred while trying to send the request: %s", e)
389
+ traceback.print_exc()
390
+
391
+ results = None
392
+ nb_failings += 1
393
+
394
+ if len(until) == 0:
395
+ terminates = True
396
+ elif results is not None:
397
+ terminates = "returncode" in until and results["returncode"] == until["returncode"] \
398
+ or "stdout" in until and until["stdout"] in results["output"] \
399
+ or "stderr" in until and until["stderr"] in results["error"]
400
+ terminates = terminates or nb_failings >= 5
401
+ if not terminates:
402
+ time.sleep(0.3)
403
+
404
+ def _execute_with_verification_setup(
405
+ self,
406
+ command: List[str],
407
+ verification: Dict[str, Any] = None,
408
+ max_wait_time: int = 10,
409
+ check_interval: float = 1.0,
410
+ shell: bool = False
411
+ ):
412
+ """Execute command with verification of results
413
+
414
+ Args:
415
+ command: Command to execute
416
+ verification: Dict with verification criteria:
417
+ - window_exists: Check if window with this name exists
418
+ - command_success: Execute this command and check if it succeeds
419
+ max_wait_time: Maximum time to wait for verification
420
+ check_interval: Time between verification checks
421
+ shell: Whether to use shell
422
+ """
423
+ if not command:
424
+ raise Exception("Empty command to launch.")
425
+
426
+ verification = verification or {}
427
+
428
+ payload = json.dumps({
429
+ "command": command,
430
+ "shell": shell,
431
+ "verification": verification,
432
+ "max_wait_time": max_wait_time,
433
+ "check_interval": check_interval
434
+ })
435
+ headers = {"Content-Type": "application/json"}
436
+
437
+ try:
438
+ response = requests.post(self.http_server + "/setup" + "/execute_with_verification",
439
+ headers=headers, data=payload, timeout=max_wait_time + 10)
440
+ if response.status_code == 200:
441
+ result = response.json()
442
+ logger.info("Command executed and verified successfully: %s -> %s"
443
+ , " ".join(command) if isinstance(command, list) else command
444
+ , response.text
445
+ )
446
+ return result
447
+ else:
448
+ logger.error("Failed to execute with verification. Status code: %s", response.text)
449
+ raise Exception(f"Command verification failed: {response.text}")
450
+ except requests.exceptions.RequestException as e:
451
+ logger.error("An error occurred while trying to send the request: %s", e)
452
+ traceback.print_exc()
453
+ raise Exception(f"Request failed: {e}")
454
+
455
+ def _command_setup(self, command: List[str], **kwargs):
456
+ self._execute_setup(command, **kwargs)
457
+
458
+ def _sleep_setup(self, seconds: float):
459
+ time.sleep(seconds)
460
+
461
+ def _act_setup(self, action_seq: List[Union[Dict[str, Any], str]]):
462
+ # TODO
463
+ raise NotImplementedError()
464
+
465
+ def _replay_setup(self, trajectory: str):
466
+ """
467
+ Args:
468
+ trajectory (str): path to the replay trajectory file
469
+ """
470
+
471
+ # TODO
472
+ raise NotImplementedError()
473
+
474
+ def _activate_window_setup(self, window_name: str, strict: bool = False, by_class: bool = False):
475
+ if not window_name:
476
+ raise Exception(f"Setup Open - Invalid path ({window_name}).")
477
+
478
+ payload = json.dumps({"window_name": window_name, "strict": strict, "by_class": by_class})
479
+ headers = {
480
+ 'Content-Type': 'application/json'
481
+ }
482
+
483
+ # send request to server to open file
484
+ try:
485
+ response = requests.post(self.http_server + "/setup" + "/activate_window", headers=headers, data=payload)
486
+ if response.status_code == 200:
487
+ logger.info("Command executed successfully: %s", response.text)
488
+ else:
489
+ logger.error(f"Failed to activate window {window_name}. Status code: %s", response.text)
490
+ except requests.exceptions.RequestException as e:
491
+ logger.error("An error occurred while trying to send the request: %s", e)
492
+
493
+ def _close_window_setup(self, window_name: str, strict: bool = False, by_class: bool = False):
494
+ if not window_name:
495
+ raise Exception(f"Setup Open - Invalid path ({window_name}).")
496
+
497
+ payload = json.dumps({"window_name": window_name, "strict": strict, "by_class": by_class})
498
+ headers = {
499
+ 'Content-Type': 'application/json'
500
+ }
501
+
502
+ # send request to server to open file
503
+ try:
504
+ response = requests.post(self.http_server + "/setup" + "/close_window", headers=headers, data=payload)
505
+ if response.status_code == 200:
506
+ logger.info("Command executed successfully: %s", response.text)
507
+ else:
508
+ logger.error(f"Failed to close window {window_name}. Status code: %s", response.text)
509
+ except requests.exceptions.RequestException as e:
510
+ logger.error("An error occurred while trying to send the request: %s", e)
511
+
512
def _proxy_setup(self, client_password: str = ""):
    """Setup system-wide proxy configuration using proxy pool

    Installs tinyproxy on the VM, writes /tmp/tinyproxy.conf (listening on
    port 18888 with the pool's proxy as upstream), appends the proxy
    environment variables to ~/.bashrc and launches tinyproxy.

    Args:
        client_password (str): Password for sudo operations, defaults to ""

    Returns:
        False when the VM server could not be reached within MAX_RETRIES
        probes; None otherwise.

    Raises:
        Exception: no proxy is available from the pool, or a configuration
          command raised.
    """
    import shlex  # local import: only needed for quoting the sudo password

    retry = 0
    while retry < MAX_RETRIES:
        try:
            _ = requests.get(self.http_server + "/terminal")
            break
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C / SystemExit
            # can still interrupt the wait loop.
            time.sleep(5)
            retry += 1
            logger.info(f"retry: {retry}/{MAX_RETRIES}")

    if retry == MAX_RETRIES:
        logger.error(f"Could not connect to {self.http_server} after {MAX_RETRIES} retries; proxy not configured")
        return False

    # Get proxy from global proxy pool
    proxy_pool = get_global_proxy_pool()
    current_proxy = proxy_pool.get_next_proxy()

    if not current_proxy:
        logger.error("No proxy available from proxy pool")
        raise Exception("No proxy available from proxy pool")

    # Format proxy URL
    proxy_url = proxy_pool._format_proxy_url(current_proxy)
    logger.info(f"Setting up proxy: {current_proxy.host}:{current_proxy.port}")

    # Quote the password so characters such as a single quote cannot break
    # out of the shell command it is piped from.
    sudo_prefix = f"echo {shlex.quote(client_password)} | sudo -S bash -c"
    upstream = f"{current_proxy.username}:{current_proxy.password}@{current_proxy.host}:{current_proxy.port}"

    # Configure system proxy environment variables
    proxy_commands = [
        f"{sudo_prefix} \"apt-get update\"",  ## TODO: remove this line if ami is already updated
        f"{sudo_prefix} \"apt-get install -y tinyproxy\"",  ## TODO: remove this line if tinyproxy is already installed
        f"{sudo_prefix} \"echo 'Port 18888' > /tmp/tinyproxy.conf\"",
        f"{sudo_prefix} \"echo 'Allow 127.0.0.1' >> /tmp/tinyproxy.conf\"",
        f"{sudo_prefix} \"echo 'Upstream http {upstream}' >> /tmp/tinyproxy.conf\"",

        # Shell commands that persist the proxy environment variables
        f"echo 'export http_proxy={proxy_url}' >> ~/.bashrc",
        f"echo 'export https_proxy={proxy_url}' >> ~/.bashrc",
        f"echo 'export HTTP_PROXY={proxy_url}' >> ~/.bashrc",
        f"echo 'export HTTPS_PROXY={proxy_url}' >> ~/.bashrc",
    ]

    # Execute all proxy configuration commands
    for cmd in proxy_commands:
        try:
            self._execute_setup([cmd], shell=True)
        except Exception as e:
            logger.error(f"Failed to execute proxy setup command: {e}")
            proxy_pool.mark_proxy_failed(current_proxy)
            raise

    self._launch_setup(["tinyproxy -c /tmp/tinyproxy.conf -d"], shell=True)

    # The variables are only appended to ~/.bashrc; nothing here reloads
    # them into processes that are already running.
    logger.info(f"Proxy setup completed successfully for {current_proxy.host}:{current_proxy.port}")
    proxy_pool.mark_proxy_success(current_proxy)
578
+
579
+ # Chrome setup
580
def _chrome_open_tabs_setup(self, urls_to_open: List[str]):
    """Open each URL in a new tab of the Chrome instance running on the VM.

    Connects to Chrome over CDP (up to 15 attempts, 5 seconds apart), opens
    one tab per URL in the first browser context and closes the tab that was
    there initially once the first URL has been opened.

    Args:
        urls_to_open (List[str]): URLs to open, in order.

    Returns:
        Tuple of (browser, context), or None if no browser was obtained.

    Raises:
        Exception: the CDP connection still fails on the last attempt.
    """
    host = self.vm_ip
    port = self.chromium_port  # fixme: this port is hard-coded, need to be changed from config file

    remote_debugging_url = f"http://{host}:{port}"
    logger.info("Connect to Chrome @: %s", remote_debugging_url)
    logger.debug("PLAYWRIGHT ENV: %s", repr(os.environ))
    max_attempts = 15
    for attempt in range(max_attempts):
        if attempt > 0:
            time.sleep(5)

        browser = None
        with sync_playwright() as p:
            try:
                browser = p.chromium.connect_over_cdp(remote_debugging_url)
            except Exception as e:
                if attempt < max_attempts - 1:
                    logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. Error: {e}")
                    continue
                logger.error(f"Failed to connect after multiple attempts: {e}")
                raise

            if not browser:
                return

            # Use the first context (which should be the only one if using default profile).
            # Resolved before the loop so it is defined even when no URL is given.
            context = browser.contexts[0]

            logger.info("Opening %s...", urls_to_open)
            for i, url in enumerate(urls_to_open):
                page = context.new_page()  # Create a new page (tab) within the existing context
                try:
                    page.goto(url, timeout=60000)
                except Exception:
                    # Best effort: a slow page must not abort the remaining tabs.
                    logger.warning("Opening %s exceeds time limit", url)  # only for human test
                logger.info(f"Opened tab {i + 1}: {url}")

                if i == 0:
                    # clear the default tab
                    default_page = context.pages[0]
                    default_page.close()

            # Do not close the context or browser; they will remain open after script ends
            return browser, context
628
+
629
def _chrome_close_tabs_setup(self, urls_to_close: List[str]):
    """Close, for each given URL, the first open tab whose URL matches it.

    Connects to Chrome over CDP (up to 15 attempts, 5 seconds apart) and
    compares tab URLs with `compare_urls`.

    Args:
        urls_to_close (List[str]): URLs whose tabs should be closed.

    Returns:
        Tuple of (browser, context), or None if no browser was obtained.

    Raises:
        Exception: the CDP connection still fails on the last attempt.
    """
    time.sleep(5)  # Wait for Chrome to finish launching

    host = self.vm_ip
    port = self.chromium_port  # fixme: this port is hard-coded, need to be changed from config file

    remote_debugging_url = f"http://{host}:{port}"
    max_attempts = 15
    with sync_playwright() as p:
        browser = None
        for attempt in range(max_attempts):
            try:
                browser = p.chromium.connect_over_cdp(remote_debugging_url)
                break
            except Exception as e:
                if attempt < max_attempts - 1:
                    logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. Error: {e}")
                    time.sleep(5)
                else:
                    logger.error(f"Failed to connect after multiple attempts: {e}")
                    raise

        if not browser:
            return

        # Use the first context (which should be the only one if using default profile).
        # Resolved before the loop so it is defined even when no URL is given.
        context = browser.contexts[0]

        for i, url in enumerate(urls_to_close):
            for page in context.pages:
                # if two urls are the same, close the tab
                if compare_urls(page.url, url):
                    # Closing the page is what removes the tab; no manual
                    # bookkeeping on the pages list is needed.
                    page.close()
                    logger.info(f"Closed tab {i + 1}: {url}")
                    break

        # Do not close the context or browser; they will remain open after script ends
        return browser, context
669
+
670
+ # google drive setup
671
def _googledrive_setup(self, **config):
    """ Clean google drive space (eliminate the impact of previous experiments to reset the environment)
    @args:
        config(Dict[str, Any]): contain keys
            settings_file(str): path to google drive settings file, which will be loaded by pydrive.auth.GoogleAuth()
            operation(List[str]): each operation is chosen from ['delete', 'mkdirs', 'upload']
            args(List[Dict[str, Any]]): parameters for each operation
        different args dict for different operations:
            for delete:
                query(str): query pattern string to search files or folder in google drive to delete, please refer to
                    https://developers.google.com/drive/api/guides/search-files?hl=en about how to write query string.
                trash(bool): whether to delete files permanently or move to trash. By default, trash=false, completely delete it.
            for mkdirs:
                path(List[str]): the path in the google drive to create folder
            for upload:
                url(str): remote url to download file
                path(str or List[str]): the path in the google drive to store the downloaded file;
                    the last component is the file title, the preceding ones are folders
    @raises:
        ValueError: an operation is not one of the supported names
    """
    settings_file = config.get('settings_file', 'evaluation_examples/settings/googledrive/settings.yml')
    gauth = GoogleAuth(settings_file=settings_file)
    drive = GoogleDrive(gauth)

    def mkdir_in_googledrive(paths: List[str]):
        """Create the nested folders in `paths` (if missing) and return the id of the innermost one."""
        paths = paths if isinstance(paths, list) else [paths]
        parent_id = 'root'
        for p in paths:
            q = f'"{parent_id}" in parents and title = "{p}" and mimeType = "application/vnd.google-apps.folder" and trashed = false'
            folder = drive.ListFile({'q': q}).GetList()
            if len(folder) == 0:  # not exists, create it
                parents = {} if parent_id == 'root' else {'parents': [{'id': parent_id}]}
                file = drive.CreateFile({'title': p, 'mimeType': 'application/vnd.google-apps.folder', **parents})
                file.Upload()
                parent_id = file['id']
            else:
                parent_id = folder[0]['id']
        return parent_id

    for oid, operation in enumerate(config['operation']):
        if operation == 'delete':  # delete a specific file
            # query pattern string, by default, remove all files/folders not in the trash to the trash
            params = config['args'][oid]
            q = params.get('query', '')
            trash = params.get('trash', False)
            q_file = f"( {q} ) and mimeType != 'application/vnd.google-apps.folder'" if q.strip() else "mimeType != 'application/vnd.google-apps.folder'"
            filelist: GoogleDriveFileList = drive.ListFile({'q': q_file}).GetList()
            q_folder = f"( {q} ) and mimeType = 'application/vnd.google-apps.folder'" if q.strip() else "mimeType = 'application/vnd.google-apps.folder'"
            folderlist: GoogleDriveFileList = drive.ListFile({'q': q_folder}).GetList()
            for file in filelist:  # first delete file, then folder
                file: GoogleDriveFile
                if trash:
                    file.Trash()
                else:
                    file.Delete()
            for folder in folderlist:
                folder: GoogleDriveFile
                # note that, if a folder is trashed/deleted, all files and folders in it will be trashed/deleted
                if trash:
                    folder.Trash()
                else:
                    folder.Delete()
        elif operation == 'mkdirs':
            params = config['args'][oid]
            mkdir_in_googledrive(params['path'])
        elif operation == 'upload':
            params = config['args'][oid]
            url = params['url']
            tmpf = tempfile.NamedTemporaryFile(mode='wb', delete=False)
            try:
                # Download into the temp file; leaving the `with` closes it
                # so it can be re-opened by name for the upload.
                with tmpf:
                    response = requests.get(url, stream=True)
                    response.raise_for_status()
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            tmpf.write(chunk)
                # A list path is used as-is; a single name is wrapped into a list.
                paths = params['path'] if isinstance(params['path'], list) else [params['path']]
                parent_id = mkdir_in_googledrive(paths[:-1])
                parents = {} if parent_id == 'root' else {'parents': [{'id': parent_id}]}
                file = drive.CreateFile({'title': paths[-1], **parents})
                file.SetContentFile(tmpf.name)
                file.Upload()
            finally:
                # delete=False means nothing else removes the temp file.
                # Best effort: removal can fail while a handle is still open.
                try:
                    os.remove(tmpf.name)
                except OSError as e:
                    logger.warning("Could not remove temporary file %s: %s", tmpf.name, e)
            # No early return here: operations listed after an upload must still run.
        else:
            raise ValueError('[ERROR]: not implemented clean type!')
753
+
754
def _login_setup(self, **config):
    """ Login to a website with account and password information.
    @args:
        config(Dict[str, Any]): contain keys
            settings_file(str): path to a JSON settings file providing the keys 'email' and 'password'
            platform(str): platform to login, implemented platforms include:
                googledrive: https://drive.google.com/drive/my-drive
    @returns:
        (browser, context) after the login steps ran; None if no browser was
        obtained or the login form timed out
    @raises:
        NotImplementedError: the platform is not supported
        Exception: the CDP connection still fails on the last of 15 attempts
    """
    host = self.vm_ip
    port = self.chromium_port

    remote_debugging_url = f"http://{host}:{port}"
    max_attempts = 15
    with sync_playwright() as p:
        browser = None
        for attempt in range(max_attempts):
            try:
                browser = p.chromium.connect_over_cdp(remote_debugging_url)
                break
            except Exception as e:
                if attempt < max_attempts - 1:
                    logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. Error: {e}")
                    time.sleep(5)
                else:
                    logger.error(f"Failed to connect after multiple attempts: {e}")
                    raise
        if not browser:
            return

        context = browser.contexts[0]
        # Named `target_platform` so the imported `platform` module is not shadowed.
        target_platform = config['platform']

        if target_platform == 'googledrive':
            url = 'https://drive.google.com/drive/my-drive'
            page = context.new_page()  # Create a new page (tab) within the existing context
            try:
                page.goto(url, timeout=60000)
            except Exception:
                # Best effort: carry on and let the selector waits below decide.
                logger.warning("Opening %s exceeds time limit", url)  # only for human test
            logger.info(f"Opened new page: {url}")
            # Context manager so the settings file handle is closed.
            with open(config['settings_file']) as settings_fp:
                settings = json.load(settings_fp)
            email, password = settings['email'], settings['password']

            try:
                page.wait_for_selector('input[type="email"]', state="visible", timeout=3000)
                page.fill('input[type="email"]', email)
                page.click('#identifierNext > div > button')
                page.wait_for_selector('input[type="password"]', state="visible", timeout=5000)
                page.fill('input[type="password"]', password)
                page.click('#passwordNext > div > button')
                page.wait_for_load_state('load', timeout=5000)
            except TimeoutError:
                logger.error('[ERROR]: timeout when waiting for google drive login page to load!')
                return

        else:
            raise NotImplementedError

        return browser, context
813
+
814
def _update_browse_history_setup(self, **config):
    """Build a Chrome history database with fake entries and upload it to the VM.

    An empty history database is downloaded once into ``self.cache_dir`` and reused
    afterwards. A temporary copy is filled with the configured entries and then sent
    to the VM through the ``/setup/upload`` endpoint, targeting the browser's
    ``History`` file for the platform reported by the VM.

    @args:
        config(Dict[str, Any]): contain keys
            history(List[Dict[str, Any]]): entries to insert, each with
                url(str): visited URL
                title(str): page title
                visit_time_from_now_in_seconds(int|float): age of the visit

    @raises:
        requests.RequestException: when the empty database cannot be downloaded.
        Exception: when the VM reports an unsupported operating system.
    """
    cache_path = os.path.join(self.cache_dir, "history_new.sqlite")
    db_url = "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938/history_empty.sqlite?download=true"
    if not os.path.exists(cache_path):
        max_retries = 3
        # The name bound by ``except ... as`` is deleted when the clause ends, so the
        # last failure has to be stored explicitly to be reported after the loop.
        last_error = None
        # Download next to the final location and rename on success, so an interrupted
        # download never leaves a truncated file that would later pass the cache check.
        partial_path = cache_path + ".part"
        for i in range(max_retries):
            try:
                response = requests.get(db_url, stream=True)
                response.raise_for_status()

                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                os.replace(partial_path, cache_path)
                logger.info("File downloaded successfully")
                break

            except requests.RequestException as e:
                last_error = e
                logger.error(
                    f"Failed to download {db_url} caused by {e}. Retrying... ({max_retries - i - 1} attempts left)")
        else:
            # Loop finished without ``break``: every attempt failed.
            raise requests.RequestException(
                f"Failed to download {db_url}. No retries left. Error: {last_error}") from last_error
    else:
        logger.info("File already exists in cache directory")

    # copy a new history file in the tmp folder
    with tempfile.TemporaryDirectory() as tmp_dir:
        db_path = os.path.join(tmp_dir, "history_empty.sqlite")
        shutil.copy(cache_path, db_path)

        history = config['history']

        # Chrome stores timestamps as microseconds since 1601-01-01
        epoch_start = datetime(1601, 1, 1)

        # One connection for all entries; always closed, even if an insert fails.
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            for history_item in history:
                url = history_item['url']
                title = history_item['title']
                visit_time = datetime.now() - timedelta(seconds=history_item['visit_time_from_now_in_seconds'])
                chrome_timestamp = int((visit_time - epoch_start).total_seconds() * 1000000)

                cursor.execute('''
                    INSERT INTO urls (url, title, visit_count, typed_count, last_visit_time, hidden)
                    VALUES (?, ?, ?, ?, ?, ?)
                ''', (url, title, 1, 0, chrome_timestamp, 0))

                url_id = cursor.lastrowid

                cursor.execute('''
                    INSERT INTO visits (url, visit_time, from_visit, transition, segment_id, visit_duration)
                    VALUES (?, ?, ?, ?, ?, ?)
                ''', (url_id, chrome_timestamp, 0, 805306368, 0, 0))

                conn.commit()
        finally:
            conn.close()

        logger.info('Fake browsing history added successfully.')

        controller = PythonController(self.vm_ip, self.server_port)

        # get the path of the history file according to the platform
        os_type = controller.get_vm_platform()

        if os_type == 'Windows':
            chrome_history_path = controller.execute_python_command(
                """import os; print(os.path.join(os.getenv('USERPROFILE'), "AppData", "Local", "Google", "Chrome", "User Data", "Default", "History"))""")[
                'output'].strip()
        elif os_type == 'Darwin':
            chrome_history_path = controller.execute_python_command(
                """import os; print(os.path.join(os.getenv('HOME'), "Library", "Application Support", "Google", "Chrome", "Default", "History"))""")[
                'output'].strip()
        elif os_type == 'Linux':
            # NOTE(review): platform.machine() describes the machine running this code,
            # not the VM - confirm that host and guest architectures always match.
            if "arm" in platform.machine():
                chrome_history_path = controller.execute_python_command(
                    "import os; print(os.path.join(os.getenv('HOME'), 'snap', 'chromium', 'common', 'chromium', 'Default', 'History'))")[
                    'output'].strip()
            else:
                chrome_history_path = controller.execute_python_command(
                    "import os; print(os.path.join(os.getenv('HOME'), '.config', 'google-chrome', 'Default', 'History'))")[
                    'output'].strip()
        else:
            raise Exception('Unsupported operating system')

        # Keep the database open only for the duration of the upload so the handle is
        # released before the temporary directory is removed.
        with open(db_path, "rb") as db_file:
            form = MultipartEncoder({
                "file_path": chrome_history_path,
                "file_data": (os.path.basename(chrome_history_path), db_file)
            })
            headers = {"Content-Type": form.content_type}
            logger.debug(form.content_type)

            # send request to server to upload file
            try:
                logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload")
                response = requests.post(self.http_server + "/setup" + "/upload", headers=headers, data=form)
                if response.status_code == 200:
                    logger.info("Command executed successfully: %s", response.text)
                else:
                    logger.error("Failed to upload file. Status code: %s, response: %s",
                                 response.status_code, response.text)
            except requests.exceptions.RequestException as e:
                logger.error("An error occurred while trying to send the request: %s", e)

        # NOTE(review): this path is Linux/google-chrome specific but runs for every
        # platform branch above - confirm whether that is intended.
        self._execute_setup(["sudo chown -R user:user /home/user/.config/google-chrome/Default/History"], shell=True)
desktop_env/desktop_env.py ADDED
@@ -0,0 +1,497 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import os
5
+ import time
6
+ import re
7
+ from typing import Callable, Any, Optional, Tuple
8
+ from typing import List, Dict, Union
9
+
10
+ import gymnasium as gym
11
+
12
+ from desktop_env.controllers.python import PythonController
13
+ from desktop_env.controllers.setup import SetupController
14
+ from desktop_env.evaluators import metrics, getters
15
+ from desktop_env.providers import create_vm_manager_and_provider
16
+
17
+ logger = logging.getLogger("desktopenv.env")
18
+
19
+ Metric = Callable[[Any, Any], float]
20
+ Getter = Callable[[gym.Env, Dict[str, Any]], Any]
21
+
22
+ MAX_RETRIES = 5 # Maximum retries for environment setup
23
+
24
+
25
+
26
+ def _fix_pyautogui_less_than_bug(command: str) -> str:
27
+ """
28
+ Fix PyAutoGUI '<' character bug by converting it to hotkey("shift", ',') calls.
29
+
30
+ This fixes the known PyAutoGUI issue where typing '<' produces '>' instead.
31
+ References:
32
+ - https://github.com/asweigart/pyautogui/issues/198
33
+ - https://github.com/xlang-ai/OSWorld/issues/257
34
+
35
+ Args:
36
+ command (str): The original pyautogui command
37
+
38
+ Returns:
39
+ str: The fixed command with '<' characters handled properly
40
+ """
41
+ # Pattern to match press('<') or press('\u003c') calls
42
+ press_pattern = r'pyautogui\.press\(["\'](?:<|\\u003c)["\']\)'
43
+
44
+ # Handle press('<') calls
45
+ def replace_press_less_than(match):
46
+ return 'pyautogui.hotkey("shift", ",")'
47
+
48
+ # First handle press('<') calls
49
+ command = re.sub(press_pattern, replace_press_less_than, command)
50
+
51
+ # Pattern to match typewrite calls with quoted strings
52
+ typewrite_pattern = r'pyautogui\.typewrite\((["\'])(.*?)\1\)'
53
+
54
+ # Then handle typewrite calls
55
+ def process_typewrite_match(match):
56
+ quote_char = match.group(1)
57
+ content = match.group(2)
58
+
59
+ # Preprocess: Try to decode Unicode escapes like \u003c to actual '<'
60
+ # This handles cases where '<' is represented as escaped Unicode
61
+ try:
62
+ # Attempt to decode unicode escapes
63
+ decoded_content = content.encode('utf-8').decode('unicode_escape')
64
+ content = decoded_content
65
+ except UnicodeDecodeError:
66
+ # If decoding fails, proceed with original content to avoid breaking existing logic
67
+ pass # English comment: Graceful degradation - fall back to original content if decoding fails
68
+
69
+ # Check if content contains '<'
70
+ if '<' not in content:
71
+ return match.group(0)
72
+
73
+ # Split by '<' and rebuild
74
+ parts = content.split('<')
75
+ result_parts = []
76
+
77
+ for i, part in enumerate(parts):
78
+ if i == 0:
79
+ # First part
80
+ if part:
81
+ result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})")
82
+ else:
83
+ # Add hotkey for '<' and then typewrite for the rest
84
+ result_parts.append('pyautogui.hotkey("shift", ",")')
85
+ if part:
86
+ result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})")
87
+
88
+ return '; '.join(result_parts)
89
+
90
+ command = re.sub(typewrite_pattern, process_typewrite_match, command)
91
+
92
+ return command
93
+
94
+
95
class DesktopEnv(gym.Env):
    """
    DesktopEnv with OpenAI Gym interface. It provides a desktop environment for setting and evaluating desktop automation tasks.
    """

    # Action markers handled by the environment itself (see step / evaluate).
    _SPECIAL_ACTIONS = ('WAIT', 'FAIL', 'DONE')

    def __init__(
            self,
            provider_name: str = "vmware",
            region: Optional[str] = None,
            path_to_vm: Optional[str] = None,
            snapshot_name: str = "init_state",
            action_space: str = "pyautogui",
            cache_dir: str = "cache",
            screen_size: Tuple[int, int] = (int(os.environ.get("SCREEN_WIDTH", 1920)), int(os.environ.get("SCREEN_HEIGHT", 1080))),
            headless: bool = False,
            require_a11y_tree: bool = True,
            require_terminal: bool = False,
            os_type: str = "Ubuntu",
            enable_proxy: bool = False,
            client_password: str = "",
    ):
        """
        Args:
            provider_name (str): virtualization provider name, default to "vmware"
            region (str): the region for allocate machines, work for cloud services, default to None
            path_to_vm (str): path to .vmx file; when omitted the VM manager is asked for one
            snapshot_name (str): snapshot name to revert to, default to "init_state"
            action_space (str): "computer_13" | "pyautogui" | "claude_computer_use" | "autoglm_computer_use"
            cache_dir (str): cache directory to cache task-related stuffs like
                reference file for evaluation
            screen_size (Tuple[int, int]): (width, height) of the VM screen; the default is read
                from the SCREEN_WIDTH / SCREEN_HEIGHT environment variables at import time
            headless (bool): whether to run the VM in headless mode
            require_a11y_tree (bool): whether to require accessibility tree
            require_terminal (bool): whether to require terminal output
            os_type (str): operating system type, default to "Ubuntu"
            enable_proxy (bool): whether to enable proxy support, default to False
            client_password (str): password of the VM user; when empty a provider-specific
                default is used

        Raises:
            AssertionError: if action_space is not one of the supported values
            ValueError: if provider_name is not a known provider
        """
        # Validate before any VM is allocated or started, so a bad argument cannot
        # leave a running emulator behind.
        assert action_space in ["computer_13", "pyautogui", "claude_computer_use", "autoglm_computer_use"], \
            f"Unsupported action space: {action_space}"

        # Initialize VM manager and virtualization provider
        self.region = region
        self.provider_name = provider_name
        self.enable_proxy = enable_proxy  # Store proxy enablement setting
        if client_password == "":
            if self.provider_name == "aws":
                self.client_password = "osworld-public-evaluation"
            else:
                self.client_password = "password"
        else:
            self.client_password = client_password

        self.screen_width = screen_size[0]
        self.screen_height = screen_size[1]

        # Default
        self.server_port = 5000
        self.chromium_port = 9222
        self.vnc_port = 8006
        self.vlc_port = 8080

        # Initialize with default (no proxy) provider
        self.current_use_proxy = False
        self.manager, self.provider = create_vm_manager_and_provider(provider_name, region, use_proxy=False)

        self.os_type = os_type

        # Track whether environment has been used (step/setup) to optimize snapshot revert
        # docker, aws, gcp, azure are always unused as the emulator starts from a clean state
        # vmware, virtualbox are always used as the emulator starts from a dirty state
        if self.provider_name in {"docker", "aws", "gcp", "azure", "aliyun", "volcengine"}:
            self.is_environment_used = False
        elif self.provider_name in {"vmware", "virtualbox"}:
            self.is_environment_used = True
        else:
            raise ValueError(f"Invalid provider name: {self.provider_name}")

        # Initialize environment variables
        if path_to_vm:
            self.path_to_vm = os.path.abspath(os.path.expandvars(os.path.expanduser(path_to_vm))) \
                if provider_name in {"vmware", "virtualbox"} else path_to_vm
        else:
            self.path_to_vm = self.manager.get_vm_path(os_type=self.os_type, region=region, screen_size=(self.screen_width, self.screen_height))

        self.snapshot_name = snapshot_name
        self.cache_dir_base: str = cache_dir
        # todo: add the logic to get the screen size from the VM
        self.headless = headless
        self.require_a11y_tree = require_a11y_tree
        self.require_terminal = require_terminal

        # Initialize emulator and controller
        logger.info("Initializing...")
        self._start_emulator()

        # mode: human or machine
        self.instruction = None
        self.action_space = action_space  # todo: refactor it to the ActType

        # episodic stuffs, like counters, will be updated or reset
        # when calling self.reset()
        self._traj_no: int = -1
        self._step_no: int = 0
        self.action_history: List[Dict[str, Any]] = []

    @classmethod
    def _special_action_type(cls, action) -> Optional[str]:
        """Return 'WAIT', 'FAIL' or 'DONE' if the action is that marker, else None.

        The marker is either the action itself (a string) or the ``action_type`` entry
        of a dict action. A dict without ``action_type`` is a regular action.
        """
        candidate = action.get('action_type') if isinstance(action, dict) else action
        if isinstance(candidate, str) and candidate in cls._SPECIAL_ACTIONS:
            return candidate
        return None

    def _start_emulator(self):
        """Power on the VM and (re)create the controllers that talk to it.

        If anything fails, an attempt is made to stop the emulator before the original
        exception is re-raised.
        """
        try:
            # Power on the virtual machine
            self.provider.start_emulator(self.path_to_vm, self.headless, self.os_type)

            # Get the ip from the virtual machine, and setup the controller
            vm_ip_ports = self.provider.get_ip_address(self.path_to_vm).split(':')
            self.vm_ip = vm_ip_ports[0]
            # Get the ports from the virtual machine (for Docker provider only)
            if len(vm_ip_ports) > 1:
                self.server_port = int(vm_ip_ports[1])
                self.chromium_port = int(vm_ip_ports[2])
                self.vnc_port = int(vm_ip_ports[3])
                self.vlc_port = int(vm_ip_ports[4])
            self.controller = PythonController(vm_ip=self.vm_ip, server_port=self.server_port)
            self.setup_controller = SetupController(vm_ip=self.vm_ip, server_port=self.server_port, chromium_port=self.chromium_port, vlc_port=self.vlc_port, cache_dir=self.cache_dir_base, client_password=self.client_password, screen_width=self.screen_width, screen_height=self.screen_height)

        except Exception:
            try:
                self.provider.stop_emulator(self.path_to_vm)
            except Exception as stop_err:
                logger.warning(f"Cleanup after interrupt failed: {stop_err}")
            raise

    def _revert_to_snapshot(self):
        """Revert the VM to ``self.snapshot_name`` and track a changed VM path."""
        # Revert to certain snapshot of the virtual machine, and refresh the path to vm and ip of vm
        # due to the fact it could be changed when implemented by cloud services
        path_to_vm = self.provider.revert_to_snapshot(self.path_to_vm, self.snapshot_name)
        if path_to_vm and path_to_vm != self.path_to_vm:
            # path_to_vm has to be a new path
            self.manager.delete_vm(self.path_to_vm, self.region)
            self.manager.add_vm(path_to_vm, self.region)
            self.manager.occupy_vm(path_to_vm, os.getpid(), self.region)
            self.path_to_vm = path_to_vm

    def _save_state(self, snapshot_name=None):
        """Save the current virtual machine state to a certain snapshot name."""
        self.provider.save_state(self.path_to_vm, snapshot_name)

    def close(self):
        """Close (release) the virtual machine."""
        self.provider.stop_emulator(self.path_to_vm)

    def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None) -> Dict[str, Any]:
        """Reset the environment, optionally switching to the task in ``task_config``.

        Task setup is attempted up to MAX_RETRIES times. A setup that never succeeds is
        logged as an error but does not raise; the observation is returned regardless.

        Args:
            task_config: task description; the keys read here and in ``_set_task_info`` are
                "id", "instruction", "evaluator" and the optional "config" and "proxy"
            seed: accepted for Gym compatibility, unused
            options: accepted for Gym compatibility, unused

        Returns:
            The observation dict produced by ``_get_obs``.
        """
        # Reset to certain task in OSWorld
        logger.info("Resetting environment...")
        logger.info("Switching task...")
        logger.info("Setting counters...")
        self._traj_no += 1
        self._step_no = 0
        self.action_history.clear()

        setup_succeeded = False
        for attempt in range(MAX_RETRIES):
            task_use_proxy = False
            if task_config is not None:
                # Only consider task proxy requirement if proxy is enabled at system level
                task_use_proxy = task_config.get("proxy", False) and self.enable_proxy
                if not self.enable_proxy and task_config.get("proxy", False):
                    logger.info("Task requires proxy but proxy is disabled at system level, ignoring proxy requirement.")

                if task_use_proxy != self.current_use_proxy:
                    # keep because get_info_from_website depend on this
                    self.current_use_proxy = task_use_proxy

            # Only revert to snapshot if environment has been used (step/setup)
            # This optimization is especially important for cloud providers like AWS
            # where unnecessary snapshot operations are costly and time-consuming
            if self.is_environment_used:
                logger.info("Environment has been used, reverting to snapshot {}...".format(self.snapshot_name))
                self._revert_to_snapshot()
                logger.info("Starting emulator...")
                self._start_emulator()
                logger.info("Emulator started.")
                # Reset the usage flag after reverting
                self.is_environment_used = False
            else:
                logger.info("Environment is clean, skipping snapshot revert (provider: {}).".format(self.provider_name))

            if task_config is None:
                # Nothing to set up.
                setup_succeeded = True
                break

            if task_use_proxy:
                # If using proxy and proxy is enabled, set up the proxy configuration
                self.setup_controller._proxy_setup(self.client_password)
            self._set_task_info(task_config)
            self.setup_controller.reset_cache_dir(self.cache_dir)
            logger.info("Setting up environment...")
            if self.setup_controller.setup(self.config, task_use_proxy):
                # Mark environment as used when setup is successfully executed
                if self.config:  # Only mark as used if there were actual setup operations
                    self.is_environment_used = True
                setup_succeeded = True
                break

            logger.error(
                "Environment setup failed, retrying (%d/%d)...",
                attempt + 1,
                MAX_RETRIES,
            )
            time.sleep(5)

        if setup_succeeded:
            logger.info("Environment setup complete.")
        else:
            logger.error("Environment setup failed after %d attempts; continuing anyway.", MAX_RETRIES)

        observation = self._get_obs()
        return observation

    def _get_obs(self):
        """Collect the current observation from the VM controller."""
        # We provide screenshot, accessibility_tree (optional), terminal (optional), and instruction.
        # can be customized and scaled
        return {
            "screenshot": self.controller.get_screenshot(),
            "accessibility_tree": self.controller.get_accessibility_tree() if self.require_a11y_tree else None,
            "terminal": self.controller.get_terminal_output() if self.require_terminal else None,
            "instruction": self.instruction
        }

    @property
    def vm_platform(self):
        """Platform reported by the VM controller."""
        return self.controller.get_vm_platform()

    @property
    def vm_screen_size(self):
        """Screen size reported by the VM controller."""
        return self.controller.get_vm_screen_size()

    def _set_task_info(self, task_config: Dict[str, Any]):
        """Set task info (proxy logic is handled in reset method)"""
        self.task_id: str = task_config["id"]
        self.cache_dir: str = os.path.join(self.cache_dir_base, self.task_id)
        os.makedirs(self.cache_dir, exist_ok=True)
        self.instruction = task_config["instruction"]
        self.config = task_config["config"] if "config" in task_config else []

        self._set_evaluator_info(task_config)

    def _set_evaluator_info(self, task_config: Dict[str, Any]):
        """Set evaluator information from task config"""
        # evaluator dict
        # func -> metric function string, or list of metric function strings
        # conj -> conjunction of multiple metrics if func is a list with length > 1, "and"/"or"
        # result -> result getter config, or list of result getter configs
        # expected (optional) -> expected getter config, or list of expected getter configs
        # options (optional) -> metric options, or list of metric options
        # if func is a str list, then result, expected (if exists), options (if exists) should also be lists of the same length
        # even if one of the metrics does not need expected or options field, it should be included in the list with None
        self.evaluator = task_config["evaluator"]
        func = self.evaluator["func"]
        has_multiple_metrics = isinstance(func, list)

        if has_multiple_metrics:
            self.metric = [getattr(metrics, name) for name in func]
        else:
            self.metric = getattr(metrics, func)
        self.metric_conj: str = self.evaluator.get("conj", "and")  # take conjunction of multiple metrics

        if "result" in self.evaluator and len(self.evaluator["result"]) > 0:
            result_config = self.evaluator["result"]
            if isinstance(result_config, list):
                self.result_getter = [getattr(getters, "get_{:}".format(res["type"])) for res in result_config]
            else:
                self.result_getter = getattr(getters, "get_{:}".format(result_config["type"]))
        else:
            self.result_getter = [None] * len(self.metric) if has_multiple_metrics else None

        if "expected" in self.evaluator and len(self.evaluator["expected"]) > 0:
            expected_config = self.evaluator["expected"]
            if isinstance(expected_config, list):
                # A falsy entry means "this metric takes no expected value".
                self.expected_getter = [getattr(getters, "get_{:}".format(exp["type"])) if exp else None
                                        for exp in expected_config]
            else:
                self.expected_getter = getattr(getters, "get_{:}".format(expected_config["type"]))
        else:
            self.expected_getter = [None] * len(self.metric) if has_multiple_metrics else None

        if isinstance(self.evaluator.get("options", {}), list):
            self.metric_options: Union[List[Dict[str, Any]], Dict[str, Any]] = [
                opt if opt else {} for opt in self.evaluator["options"]]
        elif "options" in self.evaluator:
            self.metric_options = self.evaluator["options"]
        elif has_multiple_metrics:
            self.metric_options = [{}] * len(self.metric)
        else:
            self.metric_options = {}

        assert (not has_multiple_metrics
                or (len(self.metric) == len(self.result_getter) == len(self.expected_getter) == len(
                    self.metric_options)))

    def step(self, action, pause=2):
        """Execute one action in the VM and return the new observation.

        Args:
            action: 'WAIT' / 'FAIL' / 'DONE', a dict whose ``action_type`` is one of those
                markers, or an action for the configured action space (for the
                pyautogui-style spaces: a command string or a dict with a ``command`` key)
            pause: seconds to sleep after the action (and, for WAIT, also before it)

        Returns:
            (observation, reward, done, info); reward is always 0 here.
        """
        self._step_no += 1
        self.action_history.append(action)

        # Mark environment as used when step is called
        self.is_environment_used = True

        reward = 0  # todo: Define reward calculation for each example
        done = False  # todo: Define episode termination condition for each example
        info = {}
        logger.info(f"Step {self._step_no} in trajectory {self._traj_no} with action: {action}")

        # handle the special actions; a dict action without 'action_type' is a regular action
        special_action = self._special_action_type(action)
        if special_action == 'WAIT':
            time.sleep(pause)
        elif special_action == 'FAIL':
            done = True
            info = {"fail": True}
        elif special_action == 'DONE':
            done = True
            info = {"done": True}

        if self.action_space == "computer_13":
            # the set of all possible actions defined in the action representation
            self.controller.execute_action(action)
        elif self.action_space == "pyautogui" or self.action_space == "claude_computer_use":
            if special_action is not None:
                self.controller.execute_action(action)
            # the set of all possible python commands insides `pyautogui`
            elif isinstance(action, str):
                # Fix PyAutoGUI '<' character bug before execution
                fixed_command = _fix_pyautogui_less_than_bug(action)
                self.controller.execute_python_command(fixed_command)
            elif isinstance(action, dict):
                # Fix PyAutoGUI '<' character bug before execution
                fixed_command = _fix_pyautogui_less_than_bug(action['command'])
                self.controller.execute_python_command(fixed_command)

        time.sleep(pause)
        observation = self._get_obs()

        return observation, reward, done, info

    def evaluate(self):
        """
        Evaluate whether the task is successfully completed.

        Runs the evaluator's optional "postconfig" setup first. For an "infeasible" task
        the score is 1 only if the last action was FAIL; for any other task a final FAIL
        scores 0. Otherwise the configured metric(s) are computed from the result (and
        optional expected) getters and combined according to ``self.metric_conj``.
        """

        postconfig = self.evaluator.get("postconfig", [])
        self.setup_controller.setup(postconfig, self.enable_proxy)
        # Mark environment as used if there were postconfig setup operations
        if postconfig:
            self.is_environment_used = True

        last_action_failed = (len(self.action_history) > 0
                              and self._special_action_type(self.action_history[-1]) == 'FAIL')

        if self.evaluator['func'] == "infeasible":
            return 1 if last_action_failed else 0
        if last_action_failed:
            return 0

        has_expected = bool("expected" in self.evaluator and self.expected_getter and self.evaluator["expected"])

        if isinstance(self.metric, list):
            # Multiple metrics to evaluate whether the task is successfully completed
            results = []
            assert len(self.metric) == len(self.result_getter), "The number of metrics and result getters must be the same"
            if "expected" in self.evaluator:
                assert len(self.metric) == len(self.expected_getter), "The number of metrics and expected getters must be the same"
            for idx, metric_fn in enumerate(self.metric):
                try:
                    config = self.evaluator["result"][idx]
                    result_state = self.result_getter[idx](self, config)
                except FileNotFoundError:
                    logger.error("File not found!")
                    if self.metric_conj == 'and':
                        return 0
                    # 'or': this metric cannot pass; score it 0 and try the others
                    # instead of evaluating with a missing result.
                    results.append(0)
                    continue

                # An entry may legitimately have no expected getter (configured as None).
                if has_expected and self.expected_getter[idx] is not None:
                    expected_state = self.expected_getter[idx](self, self.evaluator["expected"][idx])
                    score = metric_fn(result_state, expected_state, **self.metric_options[idx])
                else:
                    score = metric_fn(result_state, **self.metric_options[idx])

                if self.metric_conj == 'and' and float(score) == 0.0:
                    return 0
                elif self.metric_conj == 'or' and float(score) == 1.0:
                    return 1
                else:
                    results.append(score)

            return sum(results) / len(results) if self.metric_conj == 'and' else max(results)

        # Single metric to evaluate whether the task is successfully completed
        try:
            result_state = self.result_getter(self, self.evaluator["result"])
        except FileNotFoundError:
            logger.error("File not found!")
            return 0

        if has_expected:
            expected_state = self.expected_getter(self, self.evaluator["expected"])
            return self.metric(result_state, expected_state, **self.metric_options)
        return self.metric(result_state, **self.metric_options)

    def render(self, mode='rgb_array'):
        """Return a screenshot for ``mode='rgb_array'``; other modes raise ValueError."""
        if mode == 'rgb_array':
            return self.controller.get_screenshot()
        else:
            raise ValueError('Unsupported render mode: {}'.format(mode))
desktop_env/desktop_env_os_symphony.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import os
5
+ import time
6
+ import re
7
+ from typing import Callable, Any, Optional, Tuple
8
+ from typing import List, Dict, Union
9
+
10
+ import gymnasium as gym
11
+
12
+ from desktop_env.controllers.python import PythonController
13
+ from desktop_env.controllers.setup import SetupController
14
+ from desktop_env.evaluators import metrics, getters
15
+ from desktop_env.providers import create_vm_manager_and_provider
16
+
17
+ logger = logging.getLogger("desktopenv.env")
18
+
19
+ Metric = Callable[[Any, Any], float]
20
+ Getter = Callable[[gym.Env, Dict[str, Any]], Any]
21
+
22
+ MAX_RETRIES = 5 # Maximum retries for environment setup
23
+
24
+
25
+
26
+ def _fix_pyautogui_less_than_bug(command: str) -> str:
27
+ """
28
+ Fix PyAutoGUI '<' character bug by converting it to hotkey("shift", ',') calls.
29
+
30
+ This fixes the known PyAutoGUI issue where typing '<' produces '>' instead.
31
+ References:
32
+ - https://github.com/asweigart/pyautogui/issues/198
33
+ - https://github.com/xlang-ai/OSWorld/issues/257
34
+
35
+ Args:
36
+ command (str): The original pyautogui command
37
+
38
+ Returns:
39
+ str: The fixed command with '<' characters handled properly
40
+ """
41
+ # Pattern to match press('<') or press('\u003c') calls
42
+ press_pattern = r'pyautogui\.press\(["\'](?:<|\\u003c)["\']\)'
43
+
44
+ # Handle press('<') calls
45
+ def replace_press_less_than(match):
46
+ return 'pyautogui.hotkey("shift", ",")'
47
+
48
+ # First handle press('<') calls
49
+ command = re.sub(press_pattern, replace_press_less_than, command)
50
+
51
+ # Pattern to match typewrite calls with quoted strings
52
+ typewrite_pattern = r'pyautogui\.typewrite\((["\'])(.*?)\1\)'
53
+
54
+ # Then handle typewrite calls
55
+ def process_typewrite_match(match):
56
+ quote_char = match.group(1)
57
+ content = match.group(2)
58
+
59
+ # Preprocess: Try to decode Unicode escapes like \u003c to actual '<'
60
+ # This handles cases where '<' is represented as escaped Unicode
61
+ try:
62
+ # Attempt to decode unicode escapes
63
+ decoded_content = content.encode('utf-8').decode('unicode_escape')
64
+ content = decoded_content
65
+ except UnicodeDecodeError:
66
+ # If decoding fails, proceed with original content to avoid breaking existing logic
67
+ pass # English comment: Graceful degradation - fall back to original content if decoding fails
68
+
69
+ # Check if content contains '<'
70
+ if '<' not in content:
71
+ return match.group(0)
72
+
73
+ # Split by '<' and rebuild
74
+ parts = content.split('<')
75
+ result_parts = []
76
+
77
+ for i, part in enumerate(parts):
78
+ if i == 0:
79
+ # First part
80
+ if part:
81
+ result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})")
82
+ else:
83
+ # Add hotkey for '<' and then typewrite for the rest
84
+ result_parts.append('pyautogui.hotkey("shift", ",")')
85
+ if part:
86
+ result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})")
87
+
88
+ return '; '.join(result_parts)
89
+
90
+ command = re.sub(typewrite_pattern, process_typewrite_match, command)
91
+
92
+ return command
93
+
94
+
95
+ class DesktopEnv(gym.Env):
96
+ """
97
+ DesktopEnv with OpenAI Gym interface. It provides a desktop environment for setting and evaluating desktop automation tasks.
98
+ """
99
def __init__(
        self,
        provider_name: str = "vmware",
        region: Optional[str] = None,
        path_to_vm: Optional[str] = None,
        snapshot_name: str = "init_state",
        action_space: str = "pyautogui",
        cache_dir: str = "cache",
        screen_size: Tuple[int, int] = (int(os.environ.get("SCREEN_WIDTH", 1920)), int(os.environ.get("SCREEN_HEIGHT", 1080))),
        headless: bool = False,
        require_a11y_tree: bool = True,
        require_terminal: bool = False,
        os_type: str = "Ubuntu",
        enable_proxy: bool = False,
        client_password: str = "",
):
    """
    Store the environment configuration; no VM is started here (see `start`).

    Args:
        provider_name (str): virtualization provider name, default to "vmware"
        region (Optional[str]): the region used to allocate machines, relevant for
            cloud services, default to None
        path_to_vm (Optional[str]): path to the VM (e.g. a .vmx file), default to None
        snapshot_name (str): snapshot name to revert to, default to "init_state"
        action_space (str): "computer_13" | "pyautogui" | "claude_computer_use" |
            "autoglm_computer_use"
        cache_dir (str): cache directory to cache task-related stuffs like
            reference file for evaluation
        screen_size (Tuple[int, int]): (width, height) of the VM screen; the default
            is read from SCREEN_WIDTH / SCREEN_HEIGHT when the module is imported
        headless (bool): whether to run the VM in headless mode
        require_a11y_tree (bool): whether to require accessibility tree
        require_terminal (bool): whether to require terminal output
        os_type (str): operating system type, default to "Ubuntu"
        enable_proxy (bool): whether to enable proxy support, default to False
        client_password (str): password of the VM client; when empty it falls back to
            "osworld-public-evaluation" for the "aws" provider and "password" otherwise

    Raises:
        ValueError: if `provider_name` is not a known provider.
        AssertionError: if `action_space` is not a supported action space.
    """
    # Initialize VM manager and virtualization provider
    self.region = region
    self.provider_name = provider_name
    self.enable_proxy = enable_proxy  # Store proxy enablement setting
    if client_password == "":
        # No explicit password: use the provider-specific default
        if self.provider_name == "aws":
            self.client_password = "osworld-public-evaluation"
        else:
            self.client_password = "password"
    else:
        self.client_password = client_password

    self.screen_width = screen_size[0]
    self.screen_height = screen_size[1]

    # Default ports; _start_emulator overrides them when the provider reports ports
    self.server_port = 5000
    self.chromium_port = 9222
    self.vnc_port = 8006
    self.vlc_port = 8080

    # Initialize with default (no proxy) provider; manager/provider are created in start()
    self.current_use_proxy = False
    self.manager, self.provider = None, None
    self.os_type = os_type
    self.path_to_vm = path_to_vm
    # Track whether environment has been used (step/setup) to optimize snapshot revert
    # docker, aws, gcp, azure, aliyun, volcengine start unused as the emulator starts from a clean state
    # vmware, virtualbox start used as the emulator starts from a dirty state
    if self.provider_name in {"docker", "aws", "gcp", "azure", "aliyun", "volcengine"}:
        self.is_environment_used = False
    elif self.provider_name in {"vmware", "virtualbox"}:
        self.is_environment_used = True
    else:
        raise ValueError(f"Invalid provider name: {self.provider_name}")

    self.snapshot_name = snapshot_name
    self.cache_dir_base: str = cache_dir
    self.headless = headless
    self.require_a11y_tree = require_a11y_tree
    self.require_terminal = require_terminal

    # mode: human or machine
    self.instruction = None
    assert action_space in ["computer_13", "pyautogui", "claude_computer_use", "autoglm_computer_use"]
    self.action_space = action_space  # todo: refactor it to the ActType

    # episodic stuffs, like counters, will be updated or reset
    # when calling self.reset()
    self._traj_no: int = -1
    self._step_no: int = 0
    self.action_history: List[Dict[str, Any]] = []
183
+
184
def start(self):
    """Create the VM manager/provider if needed, resolve the VM path and boot the emulator."""
    if not (self.manager or self.provider):
        logger.info("Initializing...")
        self.manager, self.provider = create_vm_manager_and_provider(self.provider_name, self.region, use_proxy=False)

    if not self.path_to_vm:
        # No path configured: ask the manager for one matching OS, region and screen size
        self.path_to_vm = self.manager.get_vm_path(
            os_type=self.os_type,
            region=self.region,
            screen_size=(self.screen_width, self.screen_height),
        )
    elif self.provider_name in {"vmware", "virtualbox"}:
        # Only these providers get the path normalised to an absolute filesystem path
        user_expanded = os.path.expanduser(self.path_to_vm)
        vars_expanded = os.path.expandvars(user_expanded)
        self.path_to_vm = os.path.abspath(vars_expanded)

    self._start_emulator()
197
+
198
def _start_emulator(self):
    """Power on the VM, read its address and build the controllers.

    If anything fails, the emulator is stopped on a best-effort basis and the
    original exception is re-raised.
    """
    try:
        # Power on the virtual machine
        self.provider.start_emulator(self.path_to_vm, self.headless, self.os_type)

        # The provider returns "ip" or "ip:server:chromium:vnc:vlc"
        address = self.provider.get_ip_address(self.path_to_vm)
        host, *ports = address.split(':')
        self.vm_ip = host
        # Ports are only reported by some providers (Docker); otherwise keep the current values
        if ports:
            self.server_port = int(ports[0])
            self.chromium_port = int(ports[1])
            self.vnc_port = int(ports[2])
            self.vlc_port = int(ports[3])

        self.controller = PythonController(vm_ip=self.vm_ip, server_port=self.server_port)
        self.setup_controller = SetupController(
            vm_ip=self.vm_ip,
            server_port=self.server_port,
            chromium_port=self.chromium_port,
            vlc_port=self.vlc_port,
            cache_dir=self.cache_dir_base,
            client_password=self.client_password,
            screen_width=self.screen_width,
            screen_height=self.screen_height,
        )

    except Exception:
        # Do not leave a half-started emulator behind; a cleanup failure is only logged
        try:
            self.provider.stop_emulator(self.path_to_vm)
        except Exception as stop_err:
            logger.warning(f"Cleanup after interrupt failed: {stop_err}")
        raise
221
+
222
def _revert_to_snapshot(self):
    """Revert the VM to `self.snapshot_name` and re-register it if the provider returns a new path.

    The path can change when the revert is implemented by a cloud service.
    """
    new_path = self.provider.revert_to_snapshot(self.path_to_vm, self.snapshot_name)
    if not new_path or new_path == self.path_to_vm:
        # Nothing returned, or the VM kept its path: no bookkeeping needed
        return

    # path_to_vm has to be a new path: swap the registry entry and claim it for this process
    self.manager.delete_vm(self.path_to_vm, self.region)
    self.manager.add_vm(new_path, self.region)
    self.manager.occupy_vm(new_path, os.getpid(), self.region)
    self.path_to_vm = new_path
233
+
234
def _save_state(self, snapshot_name=None):
    """Ask the provider to save the current VM state under `snapshot_name`."""
    provider = self.provider
    provider.save_state(self.path_to_vm, snapshot_name)
237
+
238
def close(self):
    """Close (release) the virtual machine by stopping its emulator."""
    provider = self.provider
    provider.stop_emulator(self.path_to_vm)
241
+
242
def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None) -> Dict[str, Any]:
    """Reset the environment, optionally set up a new task, and return the first observation.

    Args:
        task_config: task description; when given it is passed to `_set_task_info`
            and its setup steps are executed, retrying up to MAX_RETRIES times.
        seed: accepted for interface compatibility; not used by this method.
        options: accepted for interface compatibility; not used by this method.

    Returns:
        The observation dict produced by `_get_obs`.
    """
    # Reset to certain task in OSWorld
    logger.info("Resetting environment...")
    logger.info("Switching task...")
    logger.info("Setting counters...")
    self._traj_no += 1
    self._step_no = 0
    self.action_history.clear()

    setup_ok = False
    for attempt in range(MAX_RETRIES):
        task_use_proxy = False
        if task_config is not None:
            # Only consider task proxy requirement if proxy is enabled at system level
            proxy_requested = task_config.get("proxy", False)
            task_use_proxy = proxy_requested and self.enable_proxy
            if proxy_requested and not self.enable_proxy:
                logger.info("Task requires proxy but proxy is disabled at system level, ignoring proxy requirement.")

            if task_use_proxy != self.current_use_proxy:
                # keep because get_info_from_website depend on this
                self.current_use_proxy = task_use_proxy

        # Only revert to snapshot if environment has been used (step/setup).
        # This optimization is especially important for cloud providers like AWS
        # where unnecessary snapshot operations are costly and time-consuming.
        if self.is_environment_used:
            logger.info("Environment has been used, reverting to snapshot {}...".format(self.snapshot_name))
            self._revert_to_snapshot()
            logger.info("Starting emulator...")
            self._start_emulator()
            logger.info("Emulator started.")
            # Reset the usage flag after reverting
            self.is_environment_used = False
        else:
            logger.info("Environment is clean, skipping snapshot revert (provider: {}).".format(self.provider_name))

        if task_config is None:
            # Nothing to set up
            setup_ok = True
            break

        if task_use_proxy:
            # If using proxy and proxy is enabled, set up the proxy configuration
            self.setup_controller._proxy_setup(self.client_password)
        self._set_task_info(task_config)
        self.setup_controller.reset_cache_dir(self.cache_dir)
        logger.info("Setting up environment...")
        if self.setup_controller.setup(self.config, task_use_proxy):
            # Mark environment as used only if there were actual setup operations
            if self.config:
                self.is_environment_used = True
            setup_ok = True
            break

        logger.error(
            "Environment setup failed, retrying (%d/%d)...",
            attempt + 1,
            MAX_RETRIES,
        )
        time.sleep(5)

    if setup_ok:
        logger.info("Environment setup complete.")
    else:
        # Previously this case was also reported as "complete", hiding the failure
        logger.error("Environment setup did not succeed after %d attempts.", MAX_RETRIES)

    observation = self._get_obs()
    return observation
305
+
306
def _get_obs(self):
    """Collect the current observation from the controller.

    Returns a dict with the screenshot, the accessibility tree and terminal
    output (each None unless the matching `require_*` flag is set), and the
    task instruction. Can be customized and scaled.
    """
    controller = self.controller
    observation = {"screenshot": controller.get_screenshot()}
    if self.require_a11y_tree:
        observation["accessibility_tree"] = controller.get_accessibility_tree()
    else:
        observation["accessibility_tree"] = None
    if self.require_terminal:
        observation["terminal"] = controller.get_terminal_output()
    else:
        observation["terminal"] = None
    observation["instruction"] = self.instruction
    return observation
315
+
316
@property
def vm_platform(self):
    """Platform of the VM as reported by the controller."""
    platform_info = self.controller.get_vm_platform()
    return platform_info
319
+
320
@property
def vm_screen_size(self):
    """Screen size of the VM as reported by the controller."""
    reported_size = self.controller.get_vm_screen_size()
    return reported_size
323
+
324
def _set_task_info(self, task_config: Dict[str, Any]):
    """Record the task id, cache directory, instruction, setup config and evaluator.

    Proxy logic is handled in the reset method. The per-task cache directory
    (`<cache_dir_base>/<task id>`) is created if it does not exist.
    """
    task_id: str = task_config["id"]
    self.task_id = task_id

    task_cache: str = os.path.join(self.cache_dir_base, task_id)
    self.cache_dir = task_cache
    os.makedirs(task_cache, exist_ok=True)

    self.instruction = task_config["instruction"]
    # A task without setup steps gets an empty config list
    self.config = task_config.get("config", [])

    self._set_evaluator_info(task_config)
333
+
334
def _set_evaluator_info(self, task_config: Dict[str, Any]):
    """Set evaluator information from task config.

    Does nothing when the task has no "evaluator" entry. Otherwise resolves the
    metric function(s), result/expected getter(s) and metric options.

    evaluator dict:
      func -> metric function string, or list of metric function strings
      conj -> conjunction of multiple metrics if func is a list with length > 1, "and"/"or"
      result -> result getter config, or list of result getter configs
      expected (optional) -> expected getter config, or list of expected getter configs
      options (optional) -> metric options, or list of metric options
    If func is a str list, then result, expected (if exists), options (if exists)
    should also be lists of the same length; even if one of the metrics does not
    need expected or options field, it should be included in the list with None.
    """
    if "evaluator" not in task_config:
        return

    evaluator = task_config["evaluator"]
    self.evaluator = evaluator

    # Metric function(s), looked up by name in the metrics module
    func_spec = evaluator["func"]
    if isinstance(func_spec, list):
        self.metric = [getattr(metrics, name) for name in func_spec]
    else:
        self.metric = getattr(metrics, func_spec)
    has_many_metrics = isinstance(self.metric, list)

    # take conjunction of multiple metrics
    self.metric_conj = evaluator.get("conj", "and")

    # Result getter(s): "get_<type>" from the getters module
    if "result" in evaluator and len(evaluator["result"]) > 0:
        result_spec = evaluator["result"]
        if isinstance(result_spec, list):
            self.result_getter = [getattr(getters, "get_{:}".format(res["type"])) for res in result_spec]
        else:
            self.result_getter = getattr(getters, "get_{:}".format(result_spec["type"]))
    elif has_many_metrics:
        self.result_getter = [None] * len(self.metric)
    else:
        self.result_getter = None

    # Expected getter(s): same lookup, but list entries may be None
    if "expected" in evaluator and len(evaluator["expected"]) > 0:
        expected_spec = evaluator["expected"]
        if isinstance(expected_spec, list):
            self.expected_getter = [
                getattr(getters, "get_{:}".format(exp["type"])) if exp else None
                for exp in expected_spec
            ]
        else:
            self.expected_getter = getattr(getters, "get_{:}".format(expected_spec["type"]))
    elif has_many_metrics:
        self.expected_getter = [None] * len(self.metric)
    else:
        self.expected_getter = None

    # Metric options: falsy list entries become {}, a missing field becomes {} / [{}]*n
    if isinstance(evaluator.get("options", {}), list):
        self.metric_options = [opt if opt else {} for opt in evaluator["options"]]
    elif "options" in evaluator:
        self.metric_options = evaluator["options"]
    elif has_many_metrics:
        self.metric_options = [{}] * len(self.metric)
    else:
        self.metric_options = {}

    assert (not isinstance(evaluator["func"], list)
            or (len(self.metric) == len(self.result_getter) == len(self.expected_getter) == len(
                self.metric_options)))
382
+
383
def step(self, action, pause=2):
    """Execute one action in the VM and return the resulting transition.

    Args:
        action: either one of the special strings 'WAIT' / 'FAIL' / 'DONE', a dict
            carrying an 'action_type' (special actions, "computer_13" actions) or a
            'command' (python command for the pyautogui-style action spaces), or a
            python command string.
        pause: seconds to sleep after the action (and additionally for 'WAIT').

    Returns:
        Tuple `(observation, reward, done, info)`; `reward` is always 0, `done` is
        True only for 'FAIL' / 'DONE', and `info` flags which of the two occurred.
    """
    self._step_no += 1
    self.action_history.append(action)

    # Mark environment as used when step is called
    self.is_environment_used = True

    reward = 0  # todo: Define reward calculation for each example
    done = False  # todo: Define episode termination condition for each example
    info = {}
    logger.info(f"Step {self._step_no} in trajectory {self._traj_no} with action: {action}")

    # Dict actions may carry only a 'command' key, so 'action_type' must be read with
    # .get(): indexing it raised KeyError before the command could be executed.
    action_type = action.get('action_type') if isinstance(action, dict) else action
    is_special = action_type in ('WAIT', 'FAIL', 'DONE')

    # handle the special actions
    if is_special:
        if action_type == 'WAIT':
            time.sleep(pause)
        elif action_type == 'FAIL':
            done = True
            info = {"fail": True}
        else:  # 'DONE'
            done = True
            info = {"done": True}

    if self.action_space == "computer_13":
        # the set of all possible actions defined in the action representation
        self.controller.execute_action(action)
    elif self.action_space in ("pyautogui", "claude_computer_use"):
        if is_special:
            self.controller.execute_action(action)
        elif isinstance(action, str):
            # the set of all possible python commands insides `pyautogui`;
            # fix PyAutoGUI '<' character bug before execution
            self.controller.execute_python_command(_fix_pyautogui_less_than_bug(action))
        elif isinstance(action, dict):
            # Fix PyAutoGUI '<' character bug before execution
            self.controller.execute_python_command(_fix_pyautogui_less_than_bug(action['command']))

    time.sleep(pause)
    observation = self._get_obs()

    return observation, reward, done, info
426
+
427
def evaluate(self):
    """
    Evaluate whether the task is successfully completed.

    Runs the evaluator's "postconfig" setup steps first, then scores the task:

    * "infeasible" tasks score 1 only if the last recorded action was FAIL.
    * For other tasks a final FAIL action scores 0.
    * With a list of metrics, 'and' short-circuits to 0 on the first 0.0 score and
      otherwise returns the mean; 'or' short-circuits to 1 on the first 1.0 score
      and otherwise returns the maximum. A result file that cannot be found counts
      as a score of 0 for that metric.
    * With a single metric its value is returned (0 if the result file is missing).

    Returns:
        The score as returned by the metric(s), or 0 / 1 for the short-circuit cases.
    """

    postconfig = self.evaluator.get("postconfig", [])
    self.setup_controller.setup(postconfig, self.enable_proxy)
    # Mark environment as used if there were postconfig setup operations
    if postconfig:
        self.is_environment_used = True

    last_action = self.action_history[-1] if self.action_history else None
    agent_failed = last_action == "FAIL" or (
        isinstance(last_action, dict) and last_action.get('action_type') == 'FAIL'
    )

    if self.evaluator['func'] == "infeasible":
        return 1 if agent_failed else 0
    if agent_failed:
        return 0

    has_expected = bool(
        "expected" in self.evaluator and self.expected_getter and self.evaluator["expected"]
    )

    if isinstance(self.metric, list):
        # Multiple metrics to evaluate whether the task is successfully completed
        results = []
        assert len(self.metric) == len(self.result_getter), "The number of metrics and result getters must be the same"
        if "expected" in self.evaluator:
            assert len(self.metric) == len(self.expected_getter), "The number of metrics and expected getters must be the same"
        for idx, metric_fn in enumerate(self.metric):
            try:
                config = self.evaluator["result"][idx]
                result_state = self.result_getter[idx](self, config)
            except FileNotFoundError:
                logger.error("File not found!")
                if self.metric_conj == 'and':
                    return 0
                # Under 'or' the other metrics may still succeed: score this one as 0
                # instead of evaluating it with an unbound or stale result_state.
                results.append(0)
                continue

            # Individual entries may be None when a metric needs no expected value
            expected_getter = self.expected_getter[idx] if has_expected else None
            if expected_getter is not None:
                expected_state = expected_getter(self, self.evaluator["expected"][idx])
                score = metric_fn(result_state, expected_state, **self.metric_options[idx])
            else:
                score = metric_fn(result_state, **self.metric_options[idx])

            if self.metric_conj == 'and' and float(score) == 0.0:
                return 0
            if self.metric_conj == 'or' and float(score) == 1.0:
                return 1
            results.append(score)

        return sum(results) / len(results) if self.metric_conj == 'and' else max(results)

    # Single metric to evaluate whether the task is successfully completed
    try:
        result_state = self.result_getter(self, self.evaluator["result"])
    except FileNotFoundError:
        logger.error("File not found!")
        return 0

    if has_expected:
        expected_state = self.expected_getter(self, self.evaluator["expected"])
        return self.metric(result_state, expected_state, **self.metric_options)
    return self.metric(result_state, **self.metric_options)
494
+
495
def render(self, mode='rgb_array'):
    """Return the current screenshot; only the 'rgb_array' mode is supported.

    Raises:
        ValueError: for any other `mode`.
    """
    if mode != 'rgb_array':
        raise ValueError('Unsupported render mode: {}'.format(mode))
    return self.controller.get_screenshot()
desktop_env/evaluators/README.md ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Evaluator Setup Details
2
+ Setup scaffolding for the evaluators in the desktop environment for those who want to know the details of the evaluator setup for customized evaluation and extension
3
+
4
+ ## Overall
5
+ Inside the virtual machine, disable the system crash report by:
6
+ ```
7
+ sudo vim /etc/default/apport
8
+ ```
9
+ and then change the `enabled` to `0`.
10
+
11
+ ## VSCode
12
+ todo
13
+
14
+ ## LibreOffice
15
+ For LibreOffice, open the app first, and then disable the pop-up dialog that appears when pressing 'ctrl + s'.
16
+
17
+ ## LibreOffice Press
18
+ ### Setting Up the python-pptx Library
19
+ ```shell
20
+ pip install python-pptx
21
+ ```
22
+
23
+ ## LibreOffice Writer
24
+
25
+ ### Setting Up the python-docx and odfpy Library
26
+ ```shell
27
+ pip install python-docx
28
+ pip install odfpy
29
+ ```
30
+
31
+ ## LibreOffice Calc
32
+
33
+ ### Required Libraries
34
+
35
+ ```
36
+ openpyxl
37
+ pandas
38
+ lxml
39
+ xmltodict
40
+ ```
41
+
42
+ ### How to Generate CSV from XLSX
43
+
44
+ ```sh
45
+ libreoffice --convert-to "csv:Text - txt - csv (StarCalc):44,34,UTF8,,,,false,true,true,false,false,1" --out-dir /home/user /home/user/abc.xlsx
46
+ ```
47
+
48
+ This command will generate `abc-Sheet1.csv` under `/home/user`. The last `1` in
49
+ the conversion options indicates the sheet number (starting from 1) to export.
50
+ For detailed usage, refer to [CSV Filter
51
+ Options](https://help.libreoffice.org/latest/ro/text/shared/guide/csv_params.html).
52
+
53
+ Refer to `libreoffice_calc/21df9241-f8d7-4509-b7f1-37e501a823f7.json` for an
54
+ example.
55
+
56
+ ### About `compare_table`
57
+
58
+ Evaluation to xlsx files mainly relies on `compare_table`. It accepts two file
59
+ names and a list of rules defined as `options`. Refer to
60
+ `libreoffice_calc/21df9241-f8d7-4509-b7f1-37e501a823f7.json` for an example.
61
+
62
+ In each rule, there is a required field `type`. The supported types are defined
63
+ in `compare_table` function. The most common two are `sheet_data` and
64
+ `sheet_print`. `sheet_data` compares the internal cell values through pandas,
65
+ while `sheet_print` compares the shown cell values through csv. A csv should be
66
+ generated and downloaded for `sheet_print`. See the previous section and
67
+ example in `libreoffice_calc/21df9241-f8d7-4509-b7f1-37e501a823f7.json`.
68
+
69
+ Other fields in a rule are described for each evaluation type in
70
+ `compare_table` function. `sheet_idx0` (or `sheet_idx1`, `sheet_idx`) is a
71
+ common field to indicate which sheet is to be extracted from the workbook. If an
72
+ integer i is given, then it extracts the i-th sheet from result xlsx (i starts
73
+ from 0). If a string is given, it should be prefixed with "RI", "RN", "EI", or
74
+ "EN". "R" indicates to extract from result xlsx while "E" indicates to extract
75
+ from expected (golden) xlsx. "I" indicates a sheet number (starting from 0) and
76
+ "N" indicates a sheet name (usually, they're like "Sheet1", "Sheet2", ...).
77
+
78
+ Some rules use a structure like `{"method": "eq", "ref": "abc"}`. These rules
79
+ are checked through `utils._match_value_to_rule` function. Check it for the
80
+ implemented matching methods.
81
+
82
+ ## Chrome
83
+
84
+ ### Starting Chrome with Remote Debugging for Python
85
+
86
+ To enable remote debugging in Chrome, which allows tools like Playwright for Python to connect to and control an existing Chrome instance, follow these steps:
87
+
88
+ #### Manually Enabling Remote Debugging in Chrome
89
+
90
+ 1. **Locate the Chrome Shortcut**:
91
+ - Find the Chrome shortcut that you usually use to open the browser. This could be on your desktop, start menu, or taskbar.
92
+
93
+ 2. **Edit Shortcut Properties**:
94
+ - Right-click on the Chrome shortcut and select `Properties`.
95
+
96
+ 3. **Modify the Target Field**:
97
+ - In the `Target` field, add `--remote-debugging-port=9222` at the end of the path. Ensure there is a space between the path and the flag you add.
98
+ - It should look something like this: `"C:\Path\To\Chrome.exe" --remote-debugging-port=9222`.
99
+
100
+ 4. **Apply and Close**:
101
+ - Click `Apply` and then `OK` to close the dialog.
102
+
103
+ 5. **Start Chrome**:
104
+ - Use this modified shortcut to start Chrome. Chrome will now start with remote debugging enabled on port 9222.
105
+
106
+ 6. **Confirm Remote Debugging**:
107
+ - Open a browser and navigate to `http://localhost:9222`. If you see a webpage with information about active tabs, remote debugging is working.
108
+
109
+ ---
110
+
111
+ ### Setting Up Playwright for Python
112
+
113
+ Playwright for Python is a browser automation library to control Chromium, Firefox, and WebKit with a single API.
114
+
115
+ #### Installing Playwright
116
+
117
+ - Ensure you have Python installed on your system. If not, download and install it from the [Python official website](https://www.python.org/).
118
+
119
+ - Install Playwright using pip (Python Package Installer). Open a command line or terminal and run:
120
+
121
+ ```bash
122
+ pip install playwright
123
+ ```
124
+
125
+ - After installing Playwright, you need to run the install command to download the necessary browser binaries:
126
+
127
+ ```bash
128
+ playwright install
129
+ ```
130
+
131
+ #### Writing a Playwright Script in Python
132
+
133
+ - Create a Python file for your automation script.
134
+
135
+ - Import the Playwright module at the beginning of your script:
136
+
137
+ ```python
138
+ from playwright.sync_api import sync_playwright
139
+ ```
140
+
141
+ - You can now use Playwright's API to control browsers.
142
+
143
+ #### Example Playwright Script
144
+
145
+ Here is a simple example to open a page using Playwright:
146
+
147
+ ```python
148
+ from playwright.sync_api import sync_playwright
149
+
150
+ def run(playwright):
151
+ browser = playwright.chromium.launch()
152
+ page = browser.new_page()
153
+ page.goto("http://example.com")
154
+ ## other actions...
155
+ browser.close()
156
+
157
+ with sync_playwright() as playwright:
158
+ run(playwright)
159
+ ```
160
+
161
+ - This script launches Chromium, opens a new page, navigates to `example.com`, and then closes the browser.
162
+
163
+ #### Troubleshooting
164
+
165
+ - If you encounter issues with Playwright, ensure that your Python environment is correctly set up and that you have installed Playwright and its dependencies correctly.
166
+ - For detailed documentation, visit the [Playwright for Python Documentation](https://playwright.dev/python/docs/intro).
167
+
168
+
169
+ ## VLC Media Player
170
+
171
+ ### Bugs fix
172
+ One thing needs to be done on Ubuntu: go to `Media`>`convert/save`>select files>`convert/save`.
173
+ Then open the `Audio - MP3` profile and, in its audio codec section, change the codec from "MP3" to "MPEG Audio".
174
+ Otherwise the mp3 file will be created but with 0 bytes. It's a bug of VLC.
175
+
176
+ ### Setting Up VLC's HTTP Interface
177
+
178
+ To enable and use the HTTP interface in VLC Media Player for remote control and status checks, follow these steps:
179
+
180
+ #### 1. Open VLC Preferences
181
+
182
+ - Open VLC Media Player.
183
+ - Go to `Tools` > `Preferences` from the menu.
184
+
185
+ #### 2. Show All Settings
186
+
187
+ - In the Preferences window, at the bottom left corner, select `All` under `Show settings` to display advanced settings.
188
+
189
+ #### 3. Enable Main Interfaces
190
+
191
+ - In the advanced preferences, expand the `Interface` section.
192
+ - Click on `Main interfaces`.
193
+ - Check the box for `Web` to enable the HTTP interface.
194
+
195
+ #### 4. Configure Lua HTTP
196
+
197
+ - Expand the `Main interfaces` node and select `Lua`.
198
+ - Under `Lua HTTP`, set the password to `password`. This password will be required to access the HTTP interface.
199
+
200
+ #### 5. Save and Restart VLC
201
+
202
+ - Click `Save` to apply the changes.
203
+ - Restart VLC Media Player for the changes to take effect.
204
+
205
+ #### 6. Accessing the HTTP Interface
206
+
207
+ - Open a web browser and go to `http://localhost:8080`.
208
+ - You will be prompted for a password. Enter the password you set in the Lua HTTP settings.
209
+ - Once logged in, you will have access to VLC's HTTP interface for remote control.
210
+
211
+ #### Packages
212
+ ```bash
213
+
214
+ pip install opencv-python-headless Pillow imagehash
215
+ ```
216
+
217
+ #### Troubleshooting
218
+
219
+ - If you cannot access the HTTP interface, check if your firewall or security software is blocking the connection.
220
+ - Ensure VLC is running and the correct port (default is 8080) is being used.
221
+ - If the port is in use by another application, you may change the port number in VLC's settings.
222
+
223
+ ## GIMP
224
+ Click on the "Keep" of the image loading pop-up.
desktop_env/evaluators/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ #from .table import compare_table
2
+
3
+ #eval_funcs = {
4
+ #"compare_table(expected, actual)": compare_table
5
+ #}