AbdulElahGwaith committed on Feb 21

Commit

2d483c2

verified ·

1 Parent(s): bd2d604

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.envrc +5 -0
.gitattributes +24 -0
.gitignore +214 -0
.mise.toml +5 -0
CONTRIBUTING_AR.md +44 -0
LICENSE +201 -0
README.md +275 -0
SETUP_GUIDELINE.md +452 -0
assets/authorization.png +3 -0
assets/creategcp.png +3 -0
assets/desktopapp.png +3 -0
assets/developer.png +3 -0
assets/enableapi.png +3 -0
assets/googleidentity.png +0 -0
assets/googlephonecode.png +0 -0
assets/googleshutoff.png +0 -0
assets/netsetting1.png +0 -0
assets/netsetting2.png +0 -0
assets/netsetting3.png +0 -0
assets/netsetting4.png +0 -0
assets/oauth2.0.png +3 -0
assets/oauthapp.png +3 -0
assets/proxysetup-zh.png +3 -0
assets/proxysetup.png +3 -0
assets/pubeval1.png +0 -0
assets/pubeval2.png +3 -0
assets/pubeval3.png +3 -0
assets/pubeval4.png +0 -0
assets/pubeval5.png +0 -0
assets/pubeval_gdrive_auth.jpg +3 -0
assets/pubeval_monitor1.jpg +3 -0
assets/pubeval_monitor2.jpg +3 -0
assets/pubeval_subnet.png +3 -0
assets/publishapp.png +3 -0
assets/testusers.png +3 -0
assets/unsafemode.png +3 -0
assets/usertype.png +3 -0
assets/winnetsetting1.png +3 -0
assets/winnetsetting2.png +0 -0
assets/winnetsetting3.png +3 -0
assets/winnetsetting4.png +3 -0
desktop_env/__init__.py +1 -0
desktop_env/actions.py +203 -0
desktop_env/controllers/__init__.py +0 -0
desktop_env/controllers/python.py +584 -0
desktop_env/controllers/setup.py +920 -0
desktop_env/desktop_env.py +497 -0
desktop_env/desktop_env_os_symphony.py +499 -0
desktop_env/evaluators/README.md +224 -0
desktop_env/evaluators/__init__.py +5 -0

.envrc ADDED Viewed

	@@ -0,0 +1,5 @@

+watch_file .mise.toml
+[[ -e ~/.local/bin/mise ]] || (curl -sf https://mise.run | MISE_QUIET=1 sh)
+~/.local/bin/mise trust 2> /dev/null
+~/.local/bin/mise install -qy
+direnv_load ~/.local/bin/mise direnv exec

.gitattributes CHANGED Viewed

@@ -33,3 +33,27 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/authorization.png filter=lfs diff=lfs merge=lfs -text
+assets/creategcp.png filter=lfs diff=lfs merge=lfs -text
+assets/desktopapp.png filter=lfs diff=lfs merge=lfs -text
+assets/developer.png filter=lfs diff=lfs merge=lfs -text
+assets/enableapi.png filter=lfs diff=lfs merge=lfs -text
+assets/oauth2.0.png filter=lfs diff=lfs merge=lfs -text
+assets/oauthapp.png filter=lfs diff=lfs merge=lfs -text
+assets/proxysetup-zh.png filter=lfs diff=lfs merge=lfs -text
+assets/proxysetup.png filter=lfs diff=lfs merge=lfs -text
+assets/pubeval2.png filter=lfs diff=lfs merge=lfs -text
+assets/pubeval3.png filter=lfs diff=lfs merge=lfs -text
+assets/pubeval_gdrive_auth.jpg filter=lfs diff=lfs merge=lfs -text
+assets/pubeval_monitor1.jpg filter=lfs diff=lfs merge=lfs -text
+assets/pubeval_monitor2.jpg filter=lfs diff=lfs merge=lfs -text
+assets/pubeval_subnet.png filter=lfs diff=lfs merge=lfs -text
+assets/publishapp.png filter=lfs diff=lfs merge=lfs -text
+assets/testusers.png filter=lfs diff=lfs merge=lfs -text
+assets/unsafemode.png filter=lfs diff=lfs merge=lfs -text
+assets/usertype.png filter=lfs diff=lfs merge=lfs -text
+assets/winnetsetting1.png filter=lfs diff=lfs merge=lfs -text
+assets/winnetsetting3.png filter=lfs diff=lfs merge=lfs -text
+assets/winnetsetting4.png filter=lfs diff=lfs merge=lfs -text
+mm_agents/uipath/imgs/element_predictions.png filter=lfs diff=lfs merge=lfs -text
+mm_agents/uipath/imgs/schema.png filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,214 @@

+# Model checkpoints
+*.pth
+*.pt
+# Credential files
+evaluation_examples/settings/google/settings.json
+evaluation_examples/settings/googledrive/credentials.json
+evaluation_examples/settings/googledrive/client_secrets.json
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# PyCharm
+**/.idea/**/*
+# Mac OS
+.DS_Store
+# data
+**/data/**/*
+!**/utils/data/**/*
+# tmp files
+**/tmp/**/*
+api_key.py
+tmp.*
+## Server logging
+**/.logging/**/*
+# DB cache
+**/.db_cache/**/*
+**/debugging/**/*
+# embedding repo
+instructor-embedding
+# plugin cache
+**/static/**/*
+# frontend cache
+frontend/node_modules/
+frontend/.next/
+frontend/.idea
+tags
+tags-opts
+snapshots
+branch_flag
+branch-config
+*.syncthing.*.tmp
+cache
+version.folder
+at_processing
+test.xlsx
+test2.xlsx
+# vm info
+.vms
+/vm_data
+docker_vm_data
+vmware_vm_data
+.vmware*
+.aws*
+# result
+**/result*/**/*
+.vscode
+dataimpulse_proxy_config.json
+## reference and draft and debug
+reference/
+draft/
+manual_examine.py
+run_human_examine.sh
+quick_start.py
+result_multi_apps_pengxiang_transformers12
+evaluation_examples/settings/proxy/dataimpulse.json
+# Local test configurations (not for public repo)
+evaluation_examples/spiderman.json
+evaluation_examples/test_50_random_proportional.json
+evaluation_examples/test_chrome.json

.mise.toml ADDED Viewed

	@@ -0,0 +1,5 @@

+[tools]
+python = "3.12"
+[env]
+_.python.venv = { path = ".venv", create = true }

CONTRIBUTING_AR.md ADDED Viewed

	@@ -0,0 +1,44 @@

+# دليل المساهمة في OSWorld 🌍
+شكراً لاهتمامك بالمساهمة في **OSWorld**! نحن نرحب بجميع أنواع المساهمات، سواء كانت تحسينات في الكود، تحديثات للوثائق، أو إضافة مهام تقييم جديدة.
+## 🚀 كيف تبدأ؟
+1.  **إنشاء Fork:** قم بإنشاء نسخة خاصة بك من المستودع على GitHub.
+2.  **الاستنساخ:** قم باستنساخ المستودع محلياً:
+    ```bash
+    git clone https://github.com/YOUR_USERNAME/OSWorld.git
+    ```
+3.  **إعداد البيئة:** اتبع التعليمات الموجودة في `README.md` لتثبيت المتطلبات.
+## 🛠 مجالات المساهمة
+### 1. تحسين البيئة (Environment)
+يمكنك المساهمة في تحسين دعم المنصات المختلفة مثل:
+*   VMware / VirtualBox
+*   Docker (KVM)
+*   Cloud Providers (AWS, Azure, Aliyun)
+### 2. إضافة مهام تقييم (Evaluation Tasks)
+يمكنك إضافة سيناريوهات جديدة في مجالات:
+*   تطبيقات الأوفيس (LibreOffice, Microsoft Office)
+*   تصفح الويب والمهام اليومية.
+*   البرمجيات المهنية (GIMP, VS Code, etc.)
+### 3. تحسين الوثائق
+نحن نقدر جداً تحسين ملفات الـ README، إضافة أمثلة توضيحية، أو ترجمة الوثائق للغات أخرى.
+## 📝 قواعد الكود
+*   يرجى اتباع معايير **PEP 8** لكود Python.
+*   تأكد من إضافة تعليقات توضيحية للكود الجديد.
+*   قم بتحديث ملف `requirements.txt` إذا قمت بإضافة مكتبات جديدة.
+## 📬 إرسال التعديلات
+1.  قم بإنشاء فرع جديد (Branch) لوصف تعديلك: `git checkout -b feature/my-new-feature`.
+2.  قم بعمل Commit لتعديلاتك مع رسالة واضحة.
+3.  قم برفع التعديلات (Push) إلى مستودعك.
+4.  افتح **Pull Request** في المستودع الأصلي.
+نحن نتطلع لرؤية مساهماتكم! 🚀

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright 2024 XLANG NLP Lab
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md ADDED Viewed

	@@ -0,0 +1,275 @@

+<p align="center">
+  <img src="https://huggingface.co/datasets/xlangai/assets/resolve/main/github_banner_v2.png" alt="Banner">
+</p>
+<p align="center">
+  <a href="https://os-world.github.io/">Website</a> •
+  <a href="https://arxiv.org/abs/2404.07972">Paper</a> •
+  <a href="https://timothyxxx.github.io/OSWorld/">Doc</a> •
+  <a href="https://github.com/xlang-ai/OSWorld/tree/main/evaluation_examples">Data</a> •
+  <a href="https://os-world.github.io/explorer.html">Data Viewer</a> •
+  <a href="https://discord.gg/4Gnw7eTEZR">Discord</a> •
+  <a href="CONTRIBUTING_AR.md">دليل المساهمة (AR)</a> •
+  <a href="https://drive.google.com/file/d/1XlEy49otYDyBlA3O9NbR0BpPfr2TXgaD/view?usp=drive_link">Cache</a>
+</p>
+<p align="center">
+    <a href="https://img.shields.io/badge/PRs-Welcome-red">
+        <img src="https://img.shields.io/badge/PRs-Welcome-red">
+    </a>
+    <a href="https://img.shields.io/github/last-commit/xlang-ai/OSWorld?color=green">
+        <img src="https://img.shields.io/github/last-commit/xlang-ai/OSWorld?color=green">
+    </a>
+    <a href="https://opensource.org/licenses/Apache-2.0">
+        <img src="https://img.shields.io/badge/License-Apache%202.0-blue.svg">
+    </a>
+    <a href="https://badge.fury.io/py/desktop-env">
+        <img src="https://badge.fury.io/py/desktop-env.svg">
+    </a>
+    <a href="https://pepy.tech/project/desktop-env">
+        <img src="https://static.pepy.tech/badge/desktop-env">
+    </a>
+    <br/>
+</p>
+## 📢 Updates
+- 2025-07-28: Introducing **OSWorld-Verified**! We have made major updates: we fixed several issues reported by the community, added more support for AWS (which can reduce evaluation time to within 1 hour through parallelization!), and made the benchmark signals more effective. Read more in the [report](https://xlang.ai/blog/osworld-verified). We have produced new model results with the latest version and published them on the [official website](https://os-world.github.io/). Please compare your OSWorld results with the new benchmark results when running the latest version.
+- 2025-05-01: If you need pre-downloaded files for initial-state setup, we have made them available [here](https://drive.google.com/file/d/1XlEy49otYDyBlA3O9NbR0BpPfr2TXgaD/view?usp=drive_link).
+- 2024-10-22: We supported Docker🐳 for hosting virtual machines on virtualized platforms. Check below for detailed instructions!
+- 2024-06-15: We refactored the environment code to decouple the VMware integration and started supporting other platforms such as VirtualBox, AWS, Azure, etc. Hold tight!
+- 2024-04-11: We released our [paper](https://arxiv.org/abs/2404.07972), [environment and benchmark](https://github.com/xlang-ai/OSWorld), and [project page](https://os-world.github.io/). Check it out!
+## 💾 Installation
+### VMware/VirtualBox (Desktop, Laptop, Bare Metal Machine)
+Suppose you are operating on a system that has not been virtualized (e.g. your desktop, laptop, bare metal machine), meaning you are not utilizing a virtualized environment like AWS, Azure, or k8s.
+If this is the case, proceed with the instructions below. However, if you are on a virtualized platform, please refer to the [Docker](#docker-server-with-kvm-support-for-better-performance) section.
+1. First, clone this repository and `cd` into it. Then, install the dependencies listed in `requirements.txt`. It is recommended that you use the latest version of Conda to manage the environment, but you can also choose to manually install the dependencies. Please ensure that the version of Python is >= 3.10.
+```bash
+# Clone the OSWorld repository
+git clone https://github.com/xlang-ai/OSWorld
+# Change directory into the cloned repository
+cd OSWorld
+# Optional: Create a Conda environment for OSWorld
+# conda create -n osworld python=3.10
+# conda activate osworld
+# Install required dependencies
+pip install -r requirements.txt
+```
+Alternatively, you can install the environment without any benchmark tasks:
+```bash
+pip install desktop-env
+```
+2. Install [VMware Workstation Pro](https://www.vmware.com/products/workstation-pro/workstation-pro-evaluation.html) (for systems with Apple chips, you should install [VMware Fusion](https://support.broadcom.com/group/ecx/productdownloads?subfamily=VMware+Fusion)) and configure the `vmrun` command. For installation instructions, refer to [How to install VMware Workstation Pro](desktop_env/providers/vmware/INSTALL_VMWARE.md). Verify the successful installation by running the following:
+```bash
+vmrun -T ws list
+```
+If the installation succeeded and the environment variable is set correctly, you will see a message listing the currently running virtual machines.
+> **Note:** We also support using [VirtualBox](https://www.virtualbox.org/) if you have issues with VMware Pro. However, features such as parallelism and macOS on Apple chips might not be well-supported.
+All set! Our setup script will automatically download the necessary virtual machines and configure the environment for you.
+### Docker (Server with KVM Support for Better Performance)
+If you are running on a non-bare metal server, or prefer not to use VMware and VirtualBox platforms, we recommend using our Docker support.
+#### Prerequisite: Check if your machine supports KVM
+We recommend running the VM with KVM support. To check if your hosting platform supports KVM, run
+```
+egrep -c '(vmx|svm)' /proc/cpuinfo
+```
+on Linux. If the return value is greater than zero, the processor should be able to support KVM.
+> **Note**: macOS hosts generally do not support KVM. You are advised to use VMware if you would like to run OSWorld on macOS.
+#### Install Docker
+If your hosting platform supports a graphical user interface (GUI), you may refer to [Install Docker Desktop on Linux](https://docs.docker.com/desktop/install/linux/) or [Install Docker Desktop on Windows](https://docs.docker.com/desktop/install/windows-install/) based on your OS. Otherwise, you may [Install Docker Engine](https://docs.docker.com/engine/install/).
+#### Running Experiments
+Add the following arguments when initializing `DesktopEnv`:
+- `provider_name`: `docker`
+- `os_type`: `Ubuntu` or `Windows`, depending on the OS of the VM
+> **Note**: If the experiment is interrupted abnormally (e.g., by interrupting signals), there may be residual docker containers which could affect system performance over time. Please run `docker stop $(docker ps -q) && docker rm $(docker ps -a -q)` to clean up.
+### AWS
+Using cloud services for parallel evaluation can significantly accelerate evaluation efficiency (can reduce evaluation time to within 1 hour through parallelization!) and can even be used as infrastructure for training.
+We provide comprehensive AWS support with a Host-Client architecture that enables large-scale parallel evaluation of OSWorld tasks.
+For detailed setup instructions, see [Setup Guideline](SETUP_GUIDELINE.md) and [AWS Configuration Guide](https://github.com/xlang-ai/OSWorld/blob/main/desktop_env/providers/aws/AWS_GUIDELINE.md).
+### Others
+We are working on supporting more 👷. Please hold tight!
+## 🚀 Quick Start
+Run the following minimal example to interact with the environment:
+```bash
+# Basic usage with default settings
+python quickstart.py
+# Customize provider and VM path
+python quickstart.py --provider_name vmware --path_to_vm "path/to/your/vm.vmx"
+```
+You will see all the logs of the system running normally, including the successful creation of the environment, completion of setup, and successful execution of actions. In the end, you will observe a successful right-click on the screen, which means you are ready to go.
+## 🧪 Experiments
+### Agent Baselines
+> **⚠️ Important Configuration Requirements:**
+>
+> * **Google Account Tasks**: Some tasks require Google account access and OAuth2.0 configuration. Please refer to [Setup Guideline - Google Account Setup](SETUP_GUIDELINE.md#1-google-account-setup) for detailed setup instructions.
+> * **Proxy Configuration**: Some tasks may require proxy settings to function properly (this depends on the strength of website defenses against your network location). Please refer to [Setup Guideline - Proxy Configuration](SETUP_GUIDELINE.md#2-proxy-configuration).
+> * **Impact of Missing Configuration**: If these configurations are not properly set up, the corresponding tasks will fail to execute correctly, leading to lower evaluation scores.
+If you wish to run the baseline agent used in our paper, you can execute the following command as an example under the GPT-4o pure-screenshot setting:
+Set **OPENAI_API_KEY** environment variable with your API key
+```bash
+export OPENAI_API_KEY='changeme'
+```
+Optionally, set **OPENAI_BASE_URL** to use a custom OpenAI-compatible API endpoint
+```bash
+export OPENAI_BASE_URL='http://your-custom-endpoint.com/v1'  # Optional: defaults to https://api.openai.com
+```
+Single-threaded execution (deprecated, using `vmware` provider as example)
+```bash
+python run.py \
+    --provider_name vmware \
+    --path_to_vm Ubuntu/Ubuntu.vmx \
+    --headless \
+    --observation_type screenshot \
+    --model gpt-4o \
+    --sleep_after_execution 3 \
+    --max_steps 15 \
+    --result_dir ./results \
+    --client_password password
+```
+Parallel execution (example showing switching provider to `docker`)
+```bash
+python scripts/python/run_multienv.py \
+    --provider_name docker \
+    --headless \
+    --observation_type screenshot \
+    --model gpt-4o \
+    --sleep_after_execution 3 \
+    --max_steps 15 \
+    --num_envs 10 \
+    --client_password password
+```
+The results, which include screenshots, actions, and video recordings of the agent's task completion, will be saved in the `./results` (or other `result_dir` you specified) directory in this case.
+You can then run the following command to obtain the result:
+```bash
+# Basic usage with default parameters
+python show_result.py
+# Specify custom parameters
+python show_result.py \
+    --action_space pyautogui \
+    --model gpt-4o \
+    --observation_type screenshot \
+    --result_dir ./results
+# Show detailed scores per domain (format: score/total)
+python show_result.py --detailed
+```
+The script will display:
+- Per-domain success rates
+- Category-level statistics (Office, Daily, Professional)
+- Overall success rate and total score
+- With `--detailed` flag: compact format showing "score/total" for each domain
+### Manual Task Examination
+For manual verification and examination of specific benchmark tasks, you can use the manual examination tool:
+```bash
+python scripts/python/manual_examine.py \
+    --headless \
+    --observation_type screenshot \
+    --result_dir ./results_human_examine \
+    --test_all_meta_path evaluation_examples/test_all.json \
+    --domain libreoffice_impress \
+    --example_id a669ef01-ded5-4099-9ea9-25e99b569840 \
+    --max_steps 3
+```
+This tool allows you to:
+- Manually execute tasks in the environment
+- Verify task correctness and evaluation metrics
+- Record the execution process with screenshots and videos
+- Examine specific problematic tasks
+See `scripts/bash/run_manual_examine.sh` for example task IDs across different domains.
+## Evaluation
+### Local Evaluation
+Please start by reading through the [agent interface](https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/README.md) and the [environment interface](https://github.com/xlang-ai/OSWorld/blob/main/desktop_env/README.md).
+Correctly implement the agent interface and import your customized version in the `run.py` (for single-threaded execution) or `scripts/python/run_multienv.py` / `scripts/python/run_multienv_xxx.py` (for parallel execution) file.
+Afterward, you can execute a command similar to the one in the previous section to run the benchmark on your agent.
+### Public Evaluation
+If you want your results to be verified and displayed on the verified leaderboard, you need to schedule a meeting with us (current maintainers: tianbaoxiexxx@gmail.com, yuanmengqi732@gmail.com) to run your agent code on our side and have us report the results.
+You need to upload and allow us to disclose your agent implementation under the OSWorld framework (you may choose not to expose your model API to the public), along with a report that allows the public to understand what's happening behind the scenes.
+Alternatively, if you are from a trusted institution, you can share your monitoring data and trajectories with us.
+Please carefully follow the [Setup Guideline - Public Evaluation Platform](SETUP_GUIDELINE.md#3-public-evaluation-platform) to get results.
+## ❓ FAQ
+### What is the username and password for the virtual machines?
+The username and password for the virtual machines are as follows (for provider `vmware`, `virtualbox` and `docker`): we set the account credentials for Ubuntu as `user` / `password`.
+For cloud service providers like `aws`, to prevent attacks due to weak passwords, the default password is `osworld-public-evaluation`.
+If you make further modifications to these credentials, remember to set the `client_password` variable and pass it to `DesktopEnv` and the agent (if supported) when running experiments.
+Some features like setting up proxy require the environment to have the client VM password to obtain sudo privileges, and for some OSWorld tasks, the agent needs the password to obtain sudo privileges to complete them.
+### How to setup the account and credentials for Google and Google Drive?
+See [Setup Guideline - Google Account Setup](SETUP_GUIDELINE.md#1-google-account-setup).
+### How can I configure a proxy for the VM (if I'm behind the GFW, or I don't want some of my tasks to be identified as bot and get lower scores)?
+See [Setup Guideline - Proxy Configuration](SETUP_GUIDELINE.md#2-proxy-configuration).
+We also provide a pre-configured solution based on DataImpulse, please refer to the [proxy setup section](SETUP_GUIDELINE.md#23-proxy-for-specific-tasks-recommended).
+### Open Source Contributors
+Thanks to all the contributors!
+<a href="https://github.com/xlang-ai/OSWorld/graphs/contributors">
+  <img src="https://stg.contrib.rocks/image?repo=xlang-ai/OSWorld" />
+</a>
+## 📄 Citation
+If you find this environment useful, please consider citing our work:
+```
+@misc{OSWorld,
+      title={OSWorld: Benchmarking Multimodal Agents for Open-Ended Tasks in Real Computer Environments},
+      author={Tianbao Xie and Danyang Zhang and Jixuan Chen and Xiaochuan Li and Siheng Zhao and Ruisheng Cao and Toh Jing Hua and Zhoujun Cheng and Dongchan Shin and Fangyu Lei and Yitao Liu and Yiheng Xu and Shuyan Zhou and Silvio Savarese and Caiming Xiong and Victor Zhong and Tao Yu},
+      year={2024},
+      eprint={2404.07972},
+      archivePrefix={arXiv},
+      primaryClass={cs.AI}
+}
+```
+## Acknowledgement for OSWorld-Verified
+Special thanks to the following institutions that provided feedback and participated in the fixes (as well as institutions that provided feedback during the process): [MoonShot AI, a.k.a. Kimi](https://www.moonshot.ai/), [Human Data](https://www.hud.so/), [OpenAI](https://openai.com/), [ByteDance Seed TARS](https://seed-tars.com/), [Anthropic](https://www.anthropic.com/), [Simular](https://www.simular.ai/), [HKU Data Intelligence Lab](https://sites.google.com/view/chaoh).
+Special thanks to the following students who participated in the specific fixes: [Mengqi Yuan](https://yuanmengqi.github.io/), [Danyang Zhang](https://zdy023.github.io/), [Xinzhuang Xiong](https://thisisxxz.com/),  [Zhennan Shen](https://scholar.google.com/citations?user=JPwg5MwAAAAJ&hl=en), [Zilong Zhou](https://github.com/adlsdztony), Yanxu Chen, [Jiaqi Deng](https://millank0817.github.io/), [Tianbao Xie](https://tianbaoxie.com/), Junda Chen, [Jixuan Chen](https://chenjix.github.io/), [Haoyuan Wu](https://www.linkedin.com/in/haoyuan-wu-240878291/).
+Special thanks to the following students who participated in running the re-evaluation: [Mengqi Yuan](https://yuanmengqi.github.io/), [Zilong Zhou](https://github.com/adlsdztony), [Xinyuan Wang](https://xinyuanwangcs.github.io/), [Bowen Wang](https://bowenbryanwang.github.io/).
+## You might also be interested
+- **OSWorld-MCP**: Benchmarking MCP Tool Invocation in Computer-Use Agents. [Website](https://osworld-mcp.github.io/)

SETUP_GUIDELINE.md ADDED Viewed

	@@ -0,0 +1,452 @@

+# OSWorld Setup and Evaluation Guide
+This comprehensive guide covers all aspects of setting up and running OSWorld evaluations, including account configuration, proxy setup, and public evaluation platform deployment.
+## Table of Contents
+1. [Google Account Setup](#1-google-account-setup)
+2. [Proxy Configuration](#2-proxy-configuration)
+3. [Public Evaluation Platform](#3-public-evaluation-platform)
+---
+## 1. Google Account Setup
+For tasks including Google or Google Drive, you need a real Google account with configured OAuth2.0 secrets.
+> **Attention**: To prevent environment reset and result evaluation conflicts caused by multiple people using the same Google account simultaneously, please register a private Google account rather than using a shared one.
+### 1.1 Register A Blank Google Account
+1. Go to the Google website and register a new, blank account
+   - You do not need to provide any recovery email or phone for testing purposes
+   - **IGNORE** any security recommendations
+   - Turn **OFF** the [2-Step Verification](https://support.google.com/accounts/answer/1064203?hl=en&co=GENIE.Platform%3DDesktop#:~:text=Open%20your%20Google%20Account.,Select%20Turn%20off.) to avoid failure in environment setup
+<p align="center">
+  <img src="assets/googleshutoff.png" width="40%" alt="Shut Off 2-Step Verification">
+</p>
+> **Attention**: We strongly recommend registering a new blank account instead of using an existing one to avoid messing up your personal workspace.
+2. Copy and rename `settings.json.template` to `settings.json` under `evaluation_examples/settings/google/`. Replace the two fields:
+```json
+{
+    "email": "your_google_account@gmail.com",
+    "password": "your_google_account_password"
+}
+```
+### 1.2 Create A Google Cloud Project
+1. Navigate to [Google Cloud Project Creation](https://console.cloud.google.com/projectcreate) and create a new GCP project (see [Create a Google Cloud Project](https://developers.google.com/workspace/guides/create-project) for detailed steps)
+2. Go to the [Google Drive API console](https://console.cloud.google.com/apis/library/drive.googleapis.com?) and enable the Google Drive API for the created project (see [Enable and disable APIs](https://support.google.com/googleapi/answer/6158841?hl=en))
+<p align="center">
+  <img src="assets/creategcp.png" width="45%" style="margin-right: 5%;" alt="Create GCP">
+  <img src="assets/enableapi.png" width="45%" alt="Google Drive API">
+</p>
+### 1.3 Configure OAuth Consent Screen
+Go to [OAuth consent screen](https://console.cloud.google.com/apis/credentials/consent):
+1. Select **External** as the User Type and click **CREATE**
+<p align="center">
+  <img src="assets/usertype.png" width="80%" alt="External User Type">
+</p>
+2. Fill in the required fields:
+   - **App name**: Any name you prefer
+   - **User support email**: Your Google account email
+   - **Developer contact information**: Your Google account email
+   - Click **SAVE AND CONTINUE**
+<p align="center">
+  <img src="assets/appinfo.png" width="80%" alt="App Information">
+</p>
+3. Add scopes:
+   - Click **ADD OR REMOVE SCOPES**
+   - Filter and select: `https://www.googleapis.com/auth/drive`
+   - Click **UPDATE** and **SAVE AND CONTINUE**
+<p align="center">
+  <img src="assets/addscope.png" width="80%" alt="Add Scopes">
+</p>
+4. Add test users:
+   - Click **ADD USERS**
+   - Add your Google account email
+   - Click **SAVE AND CONTINUE**
+<p align="center">
+  <img src="assets/testusers.png" width="80%" alt="Add Test Users">
+</p>
+### 1.4 Create OAuth2.0 Credentials
+1. Go to [Credentials](https://console.cloud.google.com/apis/credentials) page
+2. Click **CREATE CREDENTIALS** → **OAuth client ID**
+3. Select **Desktop app** as Application type
+4. Name it (e.g., "OSWorld Desktop Client")
+5. Click **CREATE**
+<p align="center">
+  <img src="assets/desktopapp.png" width="80%" alt="Create Credentials">
+</p>
+6. Download the JSON file and rename it to `credentials.json`
+7. Place it in `evaluation_examples/settings/google/`
+<p align="center">
+  <img src="assets/downloadjson.png" width="80%" alt="Download JSON">
+</p>
+### 1.5 Potential Issues
+#### Issue 1: Access Blocked During OAuth Flow
+**Symptom**: "Access blocked: OSWorld's request is invalid" error
+**Solution**: Ensure you've added your Google account as a test user in the OAuth consent screen configuration.
+#### Issue 2: Scope Not Granted
+**Symptom**: Application doesn't have necessary permissions
+**Solution**: Verify that `https://www.googleapis.com/auth/drive` scope is added in the OAuth consent screen.
+---
+## 2. Proxy Configuration
+If you're using OSWorld behind a firewall or need proxy configuration, follow these steps.
+### 2.1 Configure Proxy on Host Machine
+By default, proxy software usually listens only on localhost (`127.0.0.1`), which cannot be reached from the virtual machine. You need to make your proxy software listen on the VMware network card IP or on `0.0.0.0`.
+#### Find VM and Host IP Addresses
+After launching the VM:
+```bash
+# Run this command on host
+# Change ws to fusion if you use VMware Fusion
+vmrun -T ws getGuestIPAddress /path/to/vmx/file
+```
+**On Linux (Ubuntu)**:
+```bash
+ip a  # Check IP addresses of each network card
+```
+**On Windows**:
+```cmd
+ipconfig  # Check IP addresses of each network card
+```
+Look for the VMware network card (usually named `vmnetX` like `vmnet8`). Make sure to use an IP address within the same network segment as the VM.
+#### Configure Proxy Software
+Configure your proxy software to listen on the VMware network card IP:
+<p align="center">
+  <img src="assets/proxysetup.png" width="80%" alt="Proxy Setup">
+</p>
+#### Alternative: Port Forwarding
+If you cannot change the listening address, set up port forwarding.
+**On Linux (Ubuntu)**:
+```bash
+# Forward 192.168.108.1:1080 to 127.0.0.1:1080
+socat TCP-LISTEN:1080,bind=192.168.108.1,fork TCP:127.0.0.1:1080
+```
+**On Windows** (with admin privileges):
+```cmd
+netsh interface portproxy add v4tov4 listenport=1080 listenaddress=192.168.108.1 connectport=1080 connectaddress=127.0.0.1
+```
+### 2.2 Configure Proxy in Virtual Machine
+#### For VMware/VirtualBox
+1. Start the VM and log in
+2. Open terminal and edit proxy settings:
+```bash
+# Edit environment variables
+sudo nano /etc/environment
+```
+3. Add the following lines (replace with your host IP and port):
+```bash
+http_proxy="http://192.168.108.1:1080"
+https_proxy="http://192.168.108.1:1080"
+no_proxy="localhost,127.0.0.1"
+```
+4. For APT package manager:
+```bash
+sudo nano /etc/apt/apt.conf.d/proxy.conf
+```
+Add:
+```
+Acquire::http::Proxy "http://192.168.108.1:1080";
+Acquire::https::Proxy "http://192.168.108.1:1080";
+```
+5. Reboot the VM or reload environment:
+```bash
+source /etc/environment
+```
+#### For Docker
+When using the Docker provider, the proxy is configured through environment variables; the environment itself is created as usual:
+```python
+env = DesktopEnv(
+    provider_name="docker",
+    # ... other parameters
+)
+```
+Set environment variables before running:
+```bash
+export HTTP_PROXY=http://your-proxy:port
+export HTTPS_PROXY=http://your-proxy:port
+```
+### 2.3 Proxy for Specific Tasks (Recommended)
+OSWorld provides built-in proxy support using DataImpulse or similar services:
+1. Register at [DataImpulse](https://dataimpulse.com/)
+2. Purchase a US residential IP package (approximately $1 per 1GB)
+3. Configure credentials in `evaluation_examples/settings/proxy/dataimpulse.json`:
+```json
+[
+    {
+        "host": "gw.dataimpulse.com",
+        "port": 823,
+        "username": "your_username",
+        "password": "your_password",
+        "protocol": "http",
+        "provider": "dataimpulse",
+        "type": "residential",
+        "country": "US",
+        "note": "Dataimpulse Residential Proxy"
+    }
+]
+```
+OSWorld will automatically use the proxy for tasks that need it when `enable_proxy=True` is passed to `DesktopEnv`.
+---
+## 3. Public Evaluation Platform
+We provide an AWS-based platform for large-scale parallel evaluation of OSWorld tasks.
+### 3.1 Architecture Overview
+- **Host Instance**: Central controller that stores code, configurations, and manages task execution
+- **Client Instances**: Worker nodes automatically launched to perform tasks in parallel
+### 3.2 Platform Deployment
+#### Step 1: Launch the Host Instance
+1. Create an EC2 instance in AWS console
+2. **Instance type recommendations**:
+   - `t3.medium`: For < 5 parallel environments
+   - `t3.large`: For < 15 parallel environments
+   - `c4.8xlarge`: For 15+ parallel environments
+3. **AMI**: Ubuntu Server 24.04 LTS (HVM), SSD Volume Type
+4. **Storage**: At least 50GB
+5. **Security group**: Open port 8080 for monitor service
+6. **VPC**: Use default (note the VPC ID for later)
+#### Step 2: Connect to Host Instance
+1. Download the `.pem` key file when creating the instance
+2. Set permissions:
+   ```bash
+   chmod 400 <your_key_file_path>
+   ```
+3. Connect via SSH:
+   ```bash
+   ssh -i <your_key_path> ubuntu@<your_public_dns>
+   ```
+#### Step 3: Set Up Host Machine
+```bash
+# Clone OSWorld repository
+git clone https://github.com/xlang-ai/OSWorld
+cd OSWorld
+# Optional: Create Conda environment
+# conda create -n osworld python=3.10
+# conda activate osworld
+# Install dependencies
+pip install -r requirements.txt
+```
+#### Step 4: Configure AWS Client Machines
+##### Security Group Configuration
+Create a security group with the following rules:
+**Inbound Rules** (8 rules required):
+| Type       | Protocol | Port Range | Source         | Description                |
+|------------|----------|------------|----------------|----------------------------|
+| SSH        | TCP      | 22         | 0.0.0.0/0      | SSH access                 |
+| HTTP       | TCP      | 80         | 172.31.0.0/16  | HTTP traffic               |
+| Custom TCP | TCP      | 5000       | 172.31.0.0/16  | OSWorld backend service    |
+| Custom TCP | TCP      | 5910       | 0.0.0.0/0      | NoVNC visualization port   |
+| Custom TCP | TCP      | 8006       | 172.31.0.0/16  | VNC service port           |
+| Custom TCP | TCP      | 8080       | 172.31.0.0/16  | VLC service port           |
+| Custom TCP | TCP      | 8081       | 172.31.0.0/16  | Additional service port    |
+| Custom TCP | TCP      | 9222       | 172.31.0.0/16  | Chrome control port        |
+**Outbound Rules** (1 rule required):
+| Type        | Protocol | Port Range | Destination | Description                 |
+|-------------|----------|------------|-------------|----------------------------|
+| All traffic | All      | All        | 0.0.0.0/0   | Allow all outbound traffic |
+Record the `AWS_SECURITY_GROUP_ID`.
+##### VPC and Subnet Configuration
+1. Note the **VPC ID** and **Subnet ID** from your host instance
+2. Record the **Subnet ID** as `AWS_SUBNET_ID`
+##### AWS Access Keys
+1. Go to AWS Console → Security Credentials
+2. Create access key
+3. Record `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`
+### 3.3 Environment Setup
+#### Google Drive Integration (Optional)
+Follow [Section 1: Google Account Setup](#1-google-account-setup) above.
+**Note**: OSWorld includes 8 Google Drive tasks out of 369 total tasks. You can:
+- Complete setup for all 369 tasks, or
+- Skip Google Drive tasks and evaluate 361 tasks (officially supported)
+#### Set Environment Variables
+```bash
+# API Keys (if using)
+# export OPENAI_API_KEY="your_openai_api_key"
+# export ANTHROPIC_API_KEY="your_anthropic_api_key"
+# AWS Configuration
+export AWS_ACCESS_KEY_ID="your_access_key"
+export AWS_SECRET_ACCESS_KEY="your_secret_access_key"
+export AWS_REGION="us-east-1"  # or your preferred region
+export AWS_SECURITY_GROUP_ID="sg-xxxx"
+export AWS_SUBNET_ID="subnet-xxxx"
+```
+### 3.4 Running Evaluations
+```bash
+# Example: Run OpenAI CUA
+python scripts/python/run_multienv_openaicua.py \
+    --headless \
+    --observation_type screenshot \
+    --model computer-use-preview \
+    --result_dir ./results_operator \
+    --test_all_meta_path evaluation_examples/test_all.json \
+    --region us-east-1 \
+    --max_steps 50 \
+    --num_envs 5 \
+    --client_password osworld-public-evaluation
+# Example: Run Claude (via AWS Bedrock)
+python scripts/python/run_multienv_claude.py \
+    --headless \
+    --observation_type screenshot \
+    --action_space claude_computer_use \
+    --model claude-4-sonnet-20250514 \
+    --result_dir ./results_claude \
+    --test_all_meta_path evaluation_examples/test_all.json \
+    --max_steps 50 \
+    --num_envs 5 \
+    --provider_name aws \
+    --client_password osworld-public-evaluation
+```
+**Key Parameters**:
+- `--num_envs`: Number of parallel environments
+- `--max_steps`: Maximum steps per task
+- `--result_dir`: Output directory for results
+- `--test_all_meta_path`: Path to test set metadata
+- `--region`: AWS region
+### 3.5 Monitoring and Results
+#### Web Monitoring Tool
+```bash
+cd monitor
+pip install -r requirements.txt
+python main.py
+```
+Access at: `http://<host-public-ip>:8080`
+#### VNC Remote Desktop Access
+Access VMs via VNC at: `http://<client-public-ip>:5910/vnc.html`
+Default password: `osworld-public-evaluation`
+### 3.6 Submitting Results
+For leaderboard submission, contact:
+- tianbaoxiexxx@gmail.com
+- yuanmengqi732@gmail.com
+**Options**:
+1. **Self-reported**: Submit results with monitor data and trajectories
+2. **Verified**: Schedule a meeting to run your agent code on our infrastructure
+---
+## Additional Resources
+- [Main README](README.md) - Project overview and quick start
+- [Installation Guide](README.md#-installation) - Detailed installation instructions
+- [FAQ](README.md#-faq) - Frequently asked questions
+- [Scripts Documentation](scripts/README.md) - Information about run scripts
+## Support
+If you encounter issues or have questions:
+- Open an issue on [GitHub](https://github.com/xlang-ai/OSWorld/issues)
+- Join our [Discord](https://discord.gg/4Gnw7eTEZR)
+- Email the maintainers (see contact information above)

assets/authorization.png ADDED Viewed

Git LFS Details

SHA256: dd2c1e15672a7a473a3fe59e2d00fc2441c732cb391f8ca9bbd917d12f4eee16
Pointer size: 131 Bytes
Size of remote file: 821 kB

assets/creategcp.png ADDED Viewed

Git LFS Details

SHA256: 49ced4afcfcb7cbe6180777fdc43416c4beaadfab9977cbaa65f21d45cffcd31
Pointer size: 131 Bytes
Size of remote file: 192 kB

assets/desktopapp.png ADDED Viewed

Git LFS Details

SHA256: 03af749d338e64d4d5ec7db9913847adf9a3101f171114c12ced08911823fd2e
Pointer size: 131 Bytes
Size of remote file: 224 kB

assets/developer.png ADDED Viewed

Git LFS Details

SHA256: ab7292cfcc5b523a66ca55bcb4e39792f8800a7d708664b101805602be46d6f8
Pointer size: 131 Bytes
Size of remote file: 190 kB

assets/enableapi.png ADDED Viewed

Git LFS Details

SHA256: 6f6aed97e6c8df6f4856a14a5e28440ae77ed5cff606323bd4ae1273ee933dae
Pointer size: 131 Bytes
Size of remote file: 188 kB

assets/googleidentity.png ADDED Viewed

assets/googlephonecode.png ADDED Viewed

assets/googleshutoff.png ADDED Viewed

assets/netsetting1.png ADDED Viewed

assets/netsetting2.png ADDED Viewed

assets/netsetting3.png ADDED Viewed

assets/netsetting4.png ADDED Viewed

assets/oauth2.0.png ADDED Viewed

Git LFS Details

SHA256: 144f68b6e625d5712ff85910ce583d1cb593e51c7f17dcbcc56b8b9dd4083d35
Pointer size: 131 Bytes
Size of remote file: 151 kB

assets/oauthapp.png ADDED Viewed

Git LFS Details

SHA256: 44e1549a798924dcda2411e0060cc1a8e43dea3326b2d443f4886beec712db36
Pointer size: 131 Bytes
Size of remote file: 219 kB

assets/proxysetup-zh.png ADDED Viewed

Git LFS Details

SHA256: a66c94b1d2518c397485e54c30356830f4104a1a32320c784fa39e4b8fe215fb
Pointer size: 131 Bytes
Size of remote file: 110 kB

assets/proxysetup.png ADDED Viewed

Git LFS Details

SHA256: 7821f4eca280d08c64627834e2ef3cd3a8bdab9340daefd69aaa41cc632157d5
Pointer size: 131 Bytes
Size of remote file: 115 kB

assets/pubeval1.png ADDED Viewed

assets/pubeval2.png ADDED Viewed

Git LFS Details

SHA256: 156bb2cf3192c05eed6207b9530c5a6aba66fe3e569cd57f0eb4286d204c2aaa
Pointer size: 131 Bytes
Size of remote file: 178 kB

assets/pubeval3.png ADDED Viewed

Git LFS Details

SHA256: cc2693e712ddc548f588c206fb834658cce285c0fd85e9a772bcb5ebedb158fd
Pointer size: 131 Bytes
Size of remote file: 316 kB

assets/pubeval4.png ADDED Viewed

assets/pubeval5.png ADDED Viewed

assets/pubeval_gdrive_auth.jpg ADDED Viewed

Git LFS Details

SHA256: b6f4c435173be710c9a61625e27c8770379c2a9308cec192d5ad2d116dabe977
Pointer size: 131 Bytes
Size of remote file: 172 kB

assets/pubeval_monitor1.jpg ADDED Viewed

Git LFS Details

SHA256: 3fe75f55e0037b9ef90495e1878cb9609434afd9369038b283ef81b448a2b385
Pointer size: 132 Bytes
Size of remote file: 1.04 MB

assets/pubeval_monitor2.jpg ADDED Viewed

Git LFS Details

SHA256: eba2246d4094f2975553b4bdbc5f65f7b8312e3f7976b284b100771db5579125
Pointer size: 131 Bytes
Size of remote file: 755 kB

assets/pubeval_subnet.png ADDED Viewed

Git LFS Details

SHA256: eeabe4188dee3770f93458519358bb89c0321af0cdbc335892ad1ae8ca9609ac
Pointer size: 131 Bytes
Size of remote file: 467 kB

assets/publishapp.png ADDED Viewed

Git LFS Details

SHA256: af2186dac796b7c3715ba7398015a67f5c947cda07af7fcf40902f65797d8f05
Pointer size: 131 Bytes
Size of remote file: 128 kB

assets/testusers.png ADDED Viewed

Git LFS Details

SHA256: bb861503030f57f94a1e0b4ab751abee559196a9e138dde4638849b5c45c5b2e
Pointer size: 131 Bytes
Size of remote file: 210 kB

assets/unsafemode.png ADDED Viewed

Git LFS Details

SHA256: 02ed11a510b0869539dacd46f18bab82b3bf86695b4c720c29c322f4797bfc1a
Pointer size: 131 Bytes
Size of remote file: 856 kB

assets/usertype.png ADDED Viewed

Git LFS Details

SHA256: 6f84d8da769bb21ef587ed6e27fc704f6da60ee223319c38891bd3e4f572c4ee
Pointer size: 131 Bytes
Size of remote file: 307 kB

assets/winnetsetting1.png ADDED Viewed

Git LFS Details

SHA256: 60bb6e2d288e7a658b9be4ced22267c5711dc867665dcd32a8b26ec425056642
Pointer size: 131 Bytes
Size of remote file: 131 kB

assets/winnetsetting2.png ADDED Viewed

assets/winnetsetting3.png ADDED Viewed

Git LFS Details

SHA256: 05517faec92982d5a9bcf3628530861cb167c4780ccfec4f5c6cf4f1a3c327b8
Pointer size: 131 Bytes
Size of remote file: 244 kB

assets/winnetsetting4.png ADDED Viewed

Git LFS Details

SHA256: 533c4409a2e799e0583054de98305bcabce2da833cd857f7b8f38bcbd662cfa0
Pointer size: 131 Bytes
Size of remote file: 220 kB

desktop_env/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

desktop_env/actions.py ADDED Viewed

	@@ -0,0 +1,203 @@

+# Upper bounds used for the "range" of x/y pointer coordinates in ACTION_SPACE below.
+# The values are fixed here rather than read from the actual display (see the TODO).
+X_MAX = 1920  # TODO: get the screen resolution
+Y_MAX = 1080
+# Key names allowed as the "range" of the keyboard actions in ACTION_SPACE
+# (PRESS, KEY_DOWN, KEY_UP and HOTKEY): printable characters first, then named keys.
+# NOTE(review): this looks like pyautogui's list of valid key names — confirm before editing.
+KEYBOARD_KEYS = ['\t', '\n', '\r', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', 'accept', 'add', 'alt', 'altleft', 'altright', 'apps', 'backspace', 'browserback', 'browserfavorites', 'browserforward', 'browserhome', 'browserrefresh', 'browsersearch', 'browserstop', 'capslock', 'clear', 'convert', 'ctrl', 'ctrlleft', 'ctrlright', 'decimal', 'del', 'delete', 'divide', 'down', 'end', 'enter', 'esc', 'escape', 'execute', 'f1', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f2', 'f20', 'f21', 'f22', 'f23', 'f24', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'final', 'fn', 'hanguel', 'hangul', 'hanja', 'help', 'home', 'insert', 'junja', 'kana', 'kanji', 'launchapp1', 'launchapp2', 'launchmail', 'launchmediaselect', 'left', 'modechange', 'multiply', 'nexttrack', 'nonconvert', 'num0', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6', 'num7', 'num8', 'num9', 'numlock', 'pagedown', 'pageup', 'pause', 'pgdn', 'pgup', 'playpause', 'prevtrack', 'print', 'printscreen', 'prntscrn', 'prtsc', 'prtscr', 'return', 'right', 'scrolllock', 'select', 'separator', 'shift', 'shiftleft', 'shiftright', 'sleep', 'stop', 'subtract', 'tab', 'up', 'volumedown', 'volumemute', 'volumeup', 'win', 'winleft', 'winright', 'yen', 'command', 'option', 'optionleft', 'optionright']
+# Declarative description of every action an agent may emit.
+#
+# Each entry is a dict with:
+#   "action_type": the action's identifier string,
+#   "note":        a human-readable description of the action,
+#   "parameters":  (absent for WAIT / FAIL / DONE) a mapping from parameter name to
+#                  {"type": <Python type>, "range": <allowed values or bounds, or None>,
+#                   "optional": <bool>}.
+#
+# For float parameters "range" is a [low, high] pair built from X_MAX / Y_MAX; for
+# str / int parameters it is the list of allowed values; None means no range is given.
+ACTION_SPACE = [
+    # ---- Pointer actions ----
+    {
+        "action_type": "MOVE_TO",
+        "note": "move the cursor to the specified position",
+        "parameters": {
+            "x": {
+                "type": float,
+                "range": [0, X_MAX],
+                "optional": False,
+            },
+            "y": {
+                "type": float,
+                "range": [0, Y_MAX],
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "CLICK",
+        "note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
+        "parameters": {
+            "button": {
+                "type": str,
+                "range": ["left", "right", "middle"],
+                "optional": True,
+            },
+            "x": {
+                "type": float,
+                "range": [0, X_MAX],
+                "optional": True,
+            },
+            "y": {
+                "type": float,
+                "range": [0, Y_MAX],
+                "optional": True,
+            },
+            "num_clicks": {
+                "type": int,
+                "range": [1, 2, 3],
+                "optional": True,
+            },
+        }
+    },
+    {
+        "action_type": "MOUSE_DOWN",
+        "note": "press the left button if the button not specified, otherwise press the specified button",
+        "parameters": {
+            "button": {
+                "type": str,
+                "range": ["left", "right", "middle"],
+                "optional": True,
+            }
+        }
+    },
+    {
+        "action_type": "MOUSE_UP",
+        "note": "release the left button if the button not specified, otherwise release the specified button",
+        "parameters": {
+            "button": {
+                "type": str,
+                "range": ["left", "right", "middle"],
+                "optional": True,
+            }
+        }
+    },
+    {
+        "action_type": "RIGHT_CLICK",
+        "note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
+        "parameters": {
+            "x": {
+                "type": float,
+                "range": [0, X_MAX],
+                "optional": True,
+            },
+            "y": {
+                "type": float,
+                "range": [0, Y_MAX],
+                "optional": True,
+            }
+        }
+    },
+    {
+        "action_type": "DOUBLE_CLICK",
+        "note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
+        "parameters": {
+            "x": {
+                "type": float,
+                "range": [0, X_MAX],
+                "optional": True,
+            },
+            "y": {
+                "type": float,
+                "range": [0, Y_MAX],
+                "optional": True,
+            }
+        }
+    },
+    {
+        "action_type": "DRAG_TO",
+        "note": "drag the cursor to the specified position with the left button pressed",
+        "parameters": {
+            "x": {
+                "type": float,
+                "range": [0, X_MAX],
+                "optional": False,
+            },
+            "y": {
+                "type": float,
+                "range": [0, Y_MAX],
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "SCROLL",
+        "note": "scroll the mouse wheel up or down",
+        "parameters": {
+            "dx": {
+                "type": int,
+                "range": None,
+                "optional": False,
+            },
+            "dy": {
+                "type": int,
+                "range": None,
+                "optional": False,
+            }
+        }
+    },
+    # ---- Keyboard actions ----
+    {
+        "action_type": "TYPING",
+        "note": "type the specified text",
+        "parameters": {
+            "text": {
+                "type": str,
+                "range": None,
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "PRESS",
+        "note": "press the specified key and release it",
+        "parameters": {
+            "key": {
+                "type": str,
+                "range": KEYBOARD_KEYS,
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "KEY_DOWN",
+        "note": "press the specified key",
+        "parameters": {
+            "key": {
+                "type": str,
+                "range": KEYBOARD_KEYS,
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "KEY_UP",
+        "note": "release the specified key",
+        "parameters": {
+            "key": {
+                "type": str,
+                "range": KEYBOARD_KEYS,
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "HOTKEY",
+        "note": "press the specified key combination",
+        "parameters": {
+            "keys": {
+                "type": list,
+                # Unlike the single-key actions, the range here is KEYBOARD_KEYS wrapped in
+                # an outer list. NOTE(review): presumably this means "a list whose items come
+                # from KEYBOARD_KEYS" — confirm against whatever consumes "range".
+                "range": [KEYBOARD_KEYS],
+                "optional": False,
+            }
+        }
+    },
+    ############################################################################################################
+    # ---- Parameterless control actions (these entries carry no "parameters" key) ----
+    {
+        "action_type": "WAIT",
+        "note": "wait until the next action",
+    },
+    {
+        "action_type": "FAIL",
+        "note": "decide the task can not be performed",
+    },
+    {
+        "action_type": "DONE",
+        "note": "decide the task is done",
+    }
+]

desktop_env/controllers/__init__.py ADDED Viewed

File without changes

desktop_env/controllers/python.py ADDED Viewed

	@@ -0,0 +1,584 @@

+import json
+import logging
+import random
+from typing import Any, Dict, Optional
+import time
+import traceback
+import requests
+from desktop_env.actions import KEYBOARD_KEYS
+logger = logging.getLogger("desktopenv.pycontroller")
+class PythonController:
+    def __init__(self, vm_ip: str,
+                 server_port: int,
+                 pkgs_prefix: str = "import pyautogui; import time; pyautogui.FAILSAFE = False; {command}"):
+        self.vm_ip = vm_ip
+        self.http_server = f"http://{vm_ip}:{server_port}"
+        self.pkgs_prefix = pkgs_prefix  # fixme: this is a hacky way to execute python commands. fix it and combine it with installation of packages
+        self.retry_times = 3
+        self.retry_interval = 5
+    @staticmethod
+    def _is_valid_image_response(content_type: str, data: Optional[bytes]) -> bool:
+        """Quick validation for PNG/JPEG payload using magic bytes; Content-Type is advisory.
+        Returns True only when bytes look like a real PNG or JPEG.
+        """
+        if not isinstance(data, (bytes, bytearray)) or not data:
+            return False
+        # PNG magic
+        if len(data) >= 8 and data[:8] == b"\x89PNG\r\n\x1a\n":
+            return True
+        # JPEG magic
+        if len(data) >= 3 and data[:3] == b"\xff\xd8\xff":
+            return True
+        # If server explicitly marks as image, accept as a weak fallback (some environments strip magic)
+        if content_type and ("image/png" in content_type or "image/jpeg" in content_type or "image/jpg" in content_type):
+            return True
+        return False
+    def get_screenshot(self) -> Optional[bytes]:
+        """
+        Gets a screenshot from the server. With the cursor. None -> no screenshot or unexpected error.
+        """
+        for attempt_idx in range(self.retry_times):
+            try:
+                response = requests.get(self.http_server + "/screenshot", timeout=10)
+                if response.status_code == 200:
+                    content_type = response.headers.get("Content-Type", "")
+                    content = response.content
+                    if self._is_valid_image_response(content_type, content):
+                        logger.info("Got screenshot successfully")
+                        return content
+                    else:
+                        logger.error("Invalid screenshot payload (attempt %d/%d).", attempt_idx + 1, self.retry_times)
+                        logger.info("Retrying to get screenshot.")
+                else:
+                    logger.error("Failed to get screenshot. Status code: %d", response.status_code)
+                    logger.info("Retrying to get screenshot.")
+            except Exception as e:
+                logger.error("An error occurred while trying to get the screenshot: %s", e)
+                logger.info("Retrying to get screenshot.")
+            time.sleep(self.retry_interval)
+        logger.error("Failed to get screenshot.")
+        return None
+    def get_accessibility_tree(self) -> Optional[str]:
+        """
+        Gets the accessibility tree from the server. None -> no accessibility tree or unexpected error.
+        """
+        for _ in range(self.retry_times):
+            try:
+                response: requests.Response = requests.get(self.http_server + "/accessibility")
+                if response.status_code == 200:
+                    logger.info("Got accessibility tree successfully")
+                    return response.json()["AT"]
+                else:
+                    logger.error("Failed to get accessibility tree. Status code: %d", response.status_code)
+                    logger.info("Retrying to get accessibility tree.")
+            except Exception as e:
+                logger.error("An error occurred while trying to get the accessibility tree: %s", e)
+                logger.info("Retrying to get accessibility tree.")
+            time.sleep(self.retry_interval)
+        logger.error("Failed to get accessibility tree.")
+        return None
+    def get_terminal_output(self) -> Optional[str]:
+        """
+        Gets the terminal output from the server. None -> no terminal output or unexpected error.
+        """
+        for _ in range(self.retry_times):
+            try:
+                response = requests.get(self.http_server + "/terminal")
+                if response.status_code == 200:
+                    logger.info("Got terminal output successfully")
+                    return response.json()["output"]
+                else:
+                    logger.error("Failed to get terminal output. Status code: %d", response.status_code)
+                    logger.info("Retrying to get terminal output.")
+            except Exception as e:
+                logger.error("An error occurred while trying to get the terminal output: %s", e)
+                logger.info("Retrying to get terminal output.")
+            time.sleep(self.retry_interval)
+        logger.error("Failed to get terminal output.")
+        return None
+    def get_file(self, file_path: str) -> Optional[bytes]:
+        """
+        Gets a file from the server.
+        """
+        for _ in range(self.retry_times):
+            try:
+                response = requests.post(self.http_server + "/file", data={"file_path": file_path})
+                if response.status_code == 200:
+                    logger.info("File downloaded successfully")
+                    return response.content
+                else:
+                    logger.error("Failed to get file. Status code: %d", response.status_code)
+                    logger.info("Retrying to get file.")
+            except Exception as e:
+                logger.error("An error occurred while trying to get the file: %s", e)
+                logger.info("Retrying to get file.")
+            time.sleep(self.retry_interval)
+        logger.error("Failed to get file.")
+        return None
+    def execute_python_command(self, command: str) -> None:
+        """
+        Executes a python command on the server.
+        It can be used to execute the pyautogui commands, or... any other python command. who knows?
+        """
+        # command_list = ["python", "-c", self.pkgs_prefix.format(command=command)]
+        command_list = ["python", "-c", self.pkgs_prefix.format(command=command)]
+        payload = json.dumps({"command": command_list, "shell": False})
+        for _ in range(self.retry_times):
+            try:
+                response = requests.post(self.http_server + "/execute", headers={'Content-Type': 'application/json'},
+                                         data=payload, timeout=90)
+                if response.status_code == 200:
+                    logger.info("Command executed successfully: %s", response.text)
+                    return response.json()
+                else:
+                    logger.error("Failed to execute command. Status code: %d", response.status_code)
+                    logger.info("Retrying to execute command.")
+            except requests.exceptions.ReadTimeout:
+                break
+            except Exception as e:
+                logger.error("An error occurred while trying to execute the command: %s", e)
+                logger.info("Retrying to execute command.")
+            time.sleep(self.retry_interval)
+        logger.error("Failed to execute command.")
+        return None
+    def run_python_script(self, script: str) -> Optional[Dict[str, Any]]:
+        """
+        Executes a python script on the server.
+        """
+        payload = json.dumps({"code": script})
+        for _ in range(self.retry_times):
+            try:
+                response = requests.post(self.http_server + "/run_python", headers={'Content-Type': 'application/json'},
+                                         data=payload, timeout=90)
+                if response.status_code == 200:
+                    return response.json()
+                else:
+                    return {"status": "error", "message": "Failed to execute command.", "output": None, "error": response.json()["error"]}
+            except requests.exceptions.ReadTimeout:
+                break
+            except Exception:
+                logger.error("An error occurred while trying to execute the command: %s", traceback.format_exc())
+                logger.info("Retrying to execute command.")
+            time.sleep(self.retry_interval)
+        logger.error("Failed to execute command.")
+        return {"status": "error", "message": "Failed to execute command.", "output": "", "error": "Retry limit reached."}
+    def run_bash_script(self, script: str, timeout: int = 30, working_dir: Optional[str] = None) -> Optional[Dict[str, Any]]:
+        """
+        Executes a bash script on the server.
+        :param script: The bash script content (can be multi-line)
+        :param timeout: Execution timeout in seconds (default: 30)
+        :param working_dir: Working directory for script execution (optional)
+        :return: Dictionary with status, output, error, and returncode, or None if failed
+        """
+        payload = json.dumps({
+            "script": script,
+            "timeout": timeout,
+            "working_dir": working_dir
+        })
+        for _ in range(self.retry_times):
+            try:
+                response = requests.post(
+                    self.http_server + "/run_bash_script",
+                    headers={'Content-Type': 'application/json'},
+                    data=payload,
+                    timeout=timeout + 100  # Add buffer to HTTP timeout
+                )
+                if response.status_code == 200:
+                    result = response.json()
+                    logger.info("Bash script executed successfully with return code: %d", result.get("returncode", -1))
+                    return result
+                else:
+                    logger.error("Failed to execute bash script. Status code: %d, response: %s",
+                                response.status_code, response.text)
+                    logger.info("Retrying to execute bash script.")
+            except requests.exceptions.ReadTimeout:
+                logger.error("Bash script execution timed out")
+                return {
+                    "status": "error",
+                    "output": "",
+                    "error": f"Script execution timed out after {timeout} seconds",
+                    "returncode": -1
+                }
+            except Exception as e:
+                logger.error("An error occurred while trying to execute the bash script: %s", e)
+                logger.info("Retrying to execute bash script.")
+            time.sleep(self.retry_interval)
+        logger.error("Failed to execute bash script after %d retries.", self.retry_times)
+        return {
+            "status": "error",
+            "output": "",
+            "error": f"Failed to execute bash script after {self.retry_times} retries",
+            "returncode": -1
+        }
+    def execute_action(self, action):
+        """
+        Executes an action on the server computer.
+        """
+        # Handle string actions
+        if action in ['WAIT', 'FAIL', 'DONE']:
+            return
+        # Handle dictionary actions
+        if type(action) == dict and action.get('action_type') in ['WAIT', 'FAIL', 'DONE']:
+            return
+        action_type = action["action_type"]
+        parameters = action["parameters"] if "parameters" in action else {param: action[param] for param in action if param != 'action_type'}
+        move_mode = random.choice(
+            ["pyautogui.easeInQuad", "pyautogui.easeOutQuad", "pyautogui.easeInOutQuad", "pyautogui.easeInBounce",
+             "pyautogui.easeInElastic"])
+        duration = random.uniform(0.5, 1)
+        if action_type == "MOVE_TO":
+            if parameters == {} or None:
+                self.execute_python_command("pyautogui.moveTo()")
+            elif "x" in parameters and "y" in parameters:
+                x = parameters["x"]
+                y = parameters["y"]
+                self.execute_python_command(f"pyautogui.moveTo({x}, {y}, {duration}, {move_mode})")
+            else:
+                raise Exception(f"Unknown parameters: {parameters}")
+        elif action_type == "CLICK":
+            if parameters == {} or None:
+                self.execute_python_command("pyautogui.click()")
+            elif "button" in parameters and "x" in parameters and "y" in parameters:
+                button = parameters["button"]
+                x = parameters["x"]
+                y = parameters["y"]
+                if "num_clicks" in parameters:
+                    num_clicks = parameters["num_clicks"]
+                    self.execute_python_command(
+                        f"pyautogui.click(button='{button}', x={x}, y={y}, clicks={num_clicks})")
+                else:
+                    self.execute_python_command(f"pyautogui.click(button='{button}', x={x}, y={y})")
+            elif "button" in parameters and "x" not in parameters and "y" not in parameters:
+                button = parameters["button"]
+                if "num_clicks" in parameters:
+                    num_clicks = parameters["num_clicks"]
+                    self.execute_python_command(f"pyautogui.click(button='{button}', clicks={num_clicks})")
+                else:
+                    self.execute_python_command(f"pyautogui.click(button='{button}')")
+            elif "button" not in parameters and "x" in parameters and "y" in parameters:
+                x = parameters["x"]
+                y = parameters["y"]
+                if "num_clicks" in parameters:
+                    num_clicks = parameters["num_clicks"]
+                    self.execute_python_command(f"pyautogui.click(x={x}, y={y}, clicks={num_clicks})")
+                else:
+                    self.execute_python_command(f"pyautogui.click(x={x}, y={y})")
+            else:
+                raise Exception(f"Unknown parameters: {parameters}")
+        elif action_type == "MOUSE_DOWN":
+            if parameters == {} or None:
+                self.execute_python_command("pyautogui.mouseDown()")
+            elif "button" in parameters:
+                button = parameters["button"]
+                self.execute_python_command(f"pyautogui.mouseDown(button='{button}')")
+            else:
+                raise Exception(f"Unknown parameters: {parameters}")
+        elif action_type == "MOUSE_UP":
+            if parameters == {} or None:
+                self.execute_python_command("pyautogui.mouseUp()")
+            elif "button" in parameters:
+                button = parameters["button"]
+                self.execute_python_command(f"pyautogui.mouseUp(button='{button}')")
+            else:
+                raise Exception(f"Unknown parameters: {parameters}")
+        elif action_type == "RIGHT_CLICK":
+            if parameters == {} or None:
+                self.execute_python_command("pyautogui.rightClick()")
+            elif "x" in parameters and "y" in parameters:
+                x = parameters["x"]
+                y = parameters["y"]
+                self.execute_python_command(f"pyautogui.rightClick(x={x}, y={y})")
+            else:
+                raise Exception(f"Unknown parameters: {parameters}")
+        elif action_type == "DOUBLE_CLICK":
+            if parameters == {} or None:
+                self.execute_python_command("pyautogui.doubleClick()")
+            elif "x" in parameters and "y" in parameters:
+                x = parameters["x"]
+                y = parameters["y"]
+                self.execute_python_command(f"pyautogui.doubleClick(x={x}, y={y})")
+            else:
+                raise Exception(f"Unknown parameters: {parameters}")
+        elif action_type == "DRAG_TO":
+            if "x" in parameters and "y" in parameters:
+                x = parameters["x"]
+                y = parameters["y"]
+                self.execute_python_command(
+                    f"pyautogui.dragTo({x}, {y}, duration=1.0, button='left', mouseDownUp=True)")
+        elif action_type == "SCROLL":
+            # todo: check if it is related to the operating system, as https://github.com/TheDuckAI/DuckTrack/blob/main/ducktrack/playback.py pointed out
+            if "dx" in parameters and "dy" in parameters:
+                dx = parameters["dx"]
+                dy = parameters["dy"]
+                self.execute_python_command(f"pyautogui.hscroll({dx})")
+                self.execute_python_command(f"pyautogui.vscroll({dy})")
+            elif "dx" in parameters and "dy" not in parameters:
+                dx = parameters["dx"]
+                self.execute_python_command(f"pyautogui.hscroll({dx})")
+            elif "dx" not in parameters and "dy" in parameters:
+                dy = parameters["dy"]
+                self.execute_python_command(f"pyautogui.vscroll({dy})")
+            else:
+                raise Exception(f"Unknown parameters: {parameters}")
+        elif action_type == "TYPING":
+            if "text" not in parameters:
+                raise Exception(f"Unknown parameters: {parameters}")
+            # deal with special ' and \ characters
+            # text = parameters["text"].replace("\\", "\\\\").replace("'", "\\'")
+            # self.execute_python_command(f"pyautogui.typewrite('{text}')")
+            text = parameters["text"]
+            self.execute_python_command("pyautogui.typewrite({:})".format(repr(text)))
+        elif action_type == "PRESS":
+            if "key" not in parameters:
+                raise Exception(f"Unknown parameters: {parameters}")
+            key = parameters["key"]
+            if key.lower() not in KEYBOARD_KEYS:
+                raise Exception(f"Key must be one of {KEYBOARD_KEYS}")
+            self.execute_python_command(f"pyautogui.press('{key}')")
+        elif action_type == "KEY_DOWN":
+            if "key" not in parameters:
+                raise Exception(f"Unknown parameters: {parameters}")
+            key = parameters["key"]
+            if key.lower() not in KEYBOARD_KEYS:
+                raise Exception(f"Key must be one of {KEYBOARD_KEYS}")
+            self.execute_python_command(f"pyautogui.keyDown('{key}')")
+        elif action_type == "KEY_UP":
+            if "key" not in parameters:
+                raise Exception(f"Unknown parameters: {parameters}")
+            key = parameters["key"]
+            if key.lower() not in KEYBOARD_KEYS:
+                raise Exception(f"Key must be one of {KEYBOARD_KEYS}")
+            self.execute_python_command(f"pyautogui.keyUp('{key}')")
+        elif action_type == "HOTKEY":
+            if "keys" not in parameters:
+                raise Exception(f"Unknown parameters: {parameters}")
+            keys = parameters["keys"]
+            if not isinstance(keys, list):
+                raise Exception("Keys must be a list of keys")
+            for key in keys:
+                if key.lower() not in KEYBOARD_KEYS:
+                    raise Exception(f"Key must be one of {KEYBOARD_KEYS}")
+            keys_para_rep = "', '".join(keys)
+            self.execute_python_command(f"pyautogui.hotkey('{keys_para_rep}')")
+        elif action_type in ['WAIT', 'FAIL', 'DONE']:
+            pass
+        else:
+            raise Exception(f"Unknown action type: {action_type}")
+    # Record video
+    def start_recording(self):
+        """
+        Starts recording the screen.
+        """
+        for _ in range(self.retry_times):
+            try:
+                response = requests.post(self.http_server + "/start_recording")
+                if response.status_code == 200:
+                    logger.info("Recording started successfully")
+                    return
+                else:
+                    logger.error("Failed to start recording. Status code: %d", response.status_code)
+                    logger.info("Retrying to start recording.")
+            except Exception as e:
+                logger.error("An error occurred while trying to start recording: %s", e)
+                logger.info("Retrying to start recording.")
+            time.sleep(self.retry_interval)
+        logger.error("Failed to start recording.")
+    def end_recording(self, dest: str):
+        """
+        Ends recording the screen.
+        """
+        for _ in range(self.retry_times):
+            try:
+                response = requests.post(self.http_server + "/end_recording")
+                if response.status_code == 200:
+                    logger.info("Recording stopped successfully")
+                    with open(dest, 'wb') as f:
+                        for chunk in response.iter_content(chunk_size=8192):
+                            if chunk:
+                                f.write(chunk)
+                    return
+                else:
+                    logger.error("Failed to stop recording. Status code: %d", response.status_code)
+                    logger.info("Retrying to stop recording.")
+            except Exception as e:
+                logger.error("An error occurred while trying to stop recording: %s", e)
+                logger.info("Retrying to stop recording.")
+            time.sleep(self.retry_interval)
+        logger.error("Failed to stop recording.")
+    # Additional info
+    def get_vm_platform(self):
+        """
+        Gets the size of the vm screen.
+        """
+        return self.execute_python_command("import platform; print(platform.system())")['output'].strip()
+    def get_vm_machine(self):
+        """
+        Gets the machine of the vm.
+        """
+        return self.execute_python_command("import platform; print(platform.machine())")['output'].strip()
+    def get_vm_screen_size(self):
+        """
+        Gets the size of the vm screen.
+        """
+        for _ in range(self.retry_times):
+            try:
+                response = requests.post(self.http_server + "/screen_size")
+                if response.status_code == 200:
+                    logger.info("Got screen size successfully")
+                    return response.json()
+                else:
+                    logger.error("Failed to get screen size. Status code: %d", response.status_code)
+                    logger.info("Retrying to get screen size.")
+            except Exception as e:
+                logger.error("An error occurred while trying to get the screen size: %s", e)
+                logger.info("Retrying to get screen size.")
+            time.sleep(self.retry_interval)
+        logger.error("Failed to get screen size.")
+        return None
+    def get_vm_window_size(self, app_class_name: str):
+        """
+        Gets the size of the vm app window.
+        """
+        for _ in range(self.retry_times):
+            try:
+                response = requests.post(self.http_server + "/window_size", data={"app_class_name": app_class_name})
+                if response.status_code == 200:
+                    logger.info("Got window size successfully")
+                    return response.json()
+                else:
+                    logger.error("Failed to get window size. Status code: %d", response.status_code)
+                    logger.info("Retrying to get window size.")
+            except Exception as e:
+                logger.error("An error occurred while trying to get the window size: %s", e)
+                logger.info("Retrying to get window size.")
+            time.sleep(self.retry_interval)
+        logger.error("Failed to get window size.")
+        return None
+    def get_vm_wallpaper(self):
+        """
+        Gets the wallpaper of the vm.
+        """
+        for _ in range(self.retry_times):
+            try:
+                response = requests.post(self.http_server + "/wallpaper")
+                if response.status_code == 200:
+                    logger.info("Got wallpaper successfully")
+                    return response.content
+                else:
+                    logger.error("Failed to get wallpaper. Status code: %d", response.status_code)
+                    logger.info("Retrying to get wallpaper.")
+            except Exception as e:
+                logger.error("An error occurred while trying to get the wallpaper: %s", e)
+                logger.info("Retrying to get wallpaper.")
+            time.sleep(self.retry_interval)
+        logger.error("Failed to get wallpaper.")
+        return None
+    def get_vm_desktop_path(self) -> Optional[str]:
+        """
+        Gets the desktop path of the vm.
+        """
+        for _ in range(self.retry_times):
+            try:
+                response = requests.post(self.http_server + "/desktop_path")
+                if response.status_code == 200:
+                    logger.info("Got desktop path successfully")
+                    return response.json()["desktop_path"]
+                else:
+                    logger.error("Failed to get desktop path. Status code: %d", response.status_code)
+                    logger.info("Retrying to get desktop path.")
+            except Exception as e:
+                logger.error("An error occurred while trying to get the desktop path: %s", e)
+                logger.info("Retrying to get desktop path.")
+            time.sleep(self.retry_interval)
+        logger.error("Failed to get desktop path.")
+        return None
+    def get_vm_directory_tree(self, path) -> Optional[Dict[str, Any]]:
+        """
+        Gets the directory tree of the vm.
+        """
+        payload = json.dumps({"path": path})
+        for _ in range(self.retry_times):
+            try:
+                response = requests.post(self.http_server + "/list_directory", headers={'Content-Type': 'application/json'}, data=payload)
+                if response.status_code == 200:
+                    logger.info("Got directory tree successfully")
+                    return response.json()["directory_tree"]
+                else:
+                    logger.error("Failed to get directory tree. Status code: %d", response.status_code)
+                    logger.info("Retrying to get directory tree.")
+            except Exception as e:
+                logger.error("An error occurred while trying to get directory tree: %s", e)
+                logger.info("Retrying to get directory tree.")
+            time.sleep(self.retry_interval)
+        logger.error("Failed to get directory tree.")
+        return None

desktop_env/controllers/setup.py ADDED Viewed

	@@ -0,0 +1,920 @@

+import json
+import logging
+import os
+import os.path
+import platform
+import shutil
+import sqlite3
+import tempfile
+import time
+import traceback
+import uuid
+from datetime import datetime, timedelta
+from typing import Any, Union, Optional
+from typing import Dict, List
+import requests
+from playwright.sync_api import sync_playwright, TimeoutError
+from pydrive.auth import GoogleAuth
+from pydrive.drive import GoogleDrive, GoogleDriveFile, GoogleDriveFileList
+from requests_toolbelt.multipart.encoder import MultipartEncoder
+from desktop_env.controllers.python import PythonController
+from desktop_env.evaluators.metrics.utils import compare_urls
+from desktop_env.providers.aws.proxy_pool import get_global_proxy_pool, init_proxy_pool, ProxyInfo
+import dotenv
+# Load environment variables from .env file
+dotenv.load_dotenv()
+# Path of the proxy configuration; the PROXY_CONFIG_FILE environment variable overrides the default below.
+PROXY_CONFIG_FILE = os.getenv("PROXY_CONFIG_FILE", "evaluation_examples/settings/proxy/dataimpulse.json")  # Default proxy config file
+# Module-level logger used by the setup controller.
+logger = logging.getLogger("desktopenv.setup")
+# Absolute path of the directory containing this module.
+FILE_PATH = os.path.dirname(os.path.abspath(__file__))
+# NOTE: this call runs at import time, so importing the module initializes the proxy pool as a side effect.
+init_proxy_pool(PROXY_CONFIG_FILE)  # initialize the global proxy pool
+# Upper bound on the connection attempts SetupController.setup() makes before it returns False.
+MAX_RETRIES = 20
+class SetupController:
+    def __init__(self, vm_ip: str, server_port: int = 5000, chromium_port: int = 9222, vlc_port: int = 8080, cache_dir: str = "cache", client_password: str = "", screen_width: int = 1920, screen_height: int = 1080):
+        self.vm_ip: str = vm_ip
+        self.server_port: int = server_port
+        self.chromium_port: int = chromium_port
+        self.vlc_port: int = vlc_port
+        self.http_server: str = f"http://{vm_ip}:{server_port}"
+        self.http_server_setup_root: str = f"http://{vm_ip}:{server_port}/setup"
+        self.cache_dir: str = cache_dir
+        self.use_proxy: bool = False
+        self.client_password: str = client_password
+        self.screen_width: int = screen_width
+        self.screen_height: int = screen_height
+    def reset_cache_dir(self, cache_dir: str):
+        """Replace the directory used for cached downloads with ``cache_dir``."""
+        self.cache_dir = cache_dir
+    def setup(self, config: List[Dict[str, Any]], use_proxy: bool = False)-> bool:
+        """
+        Args:
+            config (List[Dict[str, Any]]): list of dict like {str: Any}. each
+              config dict has the structure like
+                {
+                    "type": str, corresponding to the `_{:}_setup` methods of
+                      this class
+                    "parameters": dict like {str, Any} providing the keyword
+                      parameters
+                }
+        """
+        self.use_proxy = use_proxy
+        # make sure connection can be established
+        logger.info(f"try to connect {self.http_server}")
+        retry = 0
+        while retry < MAX_RETRIES:
+            try:
+                _ = requests.get(self.http_server + "/terminal")
+                break
+            except:
+                time.sleep(5)
+                retry += 1
+                logger.info(f"retry: {retry}/{MAX_RETRIES}")
+            if retry == MAX_RETRIES:
+                return False
+        for i, cfg in enumerate(config):
+            config_type: str = cfg["type"]
+            parameters: Dict[str, Any] = cfg["parameters"]
+            # Assumes all the setup the functions should follow this name
+            # protocol
+            setup_function: str = "_{:}_setup".format(config_type)
+            assert hasattr(self, setup_function), f'Setup controller cannot find init function {setup_function}'
+            try:
+                logger.info(f"Executing setup step {i+1}/{len(config)}: {setup_function}")
+                logger.debug(f"Setup parameters: {parameters}")
+                getattr(self, setup_function)(**parameters)
+                logger.info(f"SETUP COMPLETED: {setup_function}({str(parameters)})")
+            except Exception as e:
+                logger.error(f"SETUP FAILED at step {i+1}/{len(config)}: {setup_function}({str(parameters)})")
+                logger.error(f"Error details: {e}")
+                logger.error(f"Traceback: {traceback.format_exc()}")
+                raise Exception(f"Setup step {i+1} failed: {setup_function} - {e}") from e
+        return True
+    def _download_setup(self, files: List[Dict[str, str]]):
+        """
+        Args:
+            files (List[Dict[str, str]]): files to download. lisf of dict like
+              {
+                "url": str, the url to download
+                "path": str, the path on the VM to store the downloaded file
+              }
+        """
+        for f in files:
+            url: str = f["url"]
+            path: str = f["path"]
+            cache_path: str = os.path.join(self.cache_dir, "{:}_{:}".format(
+                uuid.uuid5(uuid.NAMESPACE_URL, url),
+                os.path.basename(path)))
+            if not url or not path:
+                raise Exception(f"Setup Download - Invalid URL ({url}) or path ({path}).")
+            if not os.path.exists(cache_path):
+                logger.info(f"Cache file not found, downloading from {url} to {cache_path}")
+                max_retries = 3
+                downloaded = False
+                e = None
+                for i in range(max_retries):
+                    try:
+                        logger.info(f"Download attempt {i+1}/{max_retries} for {url}")
+                        response = requests.get(url, stream=True, timeout=300)  # Add 5 minute timeout
+                        response.raise_for_status()
+                        # Get file size if available
+                        total_size = int(response.headers.get('content-length', 0))
+                        if total_size > 0:
+                            logger.info(f"File size: {total_size / (1024*1024):.2f} MB")
+                        downloaded_size = 0
+                        with open(cache_path, 'wb') as f:
+                            for chunk in response.iter_content(chunk_size=8192):
+                                if chunk:
+                                    f.write(chunk)
+                                    downloaded_size += len(chunk)
+                                    if total_size > 0 and downloaded_size % (1024*1024) == 0:  # Log every MB
+                                        progress = (downloaded_size / total_size) * 100
+                                        logger.info(f"Download progress: {progress:.1f}%")
+                        logger.info(f"File downloaded successfully to {cache_path} ({downloaded_size / (1024*1024):.2f} MB)")
+                        downloaded = True
+                        break
+                    except requests.RequestException as e:
+                        logger.error(
+                            f"Failed to download {url} caused by {e}. Retrying... ({max_retries - i - 1} attempts left)")
+                        # Clean up partial download
+                        if os.path.exists(cache_path):
+                            os.remove(cache_path)
+                if not downloaded:
+                    raise requests.RequestException(f"Failed to download {url}. No retries left.")
+            form = MultipartEncoder({
+                "file_path": path,
+                "file_data": (os.path.basename(path), open(cache_path, "rb"))
+            })
+            headers = {"Content-Type": form.content_type}
+            logger.debug(form.content_type)
+            # send request to server to upload file
+            try:
+                logger.info(f"Uploading {os.path.basename(path)} to VM at {path}")
+                logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload")
+                response = requests.post(self.http_server + "/setup" + "/upload", headers=headers, data=form, timeout=600)  # 10 minute timeout for upload
+                if response.status_code == 200:
+                    logger.info(f"File uploaded successfully: {path}")
+                    logger.debug("Upload response: %s", response.text)
+                else:
+                    logger.error(f"Failed to upload file {path}. Status code: {response.status_code}, Response: {response.text}")
+                    raise requests.RequestException(f"Upload failed with status {response.status_code}")
+            except requests.exceptions.RequestException as e:
+                logger.error(f"An error occurred while trying to upload {path}: {e}")
+                raise
+    def _upload_file_setup(self, files: List[Dict[str, str]]):
+        """
+        Args:
+            files (List[Dict[str, str]]): files to download. lisf of dict like
+              {
+                "local_path": str, the local path to the file to upload
+                "path": str, the path on the VM to store the downloaded file
+              }
+        """
+        for f in files:
+            local_path: str = f["local_path"]
+            path: str = f["path"]
+            if not os.path.exists(local_path):
+                raise Exception(f"Setup Upload - Invalid local path ({local_path}).")
+            file_size = None
+            try:
+                file_size = os.path.getsize(local_path)
+            except Exception:
+                pass
+            max_retries = 3
+            last_error: Optional[Exception] = None
+            for attempt in range(max_retries):
+                try:
+                    logger.info(
+                        f"Uploading {os.path.basename(local_path)}{f' ({file_size} bytes)' if file_size is not None else ''} "
+                        f"to VM at {path} (attempt {attempt + 1}/{max_retries})"
+                    )
+                    logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload")
+                    # Open the file inside each attempt to ensure fresh stream position
+                    with open(local_path, "rb") as fp:
+                        form = MultipartEncoder({
+                            "file_path": path,
+                            "file_data": (os.path.basename(path), fp)
+                        })
+                        headers = {"Content-Type": form.content_type}
+                        logger.debug(form.content_type)
+                        # Explicit connect/read timeout to avoid hanging forever
+                        response = requests.post(
+                            self.http_server + "/setup" + "/upload",
+                            headers=headers,
+                            data=form,
+                            timeout=(10, 600)
+                        )
+                        if response.status_code == 200:
+                            logger.info(f"File uploaded successfully: {path}")
+                            logger.debug("Upload response: %s", response.text)
+                            last_error = None
+                            break
+                        else:
+                            msg = f"Failed to upload file {path}. Status code: {response.status_code}, Response: {response.text}"
+                            logger.error(msg)
+                            last_error = requests.RequestException(msg)
+                except requests.exceptions.RequestException as e:
+                    last_error = e
+                    logger.error(f"Upload attempt {attempt + 1} failed for {path}: {e}")
+                # Exponential backoff between retries
+                if attempt < max_retries - 1:
+                    time.sleep(2 ** attempt)
+            if last_error is not None:
+                raise last_error
+    def _change_wallpaper_setup(self, path: str):
+        if not path:
+            raise Exception(f"Setup Wallpaper - Invalid path ({path}).")
+        payload = json.dumps({"path": path})
+        headers = {
+            'Content-Type': 'application/json'
+        }
+        # send request to server to change wallpaper
+        try:
+            response = requests.post(self.http_server + "/setup" + "/change_wallpaper", headers=headers, data=payload)
+            if response.status_code == 200:
+                logger.info("Command executed successfully: %s", response.text)
+            else:
+                logger.error("Failed to change wallpaper. Status code: %s", response.text)
+        except requests.exceptions.RequestException as e:
+            logger.error("An error occurred while trying to send the request: %s", e)
+    def _tidy_desktop_setup(self, **config):
+        """Placeholder for a desktop-tidying setup step; not implemented yet.
+
+        Raises:
+            NotImplementedError: always.
+        """
+        raise NotImplementedError()
+    def _open_setup(self, path: str):
+        if not path:
+            raise Exception(f"Setup Open - Invalid path ({path}).")
+        payload = json.dumps({"path": path})
+        headers = {
+            'Content-Type': 'application/json'
+        }
+        # send request to server to open file
+        try:
+            # The server-side call is now blocking and can take time.
+            # We set a timeout that is slightly longer than the server's timeout (1800s).
+            response = requests.post(self.http_server + "/setup" + "/open_file", headers=headers, data=payload, timeout=1810)
+            response.raise_for_status()  # This will raise an exception for 4xx and 5xx status codes
+            logger.info("Command executed successfully: %s", response.text)
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}")
+            raise Exception(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}") from e
+    def _launch_setup(self, command: Union[str, List[str]], shell: bool = False):
+        if not command:
+            raise Exception("Empty command to launch.")
+        if not shell and isinstance(command, str) and len(command.split()) > 1:
+            logger.warning("Command should be a list of strings. Now it is a string. Will split it by space.")
+            command = command.split()
+        if command[0] == "google-chrome" and self.use_proxy:
+            command.append("--proxy-server=http://127.0.0.1:18888")  # Use the proxy server set up by _proxy_setup
+        payload = json.dumps({"command": command, "shell": shell})
+        headers = {"Content-Type": "application/json"}
+        try:
+            logger.info("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/launch")
+            response = requests.post(self.http_server + "/setup" + "/launch", headers=headers, data=payload)
+            if response.status_code == 200:
+                logger.info("Command executed successfully: %s", response.text)
+            else:
+                logger.error("Failed to launch application. Status code: %s", response.text)
+        except requests.exceptions.RequestException as e:
+            logger.error("An error occurred while trying to send the request: %s", e)
+    def _execute_setup(
+            self,
+            command: List[str],
+            stdout: str = "",
+            stderr: str = "",
+            shell: bool = False,
+            until: Optional[Dict[str, Any]] = None
+    ):
+        if not command:
+            raise Exception("Empty command to launch.")
+        until: Dict[str, Any] = until or {}
+        terminates: bool = False
+        nb_failings = 0
+        def replace_screen_env_in_command(command):
+            password = self.client_password
+            width = self.screen_width
+            height = self.screen_height
+            width_half = str(width // 2)
+            height_half = str(height // 2)
+            new_command_list = []
+            new_command = ""
+            if isinstance(command, str):
+                new_command = command.replace("{CLIENT_PASSWORD}", password)
+                new_command = new_command.replace("{SCREEN_WIDTH_HALF}", width_half)
+                new_command = new_command.replace("{SCREEN_HEIGHT_HALF}", height_half)
+                new_command = new_command.replace("{SCREEN_WIDTH}", str(width))
+                new_command = new_command.replace("{SCREEN_HEIGHT}", str(height))
+                return new_command
+            else:
+                for item in command:
+                    item = item.replace("{CLIENT_PASSWORD}", password)
+                    item = item.replace("{SCREEN_WIDTH_HALF}", width_half)
+                    item = item.replace("{SCREEN_HEIGHT_HALF}", height_half)
+                    item = item.replace("{SCREEN_WIDTH}", str(width))
+                    item = item.replace("{SCREEN_HEIGHT}", str(height))
+                    new_command_list.append(item)
+                return new_command_list
+        command = replace_screen_env_in_command(command)
+        payload = json.dumps({"command": command, "shell": shell})
+        headers = {"Content-Type": "application/json"}
+        while not terminates:
+            try:
+                response = requests.post(self.http_server + "/setup" + "/execute", headers=headers, data=payload)
+                if response.status_code == 200:
+                    results: Dict[str, str] = response.json()
+                    if stdout:
+                        with open(os.path.join(self.cache_dir, stdout), "w") as f:
+                            f.write(results["output"])
+                    if stderr:
+                        with open(os.path.join(self.cache_dir, stderr), "w") as f:
+                            f.write(results["error"])
+                    logger.info("Command executed successfully: %s -> %s"
+                                , " ".join(command) if isinstance(command, list) else command
+                                , response.text
+                                )
+                else:
+                    logger.error("Failed to launch application. Status code: %s", response.text)
+                    results = None
+                    nb_failings += 1
+            except requests.exceptions.RequestException as e:
+                logger.error("An error occurred while trying to send the request: %s", e)
+                traceback.print_exc()
+                results = None
+                nb_failings += 1
+            if len(until) == 0:
+                terminates = True
+            elif results is not None:
+                terminates = "returncode" in until and results["returncode"] == until["returncode"] \
+                             or "stdout" in until and until["stdout"] in results["output"] \
+                             or "stderr" in until and until["stderr"] in results["error"]
+            terminates = terminates or nb_failings >= 5
+            if not terminates:
+                time.sleep(0.3)
+    def _execute_with_verification_setup(
+            self,
+            command: List[str],
+            verification: Dict[str, Any] = None,
+            max_wait_time: int = 10,
+            check_interval: float = 1.0,
+            shell: bool = False
+    ):
+        """Execute command with verification of results
+        Args:
+            command: Command to execute
+            verification: Dict with verification criteria:
+                - window_exists: Check if window with this name exists
+                - command_success: Execute this command and check if it succeeds
+            max_wait_time: Maximum time to wait for verification
+            check_interval: Time between verification checks
+            shell: Whether to use shell
+        """
+        if not command:
+            raise Exception("Empty command to launch.")
+        verification = verification or {}
+        payload = json.dumps({
+            "command": command,
+            "shell": shell,
+            "verification": verification,
+            "max_wait_time": max_wait_time,
+            "check_interval": check_interval
+        })
+        headers = {"Content-Type": "application/json"}
+        try:
+            response = requests.post(self.http_server + "/setup" + "/execute_with_verification",
+                                   headers=headers, data=payload, timeout=max_wait_time + 10)
+            if response.status_code == 200:
+                result = response.json()
+                logger.info("Command executed and verified successfully: %s -> %s"
+                            , " ".join(command) if isinstance(command, list) else command
+                            , response.text
+                            )
+                return result
+            else:
+                logger.error("Failed to execute with verification. Status code: %s", response.text)
+                raise Exception(f"Command verification failed: {response.text}")
+        except requests.exceptions.RequestException as e:
+            logger.error("An error occurred while trying to send the request: %s", e)
+            traceback.print_exc()
+            raise Exception(f"Request failed: {e}")
+    def _command_setup(self, command: List[str], **kwargs):
+        """Alias of _execute_setup: forwards command and all keyword arguments to it."""
+        self._execute_setup(command, **kwargs)
+    def _sleep_setup(self, seconds: float):
+        """Pause the setup sequence by sleeping for the given number of seconds."""
+        time.sleep(seconds)
+    def _act_setup(self, action_seq: List[Union[Dict[str, Any], str]]):
+        """Placeholder for a setup step taking an action sequence; not implemented yet.
+
+        Raises:
+            NotImplementedError: always.
+        """
+        # TODO
+        raise NotImplementedError()
+    def _replay_setup(self, trajectory: str):
+        """Placeholder for replaying a recorded trajectory; not implemented yet.
+
+        Args:
+            trajectory (str): path to the replay trajectory file
+
+        Raises:
+            NotImplementedError: always.
+        """
+        # TODO
+        raise NotImplementedError()
+    def _activate_window_setup(self, window_name: str, strict: bool = False, by_class: bool = False):
+        if not window_name:
+            raise Exception(f"Setup Open - Invalid path ({window_name}).")
+        payload = json.dumps({"window_name": window_name, "strict": strict, "by_class": by_class})
+        headers = {
+            'Content-Type': 'application/json'
+        }
+        # send request to server to open file
+        try:
+            response = requests.post(self.http_server + "/setup" + "/activate_window", headers=headers, data=payload)
+            if response.status_code == 200:
+                logger.info("Command executed successfully: %s", response.text)
+            else:
+                logger.error(f"Failed to activate window {window_name}. Status code: %s", response.text)
+        except requests.exceptions.RequestException as e:
+            logger.error("An error occurred while trying to send the request: %s", e)
+    def _close_window_setup(self, window_name: str, strict: bool = False, by_class: bool = False):
+        if not window_name:
+            raise Exception(f"Setup Open - Invalid path ({window_name}).")
+        payload = json.dumps({"window_name": window_name, "strict": strict, "by_class": by_class})
+        headers = {
+            'Content-Type': 'application/json'
+        }
+        # send request to server to open file
+        try:
+            response = requests.post(self.http_server + "/setup" + "/close_window", headers=headers, data=payload)
+            if response.status_code == 200:
+                logger.info("Command executed successfully: %s", response.text)
+            else:
+                logger.error(f"Failed to close window {window_name}. Status code: %s", response.text)
+        except requests.exceptions.RequestException as e:
+            logger.error("An error occurred while trying to send the request: %s", e)
+    def _proxy_setup(self, client_password: str = ""):
+        """Setup system-wide proxy configuration using proxy pool
+        Args:
+            client_password (str): Password for sudo operations, defaults to "password"
+        """
+        retry = 0
+        while retry < MAX_RETRIES:
+            try:
+                _ = requests.get(self.http_server + "/terminal")
+                break
+            except:
+                time.sleep(5)
+                retry += 1
+                logger.info(f"retry: {retry}/{MAX_RETRIES}")
+            if retry == MAX_RETRIES:
+                return False
+        # Get proxy from global proxy pool
+        proxy_pool = get_global_proxy_pool()
+        current_proxy = proxy_pool.get_next_proxy()
+        if not current_proxy:
+            logger.error("No proxy available from proxy pool")
+            raise Exception("No proxy available from proxy pool")
+        # Format proxy URL
+        proxy_url = proxy_pool._format_proxy_url(current_proxy)
+        logger.info(f"Setting up proxy: {current_proxy.host}:{current_proxy.port}")
+        # Configure system proxy environment variables
+        proxy_commands = [
+            f"echo '{client_password}' | sudo -S bash -c \"apt-get update\"", ## TODO: remove this line if ami is already updated
+            f"echo '{client_password}' | sudo -S bash -c \"apt-get install -y tinyproxy\"", ## TODO: remove this line if tinyproxy is already installed
+            f"echo '{client_password}' | sudo -S bash -c \"echo 'Port 18888' > /tmp/tinyproxy.conf\"",
+            f"echo '{client_password}' | sudo -S bash -c \"echo 'Allow 127.0.0.1' >> /tmp/tinyproxy.conf\"",
+            f"echo '{client_password}' | sudo -S bash -c \"echo 'Upstream http {current_proxy.username}:{current_proxy.password}@{current_proxy.host}:{current_proxy.port}' >> /tmp/tinyproxy.conf\"",
+            # CML commands to set environment variables for proxy
+            f"echo 'export http_proxy={proxy_url}' >> ~/.bashrc",
+            f"echo 'export https_proxy={proxy_url}' >> ~/.bashrc",
+            f"echo 'export HTTP_PROXY={proxy_url}' >> ~/.bashrc",
+            f"echo 'export HTTPS_PROXY={proxy_url}' >> ~/.bashrc",
+        ]
+        # Execute all proxy configuration commands
+        for cmd in proxy_commands:
+            try:
+                self._execute_setup([cmd], shell=True)
+            except Exception as e:
+                logger.error(f"Failed to execute proxy setup command: {e}")
+                proxy_pool.mark_proxy_failed(current_proxy)
+                raise
+        self._launch_setup(["tinyproxy -c /tmp/tinyproxy.conf -d"], shell=True)
+        # Reload environment variables
+        reload_cmd = "source /etc/environment"
+        try:
+            logger.info(f"Proxy setup completed successfully for {current_proxy.host}:{current_proxy.port}")
+            proxy_pool.mark_proxy_success(current_proxy)
+        except Exception as e:
+            logger.error(f"Failed to reload environment variables: {e}")
+            proxy_pool.mark_proxy_failed(current_proxy)
+            raise
+    # Chrome setup
+    def _chrome_open_tabs_setup(self, urls_to_open: List[str]):
+        host = self.vm_ip
+        port = self.chromium_port  # fixme: this port is hard-coded, need to be changed from config file
+        remote_debugging_url = f"http://{host}:{port}"
+        logger.info("Connect to Chrome @: %s", remote_debugging_url)
+        logger.debug("PLAYWRIGHT ENV: %s", repr(os.environ))
+        for attempt in range(15):
+            if attempt > 0:
+                time.sleep(5)
+            browser = None
+            with sync_playwright() as p:
+                try:
+                    browser = p.chromium.connect_over_cdp(remote_debugging_url)
+                    # break
+                except Exception as e:
+                    if attempt < 14:
+                        logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. Error: {e}")
+                        # time.sleep(10)
+                        continue
+                    else:
+                        logger.error(f"Failed to connect after multiple attempts: {e}")
+                        raise e
+                if not browser:
+                    return
+                logger.info("Opening %s...", urls_to_open)
+                for i, url in enumerate(urls_to_open):
+                    # Use the first context (which should be the only one if using default profile)
+                    if i == 0:
+                        context = browser.contexts[0]
+                    page = context.new_page()  # Create a new page (tab) within the existing context
+                    try:
+                        page.goto(url, timeout=60000)
+                    except:
+                        logger.warning("Opening %s exceeds time limit", url)  # only for human test
+                    logger.info(f"Opened tab {i + 1}: {url}")
+                    if i == 0:
+                        # clear the default tab
+                        default_page = context.pages[0]
+                        default_page.close()
+                # Do not close the context or browser; they will remain open after script ends
+                return browser, context
+    def _chrome_close_tabs_setup(self, urls_to_close: List[str]):
+        time.sleep(5)  # Wait for Chrome to finish launching
+        host = self.vm_ip
+        port = self.chromium_port  # fixme: this port is hard-coded, need to be changed from config file
+        remote_debugging_url = f"http://{host}:{port}"
+        with sync_playwright() as p:
+            browser = None
+            for attempt in range(15):
+                try:
+                    browser = p.chromium.connect_over_cdp(remote_debugging_url)
+                    break
+                except Exception as e:
+                    if attempt < 14:
+                        logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. Error: {e}")
+                        time.sleep(5)
+                    else:
+                        logger.error(f"Failed to connect after multiple attempts: {e}")
+                        raise e
+            if not browser:
+                return
+            for i, url in enumerate(urls_to_close):
+                # Use the first context (which should be the only one if using default profile)
+                if i == 0:
+                    context = browser.contexts[0]
+                for page in context.pages:
+                    # if two urls are the same, close the tab
+                    if compare_urls(page.url, url):
+                        context.pages.pop(context.pages.index(page))
+                        page.close()
+                        logger.info(f"Closed tab {i + 1}: {url}")
+                        break
+            # Do not close the context or browser; they will remain open after script ends
+            return browser, context
+    # google drive setup
+    def _googledrive_setup(self, **config):
+        """ Clean google drive space (eliminate the impact of previous experiments to reset the environment)
+        @args:
+            config(Dict[str, Any]): contain keys
+                settings_file(str): path to google drive settings file, which will be loaded by pydrive.auth.GoogleAuth()
+                operation(List[str]): each operation is chosen from ['delete', 'upload']
+                args(List[Dict[str, Any]]): parameters for each operation
+            different args dict for different operations:
+                for delete:
+                    query(str): query pattern string to search files or folder in google drive to delete, please refer to
+                        https://developers.google.com/drive/api/guides/search-files?hl=en about how to write query string.
+                    trash(bool): whether to delete files permanently or move to trash. By default, trash=false, completely delete it.
+                for mkdirs:
+                    path(List[str]): the path in the google drive to create folder
+                for upload:
+                    path(str): remote url to download file
+                    dest(List[str]): the path in the google drive to store the downloaded file
+        """
+        settings_file = config.get('settings_file', 'evaluation_examples/settings/googledrive/settings.yml')
+        gauth = GoogleAuth(settings_file=settings_file)
+        drive = GoogleDrive(gauth)
+        def mkdir_in_googledrive(paths: List[str]):
+            paths = [paths] if type(paths) != list else paths
+            parent_id = 'root'
+            for p in paths:
+                q = f'"{parent_id}" in parents and title = "{p}" and mimeType = "application/vnd.google-apps.folder" and trashed = false'
+                folder = drive.ListFile({'q': q}).GetList()
+                if len(folder) == 0:  # not exists, create it
+                    parents = {} if parent_id == 'root' else {'parents': [{'id': parent_id}]}
+                    file = drive.CreateFile({'title': p, 'mimeType': 'application/vnd.google-apps.folder', **parents})
+                    file.Upload()
+                    parent_id = file['id']
+                else:
+                    parent_id = folder[0]['id']
+            return parent_id
+        for oid, operation in enumerate(config['operation']):
+            if operation == 'delete':  # delete a specific file
+                # query pattern string, by default, remove all files/folders not in the trash to the trash
+                params = config['args'][oid]
+                q = params.get('query', '')
+                trash = params.get('trash', False)
+                q_file = f"( {q} ) and mimeType != 'application/vnd.google-apps.folder'" if q.strip() else "mimeType != 'application/vnd.google-apps.folder'"
+                filelist: GoogleDriveFileList = drive.ListFile({'q': q_file}).GetList()
+                q_folder = f"( {q} ) and mimeType = 'application/vnd.google-apps.folder'" if q.strip() else "mimeType = 'application/vnd.google-apps.folder'"
+                folderlist: GoogleDriveFileList = drive.ListFile({'q': q_folder}).GetList()
+                for file in filelist:  # first delete file, then folder
+                    file: GoogleDriveFile
+                    if trash:
+                        file.Trash()
+                    else:
+                        file.Delete()
+                for folder in folderlist:
+                    folder: GoogleDriveFile
+                    # note that, if a folder is trashed/deleted, all files and folders in it will be trashed/deleted
+                    if trash:
+                        folder.Trash()
+                    else:
+                        folder.Delete()
+            elif operation == 'mkdirs':
+                params = config['args'][oid]
+                mkdir_in_googledrive(params['path'])
+            elif operation == 'upload':
+                params = config['args'][oid]
+                url = params['url']
+                with tempfile.NamedTemporaryFile(mode='wb', delete=False) as tmpf:
+                    response = requests.get(url, stream=True)
+                    response.raise_for_status()
+                    for chunk in response.iter_content(chunk_size=8192):
+                        if chunk:
+                            tmpf.write(chunk)
+                    tmpf.close()
+                    paths = [params['path']] if params['path'] != list else params['path']
+                    parent_id = mkdir_in_googledrive(paths[:-1])
+                    parents = {} if parent_id == 'root' else {'parents': [{'id': parent_id}]}
+                    file = drive.CreateFile({'title': paths[-1], **parents})
+                    file.SetContentFile(tmpf.name)
+                    file.Upload()
+                return
+            else:
+                raise ValueError('[ERROR]: not implemented clean type!')
+    def _login_setup(self, **config):
+        """ Login to a website with account and password information.
+        @args:
+            config(Dict[str, Any]): contain keys
+                settings_file(str): path to the settings file
+                platform(str): platform to login, implemented platforms include:
+                    googledrive: https://drive.google.com/drive/my-drive
+        """
+        host = self.vm_ip
+        port = self.chromium_port
+        remote_debugging_url = f"http://{host}:{port}"
+        with sync_playwright() as p:
+            browser = None
+            for attempt in range(15):
+                try:
+                    browser = p.chromium.connect_over_cdp(remote_debugging_url)
+                    break
+                except Exception as e:
+                    if attempt < 14:
+                        logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. Error: {e}")
+                        time.sleep(5)
+                    else:
+                        logger.error(f"Failed to connect after multiple attempts: {e}")
+                        raise e
+            if not browser:
+                return
+            context = browser.contexts[0]
+            platform = config['platform']
+            if platform == 'googledrive':
+                url = 'https://drive.google.com/drive/my-drive'
+                page = context.new_page()  # Create a new page (tab) within the existing context
+                try:
+                    page.goto(url, timeout=60000)
+                except:
+                    logger.warning("Opening %s exceeds time limit", url)  # only for human test
+                logger.info(f"Opened new page: {url}")
+                settings = json.load(open(config['settings_file']))
+                email, password = settings['email'], settings['password']
+                try:
+                    page.wait_for_selector('input[type="email"]', state="visible", timeout=3000)
+                    page.fill('input[type="email"]', email)
+                    page.click('#identifierNext > div > button')
+                    page.wait_for_selector('input[type="password"]', state="visible", timeout=5000)
+                    page.fill('input[type="password"]', password)
+                    page.click('#passwordNext > div > button')
+                    page.wait_for_load_state('load', timeout=5000)
+                except TimeoutError:
+                    logger.info('[ERROR]: timeout when waiting for google drive login page to load!')
+                    return
+            else:
+                raise NotImplementedError
+            return browser, context
+    def _update_browse_history_setup(self, **config):
+        cache_path = os.path.join(self.cache_dir, "history_new.sqlite")
+        db_url = "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938/history_empty.sqlite?download=true"
+        if not os.path.exists(cache_path):
+            max_retries = 3
+            downloaded = False
+            e = None
+            for i in range(max_retries):
+                try:
+                    response = requests.get(db_url, stream=True)
+                    response.raise_for_status()
+                    with open(cache_path, 'wb') as f:
+                        for chunk in response.iter_content(chunk_size=8192):
+                            if chunk:
+                                f.write(chunk)
+                    logger.info("File downloaded successfully")
+                    downloaded = True
+                    break
+                except requests.RequestException as e:
+                    logger.error(
+                        f"Failed to download {db_url} caused by {e}. Retrying... ({max_retries - i - 1} attempts left)")
+            if not downloaded:
+                raise requests.RequestException(f"Failed to download {db_url}. No retries left. Error: {e}")
+        else:
+            logger.info("File already exists in cache directory")
+        # copy a new history file in the tmp folder
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            db_path = os.path.join(tmp_dir, "history_empty.sqlite")
+            shutil.copy(cache_path, db_path)
+            history = config['history']
+            for history_item in history:
+                url = history_item['url']
+                title = history_item['title']
+                visit_time = datetime.now() - timedelta(seconds=history_item['visit_time_from_now_in_seconds'])
+                # Chrome use ms from 1601-01-01 as timestamp
+                epoch_start = datetime(1601, 1, 1)
+                chrome_timestamp = int((visit_time - epoch_start).total_seconds() * 1000000)
+                conn = sqlite3.connect(db_path)
+                cursor = conn.cursor()
+                cursor.execute('''
+                    INSERT INTO urls (url, title, visit_count, typed_count, last_visit_time, hidden)
+                    VALUES (?, ?, ?, ?, ?, ?)
+                ''', (url, title, 1, 0, chrome_timestamp, 0))
+                url_id = cursor.lastrowid
+                cursor.execute('''
+                    INSERT INTO visits (url, visit_time, from_visit, transition, segment_id, visit_duration)
+                    VALUES (?, ?, ?, ?, ?, ?)
+                ''', (url_id, chrome_timestamp, 0, 805306368, 0, 0))
+                conn.commit()
+                conn.close()
+            logger.info('Fake browsing history added successfully.')
+            controller = PythonController(self.vm_ip, self.server_port)
+            # get the path of the history file according to the platform
+            os_type = controller.get_vm_platform()
+            if os_type == 'Windows':
+                chrome_history_path = controller.execute_python_command(
+                    """import os; print(os.path.join(os.getenv('USERPROFILE'), "AppData", "Local", "Google", "Chrome", "User Data", "Default", "History"))""")[
+                    'output'].strip()
+            elif os_type == 'Darwin':
+                chrome_history_path = controller.execute_python_command(
+                    """import os; print(os.path.join(os.getenv('HOME'), "Library", "Application Support", "Google", "Chrome", "Default", "History"))""")[
+                    'output'].strip()
+            elif os_type == 'Linux':
+                if "arm" in platform.machine():
+                    chrome_history_path = controller.execute_python_command(
+                        "import os; print(os.path.join(os.getenv('HOME'), 'snap', 'chromium', 'common', 'chromium', 'Default', 'History'))")[
+                        'output'].strip()
+                else:
+                    chrome_history_path = controller.execute_python_command(
+                        "import os; print(os.path.join(os.getenv('HOME'), '.config', 'google-chrome', 'Default', 'History'))")[
+                        'output'].strip()
+            else:
+                raise Exception('Unsupported operating system')
+            form = MultipartEncoder({
+                "file_path": chrome_history_path,
+                "file_data": (os.path.basename(chrome_history_path), open(db_path, "rb"))
+            })
+            headers = {"Content-Type": form.content_type}
+            logger.debug(form.content_type)
+            # send request to server to upload file
+            try:
+                logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload")
+                response = requests.post(self.http_server + "/setup" + "/upload", headers=headers, data=form)
+                if response.status_code == 200:
+                    logger.info("Command executed successfully: %s", response.text)
+                else:
+                    logger.error("Failed to upload file. Status code: %s", response.text)
+            except requests.exceptions.RequestException as e:
+                logger.error("An error occurred while trying to send the request: %s", e)
+            self._execute_setup(["sudo chown -R user:user /home/user/.config/google-chrome/Default/History"], shell=True)

desktop_env/desktop_env.py ADDED Viewed

	@@ -0,0 +1,497 @@

+from __future__ import annotations
+import logging
+import os
+import time
+import re
+from typing import Callable, Any, Optional, Tuple
+from typing import List, Dict, Union
+import gymnasium as gym
+from desktop_env.controllers.python import PythonController
+from desktop_env.controllers.setup import SetupController
+from desktop_env.evaluators import metrics, getters
+from desktop_env.providers import create_vm_manager_and_provider
+logger = logging.getLogger("desktopenv.env")  # Logger used by the environment code in this module
+Metric = Callable[[Any, Any], float]  # Type alias for metric functions looked up on desktop_env.evaluators.metrics
+Getter = Callable[[gym.Env, Dict[str, Any]], Any]  # Type alias for getters, called as getter(env, getter_config)
+MAX_RETRIES = 5  # Maximum number of environment setup attempts made by reset()
+def _fix_pyautogui_less_than_bug(command: str) -> str:
+    """
+    Fix PyAutoGUI '<' character bug by converting it to hotkey("shift", ',') calls.
+    This fixes the known PyAutoGUI issue where typing '<' produces '>' instead.
+    References:
+    - https://github.com/asweigart/pyautogui/issues/198
+    - https://github.com/xlang-ai/OSWorld/issues/257
+    Args:
+        command (str): The original pyautogui command
+    Returns:
+        str: The fixed command with '<' characters handled properly
+    """
+    # Pattern to match press('<') or press('\u003c') calls
+    press_pattern = r'pyautogui\.press\(["\'](?:<|\\u003c)["\']\)'
+    # Handle press('<') calls
+    def replace_press_less_than(match):
+        return 'pyautogui.hotkey("shift", ",")'
+    # First handle press('<') calls
+    command = re.sub(press_pattern, replace_press_less_than, command)
+    # Pattern to match typewrite calls with quoted strings
+    typewrite_pattern = r'pyautogui\.typewrite\((["\'])(.*?)\1\)'
+    # Then handle typewrite calls
+    def process_typewrite_match(match):
+        quote_char = match.group(1)
+        content = match.group(2)
+        # Preprocess: Try to decode Unicode escapes like \u003c to actual '<'
+        # This handles cases where '<' is represented as escaped Unicode
+        try:
+            # Attempt to decode unicode escapes
+            decoded_content = content.encode('utf-8').decode('unicode_escape')
+            content = decoded_content
+        except UnicodeDecodeError:
+            # If decoding fails, proceed with original content to avoid breaking existing logic
+            pass  # English comment: Graceful degradation - fall back to original content if decoding fails
+        # Check if content contains '<'
+        if '<' not in content:
+            return match.group(0)
+        # Split by '<' and rebuild
+        parts = content.split('<')
+        result_parts = []
+        for i, part in enumerate(parts):
+            if i == 0:
+                # First part
+                if part:
+                    result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})")
+            else:
+                # Add hotkey for '<' and then typewrite for the rest
+                result_parts.append('pyautogui.hotkey("shift", ",")')
+                if part:
+                    result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})")
+        return '; '.join(result_parts)
+    command = re.sub(typewrite_pattern, process_typewrite_match, command)
+    return command
+class DesktopEnv(gym.Env):
+    """
+    DesktopEnv with OpenAI Gym interface. It provides a desktop environment for setting and evaluating desktop automation tasks.
+    """
+    def __init__(
+            self,
+            provider_name: str = "vmware",
+            region: str = None,
+            path_to_vm: str = None,
+            snapshot_name: str = "init_state",
+            action_space: str = "pyautogui",
+            cache_dir: str = "cache",
+            screen_size: Tuple[int] = (int(os.environ.get("SCREEN_WIDTH", 1920)), int(os.environ.get("SCREEN_HEIGHT", 1080))),
+            headless: bool = False,
+            require_a11y_tree: bool = True,
+            require_terminal: bool = False,
+            os_type: str = "Ubuntu",
+            enable_proxy: bool = False,
+            client_password: str = "",
+    ):
+        """
+        Args:
+            provider_name (str): virtualization provider name, default to "vmware"
+            region (str): the region for allocate machines, work for cloud services, default to  "us-east-1"
+            path_to_vm (str): path to .vmx file
+            snapshot_name (str): snapshot name to revert to, default to "init_state"
+            action_space (str): "computer_13" | "pyautogui"
+            cache_dir (str): cache directory to cache task-related stuffs like
+              reference file for evaluation
+            screen_size (Tuple[int]): screen size of the VM
+            headless (bool): whether to run the VM in headless mode
+            require_a11y_tree (bool): whether to require accessibility tree
+            require_terminal (bool): whether to require terminal output
+            os_type (str): operating system type, default to "Ubuntu"
+            enable_proxy (bool): whether to enable proxy support, default to False
+        """
+        # Initialize VM manager and vitualization provider
+        self.region = region
+        self.provider_name = provider_name
+        self.enable_proxy = enable_proxy  # Store proxy enablement setting
+        if client_password == "":
+            if self.provider_name == "aws":
+                self.client_password = "osworld-public-evaluation"
+            else:
+                self.client_password = "password"
+        else:
+            self.client_password = client_password
+        self.screen_width = screen_size[0]
+        self.screen_height = screen_size[1]
+        # Default
+        self.server_port = 5000
+        self.chromium_port = 9222
+        self.vnc_port = 8006
+        self.vlc_port = 8080
+        # Initialize with default (no proxy) provider
+        self.current_use_proxy = False
+        self.manager, self.provider = create_vm_manager_and_provider(provider_name, region, use_proxy=False)
+        self.os_type = os_type
+        # Track whether environment has been used (step/setup) to optimize snapshot revert
+        # docker, aws, gcp, azure are always unused as the emulator starts from a clean state
+        # vmware, virtualbox are always used as the emulator starts from a dirty state
+        if self.provider_name in {"docker", "aws", "gcp", "azure", "aliyun", "volcengine"}:
+            self.is_environment_used = False
+        elif self.provider_name in {"vmware", "virtualbox"}:
+            self.is_environment_used = True
+        else:
+            raise ValueError(f"Invalid provider name: {self.provider_name}")
+        # Initialize environment variables
+        if path_to_vm:
+            self.path_to_vm = os.path.abspath(os.path.expandvars(os.path.expanduser(path_to_vm))) \
+                if provider_name in {"vmware", "virtualbox"} else path_to_vm
+        else:
+            self.path_to_vm = self.manager.get_vm_path(os_type=self.os_type, region=region, screen_size=(self.screen_width, self.screen_height))
+        self.snapshot_name = snapshot_name
+        self.cache_dir_base: str = cache_dir
+        # todo: add the logic to get the screen size from the VM
+        self.headless = headless
+        self.require_a11y_tree = require_a11y_tree
+        self.require_terminal = require_terminal
+        # Initialize emulator and controller
+        logger.info("Initializing...")
+        self._start_emulator()
+        # mode: human or machine
+        self.instruction = None
+        assert action_space in ["computer_13", "pyautogui", "claude_computer_use", "autoglm_computer_use"]
+        self.action_space = action_space  # todo: refactor it to the ActType
+        # episodic stuffs, like counters, will be updated or reset
+        # when calling self.reset()
+        self._traj_no: int = -1
+        self._step_no: int = 0
+        self.action_history: List[Dict[str, any]] = []
+    def _start_emulator(self):
+        try:
+            # Power on the virtual machine
+            self.provider.start_emulator(self.path_to_vm, self.headless, self.os_type)
+            # Get the ip from the virtual machine, and setup the controller
+            vm_ip_ports = self.provider.get_ip_address(self.path_to_vm).split(':')
+            self.vm_ip = vm_ip_ports[0]
+            # Get the ports from the virtual machine (for Docker provider only)
+            if len(vm_ip_ports) > 1:
+                self.server_port = int(vm_ip_ports[1])
+                self.chromium_port = int(vm_ip_ports[2])
+                self.vnc_port = int(vm_ip_ports[3])
+                self.vlc_port = int(vm_ip_ports[4])
+            self.controller = PythonController(vm_ip=self.vm_ip, server_port=self.server_port)
+            self.setup_controller = SetupController(vm_ip=self.vm_ip, server_port=self.server_port, chromium_port=self.chromium_port, vlc_port=self.vlc_port, cache_dir=self.cache_dir_base, client_password=self.client_password, screen_width=self.screen_width, screen_height=self.screen_height)
+        except Exception as e:
+            try:
+                self.provider.stop_emulator(self.path_to_vm)
+            except Exception as stop_err:
+                logger.warning(f"Cleanup after interrupt failed: {stop_err}")
+            raise
+    def _revert_to_snapshot(self):
+        # Revert to certain snapshot of the virtual machine, and refresh the path to vm and ip of vm
+        # due to the fact it could be changed when implemented by cloud services
+        path_to_vm = self.provider.revert_to_snapshot(self.path_to_vm, self.snapshot_name)
+        if path_to_vm and not path_to_vm == self.path_to_vm:
+            # path_to_vm has to be a new path
+            self.manager.delete_vm(self.path_to_vm, self.region)
+            self.manager.add_vm(path_to_vm, self.region)
+            self.manager.occupy_vm(path_to_vm, os.getpid(), self.region)
+            self.path_to_vm = path_to_vm
+    def _save_state(self, snapshot_name=None):
+        # Save the current virtual machine state to a certain snapshot name
+        self.provider.save_state(self.path_to_vm, snapshot_name)
+    def close(self):
+        # Close (release) the virtual machine
+        self.provider.stop_emulator(self.path_to_vm)
+    def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None) -> Dict[str, Any]:
+        # Reset to certain task in OSWorld
+        logger.info("Resetting environment...")
+        logger.info("Switching task...")
+        logger.info("Setting counters...")
+        self._traj_no += 1
+        self._step_no = 0
+        self.action_history.clear()
+        for attempt in range(MAX_RETRIES):
+            # Only revert to snapshot if environment has been used (step/setup)
+            # This optimization is especially important for cloud providers like AWS
+            # where unnecessary snapshot operations are costly and time-consuming
+            if task_config is not None:
+                # Only consider task proxy requirement if proxy is enabled at system level
+                task_use_proxy = task_config.get("proxy", False) and self.enable_proxy
+                if not self.enable_proxy and task_config.get("proxy", False):
+                    logger.info("Task requires proxy but proxy is disabled at system level, ignoring proxy requirement.")
+                if task_use_proxy != self.current_use_proxy:
+                    # keep because get_info_from_website depend on this
+                    self.current_use_proxy = task_use_proxy
+            if self.is_environment_used:
+                logger.info("Environment has been used, reverting to snapshot {}...".format(self.snapshot_name))
+                self._revert_to_snapshot()
+                logger.info("Starting emulator...")
+                self._start_emulator()
+                logger.info("Emulator started.")
+                # Reset the usage flag after reverting
+                self.is_environment_used = False
+            else:
+                logger.info("Environment is clean, skipping snapshot revert (provider: {}).".format(self.provider_name))
+            if task_config is not None:
+                if task_config.get("proxy", False) and self.enable_proxy:
+                    # If using proxy and proxy is enabled, set up the proxy configuration
+                    self.setup_controller._proxy_setup(self.client_password)
+                self._set_task_info(task_config)
+                self.setup_controller.reset_cache_dir(self.cache_dir)
+                logger.info("Setting up environment...")
+                success = self.setup_controller.setup(self.config, task_config.get("proxy", False) and self.enable_proxy)
+                if success:
+                    # Mark environment as used when setup is successfully executed
+                    if self.config:  # Only mark as used if there were actual setup operations
+                        self.is_environment_used = True
+                    break
+                else:
+                    logger.error(
+                        "Environment setup failed, retrying (%d/%d)...",
+                        attempt + 1,
+                        MAX_RETRIES,
+                    )
+                    time.sleep(5)
+            else:
+                break
+        logger.info("Environment setup complete.")
+        observation = self._get_obs()
+        return observation
+    def _get_obs(self):
+        # We provide screenshot, accessibility_tree (optional), terminal (optional), and instruction.
+        # can be customized and scaled
+        return {
+            "screenshot": self.controller.get_screenshot(),
+            "accessibility_tree": self.controller.get_accessibility_tree() if self.require_a11y_tree else None,
+            "terminal": self.controller.get_terminal_output() if self.require_terminal else None,
+            "instruction": self.instruction
+        }
+    @property
+    def vm_platform(self):
+        return self.controller.get_vm_platform()
+    @property
+    def vm_screen_size(self):
+        return self.controller.get_vm_screen_size()
+    def _set_task_info(self, task_config: Dict[str, Any]):
+        """Set task info (proxy logic is handled in reset method)"""
+        self.task_id: str = task_config["id"]
+        self.cache_dir: str = os.path.join(self.cache_dir_base, self.task_id)
+        os.makedirs(self.cache_dir, exist_ok=True)
+        self.instruction = task_config["instruction"]
+        self.config = task_config["config"] if "config" in task_config else []
+        self._set_evaluator_info(task_config)
+    def _set_evaluator_info(self, task_config: Dict[str, Any]):
+        """Set evaluator information from task config"""
+        # evaluator dict
+        # func -> metric function string, or list of metric function strings
+        # conj -> conjunction of multiple metrics if func is a list with length > 1, "and"/"or"
+        # result -> result getter config, or list of result getter configs
+        # expected (optional) -> expected getter config, or list of expected getter configs
+        # options (optional) -> metric options, or list of metric options
+        # if func is a str list, then result, expected (if exists), options (if exists) should also be lists of the same length
+        # even if one of the metrics does not need expected or options field, it should be included in the list with None
+        self.evaluator = task_config["evaluator"]
+        self.metric: Metric = [getattr(metrics, func) for func in self.evaluator["func"]] \
+            if isinstance(self.evaluator["func"], list) \
+            else getattr(metrics, self.evaluator["func"])
+        self.metric_conj: str = self.evaluator.get("conj", "and")  # take conjunction of multiple metrics
+        if "result" in self.evaluator and len(self.evaluator["result"]) > 0:
+            self.result_getter: Getter = [getattr(getters, "get_{:}".format(res["type"])) for res in
+                                          self.evaluator["result"]] \
+                if isinstance(self.evaluator["result"], list) \
+                else getattr(getters, "get_{:}".format(self.evaluator["result"]["type"]))
+        else:
+            self.result_getter = [None] * len(self.metric) \
+                if isinstance(self.metric, list) \
+                else None
+        if "expected" in self.evaluator and len(self.evaluator["expected"]) > 0:
+            self.expected_getter: Getter = [getattr(getters, "get_{:}".format(exp["type"])) if exp else None for exp in
+                                            self.evaluator["expected"]] \
+                if isinstance(self.evaluator["expected"], list) \
+                else getattr(getters, "get_{:}".format(self.evaluator["expected"]["type"]))
+        else:
+            self.expected_getter = [None] * len(self.metric) \
+                if isinstance(self.metric, list) \
+                else None
+        self.metric_options: Union[List[Dict[str, Any]], Dict[str, Any]] = [opt if opt else {} for opt in
+                                                                            self.evaluator["options"]] \
+            if isinstance(self.evaluator.get("options", {}), list) \
+            else self.evaluator["options"] \
+            if "options" in self.evaluator \
+            else [{}] * len(self.metric) \
+            if isinstance(self.metric, list) \
+            else {}
+        assert (not isinstance(self.evaluator["func"], list)
+                or (len(self.metric) == len(self.result_getter) == len(self.expected_getter) == len(
+                    self.metric_options)))
+    def step(self, action, pause=2):
+        self._step_no += 1
+        self.action_history.append(action)
+        # Mark environment as used when step is called
+        self.is_environment_used = True
+        reward = 0  # todo: Define reward calculation for each example
+        done = False  # todo: Define episode termination condition for each example
+        info = {}
+        logger.info(f"Step {self._step_no} in trajectory {self._traj_no} with action: {action}")
+        # handle the special actions
+        if action in ['WAIT', 'FAIL', 'DONE'] or (type(action) == dict and action['action_type'] in ['WAIT', 'FAIL', 'DONE']):
+            if action == 'WAIT' or (type(action) == dict and action.get('action_type') == 'WAIT'):
+                time.sleep(pause)
+            elif action == 'FAIL' or (type(action) == dict and action.get('action_type') == 'FAIL'):
+                done = True
+                info = {"fail": True}
+            elif action == 'DONE' or (type(action) == dict and action.get('action_type') == 'DONE'):
+                done = True
+                info = {"done": True}
+        if self.action_space == "computer_13":
+            # the set of all possible actions defined in the action representation
+            self.controller.execute_action(action)
+        elif self.action_space == "pyautogui" or self.action_space == "claude_computer_use":
+            if action in ['WAIT', 'FAIL', 'DONE'] or (type(action) == dict and action.get('action_type') in ['WAIT', 'FAIL', 'DONE']):
+                self.controller.execute_action(action)
+            else:
+                # the set of all possible python commands insides `pyautogui`
+                if type(action) == str:
+                    # Fix PyAutoGUI '<' character bug before execution
+                    fixed_command = _fix_pyautogui_less_than_bug(action)
+                    self.controller.execute_python_command(fixed_command)
+                elif type(action) == dict:
+                    # Fix PyAutoGUI '<' character bug before execution
+                    fixed_command = _fix_pyautogui_less_than_bug(action['command'])
+                    self.controller.execute_python_command(fixed_command)
+        time.sleep(pause)
+        observation = self._get_obs()
+        return observation, reward, done, info
+    def evaluate(self):
+        """
+        Evaluate whether the task is successfully completed.
+        """
+        postconfig = self.evaluator.get("postconfig", [])
+        self.setup_controller.setup(postconfig, self.enable_proxy)
+        # Mark environment as used if there were postconfig setup operations
+        if postconfig:
+            self.is_environment_used = True
+        if self.evaluator['func'] == "infeasible":
+            if len(self.action_history) > 0:
+                last_action = self.action_history[-1]
+                if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'):
+                    return 1
+            return 0
+        else:
+            if len(self.action_history) > 0:
+                last_action = self.action_history[-1]
+                if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'):
+                    return 0
+        if type(self.metric) == list:
+            # Multiple metrics to evaluate whether the task is successfully completed
+            results = []
+            assert len(self.metric) == len(self.result_getter), "The number of metrics and result getters must be the same"
+            if "expected" in self.evaluator:
+                assert len(self.metric) == len(self.expected_getter), "The number of metrics and expected getters must be the same"
+            for idx, metric in enumerate(self.metric):
+                try:
+                    config = self.evaluator["result"][idx]
+                    result_state = self.result_getter[idx](self, config)
+                except FileNotFoundError:
+                    logger.error("File not found!")
+                    if self.metric_conj == 'and':
+                        return 0
+                if "expected" in self.evaluator and self.expected_getter and self.evaluator["expected"]:
+                    expected_state = self.expected_getter[idx](self, self.evaluator["expected"][idx])
+                    metric: int = metric(result_state, expected_state, **self.metric_options[idx])
+                else:
+                    metric: int = metric(result_state, **self.metric_options[idx])
+                if self.metric_conj == 'and' and float(metric) == 0.0:
+                    return 0
+                elif self.metric_conj == 'or' and float(metric) == 1.0:
+                    return 1
+                else:
+                    results.append(metric)
+            return sum(results) / len(results) if self.metric_conj == 'and' else max(results)
+        else:
+            # Single metric to evaluate whether the task is successfully completed
+            try:
+                result_state = self.result_getter(self, self.evaluator["result"])
+            except FileNotFoundError:
+                logger.error("File not found!")
+                return 0
+            if "expected" in self.evaluator and self.expected_getter and self.evaluator["expected"]:
+                expected_state = self.expected_getter(self, self.evaluator["expected"])
+                metric: float = self.metric(result_state, expected_state, **self.metric_options)
+            else:
+                metric: float = self.metric(result_state, **self.metric_options)
+        return metric
+    def render(self, mode='rgb_array'):
+        if mode == 'rgb_array':
+            return self.controller.get_screenshot()
+        else:
+            raise ValueError('Unsupported render mode: {}'.format(mode))

desktop_env/desktop_env_os_symphony.py ADDED Viewed

	@@ -0,0 +1,499 @@

+from __future__ import annotations
+import logging
+import os
+import time
+import re
+from typing import Callable, Any, Optional, Tuple
+from typing import List, Dict, Union
+import gymnasium as gym
+from desktop_env.controllers.python import PythonController
+from desktop_env.controllers.setup import SetupController
+from desktop_env.evaluators import metrics, getters
+from desktop_env.providers import create_vm_manager_and_provider
+logger = logging.getLogger("desktopenv.env")  # Logger used by the environment code in this module
+Metric = Callable[[Any, Any], float]  # Type alias: callable taking two values and returning a float score
+Getter = Callable[[gym.Env, Dict[str, Any]], Any]  # Type alias: callable taking the environment and a config dict
+MAX_RETRIES = 5  # Maximum retries for environment setup
+def _fix_pyautogui_less_than_bug(command: str) -> str:
+    """
+    Fix PyAutoGUI '<' character bug by converting it to hotkey("shift", ',') calls.
+    This fixes the known PyAutoGUI issue where typing '<' produces '>' instead.
+    References:
+    - https://github.com/asweigart/pyautogui/issues/198
+    - https://github.com/xlang-ai/OSWorld/issues/257
+    Args:
+        command (str): The original pyautogui command
+    Returns:
+        str: The fixed command with '<' characters handled properly
+    """
+    # Pattern to match press('<') or press('\u003c') calls
+    press_pattern = r'pyautogui\.press\(["\'](?:<|\\u003c)["\']\)'
+    # Handle press('<') calls
+    def replace_press_less_than(match):
+        return 'pyautogui.hotkey("shift", ",")'
+    # First handle press('<') calls
+    command = re.sub(press_pattern, replace_press_less_than, command)
+    # Pattern to match typewrite calls with quoted strings
+    typewrite_pattern = r'pyautogui\.typewrite\((["\'])(.*?)\1\)'
+    # Then handle typewrite calls
+    def process_typewrite_match(match):
+        quote_char = match.group(1)
+        content = match.group(2)
+        # Preprocess: Try to decode Unicode escapes like \u003c to actual '<'
+        # This handles cases where '<' is represented as escaped Unicode
+        try:
+            # Attempt to decode unicode escapes
+            decoded_content = content.encode('utf-8').decode('unicode_escape')
+            content = decoded_content
+        except UnicodeDecodeError:
+            # If decoding fails, proceed with original content to avoid breaking existing logic
+            pass  # English comment: Graceful degradation - fall back to original content if decoding fails
+        # Check if content contains '<'
+        if '<' not in content:
+            return match.group(0)
+        # Split by '<' and rebuild
+        parts = content.split('<')
+        result_parts = []
+        for i, part in enumerate(parts):
+            if i == 0:
+                # First part
+                if part:
+                    result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})")
+            else:
+                # Add hotkey for '<' and then typewrite for the rest
+                result_parts.append('pyautogui.hotkey("shift", ",")')
+                if part:
+                    result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})")
+        return '; '.join(result_parts)
+    command = re.sub(typewrite_pattern, process_typewrite_match, command)
+    return command
+class DesktopEnv(gym.Env):
+    """
+    DesktopEnv with OpenAI Gym interface. It provides a desktop environment for setting and evaluating desktop automation tasks.
+    """
+    def __init__(
+            self,
+            provider_name: str = "vmware",
+            region: str = None,
+            path_to_vm: str = None,
+            snapshot_name: str = "init_state",
+            action_space: str = "pyautogui",
+            cache_dir: str = "cache",
+            screen_size: Tuple[int] = (int(os.environ.get("SCREEN_WIDTH", 1920)), int(os.environ.get("SCREEN_HEIGHT", 1080))),
+            headless: bool = False,
+            require_a11y_tree: bool = True,
+            require_terminal: bool = False,
+            os_type: str = "Ubuntu",
+            enable_proxy: bool = False,
+            client_password: str = "",
+    ):
+        """
+        Args:
+            provider_name (str): virtualization provider name, default to "vmware"
+            region (str): the region for allocate machines, work for cloud services, default to  "us-east-1"
+            path_to_vm (str): path to .vmx file
+            snapshot_name (str): snapshot name to revert to, default to "init_state"
+            action_space (str): "computer_13" | "pyautogui"
+            cache_dir (str): cache directory to cache task-related stuffs like
+              reference file for evaluation
+            screen_size (Tuple[int]): screen size of the VM
+            headless (bool): whether to run the VM in headless mode
+            require_a11y_tree (bool): whether to require accessibility tree
+            require_terminal (bool): whether to require terminal output
+            os_type (str): operating system type, default to "Ubuntu"
+            enable_proxy (bool): whether to enable proxy support, default to False
+        """
+        # Initialize VM manager and vitualization provider
+        self.region = region
+        self.provider_name = provider_name
+        self.enable_proxy = enable_proxy  # Store proxy enablement setting
+        if client_password == "":
+            if self.provider_name == "aws":
+                self.client_password = "osworld-public-evaluation"
+            else:
+                self.client_password = "password"
+        else:
+            self.client_password = client_password
+        self.screen_width = screen_size[0]
+        self.screen_height = screen_size[1]
+        # Default
+        self.server_port = 5000
+        self.chromium_port = 9222
+        self.vnc_port = 8006
+        self.vlc_port = 8080
+        # Initialize with default (no proxy) provider
+        self.current_use_proxy = False
+        self.manager, self.provider = None, None
+        self.os_type = os_type
+        self.path_to_vm = path_to_vm
+        # Track whether environment has been used (step/setup) to optimize snapshot revert
+        # docker, aws, gcp, azure are always unused as the emulator starts from a clean state
+        # vmware, virtualbox are always used as the emulator starts from a dirty state
+        if self.provider_name in {"docker", "aws", "gcp", "azure", "aliyun", "volcengine"}:
+            self.is_environment_used = False
+        elif self.provider_name in {"vmware", "virtualbox"}:
+            self.is_environment_used = True
+        else:
+            raise ValueError(f"Invalid provider name: {self.provider_name}")
+        self.snapshot_name = snapshot_name
+        self.cache_dir_base: str = cache_dir
+        self.headless = headless
+        self.require_a11y_tree = require_a11y_tree
+        self.require_terminal = require_terminal
+        # mode: human or machine
+        self.instruction = None
+        assert action_space in ["computer_13", "pyautogui", "claude_computer_use", "autoglm_computer_use"]
+        self.action_space = action_space  # todo: refactor it to the ActType
+        # episodic stuffs, like counters, will be updated or reset
+        # when calling self.reset()
+        self._traj_no: int = -1
+        self._step_no: int = 0
+        self.action_history: List[Dict[str, any]] = []
+    def start(self):
+        # Initialize emulator and controller
+        if not self.manager and not self.provider:
+            logger.info("Initializing...")
+            self.manager, self.provider = create_vm_manager_and_provider(self.provider_name, self.region, use_proxy=False)
+            if self.path_to_vm:
+                self.path_to_vm = os.path.abspath(os.path.expandvars(os.path.expanduser(self.path_to_vm))) \
+                    if self.provider_name in {"vmware", "virtualbox"} else self.path_to_vm
+            else:
+                self.path_to_vm = self.manager.get_vm_path(os_type=self.os_type, region=self.region, screen_size=(self.screen_width, self.screen_height))
+            self._start_emulator()
+    def _start_emulator(self):
+        try:
+            # Power on the virtual machine
+            self.provider.start_emulator(self.path_to_vm, self.headless, self.os_type)
+            # Get the ip from the virtual machine, and setup the controller
+            vm_ip_ports = self.provider.get_ip_address(self.path_to_vm).split(':')
+            self.vm_ip = vm_ip_ports[0]
+            # Get the ports from the virtual machine (for Docker provider only)
+            if len(vm_ip_ports) > 1:
+                self.server_port = int(vm_ip_ports[1])
+                self.chromium_port = int(vm_ip_ports[2])
+                self.vnc_port = int(vm_ip_ports[3])
+                self.vlc_port = int(vm_ip_ports[4])
+            self.controller = PythonController(vm_ip=self.vm_ip, server_port=self.server_port)
+            self.setup_controller = SetupController(vm_ip=self.vm_ip, server_port=self.server_port, chromium_port=self.chromium_port, vlc_port=self.vlc_port, cache_dir=self.cache_dir_base, client_password=self.client_password, screen_width=self.screen_width, screen_height=self.screen_height)
+        except Exception as e:
+            try:
+                self.provider.stop_emulator(self.path_to_vm)
+            except Exception as stop_err:
+                logger.warning(f"Cleanup after interrupt failed: {stop_err}")
+            raise
+    def _revert_to_snapshot(self):
+        # Revert to certain snapshot of the virtual machine, and refresh the path to vm and ip of vm
+        # due to the fact it could be changed when implemented by cloud services
+        path_to_vm = self.provider.revert_to_snapshot(self.path_to_vm, self.snapshot_name)
+        if path_to_vm and not path_to_vm == self.path_to_vm:
+            # path_to_vm has to be a new path
+            self.manager.delete_vm(self.path_to_vm, self.region)
+            self.manager.add_vm(path_to_vm, self.region)
+            self.manager.occupy_vm(path_to_vm, os.getpid(), self.region)
+            self.path_to_vm = path_to_vm
+    def _save_state(self, snapshot_name=None):
+        # Save the current virtual machine state to a certain snapshot name
+        self.provider.save_state(self.path_to_vm, snapshot_name)
+    def close(self):
+        # Close (release) the virtual machine
+        self.provider.stop_emulator(self.path_to_vm)
+    def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None) -> Dict[str, Any]:
+        # Reset to certain task in OSWorld
+        logger.info("Resetting environment...")
+        logger.info("Switching task...")
+        logger.info("Setting counters...")
+        self._traj_no += 1
+        self._step_no = 0
+        self.action_history.clear()
+        for attempt in range(MAX_RETRIES):
+            # Only revert to snapshot if environment has been used (step/setup)
+            # This optimization is especially important for cloud providers like AWS
+            # where unnecessary snapshot operations are costly and time-consuming
+            if task_config is not None:
+                # Only consider task proxy requirement if proxy is enabled at system level
+                task_use_proxy = task_config.get("proxy", False) and self.enable_proxy
+                if not self.enable_proxy and task_config.get("proxy", False):
+                    logger.info("Task requires proxy but proxy is disabled at system level, ignoring proxy requirement.")
+                if task_use_proxy != self.current_use_proxy:
+                    # keep because get_info_from_website depend on this
+                    self.current_use_proxy = task_use_proxy
+            if self.is_environment_used:
+                logger.info("Environment has been used, reverting to snapshot {}...".format(self.snapshot_name))
+                self._revert_to_snapshot()
+                logger.info("Starting emulator...")
+                self._start_emulator()
+                logger.info("Emulator started.")
+                # Reset the usage flag after reverting
+                self.is_environment_used = False
+            else:
+                logger.info("Environment is clean, skipping snapshot revert (provider: {}).".format(self.provider_name))
+            if task_config is not None:
+                if task_config.get("proxy", False) and self.enable_proxy:
+                    # If using proxy and proxy is enabled, set up the proxy configuration
+                    self.setup_controller._proxy_setup(self.client_password)
+                self._set_task_info(task_config)
+                self.setup_controller.reset_cache_dir(self.cache_dir)
+                logger.info("Setting up environment...")
+                success = self.setup_controller.setup(self.config, task_config.get("proxy", False) and self.enable_proxy)
+                if success:
+                    # Mark environment as used when setup is successfully executed
+                    if self.config:  # Only mark as used if there were actual setup operations
+                        self.is_environment_used = True
+                    break
+                else:
+                    logger.error(
+                        "Environment setup failed, retrying (%d/%d)...",
+                        attempt + 1,
+                        MAX_RETRIES,
+                    )
+                    time.sleep(5)
+            else:
+                break
+        logger.info("Environment setup complete.")
+        observation = self._get_obs()
+        return observation
+    def _get_obs(self):
+        # We provide screenshot, accessibility_tree (optional), terminal (optional), and instruction.
+        # can be customized and scaled
+        return {
+            "screenshot": self.controller.get_screenshot(),
+            "accessibility_tree": self.controller.get_accessibility_tree() if self.require_a11y_tree else None,
+            "terminal": self.controller.get_terminal_output() if self.require_terminal else None,
+            "instruction": self.instruction
+        }
+    @property
+    def vm_platform(self):
+        return self.controller.get_vm_platform()
+    @property
+    def vm_screen_size(self):
+        return self.controller.get_vm_screen_size()
+    def _set_task_info(self, task_config: Dict[str, Any]):
+        """Set task info (proxy logic is handled in reset method)"""
+        self.task_id: str = task_config["id"]
+        self.cache_dir: str = os.path.join(self.cache_dir_base, self.task_id)
+        os.makedirs(self.cache_dir, exist_ok=True)
+        self.instruction = task_config["instruction"]
+        self.config = task_config["config"] if "config" in task_config else []
+        self._set_evaluator_info(task_config)
+    def _set_evaluator_info(self, task_config: Dict[str, Any]):
+        """Set evaluator information from task config"""
+        if "evaluator" not in task_config:
+            return
+        # evaluator dict
+        # func -> metric function string, or list of metric function strings
+        # conj -> conjunction of multiple metrics if func is a list with length > 1, "and"/"or"
+        # result -> result getter config, or list of result getter configs
+        # expected (optional) -> expected getter config, or list of expected getter configs
+        # options (optional) -> metric options, or list of metric options
+        # if func is a str list, then result, expected (if exists), options (if exists) should also be lists of the same length
+        # even if one of the metrics does not need expected or options field, it should be included in the list with None
+        self.evaluator = task_config["evaluator"]
+        self.metric: Metric = [getattr(metrics, func) for func in self.evaluator["func"]] \
+            if isinstance(self.evaluator["func"], list) \
+            else getattr(metrics, self.evaluator["func"])
+        self.metric_conj: str = self.evaluator.get("conj", "and")  # take conjunction of multiple metrics
+        if "result" in self.evaluator and len(self.evaluator["result"]) > 0:
+            self.result_getter: Getter = [getattr(getters, "get_{:}".format(res["type"])) for res in
+                                          self.evaluator["result"]] \
+                if isinstance(self.evaluator["result"], list) \
+                else getattr(getters, "get_{:}".format(self.evaluator["result"]["type"]))
+        else:
+            self.result_getter = [None] * len(self.metric) \
+                if isinstance(self.metric, list) \
+                else None
+        if "expected" in self.evaluator and len(self.evaluator["expected"]) > 0:
+            self.expected_getter: Getter = [getattr(getters, "get_{:}".format(exp["type"])) if exp else None for exp in
+                                            self.evaluator["expected"]] \
+                if isinstance(self.evaluator["expected"], list) \
+                else getattr(getters, "get_{:}".format(self.evaluator["expected"]["type"]))
+        else:
+            self.expected_getter = [None] * len(self.metric) \
+                if isinstance(self.metric, list) \
+                else None
+        self.metric_options: Union[List[Dict[str, Any]], Dict[str, Any]] = [opt if opt else {} for opt in
+                                                                            self.evaluator["options"]] \
+            if isinstance(self.evaluator.get("options", {}), list) \
+            else self.evaluator["options"] \
+            if "options" in self.evaluator \
+            else [{}] * len(self.metric) \
+            if isinstance(self.metric, list) \
+            else {}
+        assert (not isinstance(self.evaluator["func"], list)
+                or (len(self.metric) == len(self.result_getter) == len(self.expected_getter) == len(
+                    self.metric_options)))
+    def step(self, action, pause=2):
+        self._step_no += 1
+        self.action_history.append(action)
+        # Mark environment as used when step is called
+        self.is_environment_used = True
+        reward = 0  # todo: Define reward calculation for each example
+        done = False  # todo: Define episode termination condition for each example
+        info = {}
+        logger.info(f"Step {self._step_no} in trajectory {self._traj_no} with action: {action}")
+        # handle the special actions
+        if action in ['WAIT', 'FAIL', 'DONE'] or (type(action) == dict and action['action_type'] in ['WAIT', 'FAIL', 'DONE']):
+            if action == 'WAIT' or (type(action) == dict and action.get('action_type') == 'WAIT'):
+                time.sleep(pause)
+            elif action == 'FAIL' or (type(action) == dict and action.get('action_type') == 'FAIL'):
+                done = True
+                info = {"fail": True}
+            elif action == 'DONE' or (type(action) == dict and action.get('action_type') == 'DONE'):
+                done = True
+                info = {"done": True}
+        if self.action_space == "computer_13":
+            # the set of all possible actions defined in the action representation
+            self.controller.execute_action(action)
+        elif self.action_space == "pyautogui" or self.action_space == "claude_computer_use":
+            if action in ['WAIT', 'FAIL', 'DONE'] or (type(action) == dict and action.get('action_type') in ['WAIT', 'FAIL', 'DONE']):
+                self.controller.execute_action(action)
+            else:
+                # the set of all possible python commands insides `pyautogui`
+                if type(action) == str:
+                    # Fix PyAutoGUI '<' character bug before execution
+                    fixed_command = _fix_pyautogui_less_than_bug(action)
+                    self.controller.execute_python_command(fixed_command)
+                elif type(action) == dict:
+                    # Fix PyAutoGUI '<' character bug before execution
+                    fixed_command = _fix_pyautogui_less_than_bug(action['command'])
+                    self.controller.execute_python_command(fixed_command)
+        time.sleep(pause)
+        observation = self._get_obs()
+        return observation, reward, done, info
+    def evaluate(self):
+        """
+        Evaluate whether the task is successfully completed.
+        """
+        postconfig = self.evaluator.get("postconfig", [])
+        self.setup_controller.setup(postconfig, self.enable_proxy)
+        # Mark environment as used if there were postconfig setup operations
+        if postconfig:
+            self.is_environment_used = True
+        if self.evaluator['func'] == "infeasible":
+            if len(self.action_history) > 0:
+                last_action = self.action_history[-1]
+                if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'):
+                    return 1
+            return 0
+        else:
+            if len(self.action_history) > 0:
+                last_action = self.action_history[-1]
+                if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'):
+                    return 0
+        if type(self.metric) == list:
+            # Multiple metrics to evaluate whether the task is successfully completed
+            results = []
+            assert len(self.metric) == len(self.result_getter), "The number of metrics and result getters must be the same"
+            if "expected" in self.evaluator:
+                assert len(self.metric) == len(self.expected_getter), "The number of metrics and expected getters must be the same"
+            for idx, metric in enumerate(self.metric):
+                try:
+                    config = self.evaluator["result"][idx]
+                    result_state = self.result_getter[idx](self, config)
+                except FileNotFoundError:
+                    logger.error("File not found!")
+                    if self.metric_conj == 'and':
+                        return 0
+                if "expected" in self.evaluator and self.expected_getter and self.evaluator["expected"]:
+                    expected_state = self.expected_getter[idx](self, self.evaluator["expected"][idx])
+                    metric: int = metric(result_state, expected_state, **self.metric_options[idx])
+                else:
+                    metric: int = metric(result_state, **self.metric_options[idx])
+                if self.metric_conj == 'and' and float(metric) == 0.0:
+                    return 0
+                elif self.metric_conj == 'or' and float(metric) == 1.0:
+                    return 1
+                else:
+                    results.append(metric)
+            return sum(results) / len(results) if self.metric_conj == 'and' else max(results)
+        else:
+            # Single metric to evaluate whether the task is successfully completed
+            try:
+                result_state = self.result_getter(self, self.evaluator["result"])
+            except FileNotFoundError:
+                logger.error("File not found!")
+                return 0
+            if "expected" in self.evaluator and self.expected_getter and self.evaluator["expected"]:
+                expected_state = self.expected_getter(self, self.evaluator["expected"])
+                metric: float = self.metric(result_state, expected_state, **self.metric_options)
+            else:
+                metric: float = self.metric(result_state, **self.metric_options)
+        return metric
+    def render(self, mode='rgb_array'):
+        if mode == 'rgb_array':
+            return self.controller.get_screenshot()
+        else:
+            raise ValueError('Unsupported render mode: {}'.format(mode))

desktop_env/evaluators/README.md ADDED Viewed

	@@ -0,0 +1,224 @@

+# Evaluator Setup Details
+This document describes how to set up the evaluators in the desktop environment, for those who need the details in order to customize or extend the evaluation.
+## Overall
+Inside the virtual machine, disable the system crash report by:
+```
+sudo vim /etc/default/apport
+```
+and then change the `enabled` to `0`.
+## VSCode
+todo
+## LibreOffice
+For LibreOffice, open the app first, and then disable the pop-up dialog that appears when saving with `Ctrl + S`.
+## LibreOffice Impress
+### Setting Up the python-pptx Library
+```shell
+pip install python-pptx
+```
+## LibreOffice Writer
+### Setting Up the python-docx and odfpy Library
+```shell
+pip install python-docx
+pip install odfpy
+```
+## LibreOffice Calc
+### Required Libraries
+```
+openpyxl
+pandas
+lxml
+xmltodict
+```
+### How to Generate CSV from XLSX
+```sh
+libreoffice --convert-to "csv:Text - txt - csv (StarCalc):44,34,UTF8,,,,false,true,true,false,false,1" --out-dir /home/user /home/user/abc.xlsx
+```
+This command will generate `abc-Sheet1.csv` under `/home/user`. The last `1` in
+the conversion options indicates the sheet number (starting from 1) to export.
+For detailed usage, refer to [CSV Filter
+Options](https://help.libreoffice.org/latest/ro/text/shared/guide/csv_params.html).
+Refer to `libreoffice_calc/21df9241-f8d7-4509-b7f1-37e501a823f7.json` for an
+example.
+### About `compare_table`
+Evaluation of xlsx files mainly relies on `compare_table`. It accepts two file
+names and a list of rules defined as `options`. Refer to
+`libreoffice_calc/21df9241-f8d7-4509-b7f1-37e501a823f7.json` for an example.
+In each rule, there is a required field `type`. The supported types are defined
+in `compare_table` function. The most common two are `sheet_data` and
+`sheet_print`. `sheet_data` compares the internal cell values through pandas,
+while `sheet_print` compares the shown cell values through csv. A csv should be
+generated and downloaded for `sheet_print`. See the previous section and
+example in `libreoffice_calc/21df9241-f8d7-4509-b7f1-37e501a823f7.json`.
+Other fields in a rule are described for each evaluation type in
+`compare_table` function. `sheet_idx0` (or `sheet_idx1`, `sheet_idx`) is a
+common field to indicate which sheet is to be extracted from the workbook. If an
+integer i is given, then it extracts the i-th sheet from result xlsx (i starts
+from 0). If a string is given, it should be prefixed with "RI", "RN", "EI", or
+"EN". "R" indicates to extract from result xlsx while "E" indicates to extract
+from expected (golden) xlsx. "I" indicates a sheet number (starting from 0) and
+"N" indicates a sheet name (usually, they're like "Sheet1", "Sheet2", ...).
+Some rules use a structure like `{"method": "eq", "ref": "abc"}`. These rules
+are checked through `utils._match_value_to_rule` function. Check it for the
+implemented matching methods.
+## Chrome
+### Starting Chrome with Remote Debugging for Python
+To enable remote debugging in Chrome, which allows tools like Playwright for Python to connect to and control an existing Chrome instance, follow these steps:
+#### Manually Enabling Remote Debugging in Chrome
+1. **Locate the Chrome Shortcut**:
+   - Find the Chrome shortcut that you usually use to open the browser. This could be on your desktop, start menu, or taskbar.
+2. **Edit Shortcut Properties**:
+   - Right-click on the Chrome shortcut and select `Properties`.
+3. **Modify the Target Field**:
+   - In the `Target` field, add `--remote-debugging-port=9222` at the end of the path. Ensure there is a space between the path and the flag you add.
+   - It should look something like this: `"C:\Path\To\Chrome.exe" --remote-debugging-port=9222`.
+4. **Apply and Close**:
+   - Click `Apply` and then `OK` to close the dialog.
+5. **Start Chrome**:
+   - Use this modified shortcut to start Chrome. Chrome will now start with remote debugging enabled on port 9222.
+6. **Confirm Remote Debugging**:
+   - Open a browser and navigate to `http://localhost:9222`. If you see a webpage with information about active tabs, remote debugging is working.
+---
+### Setting Up Playwright for Python
+Playwright for Python is a browser automation library to control Chromium, Firefox, and WebKit with a single API.
+#### Installing Playwright
+- Ensure you have Python installed on your system. If not, download and install it from the [Python official website](https://www.python.org/).
+- Install Playwright using pip (Python Package Installer). Open a command line or terminal and run:
+  ```bash
+  pip install playwright
+  ```
+- After installing Playwright, you need to run the install command to download the necessary browser binaries:
+  ```bash
+  playwright install
+  ```
+#### Writing a Playwright Script in Python
+- Create a Python file for your automation script.
+- Import the Playwright module at the beginning of your script:
+  ```python
+  from playwright.sync_api import sync_playwright
+  ```
+- You can now use Playwright's API to control browsers.
+#### Example Playwright Script
+Here is a simple example to open a page using Playwright:
+```python
+from playwright.sync_api import sync_playwright
+def run(playwright):
+    browser = playwright.chromium.launch()
+    page = browser.new_page()
+    page.goto("http://example.com")
+    ## other actions...
+    browser.close()
+with sync_playwright() as playwright:
+    run(playwright)
+```
+- This script launches Chromium, opens a new page, navigates to `example.com`, and then closes the browser.
+#### Troubleshooting
+- If you encounter issues with Playwright, ensure that your Python environment is correctly set up and that you have installed Playwright and its dependencies correctly.
+- For detailed documentation, visit the [Playwright for Python Documentation](https://playwright.dev/python/docs/intro).
+## VLC Media Player
+### Bug fix
+On Ubuntu, one extra step is needed: open `Media` > `Convert/Save`, select the files, and click `Convert/Save`.
+Then open the `Audio - MP3` profile and, in its audio codec section, change the codec from "MP3" to "MPEG Audio".
+Otherwise the mp3 file will be created but with 0 bytes. This is a bug in VLC.
+### Setting Up VLC's HTTP Interface
+To enable and use the HTTP interface in VLC Media Player for remote control and status checks, follow these steps:
+#### 1. Open VLC Preferences
+- Open VLC Media Player.
+- Go to `Tools` > `Preferences` from the menu.
+#### 2. Show All Settings
+- In the Preferences window, at the bottom left corner, select `All` under `Show settings` to display advanced settings.
+#### 3. Enable Main Interfaces
+- In the advanced preferences, expand the `Interface` section.
+- Click on `Main interfaces`.
+- Check the box for `Web` to enable the HTTP interface.
+#### 4. Configure Lua HTTP
+- Expand the `Main interfaces` node and select `Lua`.
+- Under `Lua HTTP`, set the password to `password`. This password will be required to access the HTTP interface.
+#### 5. Save and Restart VLC
+- Click `Save` to apply the changes.
+- Restart VLC Media Player for the changes to take effect.
+#### 6. Accessing the HTTP Interface
+- Open a web browser and go to `http://localhost:8080`.
+- You will be prompted for a password. Enter the password you set in the Lua HTTP settings.
+- Once logged in, you will have access to VLC's HTTP interface for remote control.
+#### Packages
+```bash
+pip install opencv-python-headless Pillow imagehash
+```
+#### Troubleshooting
+- If you cannot access the HTTP interface, check if your firewall or security software is blocking the connection.
+- Ensure VLC is running and the correct port (default is 8080) is being used.
+- If the port is in use by another application, you may change the port number in VLC's settings.
+## GIMP
+Click "Keep" in the image-loading pop-up.

desktop_env/evaluators/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+#from .table import compare_table
+#eval_funcs = {
+    #"compare_table(expected, actual)": compare_table
+#}