AUXteam committed on
Commit
6e38ce1
·
verified ·
1 Parent(s): 8218306

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +8 -0
  2. .gitignore +213 -0
  3. .hfignore +11 -0
  4. .python-version +1 -0
  5. CODE_OF_CONDUCT.md +9 -0
  6. CONTRIBUTING.md +87 -0
  7. Dockerfile +49 -0
  8. LICENSE +21 -0
  9. README.md +463 -4
  10. SECURITY.md +41 -0
  11. SUPPORT.md +14 -0
  12. TRANSPARENCY_NOTE.md +128 -0
  13. TROUBLESHOOTING.md +133 -0
  14. docs/img/magenticui.jpg +0 -0
  15. docs/img/magenticui_running.png +3 -0
  16. docs/img/magui-actionguard.png +3 -0
  17. docs/img/magui-coplanning.png +3 -0
  18. docs/img/magui-cotasking.png +3 -0
  19. docs/img/magui-landing.png +3 -0
  20. docs/img/magui-readme-logo.png +3 -0
  21. docs/img/magui-readme-logo.svg +79 -0
  22. docs/index.html +141 -0
  23. docs/tutorials/web_agent_tutorial_full.ipynb +1782 -0
  24. experiments/endpoint_configs/.gitignore +3 -0
  25. experiments/endpoint_configs/config_template.yaml +15 -0
  26. experiments/endpoint_configs/test_client.py +31 -0
  27. experiments/eval/.gitignore +2 -0
  28. experiments/eval/README.md +75 -0
  29. experiments/eval/analyze_sim_user.py +257 -0
  30. experiments/eval/explore_results.py +202 -0
  31. experiments/eval/plot_results.py +158 -0
  32. experiments/eval/prepare_for_submission.py +128 -0
  33. experiments/eval/run.py +276 -0
  34. experiments/eval/sample_eval_systems.py +84 -0
  35. experiments/eval/systems/__init__.py +5 -0
  36. experiments/eval/systems/magentic_one_system.py +241 -0
  37. experiments/eval/systems/magentic_ui_sim_user_system.py +484 -0
  38. experiments/eval/systems/magentic_ui_system.py +328 -0
  39. fara_config.yaml +20 -0
  40. frontend/.env.default +1 -0
  41. frontend/.gitignore +6 -0
  42. frontend/README.md +32 -0
  43. frontend/gatsby-browser.js +6 -0
  44. frontend/gatsby-config.ts +59 -0
  45. frontend/gatsby-ssr.tsx +16 -0
  46. frontend/package.json +72 -0
  47. frontend/postcss.config.js +6 -0
  48. frontend/src/assets/logo.svg +29 -0
  49. frontend/src/components/common/AutoResizeTextarea.tsx +117 -0
  50. frontend/src/components/common/Button.tsx +96 -0
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ docs/img/magenticui_running.png filter=lfs diff=lfs merge=lfs -text
37
+ docs/img/magui-actionguard.png filter=lfs diff=lfs merge=lfs -text
38
+ docs/img/magui-coplanning.png filter=lfs diff=lfs merge=lfs -text
39
+ docs/img/magui-cotasking.png filter=lfs diff=lfs merge=lfs -text
40
+ docs/img/magui-landing.png filter=lfs diff=lfs merge=lfs -text
41
+ docs/img/magui-readme-logo.png filter=lfs diff=lfs merge=lfs -text
42
+ frontend/src/styles/Open_Sans/OpenSans-Italic-VariableFont_wdth,wght.ttf filter=lfs diff=lfs merge=lfs -text
43
+ frontend/src/styles/Open_Sans/OpenSans-VariableFont_wdth,wght.ttf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .vscode
2
+
3
+ # Byte-compiled / optimized / DLL files
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+
8
+ # C extensions
9
+ *.so
10
+
11
+ # Distribution / packaging
12
+ .Python
13
+ build/
14
+ develop-eggs/
15
+ dist/
16
+ downloads/
17
+ eggs/
18
+ .eggs/
19
+ lib/
20
+ lib64/
21
+ parts/
22
+ sdist/
23
+ var/
24
+ wheels/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # PyInstaller
32
+ # Usually these files are written by a python script from a template
33
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
34
+ *.manifest
35
+ *.spec
36
+
37
+ # Installer logs
38
+ pip-log.txt
39
+ pip-delete-this-directory.txt
40
+
41
+ # Unit test / coverage reports
42
+ htmlcov/
43
+ .tox/
44
+ .nox/
45
+ .coverage
46
+ .coverage.*
47
+ .cache
48
+ nosetests.xml
49
+ coverage.xml
50
+ *.cover
51
+ *.py,cover
52
+ .hypothesis/
53
+ .pytest_cache/
54
+ cover/
55
+
56
+ # Translations
57
+ *.mo
58
+ *.pot
59
+
60
+ # Django stuff:
61
+ *.log
62
+ local_settings.py
63
+ db.sqlite3
64
+ db.sqlite3-journal
65
+
66
+ # Flask stuff:
67
+ instance/
68
+ .webassets-cache
69
+
70
+ # Scrapy stuff:
71
+ .scrapy
72
+
73
+ # Sphinx documentation
74
+ docs/_build/
75
+
76
+ # PyBuilder
77
+ .pybuilder/
78
+ target/
79
+
80
+ # Jupyter Notebook
81
+ .ipynb_checkpoints
82
+
83
+ # IPython
84
+ profile_default/
85
+ ipython_config.py
86
+
87
+ # pyenv
88
+ # For a library or package, you might want to ignore these files since the code is
89
+ # intended to run in multiple environments; otherwise, check them in:
90
+ # .python-version
91
+
92
+ # pipenv
93
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
95
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
96
+ # install all needed dependencies.
97
+ #Pipfile.lock
98
+
99
+ # poetry
100
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
102
+ # commonly ignored for libraries.
103
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104
+ #poetry.lock
105
+
106
+ # pdm
107
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108
+ #pdm.lock
109
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110
+ # in version control.
111
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
112
+ .pdm.toml
113
+ .pdm-python
114
+ .pdm-build/
115
+
116
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117
+ __pypackages__/
118
+
119
+ # Celery stuff
120
+ celerybeat-schedule
121
+ celerybeat.pid
122
+
123
+ # SageMath parsed files
124
+ *.sage.py
125
+
126
+ # Environments
127
+ .env
128
+ .venv
129
+ env/
130
+ venv/
131
+ ENV/
132
+ env.bak/
133
+ venv.bak/
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ #.idea/
165
+ scratch.py
166
+
167
+ .DS_Store
168
+ .magentic_ui_state.json
169
+ debug
170
+
171
+ data
172
+ runs
173
+
174
+ node_modules
175
+
176
+
177
+ # Autogen Studio
178
+ database.sqlite
179
+ .cache/*
180
+ src/magentic_ui/backend/web/files/user/*
181
+ src/magentic_ui/backend/test
182
+ src/magentic_ui/backend/database/alembic.ini
183
+ src/magentic_ui/backend/database/alembic/*
184
+ src/magentic_ui/backend/web/files/ui/*
185
+ OAI_CONFIG_LIST
186
+ scratch/
187
+ src/magentic_ui/backend/web/workdir/*
188
+ src/magentic_ui/backend/web/ui/*
189
+ src/magentic_ui/backend/web/skills/user/*
190
+ .release.sh
191
+ .nightly.sh
192
+ notebooks/test
193
+
194
+ notebooks/work_dir/*
195
+ notebooks/test.db
196
+
197
+ # Byte-compiled / optimized / DLL files
198
+ __pycache__/
199
+ *.py[cod]
200
+ *$py.class
201
+
202
+ # Environments
203
+ .env
204
+ .venv
205
+ env/
206
+ venv/
207
+ ENV/
208
+ env.bak/
209
+ venv.bak/
210
+
211
+ # Task centric memory related db and logs
212
+ **/memory_bank/
213
+ **/pagelogs/
.hfignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git/
2
+ .github/
3
+ frontend/node_modules/
4
+ .venv/
5
+ __pycache__/
6
+ *.pyc
7
+ *.pyo
8
+ *.pyd
9
+ .db
10
+ .cache/
11
+ public/
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Microsoft Open Source Code of Conduct
2
+
3
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4
+
5
+ Resources:
6
+
7
+ - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8
+ - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9
+ - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
CONTRIBUTING.md ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to Magentic-UI
2
+
3
+ Thank you for your interest in contributing to Magentic-UI!
4
+
5
+ We welcome all contributions - whether it’s bug reports, feature requests, code, documentation, or helping others with their questions.
6
+
7
+ ## Code of Conduct
8
+
9
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
10
+ For more information, see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
11
+
12
+ ## Contributor License Agreement (CLA)
13
+
14
+ Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution.
15
+ For details, visit [https://opensource.microsoft.com/pdf/microsoft-contribution-license-agreement.pdf](https://opensource.microsoft.com/pdf/microsoft-contribution-license-agreement.pdf).
16
+
17
+ When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status check, comment).
18
+ Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
19
+
20
+
21
+ ## How to Contribute
22
+
23
+ - **Find an Issue:**
24
+ - Browse [All Issues](https://github.com/microsoft/magentic-ui/issues).
25
+ - Look for issues labeled with <span style="color:green"><strong>help-wanted</strong></span> as these are especially open for community contribution!
26
+ - You can also help review [open PRs](https://github.com/microsoft/magentic-ui/pulls).
27
+
28
+ - **Pick Something to Work On:**
29
+ - See the checklist below for high-priority issues.
30
+ - If you have an idea for a new feature or improvement, feel free to open a new issue for discussion.
31
+
32
+ - **Fork and Clone:**
33
+ - Fork the repository and clone it to your local machine.
34
+
35
+ - **Create a Branch:**
36
+ - Use a descriptive branch name (e.g., `fix/session-bug` or `feature/file-upload`).
37
+
38
+ - **Write Code and Tests:**
39
+ - Please include tests for new features or bug fixes. See the `tests` directory for examples.
40
+
41
+ - **Run Checks Locally:**
42
+ - Before submitting a PR, run:
43
+ ```sh
44
+ poe check
45
+ ```
46
+
47
+ - **Submit a Pull Request:**
48
+ - Open a PR against the `main` branch.
49
+ - Reference the issue number in your PR description (e.g., “Closes #123”).
50
+ - The CLA bot will guide you if you need to sign the CLA.
51
+
52
+
53
+ ## Community “Help Wanted” Issues
54
+
55
+ We use the green <span style="color:green"><strong>help-wanted</strong></span> label to highlight issues that are especially open for community contribution.
56
+ Here are the top 10 issues you can help with right now:
57
+
58
+ - [ ] **Allow MAGUI to understand video and audio** ([#132](https://github.com/microsoft/magentic-ui/issues/132))
59
+ - [ ] **Enable arbitrary file upload in UI** ([#128](https://github.com/microsoft/magentic-ui/issues/128))
60
+ - [ ] **Add streaming of final answer and coder messages** ([#126](https://github.com/microsoft/magentic-ui/issues/126))
61
+ - [ ] **Add unit tests** ([#123](https://github.com/microsoft/magentic-ui/issues/123))
62
+ - [ ] **Allow websurfer to scroll inside containers** ([#124](https://github.com/microsoft/magentic-ui/issues/124))
63
+ - [ ] **Composing multiple plans** ([#129](https://github.com/microsoft/magentic-ui/issues/129))
64
+ - [ ] **Reduce latency** ([#131](https://github.com/microsoft/magentic-ui/issues/131))
65
+ - [ ] **Improve allowed list** ([#125](https://github.com/microsoft/magentic-ui/issues/125))
66
+ - [ ] **Add agent name to step in frontend** ([#110](https://github.com/microsoft/magentic-ui/issues/110))
67
+ - [ ] **Pass auth info for browser sessions** ([#120](https://github.com/microsoft/magentic-ui/issues/120))
68
+
69
+ See [all issues needing help](https://github.com/microsoft/magentic-ui/issues?q=is%3Aissue+is%3Aopen+label%3Ahelp-wanted).
70
+
71
+ ## Reviewing Pull Requests
72
+
73
+ You can also help by reviewing [open PRs](https://github.com/microsoft/magentic-ui/pulls).
74
+
75
+ ## Running Tests and Checks
76
+
77
+ All contributions must pass the continuous integration checks.
78
+ You can run these checks locally before submitting a PR by running:
79
+
80
+ ```bash
81
+ poe check
82
+ ```
83
+
84
+ ## Questions?
85
+
86
+ If you have any questions, open an issue or start a discussion.
87
+ Thank you for helping make Magentic-UI better!
Dockerfile ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use a Python base image
2
+ FROM python:3.12-slim
3
+
4
+ # Set environment variables
5
+ ENV PYTHONUNBUFFERED=1 \
6
+ PYTHONDONTWRITEBYTECODE=1 \
7
+ PATH="/home/user/.local/bin:$PATH" \
8
+ HOME=/home/user
9
+
10
+ # Install system dependencies
11
+ RUN apt-get update && apt-get install -y --no-install-recommends \
12
+ curl \
13
+ git \
14
+ rsync \
15
+ gnupg \
16
+ build-essential \
17
+ && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
18
+ && apt-get install -y nodejs \
19
+ && npm install -g yarn \
20
+ && rm -rf /var/lib/apt/lists/*
21
+
22
+ # Create a non-root user
23
+ RUN useradd -m -u 1000 user
24
+ USER user
25
+ WORKDIR $HOME/app
26
+
27
+ # Install uv
28
+ RUN curl -LsSf https://astral.sh/uv/install.sh | sh
29
+
30
+ # Copy the project files
31
+ COPY --chown=user . .
32
+
33
+ # Build the frontend
34
+ RUN cd frontend && yarn install && yarn build
35
+
36
+ # Install Python dependencies with uv
37
+ RUN $HOME/.local/bin/uv pip install --system .
38
+
39
+ # Install Playwright and its browsers
40
+ RUN $HOME/.local/bin/uv pip install --system playwright && \
41
+ playwright install --with-deps chromium
42
+
43
+ # Expose the HF port
44
+ EXPOSE 7860
45
+
46
+ # Command to run the application
47
+ # We use --run-without-docker to avoid issues with Docker-in-Docker on HF Spaces
48
+ # We also set the host to 0.0.0.0 and port to 7860
49
+ CMD ["magentic-ui", "--port", "7860", "--host", "0.0.0.0", "--run-without-docker"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Microsoft
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,10 +1,469 @@
1
  ---
2
  title: Maxun
3
- emoji: 🐠
4
- colorFrom: gray
5
- colorTo: gray
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Maxun
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
+ app_port: 7860
9
  ---
10
 
11
+ <div align="center">
12
+ <img src="docs/img/magui-readme-logo.svg" alt="Magentic-UI Logo">
13
+
14
+
15
+ _Automate your web tasks while you stay in control_
16
+
17
+ [![image](https://img.shields.io/pypi/v/magentic_ui.svg)](https://pypi.python.org/pypi/magentic_ui)
18
+ [![image](https://img.shields.io/pypi/l/magentic_ui.svg)](https://pypi.python.org/pypi/magentic_ui)
19
+ ![Python Versions](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue)
20
+ [![arXiv](https://img.shields.io/badge/arXiv-2507.22358-b31b1b.svg)](https://arxiv.org/abs/2507.22358)
21
+
22
+ </div>
23
+
24
+ ---
25
+
26
+ Magentic-UI is a **research prototype** human-centered AI agent that solves complex web and coding tasks that may require monitoring. Unlike other black-box agents, the system reveals its plan before execution, lets you guide its actions, and requests approval for sensitive operations while browsing websites, executing code, and analyzing files.
27
+ *Check out the [demo section](#demos) for inspiration on what tasks you can accomplish.*
28
+
29
+ ## ✨ What's New
30
+
31
+ Microsoft's latest agentic model [Fara-7B](https://www.microsoft.com/en-us/research/blog/fara-7b-an-efficient-agentic-model-for-computer-use/) is now integrated into Magentic-UI; read how to launch it in the <a href="#fara-7b">Fara-7B guide</a>
32
+
33
+
34
+ - **"Tell me When"**: Automate monitoring tasks and repeatable workflows that require web or API access that span minutes to days. *Learn more [here](https://www.microsoft.com/en-us/research/blog/tell-me-when-building-agents-that-can-wait-monitor-and-act/).*
35
+ - **File Upload Support**: Upload any file through the UI for analysis or modification
36
+ - **MCP Agents**: Extend capabilities with your favorite MCP servers
37
+ - **Easier Installation**: We have uploaded our docker containers to GHCR so you no longer need to build any containers! Installation time now is much quicker.
38
+
39
+
40
+ ## 🚀 Quick Start
41
+
42
+ Here's how you can get started with Magentic-UI:
43
+
44
+ ```bash
45
+ # 1. Setup environment
46
+ python3 -m venv .venv
47
+ source .venv/bin/activate
48
+ pip install magentic-ui --upgrade
49
+
50
+ # 2. Set your API key
51
+ export OPENAI_API_KEY="your-api-key-here"
52
+
53
+ # 3. Launch Magentic-UI
54
+ magentic-ui --port 8081
55
+ ```
56
+
57
+ Then open <http://localhost:8081> in your browser to interact with Magentic-UI!
58
+
59
+ > **Prerequisites**: Requires Docker and Python 3.10+. Windows users should use WSL2. See [detailed installation](#️-installation) for more info.
60
+
61
+ ## Alternative Usage Options
62
+
63
+ **Without Docker** (limited functionality: no code execution):
64
+ ```bash
65
+ magentic-ui --run-without-docker --port 8081
66
+ ```
67
+
68
+ **Command Line Interface**:
69
+ ```bash
70
+ magentic-cli --work-dir PATH/TO/STORE/DATA
71
+ ```
72
+
73
+ **Custom LLM Clients**:
74
+ ```bash
75
+ # Azure
76
+ pip install magentic-ui[azure]
77
+
78
+ # Ollama (local models)
79
+ pip install magentic-ui[ollama]
80
+ ```
81
+
82
+ You can then pass a config file to the `magentic-ui` command (<a href="#model-client-configuration"> client config</a>) or change the model client inside the UI settings.
83
+
84
+ For further details on installation please read the <a href="#️-installation">🛠️ Installation</a> section. For common installation issues and their solutions, please refer to the [troubleshooting document](TROUBLESHOOTING.md). See advanced usage instructions with the command `magentic-ui --help`.
85
+
86
+ ## Quick Navigation:
87
+ <p align="center">
88
+ <a href="#demos">🎬 Demos</a> &nbsp;|&nbsp;
89
+ <a href="#how-it-works">🟪 How it Works</a> &nbsp;|&nbsp;
90
+ <a href="#installation">🛠️ Installation</a> &nbsp;|&nbsp;
91
+ <a href="#troubleshooting">⚠️ Troubleshooting</a> &nbsp;|&nbsp;
92
+ <a href="#contributing">🤝 Contributing</a> &nbsp;|&nbsp;
93
+ <a href="#license">📄 License</a>
94
+ </p>
95
+
96
+ ---
97
+
98
+ ## Demos
99
+
100
+ <table>
101
+ <tr>
102
+ <td width="33%" align="center">
103
+
104
+ **🍕 Pizza Ordering**
105
+ *Web automation with human-in-the-loop*
106
+
107
+ <video src="https://github.com/user-attachments/assets/dc95cf5f-c4b4-4fe0-b708-158ff071e5a9" width="100%" style="max-height: 300px;">
108
+ </video>
109
+
110
+ </td>
111
+ <td width="33%" align="center">
112
+
113
+ **🏠 Airbnb Price Analysis**
114
+ *MCP agent integration*
115
+
116
+ <video src="https://github.com/user-attachments/assets/c19ed8c2-e06f-43b7-bee3-5e2ffc4c5e02" width="100%" style="max-height: 300px;">
117
+ </video>
118
+
119
+ </td>
120
+ <td width="33%" align="center">
121
+
122
+ **⭐ Star Monitoring**
123
+ *Long-running monitoring task*
124
+
125
+ <video src="https://github.com/user-attachments/assets/d2a463ca-7a94-4414-932d-a69f30fff63b" width="100%" style="max-height: 300px;">
126
+ </video>
127
+
128
+ </td>
129
+ </tr>
130
+ </table>
131
+
132
+
133
+
134
+ ## How it Works
135
+ <p align="center">
136
+ <img src="./docs/img/magenticui_running.png" alt="Magentic-UI" height="400">
137
+ </p>
138
+
139
+ Magentic-UI is especially useful for web tasks that require actions on the web (e.g., filling a form, customizing a food order), deep navigation through websites not indexed by search engines (e.g., filtering flights, finding a link from a personal site) or tasks that need web navigation and code execution (e.g., generate a chart from online data).
140
+
141
+ What differentiates Magentic-UI from other browser use offerings is its transparent and controllable interface that allows for efficient human-in-the-loop involvement. Magentic-UI is built using [AutoGen](https://github.com/microsoft/autogen) and provides a platform to study human-agent interaction and experiment with web agents. Key features include:
142
+
143
+ - 🧑‍🤝‍🧑 **Co-Planning**: Collaboratively create and approve step-by-step plans using chat and the plan editor.
144
+ - 🤝 **Co-Tasking**: Interrupt and guide the task execution using the web browser directly or through chat. Magentic-UI can also ask for clarifications and help when needed.
145
+ - 🛡️ **Action Guards**: Sensitive actions are only executed with explicit user approvals.
146
+ - 🧠 **Plan Learning and Retrieval**: Learn from previous runs to improve future task automation and save them in a plan gallery. Automatically or manually retrieve saved plans in future tasks.
147
+ - 🔀 **Parallel Task Execution**: You can run multiple tasks in parallel and session status indicators will let you know when Magentic-UI needs your input or has completed the task.
148
+
149
+ <div align="center">
150
+ <a href="https://www.youtube.com/watch?v=wOs-5SR8xOc" target="_blank">
151
+ <img src="https://img.youtube.com/vi/wOs-5SR8xOc/maxresdefault.jpg" alt="Watch the demo video" width="600"/>
152
+ </a>
153
+ <br>
154
+ ▶️ <em> Click to watch a video and learn more about Magentic-UI </em>
155
+ </div>
156
+
157
+
158
+ ### Autonomous Evaluation
159
+
160
+ To evaluate its autonomous capabilities, Magentic-UI has been tested against several benchmarks when running with o4-mini: [GAIA](https://huggingface.co/datasets/gaia-benchmark/GAIA) test set (42.52%), which assesses general AI assistants across reasoning, tool use, and web interaction tasks ; [AssistantBench](https://huggingface.co/AssistantBench) test set (27.60%), focusing on realistic, time-consuming web tasks; [WebVoyager](https://github.com/MinorJerry/WebVoyager) (82.2%), measuring end-to-end web navigation in real-world scenarios; and [WebGames](https://webgames.convergence.ai/) (45.5%), evaluating general-purpose web-browsing agents through interactive challenges.
161
+ To reproduce these experimental results, please see the following [instructions](experiments/eval/README.md).
162
+
163
+
164
+
165
+ If you're interested in reading more checkout our [technical report](https://www.microsoft.com/en-us/research/wp-content/uploads/2025/07/magentic-ui-report.pdf) and [blog post](https://www.microsoft.com/en-us/research/blog/magentic-ui-an-experimental-human-centered-web-agent/).
166
+
167
+
168
+ ## Installation
169
+ ### Pre-Requisites
170
+
171
+ **Note**: If you're using Windows, we highly recommend using [WSL2](https://docs.microsoft.com/en-us/windows/wsl/install) (Windows Subsystem for Linux).
172
+
173
+ 1. If running on **Windows** or **Mac** you should use [Docker Desktop](https://www.docker.com/products/docker-desktop/) or if inside WSL2 you can install Docker directly inside WSL [docker in WSL2 guide](https://gist.github.com/dehsilvadeveloper/c3bdf0f4cdcc5c177e2fe9be671820c7). If running on **Linux**, you should use [Docker Engine](https://docs.docker.com/engine/install/).
174
+
175
+ If using Docker Desktop, make sure it is set up to use WSL2:
176
+ - Go to Settings > Resources > WSL Integration
177
+ - Enable integration with your development distro. You can find more detailed instructions about this step [here](https://docs.microsoft.com/en-us/windows/wsl/tutorials/wsl-containers).
178
+
179
+
180
+
181
+ 2. During the Installation step, you will need to set up your `OPENAI_API_KEY`. To use other models, review the [Model Client Configuration](#model-client-configuration) section below.
182
+
183
+ 3. You need at least [Python 3.10](https://www.python.org/downloads/) installed.
184
+
185
+
186
+ If you are on Windows, we recommend to run Magentic-UI inside [WSL2](https://docs.microsoft.com/en-us/windows/wsl/install) (Windows Subsystem for Linux) for correct Docker and file path compatibility.
187
+
188
+
189
+
190
+ ### PyPI Installation
191
+
192
+ Magentic-UI is available on PyPI. We recommend using a virtual environment to avoid conflicts with other packages.
193
+
194
+ ```bash
195
+ python3 -m venv .venv
196
+ source .venv/bin/activate
197
+ pip install magentic-ui
198
+ ```
199
+
200
+ Alternatively, if you use [`uv`](https://docs.astral.sh/uv/getting-started/installation/) for dependency management, you can install Magentic-UI with:
201
+
202
+ ```bash
203
+ uv venv --python=3.12 .venv
204
+ . .venv/bin/activate
205
+ uv pip install magentic-ui
206
+ ```
207
+
208
+
209
+ ### Running Magentic-UI
210
+
211
+ To run Magentic-UI, make sure that Docker is running, then run the following command:
212
+
213
+ ```bash
214
+ magentic-ui --port 8081
215
+ ```
216
+
217
+ >**Note**: Running this command for the first time will pull two docker images required for the Magentic-UI agents. If you encounter problems, you can build them directly with the following command:
218
+ ```bash
219
+ cd docker
220
+ sh build-all.sh
221
+ ```
222
+
223
+ If you face issues with Docker, please refer to the [TROUBLESHOOTING.md](TROUBLESHOOTING.md) document.
224
+
225
+ Once the server is running, you can access the UI at <http://localhost:8081>.
226
+
227
+
228
+
229
+ ### Fara-7B
230
+
231
+ 1) First install magentic-ui with the fara extras:
232
+
233
+ ```bash
234
+ python3 -m venv .venv
235
+ source .venv/bin/activate
236
+ pip install magentic-ui[fara]
237
+ ```
238
+
239
+ 2) In a separate process, serve the Fara-7B model using vLLM:
240
+
241
+ ```bash
242
+ vllm serve "microsoft/Fara-7B" --port 5000 --dtype auto
243
+ ```
244
+
245
+ 3) Create a `fara_config.yaml` file with the following content:
246
+
247
+ ```yaml
248
+ model_config_local_surfer: &client_surfer
249
+ provider: OpenAIChatCompletionClient
250
+ config:
251
+ model: "microsoft/Fara-7B"
252
+ base_url: http://localhost:5000/v1
253
+ api_key: not-needed
254
+ model_info:
255
+ vision: true
256
+ function_calling: true
257
+ json_output: false
258
+ family: "unknown"
259
+ structured_output: false
260
+ multiple_system_messages: false
261
+
262
+ orchestrator_client: *client_surfer
263
+ coder_client: *client_surfer
264
+ web_surfer_client: *client_surfer
265
+ file_surfer_client: *client_surfer
266
+ action_guard_client: *client_surfer
267
+ model_client: *client_surfer
268
+ ```
269
+ Note: if you are hosting vLLM on a different port or host, change the `base_url` accordingly.
270
+
271
+
272
+ Then launch Magentic-UI with the fara agent:
273
+
274
+ ```bash
275
+ magentic-ui --fara --port 8081 --config fara_config.yaml
276
+ ```
277
+
278
+ Finally, navigate to <http://localhost:8081> to access the interface!
279
+
280
+ ### Configuration
281
+
282
+ #### Model Client Configuration
283
+
284
+ If you want to use a different OpenAI key, or if you want to configure use with Azure OpenAI or Ollama, you can do so inside the UI by navigating to settings (top right icon) and changing model configuration. Another option is to pass a yaml config file when you start Magentic-UI which will override any settings in the UI:
285
+
286
+ ```bash
287
+ magentic-ui --port 8081 --config config.yaml
288
+ ```
289
+
290
+ Where the `config.yaml` should look as follows with an AutoGen model client configuration:
291
+
292
+ ```yaml
293
+ gpt4o_client: &gpt4o_client
294
+ provider: OpenAIChatCompletionClient
295
+ config:
296
+ model: gpt-4o-2024-08-06
297
+ api_key: null
298
+ base_url: null
299
+ max_retries: 5
300
+
301
+ orchestrator_client: *gpt4o_client
302
+ coder_client: *gpt4o_client
303
+ web_surfer_client: *gpt4o_client
304
+ file_surfer_client: *gpt4o_client
305
+ action_guard_client: *gpt4o_client
306
+ plan_learning_client: *gpt4o_client
307
+ ```
308
+ You can change the client for each of the agents using the config file and use AzureOpenAI (`AzureOpenAIChatCompletionClient`), Ollama and other clients.
309
+
310
+ #### MCP Server Configuration
311
+
312
+ You can also extend Magentic-UI's capabilities by adding custom "McpAgents" to the multi-agent team. Each McpAgent can have access to one or more MCP Servers. You can specify these agents via the `mcp_agent_configs` parameter in your `config.yaml`.
313
+
314
+ For example, here's an agent called "airbnb_surfer" that has access to the OpenBnb MCP Server running locally via Stdio.
315
+
316
+ ```yaml
317
+ mcp_agent_configs:
318
+ - name: airbnb_surfer
319
+ description: "The airbnb_surfer has direct access to AirBnB."
320
+ model_client:
321
+ provider: OpenAIChatCompletionClient
322
+ config:
323
+ model: gpt-4.1-2025-04-14
324
+ max_retries: 10
325
+ system_message: |-
326
+ You are AirBnb Surfer, a helpful digital assistant that can help users access AirBnB.
327
+
328
+ You have access to a suite of tools provided by the AirBnB API. Use those tools to satisfy the users' requests.
329
+ reflect_on_tool_use: false
330
+ mcp_servers:
331
+ - server_name: AirBnB
332
+ server_params:
333
+ type: StdioServerParams
334
+ command: npx
335
+ args:
336
+ - -y
337
+ - "@openbnb/mcp-server-airbnb"
338
+ - --ignore-robots-txt
339
+ ```
340
+
341
+ Under the hood, each `McpAgent` is just a `autogen_agentchat.agents.AssistantAgent` with the set of MCP Servers exposed as an `AggregateMcpWorkbench` which is simply a named collection of `autogen_ext.tools.mcp.McpWorkbench` objects (one per MCP Server).
342
+
343
+ Currently the supported MCP Server types are `autogen_ext.tools.mcp.StdioServerParams` and `autogen_ext.tools.mcp.SseServerParams`.
344
+
345
+ ### Building Magentic-UI from source
346
+
347
+ This step is primarily for users seeking to make modifications to the code, are having trouble with the pypi installation or want the latest code before a pypi version release.
348
+
349
+ #### 1. Make sure the above prerequisites are installed, and that Docker is running.
350
+
351
+ #### 2. Clone the repository to your local machine:
352
+
353
+ ```bash
354
+ git clone https://github.com/microsoft/magentic-ui.git
355
+ cd magentic-ui
356
+ ```
357
+
358
+ #### 3. Install Magentic-UI's dependencies with uv or your favorite package manager:
359
+
360
+ ```bash
361
+ # install uv through https://docs.astral.sh/uv/getting-started/installation/
362
+ uv venv --python=3.12 .venv
363
+ uv sync --all-extras
364
+ source .venv/bin/activate
365
+ ```
366
+
367
+ #### 4. Build the frontend:
368
+
369
+ First make sure to install node:
370
+
371
+ ```bash
372
+ # install nvm to install node
373
+ curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.1/install.sh | bash
374
+ nvm install node
375
+ ```
376
+
377
+ Then install the frontend:
378
+
379
+ ```bash
380
+ cd frontend
381
+ npm install -g gatsby-cli
382
+ npm install --global yarn
383
+ yarn install
384
+ yarn build
385
+ ```
386
+
387
+ #### 5. Run Magentic-UI, as usual.
388
+
389
+ ```bash
390
+ magentic-ui --port 8081
391
+ ```
392
+
393
+
394
+ #### Running the UI from source
395
+
396
+ If you are making changes to the source code of the UI, you can run the frontend in development mode so that it will automatically update when you make changes for faster development.
397
+
398
+ 1. Open a separate terminal and change directory to the frontend
399
+
400
+ ```bash
401
+ cd frontend
402
+ ```
403
+
404
+ 2. Create a `.env.development` file.
405
+
406
+ ```bash
407
+ cp .env.default .env.development
408
+ ```
409
+
410
+ 3. Launch frontend server
411
+
412
+ ```bash
413
+ npm run start
414
+ ```
415
+
416
+ 4. Then run the UI:
417
+
418
+ ```bash
419
+ magentic-ui --port 8081
420
+ ```
421
+
422
+ The frontend from source will be available at <http://localhost:8000>, and the compiled frontend will be available at <http://localhost:8081>.
423
+
424
+
425
+
426
+
427
+ ## Troubleshooting
428
+
429
+
430
+ If you were unable to get Magentic-UI running, do not worry! The first step is to make sure you have followed the steps outlined above, particularly with the [pre-requisites](#pre-requisites).
431
+
432
+ For common issues and their solutions, please refer to the [TROUBLESHOOTING.md](TROUBLESHOOTING.md) file in this repository. If you do not see your problem there, please open a `GitHub Issue`.
433
+
434
+ ## Contributing
435
+
436
+ This project welcomes contributions and suggestions. For information about contributing to Magentic-UI, please see our [CONTRIBUTING.md](CONTRIBUTING.md) guide, which includes current issues to be resolved and other forms of contributing.
437
+
438
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information, see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
439
+
440
+
441
+ ## Citation
442
+
443
+ Please cite our paper if you use our work in your research:
444
+
445
+ ```
446
+ @article{mozannar2025magentic,
447
+ title={Magentic-UI: Towards Human-in-the-loop Agentic Systems},
448
+ author={Mozannar, Hussein and Bansal, Gagan and Tan, Cheng and Fourney, Adam and Dibia, Victor and Chen, Jingya and Gerrits, Jack and Payne, Tyler and Maldaner, Matheus Kunzler and Grunde-McLaughlin, Madeleine and others},
449
+ journal={arXiv preprint arXiv:2507.22358},
450
+ year={2025}
451
+ }
452
+ ```
453
+
454
+ ## License
455
+
456
+ Microsoft, and any contributors, grant you a license to any code in the repository under the [MIT License](https://opensource.org/licenses/MIT). See the [LICENSE](LICENSE) file.
457
+
458
+ Microsoft, Windows, Microsoft Azure, and/or other Microsoft products and services referenced in the documentation
459
+ may be either trademarks or registered trademarks of Microsoft in the United States and/or other countries.
460
+ The licenses for this project do not grant you rights to use any Microsoft names, logos, or trademarks.
461
+ Microsoft's general trademark guidelines can be found at <http://go.microsoft.com/fwlink/?LinkID=254653>.
462
+
463
+ Any use of third-party trademarks or logos are subject to those third-party's policies.
464
+
465
+ Privacy information can be found at <https://go.microsoft.com/fwlink/?LinkId=521839>
466
+
467
+ Microsoft and any contributors reserve all other rights, whether under their respective copyrights, patents, or trademarks, whether by implication, estoppel, or otherwise.
468
+
469
+ # Dummy change
SECURITY.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
2
+
3
+ ## Security
4
+
5
+ Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
6
+
7
+ If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
8
+
9
+ ## Reporting Security Issues
10
+
11
+ **Please do not report security vulnerabilities through public GitHub issues.**
12
+
13
+ Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
14
+
15
+ If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
16
+
17
+ You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
18
+
19
+ Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20
+
21
+ * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22
+ * Full paths of source file(s) related to the manifestation of the issue
23
+ * The location of the affected source code (tag/branch/commit or direct URL)
24
+ * Any special configuration required to reproduce the issue
25
+ * Step-by-step instructions to reproduce the issue
26
+ * Proof-of-concept or exploit code (if possible)
27
+ * Impact of the issue, including how an attacker might exploit the issue
28
+
29
+ This information will help us triage your report more quickly.
30
+
31
+ If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
32
+
33
+ ## Preferred Languages
34
+
35
+ We prefer all communications to be in English.
36
+
37
+ ## Policy
38
+
39
+ Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
40
+
41
+ <!-- END MICROSOFT SECURITY.MD BLOCK -->
SUPPORT.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Support
2
+
3
+ ## How to file issues and get help
4
+
5
+ This project uses GitHub Issues to track bugs and feature requests. Please search the existing
6
+ issues before filing new issues to avoid duplicates. For new issues, file your bug or
7
+ feature request as a new Issue.
8
+
9
+ For help and questions about using this project, please post questions to GitHub issues, as per
10
+ above, and assign them the label "question".
11
+
12
+ ## Microsoft Support Policy
13
+
14
+ Support for Magentic-UI is limited to the resources listed above.
TRANSPARENCY_NOTE.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Magentic-UI
2
+
3
+ ## OVERVIEW
4
+
5
+ Magentic-UI is a human-centered computer use agent (CUA) **designed for collaboration with people** on web-based tasks. Magentic-UI operates a web browser and other tools, like code execution and file navigation, in real-time while optimizing for human-in-the-loop (HIL) orchestration.
6
+
7
+ ### What Can Magentic-UI Do?
8
+
9
+ Magentic-UI was developed to investigate human-in-the-loop approaches for agentic design with the goals to improve agentic performance and increase user productivity for web tasks. Magentic-UI strongly involves the user throughout the planning and execution phase. Magentic-UI prompts the user to accept a plan before starting execution. Plans can be modified, saved and re-used.
10
+
11
+ ### Intended Uses
12
+
13
+ Magentic-UI is a research prototype best suited to explore, experience and investigate agentic assistance in performing tasks that require web navigation. Magentic-UI should always be used with human supervision.
14
+
15
+ Below are examples of tasks that Magentic-UI can accomplish:
16
+
17
+ - Check the price of a coffee from the closest coffee shops to a certain destination
18
+ - Create a formatted spreadsheet containing the box score statistics of all NBA games that occurred on a certain date
19
+ - Summarize in a report a set of papers each downloaded from a given URL, such as the latest papers from arxiv on a certain topic
20
+
21
+ Magentic-UI is being shared with the research community to foster further research on human-in-the-loop in agentic systems.
22
+
23
+ Magentic-UI is intended to be used by domain experts who are independently capable of evaluating the quality of outputs, safety issues and potential harm before acting on them.
+
+ ### Out-of-scope uses
24
+
25
+ We do not recommend using Magentic-UI in commercial or real-world applications without further testing and development. It is being released for research purposes.
26
+
27
+ Magentic-UI is not well suited for tasks that: rely on audio or video data to process, long-duration tasks (e.g., summarize 100 papers) or tasks that require real-time fast actions like playing online games.
28
+
29
+ Magentic-UI should always be used with a human-in-the-loop. While we support an autonomous version of Magentic-UI in our code for the purposes of evaluation, this version is not included in the interface and should only be used for evaluation purposes and nothing else. We discourage the use of the autonomous version as it does not possess the same safety safeguards as the human-in-the-loop version through the interface and has not undergone the same safety testing.
30
+
31
+ Magentic-UI was not designed or evaluated for all possible downstream purposes. Developers should consider its inherent limitations as they select use cases, and evaluate and mitigate for accuracy, safety, and fairness concerns specific to each intended downstream use.
32
+
33
+ Magentic-UI should not be used in highly regulated domains or high stakes situations where inaccurate outputs could suggest actions that lead to injury or negatively impact an individual's health, legal, and financial, life opportunities or legal status.
34
+
35
+ We do not recommend using Magentic-UI in the context of high-risk decision making (e.g. in law enforcement, legal, finance, or healthcare).
36
+
37
+ ## HOW TO GET STARTED
38
+
39
+ To begin using Magentic-UI, follow instructions at [microsoft/magentic-ui: Magentic-UI](https://github.com/microsoft/magentic-ui)
40
+
41
+ ## EVALUATION
42
+
43
+ Magentic-UI was evaluated on its ability to autonomously solve complex tasks from benchmarks such as GAIA. Magentic-UI autonomously tries to complete these tasks and its final answer is judged with respect to the ground truth answer. To evaluate a human-in-the-loop set-up we also evaluated Magentic-UI with a simulated user with an interactive version of the GAIA benchmark.
44
+
45
+ ### Evaluation Methods
46
+
47
+ We compared the performance of Magentic-UI against [Magentic-One](https://github.com/microsoft/autogen/tree/gaia_multiagent_v01_march_1st/samples/tools/autogenbench/scenarios/GAIA/Templates/Orchestrator) on the [GAIA](https://arxiv.org/abs/2311.12983) benchmark. When running autonomously Magentic-UI shows comparable performance to Magentic-One (which previously achieved sota results on GAIA) and higher accuracy with simulated human-in-the-loop.
48
+
49
+ The model used for evaluation was GPT-4o from Azure OpenAI. Results may vary if Magentic-UI is used with a different model, or when using other models for evaluation, based on their unique design, configuration and training.
50
+
51
+ In addition to robust quality performance testing, Magentic-UI was assessed from a Responsible AI perspective. Based on these results, we implemented mitigations to minimize Magentic-UI's susceptibility to misuse. See details in risks and mitigation section below.
52
+
53
+ ### Evaluation Results
54
+
55
+ At a high level, we found that Magentic-UI performed similarly to [Magentic-One](https://github.com/microsoft/autogen/tree/gaia_multiagent_v01_march_1st/samples/tools/autogenbench/scenarios/GAIA/Templates/Orchestrator) on autonomous task completion and better with simulated human-in-the-loop.
56
+
57
+ ## LIMITATIONS
58
+
59
+ Magentic-UI was developed for research and experimental purposes. Further testing and validation are needed before considering its application in commercial or real-world scenarios.
60
+
61
+ Magentic-UI was designed and tested using the English language. Performance in other languages may vary and should be assessed by someone who is both an expert in the expected outputs and a native speaker of that language.
62
+
63
+ Outputs generated by AI may include factual errors, fabrication, or speculation. Users are responsible for assessing the accuracy of generated content. All decisions leveraging outputs of the system should be made with human oversight and not be based solely on system outputs.
64
+
65
+ Magentic-UI inherits any biases, errors, or omissions produced by the model used. Developers are advised to choose an appropriate base LLM/MLLM carefully, depending on the intended use case.
66
+
67
+ There has not been a systematic effort to ensure that systems using Magentic-UI are protected from security vulnerabilities such as indirect prompt injection attacks. Any systems using it should take proactive measures to harden their systems as appropriate.
68
+
69
+ ## BEST PRACTICES
70
+
71
+ Magentic-UI is a highly capable agent, proficient at interacting with websites, operating over local files, and writing or executing Python code, but like all LLM-based systems, it can and will make mistakes. To safely operate Magentic-UI, always run it within the provided Docker containers, and strictly limit its access to only essential resources — avoid sharing unnecessary files, folders, or logging into websites through the agent. Never share sensitive data you wouldn't confidently send to external providers like Azure or OpenAI. Magentic-UI shares browser screenshots with model providers including all data users choose to enter on websites in Magentic-UI's browser. Ensure careful human oversight by meticulously reviewing proposed actions and monitoring progress before giving approval. Finally, approach its output with appropriate skepticism; Magentic-UI can hallucinate, misattribute sources, or be misled by deceptive or low-quality online content.
72
+
73
+ We strongly encourage users to use LLMs/MLLMs that support robust Responsible AI mitigations, such as Azure Open AI (AOAI) services. Such services continually update their safety and RAI mitigations with the latest industry standards for responsible use. For more on AOAI's best practices when employing foundation models for scripts and applications:
74
+
75
+ - [Blog post on responsible AI features in AOAI that were presented at Ignite 2023](https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/announcing-new-ai-safety-amp-responsible-ai-features-in-azure/ba-p/3983686)
76
+ - [Overview of Responsible AI practices for Azure OpenAI models](https://learn.microsoft.com/en-us/legal/cognitive-services/openai/overview)
77
+ - [Azure OpenAI Transparency Note](<https://learn.microsoft.com/en-us/legal/cognitive-services/openai/transparency-note>)
78
+ - [OpenAI's Usage policies](https://openai.com/policies/usage-policies)
79
+ - [Azure OpenAI's Code of Conduct](https://learn.microsoft.com/en-us/legal/cognitive-services/openai/code-of-conduct)
80
+
81
+ Users are reminded to be mindful of data privacy concerns and are encouraged to review the privacy policies associated with any models and data storage solutions interfacing with Magentic-UI.
82
+
83
+ It is the user's responsibility to ensure that the use of Magentic-UI complies with relevant data protection regulations and organizational guidelines.
84
+
85
+ For benchmarking purposes Magentic-UI has an autonomous mode that deactivates human-in-the-loop components such as co-planning and co-execution. This mode is not accessible through the UI; we strongly encourage limiting its usage to benchmark scenarios.
86
+
87
+ ## RISKS AND MITIGATIONS
88
+
89
+ Human agency and oversight are foundational to Magentic-UI's design. From the ground up, Magentic-UI was created with a human-in-the-loop (HIL) philosophy that places the user in control of agent behavior. Every action Magentic-UI takes -- whether navigating the web, manipulating data, or executing code -- is preceded by a transparent planning phase where the proposed steps are surfaced for review. Plans are only executed with explicit user approval, and users retain the ability to pause, modify, or interrupt the agent at any time. When Magentic-UI encounters a scenario it deems high-impact or non-reversible, such as navigating to a new domain or initiating a potentially risky action, it proactively requests confirmation before proceeding. The user can also configure Magentic-UI to always ask for permission before performing any action. This approach reinforces user autonomy while minimizing unintended or unsafe behavior.
90
+
91
+ One of the key safety features in Magentic-UI is the ability to set a set of allowed websites. The allowed websites represent the set of websites that Magentic-UI can visit without explicit user approval. If Magentic-UI needs to visit a website outside the allowed list, it will ask the user for explicit approval by mentioning the exact URL, the page title and the reason for visiting the website.
92
+
93
+ To address safety and security concerns, Magentic-UI underwent targeted red-teaming to assess its behavior under adversarial and failure scenarios. Such scenarios include cross-site prompt injection attacks where web pages contain malicious instructions distinct from the user's original intents (e.g., to execute risky code, access sensitive files, or perform actions on other websites). It also contains scenarios comparable to phishing, which try to trick Magentic-UI into entering sensitive information, or granting permissions on impostor sites (e.g., a synthetic website that asks Magentic-UI to log in and enter Google credentials to read an article). In our preliminary evaluations, we found that Magentic-UI either refuses to complete the requests, stops to ask the user, or, as a final safety measure, is eventually unable to complete the request due to Docker sandboxing. We have found that this layered approach is effective for thwarting these attacks.
94
+
95
+ Magentic-UI was architected with strong isolation boundaries: every component is sandboxed in separate Docker containers, allowing fine-grained access control to only necessary resources. This effectively shields the host environment from agent activities. Sensitive data such as chat history, user settings, and execution logs are stored locally to preserve user privacy and minimize exposure.
96
+
97
+ Together, these mitigations are intended to reduce misuse risks, promote transparency, and preserve user control at every step. Magentic-UI is not a system that operates behind the scenes; it is a collaborator designed to act *with* the user, not *for* them.
98
+
99
+ ## LICENSE
100
+
101
+ ```
102
+ Magentic-UI is published under MIT License.
103
+ Copyright (c) Microsoft Corporation.
104
+
105
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
106
+ this software and associated documentation files (the "Software"), to deal in
107
+ the Software without restriction, including without limitation the rights to
108
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
109
+ of the Software, and to permit persons to whom the Software is furnished to do
110
+ so, subject to the following conditions:
111
+
112
+ The above copyright notice and this permission notice shall be included in all
113
+ copies or substantial portions of the Software.
114
+
115
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
116
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
117
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
118
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
119
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
120
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
121
+ THE SOFTWARE.
122
+ ```
123
+
124
+ ## CONTACT
125
+
126
+ We welcome feedback and collaboration from our audience. If you have suggestions, questions, or observe unexpected/offensive behavior in our technology, please contact us at [magui@service.microsoft.com](mailto:magui@service.microsoft.com)
127
+
128
+ If the team receives reports of undesired behavior or identifies issues independently, we will update this repository with appropriate mitigations.
TROUBLESHOOTING.md ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ⚠️ TROUBLESHOOTING
2
+
3
+ This document lists common issues users have encountered with Magentic-UI and how to resolve them.
4
+
5
+
6
+ ## 1. 🐳 Docker Not Detected / 🚫 Podman Not Supported
7
+
8
+ **Error:**
9
+ `Checking if Docker is running...Failed`
10
+ `Docker is not running. Please start Docker and try again.`
11
+
12
+ **Solution:**
13
+ - Magentic-UI requires Docker Desktop (Windows/Mac) or Docker Engine (Linux).
14
+ - Podman and other container engines are **not supported**.
15
+ - Make sure Docker is installed and running.
16
+
17
+ One possible fix for Mac/Ubuntu (especially if using Colima) is by setting environment variable for DOCKER_HOST ([see issue 81](https://github.com/microsoft/magentic-ui/issues/81), thank you to serproqnx) to point to docker.sock:
18
+
19
+ ```bash
20
+ export DOCKER_HOST=unix:///home/<your-username>/.docker/desktop/docker.sock
21
+ ```
22
+ **Note**: you might have to adjust the path to point to the correct location of docker.sock, as suggested by KangEn1997 in [issue 137](https://github.com/microsoft/magentic-ui/issues/137).
23
+
24
+ This can resolve issues where the Docker SDK cannot automatically detect the Docker socket location.
25
+
26
+ Another possible fix on Ubuntu if docker is not running is to make sure your user is in the 'docker' group or run with sudo.
27
+
28
+ Please read [Linux post-installation steps for Docker Engine
29
+ ](https://docs.docker.com/engine/install/linux-postinstall/) for more information.
30
+
31
+ ## 2. 🚪 Port 8081 Fails to Start
32
+
33
+ **Error:**
34
+ `Port 8081 failed to start` or `Address already in use`
35
+
36
+ **Solution:**
37
+ - Make sure port 8081 is not being used by another application.
38
+ - You can change the port with `magentic ui --port <another_port>`.
39
+
40
+ ## 3. 🏗️ Docker Image Pull Fails
41
+
42
+ **Error:**
43
+ `Pulling docker image...Failed` or similar
44
+
45
+ **Solution:**
46
+ - Make sure you have a stable internet connection.
47
+ - Update Docker to the latest version.
48
+ - Check that you have enough disk space.
49
+ - Try building the images manually:
50
+ ```bash
51
+ cd docker
52
+ sh build-all.sh
53
+ ```
54
+
55
+ ## 4. 🪟 WSL2 Not Set Up on Windows
56
+
57
+ **Error:**
58
+ `Docker is not running` or `WSL2 required`
59
+
60
+ **Solution:**
61
+ - Follow the [For Windows Users](#for-windows-users) section in the README.
62
+ - Ensure Docker Desktop is configured to use WSL2.
63
+ - Go to Settings > Resources > WSL Integration
64
+ - Enable integration with your WSL distro
65
+
66
+
67
+ ## 5. 🖥️ Browser Cannot Be Operated
68
+
69
+ **Symptoms:**
70
+ - UI loads, but browser window is blank or unresponsive.
71
+
72
+ **Solution:**
73
+ - Make sure Docker containers are running (`docker ps`).
74
+ - Check firewall settings and ensure required ports are open.
75
+ - Try restarting Docker and Magentic-UI.
76
+
77
+ ## 6. 🏔️ Alpine Linux Compatibility
78
+
79
+ **Issue:**
80
+ - Magentic-UI is not tested on Alpine Linux. Use Ubuntu or Debian for best results.
81
+
82
+ ## 7. 🌐 Running on Remote Servers
83
+
84
+ **Issue:**
85
+ - UI is not accessible remotely, or browser does not work.
86
+
87
+ **Solution:**
88
+ - Make sure ports are open and forwarded correctly.
89
+ - Check firewall and security group settings.
90
+
91
+ ## 8. 🟪 Magentic Command Not Found
92
+
93
+ **Issue:**
94
+ - Command not found: Magentic
95
+ ```bash
96
+ magentic ui --port 8081
97
+ zsh: command not found: magentic
98
+ ```
99
+
100
+ **Solution 1**:
101
+
102
+ - Make sure you have activated your virtual environment.
103
+ - You can double check by reactivating it and then running the command again:
104
+
105
+ ```bash
106
+ deactivate
107
+ source .venv/bin/activate
108
+ magentic ui --port 8081
109
+ ```
110
+
111
+ **Solution 2**:
112
+
113
+ - You may have accidentally installed the package named `magentic` instead of ours `magentic-ui`
114
+ - Make sure you are running the following command:
115
+
116
+ ```bash
117
+ pip install magentic-ui
118
+ ```
119
+
120
+
121
+
122
+ ## 9. ❓ Still Having Issues?
123
+
124
+ - Double-check all [pre-requisites](#pre-requisites-please-read) in the README.
125
+ - Search [GitHub Issues](https://github.com/microsoft/magentic-ui/issues) for similar problems.
126
+ - Open a new issue and include:
127
+ 1. A detailed description of your problem
128
+ 2. Information about your system (OS, Docker version, etc.)
129
+ 3. Steps to replicate the issue (if possible)
130
+
131
+ ---
132
+
133
+ If you have suggestions for this document or find a solution not listed, please submit a pull request! 🙏
docs/img/magenticui.jpg ADDED
docs/img/magenticui_running.png ADDED

Git LFS Details

  • SHA256: 36317426835c71bfa6e6f3d955dd1cf287d52fa3ef9cc6dccb8f94ede271bd2e
  • Pointer size: 131 Bytes
  • Size of remote file: 583 kB
docs/img/magui-actionguard.png ADDED

Git LFS Details

  • SHA256: 6d5a125e6ae8fcf10daf9e03b3ebe496241812d07e6f6d873f51ed62270fe5e5
  • Pointer size: 131 Bytes
  • Size of remote file: 448 kB
docs/img/magui-coplanning.png ADDED

Git LFS Details

  • SHA256: 36654c585575deccf0bedfdec9495bc5cde4ff3e3d1c4f0020051e74ea4742b4
  • Pointer size: 131 Bytes
  • Size of remote file: 169 kB
docs/img/magui-cotasking.png ADDED

Git LFS Details

  • SHA256: a5db7e727ccb7fb01b2494a7e14d79ff80ad2abf79b28b2f6a856be90fe9d5e4
  • Pointer size: 132 Bytes
  • Size of remote file: 1.01 MB
docs/img/magui-landing.png ADDED

Git LFS Details

  • SHA256: fa83d0a19420c2b918cf6663857b9add1c4db9bcce72eaacc0adbe700478b8e5
  • Pointer size: 131 Bytes
  • Size of remote file: 151 kB
docs/img/magui-readme-logo.png ADDED

Git LFS Details

  • SHA256: 0c97b6028d0437dcd4c8fadd335cf892b813fadcd7cbe8bacfa1239822f429fd
  • Pointer size: 131 Bytes
  • Size of remote file: 309 kB
docs/img/magui-readme-logo.svg ADDED
docs/index.html ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Magentic-UI</title>
7
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/github-markdown-css/5.4.0/github-markdown-light.min.css">
8
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/github.min.css">
9
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/marked/9.1.6/marked.min.js"></script>
10
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
11
+ <style>
12
+ body {
13
+ margin: 0;
14
+ padding: 0;
15
+ background-color: #ffffff;
16
+ }
17
+ .markdown-body {
18
+ box-sizing: border-box;
19
+ min-width: 200px;
20
+ max-width: 980px;
21
+ margin: 0 auto;
22
+ padding: 45px;
23
+ }
24
+ @media (max-width: 767px) {
25
+ .markdown-body {
26
+ padding: 15px;
27
+ }
28
+ }
29
+ .loading {
30
+ text-align: center;
31
+ padding: 50px;
32
+ color: #666;
33
+ }
34
+ .error {
35
+ text-align: center;
36
+ padding: 50px;
37
+ color: #d73a49;
38
+ }
39
+ /* GitHub-style header */
40
+ .header {
41
+ background-color: #24292f;
42
+ color: white;
43
+ padding: 16px 0;
44
+ margin-bottom: 32px;
45
+ }
46
+ .header-content {
47
+ max-width: 980px;
48
+ margin: 0 auto;
49
+ padding: 0 45px;
50
+ display: flex;
51
+ align-items: center;
52
+ gap: 16px;
53
+ }
54
+ .header h1 {
55
+ margin: 0;
56
+ font-size: 20px;
57
+ font-weight: 600;
58
+ }
59
+ .header a {
60
+ color: #7d8590;
61
+ text-decoration: none;
62
+ }
63
+ .header a:hover {
64
+ color: white;
65
+ }
66
+ @media (max-width: 767px) {
67
+ .header-content {
68
+ padding: 0 15px;
69
+ }
70
+ }
71
+ </style>
72
+ </head>
73
+ <body>
74
+ <div class="header">
75
+ <div class="header-content">
76
+ <h1>Magentic-UI</h1>
77
+ <span>•</span>
78
+ <a href="https://github.com/microsoft/magentic-ui">View on GitHub</a>
79
+ <span>•</span>
80
+ <a href="https://github.com/microsoft/magentic-ui/releases">Releases</a>
81
+ </div>
82
+ </div>
83
+
84
+ <article class="markdown-body">
85
+ <div id="loading" class="loading">Loading README...</div>
86
+ <div id="content" style="display: none;"></div>
87
+ <div id="error" class="error" style="display: none;">
88
+ <h2>Error loading README</h2>
89
+ <p>Please visit the <a href="https://github.com/microsoft/magentic-ui">GitHub repository</a> to view the latest content.</p>
90
+ </div>
91
+ </article>
92
+
93
+ <script>
94
+ // Configure marked options
95
+ marked.setOptions({
96
+ highlight: function(code, lang) {
97
+ if (lang && hljs.getLanguage(lang)) {
98
+ return hljs.highlight(code, { language: lang }).value;
99
+ } else {
100
+ return hljs.highlightAuto(code).value;
101
+ }
102
+ },
103
+ breaks: true,
104
+ gfm: true
105
+ });
106
+
107
+ // Fetch and display README content
108
+ fetch('https://raw.githubusercontent.com/microsoft/magentic-ui/main/README.md')
109
+ .then(response => {
110
+ if (!response.ok) {
111
+ throw new Error('Failed to fetch README');
112
+ }
113
+ return response.text();
114
+ })
115
+ .then(markdown => {
116
+ // Fix relative image paths to point to GitHub
117
+ let fixedMarkdown = markdown
118
+ // Fix Markdown image syntax: ![alt](path)
119
+ .replace(/!\[([^\]]*)\]\((?!https?:\/\/)([^)]+)\)/g, '![$1](https://raw.githubusercontent.com/microsoft/magentic-ui/main/$2)')
120
+ // Fix HTML img tags: <img src="path">
121
+ .replace(/<img([^>]+)src="(?!https?:\/\/)([^"]+)"/g, '<img$1src="https://raw.githubusercontent.com/microsoft/magentic-ui/main/$2"')
122
+ // Convert GitHub video URLs to embedded video players
123
+ .replace(/https:\/\/github\.com\/user-attachments\/assets\/([a-f0-9-]+)/g,
124
+ '<video controls width="100%" style="max-width: 800px;"><source src="https://github.com/user-attachments/assets/$1" type="video/mp4">Your browser does not support the video tag.</video>');
125
+
126
+ const html = marked.parse(fixedMarkdown);
127
+ document.getElementById('loading').style.display = 'none';
128
+ document.getElementById('content').style.display = 'block';
129
+ document.getElementById('content').innerHTML = html;
130
+
131
+ // Initialize syntax highlighting
132
+ hljs.highlightAll();
133
+ })
134
+ .catch(error => {
135
+ console.error('Error:', error);
136
+ document.getElementById('loading').style.display = 'none';
137
+ document.getElementById('error').style.display = 'block';
138
+ });
139
+ </script>
140
+ </body>
141
+ </html>
docs/tutorials/web_agent_tutorial_full.ipynb ADDED
@@ -0,0 +1,1782 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Tutorial: Building a Browser Use Agent From Scratch and with Magentic-UI\n"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "\n",
15
+ "You might have seen cool video demos online of AI agents taking control of a computer or a browser to perform tasks. This is a new category of agents referred to as Computer-Use-Agents (CUA) or Browser-Use-Agents (BUA). Examples of such CUA/BUA agents include [OpenAI's Operator](https://openai.com/index/introducing-operator/), [Claude Computer Use Model](https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool), [AutoGen's MultiModalWebSurfer](https://microsoft.github.io/autogen/stable/reference/python/autogen_ext.agents.web_surfer.html), [Adept AI](https://www.adept.ai/blog/act-1), [Google's Project Mariner](https://deepmind.google/models/project-mariner/) and [Browser-Use](https://github.com/browser-use/browser-use/tree/main) among many others.\n",
16
+ "\n",
17
+ "\n",
18
+ "## What is a Computer Use Agent?\n",
19
+ "\n",
20
+ "**Definition**: A computer or browser use agent is an agent that given a task, e.g., \"order a shawarma sandwich from BestShawarma for pickup now\", can programmatically control a computer or browser to autonomously complete the task. By \"control a browser\" we mean interacting with the browser in a similar way to how a human might control the browser: clicking on buttons, typing in fields, scrolling and so on. Note that a tool-use language model agent could complete this food ordering task if it had access to the restaurant API for instance, this would not make it a CUA agent as it is not _interacting_ with the browser to complete the task.\n",
21
+ "\n",
22
+ "To make this distinction more clear, here is another example task.\n",
23
+ "Suppose we wanted to find the list of available Airbnbs in Miami from 6/18 to 6/20 for 2 guests.\n",
24
+ "\n",
25
+ "![airbnb_sc.png](airbnb_sc.png)\n",
26
+ "\n",
27
+ "How would a browser use agent solve this task:\n",
28
+ "\n",
29
+ "- **Step 1:** Visit airbnb.com\n",
30
+ "- **Step 2:** Type \"Miami\" in the \"Where\" input box\n",
31
+ "- **Step 3:** Select \"6/18\" in the \"Check in\" date box\n",
32
+ "- **Step 4:** Select \"6/20\" in the \"Check out\" date box\n",
33
+ "- **Step 5:** Click on the \"Who\" button\n",
34
+ "- **Step 6:** Click \"+\" twice to add two guests\n",
35
+ "- **Step 7:** Click \"Search\" button\n",
36
+ "- **Step 8:** Summarize and extract listings from the webpage\n",
37
+ "\n",
38
+ "On the other hand, suppose we had an API for Airbnb that looks like: `find_listings(location, check_in, check_out, guests)`\n",
39
+ "\n",
40
+ "Then a tool-call agent would first need to generate a tool call: `find_listings(\"Miami\", 6/18, 6/20, 2)` and read out the result of the tool call.\n",
41
+ "\n",
42
+ "Clearly if we had an API for every website and everything on the computer, then it would be much simpler to perform this task. _But that is not the case currently_, many interfaces on the web cannot be accessed by an API and so the only way is through interacting with the website directly. While future interfaces might become more directly accessible to agents via APIs and MCP servers, for now we need to perform direct manipulation with the websites.\n",
43
+ "\n",
44
+ "## What Does This Tutorial Cover?\n",
45
+ "\n",
46
+ "In this tutorial, we will cover how to build a basic browser-use agent. The goal of this tutorial is to demystify such agents and show how we can build a simple version of them. The only thing we need is access to a large language model (LLM) that can perform tool calling or structured JSON outputs (GPT-4o, Qwen2.5-VL, Llama 3.1, ...). The LLM does not need to be vision capable, but a model capable of taking image input would improve performance significantly. The LLM also does not need to be trained previously for browser-use, out of the box LLMs can be turned into semi-capable browser-use agents following the recipe in this tutorial. At the end of the tutorial we will discuss further directions.\n",
47
+ "\n",
48
+ "We will cover three levels of building your browser use agent:\n",
49
+ "\n",
50
+ "- Level 1: From scratch using only the `playwright` python package.\n",
51
+ "- Level 2: Using helpers from the `magentic-ui` package which simplifies building your agent.\n",
52
+ "- Level 3: Using the WebSurfer Agent from the `magentic-ui` package directly.\n"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "markdown",
57
+ "metadata": {},
58
+ "source": [
59
+ "# Tutorial Prerequisites\n",
60
+ "\n"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "markdown",
65
+ "metadata": {},
66
+ "source": [
67
+ "\n",
68
+ "You will need Python >3.10 to run this tutorial and the `magentic-ui` package. [Magentic-UI](https://github.com/microsoft/magentic-ui/tree/main) is a research prototype from Microsoft of a human-centered agentic interface. In this tutorial we will be using utilities and helpers from that package without using the Magentic-UI application itself.\n",
69
+ "\n",
70
+ "We recommend using a virtual environment to avoid conflicts with other packages.\n",
71
+ "\n",
72
+ "```bash\n",
73
+ "python3 -m venv .venv\n",
74
+ "source .venv/bin/activate\n",
75
+ "pip install magentic-ui\n",
76
+ "```\n",
77
+ "\n",
78
+ "Alternatively, if you use [`uv`](https://docs.astral.sh/uv/getting-started/installation/) for dependency management, you can install Magentic-UI with:\n",
79
+ "\n",
80
+ "```bash\n",
81
+ "uv venv --python=3.12 .venv\n",
82
+ ". .venv/bin/activate\n",
83
+ "uv pip install magentic-ui\n",
84
+ "```\n",
85
+ "\n",
86
+ "We also need to install the browsers that our agent will control with playwright:\n",
87
+ "\n",
88
+ "```bash\n",
89
+ "playwright install --with-deps chromium\n",
90
+ "```\n",
91
+ "\n",
92
+ "The other thing you need to set up is your LLM. The easiest way to follow this tutorial is to obtain an OpenAI API key and set it as an environment variable:\n",
93
+ "\n",
94
+ "```bash\n",
95
+ "export OPENAI_API_KEY=<YOUR API KEY>\n",
96
+ "```\n",
97
+ "\n",
98
+ "You can also use any open source model with [Ollama](https://ollama.com/) if you have a capable GPU at your disposal. We will be covering both using OpenAI and Ollama."
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "markdown",
103
+ "metadata": {},
104
+ "source": [
105
+ "# Level 1: Building a Browser Use Agent From Scratch\n",
106
+ "\n",
107
+ "For this level of building our browser use agent, we will only need the `playwright` and `openai` packages which are included in `magentic-ui` package.\n"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "markdown",
112
+ "metadata": {},
113
+ "source": [
114
+ "## Step 1: Launching a Browser\n",
115
+ "\n",
116
+ "The first step is to launch the browser that our agent will control. We will be using the [Playwright](https://github.com/microsoft/playwright-python) library that provides an API to control browsers.\n",
117
+ "\n",
118
+ "We can launch the browser in headless mode (we cannot see the actual browser on our machine) or non-headless where the browser will be launched locally.\n"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": null,
124
+ "metadata": {},
125
+ "outputs": [],
126
+ "source": [
127
+ "from playwright.async_api import async_playwright\n",
128
+ "\n",
129
+ "headless = False # Change to True to run the browser in headless mode\n",
130
+ "\n",
131
+ "# Launch and keep browser running\n",
132
+ "p = await async_playwright().start()\n",
133
+ "browser = await p.chromium.launch(headless=headless)\n",
134
+ "# context is the browser window\n",
135
+ "context = await browser.new_context()\n",
136
+ "# page is the tab in the browser\n",
137
+ "page = await context.new_page()\n",
138
+ "print(\"Browser launched!\")"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "markdown",
143
+ "metadata": {},
144
+ "source": [
145
+ "At this point you should see a browser launched locally, it will be pointing at a blank page:\n",
146
+ "\n",
147
+ "![blank_page.png](blank_page.png)\n"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "markdown",
152
+ "metadata": {},
153
+ "source": [
154
+ "We can use the Playwright API to interact with this browser; for instance, let us navigate to the Bing homepage (give it a few seconds).\n"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": null,
160
+ "metadata": {},
161
+ "outputs": [],
162
+ "source": [
163
+ "await page.goto(\"https://www.bing.com\")\n",
164
+ "print(\"Navigated to Bing homepage\")"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "markdown",
169
+ "metadata": {},
170
+ "source": [
171
+ "## Step 2: Represent the browser for the Agent using Set-of-Mark Prompting.\n",
172
+ "\n",
173
+ "Our next challenge is how do we feed the browser as input to our agent so that it is able to perform actions on it.\n",
174
+ "\n",
175
+ "Using Playwright we can first take a screenshot of the browser as well as extract the text on the page.\n"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": null,
181
+ "metadata": {},
182
+ "outputs": [],
183
+ "source": [
184
+ "from IPython.display import Image, display\n",
185
+ "\n",
186
+ "# Take a screenshot and store it in memory\n",
187
+ "screenshot_bytes = await page.screenshot()\n",
188
+ "\n",
189
+ "# Display the screenshot\n",
190
+ "display(Image(screenshot_bytes))\n",
191
+ "\n",
192
+ "# Get all the text on the page and print first 10 lines\n",
193
+ "text = await page.evaluate(\"() => document.body.innerText\")\n",
194
+ "print(\"\\nFirst 10 lines of text content:\")\n",
195
+ "print(\"\\n\".join(text.split(\"\\n\")[:10]))"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "markdown",
200
+ "metadata": {},
201
+ "source": [
202
+ "Now how do we get our agent to type in the search box and press search?\n",
203
+ "\n",
204
+ "The key is to extract all **interactive elements** in the page using Playwright. By interactive elements we mean elements on the page we can interact with including buttons, text boxes, dropdown menus among others. Each interactive element will have an ID that we can track on the page and if it is a visible element it will have the coordinates of the bounding box of the element. We will also only look at the interactive elements that are currently visible in the current viewport; some elements might be out of view and we'd need to scroll down to view them. For simplicity, we will ignore these elements and give our agent the ability to scroll down to view them later on.\n"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": null,
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": [
213
+ "from dataclasses import dataclass\n",
214
+ "from playwright.async_api import Page\n",
215
+ "\n",
216
+ "\n",
217
+ "# A class to represent an interactive element on the page\n",
218
+ "@dataclass\n",
219
+ "class Element:\n",
220
+ " id: int # The id of the element\n",
221
+ " aria_label: (\n",
222
+ " str # The aria-label attribute is used to provide a label for an element\n",
223
+ " )\n",
224
+ " type: str # The type of the element\n",
225
+ " bbox: dict # The bounding box of the element\n",
226
+ " text: str # The text content of the element\n",
227
+ "\n",
228
+ "\n",
229
+ "# We will now go over the page and extract all interactive elements\n",
230
+ "# We will also add a data attribute to the element with the element ID for later reference\n",
231
+ "async def get_interactive_elements(page: Page) -> list[Element]:\n",
232
+ " elements: list[Element] = []\n",
233
+ " # Viewport size is a dict with keys 'width' and 'height'\n",
234
+ " viewport_size = page.viewport_size\n",
235
+ " print(f\"Viewport size: {viewport_size}\")\n",
236
+ "\n",
237
+ " # For simplicity, we will only look at buttons, textboxes, and links. We can add more roles later on.\n",
238
+ " interactive_roles = [\"button\", \"textbox\", \"link\"]\n",
239
+ " i = 0\n",
240
+ " for role in interactive_roles:\n",
241
+ " print(f\"Getting {role} elements...\")\n",
242
+ " # We will use the Playwright API to get all elements with the given role\n",
243
+ " elements_with_role = await page.get_by_role(role).all()\n",
244
+ " for element in elements_with_role:\n",
245
+ " # Check if element is visible and in current viewport\n",
246
+ " bbox = await element.bounding_box()\n",
247
+ " if bbox: # Element is visible if it has a bounding box\n",
248
+ " # Check if element is in current viewport (not scrolled out of view)\n",
249
+ " if 0 <= bbox[\"y\"] <= viewport_size[\"height\"]:\n",
250
+ " # Set a data attribute with the element ID for later reference\n",
251
+ " await element.evaluate(f\"el => el.setAttribute('data-element-id', '{i}')\")\n",
252
+ " elements.append(\n",
253
+ " Element(\n",
254
+ " id=i,\n",
255
+ " aria_label=await element.get_attribute(\"aria-label\")\n",
256
+ " or await element.get_attribute(\"aria-role\")\n",
257
+ " or \"\",\n",
258
+ " type=role,\n",
259
+ " bbox=bbox,\n",
260
+ " text=await element.text_content() or \"\",\n",
261
+ " )\n",
262
+ " )\n",
263
+ " i += 1\n",
264
+ " print(f\"Found {len(elements)} visible interactive elements in current viewport:\")\n",
265
+ " return elements\n",
266
+ "\n",
267
+ "\n",
268
+ "elements = await get_interactive_elements(page)\n",
269
+ "formatted_list_of_elements = \"\\n\".join(\n",
270
+ " [f\"Element {i}: {element}\" for i, element in enumerate(elements)]\n",
271
+ ")\n",
272
+ "print(formatted_list_of_elements)"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "markdown",
277
+ "metadata": {},
278
+ "source": [
279
+ "The first question is how do we identify the search box element on the Bing page given these elements?\n",
280
+ "We can try to figure this out by reading the list of elements; we can see that it is likely to be Element 19:\n",
281
+ "\n",
282
+ "Element(id=19, aria_label='0 characters out of 2000', type='textbox', bbox={'x': 193, 'y': 158, 'width': 843, 'height': 22}, text='')\n",
283
+ "\n",
284
+ "As this is the only textbox or searchbox element on the page.\n"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": null,
290
+ "metadata": {},
291
+ "outputs": [],
292
+ "source": [
293
+ "# find the search box\n",
294
+ "search_box_id = None\n",
295
+ "for element in elements:\n",
296
+ " if element.type == \"textbox\":\n",
297
+ " search_box_id = element.id\n",
298
+ " break\n",
299
+ "print(f\"Search box id: {search_box_id}\")"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "markdown",
304
+ "metadata": {},
305
+ "source": [
306
+ "However, we also have access to the page screenshot and the coordinates of each element. A neat idea would be to superimpose the bounding boxes on top of the screenshot to better understand what each element is. This technique is called Set-of-Mark Prompting (SoM) coined by Yang, Jianwei, et al. [1] to improve visual grounding.\n",
307
+ "\n",
308
+ "[1]: Yang, Jianwei, et al. \"Set-of-mark prompting unleashes extraordinary visual grounding in gpt-4v.\" arXiv preprint arXiv:2310.11441 (2023). https://arxiv.org/pdf/2310.11441\n",
309
+ "\n",
310
+ "We will now implement a simplified version of SoM prompting:\n"
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": null,
316
+ "metadata": {},
317
+ "outputs": [],
318
+ "source": [
319
+ "from PIL import Image, ImageDraw\n",
320
+ "import io\n",
321
+ "\n",
322
+ "\n",
323
+ "def get_som_screenshot(screenshot_bytes: bytes, elements: list[Element]) -> Image.Image:\n",
324
+ " screenshot = Image.open(io.BytesIO(screenshot_bytes))\n",
325
+ "\n",
326
+ " # Create a drawing object\n",
327
+ " draw = ImageDraw.Draw(screenshot)\n",
328
+ "\n",
329
+ " # Draw bounding boxes and element IDs for each element\n",
330
+ " for element in elements:\n",
331
+ " bbox = element.bbox\n",
332
+ " x = bbox[\"x\"]\n",
333
+ " y = bbox[\"y\"]\n",
334
+ " width = bbox[\"width\"]\n",
335
+ " height = bbox[\"height\"]\n",
336
+ "\n",
337
+ " # Draw rectangle\n",
338
+ " draw.rectangle([(x, y), (x + width, y + height)], outline=\"red\", width=2)\n",
339
+ "\n",
340
+ " # Draw element ID\n",
341
+ " draw.text((x, y - 15), f\"{element.id}\", fill=\"red\")\n",
342
+ "\n",
343
+ " # Display the annotated screenshot\n",
344
+ " display(screenshot)\n",
345
+ " som_screenshot = screenshot.copy()\n",
346
+ " return som_screenshot\n",
347
+ "\n",
348
+ "\n",
349
+ "screenshot_bytes = await page.screenshot()\n",
350
+ "som_screenshot = get_som_screenshot(screenshot_bytes, elements)"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "markdown",
355
+ "metadata": {},
356
+ "source": [
357
+ "This confirms what we previously found that Element with id=19 is in fact the searchbox!\n",
358
+ "\n",
359
+ "Let us now wrap what we just did in a helper function to prepare the page to be used as input to our agent:\n"
360
+ ]
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": 99,
365
+ "metadata": {},
366
+ "outputs": [],
367
+ "source": [
368
+ "async def prepare_page_for_agent(page: Page) -> tuple[str, str, Image.Image]:\n",
369
+ " \"\"\"\n",
370
+ " Prepare the page for the agent.\n",
371
+ " Returns:\n",
372
+ " tuple[str, str, Image.Image]: The page text, the formatted list of elements, and the screenshot with bounding boxes.\n",
373
+ " \"\"\"\n",
374
+ " page_text = await page.evaluate(\"() => document.body.innerText\")\n",
375
+ " elements = await get_interactive_elements(page)\n",
376
+ " screenshot_bytes = await page.screenshot()\n",
377
+ " som_screenshot = get_som_screenshot(screenshot_bytes, elements)\n",
378
+ "\n",
379
+ " formatted_list_of_elements = \"\\n\".join(\n",
380
+ " [f\"Element {i}: {element}\" for i, element in enumerate(elements)]\n",
381
+ " )\n",
382
+ "\n",
383
+ " return page_text, formatted_list_of_elements, som_screenshot\n",
384
+ "\n",
385
+ "\n",
386
+ "# page_text, formatted_list_of_elements, screenshot = await prepare_page_for_agent(page)"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "markdown",
391
+ "metadata": {},
392
+ "source": [
393
+ "## Step 3: Define Agent Action Space\n",
394
+ "\n",
395
+ "Now that we have established how to represent the browser state for our Agent, it's time to define our Agent architecture. This section will cover the action space and execution flow that enables our agent to perform tasks using the browser.\n"
396
+ ]
397
+ },
398
+ {
399
+ "cell_type": "markdown",
400
+ "metadata": {},
401
+ "source": [
402
+ "\n",
403
+ "### Action Space Definition\n",
404
+ "\n",
405
+ "Our web agent operates with a carefully designed set of actions that cover the fundamental browser interactions needed for most web automation tasks:\n",
406
+ "\n",
407
+ "- **`goto(url)`**: Navigate to a specific URL\n",
408
+ "- **`click(id)`**: Click on an element identified by its ID\n",
409
+ "- **`type(id, text)`**: Input text into a form field or text element by ID\n",
410
+ "- **`scroll(direction)`**: Scroll the page vertically (up/down)\n",
411
+ "- **`stop_action(final_answer)`**: Complete the task and return the final result\n",
412
+ "\n",
413
+ "_Note: This is a simplified action set designed for our initial prototype. It can be extended with additional actions like hover, select, wait, etc., as needed._\n",
414
+ "\n",
415
+ "### Agent Architecture Flow\n",
416
+ "\n",
417
+ "The following diagram illustrates how our web agent processes user queries and executes actions:\n",
418
+ "\n",
419
+ "```mermaid\n",
420
+ "flowchart TD\n",
421
+ " A[\"Input: User Query\"] --> B[\"Initialize Agent\"]\n",
422
+ " B --> C[\"Capture Current Page State\"]\n",
423
+ " C --> D[\"Analyze Page & Query\"]\n",
424
+ " D --> E{Action Decision}\n",
425
+ " E -->|goto| F[\"Navigate to URL\"]\n",
426
+ " E -->|click| G[\"Click Element by ID\"]\n",
427
+ " E -->|type| H[\"Type Text in Element\"]\n",
428
+ " E -->|scroll| I[\"Scroll Page\"]\n",
429
+ " E -->|stop_action| J[\"Return Final Answer\"]\n",
430
+ " F --> K[\"Execute Action\"]\n",
431
+ " G --> K\n",
432
+ " H --> K\n",
433
+ " I --> K\n",
434
+ " K --> C\n",
435
+ " J --> L[\"Output: Final Answer\"]\n",
436
+ "\n",
437
+ " style A fill:#e1f5fe\n",
438
+ " style L fill:#e8f5e8\n",
439
+ " style E fill:#fff3e0\n",
440
+ " style J fill:#ffebee\n",
441
+ "```\n",
442
+ "\n",
443
+ "### Execution Flow Details\n",
444
+ "\n",
445
+ "1. **Input Processing**: The agent receives a user query describing the desired task\n",
446
+ "2. **State Capture**: Current browser page state is captured and processed\n",
447
+ "3. **Action Selection**: Based on the analysis, one of the five actions is chosen\n",
448
+ "4. **Execution**: The selected action is executed in the browser. We append the feedback of the action into the chat history.\n",
449
+ "5. **Loop Continuation**: The process repeats until `stop_action` is triggered\n",
450
+ "\n",
451
+ "The agent continues this loop until it determines the task is complete, at which point it executes `stop_action` with the final answer.\n"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "markdown",
456
+ "metadata": {},
457
+ "source": [
458
+ "Our first step is to create the prompt template for the model to decide on the correct action. Instead of using tool calling to decide on the action, we will use JSON outputs for simplicity.\n"
459
+ ]
460
+ },
461
+ {
462
+ "cell_type": "code",
463
+ "execution_count": 100,
464
+ "metadata": {},
465
+ "outputs": [],
466
+ "source": [
467
+ "AGENT_PROMPT = \"\"\"\n",
468
+ "You are a helpful assistant that can navigate a web page and perform actions on it.\n",
469
+ "\n",
470
+ "The task we are trying to complete is:\n",
471
+ "{task}\n",
472
+ "\n",
473
+ "The current visible text on the page is:\n",
474
+ "{page_text}\n",
475
+ "\n",
476
+ "The current visible elements on the page are:\n",
477
+ "{formatted_list_of_elements}\n",
478
+ "\n",
479
+ "You will need to decide on the next action to take.\n",
480
+ "\n",
481
+ "The action space is:\n",
482
+ "- goto(url): navigate to a URL\n",
483
+ "- click(id): click a button given it's ID\n",
484
+ "- type(id, text): type \"text\" into element \"id\"\n",
485
+ "- scroll(direction): scroll the page in direction up or down.\n",
486
+ "- stop_action(final_answer): declare that we have finished the task and prepare a final_answer to return to the user.\n",
487
+ "\n",
488
+ "Output a JSON object with the following fields:\n",
489
+ "{{\n",
490
+ " \"action\": \"goto\" | \"click\" | \"type\" | \"scroll\" | \"stop_action\",\n",
491
+ " \"action_args\": {{\n",
492
+ " \"url\": \"https://www.google.com\",\n",
493
+ " \"id\": \"123\",\n",
494
+ " \"text\": \"Hello\",\n",
495
+ " \"direction\": \"up\"\n",
496
+ " }}\n",
497
+ "}}\n",
498
+ "\n",
499
+ "Only output the JSON object, no other text or comments.\n",
500
+ "\"\"\""
501
+ ]
502
+ },
503
+ {
504
+ "cell_type": "markdown",
505
+ "metadata": {},
506
+ "source": [
507
+ "Let's now try this prompt with our LLM:\n"
508
+ ]
509
+ },
510
+ {
511
+ "cell_type": "code",
512
+ "execution_count": null,
513
+ "metadata": {},
514
+ "outputs": [],
515
+ "source": [
516
+ "from openai import OpenAI\n",
517
+ "import json\n",
518
+ "import base64\n",
519
+ "from PIL import Image\n",
520
+ "import os\n",
521
+ "\n",
522
+ "# Prepare the page for the agent\n",
523
+ "page_text, formatted_list_of_elements, som_screenshot = await prepare_page_for_agent(\n",
524
+ " page\n",
525
+ ")\n",
526
+ "task = \"Search for Magentic-UI\"\n",
527
+ "# Now make the API call\n",
528
+ "client = OpenAI(\n",
529
+ " api_key=os.getenv(\"OPENAI_API_KEY\")\n",
530
+ ") # you can use any other LLM client here\n",
531
+ "image_data_url = f\"data:image/png;base64,{base64.b64encode((lambda b: (som_screenshot.save(b, format='PNG'), b.getvalue())[1])(io.BytesIO())).decode()}\"\n",
532
+ "\n",
533
+ "\n",
534
+ "def get_llm_response(\n",
535
+ " client: OpenAI, # OpenAI client\n",
536
+ " task: str, # Task to complete\n",
537
+ " page_text: str, # Page text\n",
538
+ " formatted_list_of_elements: str, # Formatted list of elements\n",
539
+ " image_data_url: str, # Image data URL\n",
540
+ " message_history: list[dict] = [], # Message history\n",
541
+ " model: str = \"gpt-4o\", # Model to use\n",
542
+ ") -> dict:\n",
543
+ " response = client.chat.completions.create(\n",
544
+ " model=model,\n",
545
+ " messages=[\n",
546
+ " *message_history,\n",
547
+ " {\n",
548
+ " \"role\": \"user\",\n",
549
+ " \"content\": [\n",
550
+ " {\n",
551
+ " \"type\": \"text\",\n",
552
+ " \"text\": AGENT_PROMPT.format(\n",
553
+ " task=task,\n",
554
+ " page_text=page_text,\n",
555
+ " formatted_list_of_elements=formatted_list_of_elements,\n",
556
+ " ),\n",
557
+ " },\n",
558
+ " {\n",
559
+ " \"type\": \"image_url\",\n",
560
+ " \"image_url\": {\"url\": image_data_url},\n",
561
+ " },\n",
562
+ " ],\n",
563
+ " },\n",
564
+ " ],\n",
565
+ " )\n",
566
+ "\n",
567
+ " # Parse the response\n",
568
+ " try:\n",
569
+ " action_decision = json.loads(response.choices[0].message.content)\n",
570
+ " print(\"Model's decision:\", json.dumps(action_decision, indent=2))\n",
571
+ " except json.JSONDecodeError:\n",
572
+ " # it starts with ```json\n",
573
+ " response_content = response.choices[0].message.content\n",
574
+ " response_content = response_content.replace(\"```json\", \"\").replace(\"```\", \"\")\n",
575
+ " action_decision = json.loads(response_content)\n",
576
+ " print(\"Model's decision:\", json.dumps(action_decision, indent=2))\n",
577
+ " except Exception as e:\n",
578
+ " raise e\n",
579
+ " return action_decision\n",
580
+ "\n",
581
+ "\n",
582
+ "action_decision = get_llm_response(\n",
583
+ " client, task, page_text, formatted_list_of_elements, image_data_url\n",
584
+ ")\n",
585
+ "print(action_decision)"
586
+ ]
587
+ },
588
+ {
589
+ "cell_type": "markdown",
590
+ "metadata": {},
591
+ "source": [
592
+ "We can see that the model made the right decision given the task of \"Search for Magentic-UI\", the action is to type in the search box for \"Magentic-UI\"\n",
593
+ "\n",
594
+ "The last remaining piece before we put it all together is to now execute the action using Playwright."
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "markdown",
599
+ "metadata": {},
600
+ "source": [
601
+ "## Step 4: Executing the actions with Playwright\n",
602
+ "\n",
603
+ "For each of the actions we have previously defined, we will now write code using Playwright to execute them."
604
+ ]
605
+ },
606
+ {
607
+ "cell_type": "code",
608
+ "execution_count": 103,
609
+ "metadata": {},
610
+ "outputs": [],
611
+ "source": [
612
+ "# This is mostly basic Playwright code, but we will use it to execute the actions.\n",
613
+ "async def execute_action(action: str, action_args: dict, page: Page) -> str:\n",
614
+ " \"\"\"\n",
615
+ " Execute an action on the page.\n",
616
+ " \"\"\"\n",
617
+ " if action == \"goto\":\n",
618
+ " await page.goto(action_args[\"url\"])\n",
619
+ " return f\"I navigated to {action_args['url']}\"\n",
620
+ " elif action == \"click\":\n",
621
+ " # Get the element using the data attribute\n",
622
+ " await page.wait_for_selector(f\"[data-element-id='{action_args['id']}']\")\n",
623
+ " element = page.locator(f\"[data-element-id='{action_args['id']}']\")\n",
624
+ " if element:\n",
625
+ " await element.click()\n",
626
+ " else:\n",
627
+ " raise ValueError(f\"Element with ID {action_args['id']} not found\")\n",
628
+ " return f\"I clicked on {action_args['id']}\"\n",
629
+ " elif action == \"type\":\n",
630
+ " await page.wait_for_selector(f\"[data-element-id='{action_args['id']}']\")\n",
631
+ " element = page.locator(f\"[data-element-id='{action_args['id']}']\")\n",
632
+ " if element:\n",
633
+ " await element.fill(action_args[\"text\"])\n",
634
+ " else:\n",
635
+ " raise ValueError(f\"Element with ID {action_args['id']} not found\")\n",
636
+ " return f\"I typed {action_args['text']} into {action_args['id']}\"\n",
637
+ " elif action == \"scroll\":\n",
638
+ " await page.scroll(action_args[\"direction\"])\n",
639
+ " return f\"I scrolled {action_args['direction']}\"\n",
640
+ " elif action == \"stop_action\":\n",
641
+ " return action_args[\"final_answer\"]\n",
642
+ " else:\n",
643
+ " raise ValueError(f\"Invalid action: {action}\")"
644
+ ]
645
+ },
646
+ {
647
+ "cell_type": "code",
648
+ "execution_count": null,
649
+ "metadata": {},
650
+ "outputs": [],
651
+ "source": [
652
+ "await execute_action(action_decision[\"action\"], action_decision[\"action_args\"], page)"
653
+ ]
654
+ },
655
+ {
656
+ "cell_type": "code",
657
+ "execution_count": null,
658
+ "metadata": {},
659
+ "outputs": [],
660
+ "source": [
661
+ "# Take a screenshot of the page\n",
662
+ "screenshot = await page.screenshot()\n",
663
+ "display(Image.open(io.BytesIO(screenshot)))"
664
+ ]
665
+ },
666
+ {
667
+ "cell_type": "markdown",
668
+ "metadata": {},
669
+ "source": [
670
+ "Success! We can see that our agent was properly able to type \"Magentic-UI\" into the searchbox!\n",
671
+ "\n",
672
+ "The final step is to put it all together into our Agent!"
673
+ ]
674
+ },
675
+ {
676
+ "cell_type": "code",
677
+ "execution_count": null,
678
+ "metadata": {},
679
+ "outputs": [],
680
+ "source": [
681
+ "await browser.close()"
682
+ ]
683
+ },
684
+ {
685
+ "cell_type": "markdown",
686
+ "metadata": {},
687
+ "source": [
688
+ "## Step 5: Putting it all together into our Agent"
689
+ ]
690
+ },
691
+ {
692
+ "cell_type": "code",
693
+ "execution_count": 12,
694
+ "metadata": {},
695
+ "outputs": [],
696
+ "source": [
697
+ "from openai import OpenAI\n",
698
+ "from playwright.async_api import Page\n",
699
+ "from playwright.async_api import async_playwright\n",
700
+ "from PIL import Image, ImageDraw\n",
701
+ "import io\n",
702
+ "import base64\n",
703
+ "import json\n",
704
+ "from dataclasses import dataclass\n",
705
+ "from IPython.display import display\n",
706
+ "\n",
707
+ "@dataclass\n",
708
+ "class Element:\n",
709
+ " id: int # The id of the element\n",
710
+ " aria_label: (\n",
711
+ " str # The aria-label attribute is used to provide a label for an element\n",
712
+ " )\n",
713
+ " type: str # The type of the element\n",
714
+ " bbox: dict # The bounding box of the element\n",
715
+ " text: str # The text content of the element\n",
716
+ "\n",
717
+ "\n",
718
+ "AGENT_PROMPT = \"\"\"\n",
719
+ "You are a helpful assistant that can navigate a web page and perform actions on it.\n",
720
+ "\n",
721
+ "The task we are trying to complete is:\n",
722
+ "{task}\n",
723
+ "\n",
724
+ "The current visible text on the page is:\n",
725
+ "{page_text}\n",
726
+ "\n",
727
+ "The current visible elements on the page are:\n",
728
+ "{formatted_list_of_elements}\n",
729
+ "\n",
730
+ "You will need to decide on the next action to take.\n",
731
+ "\n",
732
+ "The action space is:\n",
733
+ "- goto(url): navigate to a URL\n",
734
+ "- click(id): click a button given it's ID\n",
735
+ "- type(id, text): type \"text\" into element \"id\"\n",
736
+ "- scroll(direction): scroll the page in direction up or down.\n",
737
+ "- stop_action(final_answer): declare that we have finished the task and prepare a final_answer to return to the user.\n",
738
+ "\n",
739
+ "Output a JSON object with the following fields:\n",
740
+ "{{\n",
741
+ " \"action\": \"goto\" | \"click\" | \"type\" | \"scroll\" | \"stop_action\",\n",
742
+ " \"action_args\": {{\n",
743
+ " \"url\": \"https://www.google.com\",\n",
744
+ " \"id\": \"123\",\n",
745
+ " \"text\": \"Hello\",\n",
746
+ " \"direction\": \"up\"\n",
747
+ " }}\n",
748
+ "}}\n",
749
+ "\n",
750
+ "Only output the JSON object, no other text or comments.\n",
751
+ "\"\"\"\n",
752
+ "\n",
753
+ "\n",
754
+ "class BrowserUseAgent:\n",
755
+ " def __init__(\n",
756
+ " self,\n",
757
+ " client: OpenAI,\n",
758
+ " model: str = \"gpt-4o\",\n",
759
+ " headless: bool = False,\n",
760
+ " run_in_jupyter: bool = True,\n",
761
+ " ):\n",
762
+ " self.client = client\n",
763
+ " self.model = model\n",
764
+ " self.headless = headless\n",
765
+ " self.message_history: list[dict] = []\n",
766
+ " self.page: Page = None\n",
767
+ " self.run_in_jupyter = run_in_jupyter\n",
768
+ "\n",
769
+ " async def _launch_browser(self) -> None:\n",
770
+ " p = await async_playwright().start()\n",
771
+ " self.browser = await p.chromium.launch(headless=self.headless)\n",
772
+ " # context is the browser window\n",
773
+ " self.context = await self.browser.new_context()\n",
774
+ " # page is the tab in the browser\n",
775
+ " self.page = await self.context.new_page()\n",
776
+ "\n",
777
+ " async def execute_task(self, task: str) -> str:\n",
778
+ " \"\"\"\n",
779
+ " This is NEW! This is the main function that will be called to execute the task and implement our agent loop.\n",
780
+ " \"\"\"\n",
781
+ " # Step 1: Launch the browser if it's not already launched\n",
782
+ " if self.page is None:\n",
783
+ " await self._launch_browser()\n",
784
+ " # Our stop condition is when the LLM decides to output stop_action\n",
785
+ " should_stop = False\n",
786
+ " final_answer = None\n",
787
+ " i = 0\n",
788
+ " while not should_stop:\n",
789
+ " # Step 2: Prepare the page for the agent\n",
790
+ " (\n",
791
+ " page_text,\n",
792
+ " formatted_list_of_elements,\n",
793
+ " som_screenshot,\n",
794
+ " ) = await self._prepare_page_for_agent(self.page)\n",
795
+ " # Step 3: Get the LLM response\n",
796
+ " image_data_url = f\"data:image/png;base64,{base64.b64encode((lambda b: (som_screenshot.save(b, format='PNG'), b.getvalue())[1])(io.BytesIO())).decode()}\"\n",
797
+ " action_decision = self._get_llm_response(\n",
798
+ " self.client,\n",
799
+ " task,\n",
800
+ " page_text,\n",
801
+ " formatted_list_of_elements,\n",
802
+ " image_data_url,\n",
803
+ " self.message_history,\n",
804
+ " self.model,\n",
805
+ " )\n",
806
+ " print(f\"Action decision {i}: {action_decision}\")\n",
807
+ " # Add the action decision to the message history\n",
808
+ " self.message_history.append(\n",
809
+ " {\n",
810
+ " \"role\": \"user\",\n",
811
+ " \"content\": [{\"type\": \"text\", \"text\": json.dumps(action_decision)}],\n",
812
+ " }\n",
813
+ " )\n",
814
+ " # Step 4: Execute the action with some error handling\n",
815
+ " try:\n",
816
+ " action_feedback = await self._execute_action(\n",
817
+ " action_decision[\"action\"], action_decision[\"action_args\"], self.page\n",
818
+ " )\n",
819
+ " except Exception as e:\n",
820
+ " print(f\"Error executing action {i}: {e}\")\n",
821
+ " action_feedback = f\"Error executing action {i}: {e}\"\n",
822
+ " print(f\"Action feedback {i}: {action_feedback}\")\n",
823
+ " # Sleep for 3 seconds to let the page load\n",
824
+ " await self.page.wait_for_timeout(3000)\n",
825
+ " # Update the message history with feedback on the action and the new page screenshot\n",
826
+ " new_page_screenshot = await self.page.screenshot()\n",
827
+ " self.message_history.append(\n",
828
+ " {\n",
829
+ " \"role\": \"user\",\n",
830
+ " \"content\": [\n",
831
+ " {\"type\": \"text\", \"text\": action_feedback},\n",
832
+ " {\n",
833
+ " \"type\": \"image_url\",\n",
834
+ " \"image_url\": {\n",
835
+ " \"url\": f\"data:image/png;base64,{base64.b64encode(new_page_screenshot).decode()}\"\n",
836
+ " },\n",
837
+ " },\n",
838
+ " ],\n",
839
+ " }\n",
840
+ " )\n",
841
+ " if self.run_in_jupyter:\n",
842
+ " display(Image.open(io.BytesIO(new_page_screenshot)))\n",
843
+ " # Check if the task is complete\n",
844
+ " should_stop = action_decision[\"action\"] == \"stop_action\"\n",
845
+ " if should_stop:\n",
846
+ " final_answer = action_decision[\"action_args\"][\"final_answer\"]\n",
847
+ " i += 1\n",
848
+ " return final_answer\n",
849
+ "\n",
850
+ " async def _execute_action(self, action: str, action_args: dict, page: Page) -> str:\n",
851
+ " \"\"\"\n",
852
+ " Execute an action on the page.\n",
853
+ " \"\"\"\n",
854
+ " if action == \"goto\":\n",
855
+ " await page.goto(action_args[\"url\"])\n",
856
+ " return f\"I navigated to {action_args['url']}\"\n",
857
+ " elif action == \"click\":\n",
858
+ " # Get the element using the data attribute\n",
859
+ " await page.wait_for_selector(f\"[data-element-id='{action_args['id']}']\")\n",
860
+ " element = page.locator(f\"[data-element-id='{action_args['id']}']\")\n",
861
+ " if element:\n",
862
+ " await element.click()\n",
863
+ " else:\n",
864
+ " raise ValueError(f\"Element with ID {action_args['id']} not found\")\n",
865
+ " return f\"I clicked on {action_args['id']}\"\n",
866
+ " elif action == \"type\":\n",
867
+ " await page.wait_for_selector(f\"[data-element-id='{action_args['id']}']\")\n",
868
+ " element = page.locator(f\"[data-element-id='{action_args['id']}']\")\n",
869
+ " if element:\n",
870
+ " await element.fill(action_args[\"text\"])\n",
871
+ " # Press enter\n",
872
+ " await element.press(\"Enter\")\n",
873
+ " else:\n",
874
+ " raise ValueError(f\"Element with ID {action_args['id']} not found\")\n",
875
+ " return f\"I typed {action_args['text']} into {action_args['id']}\"\n",
876
+ " elif action == \"scroll\":\n",
877
+ " await page.scroll(action_args[\"direction\"])\n",
878
+ " return f\"I scrolled {action_args['direction']}\"\n",
879
+ " elif action == \"stop_action\":\n",
880
+ " return action_args[\"final_answer\"]\n",
881
+ " else:\n",
882
+ " raise ValueError(f\"Invalid action: {action}\")\n",
883
+ "\n",
884
+ " def _get_llm_response(\n",
885
+ " self,\n",
886
+ " client: OpenAI, # OpenAI client\n",
887
+ " task: str, # Task to complete\n",
888
+ " page_text: str, # Page text\n",
889
+ " formatted_list_of_elements: str, # Formatted list of elements\n",
890
+ " image_data_url: str, # Image data URL\n",
891
+ " message_history: list[dict] = [], # Message history\n",
892
+ " model: str = \"gpt-4o\", # Model to use\n",
893
+ " ) -> dict:\n",
894
+ " response = client.chat.completions.create(\n",
895
+ " model=model,\n",
896
+ " messages=[\n",
897
+ " *message_history,\n",
898
+ " {\n",
899
+ " \"role\": \"user\",\n",
900
+ " \"content\": [\n",
901
+ " {\n",
902
+ " \"type\": \"text\",\n",
903
+ " \"text\": AGENT_PROMPT.format(\n",
904
+ " task=task,\n",
905
+ " page_text=page_text,\n",
906
+ " formatted_list_of_elements=formatted_list_of_elements,\n",
907
+ " ),\n",
908
+ " },\n",
909
+ " {\n",
910
+ " \"type\": \"image_url\",\n",
911
+ " \"image_url\": {\"url\": image_data_url},\n",
912
+ " },\n",
913
+ " ],\n",
914
+ " },\n",
915
+ " ],\n",
916
+ " )\n",
917
+ "\n",
918
+ " # Parse the response\n",
919
+ " try:\n",
920
+ " action_decision = json.loads(response.choices[0].message.content)\n",
921
+ " except json.JSONDecodeError:\n",
922
+ " # it starts with ```json\n",
923
+ " response_content = response.choices[0].message.content\n",
924
+ " response_content = response_content.replace(\"```json\", \"\").replace(\n",
925
+ " \"```\", \"\"\n",
926
+ " )\n",
927
+ " action_decision = json.loads(response_content)\n",
928
+ " except Exception as e:\n",
929
+ " raise e\n",
930
+ " return action_decision\n",
931
+ "\n",
932
+ " async def _prepare_page_for_agent(self, page: Page) -> tuple[str, str, Image.Image]:\n",
933
+ " \"\"\"\n",
934
+ " Prepare the page for the agent.\n",
935
+ " Returns:\n",
936
+ " tuple[str, str, Image.Image]: The page text, the formatted list of elements, and the screenshot with bounding boxes.\n",
937
+ " \"\"\"\n",
938
+ " page_text = await page.evaluate(\"() => document.body.innerText\")\n",
939
+ " elements = await self._get_interactive_elements(page)\n",
940
+ " screenshot_bytes = await page.screenshot()\n",
941
+ " som_screenshot = self._get_som_screenshot(screenshot_bytes, elements)\n",
942
+ "\n",
943
+ " formatted_list_of_elements = \"\\n\".join(\n",
944
+ " [f\"Element {i}: {element}\" for i, element in enumerate(elements)]\n",
945
+ " )\n",
946
+ "\n",
947
+ " return page_text, formatted_list_of_elements, som_screenshot\n",
948
+ "\n",
949
+ " def _get_som_screenshot(\n",
950
+ " self, screenshot_bytes: bytes, elements: list[Element]\n",
951
+ " ) -> Image.Image:\n",
952
+ " screenshot = Image.open(io.BytesIO(screenshot_bytes))\n",
953
+ "\n",
954
+ " # Create a drawing object\n",
955
+ " draw = ImageDraw.Draw(screenshot)\n",
956
+ "\n",
957
+ " # Draw bounding boxes and element IDs for each element\n",
958
+ " for element in elements:\n",
959
+ " bbox = element.bbox\n",
960
+ " x = bbox[\"x\"]\n",
961
+ " y = bbox[\"y\"]\n",
962
+ " width = bbox[\"width\"]\n",
963
+ " height = bbox[\"height\"]\n",
964
+ "\n",
965
+ " # Draw rectangle\n",
966
+ " draw.rectangle([(x, y), (x + width, y + height)], outline=\"red\", width=2)\n",
967
+ "\n",
968
+ " # Draw element ID\n",
969
+ " draw.text((x, y - 15), f\"{element.id}\", fill=\"red\")\n",
970
+ "\n",
971
+ " som_screenshot = screenshot.copy()\n",
972
+ " return som_screenshot\n",
973
+ "\n",
974
+ " async def _get_interactive_elements(self, page: Page) -> list[Element]:\n",
975
+ " elements: list[Element] = []\n",
976
+ " # Viewport size is a dict with keys 'width' and 'height'\n",
977
+ " viewport_size = page.viewport_size\n",
978
+ "\n",
979
+ " # For simplicity, we will only look at buttons, textboxes, and links. We can add more roles later on.\n",
980
+ " interactive_roles = [\"button\", \"textbox\", \"link\"]\n",
981
+ " i = 0\n",
982
+ " for role in interactive_roles:\n",
983
+ " # We will use the Playwright API to get all elements with the given role\n",
984
+ " elements_with_role = await page.get_by_role(role).all()\n",
985
+ " for element in elements_with_role:\n",
986
+ " # Check if element is visible and in current viewport\n",
987
+ " bbox = await element.bounding_box()\n",
988
+ " if bbox: # Element is visible if it has a bounding box\n",
989
+ " # Check if element is in current viewport (not scrolled out of view)\n",
990
+ " if 0 <= bbox[\"y\"] <= viewport_size[\"height\"]:\n",
991
+ " # Set a data attribute with the element ID for later reference\n",
992
+ " await element.evaluate(\n",
993
+ " f\"el => el.setAttribute('data-element-id', '{i}')\"\n",
994
+ " )\n",
995
+ " elements.append(\n",
996
+ " Element(\n",
997
+ " id=i,\n",
998
+ " aria_label=await element.get_attribute(\"aria-label\")\n",
999
+ " or await element.get_attribute(\"aria-role\")\n",
1000
+ " or \"\",\n",
1001
+ " type=role,\n",
1002
+ " bbox=bbox,\n",
1003
+ " text=await element.text_content() or \"\",\n",
1004
+ " )\n",
1005
+ " )\n",
1006
+ " i += 1\n",
1007
+ " return elements\n",
1008
+ "\n",
1009
+ " async def close(self) -> None:\n",
1010
+ " if self.page is not None:\n",
1011
+ " await self.page.close()\n",
1012
+ " if self.context is not None:\n",
1013
+ " await self.context.close()\n",
1014
+ " if self.browser is not None:\n",
1015
+ " await self.browser.close()"
1016
+ ]
1017
+ },
1018
+ {
1019
+ "cell_type": "markdown",
1020
+ "metadata": {},
1021
+ "source": [
1022
+ "Now let's run the Agent on a sample task!"
1023
+ ]
1024
+ },
1025
+ {
1026
+ "cell_type": "code",
1027
+ "execution_count": null,
1028
+ "metadata": {},
1029
+ "outputs": [],
1030
+ "source": [
1031
+ "from openai import OpenAI\n",
1032
+ "import os\n",
1033
+ "openai_client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
1034
+ "agent = BrowserUseAgent(openai_client)\n",
1035
+ "try:\n",
1036
+ " final_answer = await agent.execute_task(\"find the open issues assigned to husseinmozannar on the microsoft/magentic-ui repo on github\")\n",
1037
+ " print(final_answer)\n",
1038
+ "finally:\n",
1039
+ " await agent.close()"
1040
+ ]
1041
+ },
1042
+ {
1043
+ "cell_type": "markdown",
1044
+ "metadata": {},
1045
+ "source": [
1046
+ "Success! Our agent was able to navigate to GitHub and filter the issues assigned to me. It ran into some issues but was able to debug and get to the right answer.\n",
1047
+ "\n",
1048
+ "To conclude, in this short tutorial, we showed how to build a browser use agent from scratch.\n",
1049
+ "\n",
1050
+ "The main ingredients were: set-of-marks prompting, playwright for browser automation and tool calling or structured JSON output ability of current LLMs. With these three ingredients we can build a semi-capable agent to navigate the web!"
1051
+ ]
1052
+ },
1053
+ {
1054
+ "cell_type": "markdown",
1055
+ "metadata": {},
1056
+ "source": [
1057
+ "# Level 2: Building a Browser Use Agent Using Magentic-UI\n",
1058
+ "\n",
1059
+ "While it was fun building the browser use agent from scratch, it was not easy. We had to figure out how to launch the browser, fiddle around with playwright to extract interactive elements, figure out how to execute actions on the page and so on.\n",
1060
+ "\n",
1061
+ "The `magentic-ui` library as we will see has many utilities that will make your life much easier when building browser use agents. We will now do the same steps as before but by using the helpers from the `magentic-ui` library.\n"
1062
+ ]
1063
+ },
1064
+ {
1065
+ "cell_type": "markdown",
1066
+ "metadata": {},
1067
+ "source": [
1068
+ "## Step 1: Launching a Browser"
1069
+ ]
1070
+ },
1071
+ {
1072
+ "cell_type": "markdown",
1073
+ "metadata": {},
1074
+ "source": [
1075
+ "Magentic-UI provides three different Playwright browser implementations, each designed for specific use cases:\n",
1076
+ "\n",
1077
+ "1. Local Playwright Browser (`LocalPlaywrightBrowser`)\n",
1078
+ "- **Purpose**: Runs Playwright directly on the local machine without Docker\n",
1079
+ "- **Use Case**: Development and testing environments where Docker isn't needed\n",
1080
+ "- **Features**: Lightweight, direct browser control, supports both headless and headed modes\n",
1081
+ "\n",
1082
+ "2. Headless Docker Playwright Browser (`HeadlessDockerPlaywrightBrowser`) \n",
1083
+ "- **Purpose**: Runs a headless Playwright browser inside a Docker container\n",
1084
+ "- **Use Case**: Production environments, CI/CD pipelines, server-side automation\n",
1085
+ "- **Features**: Isolated execution, reproducible environment, no GUI overhead and more secure.\n",
1086
+ "- **Docker Image**: Uses Microsoft's official Playwright Docker image (`mcr.microsoft.com/playwright:v1.51.1-noble`)\n",
1087
+ "\n",
1088
+ "3. VNC Docker Playwright Browser (`VncDockerPlaywrightBrowser`)\n",
1089
+ "- **Purpose**: Runs Playwright in Docker with VNC support for visual interaction; you can interact with the browser on localhost.\n",
1090
+ "- **Use Case**: Debugging, development, and scenarios requiring visual browser inspection\n",
1091
+ "- **Features**: Programmatic control + visual access via noVNC web interface\n",
1092
+ "- **Docker Image**: Uses custom `magentic-ui-vnc-browser` image with VNC server. You need to run `magentic-ui --rebuild-docker` command to build it.\n",
1093
+ "\n",
1094
+ "How to Launch Each Browser:\n",
1095
+ "\n",
1096
+ "```python\n",
1097
+ "from pathlib import Path\n",
1098
+ "from magentic_ui.tools.playwright import HeadlessDockerPlaywrightBrowser, VncDockerPlaywrightBrowser, LocalPlaywrightBrowser\n",
1099
+ "\n",
1100
+ "# Direct instantiation examples\n",
1101
+ "async def launch_browsers():\n",
1102
+ " # Headless Docker Browser\n",
1103
+ " headless_browser = HeadlessDockerPlaywrightBrowser(\n",
1104
+ " playwright_port=37367,\n",
1105
+ " inside_docker=False\n",
1106
+ " )\n",
1107
+ " \n",
1108
+ " # VNC Docker Browser \n",
1109
+ " vnc_browser = VncDockerPlaywrightBrowser(\n",
1110
+ " bind_dir=Path(\"./workspace\"),\n",
1111
+ " playwright_port=37367,\n",
1112
+ " novnc_port=6080,\n",
1113
+ " inside_docker=False\n",
1114
+ " )\n",
1115
+ " \n",
1116
+ " # Local Browser\n",
1117
+ " local_browser = LocalPlaywrightBrowser(headless=True)\n",
1118
+ " \n",
1119
+ "```\n",
1120
+ "\n"
1121
+ ]
1122
+ },
1123
+ {
1124
+ "cell_type": "markdown",
1125
+ "metadata": {},
1126
+ "source": [
1127
+ "For simplicity we will stick with the local playwright browser that we launched in Level 1:"
1128
+ ]
1129
+ },
1130
+ {
1131
+ "cell_type": "code",
1132
+ "execution_count": null,
1133
+ "metadata": {},
1134
+ "outputs": [],
1135
+ "source": [
1136
+ "from magentic_ui.tools.playwright import LocalPlaywrightBrowser\n",
1137
+ "browser = LocalPlaywrightBrowser(headless=False)\n",
1138
+ "# Start the browser\n",
1139
+ "await browser._start()\n",
1140
+ "# Get the browser context and start a new page\n",
1141
+ "context = browser.browser_context\n",
1142
+ "page = await context.new_page()\n"
1143
+ ]
1144
+ },
1145
+ {
1146
+ "cell_type": "markdown",
1147
+ "metadata": {},
1148
+ "source": [
1149
+ "You should now see a browser open to the blank page."
1150
+ ]
1151
+ },
1152
+ {
1153
+ "cell_type": "markdown",
1154
+ "metadata": {},
1155
+ "source": [
1156
+ "## Step 2: Represent the browser for the Agent using Set-Of-Marks Prompting."
1157
+ ]
1158
+ },
1159
+ {
1160
+ "cell_type": "markdown",
1161
+ "metadata": {},
1162
+ "source": [
1163
+ "To get the interactive elements on the page, we have done a lot of work for you in Magentic-UI to capture every possible interactive element type on the page, including elements in the shadow-DOM [(see this javascript file if interested for more info)](https://github.com/microsoft/magentic-ui/blob/main/src/magentic_ui/tools/playwright/page_script.js).\n",
1164
+ "\n",
1165
+ "These utilities are wrapped in a helper class called the [`PlaywrightController`](https://github.com/microsoft/magentic-ui/blob/main/src/magentic_ui/tools/playwright/playwright_controller.py)"
1166
+ ]
1167
+ },
1168
+ {
1169
+ "cell_type": "code",
1170
+ "execution_count": 31,
1171
+ "metadata": {},
1172
+ "outputs": [],
1173
+ "source": [
1174
+ "from magentic_ui.tools.playwright import PlaywrightController\n",
1175
+ "browser_controller = PlaywrightController(viewport_width=1280, viewport_height=720)"
1176
+ ]
1177
+ },
1178
+ {
1179
+ "cell_type": "markdown",
1180
+ "metadata": {},
1181
+ "source": [
1182
+ "The PlaywrightController has a lot of convenience methods that have been debugged extensively so that we can perform actions on the browser more reliably and securely.\n",
1183
+ "\n",
1184
+ "There are methods to get the interactive elements, get the screenshot, click, type, scroll, manage tabs, hover, describe pages in markdown and much more."
1185
+ ]
1186
+ },
1187
+ {
1188
+ "cell_type": "markdown",
1189
+ "metadata": {},
1190
+ "source": [
1191
+ "For now, let's navigate to Bing using our `browser_controller`."
1192
+ ]
1193
+ },
1194
+ {
1195
+ "cell_type": "code",
1196
+ "execution_count": 33,
1197
+ "metadata": {},
1198
+ "outputs": [],
1199
+ "source": [
1200
+ "_ = await browser_controller.visit_page(page, \"https://www.bing.com\")"
1201
+ ]
1202
+ },
1203
+ {
1204
+ "cell_type": "markdown",
1205
+ "metadata": {},
1206
+ "source": [
1207
+ "The visit_page method only returns when the page is fully loaded."
1208
+ ]
1209
+ },
1210
+ {
1211
+ "cell_type": "markdown",
1212
+ "metadata": {},
1213
+ "source": [
1214
+ "Now let us get the set of interactive elements:"
1215
+ ]
1216
+ },
1217
+ {
1218
+ "cell_type": "code",
1219
+ "execution_count": null,
1220
+ "metadata": {},
1221
+ "outputs": [],
1222
+ "source": [
1223
+ "interactive_elements = await browser_controller.get_interactive_rects(page)\n",
1224
+ "# print the first 20 interactive elements\n",
1225
+ "i = 0\n",
1226
+ "for element in interactive_elements:\n",
1227
+ " print(f\"Element {i}: id={element}, data={interactive_elements[element]}\")\n",
1228
+ " i += 1\n",
1229
+ " if i > 20:\n",
1230
+ " break"
1231
+ ]
1232
+ },
1233
+ {
1234
+ "cell_type": "markdown",
1235
+ "metadata": {},
1236
+ "source": [
1237
+ "You'll notice that this ran much faster than using the Playwright script in the Level 1 tutorial because here we are using JavaScript to extract the elements instead of going through the Playwright API.\n",
1238
+ "\n",
1239
+ "Our searchbox is now Element id 22 and has the following data:\n",
1240
+ "\n",
1241
+ "\n",
1242
+ " Element 12: id=22, data={'tag_name': 'textarea', 'role': 'textbox', 'aria_name': '0 characters out of 2000', 'v_scrollable': False, 'rects': [{'x': 193, 'y': 158, 'width': 843, 'height': 22, 'top': 158, 'right': 1036, 'bottom': 180, 'left': 193}]}\n",
1243
+ "\n",
1244
+ "To type in the searchbox we can use the fill_id method of the PlaywrightController:"
1245
+ ]
1246
+ },
1247
+ {
1248
+ "cell_type": "code",
1249
+ "execution_count": 38,
1250
+ "metadata": {},
1251
+ "outputs": [],
1252
+ "source": [
1253
+ "\n",
1254
+ "await browser_controller.fill_id(page, \"22\", \"Magentic-UI\")\n"
1255
+ ]
1256
+ },
1257
+ {
1258
+ "cell_type": "markdown",
1259
+ "metadata": {},
1260
+ "source": [
1261
+ "Let's check if we are on the right page:"
1262
+ ]
1263
+ },
1264
+ {
1265
+ "cell_type": "code",
1266
+ "execution_count": null,
1267
+ "metadata": {},
1268
+ "outputs": [],
1269
+ "source": [
1270
+ "from PIL import Image\n",
1271
+ "import io\n",
1272
+ "from IPython.display import display\n",
1273
+ "\n",
1274
+ "screenshot = await browser_controller.get_screenshot(page)\n",
1275
+ "image = Image.open(io.BytesIO(screenshot))\n",
1276
+ "display(image)\n"
1277
+ ]
1278
+ },
1279
+ {
1280
+ "cell_type": "markdown",
1281
+ "metadata": {},
1282
+ "source": [
1283
+ "We can also easily extract the search results using the get_page_markdown method that uses the [`markitdown`](https://github.com/microsoft/markitdown) package from our team at Microsoft Research."
1284
+ ]
1285
+ },
1286
+ {
1287
+ "cell_type": "code",
1288
+ "execution_count": null,
1289
+ "metadata": {},
1290
+ "outputs": [],
1291
+ "source": [
1292
+ "page_text = await browser_controller.get_page_markdown(page)\n",
1293
+ "print(page_text)"
1294
+ ]
1295
+ },
1296
+ {
1297
+ "cell_type": "markdown",
1298
+ "metadata": {},
1299
+ "source": [
1300
+ "The final thing we need is to get the set-of-marks image:"
1301
+ ]
1302
+ },
1303
+ {
1304
+ "cell_type": "code",
1305
+ "execution_count": null,
1306
+ "metadata": {},
1307
+ "outputs": [],
1308
+ "source": [
1309
+ "from magentic_ui.agents.web_surfer._set_of_mark import add_set_of_mark\n",
1310
+ "\n",
1311
+ "\n",
1312
+ "interactive_elements = await browser_controller.get_interactive_rects(page)\n",
1313
+ "screenshot = await browser_controller.get_screenshot(page)\n",
1314
+ "som_screenshot, visible_elements, elements_above, elements_below, _ = add_set_of_mark(\n",
1315
+ " screenshot, interactive_elements, use_sequential_ids=True\n",
1316
+ ")\n",
1317
+ "\n",
1318
+ "display(som_screenshot)\n"
1319
+ ]
1320
+ },
1321
+ {
1322
+ "cell_type": "markdown",
1323
+ "metadata": {},
1324
+ "source": [
1325
+ "The add_set_of_mark method returns the SoM screenshot in addition to elements visible on the viewport, elements above the viewport and elements below the viewport.\n",
1326
+ "\n",
1327
+ "We can see how much the `magentic-ui` library makes our life easier with these tools; we are now ready to re-implement the agent from Level 1!"
1328
+ ]
1329
+ },
1330
+ {
1331
+ "cell_type": "markdown",
1332
+ "metadata": {},
1333
+ "source": [
1334
+ "## Step 3: Putting it all together"
1335
+ ]
1336
+ },
1337
+ {
1338
+ "cell_type": "markdown",
1339
+ "metadata": {},
1340
+ "source": [
1341
+ "Using the tools from the `magentic-ui` library now we can more easily implement our BrowserUseAgent:"
1342
+ ]
1343
+ },
1344
+ {
1345
+ "cell_type": "code",
1346
+ "execution_count": 71,
1347
+ "metadata": {},
1348
+ "outputs": [],
1349
+ "source": [
1350
+ "from openai import OpenAI\n",
1351
+ "from playwright.async_api import Page\n",
1352
+ "from playwright.async_api import async_playwright\n",
1353
+ "from PIL import Image, ImageDraw\n",
1354
+ "import io\n",
1355
+ "import base64\n",
1356
+ "import json\n",
1357
+ "from dataclasses import dataclass\n",
1358
+ "from IPython.display import display\n",
1359
+ "from magentic_ui.tools.playwright import LocalPlaywrightBrowser\n",
1360
+ "from magentic_ui.tools.playwright import PlaywrightController\n",
1361
+ "from magentic_ui.agents.web_surfer._set_of_mark import add_set_of_mark\n",
1362
+ "\n",
1363
+ "\n",
1364
+ "AGENT_PROMPT = \"\"\"\n",
1365
+ "You are a helpful assistant that can navigate a web page and perform actions on it.\n",
1366
+ "\n",
1367
+ "The task we are trying to complete is:\n",
1368
+ "{task}\n",
1369
+ "\n",
1370
+ "The current visible text on the page is:\n",
1371
+ "{page_text}\n",
1372
+ "\n",
1373
+ "The current visible elements on the page are:\n",
1374
+ "{formatted_list_of_elements}\n",
1375
+ "\n",
1376
+ "You will need to decide on the next action to take.\n",
1377
+ "\n",
1378
+ "The action space is:\n",
1379
+ "- goto(url): navigate to a URL\n",
1380
+ "- click(id): click a button given it's ID\n",
1381
+ "- type(id, text): type \"text\" into element \"id\"\n",
1382
+ "- scroll(direction): scroll the page in direction up or down.\n",
1383
+ "- stop_action(final_answer): declare that we have finished the task and prepare a final_answer to return to the user.\n",
1384
+ "\n",
1385
+ "Output a JSON object with the following fields:\n",
1386
+ "{{\n",
1387
+ " \"action\": \"goto\" | \"click\" | \"type\" | \"scroll\" | \"stop_action\",\n",
1388
+ " \"action_args\": {{\n",
1389
+ " \"url\": \"https://www.google.com\",\n",
1390
+ " \"id\": \"123\",\n",
1391
+ " \"text\": \"Hello\",\n",
1392
+ " \"direction\": \"up\"\n",
1393
+ " }}\n",
1394
+ "}}\n",
1395
+ "\n",
1396
+ "Only output the JSON object, no other text or comments.\n",
1397
+ "\"\"\"\n",
1398
+ "\n",
1399
+ "\n",
1400
+ "class BrowserUseAgent:\n",
1401
+ " def __init__(\n",
1402
+ " self,\n",
1403
+ " client: OpenAI,\n",
1404
+ " model: str = \"gpt-4o\",\n",
1405
+ " headless: bool = False,\n",
1406
+ " run_in_jupyter: bool = True,\n",
1407
+ " ):\n",
1408
+ " self.client = client\n",
1409
+ " self.model = model\n",
1410
+ " self.headless = headless\n",
1411
+ " self.message_history: list[dict] = []\n",
1412
+ " self.page: Page = None\n",
1413
+ " self.run_in_jupyter = run_in_jupyter\n",
1414
+ " self.browser_controller = PlaywrightController(\n",
1415
+ " viewport_width=1280, viewport_height=720\n",
1416
+ " )\n",
1417
+ "\n",
1418
+ " async def _launch_browser(self) -> None:\n",
1419
+ " self.browser = LocalPlaywrightBrowser(headless=False)\n",
1420
+ " # Start the browser\n",
1421
+ " await self.browser._start()\n",
1422
+ " # Get the browser context and start a new page\n",
1423
+ " self.context = self.browser.browser_context\n",
1424
+ " self.page = await self.context.new_page()\n",
1425
+ "\n",
1426
+ " async def execute_task(self, task: str) -> str:\n",
1427
+ " \"\"\"\n",
1428
+ " This is NEW! This is the main function that will be called to execute the task and implement our agent loop.\n",
1429
+ " \"\"\"\n",
1430
+ " # Step 1: Launch the browser if it's not already launched\n",
1431
+ " if self.page is None:\n",
1432
+ " await self._launch_browser()\n",
1433
+ " # Our stop condition is when the LLM decides to output stop_action\n",
1434
+ " should_stop = False\n",
1435
+ " final_answer = None\n",
1436
+ " i = 0\n",
1437
+ " while not should_stop:\n",
1438
+ " # Step 2: Prepare the page for the agent\n",
1439
+ " (\n",
1440
+ " page_text,\n",
1441
+ " formatted_list_of_elements,\n",
1442
+ " som_screenshot,\n",
1443
+ " ) = await self._prepare_page_for_agent(self.page)\n",
1444
+ " # Step 3: Get the LLM response\n",
1445
+ " image_data_url = f\"data:image/png;base64,{base64.b64encode((lambda b: (som_screenshot.save(b, format='PNG'), b.getvalue())[1])(io.BytesIO())).decode()}\"\n",
1446
+ " action_decision = self._get_llm_response(\n",
1447
+ " self.client,\n",
1448
+ " task,\n",
1449
+ " page_text,\n",
1450
+ " formatted_list_of_elements,\n",
1451
+ " image_data_url,\n",
1452
+ " self.message_history,\n",
1453
+ " self.model,\n",
1454
+ " )\n",
1455
+ " print(f\"Action decision {i}: {action_decision}\")\n",
1456
+ " # Add the action decision to the message history\n",
1457
+ " self.message_history.append(\n",
1458
+ " {\n",
1459
+ " \"role\": \"user\",\n",
1460
+ " \"content\": [{\"type\": \"text\", \"text\": json.dumps(action_decision)}],\n",
1461
+ " }\n",
1462
+ " )\n",
1463
+ " # Step 4: Execute the action with some error handling\n",
1464
+ " try:\n",
1465
+ " action_feedback = await self._execute_action(\n",
1466
+ " action_decision[\"action\"], action_decision[\"action_args\"], self.page\n",
1467
+ " )\n",
1468
+ " except Exception as e:\n",
1469
+ " print(f\"Error executing action {i}: {e}\")\n",
1470
+ " action_feedback = f\"Error executing action {i}: {e}\"\n",
1471
+ " print(f\"Action feedback {i}: {action_feedback}\")\n",
1472
+ " # Sleep for 3 seconds to let the page load\n",
1473
+ " await self.page.wait_for_timeout(3000)\n",
1474
+ " # Update the message history with feedback on the action and the new page screenshot\n",
1475
+ " new_page_screenshot = await self.page.screenshot()\n",
1476
+ " self.message_history.append(\n",
1477
+ " {\n",
1478
+ " \"role\": \"user\",\n",
1479
+ " \"content\": [\n",
1480
+ " {\"type\": \"text\", \"text\": action_feedback},\n",
1481
+ " {\n",
1482
+ " \"type\": \"image_url\",\n",
1483
+ " \"image_url\": {\n",
1484
+ " \"url\": f\"data:image/png;base64,{base64.b64encode(new_page_screenshot).decode()}\"\n",
1485
+ " },\n",
1486
+ " },\n",
1487
+ " ],\n",
1488
+ " }\n",
1489
+ " )\n",
1490
+ " if self.run_in_jupyter:\n",
1491
+ " display(Image.open(io.BytesIO(new_page_screenshot)))\n",
1492
+ " # Check if the task is complete\n",
1493
+ " should_stop = action_decision[\"action\"] == \"stop_action\"\n",
1494
+ " if should_stop:\n",
1495
+ " final_answer = action_decision[\"action_args\"][\"final_answer\"]\n",
1496
+ " i += 1\n",
1497
+ " return final_answer\n",
1498
+ "\n",
1499
+ " async def _prepare_page_for_agent(self, page: Page) -> tuple[str, str, bytes]:\n",
1500
+ " interactive_elements = await self.browser_controller.get_interactive_rects(page)\n",
1501
+ " screenshot = await self.browser_controller.get_screenshot(page)\n",
1502
+ " som_screenshot, visible_elements, elements_above, elements_below, _ = (\n",
1503
+ " add_set_of_mark(screenshot, interactive_elements, use_sequential_ids=False)\n",
1504
+ " )\n",
1505
+ " visible_elements_formatted = \"\"\n",
1506
+ " for element_id in visible_elements:\n",
1507
+ " element_data = interactive_elements[element_id]\n",
1508
+ " visible_elements_formatted += f\"{element_id}: {element_data}\\n\"\n",
1509
+ "\n",
1510
+ " page_text = await self.browser_controller.get_page_markdown(page)\n",
1511
+ " return page_text, visible_elements_formatted, som_screenshot\n",
1512
+ " async def _execute_action(self, action: str, action_args: dict, page: Page) -> str:\n",
1513
+ " if action == \"goto\":\n",
1514
+ " await self.browser_controller.visit_page(page, action_args[\"url\"])\n",
1515
+ " return f\"Visited {action_args['url']}\"\n",
1516
+ " elif action == \"click\":\n",
1517
+ " await self.browser_controller.click_id(self.context, page, action_args[\"id\"])\n",
1518
+ " return f\"Clicked {action_args['id']}\"\n",
1519
+ " elif action == \"type\":\n",
1520
+ " await self.browser_controller.fill_id(page, action_args[\"id\"], action_args[\"text\"])\n",
1521
+ " return f\"Typed {action_args['text']} into {action_args['id']}\"\n",
1522
+ " elif action == \"scroll\":\n",
1523
+ " if action_args[\"direction\"] == \"up\":\n",
1524
+ " await self.browser_controller.page_up(page)\n",
1525
+ " elif action_args[\"direction\"] == \"down\":\n",
1526
+ " await self.browser_controller.page_down(page)\n",
1527
+ " return f\"Scrolled {action_args['direction']}\"\n",
1528
+ " elif action == \"stop_action\":\n",
1529
+ " return action_args[\"final_answer\"]\n",
1530
+ " else:\n",
1531
+ " raise ValueError(f\"Invalid action: {action}\")\n",
1532
+ "\n",
1533
+ " def _get_llm_response(\n",
1534
+ " self,\n",
1535
+ " client: OpenAI, # OpenAI client\n",
1536
+ " task: str, # Task to complete\n",
1537
+ " page_text: str, # Page text\n",
1538
+ " formatted_list_of_elements: str, # Formatted list of elements\n",
1539
+ " image_data_url: str, # Image data URL\n",
1540
+ " message_history: list[dict] = [], # Message history\n",
1541
+ " model: str = \"gpt-4o\", # Model to use\n",
1542
+ " ) -> dict:\n",
1543
+ " response = client.chat.completions.create(\n",
1544
+ " model=model,\n",
1545
+ " messages=[\n",
1546
+ " *message_history,\n",
1547
+ " {\n",
1548
+ " \"role\": \"user\",\n",
1549
+ " \"content\": [\n",
1550
+ " {\n",
1551
+ " \"type\": \"text\",\n",
1552
+ " \"text\": AGENT_PROMPT.format(\n",
1553
+ " task=task,\n",
1554
+ " page_text=page_text,\n",
1555
+ " formatted_list_of_elements=formatted_list_of_elements,\n",
1556
+ " ),\n",
1557
+ " },\n",
1558
+ " {\n",
1559
+ " \"type\": \"image_url\",\n",
1560
+ " \"image_url\": {\"url\": image_data_url},\n",
1561
+ " },\n",
1562
+ " ],\n",
1563
+ " },\n",
1564
+ " ],\n",
1565
+ " )\n",
1566
+ "\n",
1567
+ " # Parse the response\n",
1568
+ " try:\n",
1569
+ " action_decision = json.loads(response.choices[0].message.content)\n",
1570
+ " except json.JSONDecodeError:\n",
1571
+ " # it starts with ```json\n",
1572
+ " response_content = response.choices[0].message.content\n",
1573
+ " response_content = response_content.replace(\"```json\", \"\").replace(\n",
1574
+ " \"```\", \"\"\n",
1575
+ " )\n",
1576
+ " action_decision = json.loads(response_content)\n",
1577
+ " except Exception as e:\n",
1578
+ " raise e\n",
1579
+ " return action_decision\n",
1580
+ "\n",
1581
+ " async def close(self) -> None:\n",
1582
+ " if self.page is not None:\n",
1583
+ " await self.page.close()\n",
1584
+ " if self.context is not None:\n",
1585
+ " await self.context.close()\n"
1586
+ ]
1587
+ },
1588
+ {
1589
+ "cell_type": "code",
1590
+ "execution_count": null,
1591
+ "metadata": {},
1592
+ "outputs": [],
1593
+ "source": [
1594
+ "from openai import OpenAI\n",
1595
+ "import os\n",
1596
+ "openai_client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
1597
+ "agent = BrowserUseAgent(openai_client)\n",
1598
+ "try:\n",
1599
+ " final_answer = await agent.execute_task(\"find the open issues assigned to husseinmozannar on the microsoft/magentic-ui repo on github\")\n",
1600
+ " print(final_answer)\n",
1601
+ "finally:\n",
1602
+ " await agent.close()"
1603
+ ]
1604
+ },
1605
+ {
1606
+ "cell_type": "markdown",
1607
+ "metadata": {},
1608
+ "source": [
1609
+ "Success! Our agent again performed the task correctly!\n",
1610
+ "\n",
1611
+ "With this tutorial, I hope to have convinced you that `magentic-ui` can help you build a browser-use agent more easily. You might be curious how to build the best browser-use agent possible given this, and we have already implemented one for you with many features that we haven't discussed previously in Magentic-UI which we will discuss next."
1612
+ ]
1613
+ },
1614
+ {
1615
+ "cell_type": "markdown",
1616
+ "metadata": {},
1617
+ "source": [
1618
+ "# Level 3: Using the WebSurfer Agent from Magentic-UI\n",
1619
+ "\n",
1620
+ "We have a reference implementation of a capable browser use agent in Magentic-UI which we call the `WebSurfer` agent. I'll show you now how to use it. \n"
1621
+ ]
1622
+ },
1623
+ {
1624
+ "cell_type": "markdown",
1625
+ "metadata": {},
1626
+ "source": [
1627
+ "\n",
1628
+ "`WebSurfer` is an AutoGen AgentChat agent built using the tools we have seen previously to complete actions autonomously on the web. We have spent a lot of time fixing many many edge cases that arise on the web to arrive at a more reliable (but not perfect) browser use agent.\n",
1629
+ "This agent builds on the [`MultimodalWebSurfer`](https://microsoft.github.io/autogen/stable/reference/python/autogen_ext.agents.web_surfer.html) agent from AutoGen that we previously developed. \n",
1630
+ "\n",
1631
+ "Let's see now how to use it!"
1632
+ ]
1633
+ },
1634
+ {
1635
+ "cell_type": "code",
1636
+ "execution_count": null,
1637
+ "metadata": {},
1638
+ "outputs": [],
1639
+ "source": [
1640
+ "\n",
1641
+ "from autogen_ext.models.openai import OpenAIChatCompletionClient\n",
1642
+ "from magentic_ui.agents import WebSurfer\n",
1643
+ "from magentic_ui.tools.playwright import (\n",
1644
+ " LocalPlaywrightBrowser,\n",
1645
+ ")\n",
1646
+ "\n",
1647
+ "browser = LocalPlaywrightBrowser(headless=False)\n",
1648
+ "\n",
1649
+ "model_client = OpenAIChatCompletionClient(model=\"gpt-4o\")\n",
1650
+ "\n",
1651
+ "web_surfer = WebSurfer(\n",
1652
+ " name=\"web_surfer\",\n",
1653
+ " model_client=model_client, # Use any client from AutoGen!\n",
1654
+ " animate_actions=True, # Set to True if you want to see the actions being animated!\n",
1655
+ " max_actions_per_step=10, # Maximum number of actions to perform before returning\n",
1656
+ " downloads_folder=\"debug\", # Where to save downloads\n",
1657
+ " debug_dir=\"debug\", # Where to save debug files and screenshots\n",
1658
+ " to_save_screenshots=False, # set to True if you want to save screenshots of the actions\n",
1659
+ " browser=browser, # Use any browser from Magentic-UI!\n",
1660
+ " multiple_tools_per_call=False, # Set to True if you want to use multiple tools per call\n",
1661
+ " json_model_output=False, # Set to True if your model does not support tool calling\n",
1662
+ ")\n",
1663
+ "await web_surfer.lazy_init()\n",
1664
+ "\n",
1665
+ "task = \"find the open issues assigned to husseinmozannar on the microsoft/magentic-ui repo on github\"\n",
1666
+ "try:\n",
1667
+ " messages = []\n",
1668
+ " async for message in web_surfer.run_stream(task=task):\n",
1669
+ " messages.append(message)\n",
1670
+ " print(message)\n",
1671
+ " print(\"########################################################\")\n",
1672
+ " print(\"Final answer:\")\n",
1673
+ " print(messages[-1].messages[-2].content)\n",
1674
+ "finally:\n",
1675
+ " await web_surfer.close()\n"
1676
+ ]
1677
+ },
1678
+ {
1679
+ "cell_type": "markdown",
1680
+ "metadata": {},
1681
+ "source": [
1682
+ "We encourage you to experiment using the sample code file [sample_web_surfer.py](https://github.com/microsoft/magentic-ui/blob/main/samples/sample_web_surfer.py) and to use the Magentic-UI application which provides a web UI to interact with the WebSurfer agent and launch multiple parallel tasks and more!\n",
1683
+ "\n",
1684
+ "Just run:\n",
1685
+ "\n",
1686
+ "```bash\n",
1687
+ "python3 -m venv .venv\n",
1688
+ "source .venv/bin/activate\n",
1689
+ "pip install magentic-ui\n",
1690
+ "# export OPENAI_API_KEY=<YOUR API KEY>\n",
1691
+ "magentic ui --port 8081\n",
1692
+ "```\n",
1693
+ "See [https://github.com/microsoft/magentic-ui](https://github.com/microsoft/magentic-ui) for the full instructions.\n",
1694
+ "\n",
1695
+ "![../img/magenticui_running.png](../img/magenticui_running.png)"
1696
+ ]
1697
+ },
1698
+ {
1699
+ "cell_type": "markdown",
1700
+ "metadata": {},
1701
+ "source": [
1702
+ "# What's next?\n",
1703
+ "\n",
1704
+ "\n"
1705
+ ]
1706
+ },
1707
+ {
1708
+ "cell_type": "markdown",
1709
+ "metadata": {},
1710
+ "source": [
1711
+ "\n",
1712
+ "## Evaluation\n",
1713
+ "\n",
1714
+ "The first thing you might be curious about is how well does the WebSurfer agent perform?\n",
1715
+ "\n",
1716
+ "In Magentic-UI, we have built a small evaluation library [magentic-ui/eval](https://github.com/microsoft/magentic-ui/tree/main/src/magentic_ui/eval) that implements popular browser-use benchmarks and makes it easy to run evals. We will be building a bit on this library and will have a tutorial on how to use it.\n",
1717
+ "\n",
1718
+ " Magentic-UI has been tested against several benchmarks when running with o4-mini: [GAIA](https://huggingface.co/datasets/gaia-benchmark/GAIA) test set (42.52%), which assesses general AI assistants across reasoning, tool use, and web interaction tasks; [AssistantBench](https://huggingface.co/AssistantBench) test set (27.60%), focusing on realistic, time-consuming web tasks; [WebVoyager](https://github.com/MinorJerry/WebVoyager) (82.2%), measuring end-to-end web navigation in real-world scenarios; and [WebGames](https://webgames.convergence.ai/) (45.5%), evaluating general-purpose web-browsing agents through interactive challenges.\n",
1719
+ "To reproduce these experimental results, please see the following [instructions](experiments/README.md).\n",
1720
+ "\n",
1721
+ "For reference, the current SOTA on WebVoyager is the [browser-use library](https://browser-use.com/posts/sota-technical-report) using GPT-4o achieving 89%. Note that the WebVoyager evaluation is not consistent across different systems as it relies on a mix of LLM-as-a-judge evaluation and human evaluation.\n",
1722
+ "\n",
1723
+ "\n",
1724
+ "## Limitations\n",
1725
+ "\n",
1726
+ "Using the Set-Of-Mark approach for building the Browser Use Agent has many limitations (note that both Magentic-UI and the Browser Use library use SoM). For instance, our agent will fail on any task that requires understanding coordinates on the screen.\n",
1727
+ "Examples:\n",
1728
+ "\n",
1729
+ "- dragging an element from position A to position B\n",
1730
+ "- drawing on the screen\n",
1731
+ "- playing web games\n",
1732
+ "\n",
1733
+ "Moreover, it will not generalize to any Computer Use task where we might not have the DOM to obtain element coordinates. Therefore, we will need to have a model that can click on specific coordinates rather than using element IDs. The [UI-Tars](https://github.com/bytedance/UI-TARS) models have such an ability, as does the latest [computer-use-preview API](https://platform.openai.com/docs/guides/tools-computer-use) from OpenAI. Another approach is to use a grounding or parsing model instead of the DOM, such as [OmniParser](https://microsoft.github.io/OmniParser/), to obtain element IDs from any GUI interface combined with a tool-calling LLM.\n",
1734
+ "\n",
1735
+ "\n",
1736
+ "Another limitation is that these agents are not *real-time*, so tasks such as video understanding or playing games become almost impossible natively, as there is a delay of multiple seconds between each agent action.\n",
1737
+ "\n",
1738
+ "## Safety\n",
1739
+ "\n",
1740
+ "Current LLMs are still very prone to adversarial attacks on the web; see these papers for how bad things can get with current models, even those tuned directly for CUA:\n",
1741
+ "\n",
1742
+ "- [Commercial LLM Agents Are Already Vulnerable to Simple Yet Dangerous Attacks\n",
1743
+ "](https://arxiv.org/html/2502.08586v1)\n",
1744
+ "- [RedTeamCUA:\n",
1745
+ "Realistic Adversarial Testing of Computer-Use Agents in\n",
1746
+ "Hybrid Web-OS Environments](https://osu-nlp-group.github.io/RedTeamCUA/)\n",
1747
+ "\n",
1748
+ "We recommend having guardrails built into the agent that allow the human to approve actions if needed. We call such guardrails \"ActionGuard\" in Magentic-UI; they allow you to define heuristics, in addition to LLM judgment, for when actions might need human approval.\n",
1749
+ "\n",
1750
+ "\n"
1751
+ ]
1752
+ },
1753
+ {
1754
+ "cell_type": "markdown",
1755
+ "metadata": {},
1756
+ "source": [
1757
+ "If you've made it this far I really appreciate you taking the time to read and hope you've enjoyed following along!"
1758
+ ]
1759
+ }
1760
+ ],
1761
+ "metadata": {
1762
+ "kernelspec": {
1763
+ "display_name": "Python 3 (ipykernel)",
1764
+ "language": "python",
1765
+ "name": "python3"
1766
+ },
1767
+ "language_info": {
1768
+ "codemirror_mode": {
1769
+ "name": "ipython",
1770
+ "version": 3
1771
+ },
1772
+ "file_extension": ".py",
1773
+ "mimetype": "text/x-python",
1774
+ "name": "python",
1775
+ "nbconvert_exporter": "python",
1776
+ "pygments_lexer": "ipython3",
1777
+ "version": "3.12.6"
1778
+ }
1779
+ },
1780
+ "nbformat": 4,
1781
+ "nbformat_minor": 2
1782
+ }
experiments/endpoint_configs/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ config.yaml
2
+ exp_configs/*
3
+ exp_configs
experiments/endpoint_configs/config_template.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # IMPORTANT: This file is a template with default configurations.
2
+ # To use it, make a copy in the same directory and rename it to `config.yaml`
3
+ model_config_4o_openai: &client_4o_openai
4
+ provider: OpenAIChatCompletionClient
5
+ config:
6
+ model: gpt-4o-2024-08-06
7
+ max_retries: 5
8
+
9
+ orchestrator_client: *client_4o_openai
10
+ coder_client: *client_4o_openai
11
+ web_surfer_client: *client_4o_openai
12
+ file_surfer_client: *client_4o_openai
13
+ action_guard_client: *client_4o_openai
14
+ user_proxy_client: *client_4o_openai
15
+ model_client: *client_4o_openai
experiments/endpoint_configs/test_client.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import asyncio
3
+ from autogen_core.models import ChatCompletionClient, UserMessage
4
+
5
+
6
+ async def test_chat_completion_client() -> None:
7
+ # Load the config file
8
+ print("Loading config...")
9
+ with open("config.yaml", "r") as f:
10
+ config = yaml.safe_load(f)
11
+
12
+ # Get the orchestrator client config
13
+ client_config = config.get("orchestrator_client")
14
+ print(f"Loaded client config: {client_config}")
15
+
16
+ # Initialize the client
17
+ print("Initializing client...")
18
+ client = ChatCompletionClient.load_component(client_config)
19
+
20
+ # Test a simple completion
21
+ print("Testing completion...")
22
+ response = await client.create(
23
+ messages=[UserMessage(content="Say hello", source="user")]
24
+ )
25
+ print(f"Response content: {response.content}")
26
+
27
+ await client.close()
28
+
29
+
30
+ if __name__ == "__main__":
31
+ asyncio.run(test_chat_completion_client())
experiments/eval/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ data
2
+ runs
experiments/eval/README.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Reproducing Experimental Results
2
+
3
+ Make sure to clone the repo and install Magentic-UI with the following command:
4
+ ```bash
5
+ pip install magentic-ui[eval]
6
+ ```
7
+
8
+ From the root of the repo you can run these commands to reproduce our experimental results. Note that running the full experiments may take hours and each task may cost up to $0.5 of API credits when using OpenAI models.
9
+
10
+ To evaluate an existing run or get partial results, replace "--mode run" with "--mode eval". See [experiments/eval/run.py](experiments/eval/run.py) for more information about the arguments.
11
+
12
+ The run.py script takes care of running Magentic-UI on the benchmark of choice. It will download the data into the `./data` folder at the root of the repo and store the run logs inside `runs/[SYSTEM NAME]/[DATASET NAME]/[SPLIT NAME]/[RUN ID]`. Inside this folder you'll find a folder for each task with files containing the run messages (`[TASK_ID]_messages.json`), time data (`times.json`), token usage data (`model_tokens_usage.json`), evaluation scores (`score.json`) and any screenshots (`screenshot_raw_[TIMESTAMP].png` and `screenshot_som_[TIMESTAMP].png`) or produced files. You will also find a `metrics.json` file with metrics for the entire run.
13
+
14
+
15
+ **NOTE:** Make sure to create a config file with your model client endpoints. We provide a template config file [config_template.yaml](../endpoint_configs/config_template.yaml) that you should adapt. You should copy and rename this file to `config.yaml` inside `experiments/endpoint_configs` directory.
16
+
17
+ ## WebGames
18
+
19
+ ```bash
20
+ python experiments/eval/run.py --current-dir . --dataset WebGames --split test --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run
21
+ ```
22
+
23
+ ## WebVoyager
24
+
25
+ ```bash
26
+ python experiments/eval/run.py --current-dir . --dataset WebVoyager --split webvoyager --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --web-surfer-only true --mode run
27
+ ```
28
+
29
+ ## GAIA
30
+
31
+ ### Simulated User
32
+
33
+ On the validation set we first get autonomous performance:
34
+
35
+ ```bash
36
+ python experiments/eval/run.py --current-dir . --dataset Gaia --split validation --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run
37
+ ```
38
+
39
+ Then the simulated user with a stronger model (make sure your config file is correct first).
40
+
41
+ ```bash
42
+ python experiments/eval/run.py --current-dir . --dataset Gaia --split validation --run-id 2 --simulated-user-type co-planning-and-execution --how-helpful-user-proxy no_hints --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run
43
+ ```
44
+
45
+ Then the simulated user with access to metadata.
46
+
47
+ ```bash
48
+ python experiments/eval/run.py --current-dir . --dataset Gaia --split validation --run-id 3 --simulated-user-type co-planning-and-execution --how-helpful-user-proxy soft --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run
49
+ ```
50
+
51
+ To explore the results of these runs, you can use the following scripts that generate a CSV inside the logs directory:
52
+
53
+ ```bash
54
+ python experiments/eval/explore_results.py --run-dir runs/MagenticUI_co-planning-and-execution_soft/Gaia/validation/3 --data-dir data/Gaia
55
+ ```
56
+
57
+ and
58
+
59
+ ```bash
60
+ python experiments/eval/analyze_sim_user.py --run-dir runs/MagenticUI_co-planning-and-execution_soft/Gaia/validation/3
61
+ ```
62
+
63
+ ### Test Set
64
+
65
+ ```bash
66
+ python experiments/eval/run.py --current-dir . --dataset Gaia --split test --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run
67
+ ```
68
+
69
+ You can use the [experiments/eval/prepare_for_submission.py](experiments/eval/prepare_for_submission.py) script to submit to the Gaia and AssistantBench leaderboard.
70
+
71
+ ## AssistantBench
72
+
73
+ ```bash
74
+ python experiments/eval/run.py --current-dir . --dataset AssistantBench --split test --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run
75
+ ```
experiments/eval/analyze_sim_user.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import logging
3
+ import pandas as pd
4
+ from typing import Dict, Any, Optional, Union
5
+ import argparse
6
+ import os
7
+ from datetime import datetime
8
+
9
+ from autogen_core.models import (
10
+ ChatCompletionClient,
11
+ UserMessage,
12
+ SystemMessage,
13
+ )
14
+
15
+ # Setup logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Configure model client
20
+ default_client_config = {
21
+ "provider": "OpenAIChatCompletionClient",
22
+ "config": {
23
+ "model": "gpt-4o-2024-08-06",
24
+ },
25
+ "max_retries": 5,
26
+ }
27
+
28
+
29
+ def get_model_client(
30
+ endpoint_config: Optional[Union[Dict[str, Any], None]],
31
+ ) -> ChatCompletionClient:
32
+ if endpoint_config is None:
33
+ return ChatCompletionClient.load_component(default_client_config)
34
+ return ChatCompletionClient.load_component(endpoint_config)
35
+
36
+
37
+ async def check_ground_truth_in_messages(
38
+ question: str, ground_truth: str, user_messages_str: str
39
+ ) -> int:
40
+ """
41
+ Ask the model if ground_truth is included in user_messages
42
+ Returns 1 if ground truth is included, 0 otherwise
43
+ """
44
+ prompt = f"""
45
+
46
+ I have a question: "{question}"
47
+
48
+
49
+ I have a list of messages a user sent to an AI assistant: {user_messages_str}
50
+
51
+
52
+ The question has a ground truth answer: "{ground_truth}"
53
+
54
+
55
+ Check if any of the user messages reveal directly the ground truth answer to the AI assistant.
56
+ In cases where the ground truth is found in the user messages but the message looks like a validation of the answer, then you should output 0.
57
+ Answer with ONLY 1 if the ground truth is included exactly, or ONLY 0 if it is not.
58
+ IMPORTANT: Only output 1 or 0, no other text.
59
+ """
60
+
61
+ model_client = get_model_client(None)
62
+ messages = [
63
+ SystemMessage(
64
+ content="You analyze if specific information is contained in messages."
65
+ ),
66
+ UserMessage(content=prompt, source="user"),
67
+ ]
68
+
69
+ try:
70
+ response = await model_client.create(messages=messages)
71
+ answer = response.content.strip()
72
+
73
+ # Ensure we only get 0 or 1
74
+ if answer == "1":
75
+ result = 1
76
+ else:
77
+ result = 0
78
+
79
+ await model_client.close()
80
+ return result
81
+ except Exception as e:
82
+ logger.error(f"Error calling model: {e}")
83
+ await model_client.close()
84
+ return -1
85
+
86
+
87
+ async def process_csv(csv_path: str, output_path: str) -> None:
88
+ """Process the CSV file and analyze if ground truth is in user messages"""
89
+ try:
90
+ df = pd.read_csv(csv_path)
91
+ logger.info(f"Loaded dataframe with {len(df)} rows")
92
+
93
+ # Create columns for the results
94
+ df["ground_truth_in_messages"] = None
95
+ df["trivial_ground_truth_in_messages"] = (
96
+ None # New column for trivial string match
97
+ )
98
+ df["llm_execution_count"] = 0 # New column for counting llm executions
99
+ df["llm_plan_count"] = 0 # New column for counting llm planning
100
+
101
+ for index, row in df.iterrows():
102
+ if pd.isna(row.get("ground_truth")) or pd.isna(row.get("user_messages")):
103
+ logger.warning(f"Missing data for row {index}")
104
+ continue
105
+
106
+ user_messages_str = str(row["user_messages"])
107
+ # Count llm executions in user messages
108
+ try:
109
+ messages = eval(user_messages_str)
110
+ llm_count = sum(
111
+ 1
112
+ for msg in messages
113
+ if isinstance(msg, dict)
114
+ and isinstance(msg.get("metadata"), dict)
115
+ and "user_execution_reply" in msg.get("metadata", {})
116
+ and msg["metadata"]["user_execution_reply"] == "llm"
117
+ )
118
+ plan_count = sum(
119
+ 1
120
+ for msg in messages
121
+ if isinstance(msg, dict)
122
+ and isinstance(msg.get("metadata"), dict)
123
+ and "user_plan_reply" in msg.get("metadata", {})
124
+ and msg["metadata"]["user_plan_reply"] == "llm"
125
+ )
126
+ df.at[index, "llm_execution_count"] = llm_count
127
+ df.at[index, "llm_plan_count"] = plan_count
128
+ except Exception as e:
129
+ logger.warning(
130
+ f"Could not parse messages for task {row.get('task_id', index)}: {e}"
131
+ )
132
+ df.at[index, "llm_execution_count"] = 0
133
+ df.at[index, "llm_plan_count"] = 0
134
+
135
+ answer = str(row.get("answer", "")).strip().lower()
136
+ # if answer == "unable to determine":
137
+ # df.at[index, "llm_execution_count"] = max(1, df.at[index, "llm_execution_count"])
138
+
139
+ logger.info(f"Processing task {row.get('task_id', index)}")
140
+ question = str(row["question"])
141
+ ground_truth = str(row["ground_truth"])
142
+ actual_user_messages = eval(user_messages_str)
143
+ actual_user_messages_str = ""
144
+ for msg in actual_user_messages:
145
+ actual_user_messages_str += f"{msg['content']}\n"
146
+ trivial_result = int(ground_truth in actual_user_messages_str)
147
+ df.at[index, "trivial_ground_truth_in_messages"] = trivial_result
148
+ result = await check_ground_truth_in_messages(
149
+ question, ground_truth, actual_user_messages_str
150
+ )
151
+ df.at[index, "ground_truth_in_messages"] = result
152
+ logger.info(
153
+ f"Task {row.get('task_id', index)}: result = {result}, llm executions = {df.at[index, 'llm_execution_count']}, llm planning = {df.at[index, 'llm_plan_count']}"
154
+ )
155
+
156
+ # Save results to new CSV
157
+ df.to_csv(output_path, index=False)
158
+ logger.info(f"Results saved to {output_path}")
159
+
160
+ # Calculate summary statistics (ALL TASKS)
161
+ counts = df["ground_truth_in_messages"].value_counts()
162
+ trivial_counts = df["trivial_ground_truth_in_messages"].value_counts()
163
+ total_valid = counts.sum()
164
+ trivial_total_valid = trivial_counts.sum()
165
+ percentage_included = (
166
+ (counts.get(1, 0) / total_valid * 100) if total_valid > 0 else 0
167
+ )
168
+ trivial_percentage_included = (
169
+ (trivial_counts.get(1, 0) / trivial_total_valid * 100)
170
+ if trivial_total_valid > 0
171
+ else 0
172
+ )
173
+
174
+ logger.info(
175
+ f"Summary (ALL TASKS): Ground truth included in {counts.get(1, 0)}/{total_valid} cases ({percentage_included:.2f}%)"
176
+ )
177
+ logger.info(
178
+ f"Trivial string match (ALL TASKS): Ground truth included in {trivial_counts.get(1, 0)}/{trivial_total_valid} cases ({trivial_percentage_included:.2f}%)"
179
+ )
180
+
181
+ mask_not_unable = (
182
+ df["answer"].astype(str).str.strip().str.lower() != "unable to determine"
183
+ )
184
+ df_not_unable = df[mask_not_unable]
185
+ # Ensure these are pandas Series for value_counts
186
+ gt_series = pd.Series(df_not_unable["ground_truth_in_messages"])
187
+ trivial_series = pd.Series(df_not_unable["trivial_ground_truth_in_messages"])
188
+ counts_not_unable = gt_series.value_counts()
189
+ trivial_counts_not_unable = trivial_series.value_counts()
190
+ total_valid_not_unable = counts_not_unable.sum()
191
+ trivial_total_valid_not_unable = trivial_counts_not_unable.sum()
192
+ percentage_included_not_unable = (
193
+ (counts_not_unable.get(1, 0) / total_valid_not_unable * 100)
194
+ if total_valid_not_unable > 0
195
+ else 0
196
+ )
197
+ trivial_percentage_included_not_unable = (
198
+ (trivial_counts_not_unable.get(1, 0) / trivial_total_valid_not_unable * 100)
199
+ if trivial_total_valid_not_unable > 0
200
+ else 0
201
+ )
202
+ logger.info(
203
+ f"Summary (EXCLUDING 'unable to determine'): Ground truth included in {counts_not_unable.get(1, 0)}/{total_valid_not_unable} cases ({percentage_included_not_unable:.2f}%)"
204
+ )
205
+ logger.info(
206
+ f"Trivial string match (EXCLUDING 'unable to determine'): Ground truth included in {trivial_counts_not_unable.get(1, 0)}/{trivial_total_valid_not_unable} cases ({trivial_percentage_included_not_unable:.2f}%)"
207
+ )
208
+
209
+ # Add summary statistics for llm executions
210
+ llm_stats = df["llm_execution_count"].describe()
211
+ tasks_with_execution = (df["llm_execution_count"] > 0).sum()
212
+ total_tasks = len(df)
213
+
214
+ # Get statistics for tasks with at least 1 execution
215
+ tasks_with_execution_df = df[df["llm_execution_count"] > 0]
216
+ tasks_with_planning = (df["llm_plan_count"] > 0).sum()
217
+ median_when_used = tasks_with_execution_df["llm_execution_count"].median()
218
+ mean_when_used = tasks_with_execution_df["llm_execution_count"].mean()
219
+
220
+ logger.info("\nLLM Execution Statistics:")
221
+ logger.info(
222
+ f"Tasks with at least 1 execution: {tasks_with_execution}/{total_tasks} ({(tasks_with_execution/total_tasks)*100:.2f}%)"
223
+ )
224
+ logger.info(
225
+ f"Tasks with at least 1 planning: {tasks_with_planning}/{total_tasks} ({(tasks_with_planning/total_tasks)*100:.2f}%)"
226
+ )
227
+ logger.info("\nWhen LLM is used at least once:")
228
+ logger.info(f" - Median executions: {median_when_used:.2f}")
229
+ logger.info(f" - Mean executions: {mean_when_used:.2f}")
230
+ logger.info("\nOverall statistics:")
231
+ logger.info(f"Mean executions per task: {llm_stats['mean']:.2f}")
232
+ logger.info(f"Median executions per task: {llm_stats['50%']:.2f}")
233
+ logger.info(f"Max executions in a task: {llm_stats['max']:.0f}")
234
+ logger.info(f"Min executions in a task: {llm_stats['min']:.0f}")
235
+
236
+ except Exception as e:
237
+ logger.error(f"Error processing CSV: {e}")
238
+
239
+
240
+ def main():
241
+ parser = argparse.ArgumentParser(description="Analyze simulated user data CSV.")
242
+ parser.add_argument(
243
+ "--run-dir", type=str, required=True, help="Path to the run directory."
244
+ )
245
+ args = parser.parse_args()
246
+
247
+ run_dir = args.run_dir
248
+ input_csv = os.path.join(run_dir, "results.csv")
249
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
250
+ output_csv = os.path.join(run_dir, f"sim_user_{timestamp}.csv")
251
+
252
+ # Run the analysis
253
+ asyncio.run(process_csv(input_csv, output_csv))
254
+
255
+
256
+ if __name__ == "__main__":
257
+ main()
experiments/eval/explore_results.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import pandas as pd
4
+ import argparse
5
+ from typing import Dict, Any
6
+ from magentic_ui.eval.benchmarks.gaia.gaia import GaiaBenchmark
7
+
8
+
9
def get_run_results_df(
    run_dir: str, data_dir: str, dataset_name: str = "Gaia"
) -> pd.DataFrame:
    """
    Process a run directory and create a DataFrame containing all task results and ground truth.

    Args:
        run_dir (str): Path to the run directory containing task subdirectories
        data_dir (str): Path to the benchmark data directory used to load ground truth
        dataset_name (str): Benchmark name; only "Gaia" is currently supported

    Returns:
        pd.DataFrame: One row per task with answer, messages, score, and
            duration. Rows without a score are dropped. Also written to
            ``<run_dir>/results.csv`` as a side effect.

    Raises:
        ValueError: If dataset_name is not "Gaia"
    """
    # Initialize benchmark
    if dataset_name == "Gaia":
        benchmark = GaiaBenchmark(data_dir=data_dir)
    else:
        raise ValueError(f"Invalid dataset name: {dataset_name}")
    # Download the dataset (only needed once)
    benchmark.download_dataset()
    # Load it into memory
    benchmark.load_dataset()

    # Initialize lists to store data
    data = []

    # Process each task directory; directory names double as task ids.
    for task_dir in os.listdir(run_dir):
        task_path = os.path.join(run_dir, task_dir)

        # Skip if not a directory or if it's a log file
        if not os.path.isdir(task_path) or task_dir.startswith("."):
            continue

        task_data: Dict[str, Any] = {"task_id": task_dir}

        # Get ground truth from benchmark
        if task_dir in benchmark.tasks:
            task_data["ground_truth"] = benchmark.tasks[task_dir].ground_truth
            task_data["question"] = benchmark.tasks[task_dir].question
            task_data["difficulty"] = benchmark.tasks[task_dir].difficulty
            task_data["metadata"] = benchmark.tasks[task_dir].metadata

        # Read answer file (``<task_id>_answer.json``); absent files simply
        # leave the column as NaN in the final DataFrame.
        answer_file = os.path.join(task_path, f"{task_dir}_answer.json")
        if os.path.exists(answer_file):
            with open(answer_file, "r") as f:
                task_data["answer"] = json.load(f)["answer"]

        # Read messages file
        messages_file = os.path.join(task_path, f"{task_dir}_messages.json")
        if os.path.exists(messages_file):
            with open(messages_file, "r") as f:
                task_data["messages"] = json.load(f)
            # Keep only messages produced by the (simulated) user for analysis.
            user_messages = [
                message
                for message in task_data["messages"]
                if message["source"] == "user_proxy"
            ]
            task_data["user_messages"] = user_messages

        # Read score file
        score_file = os.path.join(task_path, "score.json")
        if os.path.exists(score_file):
            with open(score_file, "r") as f:
                score = json.load(f)
                task_data["score"] = score["score"]

        # Read times file
        times_file = os.path.join(task_path, "times.json")
        if os.path.exists(times_file):
            with open(times_file, "r") as f:
                task_data["duration"] = json.load(f)["duration"]

        data.append(task_data)
    df = pd.DataFrame(data)
    # Filter out rows where score is NaN (tasks that never finished scoring)
    df = df.dropna(subset=["score"])

    # Save DataFrame to CSV
    output_csv = os.path.join(run_dir, "results.csv")
    df.to_csv(output_csv, index=False)
    print(f"Results DataFrame saved to {output_csv}")

    return df
93
+
94
+
95
def get_output_prefix(run_dir: str) -> str:
    """Build a filename prefix by joining the last (up to) 4 components of run_dir."""
    components = os.path.normpath(run_dir).split(os.sep)
    # Slicing with [-4:] naturally handles paths shorter than 4 components.
    return "_".join(components[-4:])
101
+
102
+
103
def main():
    """
    CLI entry point: aggregate task results for a run, dump all/failed task
    records to JSON files, and print summary accuracy statistics.
    """
    parser = argparse.ArgumentParser(
        description="Process run results and analyze tasks."
    )
    parser.add_argument(
        "--run-dir",
        type=str,
        required=True,
        help="Path to the run directory containing task subdirectories",
    )
    parser.add_argument(
        "--data-dir", type=str, required=True, help="Path to the data directory"
    )
    # First parse only the known args so run_dir can seed the default filenames.
    args, unknown = parser.parse_known_args()

    # Generate default filenames based on run_dir
    prefix = get_output_prefix(args.run_dir)
    parser.add_argument(
        "--failed_output",
        type=str,
        default=f"{args.run_dir}/failed_tasks_{prefix}.json",
        help="Output file path for failed tasks",
    )
    parser.add_argument(
        "--all_output",
        type=str,
        default=f"{args.run_dir}/all_tasks_{prefix}.json",
        help="Output file path for all tasks",
    )

    args = parser.parse_args()  # Parse all arguments

    df = get_run_results_df(args.run_dir, args.data_dir)

    # Guard: an empty run directory would cause division-by-zero below.
    if len(df) == 0:
        print("No scored tasks found in run directory; nothing to analyze.")
        return

    # Flag 'unable to determine' answers so they can be accounted separately.
    unable_str = "Unable to determine"
    df["unable_to_determine"] = (
        df["answer"].astype(str).str.strip().str.contains(unable_str)
    )
    unable_count = df["unable_to_determine"].sum()

    # Accuracy excluding 'unable to determine'
    df_excl = df[~df["unable_to_determine"]]
    if len(df_excl) > 0:
        acc_excl = (df_excl["score"] > 0).mean()
    else:
        acc_excl = float("nan")

    # Accuracy counting 'unable to determine' as correct (optimistic bound)
    acc_unable_correct = ((df["score"] > 0) | df["unable_to_determine"]).mean()

    # Collect per-task records for the JSON dumps.
    all_tasks = []
    failed_tasks = []

    for index, row in df.iterrows():
        task_info = {
            "task_id": row["task_id"],
            "question": row["question"],
            "answer": row["answer"],
            "ground_truth": row["ground_truth"],
            "score": row["score"],
            "difficulty": row["difficulty"],
            "duration": row.get("duration", None),
            "messages": row["messages"],
        }
        all_tasks.append(task_info)

        if row["score"] == 0:
            failed_tasks.append(task_info)

    # Write all tasks to a log file
    with open(args.all_output, "w") as log_file:
        json.dump(all_tasks, log_file, indent=4, ensure_ascii=False)
    print(f"All tasks written to {args.all_output}")

    # Write failed tasks to a log file
    with open(args.failed_output, "w") as log_file:
        json.dump(failed_tasks, log_file, indent=4, ensure_ascii=False)
    print(f"Failed tasks written to {args.failed_output}")

    # Print summary statistics
    print("\nSummary:")
    print(f"Total tasks: {len(all_tasks)}")
    print(f"Failed tasks: {len(failed_tasks)}")
    print(f"Unable to determine: {unable_count}")
    print(f"Rate of unable to determine: {unable_count / len(df) * 100:.2f}%")
    print(
        f"Success rate: {((len(all_tasks) - len(failed_tasks)) / len(all_tasks) * 100):.2f}%"
    )
    print(f"Accuracy (excluding 'unable to determine'): {acc_excl*100:.2f}%")
    print(
        f"Accuracy (counting 'unable to determine' as correct): {acc_unable_correct*100:.2f}%"
    )


if __name__ == "__main__":
    main()
experiments/eval/plot_results.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import matplotlib.style as style
3
+ from matplotlib.ticker import PercentFormatter
4
+ import os
5
+ import argparse
6
+ import numpy as np
7
+
8
+
9
def create_accuracy_plot(save_path=None, save_dir=None):
    """
    Create a bar chart comparing task accuracy across systems, with 95%
    binomial confidence intervals.

    The model names, accuracies, and sample size are hard-coded below from
    experiment results; edit the "Data" section to update the figure.

    Parameters:
    -----------
    save_path : str, optional
        Filename to save the figure. If None, the figure is not saved.
    save_dir : str, optional
        Directory to save the figure. If provided, the directory will be created
        if it doesn't exist. Default is current directory if save_path is provided.

    Returns:
    --------
    fig, ax : tuple
        Figure and axes objects for further customization if needed.
    """
    # Global matplotlib styling for this figure.
    style.use("seaborn-v0_8-whitegrid")
    plt.rcParams["font.family"] = "sans-serif"
    plt.rcParams["font.sans-serif"] = ["Arial", "DejaVu Sans"]
    plt.rcParams["font.size"] = 16
    plt.rcParams["axes.labelsize"] = 16
    plt.rcParams["axes.titlesize"] = 17
    plt.rcParams["xtick.labelsize"] = 12
    plt.rcParams["ytick.labelsize"] = 12
    plt.rcParams["legend.fontsize"] = 12

    # Data
    models = [
        "Magentic-One",
        "Magentic-UI\n(autonomous)",
        "Magentic-UI +\nSimulated User\n(smarter model)",
        "Magentic-UI +\nSimulated User\n(side-information)",
        "Human",
    ]
    accuracy = [33.72, 30.2, 42.6, 51.9, 92]
    sample_size = 162

    # Calculate 95% confidence intervals for each accuracy
    # (normal approximation to the binomial proportion).
    z = 1.96  # for 95% confidence
    accuracy_frac = np.array(accuracy) / 100.0
    ci_half_width = (
        z * np.sqrt(accuracy_frac * (1 - accuracy_frac) / sample_size) * 100
    )  # convert back to percent

    # Create figure and axis with adjusted figsize for more horizontal space
    fig, ax = plt.subplots(figsize=(9, 6))

    # Custom colors as specified
    dark_magenta = "#8B008B"  # Darker magenta for Magentic-One
    grey = "#808080"  # Grey for Magentic-UI + Simulated Human
    beige = "#F5F5DC"  # Beige for Human

    colors = [grey, dark_magenta, dark_magenta, dark_magenta, beige]
    # Hatch patterns distinguish the two simulated-user variants, which share
    # the same bar color.
    hatches = [
        "",
        "",
        "///",
        "xx",
        "",
    ]

    # Create custom x positions for more space between bars
    x = np.arange(len(models)) * 2

    # Create separate bars for each model (one bar per call so each gets its
    # own legend entry, hatch, and error bar).
    bars = []
    for i, (model, acc) in enumerate(zip(models, accuracy)):
        bar = ax.bar(
            x[i],
            acc,
            color=colors[i],
            width=1,
            edgecolor="black",
            linewidth=0.8,
            label=model,
            hatch=hatches[i],
            yerr=ci_half_width[i],
            capsize=8,
        )
        bars.extend(bar)

    # Set x-tick positions and labels
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=0, ha="center")
    # Configure the axes
    ax.set_ylabel("Accuracy (%)", fontweight="bold")
    ax.set_ylim(0, 100)  # Set y-axis from 0 to 100%
    ax.yaxis.set_major_formatter(PercentFormatter())

    # Add grid for y-axis only and put it behind the bars
    ax.yaxis.grid(True, linestyle="--", alpha=0.7)
    ax.set_axisbelow(True)

    # Remove top and right spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    # Make left and bottom spines thicker
    ax.spines["left"].set_linewidth(1.5)
    ax.spines["bottom"].set_linewidth(1.5)

    # Add legend inside the plot
    legend = ax.legend(
        loc="upper left", frameon=True, framealpha=0.9, edgecolor="lightgray"
    )
    legend.get_title().set_fontweight("bold")

    # Add some padding to the x-axis labels
    plt.xticks(rotation=0, ha="center")

    # Adjust bottom margin to ensure labels fit
    plt.subplots_adjust(bottom=0.15)

    plt.tight_layout()

    # Save the figure in high resolution if path provided; both a PDF and a
    # PNG are written by swapping the extension of the requested filename.
    if save_path:
        if save_dir:
            # Create directory if it doesn't exist
            os.makedirs(save_dir, exist_ok=True)
            full_path = os.path.join(save_dir, save_path)
        else:
            full_path = save_path
        # save as pdf
        plt.savefig(full_path.replace(".png", ".pdf"), dpi=600, bbox_inches="tight")
        # save as png
        plt.savefig(full_path.replace(".pdf", ".png"), dpi=600, bbox_inches="tight")
        print(
            f"Plot saved to: {os.path.abspath(full_path.replace('.png', '.pdf'))} and {os.path.abspath(full_path.replace('.pdf', '.png'))}"
        )

    return fig, ax
140
+
141
+
142
if __name__ == "__main__":
    # Parse CLI options, then render and save the comparison figure.
    cli = argparse.ArgumentParser(description="plot experimental results")
    cli.add_argument(
        "--save-dir",
        "-d",
        type=str,
        default="plots",
        help="Directory to save the plot (default: plots)",
    )
    options = cli.parse_args()

    figure, axes = create_accuracy_plot(
        save_path="model_accuracy_comparison.png", save_dir=options.save_dir
    )
experiments/eval/prepare_for_submission.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ from typing import Dict, Any, List
5
+
6
+
7
def load_questions_gaia(metadata_path: str) -> Dict[str, str]:
    """Load a mapping of task_id -> question text from a Gaia metadata JSONL file."""
    with open(metadata_path, "r") as f:
        records = [json.loads(line) for line in f]
    # Later duplicates overwrite earlier ones, matching dict-assignment order.
    return {record["task_id"]: record["Question"] for record in records}
15
+
16
+
17
def load_questions_assistantbench(metadata_path: str) -> Dict[str, str]:
    """Load a mapping of id -> task text from an AssistantBench metadata JSONL file."""
    with open(metadata_path, "r") as f:
        records = [json.loads(line) for line in f]
    # Later duplicates overwrite earlier ones, matching dict-assignment order.
    return {record["id"]: record["task"] for record in records}
25
+
26
+
27
def prepare_for_submission_gaia(base_dir: str, metadata_path: str) -> None:
    """
    Prepare Gaia model answers for submission by aggregating answers and
    questions into a ``model_answers.jsonl`` file inside ``base_dir``.

    Args:
        base_dir (str): Directory containing one subdirectory per task, each
            with a ``<task_id>_answer.json`` file.
        metadata_path (str): Path to the Gaia metadata JSONL file providing
            the question text for every task_id.
    """
    questions = load_questions_gaia(metadata_path)
    task_ids = [
        d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))
    ]
    results: List[Dict[str, Any]] = []
    found_task_ids = set()
    for task_id in task_ids:
        answer_path = os.path.join(base_dir, task_id, f"{task_id}_answer.json")
        if os.path.exists(answer_path):
            with open(answer_path, "r") as f:
                data = json.load(f)
            answer = data.get("answer", "")
            # The leaderboard treats an empty string as "no answer".
            if answer == "Unable to determine":
                answer = ""
            question = questions.get(task_id, "")
            results.append(
                {
                    "task_id": task_id,
                    "question": question,
                    "model_answer": answer,
                    "reasoning_trace": "Reasoning trace not available",
                }
            )
            found_task_ids.add(task_id)
    # Add placeholder rows for tasks present in metadata but missing a run.
    for task_id, question in questions.items():
        if task_id not in found_task_ids:
            results.append(
                {
                    "task_id": task_id,
                    "question": question,
                    # Bug fix: was "answer", which broke the uniform schema --
                    # the GAIA submission format expects "model_answer" on
                    # every row.
                    "model_answer": "",
                    "reasoning_trace": "Reasoning trace not available",
                }
            )
    # Write to model_answers.jsonl in base_dir (one JSON object per line)
    output_file = os.path.join(base_dir, "model_answers.jsonl")
    with open(output_file, "w") as f:
        for item in results:
            f.write(json.dumps(item) + "\n")
69
+
70
+
71
def prepare_for_submission_assistantbench(base_dir: str, metadata_path: str) -> None:
    """Prepare AssistantBench model answers for submission by aggregating answers and questions into a JSONL file."""
    questions = load_questions_assistantbench(metadata_path)
    results: List[Dict[str, Any]] = []
    found_ids = set()

    for entry in os.listdir(base_dir):
        if not os.path.isdir(os.path.join(base_dir, entry)):
            continue
        answer_path = os.path.join(base_dir, entry, f"{entry}_answer.json")
        if not os.path.exists(answer_path):
            continue
        with open(answer_path, "r") as f:
            data = json.load(f)
        # Expecting {"id": ..., "answer": ...}; fall back to the folder name.
        record_id = data.get("id", entry)
        model_answer = data.get("answer", "")
        if model_answer in ("Unable to determine", "None"):
            model_answer = ""
        results.append(
            {
                "id": record_id,
                "answer": model_answer,
            }
        )
        found_ids.add(record_id)

    # Emit empty answers for metadata ids with no corresponding run output.
    for record_id in questions:
        if record_id not in found_ids:
            results.append(
                {
                    "id": record_id,
                    "answer": "",
                }
            )

    # Write to model_answers.jsonl in base_dir (one JSON object per line).
    output_file = os.path.join(base_dir, "model_answers.jsonl")
    with open(output_file, "w") as f:
        for item in results:
            f.write(json.dumps(item) + "\n")
113
+
114
+
115
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="Prepare model answers for submission.")
    cli.add_argument("base_dir", help="Base directory containing task folders.")
    cli.add_argument("--metadata", default="", help="Path to metadata.jsonl file.")
    cli.add_argument("--dataset", default="Gaia", help="Dataset name.")
    parsed = cli.parse_args()

    # Dispatch to the dataset-specific preparation routine.
    handlers = {
        "Gaia": prepare_for_submission_gaia,
        "AssistantBench": prepare_for_submission_assistantbench,
    }
    if parsed.dataset not in handlers:
        raise ValueError(f"Dataset {parsed.dataset} not supported.")
    handlers[parsed.dataset](parsed.base_dir, parsed.metadata)
experiments/eval/run.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import yaml
3
+ import argparse
4
+ import os
5
+ import datetime
6
+ from typing import Optional, Dict, Any, Callable
7
+ from magentic_ui.eval.core import run_evaluate_benchmark_func, evaluate_benchmark_func
8
+ from systems.magentic_ui_sim_user_system import MagenticUISimUserSystem
9
+ from magentic_ui.eval.systems import LLMSystem
10
+ from magentic_ui.eval.benchmarks import WebVoyagerBenchmark
11
+ from magentic_ui.eval.benchmark import Benchmark
12
+ from autogen_core.models import ChatCompletionClient
13
+
14
+
15
def save_experiment_args(args: argparse.Namespace, system_name: str) -> None:
    """
    Persist the experiment's CLI arguments (plus the relevant LLM client
    config sections) to a timestamped JSON file in the run directory.

    Args:
        args (argparse.Namespace): The arguments namespace containing experiment parameters.
        system_name (str): The name of the system being evaluated.
    """
    # Mirror the runs/<system>/<dataset>/<split>/<run_id> layout used in core.py.
    save_dir = os.path.join(
        args.current_dir,
        "runs",
        system_name,
        args.dataset,
        args.split or "all_benchmark",
        str(args.run_id),
    )
    os.makedirs(save_dir, exist_ok=True)

    record = vars(args).copy()

    # Attach only the LLM client sections of the config, if one was provided.
    if args.config and os.path.exists(args.config):
        loaded = load_config(args.config)
        if loaded is not None:
            client_keys = [
                "orchestrator_client",
                "web_surfer_client",
                "coder_client",
                "file_surfer_client",
                "user_proxy_client",
            ]
            record["client_configs"] = {
                key: loaded.get(key) for key in client_keys if key in loaded
            }
            record["config_path"] = os.path.abspath(args.config)

    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = os.path.join(save_dir, f"args_{stamp}.json")
    with open(destination, "w") as f:
        json.dump(record, f, indent=4)

    print(f"Experiment args saved to {destination}")
61
+
62
+
63
def load_config(config_path: Optional[str]) -> Optional[Dict[str, Any]]:
    """
    Load configuration from either YAML or JSON file.

    Args:
        config_path (Optional[str]): Path to the configuration file (YAML or JSON).

    Returns:
        Optional[Dict[str, Any]]: The loaded configuration as a dictionary, or None if not found.
    """
    if config_path is None:
        return None

    is_yaml = config_path.endswith((".yml", ".yaml"))
    with open(config_path, "r") as handle:
        if not is_yaml:
            # Any non-YAML extension is treated as JSON.
            return json.load(handle)
        loaded = yaml.safe_load(handle)
    # Empty YAML documents parse to None; normalize falsy results to None.
    return loaded if loaded else None
82
+
83
+
84
def run_system_evaluation(
    args: argparse.Namespace,
    system_constructor: Any,
    system_name: str,
    config: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Common function to run system evaluation to avoid code duplication.

    Dispatches to ``evaluate_benchmark_func`` when ``args.mode == "eval"``
    (score existing runs) or ``run_evaluate_benchmark_func`` otherwise
    (execute tasks, then score).

    Args:
        args (argparse.Namespace): The arguments namespace containing experiment parameters.
        system_constructor (Any): The system instance or constructor to evaluate.
        system_name (str): The name of the system being evaluated.
        config (Optional[Dict[str, Any]]): Optional configuration dictionary.
            NOTE(review): currently unused in this function body -- confirm
            whether it should be forwarded to the runner.
    """
    # WebVoyager needs a custom benchmark factory because its GPT-based
    # evaluator requires its own model client; other datasets are resolved by
    # name inside the core helpers (benchmark_constructor stays None).
    benchmark_constructor: Optional[Callable[..., Benchmark]] = None
    if args.dataset == "WebVoyager":
        # Download the dataset (only needed once)
        client = ChatCompletionClient.load_component(
            {
                "provider": "OpenAIChatCompletionClient",
                "config": {
                    "model": "gpt-4o-2024-08-06",
                },
                "max_retries": 10,
            }
        )

        def create_benchmark(data_dir="WebVoyager", name="WebVoyager"):
            # Factory handed to the core runner; `name` is accepted for
            # interface compatibility but not used here.
            benchmark = WebVoyagerBenchmark(
                data_dir=data_dir,
                eval_method="gpt_eval",
                model_client=client,
            )
            return benchmark

        benchmark_constructor = create_benchmark
    # Load it into memory
    if args.mode == "eval":
        # Score already-produced run outputs without re-running tasks.
        evaluate_benchmark_func(
            benchmark_name=args.dataset,
            benchmark_constructor=benchmark_constructor,
            system_name=system_name,
            parallel=args.parallel,
            benchmark_dir=args.current_dir,
            runs_dir=args.current_dir,
            split=args.split,
            run_id=args.run_id,
            system_constructor=system_constructor,
            redo_eval=args.redo_eval,
        )
    else:
        # Run the benchmark tasks and evaluate them in one pass. A subsample
        # ratio >= 1 means "use the full dataset" (passed as None).
        run_evaluate_benchmark_func(
            benchmark_name=args.dataset,
            benchmark_constructor=benchmark_constructor,
            system_name=system_name,
            parallel=args.parallel,
            benchmark_dir=args.current_dir,
            runs_dir=args.current_dir,
            split=args.split,
            run_id=args.run_id,
            system_constructor=system_constructor,
            subsample=args.subsample if args.subsample < 1 else None,
            redo_eval=args.redo_eval,
        )
149
+
150
+
151
def run_system_sim_user(args: argparse.Namespace, system_name: str) -> None:
    """
    Build the system under evaluation (LLM baseline or Magentic-UI with a
    simulated user) and hand it off to the shared evaluation runner.

    Args:
        args (argparse.Namespace): The arguments namespace containing experiment parameters.
        system_name (str): The name of the system being evaluated.
    """
    config = load_config(args.config)

    def _client(section: str):
        # Pull one named client section, tolerating a missing config file.
        return config.get(section) if config else None

    if system_name == "LLM":
        # Use LLMSystem for LLM-based evaluations
        system = LLMSystem(
            system_name=system_name,
            endpoint_config=_client("model_client"),
        )
    else:
        system = MagenticUISimUserSystem(
            simulated_user_type=args.simulated_user_type,
            endpoint_config_orch=_client("orchestrator_client"),
            endpoint_config_websurfer=_client("web_surfer_client"),
            endpoint_config_coder=_client("coder_client"),
            endpoint_config_file_surfer=_client("file_surfer_client"),
            endpoint_config_user_proxy=_client("user_proxy_client"),
            web_surfer_only=args.web_surfer_only,
            how_helpful_user_proxy=args.how_helpful_user_proxy,
            dataset_name=args.dataset,
        )

    run_system_evaluation(args, system, system_name, config)
183
+
184
+
185
+ def main() -> None:
186
+ """
187
+ Main entry point for running or evaluating the Magentic-UI system on benchmarks.
188
+ Parses command-line arguments and dispatches to the appropriate system runner.
189
+ """
190
+ parser = argparse.ArgumentParser(
191
+ description="Run or evaluate Magentic-UI system on benchmarks"
192
+ )
193
+ parser.add_argument(
194
+ "--mode",
195
+ choices=["run", "eval"],
196
+ default="run",
197
+ help="Mode to run: 'run' for running benchmarks, 'eval' for evaluation",
198
+ )
199
+ parser.add_argument(
200
+ "--current-dir", default=os.getcwd(), help="Current working directory"
201
+ )
202
+ parser.add_argument("--split", default="validation-1", help="Dataset split to use")
203
+ parser.add_argument("--dataset", default="Gaia", help="Dataset name")
204
+ parser.add_argument(
205
+ "--config", required=False, help="Path to endpoint configuration file for LLMs"
206
+ )
207
+ parser.add_argument(
208
+ "--run-id", type=int, default=1, help="Run ID for the experiment"
209
+ )
210
+ parser.add_argument(
211
+ "--parallel", type=int, default=1, help="Number of parallel processes to use"
212
+ )
213
+ parser.add_argument(
214
+ "--subsample",
215
+ type=float,
216
+ default=1,
217
+ help="Subsample ratio for the dataset (only used in run mode)",
218
+ )
219
+ parser.add_argument(
220
+ "--simulated-user-type",
221
+ type=str,
222
+ default="none",
223
+ help="Type of simulated user (co-planning, co-execution, co-planning-and-execution, dummy, none)",
224
+ )
225
+ parser.add_argument(
226
+ "--how-helpful-user-proxy",
227
+ type=str,
228
+ default="soft",
229
+ help="How helpful the user proxy should be (strict, soft, no_hints)",
230
+ )
231
+
232
+ parser.add_argument(
233
+ "--user-messages-data",
234
+ type=str,
235
+ help="Path to user messages data CSV file",
236
+ )
237
+ parser.add_argument(
238
+ "--system-type",
239
+ type=str,
240
+ default="MagenticUI",
241
+ choices=["MagenticUI", "magentic-ui-sim-user", "LLM"],
242
+ help="Type of system to run",
243
+ )
244
+ parser.add_argument(
245
+ "--web-surfer-only",
246
+ type=bool,
247
+ default=False,
248
+ help="Run only the web surfer agent",
249
+ )
250
+ parser.add_argument(
251
+ "--redo-eval",
252
+ action="store_true",
253
+ default=False,
254
+ help="Redo evaluation even if results exist (default: False)",
255
+ )
256
+
257
+ args = parser.parse_args()
258
+
259
+ # Determine system name based on arguments
260
+
261
+ system_name = args.system_type
262
+
263
+ if args.simulated_user_type != "none":
264
+ system_name += f"_{args.simulated_user_type}_{args.how_helpful_user_proxy}"
265
+ if args.web_surfer_only:
266
+ system_name += "_web_surfer_only"
267
+
268
+ # Save experiment args
269
+ save_experiment_args(args, system_name)
270
+
271
+ # Run the appropriate system
272
+ run_system_sim_user(args, system_name)
273
+
274
+
275
+ if __name__ == "__main__":
276
+ main()
experiments/eval/sample_eval_systems.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from autogen_core.models import ChatCompletionClient
2
+ from systems import MagenticUIAutonomousSystem
3
+ from systems.magentic_one_system import MagenticOneSystem
4
+ from magentic_ui.eval.benchmarks import WebVoyagerBenchmark
5
+ import os
6
+
7
def test_magentic_ui_system():
    """Smoke-test MagenticUIAutonomousSystem on a single WebVoyager task."""
    client_config = {
        "provider": "OpenAIChatCompletionClient",
        "config": {
            "model": "gpt-4o-2024-08-06",
        },
        "max_retries": 10,
    }

    system = MagenticUIAutonomousSystem(
        endpoint_config_orch=client_config,
        endpoint_config_websurfer=client_config,
        endpoint_config_coder=client_config,
        endpoint_config_file_surfer=client_config,
        use_local_browser=True,
        web_surfer_only=True,
    )

    # The benchmark's GPT-based evaluator needs its own model client.
    eval_client = ChatCompletionClient.load_component(client_config)
    benchmark = WebVoyagerBenchmark(
        data_dir="WebVoyager",
        eval_method="gpt_eval",
        model_client=eval_client,
    )
    benchmark.download_dataset()
    benchmark.load_dataset()

    sample_task = benchmark.tasks["Allrecipes--0"]
    print(sample_task)

    out_dir = "test_output_magentic_ui"
    os.makedirs(out_dir, exist_ok=True)
    answer = system.get_answer(
        task_id="Allrecipes--0",
        task=sample_task,
        output_dir=out_dir,
    )
    print(answer)
    score = benchmark.evaluator(sample_task, answer)
    print(score)
+
46
+
47
def test_magentic_one_system():
    """Smoke-test MagenticOneSystem on a single WebVoyager task."""
    client_config = {
        "provider": "OpenAIChatCompletionClient",
        "config": {
            "model": "gpt-4o-2024-08-06",
        },
        "max_retries": 10,
    }

    system = MagenticOneSystem(
        model_client_config=client_config,
        web_surfer_only=True,
    )

    # The benchmark's GPT-based evaluator needs its own model client.
    eval_client = ChatCompletionClient.load_component(client_config)
    benchmark = WebVoyagerBenchmark(
        data_dir="WebVoyager",
        eval_method="gpt_eval",
        model_client=eval_client,
    )
    benchmark.download_dataset()
    benchmark.load_dataset()

    sample_task = benchmark.tasks["Allrecipes--0"]
    print(sample_task)

    out_dir = "test_output_magentic_one"
    os.makedirs(out_dir, exist_ok=True)
    answer = system.get_answer(
        task_id="Allrecipes--0",
        task=sample_task,
        output_dir=out_dir,
    )
    print(answer)
    score = benchmark.evaluator(sample_task, answer)
    print(score)


if __name__ == "__main__":
    test_magentic_one_system()
experiments/eval/systems/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
from .magentic_ui_sim_user_system import MagenticUISimUserSystem
from .magentic_ui_system import MagenticUIAutonomousSystem
from .magentic_one_system import MagenticOneSystem

# Public API of the systems package.
# Bug fix: __all__ previously listed the misspelled "MagententicOneSystem",
# which made `from systems import *` raise AttributeError.
__all__ = [
    "MagenticUISimUserSystem",
    "MagenticUIAutonomousSystem",
    "MagenticOneSystem",
]
experiments/eval/systems/magentic_one_system.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import os
4
+ import aiofiles
5
+ import logging
6
+ import datetime
7
+ from PIL import Image
8
+ from pydantic import BaseModel
9
+ from typing import List, Dict, Any, Tuple
10
+ from autogen_core.models import ChatCompletionClient
11
+ from autogen_core import Image as AGImage
12
+ from autogen_agentchat.base import TaskResult, ChatAgent
13
+ from autogen_agentchat.messages import (
14
+ MultiModalMessage,
15
+ TextMessage,
16
+ )
17
+
18
+ from autogen_ext.agents.file_surfer import FileSurfer
19
+ from autogen_ext.agents.web_surfer import MultimodalWebSurfer
20
+ from autogen_ext.agents.magentic_one import MagenticOneCoderAgent
21
+ from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor
22
+ from autogen_agentchat.agents import CodeExecutorAgent
23
+ from autogen_agentchat.teams import MagenticOneGroupChat
24
+ from magentic_ui.eval.basesystem import BaseSystem
25
+ from magentic_ui.eval.models import BaseTask, BaseCandidate, WebVoyagerCandidate
26
+ from magentic_ui.types import CheckpointEvent
27
+
28
+ logger = logging.getLogger(__name__)
29
+ logging.getLogger("autogen").setLevel(logging.WARNING)
30
+ logging.getLogger("autogen.agentchat").setLevel(logging.WARNING)
31
+ logging.getLogger("autogen_agentchat.events").setLevel(logging.WARNING)
32
+
33
+
34
class LogEventSystem(BaseModel):
    """One logged agent message captured during an evaluation run.

    Attributes:
        source: Name of the agent that produced the message.
        content: The message text.
        timestamp: ISO-8601 timestamp recorded when the event was logged.
        metadata: Extra string key/value pairs attached to the message
            (defaults to an empty mapping; pydantic copies the default
            per instance, so it is not shared).
    """

    source: str
    content: str
    timestamp: str
    metadata: Dict[str, str] = {}
49
+
50
+
51
class MagenticOneSystem(BaseSystem):
    """Evaluation wrapper that runs a MagenticOne agent team on a benchmark task.

    Args:
        model_client_config (Dict[str, Any]): Model client config, loaded via
            ``ChatCompletionClient.load_component`` and shared by all agents.
        web_surfer_only (bool): If True, only the web surfer agent is used.
        name (str): Name of the system instance.
        dataset_name (str): Name of the evaluation dataset (e.g., "Gaia").
    """

    def __init__(
        self,
        model_client_config: Dict[str, Any],
        web_surfer_only: bool = False,
        name: str = "MagenticOneSystem",
        dataset_name: str = "Gaia",
    ):
        super().__init__(name)
        # Answers are persisted in WebVoyager format (answer + screenshots).
        self.candidate_class = WebVoyagerCandidate
        self.model_client_config = model_client_config
        self.dataset_name = dataset_name
        self.web_surfer_only = web_surfer_only

    def get_answer(
        self, task_id: str, task: BaseTask, output_dir: str
    ) -> BaseCandidate:
        """
        Runs the agent team to solve a given task and saves the answer and logs to disk.

        Args:
            task_id (str): Unique identifier for the task.
            task (BaseTask): The task object containing the question and metadata.
            output_dir (str): Directory to save logs, screenshots, and answer files.

        Returns:
            BaseCandidate: An object containing the final answer and any
            screenshots taken during execution.
        """

        async def _runner() -> Tuple[str, List[str]]:
            """Execute the agent team; return (final answer, screenshot paths)."""
            messages_so_far: List[LogEventSystem] = []

            task_question: str = task.question
            # Adapted from MagenticOne. Minor change is to allow an explanation
            # of the final answer before the final answer.
            FINAL_ANSWER_PROMPT = """
            output a FINAL ANSWER to the task.
            The task is: {task}`

            To output the final answer, use the following template: [any explanation for final answer] FINAL ANSWER: [YOUR FINAL ANSWER]
            Don't put your answer in brackets or quotes.
            Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
            ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
            If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
            If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
            If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
            You must answer the question and provide a smart guess if you are unsure. Provide a guess even if you have no idea about the answer.
            """

            model_client = ChatCompletionClient.load_component(self.model_client_config)

            # Instantiate agents explicitly.
            ws = MultimodalWebSurfer(
                "WebSurfer",
                model_client=model_client,
                to_save_screenshots=True,
                debug_dir=output_dir,
            )

            agents: List[ChatAgent] = []
            if self.web_surfer_only:
                agents = [ws]
            else:
                coder = MagenticOneCoderAgent("Coder", model_client=model_client)
                executor = CodeExecutorAgent(
                    "ComputerTerminal", code_executor=LocalCommandLineCodeExecutor()
                )
                fs = FileSurfer("FileSurfer", model_client=model_client)

                agents = [fs, ws, coder, executor]
            m1_agent = MagenticOneGroupChat(
                agents,
                model_client=model_client,
                final_answer_prompt=FINAL_ANSWER_PROMPT,
            )

            # Step 3: Prepare the task message.
            answer: str = ""
            # If the task carries an image attachment, send a multimodal message.
            if (
                hasattr(task, "file_name")
                and task.file_name
                and task.file_name.endswith((".png", ".jpg", ".jpeg"))
            ):
                task_message = MultiModalMessage(
                    content=[
                        task_question,
                        AGImage.from_pil(Image.open(task.file_name)),
                    ],
                    source="user",
                )
            else:
                task_message = TextMessage(content=task_question, source="user")

            # Step 4: Run the team on the task, logging every streamed message
            # and persisting the log after each one so progress survives crashes.
            async for message in m1_agent.run_stream(task=task_message):
                message_str: str = ""
                try:
                    if isinstance(message, TaskResult) or isinstance(
                        message, CheckpointEvent
                    ):
                        continue
                    message_str = message.to_text()
                    log_event = LogEventSystem(
                        source=message.source,
                        content=message_str,
                        timestamp=datetime.datetime.now().isoformat(),
                        metadata=message.metadata,
                    )
                    messages_so_far.append(log_event)
                except Exception as e:
                    # Best effort: a message that cannot be serialized is only
                    # skipped from the log, not fatal to the run.
                    logger.info(
                        f"[likely nothing] When creating model_dump of message encountered exception {e}"
                    )

                logger.info(f"Run in progress: {task_id}, message: {message_str}")
                async with aiofiles.open(
                    f"{output_dir}/{task_id}_messages.json", "w"
                ) as f:
                    messages_json = [msg.model_dump() for msg in messages_so_far]
                    await f.write(json.dumps(messages_json, indent=2))
                    await f.flush()  # Flush to disk immediately

            # The final answer is formatted as:
            # "Final Answer: FINAL ANSWER: Actual final answer"
            # Use the last message from MagenticOneOrchestrator (it might not be
            # the very last message of the stream).
            last_message_with_orchestrator = None
            for message in messages_so_far:
                if message.source == "MagenticOneOrchestrator":
                    last_message_with_orchestrator = message
            if last_message_with_orchestrator:
                answer = last_message_with_orchestrator.content
                # BUGFIX: take the text *after* the "FINAL ANSWER:" marker.
                # The previous code took split(...)[0], i.e. the explanation
                # preceding the marker, never the answer itself. [-1] also
                # degrades gracefully to the whole string if the marker is
                # absent (matching the sibling Magentic-UI system's intent).
                answer = answer.split("FINAL ANSWER:")[-1].strip()
            elif messages_so_far:
                answer = messages_so_far[-1].content
            # else: no messages were produced; answer stays "".

            assert isinstance(
                answer, str
            ), f"Expected answer to be a string, got {type(answer)}"

            # Save the token usage of the model client to a usage JSON file.
            def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
                return {
                    "prompt_tokens": model_client.total_usage().prompt_tokens,
                    "completion_tokens": model_client.total_usage().completion_tokens,
                }

            usage_json = {
                "client": get_usage(model_client),
            }
            async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
                await f.write(json.dumps(usage_json, indent=2))

            # Step 5: Collect screenshots saved by the web surfer.
            # NOTE(review): assumes filenames look like "screenshot_<timestamp>..."
            # so that split("_")[1] yields a sortable timestamp — confirm against
            # MultimodalWebSurfer's naming scheme.
            screenshots_paths = []
            for file in os.listdir(output_dir):
                if file.startswith("screenshot_"):
                    timestamp = file.split("_")[1]
                    screenshots_paths.append(
                        [timestamp, os.path.join(output_dir, file)]
                    )

            # Restrict to the last 15 screenshots by timestamp.
            screenshots_paths = sorted(screenshots_paths, key=lambda x: x[0])[-15:]
            screenshots_paths = [x[1] for x in screenshots_paths]
            return answer, screenshots_paths

        # Step 6: Run the async workflow and persist the candidate answer.
        answer, screenshots_paths = asyncio.run(_runner())
        answer = WebVoyagerCandidate(answer=answer, screenshots=screenshots_paths)
        self.save_answer_to_disk(task_id, answer, output_dir)
        return answer
experiments/eval/systems/magentic_ui_sim_user_system.py ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import os
4
+ import aiofiles
5
+ import logging
6
+ import datetime
7
+ from pathlib import Path
8
+ from PIL import Image
9
+ from pydantic import BaseModel
10
+ from typing import List, Optional, Union, Dict, Any, Literal, Tuple
11
+ from autogen_core import ComponentModel
12
+ from autogen_core.models import ChatCompletionClient
13
+ from autogen_core import Image as AGImage
14
+ from autogen_agentchat.base import TaskResult, ChatAgent
15
+ from autogen_agentchat.messages import (
16
+ MultiModalMessage,
17
+ TextMessage,
18
+ )
19
+ from autogen_agentchat.conditions import TimeoutTermination
20
+ from magentic_ui import OrchestratorConfig
21
+ from magentic_ui.eval.basesystem import BaseSystem
22
+ from magentic_ui.eval.models import BaseTask, BaseCandidate, WebVoyagerCandidate
23
+ from magentic_ui.types import CheckpointEvent
24
+ from magentic_ui.agents import WebSurfer, CoderAgent, FileSurfer
25
+ from magentic_ui.teams import GroupChat
26
+ from magentic_ui.agents.users import MetadataUserProxy, DummyUserProxy
27
+ from magentic_ui.tools.playwright.browser import VncDockerPlaywrightBrowser
28
+ from magentic_ui.tools.playwright.browser.utils import get_available_port
29
+ from magentic_ui.approval_guard import (
30
+ ApprovalGuard,
31
+ ApprovalGuardContext,
32
+ ApprovalConfig,
33
+ )
34
+
35
+ logger = logging.getLogger(__name__)
36
+ logging.getLogger("autogen").setLevel(logging.WARNING)
37
+ logging.getLogger("autogen.agentchat").setLevel(logging.WARNING)
38
+ logging.getLogger("autogen_agentchat.events").setLevel(logging.WARNING)
39
+
40
+
41
class LogEventSystem(BaseModel):
    """One logged agent message captured during an evaluation run.

    Attributes:
        source: Name of the agent that produced the message.
        content: The message text.
        timestamp: ISO-8601 timestamp recorded when the event was logged.
        metadata: Extra string key/value pairs attached to the message
            (defaults to an empty mapping; pydantic copies the default
            per instance, so it is not shared).
    """

    source: str
    content: str
    timestamp: str
    metadata: Dict[str, str] = {}
56
+
57
+
58
# Description shown to the orchestrator for the (simulated) human user agent.
# It constrains what the team may ask of the user: clarification, hints, and
# answer verification only — never direct web/code/file actions.
USER_PROXY_DESCRIPTION = """
The human user who gave the original task.
The human user cannot browse the web or write code or access files. So do not ask them to perform any actions on the web.
In case where the task requires further clarifying information, the user can be asked to clarify the task.
In case where you are stuck and unable to make progress on completing the task, you can ask the user for help.
Make sure to do your best to complete the task with other agents before asking the user for help.
The human can help if you're stuck by providing hints on how to solve the task.
The human can also help verify your answer and provide you guidance.
"""
67
+
68
+
69
class MagenticUISimUserSystem(BaseSystem):
    """
    MagenticUISimUserSystem orchestrates a simulated user and a team of agents to solve tasks using Magentic-UI.

    This class manages the instantiation of agents (WebSurfer, CoderAgent, FileSurfer, and optionally a user
    proxy), configures the orchestration logic, launches a browser for web tasks, and coordinates the team to
    solve a given task. It logs all agent messages, saves answers and resource usage, and supports different
    evaluation datasets and user simulation types.

    Args:
        name (str): Name of the system instance.
        simulated_user_type (Literal): Type of simulated user ("co-planning", "co-execution", etc.).
        how_helpful_user_proxy (Literal): Determines how helpful the user proxy is ("strict", "soft", "no_hints").
        web_surfer_only (bool): If True, only the web surfer agent is used.
        endpoint_config_orch (Optional[Dict]): Orchestrator model client config.
        endpoint_config_websurfer (Optional[Dict]): WebSurfer agent model client config.
        endpoint_config_coder (Optional[Dict]): Coder agent model client config.
        endpoint_config_file_surfer (Optional[Dict]): FileSurfer agent model client config.
        endpoint_config_user_proxy (Optional[Dict]): User proxy agent model client config.
        dataset_name (str): Name of the evaluation dataset (e.g., "Gaia").
        include_metadata_in_task_message (bool): Whether to include rewritten metadata in the task message.
    """

    # NOTE: these class-level dicts are shared default configs; they are only
    # read (never mutated) by this class.
    default_client_config = {
        "provider": "OpenAIChatCompletionClient",
        "config": {
            "model": "gpt-4o-2024-08-06",
        },
        "max_retries": 10,
    }

    o4_client_config = {
        "provider": "OpenAIChatCompletionClient",
        "config": {
            "model": "o4-mini",
        },
        "max_retries": 10,
    }

    def __init__(
        self,
        name: str = "MagenticUISimUserSystem",
        simulated_user_type: Literal[
            "co-planning",
            "co-execution",
            "co-planning-and-execution",
            "none",
            "dummy",
        ] = "none",
        how_helpful_user_proxy: Literal["strict", "soft", "no_hints"] = "soft",
        web_surfer_only: bool = False,
        endpoint_config_orch: Optional[Dict[str, Any]] = default_client_config,
        endpoint_config_websurfer: Optional[Dict[str, Any]] = default_client_config,
        endpoint_config_coder: Optional[Dict[str, Any]] = default_client_config,
        endpoint_config_file_surfer: Optional[Dict[str, Any]] = default_client_config,
        endpoint_config_user_proxy: Optional[Dict[str, Any]] = default_client_config,
        dataset_name: str = "Gaia",
        include_metadata_in_task_message: bool = False,
    ):
        super().__init__(name)
        # Answers are persisted in WebVoyager format (answer + screenshots).
        self.candidate_class = WebVoyagerCandidate
        self.endpoint_config_orch = endpoint_config_orch
        self.endpoint_config_websurfer = endpoint_config_websurfer
        self.endpoint_config_coder = endpoint_config_coder
        self.endpoint_config_file_surfer = endpoint_config_file_surfer
        self.simulated_user_type = simulated_user_type
        self.endpoint_config_user_proxy = endpoint_config_user_proxy
        self.web_surfer_only = web_surfer_only
        self.dataset_name = dataset_name
        self.how_helpful_user_proxy = how_helpful_user_proxy
        self.include_metadata_in_task_message = include_metadata_in_task_message

    def get_answer(
        self, task_id: str, task: BaseTask, output_dir: str
    ) -> BaseCandidate:
        """
        Runs the agent team to solve a given task and saves the answer and logs to disk.

        Args:
            task_id (str): Unique identifier for the task.
            task (BaseTask): The task object containing the question and metadata.
            output_dir (str): Directory to save logs, screenshots, and answer files.

        Returns:
            BaseCandidate: An object containing the final answer and any
            screenshots taken during execution.
        """

        async def _runner() -> Tuple[str, List[str]]:
            """Execute the agent team; return (final answer, screenshot paths)."""
            task_question: str = task.question

            # STEP 1: Pick the FINAL ANSWER prompt for the dataset / user mode.
            if self.dataset_name == "WebVoyager":
                # For WebVoyager, there are no restrictions on the final answer
                # format (unlike Gaia or AssistantBench) for evaluation.
                FINAL_ANSWER_PROMPT = f"""
                output a FINAL ANSWER to the task

                The real task is: {task_question}

                Try your best to answer the question and provide a final answer that completely answers
                To output the final answer, use the following template FINAL ANSWER: [YOUR FINAL ANSWER]
                Don't put your answer in brackets or quotes.
                """
            else:
                if (
                    self.simulated_user_type != "none"
                    or self.dataset_name == "AssistantBench"
                ):
                    # This allows the model to say "Unable to determine" if it
                    # cannot answer the question.
                    FINAL_ANSWER_PROMPT = f"""
                    output a FINAL ANSWER to the task.

                    The real task is: {task_question}


                    To output the final answer, use the following template: [any explanation for final answer] FINAL ANSWER: [YOUR FINAL ANSWER]
                    Don't put your answer in brackets or quotes.
                    Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
                    ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
                    If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
                    If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
                    If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
                    If you are unable to determine the final answer, output '[any explanation for final answer] FINAL ANSWER: Unable to determine'
                    Try your best to answer the question and provide a smart guess if you are unsure.
                    """
                else:
                    # Adapted from MagenticOne. Minor change is to allow an
                    # explanation of the final answer before the final answer.
                    FINAL_ANSWER_PROMPT = f"""
                    output a FINAL ANSWER to the task.

                    The real task is: {task_question}


                    To output the final answer, use the following template: [any explanation for final answer] FINAL ANSWER: [YOUR FINAL ANSWER]
                    Don't put your answer in brackets or quotes.
                    Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
                    ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
                    If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
                    If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
                    If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
                    You must answer the question and provide a smart guess if you are unsure. Provide a guess even if you have no idea about the answer.
                    """

            # Step 2: Create the Magentic-UI team.
            # Hard cap each task at 15 minutes of wall-clock time.
            termination_condition = TimeoutTermination(
                timeout_seconds=60 * 15
            )  # 15 minutes
            model_context_token_limit = 110000

            # Orchestrator behavior depends on the simulated-user mode:
            # co-planning modes enable cooperative planning; modes without an
            # interactive user run execution autonomously.
            orchestrator_config = OrchestratorConfig(
                cooperative_planning=False
                if self.simulated_user_type in ["co-execution", "none"]
                else True,
                autonomous_execution=True
                if self.simulated_user_type in ["co-planning", "none", "dummy"]
                else False,
                allow_follow_up_input=False,
                final_answer_prompt=FINAL_ANSWER_PROMPT,
                model_context_token_limit=model_context_token_limit,
                no_overwrite_of_task=True,
            )

            def get_model_client(
                endpoint_config: Optional[Union[ComponentModel, Dict[str, Any]]],
            ) -> ChatCompletionClient:
                """Load a ChatCompletionClient, falling back to the default config."""
                if endpoint_config is None:
                    return ChatCompletionClient.load_component(
                        self.default_client_config
                    )
                return ChatCompletionClient.load_component(endpoint_config)

            model_client_orch = get_model_client(self.endpoint_config_orch)
            model_client_coder = get_model_client(self.endpoint_config_coder)
            model_client_websurfer = get_model_client(self.endpoint_config_websurfer)
            model_client_file_surfer = get_model_client(
                self.endpoint_config_file_surfer
            )
            model_client_user_proxy = get_model_client(self.endpoint_config_user_proxy)

            # Launch the browser on two free ports (playwright + noVNC).
            # NOTE(review): closing the probe sockets before binding the browser
            # leaves a small race window where another process could grab the
            # port — confirm get_available_port's contract.
            playwright_port, socket = get_available_port()
            novnc_port, socket_vnc = get_available_port()
            socket.close()
            socket_vnc.close()
            browser = VncDockerPlaywrightBrowser(
                bind_dir=Path(output_dir),
                playwright_port=playwright_port,
                novnc_port=novnc_port,
                inside_docker=False,
            )

            # Create action guard with default policy "never" (fully autonomous;
            # no approval prompts during evaluation).
            action_guard = ApprovalGuard(
                input_func=None,
                default_approval=False,
                model_client=model_client_orch,
                config=ApprovalConfig(
                    approval_policy="never",
                ),
            )

            # CREATE AGENTS
            coder_agent = CoderAgent(
                name="coder_agent",
                model_client=model_client_coder,
                work_dir=os.path.abspath(output_dir),
                model_context_token_limit=model_context_token_limit,
            )

            file_surfer = FileSurfer(
                name="file_surfer",
                model_client=model_client_file_surfer,
                work_dir=os.path.abspath(output_dir),
                bind_dir=os.path.abspath(output_dir),
                model_context_token_limit=model_context_token_limit,
            )

            # The web surfer must be constructed inside the approval-guard
            # context so its actions are governed by the guard.
            with ApprovalGuardContext.populate_context(action_guard):
                web_surfer = WebSurfer(
                    name="web_surfer",
                    model_client=model_client_websurfer,
                    browser=browser,
                    animate_actions=False,
                    max_actions_per_step=10,
                    start_page="about:blank" if task.url_path == "" else task.url_path,
                    downloads_folder=os.path.abspath(output_dir),
                    debug_dir=os.path.abspath(output_dir),
                    model_context_token_limit=model_context_token_limit,
                    to_save_screenshots=True,
                )

            # USER PROXY (if a simulated user is requested). Task metadata may
            # carry ground-truth "Steps" hints used by the MetadataUserProxy.
            task_metadata = getattr(task, "metadata", "")
            if task_metadata and "Steps" in task_metadata:
                task_metadata = task_metadata["Steps"]  # type: ignore

            if self.simulated_user_type == "none":
                user_proxy = None
            elif self.simulated_user_type == "dummy":
                user_proxy = DummyUserProxy(
                    name="user_proxy",
                )
            else:
                user_proxy = MetadataUserProxy(
                    name="user_proxy",
                    description=USER_PROXY_DESCRIPTION,
                    task=task.question,
                    helpful_task_hints=task_metadata,
                    task_answer=getattr(task, "ground_truth", ""),
                    model_client=model_client_user_proxy,
                    simulated_user_type=self.simulated_user_type,  # type: ignore
                    how_helpful=self.how_helpful_user_proxy,  # type: ignore
                )

            agent_list: List[ChatAgent] = [web_surfer, coder_agent, file_surfer]
            if self.web_surfer_only:
                agent_list = [web_surfer]
            if user_proxy:
                agent_list.append(user_proxy)

            team = GroupChat(
                participants=agent_list,
                orchestrator_config=orchestrator_config,
                model_client=model_client_orch,
                termination_condition=termination_condition,
            )
            await team.lazy_init()

            # Step 3: Prepare the task message.
            answer: str = ""
            messages_so_far: List[LogEventSystem] = []

            # Optionally rewrite the metadata hints (stripping anything that
            # directly reveals the answer) and append them to the task message.
            rewritten_metadata = None
            if self.include_metadata_in_task_message and task_metadata:
                from autogen_core import CancellationToken
                from autogen_core.models import UserMessage

                prompt = f"""Rewrite the following helpful hints to help solve the task, but remove any information that directly reveals the answer. \nKeep the hints as close to the original as possible but remove any information that directly reveals the answer.\nHelpful hints: {task_metadata}\n\nAnswer: {getattr(task, "ground_truth", "")}\n\nDo not include anything else in your response except the rewritten hints.\nRewritten helpful hints:"""
                result = await model_client_orch.create(
                    messages=[UserMessage(content=prompt, source="user")],
                    cancellation_token=CancellationToken(),
                )
                assert isinstance(result.content, str)
                rewritten_metadata = (
                    "\n\nWe have access to helpful hints that helps in solving the task: "
                    + result.content.strip()
                )

            # If the task carries an image attachment, send a multimodal message.
            if (
                hasattr(task, "file_name")
                and task.file_name
                and task.file_name.endswith((".png", ".jpg", ".jpeg"))
            ):
                content_list: list[Union[str, AGImage]] = [task_question]
                if rewritten_metadata:
                    if isinstance(content_list[0], str):
                        content_list[0] = content_list[0] + rewritten_metadata
                content_list.append(AGImage.from_pil(Image.open(task.file_name)))
                task_message = MultiModalMessage(
                    content=content_list,
                    source="user",
                )
            else:
                if rewritten_metadata:
                    task_message = TextMessage(
                        content=task_question + rewritten_metadata, source="user"
                    )
                else:
                    task_message = TextMessage(content=task_question, source="user")

            # Step 4: Run the team on the task, logging every streamed message
            # and persisting the log after each one so progress survives crashes.
            async for message in team.run_stream(task=task_message):
                message_str: str = ""
                try:
                    if isinstance(message, TaskResult) or isinstance(
                        message, CheckpointEvent
                    ):
                        continue
                    message_str = message.to_text()
                    log_event = LogEventSystem(
                        source=message.source,
                        content=message_str,
                        timestamp=datetime.datetime.now().isoformat(),
                        metadata=message.metadata,
                    )
                    messages_so_far.append(log_event)
                except Exception as e:
                    # Best effort: a message that cannot be serialized is only
                    # skipped from the log, not fatal to the run.
                    logger.info(
                        f"[likely nothing] When creating model_dump of message encountered exception {e}"
                    )

                logger.info(f"Run in progress: {task_id}, message: {message_str}")
                async with aiofiles.open(
                    f"{output_dir}/{task_id}_messages.json", "w"
                ) as f:
                    messages_json = [msg.model_dump() for msg in messages_so_far]
                    await f.write(json.dumps(messages_json, indent=2))
                    await f.flush()  # Flush to disk immediately

                # The final answer is formatted as:
                # "Final Answer: FINAL ANSWER: Actual final answer"
                if message_str.startswith("Final Answer:"):
                    answer = message_str[len("Final Answer:") :].strip()
                    # BUGFIX: use [-1] instead of [1]. [1] raised IndexError
                    # whenever the message lacked the inner "FINAL ANSWER:"
                    # marker; [-1] falls back to the whole remaining string.
                    answer = answer.split("FINAL ANSWER:")[-1].strip()

            assert isinstance(answer, str), (
                f"Expected answer to be a string, got {type(answer)}"
            )

            # Save the token usage of each client to a usage JSON file.
            def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
                return {
                    "prompt_tokens": model_client.total_usage().prompt_tokens,
                    "completion_tokens": model_client.total_usage().completion_tokens,
                }

            usage_json = {
                "orchestrator": get_usage(model_client_orch),
                "websurfer": get_usage(model_client_websurfer),
                "coder": get_usage(model_client_coder),
                "file_surfer": get_usage(model_client_file_surfer),
                "user_proxy": get_usage(model_client_user_proxy),
            }
            # Total excludes the user proxy, since a real user costs no tokens.
            usage_json["total_without_user_proxy"] = {
                "prompt_tokens": sum(
                    usage_json[key]["prompt_tokens"]
                    for key in usage_json
                    if key != "user_proxy"
                ),
                "completion_tokens": sum(
                    usage_json[key]["completion_tokens"]
                    for key in usage_json
                    if key != "user_proxy"
                ),
            }
            async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
                await f.write(json.dumps(usage_json, indent=2))

            await team.close()

            # Step 5: Collect screenshots saved by the web surfer.
            # Filenames look like "screenshot_raw_1746259609.png"; the third
            # underscore-separated token is... actually split("_")[1] yields
            # "raw" here — NOTE(review): sorting key is the literal "raw" for
            # every file, so ordering falls back to os.listdir order; confirm
            # intended filename parsing.
            screenshots_paths = []
            for file in os.listdir(output_dir):
                if file.startswith("screenshot_raw_"):
                    timestamp = file.split("_")[1]
                    screenshots_paths.append(
                        [timestamp, os.path.join(output_dir, file)]
                    )

            # Restrict to the last 15 screenshots by timestamp.
            screenshots_paths = sorted(screenshots_paths, key=lambda x: x[0])[-15:]
            screenshots_paths = [x[1] for x in screenshots_paths]
            return answer, screenshots_paths

        # Step 6: Run the async workflow and persist the candidate answer.
        answer, screenshots_paths = asyncio.run(_runner())
        answer = WebVoyagerCandidate(answer=answer, screenshots=screenshots_paths)
        self.save_answer_to_disk(task_id, answer, output_dir)
        return answer
experiments/eval/systems/magentic_ui_system.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import os
4
+ import aiofiles
5
+ import logging
6
+ import datetime
7
+ from pathlib import Path
8
+ from PIL import Image
9
+ from pydantic import BaseModel
10
+ from typing import List, Dict, Any, Tuple
11
+ from autogen_core.models import ChatCompletionClient
12
+ from autogen_core import Image as AGImage
13
+ from autogen_agentchat.base import TaskResult, ChatAgent
14
+ from autogen_agentchat.messages import (
15
+ MultiModalMessage,
16
+ TextMessage,
17
+ )
18
+ from autogen_agentchat.conditions import TimeoutTermination
19
+ from magentic_ui import OrchestratorConfig
20
+ from magentic_ui.eval.basesystem import BaseSystem
21
+ from magentic_ui.eval.models import BaseTask, BaseCandidate, WebVoyagerCandidate
22
+ from magentic_ui.types import CheckpointEvent
23
+ from magentic_ui.agents import WebSurfer, CoderAgent, FileSurfer
24
+ from magentic_ui.teams import GroupChat
25
+ from magentic_ui.tools.playwright.browser import VncDockerPlaywrightBrowser
26
+ from magentic_ui.tools.playwright.browser import LocalPlaywrightBrowser
27
+ from magentic_ui.tools.playwright.browser.utils import get_available_port
28
+
29
+
30
+ logger = logging.getLogger(__name__)
31
+ logging.getLogger("autogen").setLevel(logging.WARNING)
32
+ logging.getLogger("autogen.agentchat").setLevel(logging.WARNING)
33
+ logging.getLogger("autogen_agentchat.events").setLevel(logging.WARNING)
34
+
35
+
36
class LogEventSystem(BaseModel):
    """
    Data model for logging events emitted while an evaluation task runs.

    Attributes:
        source (str): The source of the event (e.g., agent name or "browser").
        content (str): The content/message of the event.
        timestamp (str): ISO-formatted timestamp of the event.
        metadata (Dict[str, str]): Additional metadata for the event.
    """

    source: str
    content: str
    timestamp: str
    # Pydantic deep-copies field defaults per instance, so the mutable {} is safe here.
    metadata: Dict[str, str] = {}
51
+
52
+
53
class MagenticUIAutonomousSystem(BaseSystem):
    """
    Fully autonomous Magentic-UI team (orchestrator + web surfer, plus coder and
    file surfer unless ``web_surfer_only``) that solves an evaluation task
    end-to-end without human input.

    Args:
        endpoint_config_orch (Dict[str, Any]): Orchestrator model client config.
        endpoint_config_websurfer (Dict[str, Any]): WebSurfer agent model client config.
        endpoint_config_coder (Dict[str, Any]): Coder agent model client config.
        endpoint_config_file_surfer (Dict[str, Any]): FileSurfer agent model client config.
        name (str): Name of the system instance.
        dataset_name (str): Name of the evaluation dataset (e.g., "Gaia").
        web_surfer_only (bool): If True, only the web surfer agent is used.
        use_local_browser (bool): If True, use a local headless Playwright browser
            instead of the VNC Docker browser.
    """

    def __init__(
        self,
        endpoint_config_orch: Dict[str, Any],
        endpoint_config_websurfer: Dict[str, Any],
        endpoint_config_coder: Dict[str, Any],
        endpoint_config_file_surfer: Dict[str, Any],
        name: str = "MagenticUIAutonomousSystem",
        dataset_name: str = "Gaia",
        web_surfer_only: bool = False,
        use_local_browser: bool = False,
    ):
        super().__init__(name)
        self.candidate_class = WebVoyagerCandidate
        self.endpoint_config_orch = endpoint_config_orch
        self.endpoint_config_websurfer = endpoint_config_websurfer
        self.endpoint_config_coder = endpoint_config_coder
        self.endpoint_config_file_surfer = endpoint_config_file_surfer
        self.web_surfer_only = web_surfer_only
        self.dataset_name = dataset_name
        self.use_local_browser = use_local_browser

    def get_answer(
        self, task_id: str, task: BaseTask, output_dir: str
    ) -> BaseCandidate:
        """
        Runs the agent team to solve a given task and saves the answer and logs to disk.

        Args:
            task_id (str): Unique identifier for the task.
            task (BaseTask): The task object containing the question and metadata.
            output_dir (str): Directory to save logs, screenshots, and answer files.

        Returns:
            BaseCandidate: An object containing the final answer and any screenshots
                taken during execution.
        """

        async def _runner() -> Tuple[str, List[str]]:
            """
            Asynchronous runner that executes the agent team and collects the
            answer and screenshots.

            Returns:
                Tuple[str, List[str]]: The final answer string and a list of
                    screenshot file paths (at most the last 15 by timestamp).
            """
            messages_so_far: List[LogEventSystem] = []

            task_question: str = task.question
            # Adapted from MagenticOne. Minor change is to allow an explanation of
            # the final answer before the final answer.
            FINAL_ANSWER_PROMPT = f"""
                output a FINAL ANSWER to the task.

                The real task is: {task_question}


                To output the final answer, use the following template: [any explanation for final answer] FINAL ANSWER: [YOUR FINAL ANSWER]
                Don't put your answer in brackets or quotes.
                Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
                ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
                If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
                If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
                If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
                You must answer the question and provide a smart guess if you are unsure. Provide a guess even if you have no idea about the answer.
            """
            # Step 2: Create the Magentic-UI team
            # TERMINATION CONDITION
            termination_condition = TimeoutTermination(
                timeout_seconds=60 * 15
            )  # 15 minutes
            model_context_token_limit = 110000
            # ORCHESTRATOR CONFIGURATION: autonomous, no human planning or follow-ups
            orchestrator_config = OrchestratorConfig(
                cooperative_planning=False,
                autonomous_execution=True,
                allow_follow_up_input=False,
                final_answer_prompt=FINAL_ANSWER_PROMPT,
                model_context_token_limit=model_context_token_limit,
                no_overwrite_of_task=True,
            )

            # One model client per agent role so token usage can be reported per role.
            model_client_orch = ChatCompletionClient.load_component(
                self.endpoint_config_orch
            )
            model_client_coder = ChatCompletionClient.load_component(
                self.endpoint_config_coder
            )
            model_client_websurfer = ChatCompletionClient.load_component(
                self.endpoint_config_websurfer
            )
            model_client_file_surfer = ChatCompletionClient.load_component(
                self.endpoint_config_file_surfer
            )

            # launch the browser
            if self.use_local_browser:
                browser = LocalPlaywrightBrowser(headless=True)
            else:
                # Reserve two free ports, then release the probe sockets so the
                # Docker browser can bind them.
                playwright_port, socket = get_available_port()
                novnc_port, socket_vnc = get_available_port()
                socket.close()
                socket_vnc.close()
                browser = VncDockerPlaywrightBrowser(
                    bind_dir=Path(output_dir),
                    playwright_port=playwright_port,
                    novnc_port=novnc_port,
                    inside_docker=False,
                )
                browser_location_log = LogEventSystem(
                    source="browser",
                    content=f"Browser at novnc port {novnc_port} and playwright port {playwright_port} launched",
                    timestamp=datetime.datetime.now().isoformat(),
                )
                messages_so_far.append(browser_location_log)

            # Create web surfer
            web_surfer = WebSurfer(
                name="web_surfer",
                model_client=model_client_websurfer,
                browser=browser,
                animate_actions=False,
                max_actions_per_step=10,
                start_page="about:blank" if task.url_path == "" else task.url_path,
                downloads_folder=os.path.abspath(output_dir),
                debug_dir=os.path.abspath(output_dir),
                model_context_token_limit=model_context_token_limit,
                to_save_screenshots=True,
            )

            agent_list: List[ChatAgent] = [web_surfer]
            if not self.web_surfer_only:
                coder_agent = CoderAgent(
                    name="coder_agent",
                    model_client=model_client_coder,
                    work_dir=os.path.abspath(output_dir),
                    model_context_token_limit=model_context_token_limit,
                )

                file_surfer = FileSurfer(
                    name="file_surfer",
                    model_client=model_client_file_surfer,
                    work_dir=os.path.abspath(output_dir),
                    bind_dir=os.path.abspath(output_dir),
                    model_context_token_limit=model_context_token_limit,
                )
                agent_list.append(coder_agent)
                agent_list.append(file_surfer)
            team = GroupChat(
                participants=agent_list,
                orchestrator_config=orchestrator_config,
                model_client=model_client_orch,
                termination_condition=termination_condition,
            )
            await team.lazy_init()
            # Step 3: Prepare the task message
            answer: str = ""
            # If the task ships an image attachment, send a multimodal message.
            if (
                hasattr(task, "file_name")
                and task.file_name
                and task.file_name.endswith((".png", ".jpg", ".jpeg"))
            ):
                task_message = MultiModalMessage(
                    content=[
                        task_question,
                        AGImage.from_pil(Image.open(task.file_name)),
                    ],
                    source="user",
                )
            else:
                task_message = TextMessage(content=task_question, source="user")
            # Step 4: Run the team on the task
            async for message in team.run_stream(task=task_message):
                # Store log events
                message_str: str = ""
                try:
                    if isinstance(message, TaskResult) or isinstance(
                        message, CheckpointEvent
                    ):
                        continue
                    message_str = message.to_text()
                    # Create log event with source, content and timestamp
                    log_event = LogEventSystem(
                        source=message.source,
                        content=message_str,
                        timestamp=datetime.datetime.now().isoformat(),
                        metadata=message.metadata,
                    )
                    messages_so_far.append(log_event)
                except Exception as e:
                    # Best-effort logging only; never let a malformed message
                    # abort the run.
                    logger.info(
                        f"[likely nothing] When creating model_dump of message encountered exception {e}"
                    )
                    pass

                # save to file
                logger.info(f"Run in progress: {task_id}, message: {message_str}")
                async with aiofiles.open(
                    f"{output_dir}/{task_id}_messages.json", "w"
                ) as f:
                    # Convert list of logevent objects to list of dicts
                    messages_json = [msg.model_dump() for msg in messages_so_far]
                    await f.write(json.dumps(messages_json, indent=2))
                    await f.flush()  # Flush to disk immediately
                # how the final answer is formatted:
                # "Final Answer: FINAL ANSWER: Actual final answer"
                if message_str.startswith("Final Answer:"):
                    answer = message_str[len("Final Answer:") :].strip()
                    # Strip the "FINAL ANSWER:" marker when the model followed
                    # the template. Previously an unconditional split(...)[1]
                    # raised IndexError when the marker was absent; now we keep
                    # the full text as a best-effort answer in that case.
                    if "FINAL ANSWER:" in answer:
                        answer = answer.split("FINAL ANSWER:", 1)[1].strip()

            assert isinstance(
                answer, str
            ), f"Expected answer to be a string, got {type(answer)}"

            # save the usage of each of the clients in a usage json file
            def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
                # Snapshot cumulative token usage for one model client.
                return {
                    "prompt_tokens": model_client.total_usage().prompt_tokens,
                    "completion_tokens": model_client.total_usage().completion_tokens,
                }

            usage_json = {
                "orchestrator": get_usage(model_client_orch),
                "websurfer": get_usage(model_client_websurfer),
                "coder": get_usage(model_client_coder),
                "file_surfer": get_usage(model_client_file_surfer),
            }
            # Key name and the "user_proxy" exclusion are kept for schema parity
            # with the simulated-user variant of this system; there is no
            # user_proxy client here, so the filter is a no-op.
            usage_json["total_without_user_proxy"] = {
                "prompt_tokens": sum(
                    usage_json[key]["prompt_tokens"]
                    for key in usage_json
                    if key != "user_proxy"
                ),
                "completion_tokens": sum(
                    usage_json[key]["completion_tokens"]
                    for key in usage_json
                    if key != "user_proxy"
                ),
            }
            async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
                await f.write(json.dumps(usage_json, indent=2))

            await team.close()
            # Step 5: Prepare the screenshots
            screenshots_paths = []
            # check the directory for screenshots which start with screenshot_raw_
            for file in os.listdir(output_dir):
                if file.startswith("screenshot_raw_"):
                    # File names look like "screenshot_raw_1746259609.png"; the
                    # timestamp is the last "_"-separated token with the
                    # extension stripped. (The previous split("_")[1] returned
                    # the literal "raw" for every file, so the sort below never
                    # actually ordered by timestamp.)
                    timestamp = os.path.splitext(file)[0].split("_")[-1]
                    screenshots_paths.append(
                        [timestamp, os.path.join(output_dir, file)]
                    )

            # restrict to last 15 screenshots by timestamp
            screenshots_paths = sorted(screenshots_paths, key=lambda x: x[0])[-15:]
            screenshots_paths = [x[1] for x in screenshots_paths]
            return answer, screenshots_paths

        # Step 6: Return the answer and screenshots
        answer, screenshots_paths = asyncio.run(_runner())
        answer = WebVoyagerCandidate(answer=answer, screenshots=screenshots_paths)
        self.save_answer_to_disk(task_id, answer, output_dir)
        return answer
fara_config.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Model-client configuration for running Magentic-UI against a locally hosted
# Fara-7B model served through an OpenAI-compatible endpoint on localhost:5000.
model_config_local_surfer: &client_surfer
  provider: OpenAIChatCompletionClient
  config:
    model: "microsoft/Fara-7B"
    base_url: http://localhost:5000/v1
    api_key: not-needed  # local server performs no auth, but the field is required
    model_info:
      vision: true
      function_calling: true
      json_output: false
      family: "unknown"
      structured_output: false
      multiple_system_messages: false

# Every agent role reuses the same local client via the YAML anchor above.
orchestrator_client: *client_surfer
coder_client: *client_surfer
web_surfer_client: *client_surfer
file_surfer_client: *client_surfer
action_guard_client: *client_surfer
model_client: *client_surfer
frontend/.env.default ADDED
@@ -0,0 +1 @@
 
 
1
+ GATSBY_API_URL=http://127.0.0.1:8081/api
frontend/.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ node_modules/
2
+ .cache/
3
+ public
4
+ src/gatsby-types.d.ts
5
+ .env.development
6
+ .env.production
frontend/README.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## 🚀 Running UI in Dev Mode
2
+
3
+ Run the UI in dev mode (make changes and see them reflected in the browser with hotreloading):
4
+
5
+ - Ensure yarn is installed.
6
+ - `yarn install`
7
+ - `yarn start`
8
+
9
+ This should start the server on port 8000.
10
+
11
+ ## Design Elements
12
+
13
+ - **Gatsby**: The app is created in Gatsby. A guide on bootstrapping a Gatsby app can be found here - https://www.gatsbyjs.com/docs/quick-start/.
14
+ This provides an overview of the project file structure include functionality of files like `gatsby-config.js`, `gatsby-node.js`, `gatsby-browser.js` and `gatsby-ssr.js`.
15
+ - **TailwindCSS**: The app uses TailwindCSS for styling. A guide on using TailwindCSS with Gatsby can be found here - https://tailwindcss.com/docs/guides/gatsby. This will explain the functionality in tailwind.config.js and postcss.config.js.
16
+
17
+ ## Modifying the UI, Adding Pages
18
+
19
+ The core of the app can be found in the `src` folder. To add pages, add a new folder in `src/pages` and add a `index.js` file. This will be the entry point for the page. For example to add a route in the app like `/about`, add a folder `about` in `src/pages` and add a `index.tsx` file. You can follow the content style in `src/pages/index.tsx` to add content to the page.
20
+
21
+ Core logic for each component should be written in the `src/components` folder and then imported in pages as needed.
22
+
23
+ ## Connecting to the Frontend
24
+
25
+ The frontend makes requests to the backend API and expects it at `http://localhost:8081/api`.
26
+
27
+ ## setting env variables for the UI
28
+
29
+ - please look at `.env.default`
30
+ - make a copy of this file and name it `.env.development`
31
+ - set the values for the variables in this file
32
+ - The main variable here is `GATSBY_API_URL` which should be set to `http://localhost:8081/api` for local development. This tells the UI where to make requests to the backend.
frontend/gatsby-browser.js ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import "antd/dist/reset.css";
2
+ import "./src/styles/global.css";
3
+
4
+ import AuthProvider from "./src/hooks/provider";
5
+
6
+ export const wrapRootElement = AuthProvider;
frontend/gatsby-config.ts ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { GatsbyConfig } from "gatsby";
2
+ import fs from "fs";
3
+
4
+ const envFile = `.env.${process.env.NODE_ENV}`;
5
+
6
+ fs.access(envFile, fs.constants.F_OK, (err) => {
7
+ if (err) {
8
+ console.warn(`File '${envFile}' is missing. Using default values.`);
9
+ }
10
+ });
11
+
12
+ require("dotenv").config({
13
+ path: envFile,
14
+ });
15
+
16
+ const config: GatsbyConfig = {
17
+ pathPrefix: process.env.PREFIX_PATH_VALUE || "",
18
+ siteMetadata: {
19
+ title: `Magentic-UI`,
20
+ description: `Human-centered web agent interface`,
21
+ siteUrl: `http://tbd.place`,
22
+ },
23
+ // More easily incorporate content into your pages through automatic TypeScript type generation and better GraphQL IntelliSense.
24
+ // If you use VSCode you can also use the GraphQL plugin
25
+ // Learn more at: https://gatsby.dev/graphql-typegen
26
+ graphqlTypegen: true,
27
+ plugins: [
28
+ "gatsby-plugin-postcss",
29
+ "gatsby-plugin-image",
30
+ "gatsby-plugin-sitemap",
31
+ {
32
+ resolve: "gatsby-plugin-manifest",
33
+ options: {
34
+ icon: "src/images/icon.png",
35
+ },
36
+ },
37
+ "gatsby-plugin-mdx",
38
+ "gatsby-plugin-sharp",
39
+ "gatsby-transformer-sharp",
40
+ {
41
+ resolve: "gatsby-source-filesystem",
42
+ options: {
43
+ name: "images",
44
+ path: "./src/images/",
45
+ },
46
+ __key: "images",
47
+ },
48
+ {
49
+ resolve: "gatsby-source-filesystem",
50
+ options: {
51
+ name: "pages",
52
+ path: "./src/pages/",
53
+ },
54
+ __key: "pages",
55
+ },
56
+ ],
57
+ };
58
+
59
+ export default config;
frontend/gatsby-ssr.tsx ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React from "react";
2
+
3
+ const codeToRunOnClient = `(function() {
4
+ try {
5
+ var mode = localStorage.getItem('darkmode');
6
+ document.getElementsByTagName("html")[0].className === 'dark' ? 'dark' : 'light';
7
+ } catch (e) {}
8
+ })();`;
9
+
10
+ export const onRenderBody = ({ setHeadComponents }) =>
11
+ setHeadComponents([
12
+ <script
13
+ key="myscript"
14
+ dangerouslySetInnerHTML={{ __html: codeToRunOnClient }}
15
+ />,
16
+ ]);
frontend/package.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "Magentic-UI",
3
+ "version": "1.0.0",
4
+ "private": true,
5
+ "description": "Magentic-UI",
6
+ "author": "Microsoft",
7
+ "keywords": [
8
+ "gatsby"
9
+ ],
10
+ "scripts": {
11
+ "develop": "gatsby clean && gatsby develop",
12
+ "dev": "npm run develop",
13
+ "start": "gatsby clean && gatsby develop",
14
+ "build": "gatsby clean && rm -rf ../src/magentic_ui/backend/web/ui && PREFIX_PATH_VALUE='' gatsby build --prefix-paths && rsync -a --delete public/ ../src/magentic_ui/backend/web/ui/",
15
+ "serve": "gatsby serve",
16
+ "clean": "gatsby clean",
17
+ "typecheck": "tsc --noEmit"
18
+ },
19
+ "dependencies": {
20
+ "@dagrejs/dagre": "^1.1.4",
21
+ "@dnd-kit/core": "^6.2.0",
22
+ "@headlessui/react": "^2.2.0",
23
+ "@hello-pangea/dnd": "^17.0.0",
24
+ "@heroicons/react": "^2.0.18",
25
+ "@mdx-js/react": "^3.1.0",
26
+ "@monaco-editor/react": "^4.6.0",
27
+ "@tailwindcss/typography": "^0.5.9",
28
+ "@xyflow/react": "^12.3.5",
29
+ "antd": "^5.22.1",
30
+ "autoprefixer": "^10.4.20",
31
+ "gatsby": "^5.14.0",
32
+ "gatsby-plugin-image": "^3.14.0",
33
+ "gatsby-plugin-manifest": "^5.14.0",
34
+ "gatsby-plugin-mdx": "^5.14.0",
35
+ "gatsby-plugin-postcss": "^6.14.0",
36
+ "gatsby-plugin-sharp": "^5.14.0",
37
+ "gatsby-plugin-sitemap": "^6.14.0",
38
+ "gatsby-source-filesystem": "^5.14.0",
39
+ "gatsby-transformer-sharp": "^5.14.0",
40
+ "install": "^0.13.0",
41
+ "js-yaml": "^4.1.0",
42
+ "lucide-react": "^0.460.0",
43
+ "postcss": "^8.4.49",
44
+ "react": "^18.2.0",
45
+ "react-dom": "^18.2.0",
46
+ "react-markdown": "^9.0.1",
47
+ "react-syntax-highlighter": "^15.6.1",
48
+ "react-vnc": "^3.0.8",
49
+ "react-window": "^1.8.11",
50
+ "remark-gfm": "^4.0.0",
51
+ "tailwindcss": "^3.4.14",
52
+ "yarn": "^1.22.22",
53
+ "zod": "^3.25.63",
54
+ "zustand": "^5.0.1"
55
+ },
56
+ "devDependencies": {
57
+ "@types/lodash.debounce": "^4.0.9",
58
+ "@types/node": "^22.9.0",
59
+ "@types/react": "^18.2.55",
60
+ "@types/react-dom": "^18.2.19",
61
+ "@types/react-syntax-highlighter": "^15.5.13",
62
+ "@types/uuid": "^10.0.0",
63
+ "typescript": "^5.3.3"
64
+ },
65
+ "resolutions": {
66
+ "tar-fs": "2.1.2",
67
+ "path-to-regexp": "0.1.12",
68
+ "prismjs": "1.30.0",
69
+ "cookie": "0.7.0",
70
+ "base-x": "3.0.11"
71
+ }
72
+ }
frontend/postcss.config.js ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
// PostCSS pipeline used by gatsby-plugin-postcss:
// Tailwind generates utility classes, then autoprefixer adds vendor prefixes.
module.exports = {
  plugins: {
    tailwindcss: {},
    autoprefixer: {},
  },
}
frontend/src/assets/logo.svg ADDED
frontend/src/components/common/AutoResizeTextarea.tsx ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React, { useEffect, useLayoutEffect, useRef } from "react";
2
+
3
+ interface AutoResizeTextareaProps
4
+ extends React.TextareaHTMLAttributes<HTMLTextAreaElement> {
5
+ value: string;
6
+ onChange: (e: React.ChangeEvent<HTMLTextAreaElement>) => void;
7
+ className: string;
8
+ minHeight?: string;
9
+ maxHeight?: string;
10
+ }
11
+
12
+ const AutoResizeTextarea: React.FC<AutoResizeTextareaProps> = ({
13
+ value,
14
+ onChange,
15
+ className,
16
+ minHeight = "30px",
17
+ maxHeight = "120px",
18
+ ...props
19
+ }) => {
20
+ const textareaRef = useRef<HTMLTextAreaElement>(null);
21
+ const observerRef = useRef<ResizeObserver | null>(null);
22
+
23
+ const adjustHeight = () => {
24
+ const textarea = textareaRef.current;
25
+ if (!textarea) return;
26
+
27
+ // Reset height to get the correct scrollHeight measurement
28
+ textarea.style.height = minHeight;
29
+
30
+ // Convert min and max heights to numbers for comparison
31
+ const minHeightPx = parseInt(minHeight);
32
+ const maxHeightPx = parseInt(maxHeight);
33
+
34
+ // Set the height to match content, bounded by min and max heights
35
+ const desiredHeight = Math.min(
36
+ Math.max(minHeightPx, textarea.scrollHeight),
37
+ maxHeightPx
38
+ );
39
+ textarea.style.height = `${desiredHeight}px`;
40
+
41
+ // Add scrollbar if content exceeds maxHeight
42
+ textarea.style.overflowY =
43
+ textarea.scrollHeight > maxHeightPx ? "auto" : "hidden";
44
+ };
45
+
46
+ // Initial height adjustment using useLayoutEffect to prevent flash
47
+ useLayoutEffect(() => {
48
+ adjustHeight();
49
+ }, []);
50
+
51
+ // Adjust height when value changes
52
+ useEffect(() => {
53
+ adjustHeight();
54
+ }, [value]);
55
+
56
+ // Setup resize observer and window resize handler
57
+ useEffect(() => {
58
+ const textarea = textareaRef.current;
59
+ if (!textarea) return;
60
+
61
+ // Create resize observer
62
+ observerRef.current = new ResizeObserver(() => {
63
+ adjustHeight();
64
+ });
65
+
66
+ // Observe both the textarea and its parent element
67
+ observerRef.current.observe(textarea);
68
+ if (textarea.parentElement) {
69
+ observerRef.current.observe(textarea.parentElement);
70
+ }
71
+
72
+ // Handle window resize
73
+ const handleResize = () => adjustHeight();
74
+ window.addEventListener("resize", handleResize);
75
+
76
+ // Setup intersection observer for visibility changes
77
+ const intersectionObserver = new IntersectionObserver(
78
+ (entries) => {
79
+ entries.forEach((entry) => {
80
+ if (entry.isIntersecting) {
81
+ adjustHeight();
82
+ }
83
+ });
84
+ },
85
+ { threshold: 0.1 }
86
+ );
87
+
88
+ intersectionObserver.observe(textarea);
89
+
90
+ return () => {
91
+ window.removeEventListener("resize", handleResize);
92
+ if (observerRef.current) {
93
+ observerRef.current.disconnect();
94
+ }
95
+ intersectionObserver.disconnect();
96
+ };
97
+ }, []);
98
+
99
+ return (
100
+ <textarea
101
+ ref={textareaRef}
102
+ value={value}
103
+ onChange={onChange}
104
+ className={className}
105
+ style={{
106
+ minHeight,
107
+ maxHeight,
108
+ overflowY: "auto",
109
+ resize: "none",
110
+ ...props.style,
111
+ }}
112
+ {...props}
113
+ />
114
+ );
115
+ };
116
+
117
+ export default AutoResizeTextarea;
frontend/src/components/common/Button.tsx ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React from "react";
2
+ import { Spin } from "antd";
3
+
4
+ export type ButtonVariant =
5
+ | "primary"
6
+ | "secondary"
7
+ | "tertiary"
8
+ | "success"
9
+ | "warning"
10
+ | "danger";
11
+ export type ButtonSize = "xs" | "sm" | "md" | "lg";
12
+
13
+ interface ButtonProps extends React.ButtonHTMLAttributes<HTMLButtonElement> {
14
+ variant?: ButtonVariant;
15
+ size?: ButtonSize;
16
+ isLoading?: boolean;
17
+ icon?: React.ReactNode;
18
+ iconPosition?: "left" | "right";
19
+ fullWidth?: boolean;
20
+ children?: React.ReactNode;
21
+ className?: string;
22
+ }
23
+
24
+ export const Button: React.FC<ButtonProps> = ({
25
+ variant = "primary",
26
+ size = "md",
27
+ isLoading = false,
28
+ icon,
29
+ iconPosition = "left",
30
+ fullWidth = false,
31
+ disabled = false,
32
+ children,
33
+ className = "",
34
+ ...props
35
+ }) => {
36
+ // Base classes shared by all buttons
37
+ const baseClasses =
38
+ "inline-flex items-center justify-center rounded-md transition-colors focus:outline-none";
39
+
40
+ // Size variations
41
+ const sizeClasses = {
42
+ xs: "px-2 py-1 text-xs",
43
+ sm: "px-2.5 py-1.5 text-sm",
44
+ md: "px-4 py-2 text-base",
45
+ lg: "px-6 py-3 text-lg",
46
+ };
47
+
48
+ // Variant classes - these would use your color variables
49
+ const variantClasses = {
50
+ primary:
51
+ "bg-magenta-800 text-white hover:bg-magenta-900 focus:ring-2 focus:ring-magenta-900",
52
+ secondary:
53
+ "bg-transparent border border-magenta-800 text-magenta-800 hover:bg-magenta-900/50",
54
+ tertiary: "bg-transparent text-gray-800 hover:text-primary",
55
+ success:
56
+ "bg-green-600 text-white hover:bg-green-700 focus:ring-2 focus:ring-green-400",
57
+ warning:
58
+ "bg-warning-primary text-white hover:bg-amber-600 focus:ring-2 focus:ring-amber-400",
59
+ danger:
60
+ "bg-red-600 text-white hover:bg-red-700 focus:ring-2 focus:ring-red-400",
61
+ };
62
+
63
+ // States
64
+ const stateClasses =
65
+ disabled || isLoading ? "opacity-60 cursor-not-allowed" : "cursor-pointer";
66
+
67
+ // Width
68
+ const widthClass = fullWidth ? "w-full" : "";
69
+
70
+ return (
71
+ <button
72
+ disabled={disabled || isLoading}
73
+ className={`
74
+ ${baseClasses}
75
+ ${sizeClasses[size]}
76
+ ${variantClasses[variant]}
77
+ ${stateClasses}
78
+ ${widthClass}
79
+ ${className}
80
+ `}
81
+ {...props}
82
+ >
83
+ {isLoading && <Spin size="small" className={children ? "mr-2" : ""} />}
84
+
85
+ {!isLoading && icon && iconPosition === "left" && (
86
+ <span className={`${children ? "mr-2" : ""}`}>{icon}</span>
87
+ )}
88
+
89
+ {children}
90
+
91
+ {!isLoading && icon && iconPosition === "right" && (
92
+ <span className={`${children ? "ml-2" : ""}`}>{icon}</span>
93
+ )}
94
+ </button>
95
+ );
96
+ };