AUXteam committed on
Commit
6e38ce1
·
verified ·
1 Parent(s): 8218306

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +8 -0
  2. .gitignore +213 -0
  3. .hfignore +11 -0
  4. .python-version +1 -0
  5. CODE_OF_CONDUCT.md +9 -0
  6. CONTRIBUTING.md +87 -0
  7. Dockerfile +49 -0
  8. LICENSE +21 -0
  9. README.md +463 -4
  10. SECURITY.md +41 -0
  11. SUPPORT.md +14 -0
  12. TRANSPARENCY_NOTE.md +128 -0
  13. TROUBLESHOOTING.md +133 -0
  14. docs/img/magenticui.jpg +0 -0
  15. docs/img/magenticui_running.png +3 -0
  16. docs/img/magui-actionguard.png +3 -0
  17. docs/img/magui-coplanning.png +3 -0
  18. docs/img/magui-cotasking.png +3 -0
  19. docs/img/magui-landing.png +3 -0
  20. docs/img/magui-readme-logo.png +3 -0
  21. docs/img/magui-readme-logo.svg +79 -0
  22. docs/index.html +141 -0
  23. docs/tutorials/web_agent_tutorial_full.ipynb +1782 -0
  24. experiments/endpoint_configs/.gitignore +3 -0
  25. experiments/endpoint_configs/config_template.yaml +15 -0
  26. experiments/endpoint_configs/test_client.py +31 -0
  27. experiments/eval/.gitignore +2 -0
  28. experiments/eval/README.md +75 -0
  29. experiments/eval/analyze_sim_user.py +257 -0
  30. experiments/eval/explore_results.py +202 -0
  31. experiments/eval/plot_results.py +158 -0
  32. experiments/eval/prepare_for_submission.py +128 -0
  33. experiments/eval/run.py +276 -0
  34. experiments/eval/sample_eval_systems.py +84 -0
  35. experiments/eval/systems/__init__.py +5 -0
  36. experiments/eval/systems/magentic_one_system.py +241 -0
  37. experiments/eval/systems/magentic_ui_sim_user_system.py +484 -0
  38. experiments/eval/systems/magentic_ui_system.py +328 -0
  39. fara_config.yaml +20 -0
  40. frontend/.env.default +1 -0
  41. frontend/.gitignore +6 -0
  42. frontend/README.md +32 -0
  43. frontend/gatsby-browser.js +6 -0
  44. frontend/gatsby-config.ts +59 -0
  45. frontend/gatsby-ssr.tsx +16 -0
  46. frontend/package.json +72 -0
  47. frontend/postcss.config.js +6 -0
  48. frontend/src/assets/logo.svg +29 -0
  49. frontend/src/components/common/AutoResizeTextarea.tsx +117 -0
  50. frontend/src/components/common/Button.tsx +96 -0
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ docs/img/magenticui_running.png filter=lfs diff=lfs merge=lfs -text
37
+ docs/img/magui-actionguard.png filter=lfs diff=lfs merge=lfs -text
38
+ docs/img/magui-coplanning.png filter=lfs diff=lfs merge=lfs -text
39
+ docs/img/magui-cotasking.png filter=lfs diff=lfs merge=lfs -text
40
+ docs/img/magui-landing.png filter=lfs diff=lfs merge=lfs -text
41
+ docs/img/magui-readme-logo.png filter=lfs diff=lfs merge=lfs -text
42
+ frontend/src/styles/Open_Sans/OpenSans-Italic-VariableFont_wdth,wght.ttf filter=lfs diff=lfs merge=lfs -text
43
+ frontend/src/styles/Open_Sans/OpenSans-VariableFont_wdth,wght.ttf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .vscode
2
+
3
+ # Byte-compiled / optimized / DLL files
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+
8
+ # C extensions
9
+ *.so
10
+
11
+ # Distribution / packaging
12
+ .Python
13
+ build/
14
+ develop-eggs/
15
+ dist/
16
+ downloads/
17
+ eggs/
18
+ .eggs/
19
+ lib/
20
+ lib64/
21
+ parts/
22
+ sdist/
23
+ var/
24
+ wheels/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # PyInstaller
32
+ # Usually these files are written by a python script from a template
33
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
34
+ *.manifest
35
+ *.spec
36
+
37
+ # Installer logs
38
+ pip-log.txt
39
+ pip-delete-this-directory.txt
40
+
41
+ # Unit test / coverage reports
42
+ htmlcov/
43
+ .tox/
44
+ .nox/
45
+ .coverage
46
+ .coverage.*
47
+ .cache
48
+ nosetests.xml
49
+ coverage.xml
50
+ *.cover
51
+ *.py,cover
52
+ .hypothesis/
53
+ .pytest_cache/
54
+ cover/
55
+
56
+ # Translations
57
+ *.mo
58
+ *.pot
59
+
60
+ # Django stuff:
61
+ *.log
62
+ local_settings.py
63
+ db.sqlite3
64
+ db.sqlite3-journal
65
+
66
+ # Flask stuff:
67
+ instance/
68
+ .webassets-cache
69
+
70
+ # Scrapy stuff:
71
+ .scrapy
72
+
73
+ # Sphinx documentation
74
+ docs/_build/
75
+
76
+ # PyBuilder
77
+ .pybuilder/
78
+ target/
79
+
80
+ # Jupyter Notebook
81
+ .ipynb_checkpoints
82
+
83
+ # IPython
84
+ profile_default/
85
+ ipython_config.py
86
+
87
+ # pyenv
88
+ # For a library or package, you might want to ignore these files since the code is
89
+ # intended to run in multiple environments; otherwise, check them in:
90
+ # .python-version
91
+
92
+ # pipenv
93
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
95
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
96
+ # install all needed dependencies.
97
+ #Pipfile.lock
98
+
99
+ # poetry
100
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
102
+ # commonly ignored for libraries.
103
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104
+ #poetry.lock
105
+
106
+ # pdm
107
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108
+ #pdm.lock
109
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110
+ # in version control.
111
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
112
+ .pdm.toml
113
+ .pdm-python
114
+ .pdm-build/
115
+
116
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117
+ __pypackages__/
118
+
119
+ # Celery stuff
120
+ celerybeat-schedule
121
+ celerybeat.pid
122
+
123
+ # SageMath parsed files
124
+ *.sage.py
125
+
126
+ # Environments
127
+ .env
128
+ .venv
129
+ env/
130
+ venv/
131
+ ENV/
132
+ env.bak/
133
+ venv.bak/
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ #.idea/
165
+ scratch.py
166
+
167
+ .DS_Store
168
+ .magentic_ui_state.json
169
+ debug
170
+
171
+ data
172
+ runs
173
+
174
+ node_modules
175
+
176
+
177
+ # Autogen Studio
178
+ database.sqlite
179
+ .cache/*
180
+ src/magentic_ui/backend/web/files/user/*
181
+ src/magentic_ui/backend/test
182
+ src/magentic_ui/backend/database/alembic.ini
183
+ src/magentic_ui/backend/database/alembic/*
184
+ src/magentic_ui/backend/web/files/ui/*
185
+ OAI_CONFIG_LIST
186
+ scratch/
187
+ src/magentic_ui/backend/web/workdir/*
188
+ src/magentic_ui/backend/web/ui/*
189
+ src/magentic_ui/backend/web/skills/user/*
190
+ .release.sh
191
+ .nightly.sh
192
+ notebooks/test
193
+
194
+ notebooks/work_dir/*
195
+ notebooks/test.db
196
+
197
+ # Byte-compiled / optimized / DLL files
198
+ __pycache__/
199
+ *.py[cod]
200
+ *$py.class
201
+
202
+ # Environments
203
+ .env
204
+ .venv
205
+ env/
206
+ venv/
207
+ ENV/
208
+ env.bak/
209
+ venv.bak/
210
+
211
+ # Task centric memory related db and logs
212
+ **/memory_bank/
213
+ **/pagelogs/
.hfignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git/
2
+ .github/
3
+ frontend/node_modules/
4
+ .venv/
5
+ __pycache__/
6
+ *.pyc
7
+ *.pyo
8
+ *.pyd
9
+ .db
10
+ .cache/
11
+ public/
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.12
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Microsoft Open Source Code of Conduct
2
+
3
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4
+
5
+ Resources:
6
+
7
+ - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8
+ - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9
+ - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
CONTRIBUTING.md ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to Magentic-UI
2
+
3
+ Thank you for your interest in contributing to Magentic-UI!
4
+
5
+ We welcome all contributions - whether it’s bug reports, feature requests, code, documentation, or helping others with their questions.
6
+
7
+ ## Code of Conduct
8
+
9
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
10
+ For more information, see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
11
+
12
+ ## Contributor License Agreement (CLA)
13
+
14
+ Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution.
15
+ For details, visit [https://opensource.microsoft.com/pdf/microsoft-contribution-license-agreement.pdf](https://opensource.microsoft.com/pdf/microsoft-contribution-license-agreement.pdf).
16
+
17
+ When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., status check, comment).
18
+ Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
19
+
20
+
21
+ ## How to Contribute
22
+
23
+ - **Find an Issue:**
24
+ - Browse [All Issues](https://github.com/microsoft/magentic-ui/issues).
25
+ - Look for issues labeled with <span style="color:green"><strong>help-wanted</strong></span> as these are especially open for community contribution!
26
+ - You can also help review [open PRs](https://github.com/microsoft/magentic-ui/pulls).
27
+
28
+ - **Pick Something to Work On:**
29
+ - See the checklist below for high-priority issues.
30
+ - If you have an idea for a new feature or improvement, feel free to open a new issue for discussion.
31
+
32
+ - **Fork and Clone:**
33
+ - Fork the repository and clone it to your local machine.
34
+
35
+ - **Create a Branch:**
36
+ - Use a descriptive branch name (e.g., `fix/session-bug` or `feature/file-upload`).
37
+
38
+ - **Write Code and Tests:**
39
+ - Please include tests for new features or bug fixes. See the `tests` directory for examples.
40
+
41
+ - **Run Checks Locally:**
42
+ - Before submitting a PR, run:
43
+ ```sh
44
+ poe check
45
+ ```
46
+
47
+ - **Submit a Pull Request:**
48
+ - Open a PR against the `main` branch.
49
+ - Reference the issue number in your PR description (e.g., “Closes #123”).
50
+ - The CLA bot will guide you if you need to sign the CLA.
51
+
52
+
53
+ ## Community “Help Wanted” Issues
54
+
55
+ We use the green <span style="color:green"><strong>help-wanted</strong></span> label to highlight issues that are especially open for community contribution.
56
+ Here are the top 10 issues you can help with right now:
57
+
58
+ - [ ] **Allow MAGUI to understand video and audio** ([#132](https://github.com/microsoft/magentic-ui/issues/132))
59
+ - [ ] **Enable arbitrary file upload in UI** ([#128](https://github.com/microsoft/magentic-ui/issues/128))
60
+ - [ ] **Add streaming of final answer and coder messages** ([#126](https://github.com/microsoft/magentic-ui/issues/126))
61
+ - [ ] **Add unit tests** ([#123](https://github.com/microsoft/magentic-ui/issues/123))
62
+ - [ ] **Allow websurfer to scroll inside containers** ([#124](https://github.com/microsoft/magentic-ui/issues/124))
63
+ - [ ] **Composing multiple plans** ([#129](https://github.com/microsoft/magentic-ui/issues/129))
64
+ - [ ] **Reduce latency** ([#131](https://github.com/microsoft/magentic-ui/issues/131))
65
+ - [ ] **Improve allowed list** ([#125](https://github.com/microsoft/magentic-ui/issues/125))
66
+ - [ ] **Add agent name to step in frontend** ([#110](https://github.com/microsoft/magentic-ui/issues/110))
67
+ - [ ] **Pass auth info for browser sessions** ([#120](https://github.com/microsoft/magentic-ui/issues/120))
68
+
69
+ See [all issues needing help](https://github.com/microsoft/magentic-ui/issues?q=is%3Aissue+is%3Aopen+label%3Ahelp-wanted).
70
+
71
+ ## Reviewing Pull Requests
72
+
73
+ You can also help by reviewing [open PRs](https://github.com/microsoft/magentic-ui/pulls).
74
+
75
+ ## Running Tests and Checks
76
+
77
+ All contributions must pass the continuous integration checks.
78
+ You can run these checks locally before submitting a PR by running:
79
+
80
+ ```bash
81
+ poe check
82
+ ```
83
+
84
+ ## Questions?
85
+
86
+ If you have any questions, open an issue or start a discussion.
87
+ Thank you for helping make Magentic-UI better!
Dockerfile ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use a Python base image
2
+ FROM python:3.12-slim
3
+
4
+ # Set environment variables
5
+ ENV PYTHONUNBUFFERED=1 \
6
+ PYTHONDONTWRITEBYTECODE=1 \
7
+ PATH="/home/user/.local/bin:$PATH" \
8
+ HOME=/home/user
9
+
10
+ # Install system dependencies
11
+ RUN apt-get update && apt-get install -y --no-install-recommends \
12
+ curl \
13
+ git \
14
+ rsync \
15
+ gnupg \
16
+ build-essential \
17
+ && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
18
+ && apt-get install -y nodejs \
19
+ && npm install -g yarn \
20
+ && rm -rf /var/lib/apt/lists/*
21
+
22
+ # Create a non-root user
23
+ RUN useradd -m -u 1000 user
24
+ USER user
25
+ WORKDIR $HOME/app
26
+
27
+ # Install uv
28
+ RUN curl -LsSf https://astral.sh/uv/install.sh | sh
29
+
30
+ # Copy the project files
31
+ COPY --chown=user . .
32
+
33
+ # Build the frontend
34
+ RUN cd frontend && yarn install && yarn build
35
+
36
+ # Install Python dependencies with uv
37
+ RUN $HOME/.local/bin/uv pip install --system .
38
+
39
+ # Install Playwright and its browsers
40
+ RUN $HOME/.local/bin/uv pip install --system playwright && \
41
+ playwright install --with-deps chromium
42
+
43
+ # Expose the HF port
44
+ EXPOSE 7860
45
+
46
+ # Command to run the application
47
+ # We use --run-without-docker to avoid issues with Docker-in-Docker on HF Spaces
48
+ # We also set the host to 0.0.0.0 and port to 7860
49
+ CMD ["magentic-ui", "--port", "7860", "--host", "0.0.0.0", "--run-without-docker"]
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Microsoft
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,10 +1,469 @@
1
  ---
2
  title: Maxun
3
- emoji: 🐠
4
- colorFrom: gray
5
- colorTo: gray
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Maxun
3
+ emoji: 🤖
4
+ colorFrom: blue
5
+ colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
+ app_port: 7860
9
  ---
10
 
11
+ <div align="center">
12
+ <img src="docs/img/magui-readme-logo.svg" alt="Magentic-UI Logo">
13
+
14
+
15
+ _Automate your web tasks while you stay in control_
16
+
17
+ [![image](https://img.shields.io/pypi/v/magentic_ui.svg)](https://pypi.python.org/pypi/magentic_ui)
18
+ [![image](https://img.shields.io/pypi/l/magentic_ui.svg)](https://pypi.python.org/pypi/magentic_ui)
19
+ ![Python Versions](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue)
20
+ [![arXiv](https://img.shields.io/badge/arXiv-2507.22358-b31b1b.svg)](https://arxiv.org/abs/2507.22358)
21
+
22
+ </div>
23
+
24
+ ---
25
+
26
+ Magentic-UI is a **research prototype** human-centered AI agent that solves complex web and coding tasks that may require monitoring. Unlike other black-box agents, the system reveals its plan before execution, lets you guide its actions, and requests approval for sensitive operations while browsing websites, executing code, and analyzing files.
27
+ *Check out the [demo section](#demos) for inspiration on what tasks you can accomplish.*
28
+
29
+ ## ✨ What's New
30
+
31
+ Microsoft's latest agentic model [Fara-7B](https://www.microsoft.com/en-us/research/blog/fara-7b-an-efficient-agentic-model-for-computer-use/) is now integrated into Magentic-UI; read how to launch it in the <a href="#fara-7b">Fara-7B guide</a>
32
+
33
+
34
+ - **"Tell me When"**: Automate monitoring tasks and repeatable workflows that require web or API access that span minutes to days. *Learn more [here](https://www.microsoft.com/en-us/research/blog/tell-me-when-building-agents-that-can-wait-monitor-and-act/).*
35
+ - **File Upload Support**: Upload any file through the UI for analysis or modification
36
+ - **MCP Agents**: Extend capabilities with your favorite MCP servers
37
+ - **Easier Installation**: We have uploaded our docker containers to GHCR so you no longer need to build any containers! Installation time now is much quicker.
38
+
39
+
40
+ ## 🚀 Quick Start
41
+
42
+ Here's how you can get started with Magentic-UI:
43
+
44
+ ```bash
45
+ # 1. Setup environment
46
+ python3 -m venv .venv
47
+ source .venv/bin/activate
48
+ pip install magentic-ui --upgrade
49
+
50
+ # 2. Set your API key
51
+ export OPENAI_API_KEY="your-api-key-here"
52
+
53
+ # 3. Launch Magentic-UI
54
+ magentic-ui --port 8081
55
+ ```
56
+
57
+ Then open <http://localhost:8081> in your browser to interact with Magentic-UI!
58
+
59
+ > **Prerequisites**: Requires Docker and Python 3.10+. Windows users should use WSL2. See [detailed installation](#️-installation) for more info.
60
+
61
+ ## Alternative Usage Options
62
+
63
+ **Without Docker** (limited functionality: no code execution):
64
+ ```bash
65
+ magentic-ui --run-without-docker --port 8081
66
+ ```
67
+
68
+ **Command Line Interface**:
69
+ ```bash
70
+ magentic-cli --work-dir PATH/TO/STORE/DATA
71
+ ```
72
+
73
+ **Custom LLM Clients**:
74
+ ```bash
75
+ # Azure
76
+ pip install magentic-ui[azure]
77
+
78
+ # Ollama (local models)
79
+ pip install magentic-ui[ollama]
80
+ ```
81
+
82
+ You can then pass a config file to the `magentic-ui` command (<a href="#model-client-configuration"> client config</a>) or change the model client inside the UI settings.
83
+
84
+ For further details on installation please read the <a href="#️-installation">🛠️ Installation</a> section. For common installation issues and their solutions, please refer to the [troubleshooting document](TROUBLESHOOTING.md). See advanced usage instructions with the command `magentic-ui --help`.
85
+
86
+ ## Quick Navigation:
87
+ <p align="center">
88
+ <a href="#demos">🎬 Demos</a> &nbsp;|&nbsp;
89
+ <a href="#how-it-works">🟪 How it Works</a> &nbsp;|&nbsp;
90
+ <a href="#installation">🛠️ Installation</a> &nbsp;|&nbsp;
91
+ <a href="#troubleshooting">⚠️ Troubleshooting</a> &nbsp;|&nbsp;
92
+ <a href="#contributing">🤝 Contributing</a> &nbsp;|&nbsp;
93
+ <a href="#license">📄 License</a>
94
+ </p>
95
+
96
+ ---
97
+
98
+ ## Demos
99
+
100
+ <table>
101
+ <tr>
102
+ <td width="33%" align="center">
103
+
104
+ **🍕 Pizza Ordering**
105
+ *Web automation with human-in-the-loop*
106
+
107
+ <video src="https://github.com/user-attachments/assets/dc95cf5f-c4b4-4fe0-b708-158ff071e5a9" width="100%" style="max-height: 300px;">
108
+ </video>
109
+
110
+ </td>
111
+ <td width="33%" align="center">
112
+
113
+ **🏠 Airbnb Price Analysis**
114
+ *MCP agent integration*
115
+
116
+ <video src="https://github.com/user-attachments/assets/c19ed8c2-e06f-43b7-bee3-5e2ffc4c5e02" width="100%" style="max-height: 300px;">
117
+ </video>
118
+
119
+ </td>
120
+ <td width="33%" align="center">
121
+
122
+ **⭐ Star Monitoring**
123
+ *Long-running monitoring task*
124
+
125
+ <video src="https://github.com/user-attachments/assets/d2a463ca-7a94-4414-932d-a69f30fff63b" width="100%" style="max-height: 300px;">
126
+ </video>
127
+
128
+ </td>
129
+ </tr>
130
+ </table>
131
+
132
+
133
+
134
+ ## How it Works
135
+ <p align="center">
136
+ <img src="./docs/img/magenticui_running.png" alt="Magentic-UI" height="400">
137
+ </p>
138
+
139
+ Magentic-UI is especially useful for web tasks that require actions on the web (e.g., filling a form, customizing a food order), deep navigation through websites not indexed by search engines (e.g., filtering flights, finding a link from a personal site) or tasks that need web navigation and code execution (e.g., generate a chart from online data).
140
+
141
+ What differentiates Magentic-UI from other browser use offerings is its transparent and controllable interface that allows for efficient human-in-the-loop involvement. Magentic-UI is built using [AutoGen](https://github.com/microsoft/autogen) and provides a platform to study human-agent interaction and experiment with web agents. Key features include:
142
+
143
+ - 🧑‍🤝‍🧑 **Co-Planning**: Collaboratively create and approve step-by-step plans using chat and the plan editor.
144
+ - 🤝 **Co-Tasking**: Interrupt and guide the task execution using the web browser directly or through chat. Magentic-UI can also ask for clarifications and help when needed.
145
+ - 🛡️ **Action Guards**: Sensitive actions are only executed with explicit user approvals.
146
+ - 🧠 **Plan Learning and Retrieval**: Learn from previous runs to improve future task automation and save them in a plan gallery. Automatically or manually retrieve saved plans in future tasks.
147
+ - 🔀 **Parallel Task Execution**: You can run multiple tasks in parallel and session status indicators will let you know when Magentic-UI needs your input or has completed the task.
148
+
149
+ <div align="center">
150
+ <a href="https://www.youtube.com/watch?v=wOs-5SR8xOc" target="_blank">
151
+ <img src="https://img.youtube.com/vi/wOs-5SR8xOc/maxresdefault.jpg" alt="Watch the demo video" width="600"/>
152
+ </a>
153
+ <br>
154
+ ▶️ <em> Click to watch a video and learn more about Magentic-UI </em>
155
+ </div>
156
+
157
+
158
+ ### Autonomous Evaluation
159
+
160
+ To evaluate its autonomous capabilities, Magentic-UI has been tested against several benchmarks when running with o4-mini: [GAIA](https://huggingface.co/datasets/gaia-benchmark/GAIA) test set (42.52%), which assesses general AI assistants across reasoning, tool use, and web interaction tasks ; [AssistantBench](https://huggingface.co/AssistantBench) test set (27.60%), focusing on realistic, time-consuming web tasks; [WebVoyager](https://github.com/MinorJerry/WebVoyager) (82.2%), measuring end-to-end web navigation in real-world scenarios; and [WebGames](https://webgames.convergence.ai/) (45.5%), evaluating general-purpose web-browsing agents through interactive challenges.
161
+ To reproduce these experimental results, please see the following [instructions](experiments/eval/README.md).
162
+
163
+
164
+
165
+ If you're interested in reading more checkout our [technical report](https://www.microsoft.com/en-us/research/wp-content/uploads/2025/07/magentic-ui-report.pdf) and [blog post](https://www.microsoft.com/en-us/research/blog/magentic-ui-an-experimental-human-centered-web-agent/).
166
+
167
+
168
+ ## Installation
169
+ ### Pre-Requisites
170
+
171
+ **Note**: If you're using Windows, we highly recommend using [WSL2](https://docs.microsoft.com/en-us/windows/wsl/install) (Windows Subsystem for Linux).
172
+
173
+ 1. If running on **Windows** or **Mac** you should use [Docker Desktop](https://www.docker.com/products/docker-desktop/) or if inside WSL2 you can install Docker directly inside WSL [docker in WSL2 guide](https://gist.github.com/dehsilvadeveloper/c3bdf0f4cdcc5c177e2fe9be671820c7). If running on **Linux**, you should use [Docker Engine](https://docs.docker.com/engine/install/).
174
+
175
+ If using Docker Desktop, make sure it is set up to use WSL2:
176
+ - Go to Settings > Resources > WSL Integration
177
+ - Enable integration with your development distro. You can find more detailed instructions about this step [here](https://docs.microsoft.com/en-us/windows/wsl/tutorials/wsl-containers).
178
+
179
+
180
+
181
+ 2. During the Installation step, you will need to set up your `OPENAI_API_KEY`. To use other models, review the [Model Client Configuration](#model-client-configuration) section below.
182
+
183
+ 3. You need at least [Python 3.10](https://www.python.org/downloads/) installed.
184
+
185
+
186
+ If you are on Windows, we recommend to run Magentic-UI inside [WSL2](https://docs.microsoft.com/en-us/windows/wsl/install) (Windows Subsystem for Linux) for correct Docker and file path compatibility.
187
+
188
+
189
+
190
+ ### PyPI Installation
191
+
192
+ Magentic-UI is available on PyPI. We recommend using a virtual environment to avoid conflicts with other packages.
193
+
194
+ ```bash
195
+ python3 -m venv .venv
196
+ source .venv/bin/activate
197
+ pip install magentic-ui
198
+ ```
199
+
200
+ Alternatively, if you use [`uv`](https://docs.astral.sh/uv/getting-started/installation/) for dependency management, you can install Magentic-UI with:
201
+
202
+ ```bash
203
+ uv venv --python=3.12 .venv
204
+ . .venv/bin/activate
205
+ uv pip install magentic-ui
206
+ ```
207
+
208
+
209
+ ### Running Magentic-UI
210
+
211
+ To run Magentic-UI, make sure that Docker is running, then run the following command:
212
+
213
+ ```bash
214
+ magentic-ui --port 8081
215
+ ```
216
+
217
+ >**Note**: Running this command for the first time will pull two docker images required for the Magentic-UI agents. If you encounter problems, you can build them directly with the following command:
218
+ ```bash
219
+ cd docker
220
+ sh build-all.sh
221
+ ```
222
+
223
+ If you face issues with Docker, please refer to the [TROUBLESHOOTING.md](TROUBLESHOOTING.md) document.
224
+
225
+ Once the server is running, you can access the UI at <http://localhost:8081>.
226
+
227
+
228
+
229
+ ### Fara-7B
230
+
231
+ 1) First install magentic-ui with the fara extras:
232
+
233
+ ```bash
234
+ python3 -m venv .venv
235
+ source .venv/bin/activate
236
+ pip install magentic-ui[fara]
237
+ ```
238
+
239
+ 2) In a separate process, serve the Fara-7B model using vLLM:
240
+
241
+ ```bash
242
+ vllm serve "microsoft/Fara-7B" --port 5000 --dtype auto
243
+ ```
244
+
245
+ 3) Create a `fara_config.yaml` file with the following content:
246
+
247
+ ```yaml
248
+ model_config_local_surfer: &client_surfer
249
+ provider: OpenAIChatCompletionClient
250
+ config:
251
+ model: "microsoft/Fara-7B"
252
+ base_url: http://localhost:5000/v1
253
+ api_key: not-needed
254
+ model_info:
255
+ vision: true
256
+ function_calling: true
257
+ json_output: false
258
+ family: "unknown"
259
+ structured_output: false
260
+ multiple_system_messages: false
261
+
262
+ orchestrator_client: *client_surfer
263
+ coder_client: *client_surfer
264
+ web_surfer_client: *client_surfer
265
+ file_surfer_client: *client_surfer
266
+ action_guard_client: *client_surfer
267
+ model_client: *client_surfer
268
+ ```
269
+ Note: if you are hosting vLLM on a different port or host, change the `base_url` accordingly.
270
+
271
+
272
+ Then launch Magentic-UI with the fara agent:
273
+
274
+ ```bash
275
+ magentic-ui --fara --port 8081 --config fara_config.yaml
276
+ ```
277
+
278
+ Finally, navigate to <http://localhost:8081> to access the interface!
279
+
280
+ ### Configuration
281
+
282
+ #### Model Client Configuration
283
+
284
+ If you want to use a different OpenAI key, or if you want to configure use with Azure OpenAI or Ollama, you can do so inside the UI by navigating to settings (top right icon) and changing model configuration. Another option is to pass a yaml config file when you start Magentic-UI which will override any settings in the UI:
285
+
286
+ ```bash
287
+ magentic-ui --port 8081 --config config.yaml
288
+ ```
289
+
290
+ Where the `config.yaml` should look as follows with an AutoGen model client configuration:
291
+
292
+ ```yaml
293
+ gpt4o_client: &gpt4o_client
294
+ provider: OpenAIChatCompletionClient
295
+ config:
296
+ model: gpt-4o-2024-08-06
297
+ api_key: null
298
+ base_url: null
299
+ max_retries: 5
300
+
301
+ orchestrator_client: *gpt4o_client
302
+ coder_client: *gpt4o_client
303
+ web_surfer_client: *gpt4o_client
304
+ file_surfer_client: *gpt4o_client
305
+ action_guard_client: *gpt4o_client
306
+ plan_learning_client: *gpt4o_client
307
+ ```
308
+ You can change the client for each of the agents using the config file and use AzureOpenAI (`AzureOpenAIChatCompletionClient`), Ollama and other clients.
309
+
310
+ #### MCP Server Configuration
311
+
312
+ You can also extend Magentic-UI's capabilities by adding custom "McpAgents" to the multi-agent team. Each McpAgent can have access to one or more MCP Servers. You can specify these agents via the `mcp_agent_configs` parameter in your `config.yaml`.
313
+
314
+ For example, here's an agent called "airbnb_surfer" that has access to the OpenBnb MCP Server running locally via Stdio.
315
+
316
+ ```yaml
317
+ mcp_agent_configs:
318
+ - name: airbnb_surfer
319
+ description: "The airbnb_surfer has direct access to AirBnB."
320
+ model_client:
321
+ provider: OpenAIChatCompletionClient
322
+ config:
323
+ model: gpt-4.1-2025-04-14
324
+ max_retries: 10
325
+ system_message: |-
326
+ You are AirBnb Surfer, a helpful digital assistant that can help users access AirBnB.
327
+
328
+ You have access to a suite of tools provided by the AirBnB API. Use those tools to satisfy the users' requests.
329
+ reflect_on_tool_use: false
330
+ mcp_servers:
331
+ - server_name: AirBnB
332
+ server_params:
333
+ type: StdioServerParams
334
+ command: npx
335
+ args:
336
+ - -y
337
+ - "@openbnb/mcp-server-airbnb"
338
+ - --ignore-robots-txt
339
+ ```
340
+
341
+ Under the hood, each `McpAgent` is just a `autogen_agentchat.agents.AssistantAgent` with the set of MCP Servers exposed as an `AggregateMcpWorkbench` which is simply a named collection of `autogen_ext.tools.mcp.McpWorkbench` objects (one per MCP Server).
342
+
343
+ Currently the supported MCP Server types are `autogen_ext.tools.mcp.StdioServerParams` and `autogen_ext.tools.mcp.SseServerParams`.
344
+
345
+ ### Building Magentic-UI from source
346
+
347
+ This step is primarily for users seeking to make modifications to the code, are having trouble with the pypi installation or want the latest code before a pypi version release.
348
+
349
+ #### 1. Make sure the above prerequisites are installed, and that Docker is running.
350
+
351
+ #### 2. Clone the repository to your local machine:
352
+
353
+ ```bash
354
+ git clone https://github.com/microsoft/magentic-ui.git
355
+ cd magentic-ui
356
+ ```
357
+
358
+ #### 3. Install Magentic-UI's dependencies with uv or your favorite package manager:
359
+
360
+ ```bash
361
+ # install uv through https://docs.astral.sh/uv/getting-started/installation/
362
+ uv venv --python=3.12 .venv
363
+ uv sync --all-extras
364
+ source .venv/bin/activate
365
+ ```
366
+
367
+ #### 4. Build the frontend:
368
+
369
+ First make sure to install node:
370
+
371
+ ```bash
372
+ # install nvm to install node
373
+ curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.1/install.sh | bash
374
+ nvm install node
375
+ ```
376
+
377
+ Then install the frontend:
378
+
379
+ ```bash
380
+ cd frontend
381
+ npm install -g gatsby-cli
382
+ npm install --global yarn
383
+ yarn install
384
+ yarn build
385
+ ```
386
+
387
+ #### 5. Run Magentic-UI, as usual.
388
+
389
+ ```bash
390
+ magentic-ui --port 8081
391
+ ```
392
+
393
+
394
+ #### Running the UI from source
395
+
396
+ If you are making changes to the source code of the UI, you can run the frontend in development mode so that it will automatically update when you make changes for faster development.
397
+
398
+ 1. Open a separate terminal and change directory to the frontend
399
+
400
+ ```bash
401
+ cd frontend
402
+ ```
403
+
404
+ 2. Create a `.env.development` file.
405
+
406
+ ```bash
407
+ cp .env.default .env.development
408
+ ```
409
+
410
+ 3. Launch frontend server
411
+
412
+ ```bash
413
+ npm run start
414
+ ```
415
+
416
+ 4. Then run the UI:
417
+
418
+ ```bash
419
+ magentic-ui --port 8081
420
+ ```
421
+
422
+ The frontend from source will be available at <http://localhost:8000>, and the compiled frontend will be available at <http://localhost:8081>.
423
+
424
+
425
+
426
+
427
+ ## Troubleshooting
428
+
429
+
430
+ If you were unable to get Magentic-UI running, do not worry! The first step is to make sure you have followed the steps outlined above, particularly with the [pre-requisites](#pre-requisites).
431
+
432
+ For common issues and their solutions, please refer to the [TROUBLESHOOTING.md](TROUBLESHOOTING.md) file in this repository. If you do not see your problem there, please open a `GitHub Issue`.
433
+
434
+ ## Contributing
435
+
436
+ This project welcomes contributions and suggestions. For information about contributing to Magentic-UI, please see our [CONTRIBUTING.md](CONTRIBUTING.md) guide, which includes current issues to be resolved and other forms of contributing.
437
+
438
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information, see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
439
+
440
+
441
+ ## Citation
442
+
443
+ Please cite our paper if you use our work in your research:
444
+
445
+ ```
446
+ @article{mozannar2025magentic,
447
+ title={Magentic-UI: Towards Human-in-the-loop Agentic Systems},
448
+ author={Mozannar, Hussein and Bansal, Gagan and Tan, Cheng and Fourney, Adam and Dibia, Victor and Chen, Jingya and Gerrits, Jack and Payne, Tyler and Maldaner, Matheus Kunzler and Grunde-McLaughlin, Madeleine and others},
449
+ journal={arXiv preprint arXiv:2507.22358},
450
+ year={2025}
451
+ }
452
+ ```
453
+
454
+ ## License
455
+
456
+ Microsoft, and any contributors, grant you a license to any code in the repository under the [MIT License](https://opensource.org/licenses/MIT). See the [LICENSE](LICENSE) file.
457
+
458
+ Microsoft, Windows, Microsoft Azure, and/or other Microsoft products and services referenced in the documentation
459
+ may be either trademarks or registered trademarks of Microsoft in the United States and/or other countries.
460
+ The licenses for this project do not grant you rights to use any Microsoft names, logos, or trademarks.
461
+ Microsoft's general trademark guidelines can be found at <http://go.microsoft.com/fwlink/?LinkID=254653>.
462
+
463
+ Any use of third-party trademarks or logos are subject to those third-party's policies.
464
+
465
+ Privacy information can be found at <https://go.microsoft.com/fwlink/?LinkId=521839>
466
+
467
+ Microsoft and any contributors reserve all other rights, whether under their respective copyrights, patents, or trademarks, whether by implication, estoppel, or otherwise.
468
+
469
+ # Dummy change
SECURITY.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
2
+
3
+ ## Security
4
+
5
+ Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
6
+
7
+ If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
8
+
9
+ ## Reporting Security Issues
10
+
11
+ **Please do not report security vulnerabilities through public GitHub issues.**
12
+
13
+ Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
14
+
15
+ If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
16
+
17
+ You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
18
+
19
+ Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20
+
21
+ * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22
+ * Full paths of source file(s) related to the manifestation of the issue
23
+ * The location of the affected source code (tag/branch/commit or direct URL)
24
+ * Any special configuration required to reproduce the issue
25
+ * Step-by-step instructions to reproduce the issue
26
+ * Proof-of-concept or exploit code (if possible)
27
+ * Impact of the issue, including how an attacker might exploit the issue
28
+
29
+ This information will help us triage your report more quickly.
30
+
31
+ If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
32
+
33
+ ## Preferred Languages
34
+
35
+ We prefer all communications to be in English.
36
+
37
+ ## Policy
38
+
39
+ Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
40
+
41
+ <!-- END MICROSOFT SECURITY.MD BLOCK -->
SUPPORT.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Support
2
+
3
+ ## How to file issues and get help
4
+
5
+ This project uses GitHub Issues to track bugs and feature requests. Please search the existing
6
+ issues before filing new issues to avoid duplicates. For new issues, file your bug or
7
+ feature request as a new Issue.
8
+
9
+ For help and questions about using this project, please post questions to GitHub issues, as per
10
+ above, and assign them the label "question".
11
+
12
+ ## Microsoft Support Policy
13
+
14
+ Support for Magentic-UI is limited to the resources listed above.
TRANSPARENCY_NOTE.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Magentic-UI
2
+
3
+ ## OVERVIEW
4
+
5
+ Magentic-UI is a human-centered computer use agent (CUA) **designed for collaboration with people** on web-based tasks. Magentic-UI operates a web browser and other tools, like code execution and file navigation, in real-time while optimizing for human-in-the-loop (HIL) orchestration.
6
+
7
+ ### What Can Magentic-UI Do?
8
+
9
+ Magentic-UI was developed to investigate human-in-the-loop approaches for agentic design with the goals to improve agentic performance and increase user productivity for web tasks. Magentic-UI strongly involves the user throughout the planning and execution phase. Magentic-UI prompts the user to accept a plan before starting execution. Plans can be modified, saved and re-used.
10
+
11
+ ### Intended Uses
12
+
13
+ Magentic-UI is a research prototype best suited to explore, experience and investigate agentic assistance in performing tasks that require web navigation. Magentic-UI should always be used with human supervision.
14
+
15
+ Below are examples of tasks that Magentic-UI can accomplish:
16
+
17
+ - Check the price of a coffee from the closest coffee shops to a certain destination
18
+ - Create a formatted spreadsheet containing the box score statistics of all NBA games that occurred on a certain date
19
+ - Summarize in a report a set of papers each downloaded from a given URL, such as the latest papers from arxiv on a certain topic
20
+
21
+ Magentic-UI is being shared with the research community to foster further research on human-in-the-loop in agentic systems.
22
+
23
+ Magentic-UI is intended to be used by domain experts who are independently capable of evaluating the quality of outputs, safety issues and potential harm before acting on them.
+
+ ### Out-of-scope uses
24
+
25
+ We do not recommend using Magentic-UI in commercial or real-world applications without further testing and development. It is being released for research purposes.
26
+
27
+ Magentic-UI is not well suited for tasks that: rely on audio or video data to process, long-duration tasks (e.g., summarize 100 papers) or tasks that require real-time fast actions like playing online games.
28
+
29
+ Magentic-UI should always be used with a human-in-the-loop. While we support an autonomous version of Magentic-UI in our code for the purposes of evaluation, this version is not included in the interface and should only be used for evaluation purposes and nothing else. We discourage the use of the autonomous version as it does not possess the same safety safeguards as the human-in-the-loop version through the interface and has not undergone the same safety testing.
30
+
31
+ Magentic-UI was not designed or evaluated for all possible downstream purposes. Developers should consider its inherent limitations as they select use cases, and evaluate and mitigate for accuracy, safety, and fairness concerns specific to each intended downstream use.
32
+
33
+ Magentic-UI should not be used in highly regulated domains or high stakes situations where inaccurate outputs could suggest actions that lead to injury or negatively impact an individual's health, legal, and financial, life opportunities or legal status.
34
+
35
+ We do not recommend using Magentic-UI in the context of high-risk decision making (e.g. in law enforcement, legal, finance, or healthcare).
36
+
37
+ ## HOW TO GET STARTED
38
+
39
+ To begin using Magentic-UI, follow instructions at [microsoft/magentic-ui: Magentic-UI](https://github.com/microsoft/magentic-ui)
40
+
41
+ ## EVALUATION
42
+
43
+ Magentic-UI was evaluated on its ability to autonomously solve complex tasks from benchmarks such as GAIA. Magentic-UI autonomously tries to complete these tasks and its final answer is judged with respect to the ground truth answer. To evaluate a human-in-the-loop set-up we also evaluated Magentic-UI with a simulated user with an interactive version of the GAIA benchmark.
44
+
45
+ ### Evaluation Methods
46
+
47
+ We compared the performance of Magentic-UI against [Magentic-One](https://github.com/microsoft/autogen/tree/gaia_multiagent_v01_march_1st/samples/tools/autogenbench/scenarios/GAIA/Templates/Orchestrator) on the [GAIA](https://arxiv.org/abs/2311.12983) benchmark. When running autonomously Magentic-UI shows comparable performance to Magentic-One (which previously achieved sota results on GAIA) and higher accuracy with simulated human-in-the-loop.
48
+
49
+ The model used for evaluation was GPT-4o from Azure OpenAI. Results may vary if Magentic-UI is used with a different model, or when using other models for evaluation, based on their unique design, configuration and training.
50
+
51
+ In addition to robust quality performance testing, Magentic-UI was assessed from a Responsible AI perspective. Based on these results, we implemented mitigations to minimize Magentic-UI's susceptibility to misuse. See details in risks and mitigation section below.
52
+
53
+ ### Evaluation Results
54
+
55
+ At a high level, we found that Magentic-UI performed similarly to [Magentic-One](https://github.com/microsoft/autogen/tree/gaia_multiagent_v01_march_1st/samples/tools/autogenbench/scenarios/GAIA/Templates/Orchestrator) on autonomous task completion and better with simulated human-in-the-loop.
56
+
57
+ ## LIMITATIONS
58
+
59
+ Magentic-UI was developed for research and experimental purposes. Further testing and validation are needed before considering its application in commercial or real-world scenarios.
60
+
61
+ Magentic-UI was designed and tested using the English language. Performance in other languages may vary and should be assessed by someone who is both an expert in the expected outputs and a native speaker of that language.
62
+
63
+ Outputs generated by AI may include factual errors, fabrication, or speculation. Users are responsible for assessing the accuracy of generated content. All decisions leveraging outputs of the system should be made with human oversight and not be based solely on system outputs.
64
+
65
+ Magentic-UI inherits any biases, errors, or omissions produced by the model used. Developers are advised to choose an appropriate base LLM/MLLM carefully, depending on the intended use case.
66
+
67
+ There has not been a systematic effort to ensure that systems using Magentic-UI are protected from security vulnerabilities such as indirect prompt injection attacks. Any systems using it should take proactive measures to harden their systems as appropriate.
68
+
69
+ ## BEST PRACTICES
70
+
71
+ Magentic-UI is a highly capable agent, proficient at interacting with websites, operating over local files, and writing or executing Python code, but like all LLM-based systems, it can and will make mistakes. To safely operate Magentic-UI, always run it within the provided Docker containers, and strictly limit its access to only essential resources — avoid sharing unnecessary files, folders, or logging into websites through the agent. Never share sensitive data you wouldn't confidently send to external providers like Azure or OpenAI. Magentic-UI shares browser screenshots with model providers including all data users choose to enter on websites in Magentic-UI's browser. Ensure careful human oversight by meticulously reviewing proposed actions and monitoring progress before giving approval. Finally, approach its output with appropriate skepticism; Magentic-UI can hallucinate, misattribute sources, or be misled by deceptive or low-quality online content.
72
+
73
+ We strongly encourage users to use LLMs/MLLMs that support robust Responsible AI mitigations, such as Azure Open AI (AOAI) services. Such services continually update their safety and RAI mitigations with the latest industry standards for responsible use. For more on AOAI's best practices when employing foundation models for scripts and applications:
74
+
75
+ - [Blog post on responsible AI features in AOAI that were presented at Ignite 2023](https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/announcing-new-ai-safety-amp-responsible-ai-features-in-azure/ba-p/3983686)
76
+ - [Overview of Responsible AI practices for Azure OpenAI models](https://learn.microsoft.com/en-us/legal/cognitive-services/openai/overview)
77
+ - [Azure OpenAI Transparency Note](<https://learn.microsoft.com/en-us/legal/cognitive-services/openai/transparency-note>)
78
+ - [OpenAI's Usage policies](https://openai.com/policies/usage-policies)
79
+ - [Azure OpenAI's Code of Conduct](https://learn.microsoft.com/en-us/legal/cognitive-services/openai/code-of-conduct)
80
+
81
+ Users are reminded to be mindful of data privacy concerns and are encouraged to review the privacy policies associated with any models and data storage solutions interfacing with Magentic-UI.
82
+
83
+ It is the user's responsibility to ensure that the use of Magentic-UI complies with relevant data protection regulations and organizational guidelines.
84
+
85
+ For benchmarking purposes Magentic-UI has an autonomous mode that deactivates human-in-the-loop components such as co-planning and co-execution. This mode is not accessible through the UI; we strongly encourage limiting its usage to benchmark scenarios.
86
+
87
+ ## RISKS AND MITIGATIONS
88
+
89
+ Human agency and oversight are foundational to Magentic-UI's design. From the ground up, Magentic-UI was created with a human-in-the-loop (HIL) philosophy that places the user in control of agent behavior. Every action Magentic-UI takes -- whether navigating the web, manipulating data, or executing code -- is preceded by a transparent planning phase where the proposed steps are surfaced for review. Plans are only executed with explicit user approval, and users retain the ability to pause, modify, or interrupt the agent at any time. When Magentic-UI encounters a scenario it deems high-impact or non-reversible, such as navigating to a new domain or initiating a potentially risky action, it proactively requests confirmation before proceeding. The user can also configure Magentic-UI to always ask for permission before performing any action. This approach reinforces user autonomy while minimizing unintended or unsafe behavior.
90
+
91
+ One of the key safety features in Magentic-UI is the ability to set a set of allowed websites. The allowed websites represent the set of websites that Magentic-UI can visit without explicit user approval. If Magentic-UI needs to visit a website outside the allowed list, it will ask the user for explicit approval by mentioning the exact URL, the page title and the reason for visiting the website.
92
+
93
+ To address safety and security concerns, Magentic-UI underwent targeted red-teaming to assess its behavior under adversarial and failure scenarios. Such scenarios include cross-site prompt injection attacks where web pages contain malicious instructions distinct from the user's original intents (e.g., to execute risky code, access sensitive files, or perform actions on other websites). It also contains scenarios comparable to phishing, which try to trick Magentic-UI into entering sensitive information, or granting permissions on impostor sites (e.g., a synthetic website that asks Magentic-UI to log in and enter Google credentials to read an article). In our preliminary evaluations, we found that Magentic-UI either refuses to complete the requests, stops to ask the user, or, as a final safety measure, is eventually unable to complete the request due to Docker sandboxing. We have found that this layered approach is effective for thwarting these attacks.
94
+
95
+ Magentic-UI was architected with strong isolation boundaries: every component is sandboxed in separate Docker containers, allowing fine-grained access control to only necessary resources. This effectively shields the host environment from agent activities. Sensitive data such as chat history, user settings, and execution logs are stored locally to preserve user privacy and minimize exposure.
96
+
97
+ Together, these mitigations are intended to reduce misuse risks, promote transparency, and preserve user control at every step. Magentic-UI is not a system that operates behind the scenes; it is a collaborator designed to act *with* the user, not *for* them.
98
+
99
+ ## LICENSE
100
+
101
+ ```
102
+ Magentic-UI is published under MIT License.
103
+ Copyright (c) Microsoft Corporation.
104
+
105
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
106
+ this software and associated documentation files (the "Software"), to deal in
107
+ the Software without restriction, including without limitation the rights to
108
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
109
+ of the Software, and to permit persons to whom the Software is furnished to do
110
+ so, subject to the following conditions:
111
+
112
+ The above copyright notice and this permission notice shall be included in all
113
+ copies or substantial portions of the Software.
114
+
115
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
116
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
117
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
118
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
119
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
120
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
121
+ THE SOFTWARE.
122
+ ```
123
+
124
+ ## CONTACT
125
+
126
+ We welcome feedback and collaboration from our audience. If you have suggestions, questions, or observe unexpected/offensive behavior in our technology, please contact us at [magui@service.microsoft.com](mailto:magui@service.microsoft.com)
127
+
128
+ If the team receives reports of undesired behavior or identifies issues independently, we will update this repository with appropriate mitigations.
TROUBLESHOOTING.md ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ⚠️ TROUBLESHOOTING
2
+
3
+ This document lists common issues users have encountered with Magentic-UI and how to resolve them.
4
+
5
+
6
+ ## 1. 🐳 Docker Not Detected / 🚫 Podman Not Supported
7
+
8
+ **Error:**
9
+ `Checking if Docker is running...Failed`
10
+ `Docker is not running. Please start Docker and try again.`
11
+
12
+ **Solution:**
13
+ - Magentic-UI requires Docker Desktop (Windows/Mac) or Docker Engine (Linux).
14
+ - Podman and other container engines are **not supported**.
15
+ - Make sure Docker is installed and running.
16
+
17
+ One possible fix for Mac/Ubuntu (especially if using Colima) is by setting environment variable for DOCKER_HOST ([see issue 81](https://github.com/microsoft/magentic-ui/issues/81), thank you to serproqnx) to point to docker.sock:
18
+
19
+ ```bash
20
+ export DOCKER_HOST=unix:///home/<your-username>/.docker/desktop/docker.sock
21
+ ```
22
+ **Note**: you might have to adjust the path to point to the correct location of docker.sock, as suggested by KangEn1997 in [issue 137](https://github.com/microsoft/magentic-ui/issues/137).
23
+
24
+ This can resolve issues where the Docker SDK cannot automatically detect the Docker socket location.
25
+
26
+ Another possible fix on Ubuntu if docker is not running is to make sure your user is in the 'docker' group or run with sudo.
27
+
28
+ Please read [Linux post-installation steps for Docker Engine
29
+ ](https://docs.docker.com/engine/install/linux-postinstall/) for more information.
30
+
31
+ ## 2. 🚪 Port 8081 Fails to Start
32
+
33
+ **Error:**
34
+ `Port 8081 failed to start` or `Address already in use`
35
+
36
+ **Solution:**
37
+ - Make sure port 8081 is not being used by another application.
38
+ - You can change the port with `magentic ui --port <another_port>`.
39
+
40
+ ## 3. 🏗️ Docker Image Pull Fails
41
+
42
+ **Error:**
43
+ `Pulling docker image...Failed` or similar
44
+
45
+ **Solution:**
46
+ - Make sure you have a stable internet connection.
47
+ - Update Docker to the latest version.
48
+ - Check that you have enough disk space.
49
+ - Try building the images manually:
50
+ ```bash
51
+ cd docker
52
+ sh build-all.sh
53
+ ```
54
+
55
+ ## 4. 🪟 WSL2 Not Set Up on Windows
56
+
57
+ **Error:**
58
+ `Docker is not running` or `WSL2 required`
59
+
60
+ **Solution:**
61
+ - Follow the [For Windows Users](#for-windows-users) section in the README.
62
+ - Ensure Docker Desktop is configured to use WSL2.
63
+ - Go to Settings > Resources > WSL Integration
64
+ - Enable integration with your WSL distro
65
+
66
+
67
+ ## 5. 🖥️ Browser Cannot Be Operated
68
+
69
+ **Symptoms:**
70
+ - UI loads, but browser window is blank or unresponsive.
71
+
72
+ **Solution:**
73
+ - Make sure Docker containers are running (`docker ps`).
74
+ - Check firewall settings and ensure required ports are open.
75
+ - Try restarting Docker and Magentic-UI.
76
+
77
+ ## 6. 🏔️ Alpine Linux Compatibility
78
+
79
+ **Issue:**
80
+ - Magentic-UI is not tested on Alpine Linux. Use Ubuntu or Debian for best results.
81
+
82
+ ## 7. 🌐 Running on Remote Servers
83
+
84
+ **Issue:**
85
+ - UI is not accessible remotely, or browser does not work.
86
+
87
+ **Solution:**
88
+ - Make sure ports are open and forwarded correctly.
89
+ - Check firewall and security group settings.
90
+
91
+ ## 8. 🟪 Magentic Command Not Found
92
+
93
+ **Issue:**
94
+ - Command not found: Magentic
95
+ ```bash
96
+ magentic ui --port 8081
97
+ zsh: command not found: magentic
98
+ ```
99
+
100
+ **Solution 1**:
101
+
102
+ - Make sure you have activated your virtual environment.
103
+ - You can double check by reactivating it and then running the command again:
104
+
105
+ ```bash
106
+ deactivate
107
+ source .venv/bin/activate
108
+ magentic ui --port 8081
109
+ ```
110
+
111
+ **Solution 2**:
112
+
113
+ - You may have accidentally installed the package named `magentic` instead of ours `magentic-ui`
114
+ - Make sure you are running the following command:
115
+
116
+ ```bash
117
+ pip install magentic-ui
118
+ ```
119
+
120
+
121
+
122
+ ## 9. ❓ Still Having Issues?
123
+
124
+ - Double-check all [pre-requisites](#pre-requisites-please-read) in the README.
125
+ - Search [GitHub Issues](https://github.com/microsoft/magentic-ui/issues) for similar problems.
126
+ - Open a new issue and include:
127
+ 1. A detailed description of your problem
128
+ 2. Information about your system (OS, Docker version, etc.)
129
+ 3. Steps to replicate the issue (if possible)
130
+
131
+ ---
132
+
133
+ If you have suggestions for this document or find a solution not listed, please submit a pull request! 🙏
docs/img/magenticui.jpg ADDED
docs/img/magenticui_running.png ADDED

Git LFS Details

  • SHA256: 36317426835c71bfa6e6f3d955dd1cf287d52fa3ef9cc6dccb8f94ede271bd2e
  • Pointer size: 131 Bytes
  • Size of remote file: 583 kB
docs/img/magui-actionguard.png ADDED

Git LFS Details

  • SHA256: 6d5a125e6ae8fcf10daf9e03b3ebe496241812d07e6f6d873f51ed62270fe5e5
  • Pointer size: 131 Bytes
  • Size of remote file: 448 kB
docs/img/magui-coplanning.png ADDED

Git LFS Details

  • SHA256: 36654c585575deccf0bedfdec9495bc5cde4ff3e3d1c4f0020051e74ea4742b4
  • Pointer size: 131 Bytes
  • Size of remote file: 169 kB
docs/img/magui-cotasking.png ADDED

Git LFS Details

  • SHA256: a5db7e727ccb7fb01b2494a7e14d79ff80ad2abf79b28b2f6a856be90fe9d5e4
  • Pointer size: 132 Bytes
  • Size of remote file: 1.01 MB
docs/img/magui-landing.png ADDED

Git LFS Details

  • SHA256: fa83d0a19420c2b918cf6663857b9add1c4db9bcce72eaacc0adbe700478b8e5
  • Pointer size: 131 Bytes
  • Size of remote file: 151 kB
docs/img/magui-readme-logo.png ADDED

Git LFS Details

  • SHA256: 0c97b6028d0437dcd4c8fadd335cf892b813fadcd7cbe8bacfa1239822f429fd
  • Pointer size: 131 Bytes
  • Size of remote file: 309 kB
docs/img/magui-readme-logo.svg ADDED
docs/index.html ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Magentic-UI</title>
7
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/github-markdown-css/5.4.0/github-markdown-light.min.css">
8
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/styles/github.min.css">
9
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/marked/9.1.6/marked.min.js"></script>
10
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.9.0/highlight.min.js"></script>
11
+ <style>
12
+ body {
13
+ margin: 0;
14
+ padding: 0;
15
+ background-color: #ffffff;
16
+ }
17
+ .markdown-body {
18
+ box-sizing: border-box;
19
+ min-width: 200px;
20
+ max-width: 980px;
21
+ margin: 0 auto;
22
+ padding: 45px;
23
+ }
24
+ @media (max-width: 767px) {
25
+ .markdown-body {
26
+ padding: 15px;
27
+ }
28
+ }
29
+ .loading {
30
+ text-align: center;
31
+ padding: 50px;
32
+ color: #666;
33
+ }
34
+ .error {
35
+ text-align: center;
36
+ padding: 50px;
37
+ color: #d73a49;
38
+ }
39
+ /* GitHub-style header */
40
+ .header {
41
+ background-color: #24292f;
42
+ color: white;
43
+ padding: 16px 0;
44
+ margin-bottom: 32px;
45
+ }
46
+ .header-content {
47
+ max-width: 980px;
48
+ margin: 0 auto;
49
+ padding: 0 45px;
50
+ display: flex;
51
+ align-items: center;
52
+ gap: 16px;
53
+ }
54
+ .header h1 {
55
+ margin: 0;
56
+ font-size: 20px;
57
+ font-weight: 600;
58
+ }
59
+ .header a {
60
+ color: #7d8590;
61
+ text-decoration: none;
62
+ }
63
+ .header a:hover {
64
+ color: white;
65
+ }
66
+ @media (max-width: 767px) {
67
+ .header-content {
68
+ padding: 0 15px;
69
+ }
70
+ }
71
+ </style>
72
+ </head>
73
+ <body>
74
+ <div class="header">
75
+ <div class="header-content">
76
+ <h1>Magentic-UI</h1>
77
+ <span>•</span>
78
+ <a href="https://github.com/microsoft/magentic-ui">View on GitHub</a>
79
+ <span>•</span>
80
+ <a href="https://github.com/microsoft/magentic-ui/releases">Releases</a>
81
+ </div>
82
+ </div>
83
+
84
+ <article class="markdown-body">
85
+ <div id="loading" class="loading">Loading README...</div>
86
+ <div id="content" style="display: none;"></div>
87
+ <div id="error" class="error" style="display: none;">
88
+ <h2>Error loading README</h2>
89
+ <p>Please visit the <a href="https://github.com/microsoft/magentic-ui">GitHub repository</a> to view the latest content.</p>
90
+ </div>
91
+ </article>
92
+
93
+ <script>
94
+ // Configure marked options
95
+ marked.setOptions({
96
+ highlight: function(code, lang) {
97
+ if (lang && hljs.getLanguage(lang)) {
98
+ return hljs.highlight(code, { language: lang }).value;
99
+ } else {
100
+ return hljs.highlightAuto(code).value;
101
+ }
102
+ },
103
+ breaks: true,
104
+ gfm: true
105
+ });
106
+
107
+ // Fetch and display README content
108
+ fetch('https://raw.githubusercontent.com/microsoft/magentic-ui/main/README.md')
109
+ .then(response => {
110
+ if (!response.ok) {
111
+ throw new Error('Failed to fetch README');
112
+ }
113
+ return response.text();
114
+ })
115
+ .then(markdown => {
116
+ // Fix relative image paths to point to GitHub
117
+ let fixedMarkdown = markdown
118
+ // Fix Markdown image syntax: ![alt](path)
119
+ .replace(/!\[([^\]]*)\]\((?!https?:\/\/)([^)]+)\)/g, '![$1](https://raw.githubusercontent.com/microsoft/magentic-ui/main/$2)')
120
+ // Fix HTML img tags: <img src="path">
121
+ .replace(/<img([^>]+)src="(?!https?:\/\/)([^"]+)"/g, '<img$1src="https://raw.githubusercontent.com/microsoft/magentic-ui/main/$2"')
122
+ // Convert GitHub video URLs to embedded video players
123
+ .replace(/https:\/\/github\.com\/user-attachments\/assets\/([a-f0-9-]+)/g,
124
+ '<video controls width="100%" style="max-width: 800px;"><source src="https://github.com/user-attachments/assets/$1" type="video/mp4">Your browser does not support the video tag.</video>');
125
+
126
+ const html = marked.parse(fixedMarkdown);
127
+ document.getElementById('loading').style.display = 'none';
128
+ document.getElementById('content').style.display = 'block';
129
+ document.getElementById('content').innerHTML = html;
130
+
131
+ // Initialize syntax highlighting
132
+ hljs.highlightAll();
133
+ })
134
+ .catch(error => {
135
+ console.error('Error:', error);
136
+ document.getElementById('loading').style.display = 'none';
137
+ document.getElementById('error').style.display = 'block';
138
+ });
139
+ </script>
140
+ </body>
141
+ </html>
docs/tutorials/web_agent_tutorial_full.ipynb ADDED
@@ -0,0 +1,1782 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Tutorial: Building a Browser Use Agent From Scratch and with Magentic-UI\n"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "\n",
15
+ "You might have seen cool video demos online of AI agents taking control of a computer or a browser to perform tasks. This is a new category of agents referred to as Computer-Use-Agents (CUA) or Browser-Use-Agents (BUA). Examples of such CUA/BUA agents include [OpenAI's Operator](https://openai.com/index/introducing-operator/), [Claude Computer Use Model](https://docs.anthropic.com/en/docs/agents-and-tools/tool-use/computer-use-tool), [AutoGen's MultiModalWebSurfer](https://microsoft.github.io/autogen/stable/reference/python/autogen_ext.agents.web_surfer.html), [Adept AI](https://www.adept.ai/blog/act-1), [Google's Project Mariner](https://deepmind.google/models/project-mariner/) and [Browser-Use](https://github.com/browser-use/browser-use/tree/main) among many others.\n",
16
+ "\n",
17
+ "\n",
18
+ "## What is a Computer Use Agent?\n",
19
+ "\n",
20
+ "**Definition**: A computer or browser use agent is an agent that given a task, e.g., \"order a shawarma sandwich from BestShawarma for pickup now\", can programmatically control a computer or browser to autonomously complete the task. By \"control a browser\" we mean interacting with the browser in a similar way to how a human might control the browser: clicking on buttons, typing in fields, scrolling and so on. Note that a tool-use language model agent could complete this food ordering task if it had access to the restaurant API for instance, this would not make it a CUA agent as it is not _interacting_ with the browser to complete the task.\n",
21
+ "\n",
22
+ "To make this distinction more clear, here is another example task.\n",
23
+ "Suppose we wanted to find the list of available Airbnbs in Miami from 6/18 to 6/20 for 2 guests.\n",
24
+ "\n",
25
+ "![airbnb_sc.png](airbnb_sc.png)\n",
26
+ "\n",
27
+ "How would a browser use agent solve this task:\n",
28
+ "\n",
29
+ "- **Step 1:** Visit airbnb.com\n",
30
+ "- **Step 2:** Type \"Miami\" in the \"Where\" input box\n",
31
+ "- **Step 3:** Select \"6/18\" in the \"Check in\" date box\n",
32
+ "- **Step 4:** Select \"6/20\" in the \"Check out\" date box\n",
33
+ "- **Step 5:** Click on the \"Who\" button\n",
34
+ "- **Step 6:** Click \"+\" twice to add two guests\n",
35
+ "- **Step 7:** Click \"Search\" button\n",
36
+ "- **Step 8:** Summarize and extract listings from the webpage\n",
37
+ "\n",
38
+ "On the other hand, suppose we had an API for Airbnb that looks like: `find_listings(location, check_in, check_out, guests)`\n",
39
+ "\n",
40
+ "Then a tool-call agent would first need to generate a tool call: `find_listings(\"Miami\", 6/18, 6/20, 2)` and read out the result of the tool call.\n",
41
+ "\n",
42
+ "Clearly if we had an API for every website and everything on the computer, then it would be much simpler to perform this task. _But that is not the case currently_, many interfaces on the web cannot be accessed by an API and so the only way is through interacting with the website directly. While future interfaces might become more directly accessible to agents via APIs and MCP servers, for now we need to perform direct manipulation with the websites.\n",
43
+ "\n",
44
+ "## What Does This Tutorial Cover?\n",
45
+ "\n",
46
+ "In this tutorial, we will cover how to build a basic browser-use agent. The goal of this tutorial is to demystify such agents and show how we can build a simple version of them. The only thing we need is access to a large language model (LLM) that can perform tool calling or structured JSON outputs (GPT-4o, Qwen2.5-VL, Llama 3.1, ...). The LLM does not need to be vision capable, but a model capable of taking image input would improve performance significantly. The LLM also does not need to be trained previously for browser-use, out of the box LLMs can be turned into semi-capable browser-use agents following the recipe in this tutorial. At the end of the tutorial we will discuss further directions.\n",
47
+ "\n",
48
+ "We will cover three levels of building your browser use agent:\n",
49
+ "\n",
50
+ "- Level 1: From scratch using only the `playwright` python package.\n",
51
+ "- Level 2: Using helpers from the `magentic-ui` package which simplifies building your agent.\n",
52
+ "- Level 3: Using the WebSurfer Agent from the `magentic-ui` package directly.\n"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "markdown",
57
+ "metadata": {},
58
+ "source": [
59
+ "# Tutorial Prerequisites\n",
60
+ "\n"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "markdown",
65
+ "metadata": {},
66
+ "source": [
67
+ "\n",
68
+ "You will need Python >3.10 to run this tutorial and the `magentic-ui` package. [Magentic-UI](https://github.com/microsoft/magentic-ui/tree/main) is a research prototype from Microsoft of a human-centered agentic interface. In this tutorial we will be using utilities and helpers from that package without using the Magentic-UI application itself.\n",
69
+ "\n",
70
+ "We recommend using a virtual environment to avoid conflicts with other packages.\n",
71
+ "\n",
72
+ "```bash\n",
73
+ "python3 -m venv .venv\n",
74
+ "source .venv/bin/activate\n",
75
+ "pip install magentic-ui\n",
76
+ "```\n",
77
+ "\n",
78
+ "Alternatively, if you use [`uv`](https://docs.astral.sh/uv/getting-started/installation/) for dependency management, you can install Magentic-UI with:\n",
79
+ "\n",
80
+ "```bash\n",
81
+ "uv venv --python=3.12 .venv\n",
82
+ ". .venv/bin/activate\n",
83
+ "uv pip install magentic-ui\n",
84
+ "```\n",
85
+ "\n",
86
+ "We also need to install the browsers that our agent will control with playwright:\n",
87
+ "\n",
88
+ "```bash\n",
89
+ "playwright install --with-deps chromium\n",
90
+ "```\n",
91
+ "\n",
92
+ "The other thing you need to set up is your LLM. The easiest way to follow this tutorial is to obtain an OpenAI API key and set it as an environment variable:\n",
93
+ "\n",
94
+ "```bash\n",
95
+ "export OPENAI_API_KEY=<YOUR API KEY>\n",
96
+ "```\n",
97
+ "\n",
98
+ "You can also use any open source model with [Ollama](https://ollama.com/) if you have a capable GPU at your disposal. We will be covering both using OpenAI and Ollama."
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "markdown",
103
+ "metadata": {},
104
+ "source": [
105
+ "# Level 1: Building a Browser Use Agent From Scratch\n",
106
+ "\n",
107
+ "For this level of building our browser use agent, we will only need the `playwright` and `openai` packages which are included in `magentic-ui` package.\n"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "markdown",
112
+ "metadata": {},
113
+ "source": [
114
+ "## Step 1: Launching a Browser\n",
115
+ "\n",
116
+ "The first step is to launch the browser that our agent will control. We will be using the [Playwright](https://github.com/microsoft/playwright-python) library that provides an API to control browsers.\n",
117
+ "\n",
118
+ "We can launch the browser in headless mode (we cannot see the actual browser on our machine) or non-headless where the browser will be launched locally.\n"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": null,
124
+ "metadata": {},
125
+ "outputs": [],
126
+ "source": [
127
+ "from playwright.async_api import async_playwright\n",
128
+ "\n",
129
+ "headless = False # Change to True to run the browser in headless mode\n",
130
+ "\n",
131
+ "# Launch and keep browser running\n",
132
+ "p = await async_playwright().start()\n",
133
+ "browser = await p.chromium.launch(headless=headless)\n",
134
+ "# context is the browser window\n",
135
+ "context = await browser.new_context()\n",
136
+ "# page is the tab in the browser\n",
137
+ "page = await context.new_page()\n",
138
+ "print(\"Browser launched!\")"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "markdown",
143
+ "metadata": {},
144
+ "source": [
145
+ "At this point you should see a browser launched locally, it will be pointing at a blank page:\n",
146
+ "\n",
147
+ "![blank_page.png](blank_page.png)\n"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "markdown",
152
+ "metadata": {},
153
+ "source": [
154
+ "We can use the Playwright API to interact with this browser; for instance, let us navigate to the Bing homepage (give it a few seconds).\n"
155
+ ]
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": null,
160
+ "metadata": {},
161
+ "outputs": [],
162
+ "source": [
163
+ "await page.goto(\"https://www.bing.com\")\n",
164
+ "print(\"Navigated to Bing homepage\")"
165
+ ]
166
+ },
167
+ {
168
+ "cell_type": "markdown",
169
+ "metadata": {},
170
+ "source": [
171
+ "## Step 2: Represent the browser for the Agent using Set-of-Mark Prompting.\n",
172
+ "\n",
173
+ "Our next challenge is how do we feed the browser as input to our agent so that it is able to perform actions on it.\n",
174
+ "\n",
175
+ "Using Playwright we can first take a screenshot of the browser as well as extract the text on the page.\n"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": null,
181
+ "metadata": {},
182
+ "outputs": [],
183
+ "source": [
184
+ "from IPython.display import Image, display\n",
185
+ "\n",
186
+ "# Take a screenshot and store it in memory\n",
187
+ "screenshot_bytes = await page.screenshot()\n",
188
+ "\n",
189
+ "# Display the screenshot\n",
190
+ "display(Image(screenshot_bytes))\n",
191
+ "\n",
192
+ "# Get all the text on the page and print first 10 lines\n",
193
+ "text = await page.evaluate(\"() => document.body.innerText\")\n",
194
+ "print(\"\\nFirst 10 lines of text content:\")\n",
195
+ "print(\"\\n\".join(text.split(\"\\n\")[:10]))"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "markdown",
200
+ "metadata": {},
201
+ "source": [
202
+ "Now how do we get our agent to type in the search box and press search?\n",
203
+ "\n",
204
+ "The key is to extract all **interactive elements** in the page using Playwright. By interactive elements we mean elements on the page we can interact with including buttons, text boxes, dropdown menus among others. Each interactive element will have an ID that we can track on the page and if it is a visible element it will have the coordinates of the bounding box of the element. We will also only look at the interactive elements that are currently visible in the current viewport; some elements might be out of view and we'd need to scroll down to view them. For simplicity, we will ignore these elements and give our agent the ability to scroll down to view them later on.\n"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": null,
210
+ "metadata": {},
211
+ "outputs": [],
212
+ "source": [
213
+ "from dataclasses import dataclass\n",
214
+ "from playwright.async_api import Page\n",
215
+ "\n",
216
+ "\n",
217
+ "# A class to represent an interactive element on the page\n",
218
+ "@dataclass\n",
219
+ "class Element:\n",
220
+ " id: int # The id of the element\n",
221
+ " aria_label: (\n",
222
+ " str # The aria-label attribute is used to provide a label for an element\n",
223
+ " )\n",
224
+ " type: str # The type of the element\n",
225
+ " bbox: dict # The bounding box of the element\n",
226
+ " text: str # The text content of the element\n",
227
+ "\n",
228
+ "\n",
229
+ "# We will now go over the page and extract all interactive elements\n",
230
+ "# We will also add a data attribute to the element with the element ID for later reference\n",
231
+ "async def get_interactive_elements(page: Page) -> list[Element]:\n",
232
+ " elements: list[Element] = []\n",
233
+ " # Viewport size is a dict with keys 'width' and 'height'\n",
234
+ " viewport_size = page.viewport_size\n",
235
+ " print(f\"Viewport size: {viewport_size}\")\n",
236
+ "\n",
237
+ " # For simplicity, we will only look at buttons, textboxes, and links. We can add more roles later on.\n",
238
+ " interactive_roles = [\"button\", \"textbox\", \"link\"]\n",
239
+ " i = 0\n",
240
+ " for role in interactive_roles:\n",
241
+ " print(f\"Getting {role} elements...\")\n",
242
+ " # We will use the Playwright API to get all elements with the given role\n",
243
+ " elements_with_role = await page.get_by_role(role).all()\n",
244
+ " for element in elements_with_role:\n",
245
+ " # Check if element is visible and in current viewport\n",
246
+ " bbox = await element.bounding_box()\n",
247
+ " if bbox: # Element is visible if it has a bounding box\n",
248
+ " # Check if element is in current viewport (not scrolled out of view)\n",
249
+ " if 0 <= bbox[\"y\"] <= viewport_size[\"height\"]:\n",
250
+ " # Set a data attribute with the element ID for later reference\n",
251
+ " await element.evaluate(f\"el => el.setAttribute('data-element-id', '{i}')\")\n",
252
+ " elements.append(\n",
253
+ " Element(\n",
254
+ " id=i,\n",
255
+ " aria_label=await element.get_attribute(\"aria-label\")\n",
256
+ " or await element.get_attribute(\"aria-role\")\n",
257
+ " or \"\",\n",
258
+ " type=role,\n",
259
+ " bbox=bbox,\n",
260
+ " text=await element.text_content() or \"\",\n",
261
+ " )\n",
262
+ " )\n",
263
+ " i += 1\n",
264
+ " print(f\"Found {len(elements)} visible interactive elements in current viewport:\")\n",
265
+ " return elements\n",
266
+ "\n",
267
+ "\n",
268
+ "elements = await get_interactive_elements(page)\n",
269
+ "formatted_list_of_elements = \"\\n\".join(\n",
270
+ " [f\"Element {i}: {element}\" for i, element in enumerate(elements)]\n",
271
+ ")\n",
272
+ "print(formatted_list_of_elements)"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "markdown",
277
+ "metadata": {},
278
+ "source": [
279
+ "The first question is how do we identify the search box element on the Bing page given these elements?\n",
280
+ "We can try to figure this out by reading the list of elements; we can see that it is likely to be Element 19:\n",
281
+ "\n",
282
+ "Element(id=19, aria_label='0 characters out of 2000', type='textbox', bbox={'x': 193, 'y': 158, 'width': 843, 'height': 22}, text='')\n",
283
+ "\n",
284
+ "As this is the only textbox or searchbox element on the page.\n"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": null,
290
+ "metadata": {},
291
+ "outputs": [],
292
+ "source": [
293
+ "# find the search box\n",
294
+ "search_box_id = None\n",
295
+ "for element in elements:\n",
296
+ " if element.type == \"textbox\":\n",
297
+ " search_box_id = element.id\n",
298
+ " break\n",
299
+ "print(f\"Search box id: {search_box_id}\")"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "markdown",
304
+ "metadata": {},
305
+ "source": [
306
+ "However, we also have access to the page screenshot and the coordinates of each element. A neat idea would be to superimpose the bounding boxes on top of the screenshot to better understand what each element is. This technique is called Set-of-Mark Prompting (SoM) coined by Yang, Jianwei, et al. [1] to improve visual grounding.\n",
307
+ "\n",
308
+ "[1]: Yang, Jianwei, et al. \"Set-of-mark prompting unleashes extraordinary visual grounding in gpt-4v.\" arXiv preprint arXiv:2310.11441 (2023). https://arxiv.org/pdf/2310.11441\n",
309
+ "\n",
310
+ "We will now implement a simplified version of SoM prompting:\n"
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": null,
316
+ "metadata": {},
317
+ "outputs": [],
318
+ "source": [
319
+ "from PIL import Image, ImageDraw\n",
320
+ "import io\n",
321
+ "\n",
322
+ "\n",
323
+ "def get_som_screenshot(screenshot_bytes: bytes, elements: list[Element]) -> Image.Image:\n",
324
+ " screenshot = Image.open(io.BytesIO(screenshot_bytes))\n",
325
+ "\n",
326
+ " # Create a drawing object\n",
327
+ " draw = ImageDraw.Draw(screenshot)\n",
328
+ "\n",
329
+ " # Draw bounding boxes and element IDs for each element\n",
330
+ " for element in elements:\n",
331
+ " bbox = element.bbox\n",
332
+ " x = bbox[\"x\"]\n",
333
+ " y = bbox[\"y\"]\n",
334
+ " width = bbox[\"width\"]\n",
335
+ " height = bbox[\"height\"]\n",
336
+ "\n",
337
+ " # Draw rectangle\n",
338
+ " draw.rectangle([(x, y), (x + width, y + height)], outline=\"red\", width=2)\n",
339
+ "\n",
340
+ " # Draw element ID\n",
341
+ " draw.text((x, y - 15), f\"{element.id}\", fill=\"red\")\n",
342
+ "\n",
343
+ " # Display the annotated screenshot\n",
344
+ " display(screenshot)\n",
345
+ " som_screenshot = screenshot.copy()\n",
346
+ " return som_screenshot\n",
347
+ "\n",
348
+ "\n",
349
+ "screenshot_bytes = await page.screenshot()\n",
350
+ "som_screenshot = get_som_screenshot(screenshot_bytes, elements)"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "markdown",
355
+ "metadata": {},
356
+ "source": [
357
+ "This confirms what we previously found that Element with id=19 is in fact the searchbox!\n",
358
+ "\n",
359
+ "Let us now wrap what we just did in a helper function to prepare the page to be used as input to our agent:\n"
360
+ ]
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": 99,
365
+ "metadata": {},
366
+ "outputs": [],
367
+ "source": [
368
+ "async def prepare_page_for_agent(page: Page) -> tuple[str, str, Image.Image]:\n",
369
+ " \"\"\"\n",
370
+ " Prepare the page for the agent.\n",
371
+ " Returns:\n",
372
+ " tuple[str, str, Image.Image]: The page text, the formatted list of elements, and the screenshot with bounding boxes.\n",
373
+ " \"\"\"\n",
374
+ " page_text = await page.evaluate(\"() => document.body.innerText\")\n",
375
+ " elements = await get_interactive_elements(page)\n",
376
+ " screenshot_bytes = await page.screenshot()\n",
377
+ " som_screenshot = get_som_screenshot(screenshot_bytes, elements)\n",
378
+ "\n",
379
+ " formatted_list_of_elements = \"\\n\".join(\n",
380
+ " [f\"Element {i}: {element}\" for i, element in enumerate(elements)]\n",
381
+ " )\n",
382
+ "\n",
383
+ " return page_text, formatted_list_of_elements, som_screenshot\n",
384
+ "\n",
385
+ "\n",
386
+ "# page_text, formatted_list_of_elements, screenshot = await prepare_page_for_agent(page)"
387
+ ]
388
+ },
389
+ {
390
+ "cell_type": "markdown",
391
+ "metadata": {},
392
+ "source": [
393
+ "## Step 3: Define Agent Action Space\n",
394
+ "\n",
395
+ "Now that we have established how to represent the browser state for our Agent, it's time to define our Agent architecture. This section will cover the action space and execution flow that enables our agent to perform tasks using the browser.\n"
396
+ ]
397
+ },
398
+ {
399
+ "cell_type": "markdown",
400
+ "metadata": {},
401
+ "source": [
402
+ "\n",
403
+ "### Action Space Definition\n",
404
+ "\n",
405
+ "Our web agent operates with a carefully designed set of actions that cover the fundamental browser interactions needed for most web automation tasks:\n",
406
+ "\n",
407
+ "- **`goto(url)`**: Navigate to a specific URL\n",
408
+ "- **`click(id)`**: Click on an element identified by its ID\n",
409
+ "- **`type(id, text)`**: Input text into a form field or text element by ID\n",
410
+ "- **`scroll(direction)`**: Scroll the page vertically (up/down)\n",
411
+ "- **`stop_action(final_answer)`**: Complete the task and return the final result\n",
412
+ "\n",
413
+ "_Note: This is a simplified action set designed for our initial prototype. It can be extended with additional actions like hover, select, wait, etc., as needed._\n",
414
+ "\n",
415
+ "### Agent Architecture Flow\n",
416
+ "\n",
417
+ "The following diagram illustrates how our web agent processes user queries and executes actions:\n",
418
+ "\n",
419
+ "```mermaid\n",
420
+ "flowchart TD\n",
421
+ " A[\"Input: User Query\"] --> B[\"Initialize Agent\"]\n",
422
+ " B --> C[\"Capture Current Page State\"]\n",
423
+ " C --> D[\"Analyze Page & Query\"]\n",
424
+ " D --> E{Action Decision}\n",
425
+ " E -->|goto| F[\"Navigate to URL\"]\n",
426
+ " E -->|click| G[\"Click Element by ID\"]\n",
427
+ " E -->|type| H[\"Type Text in Element\"]\n",
428
+ " E -->|scroll| I[\"Scroll Page\"]\n",
429
+ " E -->|stop_action| J[\"Return Final Answer\"]\n",
430
+ " F --> K[\"Execute Action\"]\n",
431
+ " G --> K\n",
432
+ " H --> K\n",
433
+ " I --> K\n",
434
+ " K --> C\n",
435
+ " J --> L[\"Output: Final Answer\"]\n",
436
+ "\n",
437
+ " style A fill:#e1f5fe\n",
438
+ " style L fill:#e8f5e8\n",
439
+ " style E fill:#fff3e0\n",
440
+ " style J fill:#ffebee\n",
441
+ "```\n",
442
+ "\n",
443
+ "### Execution Flow Details\n",
444
+ "\n",
445
+ "1. **Input Processing**: The agent receives a user query describing the desired task\n",
446
+ "2. **State Capture**: Current browser page state is captured and processed\n",
447
+ "3. **Action Selection**: Based on the analysis, one of the five actions is chosen\n",
448
+ "4. **Execution**: The selected action is executed in the browser. We append the feedback of the action into the chat history.\n",
449
+ "5. **Loop Continuation**: The process repeats until `stop_action` is triggered\n",
450
+ "\n",
451
+ "The agent continues this loop until it determines the task is complete, at which point it executes `stop_action` with the final answer.\n"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "markdown",
456
+ "metadata": {},
457
+ "source": [
458
+ "Our first step is to create the prompt template for the model to decide on the correct action. Instead of using tool calling to decide on the action, we will use JSON outputs for simplicity.\n"
459
+ ]
460
+ },
461
+ {
462
+ "cell_type": "code",
463
+ "execution_count": 100,
464
+ "metadata": {},
465
+ "outputs": [],
466
+ "source": [
467
+ "AGENT_PROMPT = \"\"\"\n",
468
+ "You are a helpful assistant that can navigate a web page and perform actions on it.\n",
469
+ "\n",
470
+ "The task we are trying to complete is:\n",
471
+ "{task}\n",
472
+ "\n",
473
+ "The current visible text on the page is:\n",
474
+ "{page_text}\n",
475
+ "\n",
476
+ "The current visible elements on the page are:\n",
477
+ "{formatted_list_of_elements}\n",
478
+ "\n",
479
+ "You will need to decide on the next action to take.\n",
480
+ "\n",
481
+ "The action space is:\n",
482
+ "- goto(url): navigate to a URL\n",
483
+ "- click(id): click a button given it's ID\n",
484
+ "- type(id, text): type \"text\" into element \"id\"\n",
485
+ "- scroll(direction): scroll the page in direction up or down.\n",
486
+ "- stop_action(final_answer): declare that we have finished the task and prepare a final_answer to return to the user.\n",
487
+ "\n",
488
+ "Output a JSON object with the following fields:\n",
489
+ "{{\n",
490
+ " \"action\": \"goto\" | \"click\" | \"type\" | \"scroll\" | \"stop_action\",\n",
491
+ " \"action_args\": {{\n",
492
+ " \"url\": \"https://www.google.com\",\n",
493
+ " \"id\": \"123\",\n",
494
+ " \"text\": \"Hello\",\n",
495
+ " \"direction\": \"up\"\n",
496
+ " }}\n",
497
+ "}}\n",
498
+ "\n",
499
+ "Only output the JSON object, no other text or comments.\n",
500
+ "\"\"\""
501
+ ]
502
+ },
503
+ {
504
+ "cell_type": "markdown",
505
+ "metadata": {},
506
+ "source": [
507
+ "Let's now try this prompt with our LLM:\n"
508
+ ]
509
+ },
510
+ {
511
+ "cell_type": "code",
512
+ "execution_count": null,
513
+ "metadata": {},
514
+ "outputs": [],
515
+ "source": [
516
+ "from openai import OpenAI\n",
517
+ "import json\n",
518
+ "import base64\n",
519
+ "from PIL import Image\n",
520
+ "import os\n",
521
+ "\n",
522
+ "# Prepare the page for the agent\n",
523
+ "page_text, formatted_list_of_elements, som_screenshot = await prepare_page_for_agent(\n",
524
+ " page\n",
525
+ ")\n",
526
+ "task = \"Search for Magentic-UI\"\n",
527
+ "# Now make the API call\n",
528
+ "client = OpenAI(\n",
529
+ " api_key=os.getenv(\"OPENAI_API_KEY\")\n",
530
+ ") # you can use any other LLM client here\n",
531
+ "image_data_url = f\"data:image/png;base64,{base64.b64encode((lambda b: (som_screenshot.save(b, format='PNG'), b.getvalue())[1])(io.BytesIO())).decode()}\"\n",
532
+ "\n",
533
+ "\n",
534
+ "def get_llm_response(\n",
535
+ " client: OpenAI, # OpenAI client\n",
536
+ " task: str, # Task to complete\n",
537
+ " page_text: str, # Page text\n",
538
+ " formatted_list_of_elements: str, # Formatted list of elements\n",
539
+ " image_data_url: str, # Image data URL\n",
540
+ " message_history: list[dict] = [], # Message history\n",
541
+ " model: str = \"gpt-4o\", # Model to use\n",
542
+ ") -> dict:\n",
543
+ " response = client.chat.completions.create(\n",
544
+ " model=model,\n",
545
+ " messages=[\n",
546
+ " *message_history,\n",
547
+ " {\n",
548
+ " \"role\": \"user\",\n",
549
+ " \"content\": [\n",
550
+ " {\n",
551
+ " \"type\": \"text\",\n",
552
+ " \"text\": AGENT_PROMPT.format(\n",
553
+ " task=task,\n",
554
+ " page_text=page_text,\n",
555
+ " formatted_list_of_elements=formatted_list_of_elements,\n",
556
+ " ),\n",
557
+ " },\n",
558
+ " {\n",
559
+ " \"type\": \"image_url\",\n",
560
+ " \"image_url\": {\"url\": image_data_url},\n",
561
+ " },\n",
562
+ " ],\n",
563
+ " },\n",
564
+ " ],\n",
565
+ " )\n",
566
+ "\n",
567
+ " # Parse the response\n",
568
+ " try:\n",
569
+ " action_decision = json.loads(response.choices[0].message.content)\n",
570
+ " print(\"Model's decision:\", json.dumps(action_decision, indent=2))\n",
571
+ " except json.JSONDecodeError:\n",
572
+ " # it starts with ```json\n",
573
+ " response_content = response.choices[0].message.content\n",
574
+ " response_content = response_content.replace(\"```json\", \"\").replace(\"```\", \"\")\n",
575
+ " action_decision = json.loads(response_content)\n",
576
+ " print(\"Model's decision:\", json.dumps(action_decision, indent=2))\n",
577
+ " except Exception as e:\n",
578
+ " raise e\n",
579
+ " return action_decision\n",
580
+ "\n",
581
+ "\n",
582
+ "action_decision = get_llm_response(\n",
583
+ " client, task, page_text, formatted_list_of_elements, image_data_url\n",
584
+ ")\n",
585
+ "print(action_decision)"
586
+ ]
587
+ },
588
+ {
589
+ "cell_type": "markdown",
590
+ "metadata": {},
591
+ "source": [
592
+ "We can see that the model made the right decision given the task of \"Search for Magentic-UI\", the action is to type in the search box for \"Magentic-UI\"\n",
593
+ "\n",
594
+ "The last remaining piece before we put it all together is to now execute the action using Playwright."
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "markdown",
599
+ "metadata": {},
600
+ "source": [
601
+ "## Step 4: Executing the actions with Playwright\n",
602
+ "\n",
603
+ "For each of the actions we have previously defined, we will now write code using Playwright to execute them."
604
+ ]
605
+ },
606
+ {
607
+ "cell_type": "code",
608
+ "execution_count": 103,
609
+ "metadata": {},
610
+ "outputs": [],
611
+ "source": [
612
+ "# This is mostly basic Playwright code, but we will use it to execute the actions.\n",
613
+ "async def execute_action(action: str, action_args: dict, page: Page) -> str:\n",
614
+ " \"\"\"\n",
615
+ " Execute an action on the page.\n",
616
+ " \"\"\"\n",
617
+ " if action == \"goto\":\n",
618
+ " await page.goto(action_args[\"url\"])\n",
619
+ " return f\"I navigated to {action_args['url']}\"\n",
620
+ " elif action == \"click\":\n",
621
+ " # Get the element using the data attribute\n",
622
+ " await page.wait_for_selector(f\"[data-element-id='{action_args['id']}']\")\n",
623
+ " element = page.locator(f\"[data-element-id='{action_args['id']}']\")\n",
624
+ " if element:\n",
625
+ " await element.click()\n",
626
+ " else:\n",
627
+ " raise ValueError(f\"Element with ID {action_args['id']} not found\")\n",
628
+ " return f\"I clicked on {action_args['id']}\"\n",
629
+ " elif action == \"type\":\n",
630
+ " await page.wait_for_selector(f\"[data-element-id='{action_args['id']}']\")\n",
631
+ " element = page.locator(f\"[data-element-id='{action_args['id']}']\")\n",
632
+ " if element:\n",
633
+ " await element.fill(action_args[\"text\"])\n",
634
+ " else:\n",
635
+ " raise ValueError(f\"Element with ID {action_args['id']} not found\")\n",
636
+ " return f\"I typed {action_args['text']} into {action_args['id']}\"\n",
637
+ " elif action == \"scroll\":\n",
638
+ " await page.scroll(action_args[\"direction\"])\n",
639
+ " return f\"I scrolled {action_args['direction']}\"\n",
640
+ " elif action == \"stop_action\":\n",
641
+ " return action_args[\"final_answer\"]\n",
642
+ " else:\n",
643
+ " raise ValueError(f\"Invalid action: {action}\")"
644
+ ]
645
+ },
646
+ {
647
+ "cell_type": "code",
648
+ "execution_count": null,
649
+ "metadata": {},
650
+ "outputs": [],
651
+ "source": [
652
+ "await execute_action(action_decision[\"action\"], action_decision[\"action_args\"], page)"
653
+ ]
654
+ },
655
+ {
656
+ "cell_type": "code",
657
+ "execution_count": null,
658
+ "metadata": {},
659
+ "outputs": [],
660
+ "source": [
661
+ "# Take a screenshot of the page\n",
662
+ "screenshot = await page.screenshot()\n",
663
+ "display(Image.open(io.BytesIO(screenshot)))"
664
+ ]
665
+ },
666
+ {
667
+ "cell_type": "markdown",
668
+ "metadata": {},
669
+ "source": [
670
+ "Success! We can see that our agent was properly able to type \"Magentic-UI\" into the searchbox!\n",
671
+ "\n",
672
+ "The final step is to put it all together into our Agent!"
673
+ ]
674
+ },
675
+ {
676
+ "cell_type": "code",
677
+ "execution_count": null,
678
+ "metadata": {},
679
+ "outputs": [],
680
+ "source": [
681
+ "await browser.close()"
682
+ ]
683
+ },
684
+ {
685
+ "cell_type": "markdown",
686
+ "metadata": {},
687
+ "source": [
688
+ "## Step 5: Putting it all together into our Agent"
689
+ ]
690
+ },
691
+ {
692
+ "cell_type": "code",
693
+ "execution_count": 12,
694
+ "metadata": {},
695
+ "outputs": [],
696
+ "source": [
697
+ "from openai import OpenAI\n",
698
+ "from playwright.async_api import Page\n",
699
+ "from playwright.async_api import async_playwright\n",
700
+ "from PIL import Image, ImageDraw\n",
701
+ "import io\n",
702
+ "import base64\n",
703
+ "import json\n",
704
+ "from dataclasses import dataclass\n",
705
+ "from IPython.display import display\n",
706
+ "\n",
707
+ "@dataclass\n",
708
+ "class Element:\n",
709
+ " id: int # The id of the element\n",
710
+ " aria_label: (\n",
711
+ " str # The aria-label attribute is used to provide a label for an element\n",
712
+ " )\n",
713
+ " type: str # The type of the element\n",
714
+ " bbox: dict # The bounding box of the element\n",
715
+ " text: str # The text content of the element\n",
716
+ "\n",
717
+ "\n",
718
+ "AGENT_PROMPT = \"\"\"\n",
719
+ "You are a helpful assistant that can navigate a web page and perform actions on it.\n",
720
+ "\n",
721
+ "The task we are trying to complete is:\n",
722
+ "{task}\n",
723
+ "\n",
724
+ "The current visible text on the page is:\n",
725
+ "{page_text}\n",
726
+ "\n",
727
+ "The current visible elements on the page are:\n",
728
+ "{formatted_list_of_elements}\n",
729
+ "\n",
730
+ "You will need to decide on the next action to take.\n",
731
+ "\n",
732
+ "The action space is:\n",
733
+ "- goto(url): navigate to a URL\n",
734
+ "- click(id): click a button given it's ID\n",
735
+ "- type(id, text): type \"text\" into element \"id\"\n",
736
+ "- scroll(direction): scroll the page in direction up or down.\n",
737
+ "- stop_action(final_answer): declare that we have finished the task and prepare a final_answer to return to the user.\n",
738
+ "\n",
739
+ "Output a JSON object with the following fields:\n",
740
+ "{{\n",
741
+ " \"action\": \"goto\" | \"click\" | \"type\" | \"scroll\" | \"stop_action\",\n",
742
+ " \"action_args\": {{\n",
743
+ " \"url\": \"https://www.google.com\",\n",
744
+ " \"id\": \"123\",\n",
745
+ " \"text\": \"Hello\",\n",
746
+ " \"direction\": \"up\"\n",
747
+ " }}\n",
748
+ "}}\n",
749
+ "\n",
750
+ "Only output the JSON object, no other text or comments.\n",
751
+ "\"\"\"\n",
752
+ "\n",
753
+ "\n",
754
+ "class BrowserUseAgent:\n",
755
+ " def __init__(\n",
756
+ " self,\n",
757
+ " client: OpenAI,\n",
758
+ " model: str = \"gpt-4o\",\n",
759
+ " headless: bool = False,\n",
760
+ " run_in_jupyter: bool = True,\n",
761
+ " ):\n",
762
+ " self.client = client\n",
763
+ " self.model = model\n",
764
+ " self.headless = headless\n",
765
+ " self.message_history: list[dict] = []\n",
766
+ " self.page: Page = None\n",
767
+ " self.run_in_jupyter = run_in_jupyter\n",
768
+ "\n",
769
+ " async def _launch_browser(self) -> None:\n",
770
+ " p = await async_playwright().start()\n",
771
+ " self.browser = await p.chromium.launch(headless=self.headless)\n",
772
+ " # context is the browser window\n",
773
+ " self.context = await self.browser.new_context()\n",
774
+ " # page is the tab in the browser\n",
775
+ " self.page = await self.context.new_page()\n",
776
+ "\n",
777
+ " async def execute_task(self, task: str) -> str:\n",
778
+ " \"\"\"\n",
779
+ " This is NEW! This is the main function that will be called to execute the task and implement our agent loop.\n",
780
+ " \"\"\"\n",
781
+ " # Step 1: Launch the browser if it's not already launched\n",
782
+ " if self.page is None:\n",
783
+ " await self._launch_browser()\n",
784
+ " # Our stop condition is when the LLM decides to output stop_action\n",
785
+ " should_stop = False\n",
786
+ " final_answer = None\n",
787
+ " i = 0\n",
788
+ " while not should_stop:\n",
789
+ " # Step 2: Prepare the page for the agent\n",
790
+ " (\n",
791
+ " page_text,\n",
792
+ " formatted_list_of_elements,\n",
793
+ " som_screenshot,\n",
794
+ " ) = await self._prepare_page_for_agent(self.page)\n",
795
+ " # Step 3: Get the LLM response\n",
796
+ " image_data_url = f\"data:image/png;base64,{base64.b64encode((lambda b: (som_screenshot.save(b, format='PNG'), b.getvalue())[1])(io.BytesIO())).decode()}\"\n",
797
+ " action_decision = self._get_llm_response(\n",
798
+ " self.client,\n",
799
+ " task,\n",
800
+ " page_text,\n",
801
+ " formatted_list_of_elements,\n",
802
+ " image_data_url,\n",
803
+ " self.message_history,\n",
804
+ " self.model,\n",
805
+ " )\n",
806
+ " print(f\"Action decision {i}: {action_decision}\")\n",
807
+ " # Add the action decision to the message history\n",
808
+ " self.message_history.append(\n",
809
+ " {\n",
810
+ " \"role\": \"user\",\n",
811
+ " \"content\": [{\"type\": \"text\", \"text\": json.dumps(action_decision)}],\n",
812
+ " }\n",
813
+ " )\n",
814
+ " # Step 4: Execute the action with some error handling\n",
815
+ " try:\n",
816
+ " action_feedback = await self._execute_action(\n",
817
+ " action_decision[\"action\"], action_decision[\"action_args\"], self.page\n",
818
+ " )\n",
819
+ " except Exception as e:\n",
820
+ " print(f\"Error executing action {i}: {e}\")\n",
821
+ " action_feedback = f\"Error executing action {i}: {e}\"\n",
822
+ " print(f\"Action feedback {i}: {action_feedback}\")\n",
823
+ " # Sleep for 3 seconds to let the page load\n",
824
+ " await self.page.wait_for_timeout(3000)\n",
825
+ " # Update the message history with feedback on the action and the new page screenshot\n",
826
+ " new_page_screenshot = await self.page.screenshot()\n",
827
+ " self.message_history.append(\n",
828
+ " {\n",
829
+ " \"role\": \"user\",\n",
830
+ " \"content\": [\n",
831
+ " {\"type\": \"text\", \"text\": action_feedback},\n",
832
+ " {\n",
833
+ " \"type\": \"image_url\",\n",
834
+ " \"image_url\": {\n",
835
+ " \"url\": f\"data:image/png;base64,{base64.b64encode(new_page_screenshot).decode()}\"\n",
836
+ " },\n",
837
+ " },\n",
838
+ " ],\n",
839
+ " }\n",
840
+ " )\n",
841
+ " if self.run_in_jupyter:\n",
842
+ " display(Image.open(io.BytesIO(new_page_screenshot)))\n",
843
+ " # Check if the task is complete\n",
844
+ " should_stop = action_decision[\"action\"] == \"stop_action\"\n",
845
+ " if should_stop:\n",
846
+ " final_answer = action_decision[\"action_args\"][\"final_answer\"]\n",
847
+ " i += 1\n",
848
+ " return final_answer\n",
849
+ "\n",
850
+ " async def _execute_action(self, action: str, action_args: dict, page: Page) -> str:\n",
851
+ " \"\"\"\n",
852
+ " Execute an action on the page.\n",
853
+ " \"\"\"\n",
854
+ " if action == \"goto\":\n",
855
+ " await page.goto(action_args[\"url\"])\n",
856
+ " return f\"I navigated to {action_args['url']}\"\n",
857
+ " elif action == \"click\":\n",
858
+ " # Get the element using the data attribute\n",
859
+ " await page.wait_for_selector(f\"[data-element-id='{action_args['id']}']\")\n",
860
+ " element = page.locator(f\"[data-element-id='{action_args['id']}']\")\n",
861
+ " if element:\n",
862
+ " await element.click()\n",
863
+ " else:\n",
864
+ " raise ValueError(f\"Element with ID {action_args['id']} not found\")\n",
865
+ " return f\"I clicked on {action_args['id']}\"\n",
866
+ " elif action == \"type\":\n",
867
+ " await page.wait_for_selector(f\"[data-element-id='{action_args['id']}']\")\n",
868
+ " element = page.locator(f\"[data-element-id='{action_args['id']}']\")\n",
869
+ " if element:\n",
870
+ " await element.fill(action_args[\"text\"])\n",
871
+ " # Press enter\n",
872
+ " await element.press(\"Enter\")\n",
873
+ " else:\n",
874
+ " raise ValueError(f\"Element with ID {action_args['id']} not found\")\n",
875
+ " return f\"I typed {action_args['text']} into {action_args['id']}\"\n",
876
+ " elif action == \"scroll\":\n",
877
+ " await page.scroll(action_args[\"direction\"])\n",
878
+ " return f\"I scrolled {action_args['direction']}\"\n",
879
+ " elif action == \"stop_action\":\n",
880
+ " return action_args[\"final_answer\"]\n",
881
+ " else:\n",
882
+ " raise ValueError(f\"Invalid action: {action}\")\n",
883
+ "\n",
884
+ " def _get_llm_response(\n",
885
+ " self,\n",
886
+ " client: OpenAI, # OpenAI client\n",
887
+ " task: str, # Task to complete\n",
888
+ " page_text: str, # Page text\n",
889
+ " formatted_list_of_elements: str, # Formatted list of elements\n",
890
+ " image_data_url: str, # Image data URL\n",
891
+ " message_history: list[dict] = [], # Message history\n",
892
+ " model: str = \"gpt-4o\", # Model to use\n",
893
+ " ) -> dict:\n",
894
+ " response = client.chat.completions.create(\n",
895
+ " model=model,\n",
896
+ " messages=[\n",
897
+ " *message_history,\n",
898
+ " {\n",
899
+ " \"role\": \"user\",\n",
900
+ " \"content\": [\n",
901
+ " {\n",
902
+ " \"type\": \"text\",\n",
903
+ " \"text\": AGENT_PROMPT.format(\n",
904
+ " task=task,\n",
905
+ " page_text=page_text,\n",
906
+ " formatted_list_of_elements=formatted_list_of_elements,\n",
907
+ " ),\n",
908
+ " },\n",
909
+ " {\n",
910
+ " \"type\": \"image_url\",\n",
911
+ " \"image_url\": {\"url\": image_data_url},\n",
912
+ " },\n",
913
+ " ],\n",
914
+ " },\n",
915
+ " ],\n",
916
+ " )\n",
917
+ "\n",
918
+ " # Parse the response\n",
919
+ " try:\n",
920
+ " action_decision = json.loads(response.choices[0].message.content)\n",
921
+ " except json.JSONDecodeError:\n",
922
+ " # it starts with ```json\n",
923
+ " response_content = response.choices[0].message.content\n",
924
+ " response_content = response_content.replace(\"```json\", \"\").replace(\n",
925
+ " \"```\", \"\"\n",
926
+ " )\n",
927
+ " action_decision = json.loads(response_content)\n",
928
+ " except Exception as e:\n",
929
+ " raise e\n",
930
+ " return action_decision\n",
931
+ "\n",
932
+ " async def _prepare_page_for_agent(self, page: Page) -> tuple[str, str, Image.Image]:\n",
933
+ " \"\"\"\n",
934
+ " Prepare the page for the agent.\n",
935
+ " Returns:\n",
936
+ " tuple[str, str, Image.Image]: The page text, the formatted list of elements, and the screenshot with bounding boxes.\n",
937
+ " \"\"\"\n",
938
+ " page_text = await page.evaluate(\"() => document.body.innerText\")\n",
939
+ " elements = await self._get_interactive_elements(page)\n",
940
+ " screenshot_bytes = await page.screenshot()\n",
941
+ " som_screenshot = self._get_som_screenshot(screenshot_bytes, elements)\n",
942
+ "\n",
943
+ " formatted_list_of_elements = \"\\n\".join(\n",
944
+ " [f\"Element {i}: {element}\" for i, element in enumerate(elements)]\n",
945
+ " )\n",
946
+ "\n",
947
+ " return page_text, formatted_list_of_elements, som_screenshot\n",
948
+ "\n",
949
+ " def _get_som_screenshot(\n",
950
+ " self, screenshot_bytes: bytes, elements: list[Element]\n",
951
+ " ) -> Image.Image:\n",
952
+ " screenshot = Image.open(io.BytesIO(screenshot_bytes))\n",
953
+ "\n",
954
+ " # Create a drawing object\n",
955
+ " draw = ImageDraw.Draw(screenshot)\n",
956
+ "\n",
957
+ " # Draw bounding boxes and element IDs for each element\n",
958
+ " for element in elements:\n",
959
+ " bbox = element.bbox\n",
960
+ " x = bbox[\"x\"]\n",
961
+ " y = bbox[\"y\"]\n",
962
+ " width = bbox[\"width\"]\n",
963
+ " height = bbox[\"height\"]\n",
964
+ "\n",
965
+ " # Draw rectangle\n",
966
+ " draw.rectangle([(x, y), (x + width, y + height)], outline=\"red\", width=2)\n",
967
+ "\n",
968
+ " # Draw element ID\n",
969
+ " draw.text((x, y - 15), f\"{element.id}\", fill=\"red\")\n",
970
+ "\n",
971
+ " som_screenshot = screenshot.copy()\n",
972
+ " return som_screenshot\n",
973
+ "\n",
974
+ " async def _get_interactive_elements(self, page: Page) -> list[Element]:\n",
975
+ " elements: list[Element] = []\n",
976
+ " # Viewport size is a dict with keys 'width' and 'height'\n",
977
+ " viewport_size = page.viewport_size\n",
978
+ "\n",
979
+ " # For simplicity, we will only look at buttons, textboxes, and links. We can add more roles later on.\n",
980
+ " interactive_roles = [\"button\", \"textbox\", \"link\"]\n",
981
+ " i = 0\n",
982
+ " for role in interactive_roles:\n",
983
+ " # We will use the Playwright API to get all elements with the given role\n",
984
+ " elements_with_role = await page.get_by_role(role).all()\n",
985
+ " for element in elements_with_role:\n",
986
+ " # Check if element is visible and in current viewport\n",
987
+ " bbox = await element.bounding_box()\n",
988
+ " if bbox: # Element is visible if it has a bounding box\n",
989
+ " # Check if element is in current viewport (not scrolled out of view)\n",
990
+ " if 0 <= bbox[\"y\"] <= viewport_size[\"height\"]:\n",
991
+ " # Set a data attribute with the element ID for later reference\n",
992
+ " await element.evaluate(\n",
993
+ " f\"el => el.setAttribute('data-element-id', '{i}')\"\n",
994
+ " )\n",
995
+ " elements.append(\n",
996
+ " Element(\n",
997
+ " id=i,\n",
998
+ " aria_label=await element.get_attribute(\"aria-label\")\n",
999
+ " or await element.get_attribute(\"aria-role\")\n",
1000
+ " or \"\",\n",
1001
+ " type=role,\n",
1002
+ " bbox=bbox,\n",
1003
+ " text=await element.text_content() or \"\",\n",
1004
+ " )\n",
1005
+ " )\n",
1006
+ " i += 1\n",
1007
+ " return elements\n",
1008
+ "\n",
1009
+ " async def close(self) -> None:\n",
1010
+ " if self.page is not None:\n",
1011
+ " await self.page.close()\n",
1012
+ " if self.context is not None:\n",
1013
+ " await self.context.close()\n",
1014
+ " if self.browser is not None:\n",
1015
+ " await self.browser.close()"
1016
+ ]
1017
+ },
1018
+ {
1019
+ "cell_type": "markdown",
1020
+ "metadata": {},
1021
+ "source": [
1022
+ "Now let's run the Agent on a sample task!"
1023
+ ]
1024
+ },
1025
+ {
1026
+ "cell_type": "code",
1027
+ "execution_count": null,
1028
+ "metadata": {},
1029
+ "outputs": [],
1030
+ "source": [
1031
+ "from openai import OpenAI\n",
1032
+ "import os\n",
1033
+ "openai_client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
1034
+ "agent = BrowserUseAgent(openai_client)\n",
1035
+ "try:\n",
1036
+ " final_answer = await agent.execute_task(\"find the open issues assigned to husseinmozannar on the microsoft/magentic-ui repo on github\")\n",
1037
+ " print(final_answer)\n",
1038
+ "finally:\n",
1039
+ " await agent.close()"
1040
+ ]
1041
+ },
1042
+ {
1043
+ "cell_type": "markdown",
1044
+ "metadata": {},
1045
+ "source": [
1046
+ "Success! Our agent was able to navigate to GitHub and filter the issues assigned to me. It ran into some issues but was able to debug and get to the right answer.\n",
1047
+ "\n",
1048
+ "To conclude, in this short tutorial, we showed how to build a browser use agent from scratch.\n",
1049
+ "\n",
1050
+ "The main ingredients were: set-of-marks prompting, playwright for browser automation and tool calling or structured JSON output ability of current LLMs. With these three ingredients we can build a semi-capable agent to navigate the web!"
1051
+ ]
1052
+ },
1053
+ {
1054
+ "cell_type": "markdown",
1055
+ "metadata": {},
1056
+ "source": [
1057
+ "# Level 2: Building a Browser Use Agent Using Magentic-UI\n",
1058
+ "\n",
1059
+ "While it was fun building the browser use agent from scratch, it was not easy. We had to figure out how to launch the browser, fiddle around with playwright to extract interactive elements, figure out how to execute actions on the page and so on.\n",
1060
+ "\n",
1061
+ "The `magentic-ui` library as we will see has many utilities that will make your life much easier when building browser use agents. We will now do the same steps as before but by using the helpers from the `magentic-ui` library.\n"
1062
+ ]
1063
+ },
1064
+ {
1065
+ "cell_type": "markdown",
1066
+ "metadata": {},
1067
+ "source": [
1068
+ "## Step 1: Launching a Browser"
1069
+ ]
1070
+ },
1071
+ {
1072
+ "cell_type": "markdown",
1073
+ "metadata": {},
1074
+ "source": [
1075
+ "Magentic-UI provides three different Playwright browser implementations, each designed for specific use cases:\n",
1076
+ "\n",
1077
+ "1. Local Playwright Browser (`LocalPlaywrightBrowser`)\n",
1078
+ "- **Purpose**: Runs Playwright directly on the local machine without Docker\n",
1079
+ "- **Use Case**: Development and testing environments where Docker isn't needed\n",
1080
+ "- **Features**: Lightweight, direct browser control, supports both headless and headed modes\n",
1081
+ "\n",
1082
+ "2. Headless Docker Playwright Browser (`HeadlessDockerPlaywrightBrowser`) \n",
1083
+ "- **Purpose**: Runs a headless Playwright browser inside a Docker container\n",
1084
+ "- **Use Case**: Production environments, CI/CD pipelines, server-side automation\n",
1085
+ "- **Features**: Isolated execution, reproducible environment, no GUI overhead and more secure.\n",
1086
+ "- **Docker Image**: Uses Microsoft's official Playwright Docker image (`mcr.microsoft.com/playwright:v1.51.1-noble`)\n",
1087
+ "\n",
1088
+ "3. VNC Docker Playwright Browser (`VncDockerPlaywrightBrowser`)\n",
1089
+ "- **Purpose**: Runs Playwright in Docker with VNC support for visual interaction; you can interact with the browser on localhost.\n",
1090
+ "- **Use Case**: Debugging, development, and scenarios requiring visual browser inspection\n",
1091
+ "- **Features**: Programmatic control + visual access via noVNC web interface\n",
1092
+ "- **Docker Image**: Uses custom `magentic-ui-vnc-browser` image with VNC server. You need to run `magentic-ui --rebuild-docker` command to build it.\n",
1093
+ "\n",
1094
+ "How to Launch Each Browser:\n",
1095
+ "\n",
1096
+ "```python\n",
1097
+ "from pathlib import Path\n",
1098
+ "from magentic_ui.tools.playwright import HeadlessDockerPlaywrightBrowser, VncDockerPlaywrightBrowser, LocalPlaywrightBrowser\n",
1099
+ "\n",
1100
+ "# Direct instantiation examples\n",
1101
+ "async def launch_browsers():\n",
1102
+ " # Headless Docker Browser\n",
1103
+ " headless_browser = HeadlessDockerPlaywrightBrowser(\n",
1104
+ " playwright_port=37367,\n",
1105
+ " inside_docker=False\n",
1106
+ " )\n",
1107
+ " \n",
1108
+ " # VNC Docker Browser \n",
1109
+ " vnc_browser = VncDockerPlaywrightBrowser(\n",
1110
+ " bind_dir=Path(\"./workspace\"),\n",
1111
+ " playwright_port=37367,\n",
1112
+ " novnc_port=6080,\n",
1113
+ " inside_docker=False\n",
1114
+ " )\n",
1115
+ " \n",
1116
+ " # Local Browser\n",
1117
+ " local_browser = LocalPlaywrightBrowser(headless=True)\n",
1118
+ " \n",
1119
+ "```\n",
1120
+ "\n"
1121
+ ]
1122
+ },
1123
+ {
1124
+ "cell_type": "markdown",
1125
+ "metadata": {},
1126
+ "source": [
1127
+ "For simplicity we will stick with the local playwright browser that we launched in Level 1:"
1128
+ ]
1129
+ },
1130
+ {
1131
+ "cell_type": "code",
1132
+ "execution_count": null,
1133
+ "metadata": {},
1134
+ "outputs": [],
1135
+ "source": [
1136
+ "from magentic_ui.tools.playwright import LocalPlaywrightBrowser\n",
1137
+ "browser = LocalPlaywrightBrowser(headless=False)\n",
1138
+ "# Start the browser\n",
1139
+ "await browser._start()\n",
1140
+ "# Get the browser context and start a new page\n",
1141
+ "context = browser.browser_context\n",
1142
+ "page = await context.new_page()\n"
1143
+ ]
1144
+ },
1145
+ {
1146
+ "cell_type": "markdown",
1147
+ "metadata": {},
1148
+ "source": [
1149
+ "You should now see a browser open to the blank page."
1150
+ ]
1151
+ },
1152
+ {
1153
+ "cell_type": "markdown",
1154
+ "metadata": {},
1155
+ "source": [
1156
+ "## Step 2: Represent the browser for the Agent using Set-Of-Marks Prompting."
1157
+ ]
1158
+ },
1159
+ {
1160
+ "cell_type": "markdown",
1161
+ "metadata": {},
1162
+ "source": [
1163
+ "To get the interactive elements on the page, we have done a lot of work for you in Magentic-UI to capture every possible interactive element type on the page, including elements in the shadow-DOM [(see this javascript file if interested for more info)](https://github.com/microsoft/magentic-ui/blob/main/src/magentic_ui/tools/playwright/page_script.js).\n",
1164
+ "\n",
1165
+ "These utilities are wrapped in a helper class called the [`PlaywrightController`](https://github.com/microsoft/magentic-ui/blob/main/src/magentic_ui/tools/playwright/playwright_controller.py)"
1166
+ ]
1167
+ },
1168
+ {
1169
+ "cell_type": "code",
1170
+ "execution_count": 31,
1171
+ "metadata": {},
1172
+ "outputs": [],
1173
+ "source": [
1174
+ "from magentic_ui.tools.playwright import PlaywrightController\n",
1175
+ "browser_controller = PlaywrightController(viewport_width=1280, viewport_height=720)"
1176
+ ]
1177
+ },
1178
+ {
1179
+ "cell_type": "markdown",
1180
+ "metadata": {},
1181
+ "source": [
1182
+ "The PlaywrightController has a lot of convenience methods that have been debugged extensively so that we can perform actions on the browser more reliably and securely.\n",
1183
+ "\n",
1184
+ "There are methods to get the interactive elements, get the screenshot, click, type, scroll, manage tabs, hover, describe pages in markdown and much more."
1185
+ ]
1186
+ },
1187
+ {
1188
+ "cell_type": "markdown",
1189
+ "metadata": {},
1190
+ "source": [
1191
+ "For now, let's navigate to Bing using our `browser_controller`."
1192
+ ]
1193
+ },
1194
+ {
1195
+ "cell_type": "code",
1196
+ "execution_count": 33,
1197
+ "metadata": {},
1198
+ "outputs": [],
1199
+ "source": [
1200
+ "_ = await browser_controller.visit_page(page, \"https://www.bing.com\")"
1201
+ ]
1202
+ },
1203
+ {
1204
+ "cell_type": "markdown",
1205
+ "metadata": {},
1206
+ "source": [
1207
+ "The visit_page method only returns when the page is fully loaded."
1208
+ ]
1209
+ },
1210
+ {
1211
+ "cell_type": "markdown",
1212
+ "metadata": {},
1213
+ "source": [
1214
+ "Now let us get the set of interactive elements:"
1215
+ ]
1216
+ },
1217
+ {
1218
+ "cell_type": "code",
1219
+ "execution_count": null,
1220
+ "metadata": {},
1221
+ "outputs": [],
1222
+ "source": [
1223
+ "interactive_elements = await browser_controller.get_interactive_rects(page)\n",
1224
+ "# print the first 20 interactive elements\n",
1225
+ "i = 0\n",
1226
+ "for element in interactive_elements:\n",
1227
+ " print(f\"Element {i}: id={element}, data={interactive_elements[element]}\")\n",
1228
+ " i += 1\n",
1229
+ " if i > 20:\n",
1230
+ " break"
1231
+ ]
1232
+ },
1233
+ {
1234
+ "cell_type": "markdown",
1235
+ "metadata": {},
1236
+ "source": [
1237
+ "You'll notice that this ran much faster than using the Playwright script in the Level 1 tutorial because here we are using JavaScript to extract the elements instead of going through the Playwright API.\n",
1238
+ "\n",
1239
+ "Our searchbox is now Element id 22 and has the following data:\n",
1240
+ "\n",
1241
+ "\n",
1242
+ " Element 12: id=22, data={'tag_name': 'textarea', 'role': 'textbox', 'aria_name': '0 characters out of 2000', 'v_scrollable': False, 'rects': [{'x': 193, 'y': 158, 'width': 843, 'height': 22, 'top': 158, 'right': 1036, 'bottom': 180, 'left': 193}]}\n",
1243
+ "\n",
1244
+ "To type in the searchbox we can use the fill_id method of the PlaywrightController:"
1245
+ ]
1246
+ },
1247
+ {
1248
+ "cell_type": "code",
1249
+ "execution_count": 38,
1250
+ "metadata": {},
1251
+ "outputs": [],
1252
+ "source": [
1253
+ "\n",
1254
+ "await browser_controller.fill_id(page, \"22\", \"Magentic-UI\")\n"
1255
+ ]
1256
+ },
1257
+ {
1258
+ "cell_type": "markdown",
1259
+ "metadata": {},
1260
+ "source": [
1261
+ "Let's check if we are on the right page:"
1262
+ ]
1263
+ },
1264
+ {
1265
+ "cell_type": "code",
1266
+ "execution_count": null,
1267
+ "metadata": {},
1268
+ "outputs": [],
1269
+ "source": [
1270
+ "from PIL import Image\n",
1271
+ "import io\n",
1272
+ "from IPython.display import display\n",
1273
+ "\n",
1274
+ "screenshot = await browser_controller.get_screenshot(page)\n",
1275
+ "image = Image.open(io.BytesIO(screenshot))\n",
1276
+ "display(image)\n"
1277
+ ]
1278
+ },
1279
+ {
1280
+ "cell_type": "markdown",
1281
+ "metadata": {},
1282
+ "source": [
1283
+ "We can also easily extract the search results using the get_page_markdown method that uses the [`markitdown`](https://github.com/microsoft/markitdown) package from our team at Microsoft Research."
1284
+ ]
1285
+ },
1286
+ {
1287
+ "cell_type": "code",
1288
+ "execution_count": null,
1289
+ "metadata": {},
1290
+ "outputs": [],
1291
+ "source": [
1292
+ "page_text = await browser_controller.get_page_markdown(page)\n",
1293
+ "print(page_text)"
1294
+ ]
1295
+ },
1296
+ {
1297
+ "cell_type": "markdown",
1298
+ "metadata": {},
1299
+ "source": [
1300
+ "The final thing we need is to get the set-of-marks image:"
1301
+ ]
1302
+ },
1303
+ {
1304
+ "cell_type": "code",
1305
+ "execution_count": null,
1306
+ "metadata": {},
1307
+ "outputs": [],
1308
+ "source": [
1309
+ "from magentic_ui.agents.web_surfer._set_of_mark import add_set_of_mark\n",
1310
+ "\n",
1311
+ "\n",
1312
+ "interactive_elements = await browser_controller.get_interactive_rects(page)\n",
1313
+ "screenshot = await browser_controller.get_screenshot(page)\n",
1314
+ "som_screenshot, visible_elements, elements_above, elements_below, _ = add_set_of_mark(\n",
1315
+ " screenshot, interactive_elements, use_sequential_ids=True\n",
1316
+ ")\n",
1317
+ "\n",
1318
+ "display(som_screenshot)\n"
1319
+ ]
1320
+ },
1321
+ {
1322
+ "cell_type": "markdown",
1323
+ "metadata": {},
1324
+ "source": [
1325
+ "The add_set_of_mark method returns the SoM screenshot in addition to elements visible on the viewport, elements above the viewport and elements below the viewport.\n",
1326
+ "\n",
1327
+ "We can see how much the `magentic-ui` library makes our life easier with these tools; we are now ready to re-implement the agent from Level 1!"
1328
+ ]
1329
+ },
1330
+ {
1331
+ "cell_type": "markdown",
1332
+ "metadata": {},
1333
+ "source": [
1334
+ "## Step 3: Putting it all together"
1335
+ ]
1336
+ },
1337
+ {
1338
+ "cell_type": "markdown",
1339
+ "metadata": {},
1340
+ "source": [
1341
+ "Using the tools from the `magentic-ui` library now we can more easily implement our BrowserUseAgent:"
1342
+ ]
1343
+ },
1344
+ {
1345
+ "cell_type": "code",
1346
+ "execution_count": 71,
1347
+ "metadata": {},
1348
+ "outputs": [],
1349
+ "source": [
1350
+ "from openai import OpenAI\n",
1351
+ "from playwright.async_api import Page\n",
1352
+ "from playwright.async_api import async_playwright\n",
1353
+ "from PIL import Image, ImageDraw\n",
1354
+ "import io\n",
1355
+ "import base64\n",
1356
+ "import json\n",
1357
+ "from dataclasses import dataclass\n",
1358
+ "from IPython.display import display\n",
1359
+ "from magentic_ui.tools.playwright import LocalPlaywrightBrowser\n",
1360
+ "from magentic_ui.tools.playwright import PlaywrightController\n",
1361
+ "from magentic_ui.agents.web_surfer._set_of_mark import add_set_of_mark\n",
1362
+ "\n",
1363
+ "\n",
1364
+ "AGENT_PROMPT = \"\"\"\n",
1365
+ "You are a helpful assistant that can navigate a web page and perform actions on it.\n",
1366
+ "\n",
1367
+ "The task we are trying to complete is:\n",
1368
+ "{task}\n",
1369
+ "\n",
1370
+ "The current visible text on the page is:\n",
1371
+ "{page_text}\n",
1372
+ "\n",
1373
+ "The current visible elements on the page are:\n",
1374
+ "{formatted_list_of_elements}\n",
1375
+ "\n",
1376
+ "You will need to decide on the next action to take.\n",
1377
+ "\n",
1378
+ "The action space is:\n",
1379
+ "- goto(url): navigate to a URL\n",
1380
+ "- click(id): click a button given it's ID\n",
1381
+ "- type(id, text): type \"text\" into element \"id\"\n",
1382
+ "- scroll(direction): scroll the page in direction up or down.\n",
1383
+ "- stop_action(final_answer): declare that we have finished the task and prepare a final_answer to return to the user.\n",
1384
+ "\n",
1385
+ "Output a JSON object with the following fields:\n",
1386
+ "{{\n",
1387
+ " \"action\": \"goto\" | \"click\" | \"type\" | \"scroll\" | \"stop_action\",\n",
1388
+ " \"action_args\": {{\n",
1389
+ " \"url\": \"https://www.google.com\",\n",
1390
+ " \"id\": \"123\",\n",
1391
+ " \"text\": \"Hello\",\n",
1392
+ " \"direction\": \"up\"\n",
1393
+ " }}\n",
1394
+ "}}\n",
1395
+ "\n",
1396
+ "Only output the JSON object, no other text or comments.\n",
1397
+ "\"\"\"\n",
1398
+ "\n",
1399
+ "\n",
1400
+ "class BrowserUseAgent:\n",
1401
+ " def __init__(\n",
1402
+ " self,\n",
1403
+ " client: OpenAI,\n",
1404
+ " model: str = \"gpt-4o\",\n",
1405
+ " headless: bool = False,\n",
1406
+ " run_in_jupyter: bool = True,\n",
1407
+ " ):\n",
1408
+ " self.client = client\n",
1409
+ " self.model = model\n",
1410
+ " self.headless = headless\n",
1411
+ " self.message_history: list[dict] = []\n",
1412
+ " self.page: Page = None\n",
1413
+ " self.run_in_jupyter = run_in_jupyter\n",
1414
+ " self.browser_controller = PlaywrightController(\n",
1415
+ " viewport_width=1280, viewport_height=720\n",
1416
+ " )\n",
1417
+ "\n",
1418
+ " async def _launch_browser(self) -> None:\n",
1419
+ " self.browser = LocalPlaywrightBrowser(headless=False)\n",
1420
+ " # Start the browser\n",
1421
+ " await self.browser._start()\n",
1422
+ " # Get the browser context and start a new page\n",
1423
+ " self.context = self.browser.browser_context\n",
1424
+ " self.page = await self.context.new_page()\n",
1425
+ "\n",
1426
+ " async def execute_task(self, task: str) -> str:\n",
1427
+ " \"\"\"\n",
1428
+ " This is NEW! This is the main function that will be called to execute the task and implement our agent loop.\n",
1429
+ " \"\"\"\n",
1430
+ " # Step 1: Launch the browser if it's not already launched\n",
1431
+ " if self.page is None:\n",
1432
+ " await self._launch_browser()\n",
1433
+ " # Our stop condition is when the LLM decides to output stop_action\n",
1434
+ " should_stop = False\n",
1435
+ " final_answer = None\n",
1436
+ " i = 0\n",
1437
+ " while not should_stop:\n",
1438
+ " # Step 2: Prepare the page for the agent\n",
1439
+ " (\n",
1440
+ " page_text,\n",
1441
+ " formatted_list_of_elements,\n",
1442
+ " som_screenshot,\n",
1443
+ " ) = await self._prepare_page_for_agent(self.page)\n",
1444
+ " # Step 3: Get the LLM response\n",
1445
+ " image_data_url = f\"data:image/png;base64,{base64.b64encode((lambda b: (som_screenshot.save(b, format='PNG'), b.getvalue())[1])(io.BytesIO())).decode()}\"\n",
1446
+ " action_decision = self._get_llm_response(\n",
1447
+ " self.client,\n",
1448
+ " task,\n",
1449
+ " page_text,\n",
1450
+ " formatted_list_of_elements,\n",
1451
+ " image_data_url,\n",
1452
+ " self.message_history,\n",
1453
+ " self.model,\n",
1454
+ " )\n",
1455
+ " print(f\"Action decision {i}: {action_decision}\")\n",
1456
+ " # Add the action decision to the message history\n",
1457
+ " self.message_history.append(\n",
1458
+ " {\n",
1459
+ " \"role\": \"user\",\n",
1460
+ " \"content\": [{\"type\": \"text\", \"text\": json.dumps(action_decision)}],\n",
1461
+ " }\n",
1462
+ " )\n",
1463
+ " # Step 4: Execute the action with some error handling\n",
1464
+ " try:\n",
1465
+ " action_feedback = await self._execute_action(\n",
1466
+ " action_decision[\"action\"], action_decision[\"action_args\"], self.page\n",
1467
+ " )\n",
1468
+ " except Exception as e:\n",
1469
+ " print(f\"Error executing action {i}: {e}\")\n",
1470
+ " action_feedback = f\"Error executing action {i}: {e}\"\n",
1471
+ " print(f\"Action feedback {i}: {action_feedback}\")\n",
1472
+ " # Sleep for 3 seconds to let the page load\n",
1473
+ " await self.page.wait_for_timeout(3000)\n",
1474
+ " # Update the message history with feedback on the action and the new page screenshot\n",
1475
+ " new_page_screenshot = await self.page.screenshot()\n",
1476
+ " self.message_history.append(\n",
1477
+ " {\n",
1478
+ " \"role\": \"user\",\n",
1479
+ " \"content\": [\n",
1480
+ " {\"type\": \"text\", \"text\": action_feedback},\n",
1481
+ " {\n",
1482
+ " \"type\": \"image_url\",\n",
1483
+ " \"image_url\": {\n",
1484
+ " \"url\": f\"data:image/png;base64,{base64.b64encode(new_page_screenshot).decode()}\"\n",
1485
+ " },\n",
1486
+ " },\n",
1487
+ " ],\n",
1488
+ " }\n",
1489
+ " )\n",
1490
+ " if self.run_in_jupyter:\n",
1491
+ " display(Image.open(io.BytesIO(new_page_screenshot)))\n",
1492
+ " # Check if the task is complete\n",
1493
+ " should_stop = action_decision[\"action\"] == \"stop_action\"\n",
1494
+ " if should_stop:\n",
1495
+ " final_answer = action_decision[\"action_args\"][\"final_answer\"]\n",
1496
+ " i += 1\n",
1497
+ " return final_answer\n",
1498
+ "\n",
1499
+ " async def _prepare_page_for_agent(self, page: Page) -> tuple[str, str, bytes]:\n",
1500
+ " interactive_elements = await self.browser_controller.get_interactive_rects(page)\n",
1501
+ " screenshot = await self.browser_controller.get_screenshot(page)\n",
1502
+ " som_screenshot, visible_elements, elements_above, elements_below, _ = (\n",
1503
+ " add_set_of_mark(screenshot, interactive_elements, use_sequential_ids=False)\n",
1504
+ " )\n",
1505
+ " visible_elements_formatted = \"\"\n",
1506
+ " for element_id in visible_elements:\n",
1507
+ " element_data = interactive_elements[element_id]\n",
1508
+ " visible_elements_formatted += f\"{element_id}: {element_data}\\n\"\n",
1509
+ "\n",
1510
+ " page_text = await self.browser_controller.get_page_markdown(page)\n",
1511
+ " return page_text, visible_elements_formatted, som_screenshot\n",
1512
+ " async def _execute_action(self, action: str, action_args: dict, page: Page) -> str:\n",
1513
+ " if action == \"goto\":\n",
1514
+ " await self.browser_controller.visit_page(page, action_args[\"url\"])\n",
1515
+ " return f\"Visited {action_args['url']}\"\n",
1516
+ " elif action == \"click\":\n",
1517
+ " await self.browser_controller.click_id(self.context, page, action_args[\"id\"])\n",
1518
+ " return f\"Clicked {action_args['id']}\"\n",
1519
+ " elif action == \"type\":\n",
1520
+ " await self.browser_controller.fill_id(page, action_args[\"id\"], action_args[\"text\"])\n",
1521
+ " return f\"Typed {action_args['text']} into {action_args['id']}\"\n",
1522
+ " elif action == \"scroll\":\n",
1523
+ " if action_args[\"direction\"] == \"up\":\n",
1524
+ " await self.browser_controller.page_up(page)\n",
1525
+ " elif action_args[\"direction\"] == \"down\":\n",
1526
+ " await self.browser_controller.page_down(page)\n",
1527
+ " return f\"Scrolled {action_args['direction']}\"\n",
1528
+ " elif action == \"stop_action\":\n",
1529
+ " return action_args[\"final_answer\"]\n",
1530
+ " else:\n",
1531
+ " raise ValueError(f\"Invalid action: {action}\")\n",
1532
+ "\n",
1533
+ " def _get_llm_response(\n",
1534
+ " self,\n",
1535
+ " client: OpenAI, # OpenAI client\n",
1536
+ " task: str, # Task to complete\n",
1537
+ " page_text: str, # Page text\n",
1538
+ " formatted_list_of_elements: str, # Formatted list of elements\n",
1539
+ " image_data_url: str, # Image data URL\n",
1540
+ " message_history: list[dict] = [], # Message history\n",
1541
+ " model: str = \"gpt-4o\", # Model to use\n",
1542
+ " ) -> dict:\n",
1543
+ " response = client.chat.completions.create(\n",
1544
+ " model=model,\n",
1545
+ " messages=[\n",
1546
+ " *message_history,\n",
1547
+ " {\n",
1548
+ " \"role\": \"user\",\n",
1549
+ " \"content\": [\n",
1550
+ " {\n",
1551
+ " \"type\": \"text\",\n",
1552
+ " \"text\": AGENT_PROMPT.format(\n",
1553
+ " task=task,\n",
1554
+ " page_text=page_text,\n",
1555
+ " formatted_list_of_elements=formatted_list_of_elements,\n",
1556
+ " ),\n",
1557
+ " },\n",
1558
+ " {\n",
1559
+ " \"type\": \"image_url\",\n",
1560
+ " \"image_url\": {\"url\": image_data_url},\n",
1561
+ " },\n",
1562
+ " ],\n",
1563
+ " },\n",
1564
+ " ],\n",
1565
+ " )\n",
1566
+ "\n",
1567
+ " # Parse the response\n",
1568
+ " try:\n",
1569
+ " action_decision = json.loads(response.choices[0].message.content)\n",
1570
+ " except json.JSONDecodeError:\n",
1571
+ " # it starts with ```json\n",
1572
+ " response_content = response.choices[0].message.content\n",
1573
+ " response_content = response_content.replace(\"```json\", \"\").replace(\n",
1574
+ " \"```\", \"\"\n",
1575
+ " )\n",
1576
+ " action_decision = json.loads(response_content)\n",
1577
+ " except Exception as e:\n",
1578
+ " raise e\n",
1579
+ " return action_decision\n",
1580
+ "\n",
1581
+ " async def close(self) -> None:\n",
1582
+ " if self.page is not None:\n",
1583
+ " await self.page.close()\n",
1584
+ " if self.context is not None:\n",
1585
+ " await self.context.close()\n"
1586
+ ]
1587
+ },
1588
+ {
1589
+ "cell_type": "code",
1590
+ "execution_count": null,
1591
+ "metadata": {},
1592
+ "outputs": [],
1593
+ "source": [
1594
+ "from openai import OpenAI\n",
1595
+ "import os\n",
1596
+ "openai_client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
1597
+ "agent = BrowserUseAgent(openai_client)\n",
1598
+ "try:\n",
1599
+ " final_answer = await agent.execute_task(\"find the open issues assigned to husseinmozannar on the microsoft/magentic-ui repo on github\")\n",
1600
+ " print(final_answer)\n",
1601
+ "finally:\n",
1602
+ " await agent.close()"
1603
+ ]
1604
+ },
1605
+ {
1606
+ "cell_type": "markdown",
1607
+ "metadata": {},
1608
+ "source": [
1609
+ "Success! Our agent again performed the task correctly!\n",
1610
+ "\n",
1611
+ "With this tutorial, I hope to have convinced you that `magentic-ui` can help you build a browser-use agent more easily. You might be curious how to build the best browser-use agent possible given this, and we have already implemented one for you with many features that we haven't discussed previously in Magentic-UI which we will discuss next."
1612
+ ]
1613
+ },
1614
+ {
1615
+ "cell_type": "markdown",
1616
+ "metadata": {},
1617
+ "source": [
1618
+ "# Level 3: Using the WebSurfer Agent from Magentic-UI\n",
1619
+ "\n",
1620
+ "We have a reference implementation of a capable browser use agent in Magentic-UI which we call the `WebSurfer` agent. I'll show you now how to use it. \n"
1621
+ ]
1622
+ },
1623
+ {
1624
+ "cell_type": "markdown",
1625
+ "metadata": {},
1626
+ "source": [
1627
+ "\n",
1628
+ "`WebSurfer` is an AutoGen AgentChat agent built using the tools we have seen previously to complete actions autonomously on the web. We have spent a lot of time fixing many many edge cases that arise on the web to arrive at a more reliable (but not perfect) browser use agent.\n",
1629
+ "This agent builds on the [`MultimodalWebSurfer`](https://microsoft.github.io/autogen/stable/reference/python/autogen_ext.agents.web_surfer.html) agent from AutoGen that we previously developed. \n",
1630
+ "\n",
1631
+ "Let's see now how to use it!"
1632
+ ]
1633
+ },
1634
+ {
1635
+ "cell_type": "code",
1636
+ "execution_count": null,
1637
+ "metadata": {},
1638
+ "outputs": [],
1639
+ "source": [
1640
+ "\n",
1641
+ "from autogen_ext.models.openai import OpenAIChatCompletionClient\n",
1642
+ "from magentic_ui.agents import WebSurfer\n",
1643
+ "from magentic_ui.tools.playwright import (\n",
1644
+ " LocalPlaywrightBrowser,\n",
1645
+ ")\n",
1646
+ "\n",
1647
+ "browser = LocalPlaywrightBrowser(headless=False)\n",
1648
+ "\n",
1649
+ "model_client = OpenAIChatCompletionClient(model=\"gpt-4o\")\n",
1650
+ "\n",
1651
+ "web_surfer = WebSurfer(\n",
1652
+ " name=\"web_surfer\",\n",
1653
+ " model_client=model_client, # Use any client from AutoGen!\n",
1654
+ " animate_actions=True, # Set to True if you want to see the actions being animated!\n",
1655
+ " max_actions_per_step=10, # Maximum number of actions to perform before returning\n",
1656
+ " downloads_folder=\"debug\", # Where to save downloads\n",
1657
+ " debug_dir=\"debug\", # Where to save debug files and screenshots\n",
1658
+ " to_save_screenshots=False, # set to True if you want to save screenshots of the actions\n",
1659
+ " browser=browser, # Use any browser from Magentic-UI!\n",
1660
+ " multiple_tools_per_call=False, # Set to True if you want to use multiple tools per call\n",
1661
+ " json_model_output=False, # Set to True if your model does not support tool calling\n",
1662
+ ")\n",
1663
+ "await web_surfer.lazy_init()\n",
1664
+ "\n",
1665
+ "task = \"find the open issues assigned to husseinmozannar on the microsoft/magentic-ui repo on github\"\n",
1666
+ "try:\n",
1667
+ " messages = []\n",
1668
+ " async for message in web_surfer.run_stream(task=task):\n",
1669
+ " messages.append(message)\n",
1670
+ " print(message)\n",
1671
+ " print(\"########################################################\")\n",
1672
+ " print(\"Final answer:\")\n",
1673
+ " print(messages[-1].messages[-2].content)\n",
1674
+ "finally:\n",
1675
+ " await web_surfer.close()\n"
1676
+ ]
1677
+ },
1678
+ {
1679
+ "cell_type": "markdown",
1680
+ "metadata": {},
1681
+ "source": [
1682
+ "We encourage you to experiment using the sample code file [sample_web_surfer.py](https://github.com/microsoft/magentic-ui/blob/main/samples/sample_web_surfer.py) and to use the Magentic-UI application which provides a web UI to interact with the WebSurfer agent and launch multiple parallel tasks and more!\n",
1683
+ "\n",
1684
+ "Just run:\n",
1685
+ "\n",
1686
+ "```bash\n",
1687
+ "python3 -m venv .venv\n",
1688
+ "source .venv/bin/activate\n",
1689
+ "pip install magentic-ui\n",
1690
+ "# export OPENAI_API_KEY=<YOUR API KEY>\n",
1691
+ "magentic ui --port 8081\n",
1692
+ "```\n",
1693
+ "See [https://github.com/microsoft/magentic-ui](https://github.com/microsoft/magentic-ui) for the full instructions.\n",
1694
+ "\n",
1695
+ "![../img/magenticui_running.png](../img/magenticui_running.png)"
1696
+ ]
1697
+ },
1698
+ {
1699
+ "cell_type": "markdown",
1700
+ "metadata": {},
1701
+ "source": [
1702
+ "# What's next?\n",
1703
+ "\n",
1704
+ "\n"
1705
+ ]
1706
+ },
1707
+ {
1708
+ "cell_type": "markdown",
1709
+ "metadata": {},
1710
+ "source": [
1711
+ "\n",
1712
+ "## Evaluation\n",
1713
+ "\n",
1714
+ "The first thing you might be curious about is how well does the WebSurfer agent perform?\n",
1715
+ "\n",
1716
+ "In Magentic-UI, we have built a small evaluation library [magentic-ui/eval](https://github.com/microsoft/magentic-ui/tree/main/src/magentic_ui/eval) that implements popular browser-use benchmarks and makes it easy to run evals. We will be building a bit on this library and will have a tutorial on how to use it.\n",
1717
+ "\n",
1718
+ " Magentic-UI has been tested against several benchmarks when running with o4-mini: [GAIA](https://huggingface.co/datasets/gaia-benchmark/GAIA) test set (42.52%), which assesses general AI assistants across reasoning, tool use, and web interaction tasks; [AssistantBench](https://huggingface.co/AssistantBench) test set (27.60%), focusing on realistic, time-consuming web tasks; [WebVoyager](https://github.com/MinorJerry/WebVoyager) (82.2%), measuring end-to-end web navigation in real-world scenarios; and [WebGames](https://webgames.convergence.ai/) (45.5%), evaluating general-purpose web-browsing agents through interactive challenges.\n",
1719
+ "To reproduce these experimental results, please see the following [instructions](experiments/README.md).\n",
1720
+ "\n",
1721
+ "For reference, the current SOTA on WebVoyager is the [browser-use library](https://browser-use.com/posts/sota-technical-report) using GPT-4o achieving 89%. Note that the WebVoyager evaluation is not consistent across different systems as it relies on a mix of LLM-as-a-judge evaluation and human evaluation.\n",
1722
+ "\n",
1723
+ "\n",
1724
+ "## Limitations\n",
1725
+ "\n",
1726
+ "Using the Set-Of-Mark approach for building the Browser Use Agent has many limitations (note that both Magentic-UI and the Browser Use library use SoM). For instance, our agent will fail on any task that requires understanding coordinates on the screen.\n",
1727
+ "Examples:\n",
1728
+ "\n",
1729
+ "- dragging an element from position A to position B\n",
1730
+ "- drawing on the screen\n",
1731
+ "- playing web games\n",
1732
+ "\n",
1733
+ "Moreover, it will not generalize to any Computer Use task where we might not have the DOM to obtain element coordinates. Therefore, we will need to have a model that can click on specific coordinates rather than using element IDs. The [UI-Tars](https://github.com/bytedance/UI-TARS) models have such an ability, as does the latest [computer-use-preview API](https://platform.openai.com/docs/guides/tools-computer-use) from OpenAI. Another approach is to use a grounding or parsing model instead of the DOM, such as [OmniParser](https://microsoft.github.io/OmniParser/), to obtain element IDs from any GUI interface combined with a tool-calling LLM.\n",
1734
+ "\n",
1735
+ "\n",
1736
+ "Another limitation is that these agents are not *real-time*, so tasks such as video understanding or playing games become almost impossible natively, as there is a delay of multiple seconds between each agent action.\n",
1737
+ "\n",
1738
+ "## Safety\n",
1739
+ "\n",
1740
+ "Current LLMs are still very prone to adversarial attacks on the web; see these papers for how bad things can get with current models, even those tuned directly for CUA:\n",
1741
+ "\n",
1742
+ "- [Commercial LLM Agents Are Already Vulnerable to Simple Yet Dangerous Attacks\n",
1743
+ "](https://arxiv.org/html/2502.08586v1)\n",
1744
+ "- [RedTeamCUA:\n",
1745
+ "Realistic Adversarial Testing of Computer-Use Agents in\n",
1746
+ "Hybrid Web-OS Environments](https://osu-nlp-group.github.io/RedTeamCUA/)\n",
1747
+ "\n",
1748
+ "We recommend having guardrails built into the agent that allow the human to approve actions if needed. We call such guardrails \"ActionGuard\" in Magentic-UI; they allow you to define heuristics, in addition to LLM judgment, for when actions might need human approval.\n",
1749
+ "\n",
1750
+ "\n"
1751
+ ]
1752
+ },
1753
+ {
1754
+ "cell_type": "markdown",
1755
+ "metadata": {},
1756
+ "source": [
1757
+ "If you've made it this far I really appreciate you taking the time to read and hope you've enjoyed following along!"
1758
+ ]
1759
+ }
1760
+ ],
1761
+ "metadata": {
1762
+ "kernelspec": {
1763
+ "display_name": "Python 3 (ipykernel)",
1764
+ "language": "python",
1765
+ "name": "python3"
1766
+ },
1767
+ "language_info": {
1768
+ "codemirror_mode": {
1769
+ "name": "ipython",
1770
+ "version": 3
1771
+ },
1772
+ "file_extension": ".py",
1773
+ "mimetype": "text/x-python",
1774
+ "name": "python",
1775
+ "nbconvert_exporter": "python",
1776
+ "pygments_lexer": "ipython3",
1777
+ "version": "3.12.6"
1778
+ }
1779
+ },
1780
+ "nbformat": 4,
1781
+ "nbformat_minor": 2
1782
+ }
experiments/endpoint_configs/.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ config.yaml
2
+ exp_configs/*
3
+ exp_configs
experiments/endpoint_configs/config_template.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # IMPORTANT: This file is a template with default configurations.
2
+ # To use it, make a copy in the same directory and rename it to `config.yaml`
3
+ model_config_4o_openai: &client_4o_openai
4
+ provider: OpenAIChatCompletionClient
5
+ config:
6
+ model: gpt-4o-2024-08-06
7
+ max_retries: 5
8
+
9
+ orchestrator_client: *client_4o_openai
10
+ coder_client: *client_4o_openai
11
+ web_surfer_client: *client_4o_openai
12
+ file_surfer_client: *client_4o_openai
13
+ action_guard_client: *client_4o_openai
14
+ user_proxy_client: *client_4o_openai
15
+ model_client: *client_4o_openai
experiments/endpoint_configs/test_client.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import yaml
2
+ import asyncio
3
+ from autogen_core.models import ChatCompletionClient, UserMessage
4
+
5
+
6
+ async def test_chat_completion_client() -> None:
7
+ # Load the config file
8
+ print("Loading config...")
9
+ with open("config.yaml", "r") as f:
10
+ config = yaml.safe_load(f)
11
+
12
+ # Get the orchestrator client config
13
+ client_config = config.get("orchestrator_client")
14
+ print(f"Loaded client config: {client_config}")
15
+
16
+ # Initialize the client
17
+ print("Initializing client...")
18
+ client = ChatCompletionClient.load_component(client_config)
19
+
20
+ # Test a simple completion
21
+ print("Testing completion...")
22
+ response = await client.create(
23
+ messages=[UserMessage(content="Say hello", source="user")]
24
+ )
25
+ print(f"Response content: {response.content}")
26
+
27
+ await client.close()
28
+
29
+
30
+ if __name__ == "__main__":
31
+ asyncio.run(test_chat_completion_client())
experiments/eval/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ data
2
+ runs
experiments/eval/README.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Reproducing Experimental Results
2
+
3
+ Make sure to clone the repo and install Magentic-UI with the following command:
4
+ ```bash
5
+ pip install magentic-ui[eval]
6
+ ```
7
+
8
+ From the root of the repo you can run these commands to reproduce our experimental results. Note that running the full experiments may take hours and each task may cost up to $0.5 of API credits when using OpenAI models.
9
+
10
+ To evaluate an existing run or get partial results, replace "--mode run" with "--mode eval". See [experiments/eval/run.py](experiments/eval/run.py) for more information about the arguments.
11
+
12
+ The run.py script takes care of running Magentic-UI on the benchmark of choice. It will download the data into the `./data` folder at the root of the repo and store the run logs inside `runs/[SYSTEM NAME]/[DATASET NAME]/[SPLIT NAME]/[RUN ID]`. Inside this folder you'll find a folder for each task with files containing the run messages (`[TASK_ID]_messages.json`), time data (`times.json`), token usage data (`model_tokens_usage.json`), evaluation scores (`score.json`) and any screenshots (`screenshot_raw_[TIMESTAMP].png` and `screenshot_som_[TIMESTAMP].png`) or produced files. You will also find a `metrics.json` file with metrics for the entire run.
13
+
14
+
15
+ **NOTE:** Make sure to create a config file with your model client endpoints. We provide a template config file [config_template.yaml](../endpoint_configs/config_template.yaml) that you should adapt. You should copy and rename this file to `config.yaml` inside `experiments/endpoint_configs` directory.
16
+
17
+ ## WebGames
18
+
19
+ ```bash
20
+ python experiments/eval/run.py --current-dir . --dataset WebGames --split test --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run
21
+ ```
22
+
23
+ ## WebVoyager
24
+
25
+ ```bash
26
+ python experiments/eval/run.py --current-dir . --dataset WebVoyager --split webvoyager --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --web-surfer-only true --mode run
27
+ ```
28
+
29
+ ## GAIA
30
+
31
+ ### Simulated User
32
+
33
+ On the validation set we first get autonomous performance:
34
+
35
+ ```bash
36
+ python experiments/eval/run.py --current-dir . --dataset Gaia --split validation --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run
37
+ ```
38
+
39
+ Then the simulated user with a stronger model (make sure your config file is correct first).
40
+
41
+ ```bash
42
+ python experiments/eval/run.py --current-dir . --dataset Gaia --split validation --run-id 2 --simulated-user-type co-planning-and-execution --how-helpful-user-proxy no_hints --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run
43
+ ```
44
+
45
+ Then the simulated user with access to metadata.
46
+
47
+ ```bash
48
+ python experiments/eval/run.py --current-dir . --dataset Gaia --split validation --run-id 3 --simulated-user-type co-planning-and-execution --how-helpful-user-proxy soft --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run
49
+ ```
50
+
51
+ To explore the results of these runs, you can use the following scripts that generate a CSV inside the logs directory:
52
+
53
+ ```bash
54
+ python experiments/eval/explore_results.py --run-dir runs/MagenticUI_co-planning-and-execution_soft/Gaia/validation/3 --data-dir data/Gaia
55
+ ```
56
+
57
+ and
58
+
59
+ ```bash
60
+ python experiments/eval/analyze_sim_user.py --run-dir runs/MagenticUI_co-planning-and-execution_soft/Gaia/validation/3
61
+ ```
62
+
63
+ ### Test Set
64
+
65
+ ```bash
66
+ python experiments/eval/run.py --current-dir . --dataset Gaia --split test --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run
67
+ ```
68
+
69
+ You can use the [experiments/eval/prepare_for_submission.py](experiments/eval/prepare_for_submission.py) script to submit to the Gaia and AssistantBench leaderboard.
70
+
71
+ ## AssistantBench
72
+
73
+ ```bash
74
+ python experiments/eval/run.py --current-dir . --dataset AssistantBench --split test --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run
75
+ ```
experiments/eval/analyze_sim_user.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import logging
3
+ import pandas as pd
4
+ from typing import Dict, Any, Optional, Union
5
+ import argparse
6
+ import os
7
+ from datetime import datetime
8
+
9
+ from autogen_core.models import (
10
+ ChatCompletionClient,
11
+ UserMessage,
12
+ SystemMessage,
13
+ )
14
+
15
+ # Setup logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Configure model client
20
+ default_client_config = {
21
+ "provider": "OpenAIChatCompletionClient",
22
+ "config": {
23
+ "model": "gpt-4o-2024-08-06",
24
+ },
25
+ "max_retries": 5,
26
+ }
27
+
28
+
29
+ def get_model_client(
30
+ endpoint_config: Optional[Union[Dict[str, Any], None]],
31
+ ) -> ChatCompletionClient:
32
+ if endpoint_config is None:
33
+ return ChatCompletionClient.load_component(default_client_config)
34
+ return ChatCompletionClient.load_component(endpoint_config)
35
+
36
+
37
+ async def check_ground_truth_in_messages(
38
+ question: str, ground_truth: str, user_messages_str: str
39
+ ) -> int:
40
+ """
41
+ Ask the model if ground_truth is included in user_messages
42
+ Returns 1 if ground truth is included, 0 otherwise
43
+ """
44
+ prompt = f"""
45
+
46
+ I have a question: "{question}"
47
+
48
+
49
+ I have a list of messages a user sent to an AI assistant: {user_messages_str}
50
+
51
+
52
+ The question has a ground truth answer: "{ground_truth}"
53
+
54
+
55
+ Check if any of the user messages reveal directly the ground truth answer to the AI assistant.
56
+ In cases where the ground truth is found in the user messages but the message looks like a validation of the answer, then you should output 0.
57
+ Answer with ONLY 1 if the ground truth is included exactly, or ONLY 0 if it is not.
58
+ IMPORTANT: Only output 1 or 0, no other text.
59
+ """
60
+
61
+ model_client = get_model_client(None)
62
+ messages = [
63
+ SystemMessage(
64
+ content="You analyze if specific information is contained in messages."
65
+ ),
66
+ UserMessage(content=prompt, source="user"),
67
+ ]
68
+
69
+ try:
70
+ response = await model_client.create(messages=messages)
71
+ answer = response.content.strip()
72
+
73
+ # Ensure we only get 0 or 1
74
+ if answer == "1":
75
+ result = 1
76
+ else:
77
+ result = 0
78
+
79
+ await model_client.close()
80
+ return result
81
+ except Exception as e:
82
+ logger.error(f"Error calling model: {e}")
83
+ await model_client.close()
84
+ return -1
85
+
86
+
87
+ async def process_csv(csv_path: str, output_path: str) -> None:
88
+ """Process the CSV file and analyze if ground truth is in user messages"""
89
+ try:
90
+ df = pd.read_csv(csv_path)
91
+ logger.info(f"Loaded dataframe with {len(df)} rows")
92
+
93
+ # Create columns for the results
94
+ df["ground_truth_in_messages"] = None
95
+ df["trivial_ground_truth_in_messages"] = (
96
+ None # New column for trivial string match
97
+ )
98
+ df["llm_execution_count"] = 0 # New column for counting llm executions
99
+ df["llm_plan_count"] = 0 # New column for counting llm planning
100
+
101
+ for index, row in df.iterrows():
102
+ if pd.isna(row.get("ground_truth")) or pd.isna(row.get("user_messages")):
103
+ logger.warning(f"Missing data for row {index}")
104
+ continue
105
+
106
+ user_messages_str = str(row["user_messages"])
107
+ # Count llm executions in user messages
108
+ try:
109
+ messages = eval(user_messages_str)
110
+ llm_count = sum(
111
+ 1
112
+ for msg in messages
113
+ if isinstance(msg, dict)
114
+ and isinstance(msg.get("metadata"), dict)
115
+ and "user_execution_reply" in msg.get("metadata", {})
116
+ and msg["metadata"]["user_execution_reply"] == "llm"
117
+ )
118
+ plan_count = sum(
119
+ 1
120
+ for msg in messages
121
+ if isinstance(msg, dict)
122
+ and isinstance(msg.get("metadata"), dict)
123
+ and "user_plan_reply" in msg.get("metadata", {})
124
+ and msg["metadata"]["user_plan_reply"] == "llm"
125
+ )
126
+ df.at[index, "llm_execution_count"] = llm_count
127
+ df.at[index, "llm_plan_count"] = plan_count
128
+ except Exception as e:
129
+ logger.warning(
130
+ f"Could not parse messages for task {row.get('task_id', index)}: {e}"
131
+ )
132
+ df.at[index, "llm_execution_count"] = 0
133
+ df.at[index, "llm_plan_count"] = 0
134
+
135
+ answer = str(row.get("answer", "")).strip().lower()
136
+ # if answer == "unable to determine":
137
+ # df.at[index, "llm_execution_count"] = max(1, df.at[index, "llm_execution_count"])
138
+
139
+ logger.info(f"Processing task {row.get('task_id', index)}")
140
+ question = str(row["question"])
141
+ ground_truth = str(row["ground_truth"])
142
+ actual_user_messages = eval(user_messages_str)
143
+ actual_user_messages_str = ""
144
+ for msg in actual_user_messages:
145
+ actual_user_messages_str += f"{msg['content']}\n"
146
+ trivial_result = int(ground_truth in actual_user_messages_str)
147
+ df.at[index, "trivial_ground_truth_in_messages"] = trivial_result
148
+ result = await check_ground_truth_in_messages(
149
+ question, ground_truth, actual_user_messages_str
150
+ )
151
+ df.at[index, "ground_truth_in_messages"] = result
152
+ logger.info(
153
+ f"Task {row.get('task_id', index)}: result = {result}, llm executions = {df.at[index, 'llm_execution_count']}, llm planning = {df.at[index, 'llm_plan_count']}"
154
+ )
155
+
156
+ # Save results to new CSV
157
+ df.to_csv(output_path, index=False)
158
+ logger.info(f"Results saved to {output_path}")
159
+
160
+ # Calculate summary statistics (ALL TASKS)
161
+ counts = df["ground_truth_in_messages"].value_counts()
162
+ trivial_counts = df["trivial_ground_truth_in_messages"].value_counts()
163
+ total_valid = counts.sum()
164
+ trivial_total_valid = trivial_counts.sum()
165
+ percentage_included = (
166
+ (counts.get(1, 0) / total_valid * 100) if total_valid > 0 else 0
167
+ )
168
+ trivial_percentage_included = (
169
+ (trivial_counts.get(1, 0) / trivial_total_valid * 100)
170
+ if trivial_total_valid > 0
171
+ else 0
172
+ )
173
+
174
+ logger.info(
175
+ f"Summary (ALL TASKS): Ground truth included in {counts.get(1, 0)}/{total_valid} cases ({percentage_included:.2f}%)"
176
+ )
177
+ logger.info(
178
+ f"Trivial string match (ALL TASKS): Ground truth included in {trivial_counts.get(1, 0)}/{trivial_total_valid} cases ({trivial_percentage_included:.2f}%)"
179
+ )
180
+
181
+ mask_not_unable = (
182
+ df["answer"].astype(str).str.strip().str.lower() != "unable to determine"
183
+ )
184
+ df_not_unable = df[mask_not_unable]
185
+ # Ensure these are pandas Series for value_counts
186
+ gt_series = pd.Series(df_not_unable["ground_truth_in_messages"])
187
+ trivial_series = pd.Series(df_not_unable["trivial_ground_truth_in_messages"])
188
+ counts_not_unable = gt_series.value_counts()
189
+ trivial_counts_not_unable = trivial_series.value_counts()
190
+ total_valid_not_unable = counts_not_unable.sum()
191
+ trivial_total_valid_not_unable = trivial_counts_not_unable.sum()
192
+ percentage_included_not_unable = (
193
+ (counts_not_unable.get(1, 0) / total_valid_not_unable * 100)
194
+ if total_valid_not_unable > 0
195
+ else 0
196
+ )
197
+ trivial_percentage_included_not_unable = (
198
+ (trivial_counts_not_unable.get(1, 0) / trivial_total_valid_not_unable * 100)
199
+ if trivial_total_valid_not_unable > 0
200
+ else 0
201
+ )
202
+ logger.info(
203
+ f"Summary (EXCLUDING 'unable to determine'): Ground truth included in {counts_not_unable.get(1, 0)}/{total_valid_not_unable} cases ({percentage_included_not_unable:.2f}%)"
204
+ )
205
+ logger.info(
206
+ f"Trivial string match (EXCLUDING 'unable to determine'): Ground truth included in {trivial_counts_not_unable.get(1, 0)}/{trivial_total_valid_not_unable} cases ({trivial_percentage_included_not_unable:.2f}%)"
207
+ )
208
+
209
+ # Add summary statistics for llm executions
210
+ llm_stats = df["llm_execution_count"].describe()
211
+ tasks_with_execution = (df["llm_execution_count"] > 0).sum()
212
+ total_tasks = len(df)
213
+
214
+ # Get statistics for tasks with at least 1 execution
215
+ tasks_with_execution_df = df[df["llm_execution_count"] > 0]
216
+ tasks_with_planning = (df["llm_plan_count"] > 0).sum()
217
+ median_when_used = tasks_with_execution_df["llm_execution_count"].median()
218
+ mean_when_used = tasks_with_execution_df["llm_execution_count"].mean()
219
+
220
+ logger.info("\nLLM Execution Statistics:")
221
+ logger.info(
222
+ f"Tasks with at least 1 execution: {tasks_with_execution}/{total_tasks} ({(tasks_with_execution/total_tasks)*100:.2f}%)"
223
+ )
224
+ logger.info(
225
+ f"Tasks with at least 1 planning: {tasks_with_planning}/{total_tasks} ({(tasks_with_planning/total_tasks)*100:.2f}%)"
226
+ )
227
+ logger.info("\nWhen LLM is used at least once:")
228
+ logger.info(f" - Median executions: {median_when_used:.2f}")
229
+ logger.info(f" - Mean executions: {mean_when_used:.2f}")
230
+ logger.info("\nOverall statistics:")
231
+ logger.info(f"Mean executions per task: {llm_stats['mean']:.2f}")
232
+ logger.info(f"Median executions per task: {llm_stats['50%']:.2f}")
233
+ logger.info(f"Max executions in a task: {llm_stats['max']:.0f}")
234
+ logger.info(f"Min executions in a task: {llm_stats['min']:.0f}")
235
+
236
+ except Exception as e:
237
+ logger.error(f"Error processing CSV: {e}")
238
+
239
+
240
+ def main():
241
+ parser = argparse.ArgumentParser(description="Analyze simulated user data CSV.")
242
+ parser.add_argument(
243
+ "--run-dir", type=str, required=True, help="Path to the run directory."
244
+ )
245
+ args = parser.parse_args()
246
+
247
+ run_dir = args.run_dir
248
+ input_csv = os.path.join(run_dir, "results.csv")
249
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
250
+ output_csv = os.path.join(run_dir, f"sim_user_{timestamp}.csv")
251
+
252
+ # Run the analysis
253
+ asyncio.run(process_csv(input_csv, output_csv))
254
+
255
+
256
+ if __name__ == "__main__":
257
+ main()
experiments/eval/explore_results.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import pandas as pd
4
+ import argparse
5
+ from typing import Dict, Any
6
+ from magentic_ui.eval.benchmarks.gaia.gaia import GaiaBenchmark
7
+
8
+
9
def get_run_results_df(
    run_dir: str, data_dir: str, dataset_name: str = "Gaia"
) -> pd.DataFrame:
    """
    Process a run directory and create a DataFrame containing all task results and ground truth.

    Args:
        run_dir (str): Path to the run directory containing task subdirectories
        data_dir (str): Path to the benchmark data directory used to load ground truth
        dataset_name (str): Benchmark name; only "Gaia" is currently supported

    Returns:
        pd.DataFrame: One row per task with answer, messages, score, and
            duration. Rows without a score are dropped. Also written to
            ``<run_dir>/results.csv`` as a side effect.

    Raises:
        ValueError: If dataset_name is not "Gaia"
    """
    # Initialize benchmark
    if dataset_name == "Gaia":
        benchmark = GaiaBenchmark(data_dir=data_dir)
    else:
        raise ValueError(f"Invalid dataset name: {dataset_name}")
    # Download the dataset (only needed once)
    benchmark.download_dataset()
    # Load it into memory
    benchmark.load_dataset()

    # Initialize lists to store data
    data = []

    # Process each task directory; directory names double as task ids.
    for task_dir in os.listdir(run_dir):
        task_path = os.path.join(run_dir, task_dir)

        # Skip if not a directory or if it's a log file
        if not os.path.isdir(task_path) or task_dir.startswith("."):
            continue

        task_data: Dict[str, Any] = {"task_id": task_dir}

        # Get ground truth from benchmark
        if task_dir in benchmark.tasks:
            task_data["ground_truth"] = benchmark.tasks[task_dir].ground_truth
            task_data["question"] = benchmark.tasks[task_dir].question
            task_data["difficulty"] = benchmark.tasks[task_dir].difficulty
            task_data["metadata"] = benchmark.tasks[task_dir].metadata

        # Read answer file (``<task_id>_answer.json``); absent files simply
        # leave the column as NaN in the final DataFrame.
        answer_file = os.path.join(task_path, f"{task_dir}_answer.json")
        if os.path.exists(answer_file):
            with open(answer_file, "r") as f:
                task_data["answer"] = json.load(f)["answer"]

        # Read messages file
        messages_file = os.path.join(task_path, f"{task_dir}_messages.json")
        if os.path.exists(messages_file):
            with open(messages_file, "r") as f:
                task_data["messages"] = json.load(f)
            # Keep only messages produced by the (simulated) user for analysis.
            user_messages = [
                message
                for message in task_data["messages"]
                if message["source"] == "user_proxy"
            ]
            task_data["user_messages"] = user_messages

        # Read score file
        score_file = os.path.join(task_path, "score.json")
        if os.path.exists(score_file):
            with open(score_file, "r") as f:
                score = json.load(f)
                task_data["score"] = score["score"]

        # Read times file
        times_file = os.path.join(task_path, "times.json")
        if os.path.exists(times_file):
            with open(times_file, "r") as f:
                task_data["duration"] = json.load(f)["duration"]

        data.append(task_data)
    df = pd.DataFrame(data)
    # Filter out rows where score is NaN (tasks that never finished scoring)
    df = df.dropna(subset=["score"])

    # Save DataFrame to CSV
    output_csv = os.path.join(run_dir, "results.csv")
    df.to_csv(output_csv, index=False)
    print(f"Results DataFrame saved to {output_csv}")

    return df
93
+
94
+
95
def get_output_prefix(run_dir: str) -> str:
    """Build a filename prefix by joining the last (up to) 4 components of run_dir."""
    components = os.path.normpath(run_dir).split(os.sep)
    # Slicing with [-4:] naturally handles paths shorter than 4 components.
    return "_".join(components[-4:])
101
+
102
+
103
def main():
    """
    CLI entry point: aggregate task results for a run, dump all/failed task
    records to JSON files, and print summary accuracy statistics.
    """
    parser = argparse.ArgumentParser(
        description="Process run results and analyze tasks."
    )
    parser.add_argument(
        "--run-dir",
        type=str,
        required=True,
        help="Path to the run directory containing task subdirectories",
    )
    parser.add_argument(
        "--data-dir", type=str, required=True, help="Path to the data directory"
    )
    # First parse only the known args so run_dir can seed the default filenames.
    args, unknown = parser.parse_known_args()

    # Generate default filenames based on run_dir
    prefix = get_output_prefix(args.run_dir)
    parser.add_argument(
        "--failed_output",
        type=str,
        default=f"{args.run_dir}/failed_tasks_{prefix}.json",
        help="Output file path for failed tasks",
    )
    parser.add_argument(
        "--all_output",
        type=str,
        default=f"{args.run_dir}/all_tasks_{prefix}.json",
        help="Output file path for all tasks",
    )

    args = parser.parse_args()  # Parse all arguments

    df = get_run_results_df(args.run_dir, args.data_dir)

    # Guard: an empty run directory would cause division-by-zero below.
    if len(df) == 0:
        print("No scored tasks found in run directory; nothing to analyze.")
        return

    # Flag 'unable to determine' answers so they can be accounted separately.
    unable_str = "Unable to determine"
    df["unable_to_determine"] = (
        df["answer"].astype(str).str.strip().str.contains(unable_str)
    )
    unable_count = df["unable_to_determine"].sum()

    # Accuracy excluding 'unable to determine'
    df_excl = df[~df["unable_to_determine"]]
    if len(df_excl) > 0:
        acc_excl = (df_excl["score"] > 0).mean()
    else:
        acc_excl = float("nan")

    # Accuracy counting 'unable to determine' as correct (optimistic bound)
    acc_unable_correct = ((df["score"] > 0) | df["unable_to_determine"]).mean()

    # Collect per-task records for the JSON dumps.
    all_tasks = []
    failed_tasks = []

    for index, row in df.iterrows():
        task_info = {
            "task_id": row["task_id"],
            "question": row["question"],
            "answer": row["answer"],
            "ground_truth": row["ground_truth"],
            "score": row["score"],
            "difficulty": row["difficulty"],
            "duration": row.get("duration", None),
            "messages": row["messages"],
        }
        all_tasks.append(task_info)

        if row["score"] == 0:
            failed_tasks.append(task_info)

    # Write all tasks to a log file
    with open(args.all_output, "w") as log_file:
        json.dump(all_tasks, log_file, indent=4, ensure_ascii=False)
    print(f"All tasks written to {args.all_output}")

    # Write failed tasks to a log file
    with open(args.failed_output, "w") as log_file:
        json.dump(failed_tasks, log_file, indent=4, ensure_ascii=False)
    print(f"Failed tasks written to {args.failed_output}")

    # Print summary statistics
    print("\nSummary:")
    print(f"Total tasks: {len(all_tasks)}")
    print(f"Failed tasks: {len(failed_tasks)}")
    print(f"Unable to determine: {unable_count}")
    print(f"Rate of unable to determine: {unable_count / len(df) * 100:.2f}%")
    print(
        f"Success rate: {((len(all_tasks) - len(failed_tasks)) / len(all_tasks) * 100):.2f}%"
    )
    print(f"Accuracy (excluding 'unable to determine'): {acc_excl*100:.2f}%")
    print(
        f"Accuracy (counting 'unable to determine' as correct): {acc_unable_correct*100:.2f}%"
    )


if __name__ == "__main__":
    main()
experiments/eval/plot_results.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib.pyplot as plt
2
+ import matplotlib.style as style
3
+ from matplotlib.ticker import PercentFormatter
4
+ import os
5
+ import argparse
6
+ import numpy as np
7
+
8
+
9
def create_accuracy_plot(save_path=None, save_dir=None):
    """
    Create a bar chart comparing task accuracy across systems, with 95%
    binomial confidence intervals.

    The model names, accuracies, and sample size are hard-coded below from
    experiment results; edit the "Data" section to update the figure.

    Parameters:
    -----------
    save_path : str, optional
        Filename to save the figure. If None, the figure is not saved.
    save_dir : str, optional
        Directory to save the figure. If provided, the directory will be created
        if it doesn't exist. Default is current directory if save_path is provided.

    Returns:
    --------
    fig, ax : tuple
        Figure and axes objects for further customization if needed.
    """
    # Global matplotlib styling for this figure.
    style.use("seaborn-v0_8-whitegrid")
    plt.rcParams["font.family"] = "sans-serif"
    plt.rcParams["font.sans-serif"] = ["Arial", "DejaVu Sans"]
    plt.rcParams["font.size"] = 16
    plt.rcParams["axes.labelsize"] = 16
    plt.rcParams["axes.titlesize"] = 17
    plt.rcParams["xtick.labelsize"] = 12
    plt.rcParams["ytick.labelsize"] = 12
    plt.rcParams["legend.fontsize"] = 12

    # Data
    models = [
        "Magentic-One",
        "Magentic-UI\n(autonomous)",
        "Magentic-UI +\nSimulated User\n(smarter model)",
        "Magentic-UI +\nSimulated User\n(side-information)",
        "Human",
    ]
    accuracy = [33.72, 30.2, 42.6, 51.9, 92]
    sample_size = 162

    # Calculate 95% confidence intervals for each accuracy
    # (normal approximation to the binomial proportion).
    z = 1.96  # for 95% confidence
    accuracy_frac = np.array(accuracy) / 100.0
    ci_half_width = (
        z * np.sqrt(accuracy_frac * (1 - accuracy_frac) / sample_size) * 100
    )  # convert back to percent

    # Create figure and axis with adjusted figsize for more horizontal space
    fig, ax = plt.subplots(figsize=(9, 6))

    # Custom colors as specified
    dark_magenta = "#8B008B"  # Darker magenta for Magentic-One
    grey = "#808080"  # Grey for Magentic-UI + Simulated Human
    beige = "#F5F5DC"  # Beige for Human

    colors = [grey, dark_magenta, dark_magenta, dark_magenta, beige]
    # Hatch patterns distinguish the two simulated-user variants, which share
    # the same bar color.
    hatches = [
        "",
        "",
        "///",
        "xx",
        "",
    ]

    # Create custom x positions for more space between bars
    x = np.arange(len(models)) * 2

    # Create separate bars for each model (one bar per call so each gets its
    # own legend entry, hatch, and error bar).
    bars = []
    for i, (model, acc) in enumerate(zip(models, accuracy)):
        bar = ax.bar(
            x[i],
            acc,
            color=colors[i],
            width=1,
            edgecolor="black",
            linewidth=0.8,
            label=model,
            hatch=hatches[i],
            yerr=ci_half_width[i],
            capsize=8,
        )
        bars.extend(bar)

    # Set x-tick positions and labels
    ax.set_xticks(x)
    ax.set_xticklabels(models, rotation=0, ha="center")
    # Configure the axes
    ax.set_ylabel("Accuracy (%)", fontweight="bold")
    ax.set_ylim(0, 100)  # Set y-axis from 0 to 100%
    ax.yaxis.set_major_formatter(PercentFormatter())

    # Add grid for y-axis only and put it behind the bars
    ax.yaxis.grid(True, linestyle="--", alpha=0.7)
    ax.set_axisbelow(True)

    # Remove top and right spines
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    # Make left and bottom spines thicker
    ax.spines["left"].set_linewidth(1.5)
    ax.spines["bottom"].set_linewidth(1.5)

    # Add legend inside the plot
    legend = ax.legend(
        loc="upper left", frameon=True, framealpha=0.9, edgecolor="lightgray"
    )
    legend.get_title().set_fontweight("bold")

    # Add some padding to the x-axis labels
    plt.xticks(rotation=0, ha="center")

    # Adjust bottom margin to ensure labels fit
    plt.subplots_adjust(bottom=0.15)

    plt.tight_layout()

    # Save the figure in high resolution if path provided; both a PDF and a
    # PNG are written by swapping the extension of the requested filename.
    if save_path:
        if save_dir:
            # Create directory if it doesn't exist
            os.makedirs(save_dir, exist_ok=True)
            full_path = os.path.join(save_dir, save_path)
        else:
            full_path = save_path
        # save as pdf
        plt.savefig(full_path.replace(".png", ".pdf"), dpi=600, bbox_inches="tight")
        # save as png
        plt.savefig(full_path.replace(".pdf", ".png"), dpi=600, bbox_inches="tight")
        print(
            f"Plot saved to: {os.path.abspath(full_path.replace('.png', '.pdf'))} and {os.path.abspath(full_path.replace('.pdf', '.png'))}"
        )

    return fig, ax
140
+
141
+
142
if __name__ == "__main__":
    # Parse CLI options, then render and save the comparison figure.
    cli = argparse.ArgumentParser(description="plot experimental results")
    cli.add_argument(
        "--save-dir",
        "-d",
        type=str,
        default="plots",
        help="Directory to save the plot (default: plots)",
    )
    options = cli.parse_args()

    figure, axes = create_accuracy_plot(
        save_path="model_accuracy_comparison.png", save_dir=options.save_dir
    )
experiments/eval/prepare_for_submission.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ from typing import Dict, Any, List
5
+
6
+
7
def load_questions_gaia(metadata_path: str) -> Dict[str, str]:
    """Load a mapping of task_id -> question text from a Gaia metadata JSONL file."""
    with open(metadata_path, "r") as f:
        records = [json.loads(line) for line in f]
    # Later duplicates overwrite earlier ones, matching dict-assignment order.
    return {record["task_id"]: record["Question"] for record in records}
15
+
16
+
17
def load_questions_assistantbench(metadata_path: str) -> Dict[str, str]:
    """Load a mapping of id -> task text from an AssistantBench metadata JSONL file."""
    with open(metadata_path, "r") as f:
        records = [json.loads(line) for line in f]
    # Later duplicates overwrite earlier ones, matching dict-assignment order.
    return {record["id"]: record["task"] for record in records}
25
+
26
+
27
def prepare_for_submission_gaia(base_dir: str, metadata_path: str) -> None:
    """
    Prepare Gaia model answers for submission by aggregating answers and
    questions into a ``model_answers.jsonl`` file inside ``base_dir``.

    Args:
        base_dir (str): Directory containing one subdirectory per task, each
            with a ``<task_id>_answer.json`` file.
        metadata_path (str): Path to the Gaia metadata JSONL file providing
            the question text for every task_id.
    """
    questions = load_questions_gaia(metadata_path)
    task_ids = [
        d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))
    ]
    results: List[Dict[str, Any]] = []
    found_task_ids = set()
    for task_id in task_ids:
        answer_path = os.path.join(base_dir, task_id, f"{task_id}_answer.json")
        if os.path.exists(answer_path):
            with open(answer_path, "r") as f:
                data = json.load(f)
            answer = data.get("answer", "")
            # The leaderboard treats an empty string as "no answer".
            if answer == "Unable to determine":
                answer = ""
            question = questions.get(task_id, "")
            results.append(
                {
                    "task_id": task_id,
                    "question": question,
                    "model_answer": answer,
                    "reasoning_trace": "Reasoning trace not available",
                }
            )
            found_task_ids.add(task_id)
    # Add placeholder rows for tasks present in metadata but missing a run.
    for task_id, question in questions.items():
        if task_id not in found_task_ids:
            results.append(
                {
                    "task_id": task_id,
                    "question": question,
                    # Bug fix: was "answer", which broke the uniform schema --
                    # the GAIA submission format expects "model_answer" on
                    # every row.
                    "model_answer": "",
                    "reasoning_trace": "Reasoning trace not available",
                }
            )
    # Write to model_answers.jsonl in base_dir (one JSON object per line)
    output_file = os.path.join(base_dir, "model_answers.jsonl")
    with open(output_file, "w") as f:
        for item in results:
            f.write(json.dumps(item) + "\n")
69
+
70
+
71
def prepare_for_submission_assistantbench(base_dir: str, metadata_path: str) -> None:
    """Prepare AssistantBench model answers for submission by aggregating answers and questions into a JSONL file."""
    questions = load_questions_assistantbench(metadata_path)
    results: List[Dict[str, Any]] = []
    found_ids = set()

    for entry in os.listdir(base_dir):
        if not os.path.isdir(os.path.join(base_dir, entry)):
            continue
        answer_path = os.path.join(base_dir, entry, f"{entry}_answer.json")
        if not os.path.exists(answer_path):
            continue
        with open(answer_path, "r") as f:
            data = json.load(f)
        # Expecting {"id": ..., "answer": ...}; fall back to the folder name.
        record_id = data.get("id", entry)
        model_answer = data.get("answer", "")
        if model_answer in ("Unable to determine", "None"):
            model_answer = ""
        results.append(
            {
                "id": record_id,
                "answer": model_answer,
            }
        )
        found_ids.add(record_id)

    # Emit empty answers for metadata ids with no corresponding run output.
    for record_id in questions:
        if record_id not in found_ids:
            results.append(
                {
                    "id": record_id,
                    "answer": "",
                }
            )

    # Write to model_answers.jsonl in base_dir (one JSON object per line).
    output_file = os.path.join(base_dir, "model_answers.jsonl")
    with open(output_file, "w") as f:
        for item in results:
            f.write(json.dumps(item) + "\n")
113
+
114
+
115
if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="Prepare model answers for submission.")
    cli.add_argument("base_dir", help="Base directory containing task folders.")
    cli.add_argument("--metadata", default="", help="Path to metadata.jsonl file.")
    cli.add_argument("--dataset", default="Gaia", help="Dataset name.")
    parsed = cli.parse_args()

    # Dispatch to the dataset-specific preparation routine.
    handlers = {
        "Gaia": prepare_for_submission_gaia,
        "AssistantBench": prepare_for_submission_assistantbench,
    }
    if parsed.dataset not in handlers:
        raise ValueError(f"Dataset {parsed.dataset} not supported.")
    handlers[parsed.dataset](parsed.base_dir, parsed.metadata)
experiments/eval/run.py ADDED
@@ -0,0 +1,276 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import yaml
3
+ import argparse
4
+ import os
5
+ import datetime
6
+ from typing import Optional, Dict, Any, Callable
7
+ from magentic_ui.eval.core import run_evaluate_benchmark_func, evaluate_benchmark_func
8
+ from systems.magentic_ui_sim_user_system import MagenticUISimUserSystem
9
+ from magentic_ui.eval.systems import LLMSystem
10
+ from magentic_ui.eval.benchmarks import WebVoyagerBenchmark
11
+ from magentic_ui.eval.benchmark import Benchmark
12
+ from autogen_core.models import ChatCompletionClient
13
+
14
+
15
def save_experiment_args(args: argparse.Namespace, system_name: str) -> None:
    """
    Persist the experiment's CLI arguments (plus the relevant LLM client
    config sections) to a timestamped JSON file in the run directory.

    Args:
        args (argparse.Namespace): The arguments namespace containing experiment parameters.
        system_name (str): The name of the system being evaluated.
    """
    # Mirror the runs/<system>/<dataset>/<split>/<run_id> layout used in core.py.
    save_dir = os.path.join(
        args.current_dir,
        "runs",
        system_name,
        args.dataset,
        args.split or "all_benchmark",
        str(args.run_id),
    )
    os.makedirs(save_dir, exist_ok=True)

    record = vars(args).copy()

    # Attach only the LLM client sections of the config, if one was provided.
    if args.config and os.path.exists(args.config):
        loaded = load_config(args.config)
        if loaded is not None:
            client_keys = [
                "orchestrator_client",
                "web_surfer_client",
                "coder_client",
                "file_surfer_client",
                "user_proxy_client",
            ]
            record["client_configs"] = {
                key: loaded.get(key) for key in client_keys if key in loaded
            }
            record["config_path"] = os.path.abspath(args.config)

    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = os.path.join(save_dir, f"args_{stamp}.json")
    with open(destination, "w") as f:
        json.dump(record, f, indent=4)

    print(f"Experiment args saved to {destination}")
61
+
62
+
63
def load_config(config_path: Optional[str]) -> Optional[Dict[str, Any]]:
    """
    Load configuration from either YAML or JSON file.

    Args:
        config_path (Optional[str]): Path to the configuration file (YAML or JSON).

    Returns:
        Optional[Dict[str, Any]]: The loaded configuration as a dictionary, or None if not found.
    """
    if config_path is None:
        return None

    is_yaml = config_path.endswith((".yml", ".yaml"))
    with open(config_path, "r") as handle:
        if not is_yaml:
            # Any non-YAML extension is treated as JSON.
            return json.load(handle)
        loaded = yaml.safe_load(handle)
    # Empty YAML documents parse to None; normalize falsy results to None.
    return loaded if loaded else None
82
+
83
+
84
def run_system_evaluation(
    args: argparse.Namespace,
    system_constructor: Any,
    system_name: str,
    config: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Common function to run system evaluation to avoid code duplication.

    Dispatches to ``evaluate_benchmark_func`` when ``args.mode == "eval"``
    (score existing runs) or ``run_evaluate_benchmark_func`` otherwise
    (execute tasks, then score).

    Args:
        args (argparse.Namespace): The arguments namespace containing experiment parameters.
        system_constructor (Any): The system instance or constructor to evaluate.
        system_name (str): The name of the system being evaluated.
        config (Optional[Dict[str, Any]]): Optional configuration dictionary.
            NOTE(review): currently unused in this function body -- confirm
            whether it should be forwarded to the runner.
    """
    # WebVoyager needs a custom benchmark factory because its GPT-based
    # evaluator requires its own model client; other datasets are resolved by
    # name inside the core helpers (benchmark_constructor stays None).
    benchmark_constructor: Optional[Callable[..., Benchmark]] = None
    if args.dataset == "WebVoyager":
        # Download the dataset (only needed once)
        client = ChatCompletionClient.load_component(
            {
                "provider": "OpenAIChatCompletionClient",
                "config": {
                    "model": "gpt-4o-2024-08-06",
                },
                "max_retries": 10,
            }
        )

        def create_benchmark(data_dir="WebVoyager", name="WebVoyager"):
            # Factory handed to the core runner; `name` is accepted for
            # interface compatibility but not used here.
            benchmark = WebVoyagerBenchmark(
                data_dir=data_dir,
                eval_method="gpt_eval",
                model_client=client,
            )
            return benchmark

        benchmark_constructor = create_benchmark
    # Load it into memory
    if args.mode == "eval":
        # Score already-produced run outputs without re-running tasks.
        evaluate_benchmark_func(
            benchmark_name=args.dataset,
            benchmark_constructor=benchmark_constructor,
            system_name=system_name,
            parallel=args.parallel,
            benchmark_dir=args.current_dir,
            runs_dir=args.current_dir,
            split=args.split,
            run_id=args.run_id,
            system_constructor=system_constructor,
            redo_eval=args.redo_eval,
        )
    else:
        # Run the benchmark tasks and evaluate them in one pass. A subsample
        # ratio >= 1 means "use the full dataset" (passed as None).
        run_evaluate_benchmark_func(
            benchmark_name=args.dataset,
            benchmark_constructor=benchmark_constructor,
            system_name=system_name,
            parallel=args.parallel,
            benchmark_dir=args.current_dir,
            runs_dir=args.current_dir,
            split=args.split,
            run_id=args.run_id,
            system_constructor=system_constructor,
            subsample=args.subsample if args.subsample < 1 else None,
            redo_eval=args.redo_eval,
        )
149
+
150
+
151
def run_system_sim_user(args: argparse.Namespace, system_name: str) -> None:
    """
    Build the system under evaluation (LLM baseline or Magentic-UI with a
    simulated user) and hand it off to the shared evaluation runner.

    Args:
        args (argparse.Namespace): The arguments namespace containing experiment parameters.
        system_name (str): The name of the system being evaluated.
    """
    config = load_config(args.config)

    def _client(section: str):
        # Pull one named client section, tolerating a missing config file.
        return config.get(section) if config else None

    if system_name == "LLM":
        # Use LLMSystem for LLM-based evaluations
        system = LLMSystem(
            system_name=system_name,
            endpoint_config=_client("model_client"),
        )
    else:
        system = MagenticUISimUserSystem(
            simulated_user_type=args.simulated_user_type,
            endpoint_config_orch=_client("orchestrator_client"),
            endpoint_config_websurfer=_client("web_surfer_client"),
            endpoint_config_coder=_client("coder_client"),
            endpoint_config_file_surfer=_client("file_surfer_client"),
            endpoint_config_user_proxy=_client("user_proxy_client"),
            web_surfer_only=args.web_surfer_only,
            how_helpful_user_proxy=args.how_helpful_user_proxy,
            dataset_name=args.dataset,
        )

    run_system_evaluation(args, system, system_name, config)
183
+
184
+
185
+ def main() -> None:
186
+ """
187
+ Main entry point for running or evaluating the Magentic-UI system on benchmarks.
188
+ Parses command-line arguments and dispatches to the appropriate system runner.
189
+ """
190
+ parser = argparse.ArgumentParser(
191
+ description="Run or evaluate Magentic-UI system on benchmarks"
192
+ )
193
+ parser.add_argument(
194
+ "--mode",
195
+ choices=["run", "eval"],
196
+ default="run",
197
+ help="Mode to run: 'run' for running benchmarks, 'eval' for evaluation",
198
+ )
199
+ parser.add_argument(
200
+ "--current-dir", default=os.getcwd(), help="Current working directory"
201
+ )
202
+ parser.add_argument("--split", default="validation-1", help="Dataset split to use")
203
+ parser.add_argument("--dataset", default="Gaia", help="Dataset name")
204
+ parser.add_argument(
205
+ "--config", required=False, help="Path to endpoint configuration file for LLMs"
206
+ )
207
+ parser.add_argument(
208
+ "--run-id", type=int, default=1, help="Run ID for the experiment"
209
+ )
210
+ parser.add_argument(
211
+ "--parallel", type=int, default=1, help="Number of parallel processes to use"
212
+ )
213
+ parser.add_argument(
214
+ "--subsample",
215
+ type=float,
216
+ default=1,
217
+ help="Subsample ratio for the dataset (only used in run mode)",
218
+ )
219
+ parser.add_argument(
220
+ "--simulated-user-type",
221
+ type=str,
222
+ default="none",
223
+ help="Type of simulated user (co-planning, co-execution, co-planning-and-execution, dummy, none)",
224
+ )
225
+ parser.add_argument(
226
+ "--how-helpful-user-proxy",
227
+ type=str,
228
+ default="soft",
229
+ help="How helpful the user proxy should be (strict, soft, no_hints)",
230
+ )
231
+
232
+ parser.add_argument(
233
+ "--user-messages-data",
234
+ type=str,
235
+ help="Path to user messages data CSV file",
236
+ )
237
+ parser.add_argument(
238
+ "--system-type",
239
+ type=str,
240
+ default="MagenticUI",
241
+ choices=["MagenticUI", "magentic-ui-sim-user", "LLM"],
242
+ help="Type of system to run",
243
+ )
244
+ parser.add_argument(
245
+ "--web-surfer-only",
246
+ type=bool,
247
+ default=False,
248
+ help="Run only the web surfer agent",
249
+ )
250
+ parser.add_argument(
251
+ "--redo-eval",
252
+ action="store_true",
253
+ default=False,
254
+ help="Redo evaluation even if results exist (default: False)",
255
+ )
256
+
257
+ args = parser.parse_args()
258
+
259
+ # Determine system name based on arguments
260
+
261
+ system_name = args.system_type
262
+
263
+ if args.simulated_user_type != "none":
264
+ system_name += f"_{args.simulated_user_type}_{args.how_helpful_user_proxy}"
265
+ if args.web_surfer_only:
266
+ system_name += "_web_surfer_only"
267
+
268
+ # Save experiment args
269
+ save_experiment_args(args, system_name)
270
+
271
+ # Run the appropriate system
272
+ run_system_sim_user(args, system_name)
273
+
274
+
275
+ if __name__ == "__main__":
276
+ main()
experiments/eval/sample_eval_systems.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from autogen_core.models import ChatCompletionClient
2
+ from systems import MagenticUIAutonomousSystem
3
+ from systems.magentic_one_system import MagenticOneSystem
4
+ from magentic_ui.eval.benchmarks import WebVoyagerBenchmark
5
+ import os
6
+
7
def test_magentic_ui_system():
    """Smoke-test MagenticUIAutonomousSystem on a single WebVoyager task."""
    client_config = {
        "provider": "OpenAIChatCompletionClient",
        "config": {
            "model": "gpt-4o-2024-08-06",
        },
        "max_retries": 10,
    }

    system = MagenticUIAutonomousSystem(
        endpoint_config_orch=client_config,
        endpoint_config_websurfer=client_config,
        endpoint_config_coder=client_config,
        endpoint_config_file_surfer=client_config,
        use_local_browser=True,
        web_surfer_only=True,
    )

    # The benchmark's GPT-based evaluator needs its own model client.
    eval_client = ChatCompletionClient.load_component(client_config)
    benchmark = WebVoyagerBenchmark(
        data_dir="WebVoyager",
        eval_method="gpt_eval",
        model_client=eval_client,
    )
    benchmark.download_dataset()
    benchmark.load_dataset()

    sample_task = benchmark.tasks["Allrecipes--0"]
    print(sample_task)

    out_dir = "test_output_magentic_ui"
    os.makedirs(out_dir, exist_ok=True)
    answer = system.get_answer(
        task_id="Allrecipes--0",
        task=sample_task,
        output_dir=out_dir,
    )
    print(answer)
    score = benchmark.evaluator(sample_task, answer)
    print(score)
+
46
+
47
def test_magentic_one_system():
    """Smoke-test MagenticOneSystem on a single WebVoyager task."""
    client_config = {
        "provider": "OpenAIChatCompletionClient",
        "config": {
            "model": "gpt-4o-2024-08-06",
        },
        "max_retries": 10,
    }

    system = MagenticOneSystem(
        model_client_config=client_config,
        web_surfer_only=True,
    )

    # The benchmark's GPT-based evaluator needs its own model client.
    eval_client = ChatCompletionClient.load_component(client_config)
    benchmark = WebVoyagerBenchmark(
        data_dir="WebVoyager",
        eval_method="gpt_eval",
        model_client=eval_client,
    )
    benchmark.download_dataset()
    benchmark.load_dataset()

    sample_task = benchmark.tasks["Allrecipes--0"]
    print(sample_task)

    out_dir = "test_output_magentic_one"
    os.makedirs(out_dir, exist_ok=True)
    answer = system.get_answer(
        task_id="Allrecipes--0",
        task=sample_task,
        output_dir=out_dir,
    )
    print(answer)
    score = benchmark.evaluator(sample_task, answer)
    print(score)


if __name__ == "__main__":
    test_magentic_one_system()
experiments/eval/systems/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
from .magentic_ui_sim_user_system import MagenticUISimUserSystem
from .magentic_ui_system import MagenticUIAutonomousSystem
from .magentic_one_system import MagenticOneSystem

# Public API of the systems package.
# Bug fix: __all__ previously listed the misspelled "MagententicOneSystem",
# which made `from systems import *` raise AttributeError.
__all__ = [
    "MagenticUISimUserSystem",
    "MagenticUIAutonomousSystem",
    "MagenticOneSystem",
]
experiments/eval/systems/magentic_one_system.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import os
4
+ import aiofiles
5
+ import logging
6
+ import datetime
7
+ from PIL import Image
8
+ from pydantic import BaseModel
9
+ from typing import List, Dict, Any, Tuple
10
+ from autogen_core.models import ChatCompletionClient
11
+ from autogen_core import Image as AGImage
12
+ from autogen_agentchat.base import TaskResult, ChatAgent
13
+ from autogen_agentchat.messages import (
14
+ MultiModalMessage,
15
+ TextMessage,
16
+ )
17
+
18
+ from autogen_ext.agents.file_surfer import FileSurfer
19
+ from autogen_ext.agents.web_surfer import MultimodalWebSurfer
20
+ from autogen_ext.agents.magentic_one import MagenticOneCoderAgent
21
+ from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor
22
+ from autogen_agentchat.agents import CodeExecutorAgent
23
+ from autogen_agentchat.teams import MagenticOneGroupChat
24
+ from magentic_ui.eval.basesystem import BaseSystem
25
+ from magentic_ui.eval.models import BaseTask, BaseCandidate, WebVoyagerCandidate
26
+ from magentic_ui.types import CheckpointEvent
27
+
28
+ logger = logging.getLogger(__name__)
29
+ logging.getLogger("autogen").setLevel(logging.WARNING)
30
+ logging.getLogger("autogen.agentchat").setLevel(logging.WARNING)
31
+ logging.getLogger("autogen_agentchat.events").setLevel(logging.WARNING)
32
+
33
+
34
class LogEventSystem(BaseModel):
    """One logged agent message captured during an evaluation run.

    Attributes:
        source: Name of the agent that produced the message.
        content: The message text.
        timestamp: ISO-8601 timestamp recorded when the event was logged.
        metadata: Extra string key/value pairs attached to the message
            (defaults to an empty mapping; pydantic copies the default
            per instance, so it is not shared).
    """

    source: str
    content: str
    timestamp: str
    metadata: Dict[str, str] = {}
49
+
50
+
51
class MagenticOneSystem(BaseSystem):
    """Evaluation wrapper that runs a MagenticOne agent team on a benchmark task.

    Args:
        model_client_config (Dict[str, Any]): Model client config, loaded via
            ``ChatCompletionClient.load_component`` and shared by all agents.
        web_surfer_only (bool): If True, only the web surfer agent is used.
        name (str): Name of the system instance.
        dataset_name (str): Name of the evaluation dataset (e.g., "Gaia").
    """

    def __init__(
        self,
        model_client_config: Dict[str, Any],
        web_surfer_only: bool = False,
        name: str = "MagenticOneSystem",
        dataset_name: str = "Gaia",
    ):
        super().__init__(name)
        # Answers are persisted in WebVoyager format (answer + screenshots).
        self.candidate_class = WebVoyagerCandidate
        self.model_client_config = model_client_config
        self.dataset_name = dataset_name
        self.web_surfer_only = web_surfer_only

    def get_answer(
        self, task_id: str, task: BaseTask, output_dir: str
    ) -> BaseCandidate:
        """
        Runs the agent team to solve a given task and saves the answer and logs to disk.

        Args:
            task_id (str): Unique identifier for the task.
            task (BaseTask): The task object containing the question and metadata.
            output_dir (str): Directory to save logs, screenshots, and answer files.

        Returns:
            BaseCandidate: An object containing the final answer and any
            screenshots taken during execution.
        """

        async def _runner() -> Tuple[str, List[str]]:
            """Execute the agent team; return (final answer, screenshot paths)."""
            messages_so_far: List[LogEventSystem] = []

            task_question: str = task.question
            # Adapted from MagenticOne. Minor change is to allow an explanation
            # of the final answer before the final answer.
            FINAL_ANSWER_PROMPT = """
            output a FINAL ANSWER to the task.
            The task is: {task}`

            To output the final answer, use the following template: [any explanation for final answer] FINAL ANSWER: [YOUR FINAL ANSWER]
            Don't put your answer in brackets or quotes.
            Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
            ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
            If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
            If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
            If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
            You must answer the question and provide a smart guess if you are unsure. Provide a guess even if you have no idea about the answer.
            """

            model_client = ChatCompletionClient.load_component(self.model_client_config)

            # Instantiate agents explicitly.
            ws = MultimodalWebSurfer(
                "WebSurfer",
                model_client=model_client,
                to_save_screenshots=True,
                debug_dir=output_dir,
            )

            agents: List[ChatAgent] = []
            if self.web_surfer_only:
                agents = [ws]
            else:
                coder = MagenticOneCoderAgent("Coder", model_client=model_client)
                executor = CodeExecutorAgent(
                    "ComputerTerminal", code_executor=LocalCommandLineCodeExecutor()
                )
                fs = FileSurfer("FileSurfer", model_client=model_client)

                agents = [fs, ws, coder, executor]
            m1_agent = MagenticOneGroupChat(
                agents,
                model_client=model_client,
                final_answer_prompt=FINAL_ANSWER_PROMPT,
            )

            # Step 3: Prepare the task message.
            answer: str = ""
            # If the task carries an image attachment, send a multimodal message.
            if (
                hasattr(task, "file_name")
                and task.file_name
                and task.file_name.endswith((".png", ".jpg", ".jpeg"))
            ):
                task_message = MultiModalMessage(
                    content=[
                        task_question,
                        AGImage.from_pil(Image.open(task.file_name)),
                    ],
                    source="user",
                )
            else:
                task_message = TextMessage(content=task_question, source="user")

            # Step 4: Run the team on the task, logging every streamed message
            # and persisting the log after each one so progress survives crashes.
            async for message in m1_agent.run_stream(task=task_message):
                message_str: str = ""
                try:
                    if isinstance(message, TaskResult) or isinstance(
                        message, CheckpointEvent
                    ):
                        continue
                    message_str = message.to_text()
                    log_event = LogEventSystem(
                        source=message.source,
                        content=message_str,
                        timestamp=datetime.datetime.now().isoformat(),
                        metadata=message.metadata,
                    )
                    messages_so_far.append(log_event)
                except Exception as e:
                    # Best effort: a message that cannot be serialized is only
                    # skipped from the log, not fatal to the run.
                    logger.info(
                        f"[likely nothing] When creating model_dump of message encountered exception {e}"
                    )

                logger.info(f"Run in progress: {task_id}, message: {message_str}")
                async with aiofiles.open(
                    f"{output_dir}/{task_id}_messages.json", "w"
                ) as f:
                    messages_json = [msg.model_dump() for msg in messages_so_far]
                    await f.write(json.dumps(messages_json, indent=2))
                    await f.flush()  # Flush to disk immediately

            # The final answer is formatted as:
            # "Final Answer: FINAL ANSWER: Actual final answer"
            # Use the last message from MagenticOneOrchestrator (it might not be
            # the very last message of the stream).
            last_message_with_orchestrator = None
            for message in messages_so_far:
                if message.source == "MagenticOneOrchestrator":
                    last_message_with_orchestrator = message
            if last_message_with_orchestrator:
                answer = last_message_with_orchestrator.content
                # BUGFIX: take the text *after* the "FINAL ANSWER:" marker.
                # The previous code took split(...)[0], i.e. the explanation
                # preceding the marker, never the answer itself. [-1] also
                # degrades gracefully to the whole string if the marker is
                # absent (matching the sibling Magentic-UI system's intent).
                answer = answer.split("FINAL ANSWER:")[-1].strip()
            elif messages_so_far:
                answer = messages_so_far[-1].content
            # else: no messages were produced; answer stays "".

            assert isinstance(
                answer, str
            ), f"Expected answer to be a string, got {type(answer)}"

            # Save the token usage of the model client to a usage JSON file.
            def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
                return {
                    "prompt_tokens": model_client.total_usage().prompt_tokens,
                    "completion_tokens": model_client.total_usage().completion_tokens,
                }

            usage_json = {
                "client": get_usage(model_client),
            }
            async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
                await f.write(json.dumps(usage_json, indent=2))

            # Step 5: Collect screenshots saved by the web surfer.
            # NOTE(review): assumes filenames look like "screenshot_<timestamp>..."
            # so that split("_")[1] yields a sortable timestamp — confirm against
            # MultimodalWebSurfer's naming scheme.
            screenshots_paths = []
            for file in os.listdir(output_dir):
                if file.startswith("screenshot_"):
                    timestamp = file.split("_")[1]
                    screenshots_paths.append(
                        [timestamp, os.path.join(output_dir, file)]
                    )

            # Restrict to the last 15 screenshots by timestamp.
            screenshots_paths = sorted(screenshots_paths, key=lambda x: x[0])[-15:]
            screenshots_paths = [x[1] for x in screenshots_paths]
            return answer, screenshots_paths

        # Step 6: Run the async workflow and persist the candidate answer.
        answer, screenshots_paths = asyncio.run(_runner())
        answer = WebVoyagerCandidate(answer=answer, screenshots=screenshots_paths)
        self.save_answer_to_disk(task_id, answer, output_dir)
        return answer
experiments/eval/systems/magentic_ui_sim_user_system.py ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import os
4
+ import aiofiles
5
+ import logging
6
+ import datetime
7
+ from pathlib import Path
8
+ from PIL import Image
9
+ from pydantic import BaseModel
10
+ from typing import List, Optional, Union, Dict, Any, Literal, Tuple
11
+ from autogen_core import ComponentModel
12
+ from autogen_core.models import ChatCompletionClient
13
+ from autogen_core import Image as AGImage
14
+ from autogen_agentchat.base import TaskResult, ChatAgent
15
+ from autogen_agentchat.messages import (
16
+ MultiModalMessage,
17
+ TextMessage,
18
+ )
19
+ from autogen_agentchat.conditions import TimeoutTermination
20
+ from magentic_ui import OrchestratorConfig
21
+ from magentic_ui.eval.basesystem import BaseSystem
22
+ from magentic_ui.eval.models import BaseTask, BaseCandidate, WebVoyagerCandidate
23
+ from magentic_ui.types import CheckpointEvent
24
+ from magentic_ui.agents import WebSurfer, CoderAgent, FileSurfer
25
+ from magentic_ui.teams import GroupChat
26
+ from magentic_ui.agents.users import MetadataUserProxy, DummyUserProxy
27
+ from magentic_ui.tools.playwright.browser import VncDockerPlaywrightBrowser
28
+ from magentic_ui.tools.playwright.browser.utils import get_available_port
29
+ from magentic_ui.approval_guard import (
30
+ ApprovalGuard,
31
+ ApprovalGuardContext,
32
+ ApprovalConfig,
33
+ )
34
+
35
+ logger = logging.getLogger(__name__)
36
+ logging.getLogger("autogen").setLevel(logging.WARNING)
37
+ logging.getLogger("autogen.agentchat").setLevel(logging.WARNING)
38
+ logging.getLogger("autogen_agentchat.events").setLevel(logging.WARNING)
39
+
40
+
41
class LogEventSystem(BaseModel):
    """One logged agent message captured during an evaluation run.

    Attributes:
        source: Name of the agent that produced the message.
        content: The message text.
        timestamp: ISO-8601 timestamp recorded when the event was logged.
        metadata: Extra string key/value pairs attached to the message
            (defaults to an empty mapping; pydantic copies the default
            per instance, so it is not shared).
    """

    source: str
    content: str
    timestamp: str
    metadata: Dict[str, str] = {}
56
+
57
+
58
# Description shown to the orchestrator for the (simulated) human user agent.
# It constrains what the team may ask of the user: clarification, hints, and
# answer verification only — never direct web/code/file actions.
USER_PROXY_DESCRIPTION = """
The human user who gave the original task.
The human user cannot browse the web or write code or access files. So do not ask them to perform any actions on the web.
In case where the task requires further clarifying information, the user can be asked to clarify the task.
In case where you are stuck and unable to make progress on completing the task, you can ask the user for help.
Make sure to do your best to complete the task with other agents before asking the user for help.
The human can help if you're stuck by providing hints on how to solve the task.
The human can also help verify your answer and provide you guidance.
"""
67
+
68
+
69
class MagenticUISimUserSystem(BaseSystem):
    """
    MagenticUISimUserSystem orchestrates a simulated user and a team of agents to solve tasks using Magentic-UI.

    This class manages the instantiation of agents (WebSurfer, CoderAgent, FileSurfer, and optionally a user
    proxy), configures the orchestration logic, launches a browser for web tasks, and coordinates the team to
    solve a given task. It logs all agent messages, saves answers and resource usage, and supports different
    evaluation datasets and user simulation types.

    Args:
        name (str): Name of the system instance.
        simulated_user_type (Literal): Type of simulated user ("co-planning", "co-execution", etc.).
        how_helpful_user_proxy (Literal): Determines how helpful the user proxy is ("strict", "soft", "no_hints").
        web_surfer_only (bool): If True, only the web surfer agent is used.
        endpoint_config_orch (Optional[Dict]): Orchestrator model client config.
        endpoint_config_websurfer (Optional[Dict]): WebSurfer agent model client config.
        endpoint_config_coder (Optional[Dict]): Coder agent model client config.
        endpoint_config_file_surfer (Optional[Dict]): FileSurfer agent model client config.
        endpoint_config_user_proxy (Optional[Dict]): User proxy agent model client config.
        dataset_name (str): Name of the evaluation dataset (e.g., "Gaia").
        include_metadata_in_task_message (bool): Whether to include rewritten metadata in the task message.
    """

    # NOTE: these class-level dicts are shared default configs; they are only
    # read (never mutated) by this class.
    default_client_config = {
        "provider": "OpenAIChatCompletionClient",
        "config": {
            "model": "gpt-4o-2024-08-06",
        },
        "max_retries": 10,
    }

    o4_client_config = {
        "provider": "OpenAIChatCompletionClient",
        "config": {
            "model": "o4-mini",
        },
        "max_retries": 10,
    }

    def __init__(
        self,
        name: str = "MagenticUISimUserSystem",
        simulated_user_type: Literal[
            "co-planning",
            "co-execution",
            "co-planning-and-execution",
            "none",
            "dummy",
        ] = "none",
        how_helpful_user_proxy: Literal["strict", "soft", "no_hints"] = "soft",
        web_surfer_only: bool = False,
        endpoint_config_orch: Optional[Dict[str, Any]] = default_client_config,
        endpoint_config_websurfer: Optional[Dict[str, Any]] = default_client_config,
        endpoint_config_coder: Optional[Dict[str, Any]] = default_client_config,
        endpoint_config_file_surfer: Optional[Dict[str, Any]] = default_client_config,
        endpoint_config_user_proxy: Optional[Dict[str, Any]] = default_client_config,
        dataset_name: str = "Gaia",
        include_metadata_in_task_message: bool = False,
    ):
        super().__init__(name)
        # Answers are persisted in WebVoyager format (answer + screenshots).
        self.candidate_class = WebVoyagerCandidate
        self.endpoint_config_orch = endpoint_config_orch
        self.endpoint_config_websurfer = endpoint_config_websurfer
        self.endpoint_config_coder = endpoint_config_coder
        self.endpoint_config_file_surfer = endpoint_config_file_surfer
        self.simulated_user_type = simulated_user_type
        self.endpoint_config_user_proxy = endpoint_config_user_proxy
        self.web_surfer_only = web_surfer_only
        self.dataset_name = dataset_name
        self.how_helpful_user_proxy = how_helpful_user_proxy
        self.include_metadata_in_task_message = include_metadata_in_task_message

    def get_answer(
        self, task_id: str, task: BaseTask, output_dir: str
    ) -> BaseCandidate:
        """
        Runs the agent team to solve a given task and saves the answer and logs to disk.

        Args:
            task_id (str): Unique identifier for the task.
            task (BaseTask): The task object containing the question and metadata.
            output_dir (str): Directory to save logs, screenshots, and answer files.

        Returns:
            BaseCandidate: An object containing the final answer and any
            screenshots taken during execution.
        """

        async def _runner() -> Tuple[str, List[str]]:
            """Execute the agent team; return (final answer, screenshot paths)."""
            task_question: str = task.question

            # STEP 1: Pick the FINAL ANSWER prompt for the dataset / user mode.
            if self.dataset_name == "WebVoyager":
                # For WebVoyager, there are no restrictions on the final answer
                # format (unlike Gaia or AssistantBench) for evaluation.
                FINAL_ANSWER_PROMPT = f"""
                output a FINAL ANSWER to the task

                The real task is: {task_question}

                Try your best to answer the question and provide a final answer that completely answers
                To output the final answer, use the following template FINAL ANSWER: [YOUR FINAL ANSWER]
                Don't put your answer in brackets or quotes.
                """
            else:
                if (
                    self.simulated_user_type != "none"
                    or self.dataset_name == "AssistantBench"
                ):
                    # This allows the model to say "Unable to determine" if it
                    # cannot answer the question.
                    FINAL_ANSWER_PROMPT = f"""
                    output a FINAL ANSWER to the task.

                    The real task is: {task_question}


                    To output the final answer, use the following template: [any explanation for final answer] FINAL ANSWER: [YOUR FINAL ANSWER]
                    Don't put your answer in brackets or quotes.
                    Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
                    ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
                    If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
                    If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
                    If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
                    If you are unable to determine the final answer, output '[any explanation for final answer] FINAL ANSWER: Unable to determine'
                    Try your best to answer the question and provide a smart guess if you are unsure.
                    """
                else:
                    # Adapted from MagenticOne. Minor change is to allow an
                    # explanation of the final answer before the final answer.
                    FINAL_ANSWER_PROMPT = f"""
                    output a FINAL ANSWER to the task.

                    The real task is: {task_question}


                    To output the final answer, use the following template: [any explanation for final answer] FINAL ANSWER: [YOUR FINAL ANSWER]
                    Don't put your answer in brackets or quotes.
                    Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
                    ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
                    If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
                    If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
                    If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
                    You must answer the question and provide a smart guess if you are unsure. Provide a guess even if you have no idea about the answer.
                    """

            # Step 2: Create the Magentic-UI team.
            # Hard cap each task at 15 minutes of wall-clock time.
            termination_condition = TimeoutTermination(
                timeout_seconds=60 * 15
            )  # 15 minutes
            model_context_token_limit = 110000

            # Orchestrator behavior depends on the simulated-user mode:
            # co-planning modes enable cooperative planning; modes without an
            # interactive user run execution autonomously.
            orchestrator_config = OrchestratorConfig(
                cooperative_planning=False
                if self.simulated_user_type in ["co-execution", "none"]
                else True,
                autonomous_execution=True
                if self.simulated_user_type in ["co-planning", "none", "dummy"]
                else False,
                allow_follow_up_input=False,
                final_answer_prompt=FINAL_ANSWER_PROMPT,
                model_context_token_limit=model_context_token_limit,
                no_overwrite_of_task=True,
            )

            def get_model_client(
                endpoint_config: Optional[Union[ComponentModel, Dict[str, Any]]],
            ) -> ChatCompletionClient:
                """Load a ChatCompletionClient, falling back to the default config."""
                if endpoint_config is None:
                    return ChatCompletionClient.load_component(
                        self.default_client_config
                    )
                return ChatCompletionClient.load_component(endpoint_config)

            model_client_orch = get_model_client(self.endpoint_config_orch)
            model_client_coder = get_model_client(self.endpoint_config_coder)
            model_client_websurfer = get_model_client(self.endpoint_config_websurfer)
            model_client_file_surfer = get_model_client(
                self.endpoint_config_file_surfer
            )
            model_client_user_proxy = get_model_client(self.endpoint_config_user_proxy)

            # Launch the browser on two free ports (playwright + noVNC).
            # NOTE(review): closing the probe sockets before binding the browser
            # leaves a small race window where another process could grab the
            # port — confirm get_available_port's contract.
            playwright_port, socket = get_available_port()
            novnc_port, socket_vnc = get_available_port()
            socket.close()
            socket_vnc.close()
            browser = VncDockerPlaywrightBrowser(
                bind_dir=Path(output_dir),
                playwright_port=playwright_port,
                novnc_port=novnc_port,
                inside_docker=False,
            )

            # Create action guard with default policy "never" (fully autonomous;
            # no approval prompts during evaluation).
            action_guard = ApprovalGuard(
                input_func=None,
                default_approval=False,
                model_client=model_client_orch,
                config=ApprovalConfig(
                    approval_policy="never",
                ),
            )

            # CREATE AGENTS
            coder_agent = CoderAgent(
                name="coder_agent",
                model_client=model_client_coder,
                work_dir=os.path.abspath(output_dir),
                model_context_token_limit=model_context_token_limit,
            )

            file_surfer = FileSurfer(
                name="file_surfer",
                model_client=model_client_file_surfer,
                work_dir=os.path.abspath(output_dir),
                bind_dir=os.path.abspath(output_dir),
                model_context_token_limit=model_context_token_limit,
            )

            # The web surfer must be constructed inside the approval-guard
            # context so its actions are governed by the guard.
            with ApprovalGuardContext.populate_context(action_guard):
                web_surfer = WebSurfer(
                    name="web_surfer",
                    model_client=model_client_websurfer,
                    browser=browser,
                    animate_actions=False,
                    max_actions_per_step=10,
                    start_page="about:blank" if task.url_path == "" else task.url_path,
                    downloads_folder=os.path.abspath(output_dir),
                    debug_dir=os.path.abspath(output_dir),
                    model_context_token_limit=model_context_token_limit,
                    to_save_screenshots=True,
                )

            # USER PROXY (if a simulated user is requested). Task metadata may
            # carry ground-truth "Steps" hints used by the MetadataUserProxy.
            task_metadata = getattr(task, "metadata", "")
            if task_metadata and "Steps" in task_metadata:
                task_metadata = task_metadata["Steps"]  # type: ignore

            if self.simulated_user_type == "none":
                user_proxy = None
            elif self.simulated_user_type == "dummy":
                user_proxy = DummyUserProxy(
                    name="user_proxy",
                )
            else:
                user_proxy = MetadataUserProxy(
                    name="user_proxy",
                    description=USER_PROXY_DESCRIPTION,
                    task=task.question,
                    helpful_task_hints=task_metadata,
                    task_answer=getattr(task, "ground_truth", ""),
                    model_client=model_client_user_proxy,
                    simulated_user_type=self.simulated_user_type,  # type: ignore
                    how_helpful=self.how_helpful_user_proxy,  # type: ignore
                )

            agent_list: List[ChatAgent] = [web_surfer, coder_agent, file_surfer]
            if self.web_surfer_only:
                agent_list = [web_surfer]
            if user_proxy:
                agent_list.append(user_proxy)

            team = GroupChat(
                participants=agent_list,
                orchestrator_config=orchestrator_config,
                model_client=model_client_orch,
                termination_condition=termination_condition,
            )
            await team.lazy_init()

            # Step 3: Prepare the task message.
            answer: str = ""
            messages_so_far: List[LogEventSystem] = []

            # Optionally rewrite the metadata hints (stripping anything that
            # directly reveals the answer) and append them to the task message.
            rewritten_metadata = None
            if self.include_metadata_in_task_message and task_metadata:
                from autogen_core import CancellationToken
                from autogen_core.models import UserMessage

                prompt = f"""Rewrite the following helpful hints to help solve the task, but remove any information that directly reveals the answer. \nKeep the hints as close to the original as possible but remove any information that directly reveals the answer.\nHelpful hints: {task_metadata}\n\nAnswer: {getattr(task, "ground_truth", "")}\n\nDo not include anything else in your response except the rewritten hints.\nRewritten helpful hints:"""
                result = await model_client_orch.create(
                    messages=[UserMessage(content=prompt, source="user")],
                    cancellation_token=CancellationToken(),
                )
                assert isinstance(result.content, str)
                rewritten_metadata = (
                    "\n\nWe have access to helpful hints that helps in solving the task: "
                    + result.content.strip()
                )

            # If the task carries an image attachment, send a multimodal message.
            if (
                hasattr(task, "file_name")
                and task.file_name
                and task.file_name.endswith((".png", ".jpg", ".jpeg"))
            ):
                content_list: list[Union[str, AGImage]] = [task_question]
                if rewritten_metadata:
                    if isinstance(content_list[0], str):
                        content_list[0] = content_list[0] + rewritten_metadata
                content_list.append(AGImage.from_pil(Image.open(task.file_name)))
                task_message = MultiModalMessage(
                    content=content_list,
                    source="user",
                )
            else:
                if rewritten_metadata:
                    task_message = TextMessage(
                        content=task_question + rewritten_metadata, source="user"
                    )
                else:
                    task_message = TextMessage(content=task_question, source="user")

            # Step 4: Run the team on the task, logging every streamed message
            # and persisting the log after each one so progress survives crashes.
            async for message in team.run_stream(task=task_message):
                message_str: str = ""
                try:
                    if isinstance(message, TaskResult) or isinstance(
                        message, CheckpointEvent
                    ):
                        continue
                    message_str = message.to_text()
                    log_event = LogEventSystem(
                        source=message.source,
                        content=message_str,
                        timestamp=datetime.datetime.now().isoformat(),
                        metadata=message.metadata,
                    )
                    messages_so_far.append(log_event)
                except Exception as e:
                    # Best effort: a message that cannot be serialized is only
                    # skipped from the log, not fatal to the run.
                    logger.info(
                        f"[likely nothing] When creating model_dump of message encountered exception {e}"
                    )

                logger.info(f"Run in progress: {task_id}, message: {message_str}")
                async with aiofiles.open(
                    f"{output_dir}/{task_id}_messages.json", "w"
                ) as f:
                    messages_json = [msg.model_dump() for msg in messages_so_far]
                    await f.write(json.dumps(messages_json, indent=2))
                    await f.flush()  # Flush to disk immediately

                # The final answer is formatted as:
                # "Final Answer: FINAL ANSWER: Actual final answer"
                if message_str.startswith("Final Answer:"):
                    answer = message_str[len("Final Answer:") :].strip()
                    # BUGFIX: use [-1] instead of [1]. [1] raised IndexError
                    # whenever the message lacked the inner "FINAL ANSWER:"
                    # marker; [-1] falls back to the whole remaining string.
                    answer = answer.split("FINAL ANSWER:")[-1].strip()

            assert isinstance(answer, str), (
                f"Expected answer to be a string, got {type(answer)}"
            )

            # Save the token usage of each client to a usage JSON file.
            def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
                return {
                    "prompt_tokens": model_client.total_usage().prompt_tokens,
                    "completion_tokens": model_client.total_usage().completion_tokens,
                }

            usage_json = {
                "orchestrator": get_usage(model_client_orch),
                "websurfer": get_usage(model_client_websurfer),
                "coder": get_usage(model_client_coder),
                "file_surfer": get_usage(model_client_file_surfer),
                "user_proxy": get_usage(model_client_user_proxy),
            }
            # Total excludes the user proxy, since a real user costs no tokens.
            usage_json["total_without_user_proxy"] = {
                "prompt_tokens": sum(
                    usage_json[key]["prompt_tokens"]
                    for key in usage_json
                    if key != "user_proxy"
                ),
                "completion_tokens": sum(
                    usage_json[key]["completion_tokens"]
                    for key in usage_json
                    if key != "user_proxy"
                ),
            }
            async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
                await f.write(json.dumps(usage_json, indent=2))

            await team.close()

            # Step 5: Collect screenshots saved by the web surfer.
            # Filenames look like "screenshot_raw_1746259609.png"; the third
            # underscore-separated token is... actually split("_")[1] yields
            # "raw" here — NOTE(review): sorting key is the literal "raw" for
            # every file, so ordering falls back to os.listdir order; confirm
            # intended filename parsing.
            screenshots_paths = []
            for file in os.listdir(output_dir):
                if file.startswith("screenshot_raw_"):
                    timestamp = file.split("_")[1]
                    screenshots_paths.append(
                        [timestamp, os.path.join(output_dir, file)]
                    )

            # Restrict to the last 15 screenshots by timestamp.
            screenshots_paths = sorted(screenshots_paths, key=lambda x: x[0])[-15:]
            screenshots_paths = [x[1] for x in screenshots_paths]
            return answer, screenshots_paths

        # Step 6: Run the async workflow and persist the candidate answer.
        answer, screenshots_paths = asyncio.run(_runner())
        answer = WebVoyagerCandidate(answer=answer, screenshots=screenshots_paths)
        self.save_answer_to_disk(task_id, answer, output_dir)
        return answer
experiments/eval/systems/magentic_ui_system.py ADDED
@@ -0,0 +1,328 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import os
4
+ import aiofiles
5
+ import logging
6
+ import datetime
7
+ from pathlib import Path
8
+ from PIL import Image
9
+ from pydantic import BaseModel
10
+ from typing import List, Dict, Any, Tuple
11
+ from autogen_core.models import ChatCompletionClient
12
+ from autogen_core import Image as AGImage
13
+ from autogen_agentchat.base import TaskResult, ChatAgent
14
+ from autogen_agentchat.messages import (
15
+ MultiModalMessage,
16
+ TextMessage,
17
+ )
18
+ from autogen_agentchat.conditions import TimeoutTermination
19
+ from magentic_ui import OrchestratorConfig
20
+ from magentic_ui.eval.basesystem import BaseSystem
21
+ from magentic_ui.eval.models import BaseTask, BaseCandidate, WebVoyagerCandidate
22
+ from magentic_ui.types import CheckpointEvent
23
+ from magentic_ui.agents import WebSurfer, CoderAgent, FileSurfer
24
+ from magentic_ui.teams import GroupChat
25
+ from magentic_ui.tools.playwright.browser import VncDockerPlaywrightBrowser
26
+ from magentic_ui.tools.playwright.browser import LocalPlaywrightBrowser
27
+ from magentic_ui.tools.playwright.browser.utils import get_available_port
28
+
29
+
30
+ logger = logging.getLogger(__name__)
31
+ logging.getLogger("autogen").setLevel(logging.WARNING)
32
+ logging.getLogger("autogen.agentchat").setLevel(logging.WARNING)
33
+ logging.getLogger("autogen_agentchat.events").setLevel(logging.WARNING)
34
+
35
+
36
class LogEventSystem(BaseModel):
    """
    Data model for logging events emitted while an evaluation task runs.

    Attributes:
        source (str): The source of the event (e.g., agent name or "browser").
        content (str): The content/message of the event.
        timestamp (str): ISO-formatted timestamp of the event.
        metadata (Dict[str, str]): Additional metadata for the event.
    """

    source: str
    content: str
    timestamp: str
    # Pydantic deep-copies field defaults per instance, so the mutable {} is safe here.
    metadata: Dict[str, str] = {}
51
+
52
+
53
class MagenticUIAutonomousSystem(BaseSystem):
    """
    Fully autonomous Magentic-UI team (orchestrator + web surfer, plus coder and
    file surfer unless ``web_surfer_only``) that solves an evaluation task
    end-to-end without human input.

    Args:
        endpoint_config_orch (Dict[str, Any]): Orchestrator model client config.
        endpoint_config_websurfer (Dict[str, Any]): WebSurfer agent model client config.
        endpoint_config_coder (Dict[str, Any]): Coder agent model client config.
        endpoint_config_file_surfer (Dict[str, Any]): FileSurfer agent model client config.
        name (str): Name of the system instance.
        dataset_name (str): Name of the evaluation dataset (e.g., "Gaia").
        web_surfer_only (bool): If True, only the web surfer agent is used.
        use_local_browser (bool): If True, use a local headless Playwright browser
            instead of the VNC Docker browser.
    """

    def __init__(
        self,
        endpoint_config_orch: Dict[str, Any],
        endpoint_config_websurfer: Dict[str, Any],
        endpoint_config_coder: Dict[str, Any],
        endpoint_config_file_surfer: Dict[str, Any],
        name: str = "MagenticUIAutonomousSystem",
        dataset_name: str = "Gaia",
        web_surfer_only: bool = False,
        use_local_browser: bool = False,
    ):
        super().__init__(name)
        self.candidate_class = WebVoyagerCandidate
        self.endpoint_config_orch = endpoint_config_orch
        self.endpoint_config_websurfer = endpoint_config_websurfer
        self.endpoint_config_coder = endpoint_config_coder
        self.endpoint_config_file_surfer = endpoint_config_file_surfer
        self.web_surfer_only = web_surfer_only
        self.dataset_name = dataset_name
        self.use_local_browser = use_local_browser

    def get_answer(
        self, task_id: str, task: BaseTask, output_dir: str
    ) -> BaseCandidate:
        """
        Runs the agent team to solve a given task and saves the answer and logs to disk.

        Args:
            task_id (str): Unique identifier for the task.
            task (BaseTask): The task object containing the question and metadata.
            output_dir (str): Directory to save logs, screenshots, and answer files.

        Returns:
            BaseCandidate: An object containing the final answer and any screenshots
                taken during execution.
        """

        async def _runner() -> Tuple[str, List[str]]:
            """
            Asynchronous runner that executes the agent team and collects the
            answer and screenshots.

            Returns:
                Tuple[str, List[str]]: The final answer string and a list of
                    screenshot file paths (at most the last 15 by timestamp).
            """
            messages_so_far: List[LogEventSystem] = []

            task_question: str = task.question
            # Adapted from MagenticOne. Minor change is to allow an explanation of
            # the final answer before the final answer.
            FINAL_ANSWER_PROMPT = f"""
                output a FINAL ANSWER to the task.

                The real task is: {task_question}


                To output the final answer, use the following template: [any explanation for final answer] FINAL ANSWER: [YOUR FINAL ANSWER]
                Don't put your answer in brackets or quotes.
                Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
                ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
                If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
                If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
                If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
                You must answer the question and provide a smart guess if you are unsure. Provide a guess even if you have no idea about the answer.
            """
            # Step 2: Create the Magentic-UI team
            # TERMINATION CONDITION
            termination_condition = TimeoutTermination(
                timeout_seconds=60 * 15
            )  # 15 minutes
            model_context_token_limit = 110000
            # ORCHESTRATOR CONFIGURATION: autonomous, no human planning or follow-ups
            orchestrator_config = OrchestratorConfig(
                cooperative_planning=False,
                autonomous_execution=True,
                allow_follow_up_input=False,
                final_answer_prompt=FINAL_ANSWER_PROMPT,
                model_context_token_limit=model_context_token_limit,
                no_overwrite_of_task=True,
            )

            # One model client per agent role so token usage can be reported per role.
            model_client_orch = ChatCompletionClient.load_component(
                self.endpoint_config_orch
            )
            model_client_coder = ChatCompletionClient.load_component(
                self.endpoint_config_coder
            )
            model_client_websurfer = ChatCompletionClient.load_component(
                self.endpoint_config_websurfer
            )
            model_client_file_surfer = ChatCompletionClient.load_component(
                self.endpoint_config_file_surfer
            )

            # launch the browser
            if self.use_local_browser:
                browser = LocalPlaywrightBrowser(headless=True)
            else:
                # Reserve two free ports, then release the probe sockets so the
                # Docker browser can bind them.
                playwright_port, socket = get_available_port()
                novnc_port, socket_vnc = get_available_port()
                socket.close()
                socket_vnc.close()
                browser = VncDockerPlaywrightBrowser(
                    bind_dir=Path(output_dir),
                    playwright_port=playwright_port,
                    novnc_port=novnc_port,
                    inside_docker=False,
                )
                browser_location_log = LogEventSystem(
                    source="browser",
                    content=f"Browser at novnc port {novnc_port} and playwright port {playwright_port} launched",
                    timestamp=datetime.datetime.now().isoformat(),
                )
                messages_so_far.append(browser_location_log)

            # Create web surfer
            web_surfer = WebSurfer(
                name="web_surfer",
                model_client=model_client_websurfer,
                browser=browser,
                animate_actions=False,
                max_actions_per_step=10,
                start_page="about:blank" if task.url_path == "" else task.url_path,
                downloads_folder=os.path.abspath(output_dir),
                debug_dir=os.path.abspath(output_dir),
                model_context_token_limit=model_context_token_limit,
                to_save_screenshots=True,
            )

            agent_list: List[ChatAgent] = [web_surfer]
            if not self.web_surfer_only:
                coder_agent = CoderAgent(
                    name="coder_agent",
                    model_client=model_client_coder,
                    work_dir=os.path.abspath(output_dir),
                    model_context_token_limit=model_context_token_limit,
                )

                file_surfer = FileSurfer(
                    name="file_surfer",
                    model_client=model_client_file_surfer,
                    work_dir=os.path.abspath(output_dir),
                    bind_dir=os.path.abspath(output_dir),
                    model_context_token_limit=model_context_token_limit,
                )
                agent_list.append(coder_agent)
                agent_list.append(file_surfer)
            team = GroupChat(
                participants=agent_list,
                orchestrator_config=orchestrator_config,
                model_client=model_client_orch,
                termination_condition=termination_condition,
            )
            await team.lazy_init()
            # Step 3: Prepare the task message
            answer: str = ""
            # If the task ships an image attachment, send a multimodal message.
            if (
                hasattr(task, "file_name")
                and task.file_name
                and task.file_name.endswith((".png", ".jpg", ".jpeg"))
            ):
                task_message = MultiModalMessage(
                    content=[
                        task_question,
                        AGImage.from_pil(Image.open(task.file_name)),
                    ],
                    source="user",
                )
            else:
                task_message = TextMessage(content=task_question, source="user")
            # Step 4: Run the team on the task
            async for message in team.run_stream(task=task_message):
                # Store log events
                message_str: str = ""
                try:
                    if isinstance(message, TaskResult) or isinstance(
                        message, CheckpointEvent
                    ):
                        continue
                    message_str = message.to_text()
                    # Create log event with source, content and timestamp
                    log_event = LogEventSystem(
                        source=message.source,
                        content=message_str,
                        timestamp=datetime.datetime.now().isoformat(),
                        metadata=message.metadata,
                    )
                    messages_so_far.append(log_event)
                except Exception as e:
                    # Best-effort logging only; never let a malformed message
                    # abort the run.
                    logger.info(
                        f"[likely nothing] When creating model_dump of message encountered exception {e}"
                    )
                    pass

                # save to file
                logger.info(f"Run in progress: {task_id}, message: {message_str}")
                async with aiofiles.open(
                    f"{output_dir}/{task_id}_messages.json", "w"
                ) as f:
                    # Convert list of logevent objects to list of dicts
                    messages_json = [msg.model_dump() for msg in messages_so_far]
                    await f.write(json.dumps(messages_json, indent=2))
                    await f.flush()  # Flush to disk immediately
                # how the final answer is formatted:
                # "Final Answer: FINAL ANSWER: Actual final answer"
                if message_str.startswith("Final Answer:"):
                    answer = message_str[len("Final Answer:") :].strip()
                    # Strip the "FINAL ANSWER:" marker when the model followed
                    # the template. Previously an unconditional split(...)[1]
                    # raised IndexError when the marker was absent; now we keep
                    # the full text as a best-effort answer in that case.
                    if "FINAL ANSWER:" in answer:
                        answer = answer.split("FINAL ANSWER:", 1)[1].strip()

            assert isinstance(
                answer, str
            ), f"Expected answer to be a string, got {type(answer)}"

            # save the usage of each of the clients in a usage json file
            def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
                # Snapshot cumulative token usage for one model client.
                return {
                    "prompt_tokens": model_client.total_usage().prompt_tokens,
                    "completion_tokens": model_client.total_usage().completion_tokens,
                }

            usage_json = {
                "orchestrator": get_usage(model_client_orch),
                "websurfer": get_usage(model_client_websurfer),
                "coder": get_usage(model_client_coder),
                "file_surfer": get_usage(model_client_file_surfer),
            }
            # Key name and the "user_proxy" exclusion are kept for schema parity
            # with the simulated-user variant of this system; there is no
            # user_proxy client here, so the filter is a no-op.
            usage_json["total_without_user_proxy"] = {
                "prompt_tokens": sum(
                    usage_json[key]["prompt_tokens"]
                    for key in usage_json
                    if key != "user_proxy"
                ),
                "completion_tokens": sum(
                    usage_json[key]["completion_tokens"]
                    for key in usage_json
                    if key != "user_proxy"
                ),
            }
            async with aiofiles.open(f"{output_dir}/model_tokens_usage.json", "w") as f:
                await f.write(json.dumps(usage_json, indent=2))

            await team.close()
            # Step 5: Prepare the screenshots
            screenshots_paths = []
            # check the directory for screenshots which start with screenshot_raw_
            for file in os.listdir(output_dir):
                if file.startswith("screenshot_raw_"):
                    # File names look like "screenshot_raw_1746259609.png"; the
                    # timestamp is the last "_"-separated token with the
                    # extension stripped. (The previous split("_")[1] returned
                    # the literal "raw" for every file, so the sort below never
                    # actually ordered by timestamp.)
                    timestamp = os.path.splitext(file)[0].split("_")[-1]
                    screenshots_paths.append(
                        [timestamp, os.path.join(output_dir, file)]
                    )

            # restrict to last 15 screenshots by timestamp
            screenshots_paths = sorted(screenshots_paths, key=lambda x: x[0])[-15:]
            screenshots_paths = [x[1] for x in screenshots_paths]
            return answer, screenshots_paths

        # Step 6: Return the answer and screenshots
        answer, screenshots_paths = asyncio.run(_runner())
        answer = WebVoyagerCandidate(answer=answer, screenshots=screenshots_paths)
        self.save_answer_to_disk(task_id, answer, output_dir)
        return answer
fara_config.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Model-client configuration for running Magentic-UI against a locally hosted
# Fara-7B model served through an OpenAI-compatible endpoint on localhost:5000.
model_config_local_surfer: &client_surfer
  provider: OpenAIChatCompletionClient
  config:
    model: "microsoft/Fara-7B"
    base_url: http://localhost:5000/v1
    api_key: not-needed  # local server performs no auth, but the field is required
    model_info:
      vision: true
      function_calling: true
      json_output: false
      family: "unknown"
      structured_output: false
      multiple_system_messages: false

# Every agent role reuses the same local client via the YAML anchor above.
orchestrator_client: *client_surfer
coder_client: *client_surfer
web_surfer_client: *client_surfer
file_surfer_client: *client_surfer
action_guard_client: *client_surfer
model_client: *client_surfer
frontend/.env.default ADDED
@@ -0,0 +1 @@
 
 
1
+ GATSBY_API_URL=http://127.0.0.1:8081/api
frontend/.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ node_modules/
2
+ .cache/
3
+ public
4
+ src/gatsby-types.d.ts
5
+ .env.development
6
+ .env.production
frontend/README.md ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## 🚀 Running UI in Dev Mode
2
+
3
+ Run the UI in dev mode (make changes and see them reflected in the browser with hotreloading):
4
+
5
+ - Ensure yarn is installed.
6
+ - `yarn install`
7
+ - `yarn start`
8
+
9
+ This should start the server on port 8000.
10
+
11
+ ## Design Elements
12
+
13
+ - **Gatsby**: The app is created in Gatsby. A guide on bootstrapping a Gatsby app can be found here - https://www.gatsbyjs.com/docs/quick-start/.
14
+ This provides an overview of the project file structure include functionality of files like `gatsby-config.js`, `gatsby-node.js`, `gatsby-browser.js` and `gatsby-ssr.js`.
15
+ - **TailwindCSS**: The app uses TailwindCSS for styling. A guide on using TailwindCSS with Gatsby can be found here - https://tailwindcss.com/docs/guides/gatsby. This will explain the functionality in tailwind.config.js and postcss.config.js.
16
+
17
+ ## Modifying the UI, Adding Pages
18
+
19
+ The core of the app can be found in the `src` folder. To add pages, add a new folder in `src/pages` and add a `index.js` file. This will be the entry point for the page. For example to add a route in the app like `/about`, add a folder `about` in `src/pages` and add a `index.tsx` file. You can follow the content style in `src/pages/index.tsx` to add content to the page.
20
+
21
+ Core logic for each component should be written in the `src/components` folder and then imported in pages as needed.
22
+
23
+ ## Connecting to the Frontend
24
+
25
+ The frontend makes requests to the backend API and expects it at `http://localhost:8081/api`.
26
+
27
+ ## setting env variables for the UI
28
+
29
+ - please look at `.env.default`
30
+ - make a copy of this file and name it `.env.development`
31
+ - set the values for the variables in this file
32
+ - The main variable here is `GATSBY_API_URL` which should be set to `http://localhost:8081/api` for local development. This tells the UI where to make requests to the backend.
frontend/gatsby-browser.js ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import "antd/dist/reset.css";
2
+ import "./src/styles/global.css";
3
+
4
+ import AuthProvider from "./src/hooks/provider";
5
+
6
+ export const wrapRootElement = AuthProvider;
frontend/gatsby-config.ts ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { GatsbyConfig } from "gatsby";
2
+ import fs from "fs";
3
+
4
+ const envFile = `.env.${process.env.NODE_ENV}`;
5
+
6
+ fs.access(envFile, fs.constants.F_OK, (err) => {
7
+ if (err) {
8
+ console.warn(`File '${envFile}' is missing. Using default values.`);
9
+ }
10
+ });
11
+
12
+ require("dotenv").config({
13
+ path: envFile,
14
+ });
15
+
16
+ const config: GatsbyConfig = {
17
+ pathPrefix: process.env.PREFIX_PATH_VALUE || "",
18
+ siteMetadata: {
19
+ title: `Magentic-UI`,
20
+ description: `Human-centered web agent interface`,
21
+ siteUrl: `http://tbd.place`,
22
+ },
23
+ // More easily incorporate content into your pages through automatic TypeScript type generation and better GraphQL IntelliSense.
24
+ // If you use VSCode you can also use the GraphQL plugin
25
+ // Learn more at: https://gatsby.dev/graphql-typegen
26
+ graphqlTypegen: true,
27
+ plugins: [
28
+ "gatsby-plugin-postcss",
29
+ "gatsby-plugin-image",
30
+ "gatsby-plugin-sitemap",
31
+ {
32
+ resolve: "gatsby-plugin-manifest",
33
+ options: {
34
+ icon: "src/images/icon.png",
35
+ },
36
+ },
37
+ "gatsby-plugin-mdx",
38
+ "gatsby-plugin-sharp",
39
+ "gatsby-transformer-sharp",
40
+ {
41
+ resolve: "gatsby-source-filesystem",
42
+ options: {
43
+ name: "images",
44
+ path: "./src/images/",
45
+ },
46
+ __key: "images",
47
+ },
48
+ {
49
+ resolve: "gatsby-source-filesystem",
50
+ options: {
51
+ name: "pages",
52
+ path: "./src/pages/",
53
+ },
54
+ __key: "pages",
55
+ },
56
+ ],
57
+ };
58
+
59
+ export default config;
frontend/gatsby-ssr.tsx ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React from "react";
2
+
3
+ const codeToRunOnClient = `(function() {
4
+ try {
5
+ var mode = localStorage.getItem('darkmode');
6
+ document.getElementsByTagName("html")[0].className === 'dark' ? 'dark' : 'light';
7
+ } catch (e) {}
8
+ })();`;
9
+
10
+ export const onRenderBody = ({ setHeadComponents }) =>
11
+ setHeadComponents([
12
+ <script
13
+ key="myscript"
14
+ dangerouslySetInnerHTML={{ __html: codeToRunOnClient }}
15
+ />,
16
+ ]);
frontend/package.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "Magentic-UI",
3
+ "version": "1.0.0",
4
+ "private": true,
5
+ "description": "Magentic-UI",
6
+ "author": "Microsoft",
7
+ "keywords": [
8
+ "gatsby"
9
+ ],
10
+ "scripts": {
11
+ "develop": "gatsby clean && gatsby develop",
12
+ "dev": "npm run develop",
13
+ "start": "gatsby clean && gatsby develop",
14
+ "build": "gatsby clean && rm -rf ../src/magentic_ui/backend/web/ui && PREFIX_PATH_VALUE='' gatsby build --prefix-paths && rsync -a --delete public/ ../src/magentic_ui/backend/web/ui/",
15
+ "serve": "gatsby serve",
16
+ "clean": "gatsby clean",
17
+ "typecheck": "tsc --noEmit"
18
+ },
19
+ "dependencies": {
20
+ "@dagrejs/dagre": "^1.1.4",
21
+ "@dnd-kit/core": "^6.2.0",
22
+ "@headlessui/react": "^2.2.0",
23
+ "@hello-pangea/dnd": "^17.0.0",
24
+ "@heroicons/react": "^2.0.18",
25
+ "@mdx-js/react": "^3.1.0",
26
+ "@monaco-editor/react": "^4.6.0",
27
+ "@tailwindcss/typography": "^0.5.9",
28
+ "@xyflow/react": "^12.3.5",
29
+ "antd": "^5.22.1",
30
+ "autoprefixer": "^10.4.20",
31
+ "gatsby": "^5.14.0",
32
+ "gatsby-plugin-image": "^3.14.0",
33
+ "gatsby-plugin-manifest": "^5.14.0",
34
+ "gatsby-plugin-mdx": "^5.14.0",
35
+ "gatsby-plugin-postcss": "^6.14.0",
36
+ "gatsby-plugin-sharp": "^5.14.0",
37
+ "gatsby-plugin-sitemap": "^6.14.0",
38
+ "gatsby-source-filesystem": "^5.14.0",
39
+ "gatsby-transformer-sharp": "^5.14.0",
40
+ "install": "^0.13.0",
41
+ "js-yaml": "^4.1.0",
42
+ "lucide-react": "^0.460.0",
43
+ "postcss": "^8.4.49",
44
+ "react": "^18.2.0",
45
+ "react-dom": "^18.2.0",
46
+ "react-markdown": "^9.0.1",
47
+ "react-syntax-highlighter": "^15.6.1",
48
+ "react-vnc": "^3.0.8",
49
+ "react-window": "^1.8.11",
50
+ "remark-gfm": "^4.0.0",
51
+ "tailwindcss": "^3.4.14",
52
+ "yarn": "^1.22.22",
53
+ "zod": "^3.25.63",
54
+ "zustand": "^5.0.1"
55
+ },
56
+ "devDependencies": {
57
+ "@types/lodash.debounce": "^4.0.9",
58
+ "@types/node": "^22.9.0",
59
+ "@types/react": "^18.2.55",
60
+ "@types/react-dom": "^18.2.19",
61
+ "@types/react-syntax-highlighter": "^15.5.13",
62
+ "@types/uuid": "^10.0.0",
63
+ "typescript": "^5.3.3"
64
+ },
65
+ "resolutions": {
66
+ "tar-fs": "2.1.2",
67
+ "path-to-regexp": "0.1.12",
68
+ "prismjs": "1.30.0",
69
+ "cookie": "0.7.0",
70
+ "base-x": "3.0.11"
71
+ }
72
+ }
frontend/postcss.config.js ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
// PostCSS pipeline used by gatsby-plugin-postcss:
// Tailwind generates utility classes, then autoprefixer adds vendor prefixes.
module.exports = {
  plugins: {
    tailwindcss: {},
    autoprefixer: {},
  },
}
frontend/src/assets/logo.svg ADDED
frontend/src/components/common/AutoResizeTextarea.tsx ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React, { useEffect, useLayoutEffect, useRef } from "react";
2
+
3
+ interface AutoResizeTextareaProps
4
+ extends React.TextareaHTMLAttributes<HTMLTextAreaElement> {
5
+ value: string;
6
+ onChange: (e: React.ChangeEvent<HTMLTextAreaElement>) => void;
7
+ className: string;
8
+ minHeight?: string;
9
+ maxHeight?: string;
10
+ }
11
+
12
+ const AutoResizeTextarea: React.FC<AutoResizeTextareaProps> = ({
13
+ value,
14
+ onChange,
15
+ className,
16
+ minHeight = "30px",
17
+ maxHeight = "120px",
18
+ ...props
19
+ }) => {
20
+ const textareaRef = useRef<HTMLTextAreaElement>(null);
21
+ const observerRef = useRef<ResizeObserver | null>(null);
22
+
23
+ const adjustHeight = () => {
24
+ const textarea = textareaRef.current;
25
+ if (!textarea) return;
26
+
27
+ // Reset height to get the correct scrollHeight measurement
28
+ textarea.style.height = minHeight;
29
+
30
+ // Convert min and max heights to numbers for comparison
31
+ const minHeightPx = parseInt(minHeight);
32
+ const maxHeightPx = parseInt(maxHeight);
33
+
34
+ // Set the height to match content, bounded by min and max heights
35
+ const desiredHeight = Math.min(
36
+ Math.max(minHeightPx, textarea.scrollHeight),
37
+ maxHeightPx
38
+ );
39
+ textarea.style.height = `${desiredHeight}px`;
40
+
41
+ // Add scrollbar if content exceeds maxHeight
42
+ textarea.style.overflowY =
43
+ textarea.scrollHeight > maxHeightPx ? "auto" : "hidden";
44
+ };
45
+
46
+ // Initial height adjustment using useLayoutEffect to prevent flash
47
+ useLayoutEffect(() => {
48
+ adjustHeight();
49
+ }, []);
50
+
51
+ // Adjust height when value changes
52
+ useEffect(() => {
53
+ adjustHeight();
54
+ }, [value]);
55
+
56
+ // Setup resize observer and window resize handler
57
+ useEffect(() => {
58
+ const textarea = textareaRef.current;
59
+ if (!textarea) return;
60
+
61
+ // Create resize observer
62
+ observerRef.current = new ResizeObserver(() => {
63
+ adjustHeight();
64
+ });
65
+
66
+ // Observe both the textarea and its parent element
67
+ observerRef.current.observe(textarea);
68
+ if (textarea.parentElement) {
69
+ observerRef.current.observe(textarea.parentElement);
70
+ }
71
+
72
+ // Handle window resize
73
+ const handleResize = () => adjustHeight();
74
+ window.addEventListener("resize", handleResize);
75
+
76
+ // Setup intersection observer for visibility changes
77
+ const intersectionObserver = new IntersectionObserver(
78
+ (entries) => {
79
+ entries.forEach((entry) => {
80
+ if (entry.isIntersecting) {
81
+ adjustHeight();
82
+ }
83
+ });
84
+ },
85
+ { threshold: 0.1 }
86
+ );
87
+
88
+ intersectionObserver.observe(textarea);
89
+
90
+ return () => {
91
+ window.removeEventListener("resize", handleResize);
92
+ if (observerRef.current) {
93
+ observerRef.current.disconnect();
94
+ }
95
+ intersectionObserver.disconnect();
96
+ };
97
+ }, []);
98
+
99
+ return (
100
+ <textarea
101
+ ref={textareaRef}
102
+ value={value}
103
+ onChange={onChange}
104
+ className={className}
105
+ style={{
106
+ minHeight,
107
+ maxHeight,
108
+ overflowY: "auto",
109
+ resize: "none",
110
+ ...props.style,
111
+ }}
112
+ {...props}
113
+ />
114
+ );
115
+ };
116
+
117
+ export default AutoResizeTextarea;
frontend/src/components/common/Button.tsx ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React from "react";
2
+ import { Spin } from "antd";
3
+
4
+ export type ButtonVariant =
5
+ | "primary"
6
+ | "secondary"
7
+ | "tertiary"
8
+ | "success"
9
+ | "warning"
10
+ | "danger";
11
+ export type ButtonSize = "xs" | "sm" | "md" | "lg";
12
+
13
+ interface ButtonProps extends React.ButtonHTMLAttributes<HTMLButtonElement> {
14
+ variant?: ButtonVariant;
15
+ size?: ButtonSize;
16
+ isLoading?: boolean;
17
+ icon?: React.ReactNode;
18
+ iconPosition?: "left" | "right";
19
+ fullWidth?: boolean;
20
+ children?: React.ReactNode;
21
+ className?: string;
22
+ }
23
+
24
+ export const Button: React.FC<ButtonProps> = ({
25
+ variant = "primary",
26
+ size = "md",
27
+ isLoading = false,
28
+ icon,
29
+ iconPosition = "left",
30
+ fullWidth = false,
31
+ disabled = false,
32
+ children,
33
+ className = "",
34
+ ...props
35
+ }) => {
36
+ // Base classes shared by all buttons
37
+ const baseClasses =
38
+ "inline-flex items-center justify-center rounded-md transition-colors focus:outline-none";
39
+
40
+ // Size variations
41
+ const sizeClasses = {
42
+ xs: "px-2 py-1 text-xs",
43
+ sm: "px-2.5 py-1.5 text-sm",
44
+ md: "px-4 py-2 text-base",
45
+ lg: "px-6 py-3 text-lg",
46
+ };
47
+
48
+ // Variant classes - these would use your color variables
49
+ const variantClasses = {
50
+ primary:
51
+ "bg-magenta-800 text-white hover:bg-magenta-900 focus:ring-2 focus:ring-magenta-900",
52
+ secondary:
53
+ "bg-transparent border border-magenta-800 text-magenta-800 hover:bg-magenta-900/50",
54
+ tertiary: "bg-transparent text-gray-800 hover:text-primary",
55
+ success:
56
+ "bg-green-600 text-white hover:bg-green-700 focus:ring-2 focus:ring-green-400",
57
+ warning:
58
+ "bg-warning-primary text-white hover:bg-amber-600 focus:ring-2 focus:ring-amber-400",
59
+ danger:
60
+ "bg-red-600 text-white hover:bg-red-700 focus:ring-2 focus:ring-red-400",
61
+ };
62
+
63
+ // States
64
+ const stateClasses =
65
+ disabled || isLoading ? "opacity-60 cursor-not-allowed" : "cursor-pointer";
66
+
67
+ // Width
68
+ const widthClass = fullWidth ? "w-full" : "";
69
+
70
+ return (
71
+ <button
72
+ disabled={disabled || isLoading}
73
+ className={`
74
+ ${baseClasses}
75
+ ${sizeClasses[size]}
76
+ ${variantClasses[variant]}
77
+ ${stateClasses}
78
+ ${widthClass}
79
+ ${className}
80
+ `}
81
+ {...props}
82
+ >
83
+ {isLoading && <Spin size="small" className={children ? "mr-2" : ""} />}
84
+
85
+ {!isLoading && icon && iconPosition === "left" && (
86
+ <span className={`${children ? "mr-2" : ""}`}>{icon}</span>
87
+ )}
88
+
89
+ {children}
90
+
91
+ {!isLoading && icon && iconPosition === "right" && (
92
+ <span className={`${children ? "ml-2" : ""}`}>{icon}</span>
93
+ )}
94
+ </button>
95
+ );
96
+ };