diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000000000000000000000000000000000000..85438cdadb0226b91e66d9f12cb3122b1656bf05
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,8 @@
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
+
+# Set to false to disable anonymized telemetry
+ANONYMIZED_TELEMETRY=true
+
+# LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info
+BROWSER_USE_LOGGING_LEVEL=info
diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..da99c1975fbd3321669c447d6823a4d5a1ae8554 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,35 +1,4 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+static/*.gif filter=lfs diff=lfs merge=lfs -text
+# static/*.mp4 filter=lfs diff=lfs merge=lfs -text
+docs/images/checks-passed.png filter=lfs diff=lfs merge=lfs -text
+docs/images/laminar.png filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
new file mode 100644
index 0000000000000000000000000000000000000000..08a567b04989b9988f40d382d09dcd586dde1770
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,84 @@
+name: 🐛 Bug Report
+description: Report a bug in browser-use
+labels: ["bug", "triage"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Thanks for taking the time to fill out this bug report! Please fill out the form below to help us reproduce and fix the issue.
+
+ - type: textarea
+ id: description
+ attributes:
+ label: Bug Description
+ description: A clear and concise description of what the bug is.
+ placeholder: When I try to... the library...
+ validations:
+ required: true
+
+ - type: textarea
+ id: reproduction
+ attributes:
+ label: Reproduction Steps
+ description: Steps to reproduce the behavior
+ placeholder: |
+ 1. Install browser-use...
+ 2. Run the following task...
+ 3. See error...
+ validations:
+ required: true
+
+ - type: textarea
+ id: code
+ attributes:
+ label: Code Sample
+ description: Include a minimal code sample that reproduces the issue
+ render: python
+ validations:
+ required: true
+
+ - type: input
+ id: version
+ attributes:
+ label: Version
+ description: What version of browser-use are you using? (Run `uv pip show browser-use` to find out)
+ placeholder: "e.g., pip 0.1.26, or git main branch"
+ validations:
+ required: true
+
+ - type: dropdown
+ id: model
+ attributes:
+ label: LLM Model
+ description: Which LLM model(s) are you using?
+ multiple: true
+ options:
+ - GPT-4o
+ - GPT-4
+ - Claude 3.5 Sonnet
+ - Claude 3.5 Opus
+ - Claude 3.5 Haiku
+ - Gemini 1.5 Pro
+ - Gemini 1.5 Ultra
+ - Fireworks Mixtral
+ - DeepSeek Coder
+ - Local Model (Specify model in description)
+ - Other (specify in description)
+ validations:
+ required: true
+
+ - type: input
+ id: os
+ attributes:
+ label: Operating System
+ description: What operating system are you using?
+ placeholder: "e.g., macOS 13.1, Windows 11, Ubuntu 22.04"
+ validations:
+ required: true
+
+ - type: textarea
+ id: logs
+ attributes:
+ label: Relevant Log Output
+ description: Please copy and paste any relevant log output. This will be automatically formatted into code.
+ render: shell
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..a8607c3ab0b7014e22eb1b2b82d403056e454584
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,11 @@
+blank_issues_enabled: false # Set to true if you want to allow blank issues
+contact_links:
+ - name: π€ Quickstart Guide
+ url: https://docs.browser-use.com/quickstart
+ about: Most common issues can be resolved by following our quickstart guide
+ - name: π€ Questions and Help
+ url: https://link.browser-use.com/discord
+ about: Please ask questions in our Discord community
+ - name: π Documentation
+ url: https://docs.browser-use.com
+ about: Check our documentation for answers first
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/docs_issue.yml b/.github/ISSUE_TEMPLATE/docs_issue.yml
new file mode 100644
index 0000000000000000000000000000000000000000..b0504a44979a6272827fa9d32f95f5c2342d6c98
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/docs_issue.yml
@@ -0,0 +1,55 @@
+name: 📕 Documentation Issue
+description: Report an issue in the browser-use documentation
+labels: ["documentation"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Thanks for taking the time to improve our documentation! Please fill out the form below to help us understand the issue.
+
+ - type: dropdown
+ id: type
+ attributes:
+ label: Type of Documentation Issue
+ description: What type of documentation issue is this?
+ options:
+ - Missing documentation
+ - Incorrect documentation
+ - Unclear documentation
+ - Broken link
+ - Other (specify in description)
+ validations:
+ required: true
+
+ - type: input
+ id: page
+ attributes:
+ label: Documentation Page
+ description: Which page or section of the documentation is this about?
+ placeholder: "e.g., https://docs.browser-use.com/getting-started or Installation Guide"
+ validations:
+ required: true
+
+ - type: textarea
+ id: description
+ attributes:
+ label: Issue Description
+ description: Describe what's wrong or missing in the documentation
+ placeholder: The documentation should...
+ validations:
+ required: true
+
+ - type: textarea
+ id: suggestion
+ attributes:
+ label: Suggested Changes
+ description: If you have specific suggestions for how to improve the documentation, please share them
+ placeholder: |
+ The documentation could be improved by...
+
+ Example:
+ ```python
+ # Your suggested code example or text here
+ ```
+ validations:
+ required: true
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml
new file mode 100644
index 0000000000000000000000000000000000000000..4b5d90f93342c1dba3f24e571a886c7eff16b4f0
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.yml
@@ -0,0 +1,43 @@
+name: 💡 Feature Request
+description: Suggest a new feature for browser-use
+labels: ["enhancement"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Thanks for taking the time to suggest a new feature! Please fill out the form below to help us understand your suggestion.
+
+ - type: textarea
+ id: problem
+ attributes:
+ label: Problem Description
+ description: Is your feature request related to a problem? Please describe.
+ placeholder: I'm always frustrated when...
+ validations:
+ required: true
+
+ - type: textarea
+ id: solution
+ attributes:
+ label: Proposed Solution
+ description: Describe the solution you'd like to see
+ placeholder: It would be great if...
+ validations:
+ required: true
+
+ - type: textarea
+ id: alternatives
+ attributes:
+ label: Alternative Solutions
+ description: Describe any alternative solutions or features you've considered
+ placeholder: I've also thought about...
+
+ - type: textarea
+ id: context
+ attributes:
+ label: Additional Context
+ description: Add any other context or examples about the feature request here
+ placeholder: |
+ - Example use cases
+ - Screenshots or mockups
+ - Related issues or discussions
\ No newline at end of file
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8ee4acc7e98ab9174d40ee9661d45ab3ce136fd6
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,38 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+ release:
+ types: [published]
+
+permissions:
+ contents: read
+
+jobs:
+ deploy:
+ runs-on: ubuntu-latest
+
+ steps:
+ - uses: actions/checkout@v4
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: "3.x"
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install build hatch
+ - name: Build package
+ run: python -m build
+ - name: Publish package
+ uses: pypa/gh-action-pypi-publish@release/v1
+ with:
+ user: __token__
+ password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..0affd458965be1c34d27f1958db821b03d1b0a72
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,190 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+test_env/
+
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+temp
+tmp
+
+
+.DS_Store
+
+private_example.py
+private_example
+
+browser_cookies.json
+cookies.json
+AgentHistory.json
+cv_04_24.pdf
+AgentHistoryList.json
+*.gif
+gcp-login.json
+.vscode
+.ruff_cache
+.idea
+*.txt
+*.pdf
+*.csv
+*.json
+*.jsonl
+
+uv.lock
\ No newline at end of file
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..34d34cda0f22789b7543dfec0b3d2321a7258ea6
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,18 @@
+repos:
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.3.0
+ hooks:
+ - id: ruff
+ args: [
+ --line-length=130,
+ --select=E,F,I,
+ --fix,
+ ]
+
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v4.5.0
+ hooks:
+ - id: trailing-whitespace
+ - id: end-of-file-fixer
+ - id: check-yaml
+ - id: check-toml
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000000000000000000000000000000000000..2c0733315e415bfb5e5b353f9996ecd964d395b2
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.11
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..1ea3836ce58a4cd32c90c0b4f4e736d840d23780
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Gregor Zunic
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
index a3c5a1c84b0539855f3343d3e00484aa09cd6a0c..3a6f60fe1b96bcaf6b12561d14335fcbbc052eb1 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,193 @@
----
-title: Use
-emoji: π
-colorFrom: indigo
-colorTo: indigo
-sdk: static
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+
+
+
+
+
Enable AI to control your browser π€
+
+[](https://github.com/gregpr07/browser-use/stargazers)
+[](https://link.browser-use.com/discord)
+[](https://cloud.browser-use.com)
+[](https://docs.browser-use.com)
+[](https://x.com/gregpr07)
+[](https://x.com/mamagnus00)
+[](https://app.workweave.ai/reports/repository/org_T5Pvn3UBswTHIsN1dWS3voPg/881458615)
+
+π Browser-use is the easiest way to connect your AI agents with the browser.
+
+💡 See what others are building and share your projects in our [Discord](https://link.browser-use.com/discord)! Want Swag? Check out our [Merch store](https://browsermerch.com).
+
+π€οΈ Skip the setup - try our hosted version for instant browser automation! [Try the cloud βοΈ](https://cloud.browser-use.com) .
+
+# Quick start
+
+With pip (Python>=3.11):
+
+```bash
+pip install browser-use
+```
+
+install playwright:
+
+```bash
+playwright install
+```
+
+Spin up your agent:
+
+```python
+from langchain_openai import ChatOpenAI
+from browser_use import Agent
+import asyncio
+from dotenv import load_dotenv
+load_dotenv()
+
+async def main():
+ agent = Agent(
+ task="Compare the price of gpt-4o and DeepSeek-V3",
+ llm=ChatOpenAI(model="gpt-4o"),
+ )
+ await agent.run()
+
+asyncio.run(main())
+```
+
+Add your API keys for the provider you want to use to your `.env` file.
+
+```bash
+OPENAI_API_KEY=
+```
+
+For other settings, models, and more, check out the [documentation π](https://docs.browser-use.com).
+
+### Test with UI
+
+You can test [browser-use with a UI repository](https://github.com/browser-use/web-ui)
+
+Or simply run the gradio example:
+
+```
+uv pip install gradio
+```
+
+```bash
+python examples/ui/gradio_demo.py
+```
+
+# Demos
+
+
+
+[Task](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/shopping.py): Add grocery items to cart, and checkout.
+
+[](https://www.youtube.com/watch?v=L2Ya9PYNns8)
+
+
+
+Prompt: Add my latest LinkedIn follower to my leads in Salesforce.
+
+
+
+
+
+[Prompt](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/find_and_apply_to_jobs.py): Read my CV & find ML jobs, save them to a file, and then start applying for them in new tabs, if you need help, ask me.
+
+https://github.com/user-attachments/assets/171fb4d6-0355-46f2-863e-edb04a828d04
+
+
+
+[Prompt](https://github.com/browser-use/browser-use/blob/main/examples/browser/real_browser.py): Write a letter in Google Docs to my Papa, thanking him for everything, and save the document as a PDF.
+
+
+
+
+
+[Prompt](https://github.com/browser-use/browser-use/blob/main/examples/custom-functions/save_to_file_hugging_face.py): Look up models with a license of cc-by-sa-4.0 and sort by most likes on Hugging face, save top 5 to file.
+
+https://github.com/user-attachments/assets/de73ee39-432c-4b97-b4e8-939fd7f323b3
+
+
+
+## More examples
+
+For more examples see the [examples](examples) folder or join the [Discord](https://link.browser-use.com/discord) and show off your project.
+
+# Vision
+
+Tell your computer what to do, and it gets it done.
+
+## Roadmap
+
+### Agent
+
+- [ ] Improve agent memory (summarize, compress, RAG, etc.)
+- [ ] Enhance planning capabilities (load website specific context)
+- [ ] Reduce token consumption (system prompt, DOM state)
+
+### DOM Extraction
+
+- [ ] Improve extraction for datepickers, dropdowns, special elements
+- [ ] Improve state representation for UI elements
+
+### Rerunning tasks
+
+- [ ] LLM as fallback
+- [ ] Make it easy to define workflow templates where the LLM fills in the details
+- [ ] Return playwright script from the agent
+
+### Datasets
+
+- [ ] Create datasets for complex tasks
+- [ ] Benchmark various models against each other
+- [ ] Fine-tuning models for specific tasks
+
+### User Experience
+
+- [ ] Human-in-the-loop execution
+- [ ] Improve the generated GIF quality
+- [ ] Create various demos for tutorial execution, job application, QA testing, social media, etc.
+
+## Contributing
+
+We love contributions! Feel free to open issues for bugs or feature requests. To contribute to the docs, check out the `/docs` folder.
+
+## Local Setup
+
+To learn more about the library, check out the [local setup π](https://docs.browser-use.com/development/local-setup).
+
+## Cooperations
+
+We are forming a commission to define best practices for UI/UX design for browser agents.
+Together, we're exploring how software redesign improves the performance of AI agents and gives these companies a competitive advantage by designing their existing software to be at the forefront of the agent age.
+
+Email [Toby](mailto:tbiddle@loop11.com?subject=I%20want%20to%20join%20the%20UI/UX%20commission%20for%20AI%20agents&body=Hi%20Toby%2C%0A%0AI%20found%20you%20in%20the%20browser-use%20GitHub%20README.%0A%0A) to apply for a seat on the committee.
+
+## Swag
+
+Want to show off your Browser-use swag? Check out our [Merch store](https://browsermerch.com). Good contributors will receive swag for free π.
+
+## Citation
+
+If you use Browser Use in your research or project, please cite:
+
+```bibtex
+@software{browser_use2024,
+ author = {MΓΌller, Magnus and Ε½uniΔ, Gregor},
+ title = {Browser Use: Enable AI to control your browser},
+ year = {2024},
+ publisher = {GitHub},
+ url = {https://github.com/browser-use/browser-use}
+}
+```
+
+
+
+[](https://x.com/gregpr07)
+[](https://x.com/mamagnus00)
+
+
+
+
+Made with β€οΈ in Zurich and San Francisco
+
diff --git a/SECURITY.md b/SECURITY.md
new file mode 100644
index 0000000000000000000000000000000000000000..e0969a3a05f2fbbb22ad7ff009d25f8da9fb6589
--- /dev/null
+++ b/SECURITY.md
@@ -0,0 +1,20 @@
+## Reporting Security Issues
+
+If you believe you have found a security vulnerability in browser-use, please report it through coordinated disclosure.
+
+**Please do not report security vulnerabilities through the repository issues, discussions, or pull requests.**
+
+Instead, please open a new [Github security advisory](https://github.com/browser-use/browser-use/security/advisories/new).
+
+Please include as much of the information listed below as you can to help me better understand and resolve the issue:
+
+* The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting)
+* Full paths of source file(s) related to the manifestation of the issue
+* The location of the affected source code (tag/branch/commit or direct URL)
+* Any special configuration required to reproduce the issue
+* Step-by-step instructions to reproduce the issue
+* Proof-of-concept or exploit code (if possible)
+* Impact of the issue, including how an attacker might exploit the issue
+
+This information will help me triage your report more quickly.
+
diff --git a/browser_use/README.md b/browser_use/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ed850d74033b54ae377e8021f3849a1cc273beb4
--- /dev/null
+++ b/browser_use/README.md
@@ -0,0 +1,51 @@
+# Codebase Structure
+
+> The code structure is inspired by https://github.com/Netflix/dispatch.
+
+Very good structure on how to make a scalable codebase is also in [this repo](https://github.com/zhanymkanov/fastapi-best-practices).
+
+Just a brief document about how we should structure our backend codebase.
+
+## Code Structure
+
+```markdown
+src/
+//
+models.py
+services.py
+prompts.py
+views.py
+utils.py
+routers.py
+
+ /_/
+```
+
+### Service.py
+
+Always a single file, except if it becomes too long - more than ~500 lines, split it into \_subservices
+
+### Views.py
+
+Always split the views into two parts
+
+```python
+# All
+...
+
+# Requests
+...
+
+# Responses
+...
+```
+
+If too long β split into multiple files
+
+### Prompts.py
+
+Single file; if too long β split into multiple files (one prompt per file or so)
+
+### Routers.py
+
+Never split into more than one file
diff --git a/browser_use/__init__.py b/browser_use/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb9b1a4e30f886fda7d2aff7dade1bd7d543ecb3
--- /dev/null
+++ b/browser_use/__init__.py
@@ -0,0 +1,27 @@
+from browser_use.logging_config import setup_logging
+
+setup_logging()
+
+from browser_use.agent.prompts import SystemPrompt as SystemPrompt
+from browser_use.agent.service import Agent as Agent
+from browser_use.agent.views import ActionModel as ActionModel
+from browser_use.agent.views import ActionResult as ActionResult
+from browser_use.agent.views import AgentHistoryList as AgentHistoryList
+from browser_use.browser.browser import Browser as Browser
+from browser_use.browser.browser import BrowserConfig as BrowserConfig
+from browser_use.browser.context import BrowserContextConfig
+from browser_use.controller.service import Controller as Controller
+from browser_use.dom.service import DomService as DomService
+
+__all__ = [
+ 'Agent',
+ 'Browser',
+ 'BrowserConfig',
+ 'Controller',
+ 'DomService',
+ 'SystemPrompt',
+ 'ActionResult',
+ 'ActionModel',
+ 'AgentHistoryList',
+ 'BrowserContextConfig',
+]
diff --git a/browser_use/agent/gif.py b/browser_use/agent/gif.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cb7cbc9ce15444c2b6b69c0313580f77700177b
--- /dev/null
+++ b/browser_use/agent/gif.py
@@ -0,0 +1,325 @@
+from __future__ import annotations
+
+import base64
+import io
+import logging
+import os
+import platform
+from typing import TYPE_CHECKING, Optional
+
+from browser_use.agent.views import (
+ AgentHistoryList,
+)
+
+if TYPE_CHECKING:
+ from PIL import Image, ImageFont
+
+logger = logging.getLogger(__name__)
+
+
+def create_history_gif(
+ task: str,
+ history: AgentHistoryList,
+ #
+ output_path: str = 'agent_history.gif',
+ duration: int = 3000,
+ show_goals: bool = True,
+ show_task: bool = True,
+ show_logo: bool = False,
+ font_size: int = 40,
+ title_font_size: int = 56,
+ goal_font_size: int = 44,
+ margin: int = 40,
+ line_spacing: float = 1.5,
+) -> None:
+ """Create a GIF from the agent's history with overlaid task and goal text."""
+ if not history.history:
+ logger.warning('No history to create GIF from')
+ return
+
+ from PIL import Image, ImageFont
+
+ images = []
+
+ # if history is empty or first screenshot is None, we can't create a gif
+ if not history.history or not history.history[0].state.screenshot:
+ logger.warning('No history or first screenshot to create GIF from')
+ return
+
+ # Try to load nicer fonts
+ try:
+ # Try different font options in order of preference
+ font_options = ['Helvetica', 'Arial', 'DejaVuSans', 'Verdana']
+ font_loaded = False
+
+ for font_name in font_options:
+ try:
+ if platform.system() == 'Windows':
+ # Need to specify the abs font path on Windows
+ font_name = os.path.join(os.getenv('WIN_FONT_DIR', 'C:\\Windows\\Fonts'), font_name + '.ttf')
+ regular_font = ImageFont.truetype(font_name, font_size)
+ title_font = ImageFont.truetype(font_name, title_font_size)
+ goal_font = ImageFont.truetype(font_name, goal_font_size)
+ font_loaded = True
+ break
+ except OSError:
+ continue
+
+ if not font_loaded:
+ raise OSError('No preferred fonts found')
+
+ except OSError:
+ regular_font = ImageFont.load_default()
+ title_font = ImageFont.load_default()
+
+ goal_font = regular_font
+
+ # Load logo if requested
+ logo = None
+ if show_logo:
+ try:
+ logo = Image.open('./static/browser-use.png')
+ # Resize logo to be small (e.g., 40px height)
+ logo_height = 150
+ aspect_ratio = logo.width / logo.height
+ logo_width = int(logo_height * aspect_ratio)
+ logo = logo.resize((logo_width, logo_height), Image.Resampling.LANCZOS)
+ except Exception as e:
+ logger.warning(f'Could not load logo: {e}')
+
+ # Create task frame if requested
+ if show_task and task:
+ task_frame = _create_task_frame(
+ task,
+ history.history[0].state.screenshot,
+ title_font, # type: ignore
+ regular_font, # type: ignore
+ logo,
+ line_spacing,
+ )
+ images.append(task_frame)
+
+ # Process each history item
+ for i, item in enumerate(history.history, 1):
+ if not item.state.screenshot:
+ continue
+
+ # Convert base64 screenshot to PIL Image
+ img_data = base64.b64decode(item.state.screenshot)
+ image = Image.open(io.BytesIO(img_data))
+
+ if show_goals and item.model_output:
+ image = _add_overlay_to_image(
+ image=image,
+ step_number=i,
+ goal_text=item.model_output.current_state.next_goal,
+ regular_font=regular_font, # type: ignore
+ title_font=title_font, # type: ignore
+ margin=margin,
+ logo=logo,
+ )
+
+ images.append(image)
+
+ if images:
+ # Save the GIF
+ images[0].save(
+ output_path,
+ save_all=True,
+ append_images=images[1:],
+ duration=duration,
+ loop=0,
+ optimize=False,
+ )
+ logger.info(f'Created GIF at {output_path}')
+ else:
+ logger.warning('No images found in history to create GIF')
+
+
+def _create_task_frame(
+ task: str,
+ first_screenshot: str,
+ title_font: 'ImageFont.FreeTypeFont',
+ regular_font: 'ImageFont.FreeTypeFont',
+ logo: Optional[Image.Image] = None,
+ line_spacing: float = 1.5,
+) -> 'Image.Image':
+ """Create initial frame showing the task."""
+ from PIL import Image, ImageDraw, ImageFont
+
+ img_data = base64.b64decode(first_screenshot)
+ template = Image.open(io.BytesIO(img_data))
+ image = Image.new('RGB', template.size, (0, 0, 0))
+ draw = ImageDraw.Draw(image)
+
+ # Calculate vertical center of image
+ center_y = image.height // 2
+
+ # Draw task text with increased font size
+ margin = 140 # Increased margin
+ max_width = image.width - (2 * margin)
+ larger_font = ImageFont.truetype(regular_font.path, regular_font.size + 16) # Increase font size more
+ wrapped_text = _wrap_text(task, larger_font, max_width)
+
+ # Calculate line height with spacing
+ line_height = larger_font.size * line_spacing
+
+ # Split text into lines and draw with custom spacing
+ lines = wrapped_text.split('\n')
+ total_height = line_height * len(lines)
+
+ # Start position for first line
+ text_y = center_y - (total_height / 2) + 50 # Shifted down slightly
+
+ for line in lines:
+ # Get line width for centering
+ line_bbox = draw.textbbox((0, 0), line, font=larger_font)
+ text_x = (image.width - (line_bbox[2] - line_bbox[0])) // 2
+
+ draw.text(
+ (text_x, text_y),
+ line,
+ font=larger_font,
+ fill=(255, 255, 255),
+ )
+ text_y += line_height
+
+ # Add logo if provided (top right corner)
+ if logo:
+ logo_margin = 20
+ logo_x = image.width - logo.width - logo_margin
+ image.paste(logo, (logo_x, logo_margin), logo if logo.mode == 'RGBA' else None)
+
+ return image
+
+
+def _add_overlay_to_image(
+ image: 'Image.Image',
+ step_number: int,
+ goal_text: str,
+ regular_font: 'ImageFont.FreeTypeFont',
+ title_font: 'ImageFont.FreeTypeFont',
+ margin: int,
+ logo: Optional['Image.Image'] = None,
+ display_step: bool = True,
+ text_color: tuple[int, int, int, int] = (255, 255, 255, 255),
+ text_box_color: tuple[int, int, int, int] = (0, 0, 0, 255),
+) -> 'Image.Image':
+ """Add step number and goal overlay to an image."""
+ from PIL import Image, ImageDraw
+
+ image = image.convert('RGBA')
+ txt_layer = Image.new('RGBA', image.size, (0, 0, 0, 0))
+ draw = ImageDraw.Draw(txt_layer)
+ if display_step:
+ # Add step number (bottom left)
+ step_text = str(step_number)
+ step_bbox = draw.textbbox((0, 0), step_text, font=title_font)
+ step_width = step_bbox[2] - step_bbox[0]
+ step_height = step_bbox[3] - step_bbox[1]
+
+ # Position step number in bottom left
+ x_step = margin + 10 # Slight additional offset from edge
+ y_step = image.height - margin - step_height - 10 # Slight offset from bottom
+
+ # Draw rounded rectangle background for step number
+ padding = 20 # Increased padding
+ step_bg_bbox = (
+ x_step - padding,
+ y_step - padding,
+ x_step + step_width + padding,
+ y_step + step_height + padding,
+ )
+ draw.rounded_rectangle(
+ step_bg_bbox,
+ radius=15, # Add rounded corners
+ fill=text_box_color,
+ )
+
+ # Draw step number
+ draw.text(
+ (x_step, y_step),
+ step_text,
+ font=title_font,
+ fill=text_color,
+ )
+
+ # Draw goal text (centered, bottom)
+ max_width = image.width - (4 * margin)
+ wrapped_goal = _wrap_text(goal_text, title_font, max_width)
+ goal_bbox = draw.multiline_textbbox((0, 0), wrapped_goal, font=title_font)
+ goal_width = goal_bbox[2] - goal_bbox[0]
+ goal_height = goal_bbox[3] - goal_bbox[1]
+
+ # Center goal text horizontally, place above step number
+ x_goal = (image.width - goal_width) // 2
+ y_goal = y_step - goal_height - padding * 4 # More space between step and goal
+
+ # Draw rounded rectangle background for goal
+ padding_goal = 25 # Increased padding for goal
+ goal_bg_bbox = (
+ x_goal - padding_goal, # Remove extra space for logo
+ y_goal - padding_goal,
+ x_goal + goal_width + padding_goal,
+ y_goal + goal_height + padding_goal,
+ )
+ draw.rounded_rectangle(
+ goal_bg_bbox,
+ radius=15, # Add rounded corners
+ fill=text_box_color,
+ )
+
+ # Draw goal text
+ draw.multiline_text(
+ (x_goal, y_goal),
+ wrapped_goal,
+ font=title_font,
+ fill=text_color,
+ align='center',
+ )
+
+ # Add logo if provided (top right corner)
+ if logo:
+ logo_layer = Image.new('RGBA', image.size, (0, 0, 0, 0))
+ logo_margin = 20
+ logo_x = image.width - logo.width - logo_margin
+ logo_layer.paste(logo, (logo_x, logo_margin), logo if logo.mode == 'RGBA' else None)
+ txt_layer = Image.alpha_composite(logo_layer, txt_layer)
+
+ # Composite and convert
+ result = Image.alpha_composite(image, txt_layer)
+ return result.convert('RGB')
+
+
+def _wrap_text(text: str, font: 'ImageFont.FreeTypeFont', max_width: int) -> str:
+ """
+ Wrap text to fit within a given width.
+
+ Args:
+ text: Text to wrap
+ font: Font to use for text
+ max_width: Maximum width in pixels
+
+ Returns:
+ Wrapped text with newlines
+ """
+ words = text.split()
+ lines = []
+ current_line = []
+
+ for word in words:
+ current_line.append(word)
+ line = ' '.join(current_line)
+ bbox = font.getbbox(line)
+ if bbox[2] > max_width:
+ if len(current_line) == 1:
+ lines.append(current_line.pop())
+ else:
+ current_line.pop()
+ lines.append(' '.join(current_line))
+ current_line = [word]
+
+ if current_line:
+ lines.append(' '.join(current_line))
+
+ return '\n'.join(lines)
diff --git a/browser_use/agent/message_manager/service.py b/browser_use/agent/message_manager/service.py
new file mode 100644
index 0000000000000000000000000000000000000000..73b3cf78708a4c3d2a28a8c58ecaca4ecfd176e7
--- /dev/null
+++ b/browser_use/agent/message_manager/service.py
@@ -0,0 +1,306 @@
+from __future__ import annotations
+
+import logging
+from typing import Dict, List, Optional
+
+from langchain_core.messages import (
+ AIMessage,
+ BaseMessage,
+ HumanMessage,
+ SystemMessage,
+ ToolMessage,
+)
+from pydantic import BaseModel
+
+from browser_use.agent.message_manager.views import MessageMetadata
+from browser_use.agent.prompts import AgentMessagePrompt
+from browser_use.agent.views import ActionResult, AgentOutput, AgentStepInfo, MessageManagerState
+from browser_use.browser.views import BrowserState
+from browser_use.utils import time_execution_sync
+
+logger = logging.getLogger(__name__)
+
+
class MessageManagerSettings(BaseModel):
    """Configuration for MessageManager token budgeting and message content."""

    # Hard cap on the total token budget of the message history.
    max_input_tokens: int = 128000
    # Rough characters-per-token ratio used when no real tokenizer is available.
    estimated_characters_per_token: int = 3
    # Fixed token cost charged for each image attached to a message.
    image_tokens: int = 800
    # DOM element attributes to include when rendering browser state.
    include_attributes: list[str] = []
    # Optional extra context appended after the system prompt.
    message_context: Optional[str] = None
    # Mapping of placeholder name -> secret value to redact from stored messages.
    sensitive_data: Optional[Dict[str, str]] = None
    # File paths the agent is told it may use.
    available_file_paths: Optional[List[str]] = None
+
+
class MessageManager:
    """Maintains the LLM conversation history for the agent.

    Owns a token-budgeted list of messages (system prompt, task, browser
    states, model outputs), redacts configured sensitive data before storage,
    and trims the history when it exceeds the configured token budget.
    """

    def __init__(
        self,
        task: str,
        system_message: SystemMessage,
        settings: MessageManagerSettings = MessageManagerSettings(),
        state: MessageManagerState = MessageManagerState(),
    ):
        # NOTE(review): the defaults above are mutable default arguments — the
        # same MessageManagerSettings/MessageManagerState instances are shared
        # by every call that omits them; confirm this is intended.
        self.task = task
        self.settings = settings
        self.state = state
        self.system_prompt = system_message

        # Only initialize messages if state is empty
        if len(self.state.history.messages) == 0:
            self._init_messages()

    def _init_messages(self) -> None:
        """Initialize the message history with system message, context, task, and other initial messages"""
        self._add_message_with_tokens(self.system_prompt)

        if self.settings.message_context:
            context_message = HumanMessage(content='Context for the task' + self.settings.message_context)
            self._add_message_with_tokens(context_message)

        task_message = HumanMessage(
            content=f'Your ultimate task is: """{self.task}""". If you achieved your ultimate task, stop everything and use the done action in the next step to complete the task. If not, continue as usual.'
        )
        self._add_message_with_tokens(task_message)

        if self.settings.sensitive_data:
            # Only placeholder names are exposed to the model, never the values.
            info = f'Here are placeholders for sensitve data: {list(self.settings.sensitive_data.keys())}'
            info += 'To use them, write the placeholder name '
            info_message = HumanMessage(content=info)
            self._add_message_with_tokens(info_message)

        placeholder_message = HumanMessage(content='Example output:')
        self._add_message_with_tokens(placeholder_message)

        # One-shot example showing the model the expected tool-call format.
        tool_calls = [
            {
                'name': 'AgentOutput',
                'args': {
                    'current_state': {
                        'evaluation_previous_goal': 'Success - I opend the first page',
                        'memory': 'Starting with the new task. I have completed 1/10 steps',
                        'next_goal': 'Click on company a',
                    },
                    'action': [{'click_element': {'index': 0}}],
                },
                'id': str(self.state.tool_id),
                'type': 'tool_call',
            }
        ]

        example_tool_call = AIMessage(
            content='',
            tool_calls=tool_calls,
        )
        self._add_message_with_tokens(example_tool_call)
        self.add_tool_message(content='Browser started')

        placeholder_message = HumanMessage(content='[Your task history memory starts here]')
        self._add_message_with_tokens(placeholder_message)

        if self.settings.available_file_paths:
            filepaths_msg = HumanMessage(content=f'Here are file paths you can use: {self.settings.available_file_paths}')
            self._add_message_with_tokens(filepaths_msg)

    def add_new_task(self, new_task: str) -> None:
        """Append a follow-up task message and switch the manager to the new task."""
        content = f'Your new ultimate task is: """{new_task}""". Take the previous context into account and finish your new ultimate task. '
        msg = HumanMessage(content=content)
        self._add_message_with_tokens(msg)
        self.task = new_task

    @time_execution_sync('--add_state_message')
    def add_state_message(
        self,
        state: BrowserState,
        result: Optional[List[ActionResult]] = None,
        step_info: Optional[AgentStepInfo] = None,
        use_vision=True,
    ) -> None:
        """Add browser state as human message"""

        # if keep in memory, add to directly to history and add state without result
        if result:
            for r in result:
                if r.include_in_memory:
                    if r.extracted_content:
                        msg = HumanMessage(content='Action result: ' + str(r.extracted_content))
                        self._add_message_with_tokens(msg)
                    if r.error:
                        # if endswith \n, remove it
                        if r.error.endswith('\n'):
                            r.error = r.error[:-1]
                        # get only last line of error
                        last_line = r.error.split('\n')[-1]
                        msg = HumanMessage(content='Action error: ' + last_line)
                        self._add_message_with_tokens(msg)
                    result = None  # if result in history, we dont want to add it again

        # otherwise add state message and result to next message (which will not stay in memory)
        state_message = AgentMessagePrompt(
            state,
            result,
            include_attributes=self.settings.include_attributes,
            step_info=step_info,
        ).get_user_message(use_vision)
        self._add_message_with_tokens(state_message)

    def add_model_output(self, model_output: AgentOutput) -> None:
        """Add model output as AI message"""
        tool_calls = [
            {
                'name': 'AgentOutput',
                'args': model_output.model_dump(mode='json', exclude_unset=True),
                'id': str(self.state.tool_id),
                'type': 'tool_call',
            }
        ]

        msg = AIMessage(
            content='',
            tool_calls=tool_calls,
        )

        self._add_message_with_tokens(msg)
        # empty tool response
        self.add_tool_message(content='')

    def add_plan(self, plan: Optional[str], position: int | None = None) -> None:
        """Insert the planner's output as an AI message (no-op when plan is falsy)."""
        if plan:
            msg = AIMessage(content=plan)
            self._add_message_with_tokens(msg, position)

    @time_execution_sync('--get_messages')
    def get_messages(self) -> List[BaseMessage]:
        """Get current message list, potentially trimmed to max tokens"""

        msg = [m.message for m in self.state.history.messages]
        # debug which messages are in history with token count # log
        total_input_tokens = 0
        logger.debug(f'Messages in history: {len(self.state.history.messages)}:')
        for m in self.state.history.messages:
            total_input_tokens += m.metadata.tokens
            logger.debug(f'{m.message.__class__.__name__} - Token count: {m.metadata.tokens}')
        logger.debug(f'Total input tokens: {total_input_tokens}')

        return msg

    def _add_message_with_tokens(self, message: BaseMessage, position: int | None = None) -> None:
        """Add message with token count metadata
        position: None for last, -1 for second last, etc.
        """

        # filter out sensitive data from the message
        if self.settings.sensitive_data:
            message = self._filter_sensitive_data(message)

        token_count = self._count_tokens(message)
        metadata = MessageMetadata(tokens=token_count)
        self.state.history.add_message(message, metadata, position)

    @time_execution_sync('--filter_sensitive_data')
    def _filter_sensitive_data(self, message: BaseMessage) -> BaseMessage:
        """Filter out sensitive data from the message"""

        def replace_sensitive(value: str) -> str:
            # Replace each secret value with its placeholder key.
            if not self.settings.sensitive_data:
                return value
            for key, val in self.settings.sensitive_data.items():
                if not val:
                    continue
                value = value.replace(val, f'{key} ')
            return value

        # NOTE(review): this mutates the message in place rather than copying it.
        if isinstance(message.content, str):
            message.content = replace_sensitive(message.content)
        elif isinstance(message.content, list):
            for i, item in enumerate(message.content):
                if isinstance(item, dict) and 'text' in item:
                    item['text'] = replace_sensitive(item['text'])
                    message.content[i] = item
        return message

    def _count_tokens(self, message: BaseMessage) -> int:
        """Count tokens in a message using the model's tokenizer"""
        tokens = 0
        if isinstance(message.content, list):
            for item in message.content:
                # NOTE(review): `'image_url' in item` runs before the isinstance
                # check below — if `item` is a str this is a substring test; confirm
                # content parts are always dicts here.
                if 'image_url' in item:
                    tokens += self.settings.image_tokens
                elif isinstance(item, dict) and 'text' in item:
                    tokens += self._count_text_tokens(item['text'])
        else:
            msg = message.content
            if hasattr(message, 'tool_calls'):
                msg += str(message.tool_calls)  # type: ignore
            tokens += self._count_text_tokens(msg)
        return tokens

    def _count_text_tokens(self, text: str) -> int:
        """Count tokens in a text string"""
        tokens = len(text) // self.settings.estimated_characters_per_token  # Rough estimate if no tokenizer available
        return tokens

    def cut_messages(self):
        """Trim the last message so the history fits within max_input_tokens.

        First drops any image parts from the last message; if still over budget,
        removes a proportional tail of the last message's text.
        """
        diff = self.state.history.current_tokens - self.settings.max_input_tokens
        if diff <= 0:
            return None

        msg = self.state.history.messages[-1]

        # if list with image remove image
        if isinstance(msg.message.content, list):
            text = ''
            # NOTE(review): this removes items from the list while iterating over
            # it, which can skip elements; works for the usual [text, image] pair
            # but is fragile — confirm or iterate over a copy.
            for item in msg.message.content:
                if 'image_url' in item:
                    msg.message.content.remove(item)
                    diff -= self.settings.image_tokens
                    msg.metadata.tokens -= self.settings.image_tokens
                    self.state.history.current_tokens -= self.settings.image_tokens
                    logger.debug(
                        f'Removed image with {self.settings.image_tokens} tokens - total tokens now: {self.state.history.current_tokens}/{self.settings.max_input_tokens}'
                    )
                elif 'text' in item and isinstance(item, dict):
                    text += item['text']
            msg.message.content = text
            self.state.history.messages[-1] = msg

        if diff <= 0:
            return None

        # if still over, remove text from state message proportionally to the number of tokens needed with buffer
        # Calculate the proportion of content to remove
        proportion_to_remove = diff / msg.metadata.tokens
        if proportion_to_remove > 0.99:
            raise ValueError(
                f'Max token limit reached - history is too long - reduce the system prompt or task. '
                f'proportion_to_remove: {proportion_to_remove}'
            )
        logger.debug(
            f'Removing {proportion_to_remove * 100:.2f}% of the last message {proportion_to_remove * msg.metadata.tokens:.2f} / {msg.metadata.tokens:.2f} tokens)'
        )

        content = msg.message.content
        characters_to_remove = int(len(content) * proportion_to_remove)
        content = content[:-characters_to_remove]

        # remove tokens and old long message
        self.state.history.remove_last_state_message()

        # new message with updated content
        msg = HumanMessage(content=content)
        self._add_message_with_tokens(msg)

        last_msg = self.state.history.messages[-1]

        logger.debug(
            f'Added message with {last_msg.metadata.tokens} tokens - total tokens now: {self.state.history.current_tokens}/{self.settings.max_input_tokens} - total messages: {len(self.state.history.messages)}'
        )

    def _remove_last_state_message(self) -> None:
        """Remove last state message from history"""
        self.state.history.remove_last_state_message()

    def add_tool_message(self, content: str) -> None:
        """Add tool message to history"""
        msg = ToolMessage(content=content, tool_call_id=str(self.state.tool_id))
        self.state.tool_id += 1
        self._add_message_with_tokens(msg)
diff --git a/browser_use/agent/message_manager/tests.py b/browser_use/agent/message_manager/tests.py
new file mode 100644
index 0000000000000000000000000000000000000000..94c1beb59e2c38d69a2b7a30043d616ea0a0c29e
--- /dev/null
+++ b/browser_use/agent/message_manager/tests.py
@@ -0,0 +1,237 @@
+import pytest
+from langchain_anthropic import ChatAnthropic
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
+from langchain_openai import AzureChatOpenAI, ChatOpenAI
+
+from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings
+from browser_use.agent.views import ActionResult
+from browser_use.browser.views import BrowserState, TabInfo
+from browser_use.dom.views import DOMElementNode, DOMTextNode
+
+
@pytest.fixture(
    params=[
        ChatOpenAI(model='gpt-4o-mini'),
        AzureChatOpenAI(model='gpt-4o', api_version='2024-02-15-preview'),
        ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=100, temperature=0.0, stop=None),
    ],
    ids=['gpt-4o-mini', 'gpt-4o', 'claude-3-5-sonnet'],
)
def message_manager(request: pytest.FixtureRequest):
    """Parametrized fixture returning a MessageManager with a small token budget.

    NOTE(review): the LLM clients in `params` are instantiated at import time,
    which typically requires provider API keys in the environment — confirm
    this is intended for CI. The client instances are otherwise unused here.
    """
    task = 'Test task'
    action_descriptions = 'Test actions'
    return MessageManager(
        task=task,
        system_message=SystemMessage(content=action_descriptions),
        settings=MessageManagerSettings(
            max_input_tokens=1000,
            estimated_characters_per_token=3,
            image_tokens=800,
        ),
    )
+
+
def test_initial_messages(message_manager: MessageManager):
    """Test that message manager initializes with system and task messages"""
    history = message_manager.get_messages()

    # Exactly the system prompt plus the task message.
    assert len(history) == 2
    assert isinstance(history[0], SystemMessage)
    assert isinstance(history[1], HumanMessage)
    assert 'Test task' in history[1].content
+
+
def test_add_state_message(message_manager: MessageManager):
    """Test adding browser state message"""
    # Minimal DOM tree for the fake page.
    dom_root = DOMElementNode(
        tag_name='div',
        attributes={},
        children=[],
        is_visible=True,
        parent=None,
        xpath='//div',
    )
    browser_state = BrowserState(
        url='https://test.com',
        title='Test Page',
        element_tree=dom_root,
        selector_map={},
        tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
    )

    message_manager.add_state_message(browser_state)
    history = message_manager.get_messages()

    # System + task + the new state message.
    assert len(history) == 3
    assert isinstance(history[2], HumanMessage)
    assert 'https://test.com' in history[2].content
+
+
def test_add_state_with_memory_result(message_manager: MessageManager):
    """Test adding state with result that should be included in memory"""
    dom_root = DOMElementNode(
        tag_name='div',
        attributes={},
        children=[],
        is_visible=True,
        parent=None,
        xpath='//div',
    )
    browser_state = BrowserState(
        url='https://test.com',
        title='Test Page',
        element_tree=dom_root,
        selector_map={},
        tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
    )
    action_result = ActionResult(extracted_content='Important content', include_in_memory=True)

    message_manager.add_state_message(browser_state, [action_result])
    history = message_manager.get_messages()

    # Should have system, task, extracted content, and state messages
    assert len(history) == 4
    assert isinstance(history[2], HumanMessage)
    assert 'Important content' in history[2].content
    # The state message itself must not duplicate the memorized content.
    assert isinstance(history[3], HumanMessage)
    assert 'Important content' not in history[3].content
+
+
def test_add_state_with_non_memory_result(message_manager: MessageManager):
    """Test adding state with result that should not be included in memory"""
    dom_root = DOMElementNode(
        tag_name='div',
        attributes={},
        children=[],
        is_visible=True,
        parent=None,
        xpath='//div',
    )
    browser_state = BrowserState(
        url='https://test.com',
        title='Test Page',
        element_tree=dom_root,
        selector_map={},
        tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
    )
    action_result = ActionResult(extracted_content='Temporary content', include_in_memory=False)

    message_manager.add_state_message(browser_state, [action_result])
    history = message_manager.get_messages()

    # Should have system, task, and combined state+result message
    assert len(history) == 3
    assert isinstance(history[2], HumanMessage)
    assert 'Temporary content' in history[2].content
+
+
@pytest.mark.skip('not sure how to fix this')
@pytest.mark.parametrize('max_tokens', [100000, 10000, 5000])
def test_token_overflow_handling_with_real_flow(message_manager: MessageManager, max_tokens):
    """Test handling of token overflow in a realistic message flow"""
    # Set more realistic token limit
    message_manager.settings.max_input_tokens = max_tokens

    # Create a long sequence of interactions
    for i in range(200):  # Simulate 200 steps of interaction
        # Create state with varying content length
        state = BrowserState(
            url=f'https://test{i}.com',
            title=f'Test Page {i}',
            element_tree=DOMElementNode(
                tag_name='div',
                attributes={},
                children=[
                    DOMTextNode(
                        text=f'Content {j} ' * (10 + i),  # Increasing content length
                        is_visible=True,
                        parent=None,
                    )
                    for j in range(5)  # Multiple DOM items
                ],
                is_visible=True,
                parent=None,
                xpath='//div',
            ),
            selector_map={j: f'//div[{j}]' for j in range(5)},
            tabs=[TabInfo(page_id=1, url=f'https://test{i}.com', title=f'Test Page {i}')],
        )

        # Alternate between different types of results
        result = None
        if i % 2 == 0:  # Every other iteration
            result = ActionResult(
                extracted_content=f'Important content from step {i}' * 5,
                include_in_memory=i % 4 == 0,  # Include in memory every 4th message
            )

        # Add state message
        if result:
            message_manager.add_state_message(state, [result])
        else:
            message_manager.add_state_message(state)

        # The history may legitimately overflow; treat the documented error as a pass.
        try:
            messages = message_manager.get_messages()
        except ValueError as e:
            if 'Max token limit reached - history is too long' in str(e):
                return  # If error occurs, end the test
            else:
                raise e

        # Small slack allowed over the hard budget.
        assert message_manager.state.history.current_tokens <= message_manager.settings.max_input_tokens + 100

        last_msg = messages[-1]
        assert isinstance(last_msg, HumanMessage)

        if i % 4 == 0:
            # Memorized result should sit just before the state message.
            assert isinstance(message_manager.state.history.messages[-2].message, HumanMessage)
        if i % 2 == 0 and not i % 4 == 0:
            if isinstance(last_msg.content, list):
                assert 'Current url: https://test' in last_msg.content[0]['text']
            else:
                assert 'Current url: https://test' in last_msg.content

        # Add model output every time
        from browser_use.agent.views import AgentBrain, AgentOutput
        from browser_use.controller.registry.views import ActionModel

        output = AgentOutput(
            current_state=AgentBrain(
                evaluation_previous_goal=f'Success in step {i}',
                memory=f'Memory from step {i}',
                next_goal=f'Goal for step {i + 1}',
            ),
            action=[ActionModel()],
        )
        message_manager._remove_last_state_message()
        message_manager.add_model_output(output)

        # Get messages and verify after each addition
        messages = [m.message for m in message_manager.state.history.messages]

        # Verify token limit is respected

        # Verify essential messages are preserved
        assert isinstance(messages[0], SystemMessage)  # System prompt always first
        assert isinstance(messages[1], HumanMessage)  # Task always second
        assert 'Test task' in messages[1].content

        # Verify structure of latest messages
        assert isinstance(messages[-1], AIMessage)  # Last message should be model output
        assert f'step {i}' in messages[-1].content  # Should contain current step info

        # Log token usage for debugging
        token_usage = message_manager.state.history.current_tokens
        token_limit = message_manager.settings.max_input_tokens
        # print(f'Step {i}: Using {token_usage}/{token_limit} tokens')

        # go through all messages and verify that the token count and total tokens is correct
        total_tokens = 0
        real_tokens = []
        stored_tokens = []
        for msg in message_manager.state.history.messages:
            total_tokens += msg.metadata.tokens
            stored_tokens.append(msg.metadata.tokens)
            real_tokens.append(message_manager._count_tokens(msg.message))
        assert total_tokens == sum(real_tokens)
        assert stored_tokens == real_tokens
        assert message_manager.state.history.current_tokens == total_tokens
+
+
+# pytest -s browser_use/agent/message_manager/tests.py
diff --git a/browser_use/agent/message_manager/utils.py b/browser_use/agent/message_manager/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce9490124c8b2415e858bbb7126860ccdc4d2b26
--- /dev/null
+++ b/browser_use/agent/message_manager/utils.py
@@ -0,0 +1,127 @@
+from __future__ import annotations
+
+import json
+import logging
+import os
+from typing import Any, Optional, Type
+
+from langchain_core.messages import (
+ AIMessage,
+ BaseMessage,
+ HumanMessage,
+ SystemMessage,
+ ToolMessage,
+)
+
+logger = logging.getLogger(__name__)
+
+
def extract_json_from_model_output(content: str) -> dict:
    """Extract JSON from model output, handling both plain JSON and code-block-wrapped JSON.

    Args:
        content: Raw model output, either a bare JSON string or JSON wrapped in
            ``` fences, optionally with a language tag (e.g. ```json).

    Returns:
        The parsed JSON object.

    Raises:
        ValueError: If the content cannot be parsed as JSON (original decode
            error is chained as the cause).
    """
    try:
        # If content is wrapped in code blocks, extract just the JSON part
        if '```' in content:
            # Find the JSON content between code blocks
            content = content.split('```')[1]
            # Remove language identifier if present (e.g., 'json\n')
            if '\n' in content:
                content = content.split('\n', 1)[1]
        # Parse the cleaned content
        return json.loads(content)
    except json.JSONDecodeError as e:
        logger.warning(f'Failed to parse model output: {content} {str(e)}')
        # Chain the decode error so callers can inspect the root cause.
        raise ValueError('Could not parse response.') from e
+
+
def convert_input_messages(input_messages: list[BaseMessage], model_name: Optional[str]) -> list[BaseMessage]:
    """Convert input messages to a format that is compatible with the planner model"""
    if model_name is None:
        return input_messages

    # Only deepseek reasoning models need the non-function-calling conversion.
    is_deepseek = model_name == 'deepseek-reasoner' or 'deepseek-r1' in model_name
    if not is_deepseek:
        return input_messages

    converted = _convert_messages_for_non_function_calling_models(input_messages)
    merged = _merge_successive_messages(converted, HumanMessage)
    return _merge_successive_messages(merged, AIMessage)
+
+
def _convert_messages_for_non_function_calling_models(input_messages: list[BaseMessage]) -> list[BaseMessage]:
    """Convert messages for non-function-calling models"""
    converted: list[BaseMessage] = []
    for msg in input_messages:
        if isinstance(msg, (HumanMessage, SystemMessage)):
            # Passed through unchanged.
            converted.append(msg)
        elif isinstance(msg, ToolMessage):
            # Tool responses become plain human messages.
            converted.append(HumanMessage(content=msg.content))
        elif isinstance(msg, AIMessage):
            # Serialize tool calls as JSON text; plain AI messages pass through.
            if msg.tool_calls:
                converted.append(AIMessage(content=json.dumps(msg.tool_calls)))
            else:
                converted.append(msg)
        else:
            raise ValueError(f'Unknown message type: {type(msg)}')
    return converted
+
+
+def _merge_successive_messages(messages: list[BaseMessage], class_to_merge: Type[BaseMessage]) -> list[BaseMessage]:
+ """Some models like deepseek-reasoner dont allow multiple human messages in a row. This function merges them into one."""
+ merged_messages = []
+ streak = 0
+ for message in messages:
+ if isinstance(message, class_to_merge):
+ streak += 1
+ if streak > 1:
+ if isinstance(message.content, list):
+ merged_messages[-1].content += message.content[0]['text'] # type:ignore
+ else:
+ merged_messages[-1].content += message.content
+ else:
+ merged_messages.append(message)
+ else:
+ merged_messages.append(message)
+ streak = 0
+ return merged_messages
+
+
def save_conversation(input_messages: list[BaseMessage], response: Any, target: str, encoding: Optional[str] = None) -> None:
    """Save conversation history to file.

    Args:
        input_messages: Messages that were sent to the model.
        response: Model response; must expose `model_dump_json`.
        target: Output file path; parent directories are created as needed.
        encoding: Optional text encoding for the output file.
    """
    # Create parent folders if needed. os.path.dirname returns '' for a bare
    # filename, and os.makedirs('') raises FileNotFoundError, so guard it.
    parent_dir = os.path.dirname(target)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(
        target,
        'w',
        encoding=encoding,
    ) as f:
        _write_messages_to_file(f, input_messages)
        _write_response_to_file(f, response)
+
+
+def _write_messages_to_file(f: Any, messages: list[BaseMessage]) -> None:
+ """Write messages to conversation file"""
+ for message in messages:
+ f.write(f' {message.__class__.__name__} \n')
+
+ if isinstance(message.content, list):
+ for item in message.content:
+ if isinstance(item, dict) and item.get('type') == 'text':
+ f.write(item['text'].strip() + '\n')
+ elif isinstance(message.content, str):
+ try:
+ content = json.loads(message.content)
+ f.write(json.dumps(content, indent=2) + '\n')
+ except json.JSONDecodeError:
+ f.write(message.content.strip() + '\n')
+
+ f.write('\n')
+
+
+def _write_response_to_file(f: Any, response: Any) -> None:
+ """Write model response to conversation file"""
+ f.write(' RESPONSE\n')
+ f.write(json.dumps(json.loads(response.model_dump_json(exclude_unset=True)), indent=2))
diff --git a/browser_use/agent/message_manager/views.py b/browser_use/agent/message_manager/views.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad8c9c67056c0842bf20d7afa49f7b1ff2665e93
--- /dev/null
+++ b/browser_use/agent/message_manager/views.py
@@ -0,0 +1,129 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from langchain_core.load import dumpd, load
+from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage
+from pydantic import BaseModel, ConfigDict, Field, model_serializer, model_validator
+
+if TYPE_CHECKING:
+ from browser_use.agent.views import AgentOutput
+
+
class MessageMetadata(BaseModel):
    """Metadata for a message"""

    # Estimated token count of the associated message.
    tokens: int = 0
+
+
class ManagedMessage(BaseModel):
    """A message with its metadata"""

    # The wrapped langchain message.
    message: BaseMessage
    # Bookkeeping (token count) attached to the message.
    metadata: MessageMetadata = Field(default_factory=MessageMetadata)

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # https://github.com/pydantic/pydantic/discussions/7558
    @model_serializer(mode='wrap')
    def to_json(self, original_dump):
        """
        Returns the JSON representation of the model.

        It uses langchain's `dumps` function to serialize the `message`
        property before encoding the overall dict with json.dumps.
        """
        data = original_dump(self)

        # NOTE: We override the message field to use langchain JSON serialization.
        data['message'] = dumpd(self.message)

        return data

    # NOTE(review): this overrides BaseModel.validate (a deprecated pydantic
    # classmethod) — confirm no caller relies on the inherited behavior.
    @model_validator(mode='before')
    @classmethod
    def validate(
        cls,
        value: Any,
        *,
        strict: bool | None = None,
        from_attributes: bool | None = None,
        context: Any | None = None,
    ) -> Any:
        """
        Custom validator that uses langchain's `loads` function
        to parse the message if it is provided as a JSON string.
        """
        if isinstance(value, dict) and 'message' in value:
            # NOTE: We use langchain's load to convert the JSON string back into a BaseMessage object.
            value['message'] = load(value['message'])
        return value
+
+
class MessageHistory(BaseModel):
    """History of messages with metadata"""

    # Ordered conversation messages with their token metadata.
    messages: list[ManagedMessage] = Field(default_factory=list)
    # Running total of metadata.tokens across `messages`; kept in sync by
    # add/remove methods below.
    current_tokens: int = 0

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def add_message(self, message: BaseMessage, metadata: MessageMetadata, position: int | None = None) -> None:
        """Add message with metadata to history"""
        if position is None:
            self.messages.append(ManagedMessage(message=message, metadata=metadata))
        else:
            self.messages.insert(position, ManagedMessage(message=message, metadata=metadata))
        self.current_tokens += metadata.tokens

    def add_model_output(self, output: 'AgentOutput') -> None:
        """Add model output as AI message"""
        # NOTE(review): uses a fixed tool_call id '1' and rough token estimates,
        # unlike MessageManager.add_model_output which tracks a counter.
        tool_calls = [
            {
                'name': 'AgentOutput',
                'args': output.model_dump(mode='json', exclude_unset=True),
                'id': '1',
                'type': 'tool_call',
            }
        ]

        msg = AIMessage(
            content='',
            tool_calls=tool_calls,
        )
        self.add_message(msg, MessageMetadata(tokens=100))  # Estimate tokens for tool calls

        # Empty tool response
        tool_message = ToolMessage(content='', tool_call_id='1')
        self.add_message(tool_message, MessageMetadata(tokens=10))  # Estimate tokens for empty response

    def get_messages(self) -> list[BaseMessage]:
        """Get all messages"""
        return [m.message for m in self.messages]

    def get_total_tokens(self) -> int:
        """Get total tokens in history"""
        return self.current_tokens

    def remove_oldest_message(self) -> None:
        """Remove oldest non-system message"""
        for i, msg in enumerate(self.messages):
            if not isinstance(msg.message, SystemMessage):
                self.current_tokens -= msg.metadata.tokens
                self.messages.pop(i)
                break

    def remove_last_state_message(self) -> None:
        """Remove last state message from history"""
        # Only removes when the tail is a HumanMessage and the protected
        # system/task prefix (first two messages) would remain intact.
        if len(self.messages) > 2 and isinstance(self.messages[-1].message, HumanMessage):
            self.current_tokens -= self.messages[-1].metadata.tokens
            self.messages.pop()
+
+
class MessageManagerState(BaseModel):
    """Holds the state for MessageManager"""

    # Full conversation history with per-message token metadata.
    history: MessageHistory = Field(default_factory=MessageHistory)
    # Monotonically increasing id used to pair tool_call / tool messages.
    tool_id: int = 1

    model_config = ConfigDict(arbitrary_types_allowed=True)
diff --git a/browser_use/agent/prompts.py b/browser_use/agent/prompts.py
new file mode 100644
index 0000000000000000000000000000000000000000..b78cfe5bc652851087abc49c385f73f8cdbcf9f6
--- /dev/null
+++ b/browser_use/agent/prompts.py
@@ -0,0 +1,165 @@
+import datetime
+import importlib.resources
+from datetime import datetime
+from typing import TYPE_CHECKING, List, Optional
+
+from langchain_core.messages import HumanMessage, SystemMessage
+
+if TYPE_CHECKING:
+ from browser_use.agent.views import ActionResult, AgentStepInfo
+ from browser_use.browser.views import BrowserState
+
+
class SystemPrompt:
    """Builds the agent's system prompt from the packaged markdown template.

    Args:
        action_description: Description of the available actions (stored for
            callers; the template itself documents actions).
        max_actions_per_step: Maximum number of actions the agent may emit per step.
        override_system_message: If given, used verbatim instead of the template.
        extend_system_message: If given, appended after the base prompt.
    """

    def __init__(
        self,
        action_description: str,
        max_actions_per_step: int = 10,
        override_system_message: Optional[str] = None,
        extend_system_message: Optional[str] = None,
    ):
        self.default_action_description = action_description
        self.max_actions_per_step = max_actions_per_step

        if override_system_message:
            prompt = override_system_message
        else:
            self._load_prompt_template()
            prompt = self.prompt_template.format(max_actions=self.max_actions_per_step)

        if extend_system_message:
            prompt += f'\n{extend_system_message}'

        self.system_message = SystemMessage(content=prompt)

    def _load_prompt_template(self) -> None:
        """Load the prompt template from the markdown file.

        Raises:
            RuntimeError: If the packaged template cannot be read (original
                error chained as the cause).
        """
        try:
            # This works both in development and when installed as a package
            with importlib.resources.files('browser_use.agent').joinpath('system_prompt.md').open('r') as f:
                self.prompt_template = f.read()
        except Exception as e:
            # Chain the original exception so the root cause is preserved.
            raise RuntimeError(f'Failed to load system prompt template: {e}') from e

    def get_system_message(self) -> SystemMessage:
        """
        Get the system prompt for the agent.

        Returns:
            SystemMessage: Formatted system prompt
        """
        return self.system_message
+
+
+# Functions:
+# {self.default_action_description}
+
+# Example:
+# {self.example_response()}
+# Your AVAILABLE ACTIONS:
+# {self.default_action_description}
+
+
class AgentMessagePrompt:
    """Formats the current browser state (and optional action results) into the
    human message sent to the agent LLM."""

    def __init__(
        self,
        state: 'BrowserState',
        result: Optional[List['ActionResult']] = None,
        include_attributes: Optional[list[str]] = None,
        step_info: Optional['AgentStepInfo'] = None,
    ):
        """
        Args:
            state: Current browser state snapshot.
            result: Results of the previously executed actions, if any.
            include_attributes: DOM attributes to include when rendering
                elements; defaults to an empty list.
            step_info: Current step number / max steps, if known.
        """
        self.state = state
        self.result = result
        # Normalize None here instead of using a shared mutable default argument.
        self.include_attributes = include_attributes if include_attributes is not None else []
        self.step_info = step_info

    def get_user_message(self, use_vision: bool = True) -> HumanMessage:
        """Build the HumanMessage describing the page.

        Attaches the screenshot as an image content part when `use_vision` is
        truthy and a screenshot is present; otherwise returns plain text.
        """
        elements_text = self.state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes)

        has_content_above = (self.state.pixels_above or 0) > 0
        has_content_below = (self.state.pixels_below or 0) > 0

        if elements_text != '':
            # Mark whether the page continues beyond the visible viewport.
            if has_content_above:
                elements_text = (
                    f'... {self.state.pixels_above} pixels above - scroll or extract content to see more ...\n{elements_text}'
                )
            else:
                elements_text = f'[Start of page]\n{elements_text}'
            if has_content_below:
                elements_text = (
                    f'{elements_text}\n... {self.state.pixels_below} pixels below - scroll or extract content to see more ...'
                )
            else:
                elements_text = f'{elements_text}\n[End of page]'
        else:
            elements_text = 'empty page'

        if self.step_info:
            step_info_description = f'Current step: {self.step_info.step_number + 1}/{self.step_info.max_steps}'
        else:
            step_info_description = ''
        time_str = datetime.now().strftime('%Y-%m-%d %H:%M')
        step_info_description += f'Current date and time: {time_str}'

        state_description = f"""
[Task history memory ends]
[Current state starts here]
The following is one-time information - if you need to remember it write it to memory:
Current url: {self.state.url}
Available tabs:
{self.state.tabs}
Interactive elements from top layer of the current page inside the viewport:
{elements_text}
{step_info_description}
"""

        if self.result:
            for i, result in enumerate(self.result):
                if result.extracted_content:
                    state_description += f'\nAction result {i + 1}/{len(self.result)}: {result.extracted_content}'
                if result.error:
                    # only use last line of error
                    error = result.error.split('\n')[-1]
                    state_description += f'\nAction error {i + 1}/{len(self.result)}: ...{error}'

        # Idiomatic truthiness check (was `use_vision == True`).
        if self.state.screenshot and use_vision:
            # Format message for vision model
            return HumanMessage(
                content=[
                    {'type': 'text', 'text': state_description},
                    {
                        'type': 'image_url',
                        'image_url': {'url': f'data:image/png;base64,{self.state.screenshot}'},  # , 'detail': 'low'
                    },
                ]
            )

        return HumanMessage(content=state_description)
+
+
class PlannerPrompt(SystemPrompt):
    """System prompt for the planner agent; overrides the browser-agent prompt."""

    def get_system_message(self) -> SystemMessage:
        """Return the static planning system prompt.

        NOTE: unlike SystemPrompt, this ignores the markdown template and the
        constructor's override/extend arguments.
        """
        return SystemMessage(
            content="""You are a planning agent that helps break down tasks into smaller steps and reason about the current state.
Your role is to:
1. Analyze the current state and history
2. Evaluate progress towards the ultimate goal
3. Identify potential challenges or roadblocks
4. Suggest the next high-level steps to take

Inside your messages, there will be AI messages from different agents with different formats.

Your output format should be always a JSON object with the following fields:
{
    "state_analysis": "Brief analysis of the current state and what has been done so far",
    "progress_evaluation": "Evaluation of progress towards the ultimate goal (as percentage and description)",
    "challenges": "List any potential challenges or roadblocks",
    "next_steps": "List 2-3 concrete next steps to take",
    "reasoning": "Explain your reasoning for the suggested next steps"
}

Ignore the other AI messages output structures.

Keep your responses concise and focused on actionable insights."""
        )
diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py
new file mode 100644
index 0000000000000000000000000000000000000000..50947781fb088babad2dcb594739ed398aad4de5
--- /dev/null
+++ b/browser_use/agent/service.py
@@ -0,0 +1,964 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import re
+import time
+from pathlib import Path
+from typing import Any, Awaitable, Callable, Dict, Generic, List, Optional, TypeVar
+
+from dotenv import load_dotenv
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.messages import (
+ BaseMessage,
+ HumanMessage,
+ SystemMessage,
+)
+
+# from lmnr.sdk.decorators import observe
+from pydantic import BaseModel, ValidationError
+
+from browser_use.agent.gif import create_history_gif
+from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings
+from browser_use.agent.message_manager.utils import convert_input_messages, extract_json_from_model_output, save_conversation
+from browser_use.agent.prompts import AgentMessagePrompt, PlannerPrompt, SystemPrompt
+from browser_use.agent.views import (
+ ActionResult,
+ AgentError,
+ AgentHistory,
+ AgentHistoryList,
+ AgentOutput,
+ AgentSettings,
+ AgentState,
+ AgentStepInfo,
+ StepMetadata,
+ ToolCallingMethod,
+)
+from browser_use.browser.browser import Browser
+from browser_use.browser.context import BrowserContext
+from browser_use.browser.views import BrowserState, BrowserStateHistory
+from browser_use.controller.registry.views import ActionModel
+from browser_use.controller.service import Controller
+from browser_use.dom.history_tree_processor.service import (
+ DOMHistoryElement,
+ HistoryTreeProcessor,
+)
+from browser_use.telemetry.service import ProductTelemetry
+from browser_use.telemetry.views import (
+ AgentEndTelemetryEvent,
+ AgentRunTelemetryEvent,
+ AgentStepTelemetryEvent,
+)
+from browser_use.utils import time_execution_async, time_execution_sync
+
+load_dotenv()
+logger = logging.getLogger(__name__)
+
+
+def log_response(response: AgentOutput) -> None:
+ """Utility function to log the model's response."""
+
+ if 'Success' in response.current_state.evaluation_previous_goal:
+ emoji = 'π'
+ elif 'Failed' in response.current_state.evaluation_previous_goal:
+ emoji = 'β '
+ else:
+ emoji = 'π€·'
+
+ logger.info(f'{emoji} Eval: {response.current_state.evaluation_previous_goal}')
+ logger.info(f'π§ Memory: {response.current_state.memory}')
+ logger.info(f'π― Next goal: {response.current_state.next_goal}')
+ for i, action in enumerate(response.action):
+ logger.info(f'π οΈ Action {i + 1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}')
+
+
+# Generic type for the user-supplied context object threaded through to controller actions.
+Context = TypeVar('Context')
+
+
+class Agent(Generic[Context]):
+ @time_execution_sync('--init (agent)')
+ def __init__(
+ self,
+ task: str,
+ llm: BaseChatModel,
+ # Optional parameters
+ browser: Browser | None = None,
+ browser_context: BrowserContext | None = None,
+ controller: Controller[Context] = Controller(),
+ # Initial agent run parameters
+ sensitive_data: Optional[Dict[str, str]] = None,
+ initial_actions: Optional[List[Dict[str, Dict[str, Any]]]] = None,
+ # Cloud Callbacks
+ register_new_step_callback: Callable[['BrowserState', 'AgentOutput', int], Awaitable[None]] | None = None,
+ register_done_callback: Callable[['AgentHistoryList'], Awaitable[None]] | None = None,
+ register_external_agent_status_raise_error_callback: Callable[[], Awaitable[bool]] | None = None,
+ # Agent settings
+ use_vision: bool = True,
+ use_vision_for_planner: bool = False,
+ save_conversation_path: Optional[str] = None,
+ save_conversation_path_encoding: Optional[str] = 'utf-8',
+ max_failures: int = 3,
+ retry_delay: int = 10,
+ override_system_message: Optional[str] = None,
+ extend_system_message: Optional[str] = None,
+ max_input_tokens: int = 128000,
+ validate_output: bool = False,
+ message_context: Optional[str] = None,
+ generate_gif: bool | str = False,
+ available_file_paths: Optional[list[str]] = None,
+ include_attributes: list[str] = [
+ 'title',
+ 'type',
+ 'name',
+ 'role',
+ 'aria-label',
+ 'placeholder',
+ 'value',
+ 'alt',
+ 'aria-expanded',
+ 'data-date-format',
+ ],
+ max_actions_per_step: int = 10,
+ tool_calling_method: Optional[ToolCallingMethod] = 'auto',
+ page_extraction_llm: Optional[BaseChatModel] = None,
+ planner_llm: Optional[BaseChatModel] = None,
+ planner_interval: int = 1, # Run planner every N steps
+ # Inject state
+ injected_agent_state: Optional[AgentState] = None,
+ #
+ context: Context | None = None,
+ ):
+ if page_extraction_llm is None:
+ page_extraction_llm = llm
+
+ # Core components
+ self.task = task
+ self.llm = llm
+ self.controller = controller
+ self.sensitive_data = sensitive_data
+
+ self.settings = AgentSettings(
+ use_vision=use_vision,
+ use_vision_for_planner=use_vision_for_planner,
+ save_conversation_path=save_conversation_path,
+ save_conversation_path_encoding=save_conversation_path_encoding,
+ max_failures=max_failures,
+ retry_delay=retry_delay,
+ override_system_message=override_system_message,
+ extend_system_message=extend_system_message,
+ max_input_tokens=max_input_tokens,
+ validate_output=validate_output,
+ message_context=message_context,
+ generate_gif=generate_gif,
+ available_file_paths=available_file_paths,
+ include_attributes=include_attributes,
+ max_actions_per_step=max_actions_per_step,
+ tool_calling_method=tool_calling_method,
+ page_extraction_llm=page_extraction_llm,
+ planner_llm=planner_llm,
+ planner_interval=planner_interval,
+ )
+
+ # Initialize state
+ self.state = injected_agent_state or AgentState()
+
+ # Action setup
+ self._setup_action_models()
+ self._set_browser_use_version_and_source()
+ self.initial_actions = self._convert_initial_actions(initial_actions) if initial_actions else None
+
+ # Model setup
+ self._set_model_names()
+
+ # for models without tool calling, add available actions to context
+ self.available_actions = self.controller.registry.get_prompt_description()
+
+ self.tool_calling_method = self._set_tool_calling_method()
+ self.settings.message_context = self._set_message_context()
+
+ # Initialize message manager with state
+ self._message_manager = MessageManager(
+ task=task,
+ system_message=SystemPrompt(
+ action_description=self.available_actions,
+ max_actions_per_step=self.settings.max_actions_per_step,
+ override_system_message=override_system_message,
+ extend_system_message=extend_system_message,
+ ).get_system_message(),
+ settings=MessageManagerSettings(
+ max_input_tokens=self.settings.max_input_tokens,
+ include_attributes=self.settings.include_attributes,
+ message_context=self.settings.message_context,
+ sensitive_data=sensitive_data,
+ available_file_paths=self.settings.available_file_paths,
+ ),
+ state=self.state.message_manager_state,
+ )
+
+ # Browser setup
+ self.injected_browser = browser is not None
+ self.injected_browser_context = browser_context is not None
+ self.browser = browser if browser is not None else (None if browser_context else Browser())
+ if browser_context:
+ self.browser_context = browser_context
+ elif self.browser:
+ self.browser_context = BrowserContext(browser=self.browser, config=self.browser.config.new_context_config)
+ else:
+ self.browser = Browser()
+ self.browser_context = BrowserContext(browser=self.browser)
+
+ # Callbacks
+ self.register_new_step_callback = register_new_step_callback
+ self.register_done_callback = register_done_callback
+ self.register_external_agent_status_raise_error_callback = register_external_agent_status_raise_error_callback
+
+ # Context
+ self.context = context
+
+ # Telemetry
+ self.telemetry = ProductTelemetry()
+
+ if self.settings.save_conversation_path:
+ logger.info(f'Saving conversation to {self.settings.save_conversation_path}')
+
+    def _set_message_context(self) -> str | None:
+        """Return the message context, inlining the action list for raw-mode models.
+
+        Models driven via 'raw' tool calling cannot receive actions as tool schemas,
+        so the textual action description is appended to the context instead.
+        """
+        if self.tool_calling_method == 'raw':
+            if self.settings.message_context:
+                self.settings.message_context += f'\n\nAvailable actions: {self.available_actions}'
+            else:
+                self.settings.message_context = f'Available actions: {self.available_actions}'
+        return self.settings.message_context
+
+ def _set_browser_use_version_and_source(self) -> None:
+ """Get the version and source of the browser-use package (git or pip in a nutshell)"""
+ try:
+ # First check for repository-specific files
+ repo_files = ['.git', 'README.md', 'docs', 'examples']
+ package_root = Path(__file__).parent.parent.parent
+
+ # If all of these files/dirs exist, it's likely from git
+ if all(Path(package_root / file).exists() for file in repo_files):
+ try:
+ import subprocess
+
+ version = subprocess.check_output(['git', 'describe', '--tags']).decode('utf-8').strip()
+ except Exception:
+ version = 'unknown'
+ source = 'git'
+ else:
+ # If no repo files found, try getting version from pip
+ import pkg_resources
+
+ version = pkg_resources.get_distribution('browser-use').version
+ source = 'pip'
+ except Exception:
+ version = 'unknown'
+ source = 'unknown'
+
+ logger.debug(f'Version: {version}, Source: {source}')
+ self.version = version
+ self.source = source
+
+    def _set_model_names(self) -> None:
+        """Derive model/library names for telemetry and capability checks.
+
+        LangChain chat models expose either `model_name` or `model` depending on the
+        provider, so both attributes are probed; missing/None values become 'Unknown'.
+        """
+        self.chat_model_library = self.llm.__class__.__name__
+        self.model_name = 'Unknown'
+        if hasattr(self.llm, 'model_name'):
+            model = self.llm.model_name  # type: ignore
+            self.model_name = model if model is not None else 'Unknown'
+        elif hasattr(self.llm, 'model'):
+            model = self.llm.model  # type: ignore
+            self.model_name = model if model is not None else 'Unknown'
+
+        if self.settings.planner_llm:
+            if hasattr(self.settings.planner_llm, 'model_name'):
+                self.planner_model_name = self.settings.planner_llm.model_name  # type: ignore
+            elif hasattr(self.settings.planner_llm, 'model'):
+                self.planner_model_name = self.settings.planner_llm.model  # type: ignore
+            else:
+                self.planner_model_name = 'Unknown'
+        else:
+            # no planner configured at all
+            self.planner_model_name = None
+
+    def _setup_action_models(self) -> None:
+        """Setup dynamic action models from controller's registry.
+
+        Builds pydantic models at runtime so the LLM's structured output matches
+        exactly the actions currently registered on the controller.
+        """
+        self.ActionModel = self.controller.registry.create_action_model()
+        # Create output model with the dynamic actions
+        self.AgentOutput = AgentOutput.type_with_custom_actions(self.ActionModel)
+
+        # used to force the done action when max_steps is reached
+        self.DoneActionModel = self.controller.registry.create_action_model(include_actions=['done'])
+        self.DoneAgentOutput = AgentOutput.type_with_custom_actions(self.DoneActionModel)
+
+ def _set_tool_calling_method(self) -> Optional[ToolCallingMethod]:
+ tool_calling_method = self.settings.tool_calling_method
+ if tool_calling_method == 'auto':
+ if 'deepseek-reasoner' in self.model_name or 'deepseek-r1' in self.model_name:
+ return 'raw'
+ elif self.chat_model_library == 'ChatGoogleGenerativeAI':
+ return None
+ elif self.chat_model_library == 'ChatOpenAI':
+ return 'function_calling'
+ elif self.chat_model_library == 'AzureChatOpenAI':
+ return 'function_calling'
+ else:
+ return None
+ else:
+ return tool_calling_method
+
+    def add_new_task(self, new_task: str) -> None:
+        """Queue a follow-up task on the existing conversation (state/history are kept)."""
+        self._message_manager.add_new_task(new_task)
+
+    async def _raise_if_stopped_or_paused(self) -> None:
+        """Utility function that raises an InterruptedError if the agent is stopped or paused.
+
+        An external status callback (if registered) may also force the interrupt by
+        returning True.
+        """
+
+        if self.register_external_agent_status_raise_error_callback:
+            if await self.register_external_agent_status_raise_error_callback():
+                raise InterruptedError
+
+        if self.state.stopped or self.state.paused:
+            logger.debug('Agent paused after getting state')
+            raise InterruptedError
+
+ # @observe(name='agent.step', ignore_output=True, ignore_input=True)
+ @time_execution_async('--step (agent)')
+ async def step(self, step_info: Optional[AgentStepInfo] = None) -> None:
+ """Execute one step of the task"""
+ logger.info(f'π Step {self.state.n_steps}')
+ state = None
+ model_output = None
+ result: list[ActionResult] = []
+ step_start_time = time.time()
+ tokens = 0
+
+ try:
+ state = await self.browser_context.get_state()
+
+ await self._raise_if_stopped_or_paused()
+
+ self._message_manager.add_state_message(state, self.state.last_result, step_info, self.settings.use_vision)
+
+ # Run planner at specified intervals if planner is configured
+ if self.settings.planner_llm and self.state.n_steps % self.settings.planner_interval == 0:
+ plan = await self._run_planner()
+ # add plan before last state message
+ self._message_manager.add_plan(plan, position=-1)
+
+ if step_info and step_info.is_last_step():
+ # Add last step warning if needed
+ msg = 'Now comes your last step. Use only the "done" action now. No other actions - so here your action sequence must have length 1.'
+ msg += '\nIf the task is not yet fully finished as requested by the user, set success in "done" to false! E.g. if not all steps are fully completed.'
+ msg += '\nIf the task is fully finished, set success in "done" to true.'
+ msg += '\nInclude everything you found out for the ultimate task in the done text.'
+ logger.info('Last step finishing up')
+ self._message_manager._add_message_with_tokens(HumanMessage(content=msg))
+ self.AgentOutput = self.DoneAgentOutput
+
+ input_messages = self._message_manager.get_messages()
+ tokens = self._message_manager.state.history.current_tokens
+
+ try:
+ model_output = await self.get_next_action(input_messages)
+
+ self.state.n_steps += 1
+
+ if self.register_new_step_callback:
+ await self.register_new_step_callback(state, model_output, self.state.n_steps)
+
+ if self.settings.save_conversation_path:
+ target = self.settings.save_conversation_path + f'_{self.state.n_steps}.txt'
+ save_conversation(input_messages, model_output, target, self.settings.save_conversation_path_encoding)
+
+ self._message_manager._remove_last_state_message() # we dont want the whole state in the chat history
+
+ await self._raise_if_stopped_or_paused()
+
+ self._message_manager.add_model_output(model_output)
+ except Exception as e:
+ # model call failed, remove last state message from history
+ self._message_manager._remove_last_state_message()
+ raise e
+
+ result: list[ActionResult] = await self.multi_act(model_output.action)
+
+ self.state.last_result = result
+
+ if len(result) > 0 and result[-1].is_done:
+ logger.info(f'π Result: {result[-1].extracted_content}')
+
+ self.state.consecutive_failures = 0
+
+ except InterruptedError:
+ logger.debug('Agent paused')
+ self.state.last_result = [
+ ActionResult(
+ error='The agent was paused - now continuing actions might need to be repeated', include_in_memory=True
+ )
+ ]
+ return
+ except Exception as e:
+ result = await self._handle_step_error(e)
+ self.state.last_result = result
+
+ finally:
+ step_end_time = time.time()
+ actions = [a.model_dump(exclude_unset=True) for a in model_output.action] if model_output else []
+ self.telemetry.capture(
+ AgentStepTelemetryEvent(
+ agent_id=self.state.agent_id,
+ step=self.state.n_steps,
+ actions=actions,
+ consecutive_failures=self.state.consecutive_failures,
+ step_error=[r.error for r in result if r.error] if result else ['No result'],
+ )
+ )
+ if not result:
+ return
+
+ if state:
+ metadata = StepMetadata(
+ step_number=self.state.n_steps,
+ step_start_time=step_start_time,
+ step_end_time=step_end_time,
+ input_tokens=tokens,
+ )
+ self._make_history_item(model_output, state, result, metadata)
+
+ @time_execution_async('--handle_step_error (agent)')
+ async def _handle_step_error(self, error: Exception) -> list[ActionResult]:
+ """Handle all types of errors that can occur during a step"""
+ include_trace = logger.isEnabledFor(logging.DEBUG)
+ error_msg = AgentError.format_error(error, include_trace=include_trace)
+ prefix = f'β Result failed {self.state.consecutive_failures + 1}/{self.settings.max_failures} times:\n '
+
+ if isinstance(error, (ValidationError, ValueError)):
+ logger.error(f'{prefix}{error_msg}')
+ if 'Max token limit reached' in error_msg:
+ # cut tokens from history
+ self._message_manager.settings.max_input_tokens = self.settings.max_input_tokens - 500
+ logger.info(
+ f'Cutting tokens from history - new max input tokens: {self._message_manager.settings.max_input_tokens}'
+ )
+ self._message_manager.cut_messages()
+ elif 'Could not parse response' in error_msg:
+ # give model a hint how output should look like
+ error_msg += '\n\nReturn a valid JSON object with the required fields.'
+
+ self.state.consecutive_failures += 1
+ else:
+ from google.api_core.exceptions import ResourceExhausted
+ from openai import RateLimitError
+
+ if isinstance(error, RateLimitError) or isinstance(error, ResourceExhausted):
+ logger.warning(f'{prefix}{error_msg}')
+ await asyncio.sleep(self.settings.retry_delay)
+ self.state.consecutive_failures += 1
+ else:
+ logger.error(f'{prefix}{error_msg}')
+ self.state.consecutive_failures += 1
+
+ return [ActionResult(error=error_msg, include_in_memory=True)]
+
+    def _make_history_item(
+        self,
+        model_output: AgentOutput | None,
+        state: BrowserState,
+        result: list[ActionResult],
+        metadata: Optional[StepMetadata] = None,
+    ) -> None:
+        """Create and store history item for one completed step."""
+
+        if model_output:
+            interacted_elements = AgentHistory.get_interacted_element(model_output, state.selector_map)
+        else:
+            # step failed before the LLM produced output; keep a placeholder entry
+            interacted_elements = [None]
+
+        state_history = BrowserStateHistory(
+            url=state.url,
+            title=state.title,
+            tabs=state.tabs,
+            interacted_element=interacted_elements,
+            screenshot=state.screenshot,
+        )
+
+        history_item = AgentHistory(model_output=model_output, result=result, state=state_history, metadata=metadata)
+
+        self.state.history.history.append(history_item)
+
+ THINK_TAGS = re.compile(r'.*? ', re.DOTALL)
+ STRAY_CLOSE_TAG = re.compile(r'.*?', re.DOTALL)
+
+ def _remove_think_tags(self, text: str) -> str:
+ # Step 1: Remove well-formed ...
+ text = re.sub(self.THINK_TAGS, '', text)
+ # Step 2: If there's an unmatched closing tag ,
+ # remove everything up to and including that.
+ text = re.sub(self.STRAY_CLOSE_TAG, '', text)
+ return text.strip()
+
+ def _convert_input_messages(self, input_messages: list[BaseMessage]) -> list[BaseMessage]:
+ """Convert input messages to the correct format"""
+ if self.model_name == 'deepseek-reasoner' or 'deepseek-r1' in self.model_name:
+ return convert_input_messages(input_messages, self.model_name)
+ else:
+ return input_messages
+
+    @time_execution_async('--get_next_action (agent)')
+    async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
+        """Get next action from LLM based on current state.
+
+        Three paths: 'raw' (plain text -> strip <think> -> extract JSON),
+        None (library-default structured output), or an explicit structured-output
+        method. Raises ValueError('Could not parse response.') when parsing fails.
+        """
+        input_messages = self._convert_input_messages(input_messages)
+
+        if self.tool_calling_method == 'raw':
+            # NOTE(review): synchronous invoke inside an async method — blocks the
+            # event loop for the duration of the LLM call; confirm this is intended.
+            output = self.llm.invoke(input_messages)
+            # TODO: currently invoke does not return reasoning_content, we should override invoke
+            output.content = self._remove_think_tags(str(output.content))
+            try:
+                parsed_json = extract_json_from_model_output(output.content)
+                parsed = self.AgentOutput(**parsed_json)
+            except (ValueError, ValidationError) as e:
+                logger.warning(f'Failed to parse model output: {output} {str(e)}')
+                raise ValueError('Could not parse response.')
+
+        elif self.tool_calling_method is None:
+            structured_llm = self.llm.with_structured_output(self.AgentOutput, include_raw=True)
+            response: dict[str, Any] = await structured_llm.ainvoke(input_messages)  # type: ignore
+            parsed: AgentOutput | None = response['parsed']
+        else:
+            structured_llm = self.llm.with_structured_output(self.AgentOutput, include_raw=True, method=self.tool_calling_method)
+            response: dict[str, Any] = await structured_llm.ainvoke(input_messages)  # type: ignore
+            parsed: AgentOutput | None = response['parsed']
+
+        if parsed is None:
+            raise ValueError('Could not parse response.')
+
+        # cut the number of actions to max_actions_per_step if needed
+        if len(parsed.action) > self.settings.max_actions_per_step:
+            parsed.action = parsed.action[: self.settings.max_actions_per_step]
+
+        log_response(parsed)
+
+        return parsed
+
+ def _log_agent_run(self) -> None:
+ """Log the agent run"""
+ logger.info(f'π Starting task: {self.task}')
+
+ logger.debug(f'Version: {self.version}, Source: {self.source}')
+ self.telemetry.capture(
+ AgentRunTelemetryEvent(
+ agent_id=self.state.agent_id,
+ use_vision=self.settings.use_vision,
+ task=self.task,
+ model_name=self.model_name,
+ chat_model_library=self.chat_model_library,
+ version=self.version,
+ source=self.source,
+ )
+ )
+
+    async def take_step(self) -> tuple[bool, bool]:
+        """Take a single step (external driver API, alternative to run()).
+
+        Returns:
+            Tuple[bool, bool]: (is_done, is_valid)
+        """
+        await self.step()
+
+        if self.state.history.is_done():
+            if self.settings.validate_output:
+                # done but failed validation -> caller should keep stepping
+                if not await self._validate_output():
+                    return True, False
+
+            await self.log_completion()
+            if self.register_done_callback:
+                await self.register_done_callback(self.state.history)
+
+            return True, True
+
+        return False, False
+
+ # @observe(name='agent.run', ignore_output=True)
+ @time_execution_async('--run (agent)')
+ async def run(self, max_steps: int = 100) -> AgentHistoryList:
+ """Execute the task with maximum number of steps"""
+ try:
+ self._log_agent_run()
+
+ # Execute initial actions if provided
+ if self.initial_actions:
+ result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
+ self.state.last_result = result
+
+ for step in range(max_steps):
+ # Check if we should stop due to too many failures
+ if self.state.consecutive_failures >= self.settings.max_failures:
+ logger.error(f'β Stopping due to {self.settings.max_failures} consecutive failures')
+ break
+
+ # Check control flags before each step
+ if self.state.stopped:
+ logger.info('Agent stopped')
+ break
+
+ while self.state.paused:
+ await asyncio.sleep(0.2) # Small delay to prevent CPU spinning
+ if self.state.stopped: # Allow stopping while paused
+ break
+
+ step_info = AgentStepInfo(step_number=step, max_steps=max_steps)
+ await self.step(step_info)
+
+ if self.state.history.is_done():
+ if self.settings.validate_output and step < max_steps - 1:
+ if not await self._validate_output():
+ continue
+
+ await self.log_completion()
+ break
+ else:
+ logger.info('β Failed to complete task in maximum steps')
+
+ return self.state.history
+ finally:
+ self.telemetry.capture(
+ AgentEndTelemetryEvent(
+ agent_id=self.state.agent_id,
+ is_done=self.state.history.is_done(),
+ success=self.state.history.is_successful(),
+ steps=self.state.n_steps,
+ max_steps_reached=self.state.n_steps >= max_steps,
+ errors=self.state.history.errors(),
+ total_input_tokens=self.state.history.total_input_tokens(),
+ total_duration_seconds=self.state.history.total_duration_seconds(),
+ )
+ )
+
+ if not self.injected_browser_context:
+ await self.browser_context.close()
+
+ if not self.injected_browser and self.browser:
+ await self.browser.close()
+
+ if self.settings.generate_gif:
+ output_path: str = 'agent_history.gif'
+ if isinstance(self.settings.generate_gif, str):
+ output_path = self.settings.generate_gif
+
+ create_history_gif(task=self.task, history=self.state.history, output_path=output_path)
+
+    # @observe(name='controller.multi_act')
+    @time_execution_async('--multi-act (agent)')
+    async def multi_act(
+        self,
+        actions: list[ActionModel],
+        check_for_new_elements: bool = True,
+    ) -> list[ActionResult]:
+        """Execute multiple actions sequentially.
+
+        Stops early when an action finishes the task, errors, or (if
+        check_for_new_elements) the DOM gained elements not present when the plan
+        was made — index-based actions would then target the wrong element.
+        """
+        results = []
+
+        # snapshot the element hashes the plan was based on
+        cached_selector_map = await self.browser_context.get_selector_map()
+        cached_path_hashes = set(e.hash.branch_path_hash for e in cached_selector_map.values())
+
+        await self.browser_context.remove_highlights()
+
+        for i, action in enumerate(actions):
+            if action.get_index() is not None and i != 0:
+                new_state = await self.browser_context.get_state()
+                new_path_hashes = set(e.hash.branch_path_hash for e in new_state.selector_map.values())
+                if check_for_new_elements and not new_path_hashes.issubset(cached_path_hashes):
+                    # next action requires index but there are new elements on the page
+                    msg = f'Something new appeared after action {i} / {len(actions)}'
+                    logger.info(msg)
+                    results.append(ActionResult(extracted_content=msg, include_in_memory=True))
+                    break
+
+            await self._raise_if_stopped_or_paused()
+
+            result = await self.controller.act(
+                action,
+                self.browser_context,
+                self.settings.page_extraction_llm,
+                self.sensitive_data,
+                self.settings.available_file_paths,
+                context=self.context,
+            )
+
+            results.append(result)
+
+            logger.debug(f'Executed action {i + 1} / {len(actions)}')
+            if results[-1].is_done or results[-1].error or i == len(actions) - 1:
+                break
+
+            await asyncio.sleep(self.browser_context.config.wait_between_actions)
+            # hash all elements. if it is a subset of cached_state its fine - else break (new elements on page)
+
+        return results
+
+ async def _validate_output(self) -> bool:
+ """Validate the output of the last action is what the user wanted"""
+ system_msg = (
+ f'You are a validator of an agent who interacts with a browser. '
+ f'Validate if the output of last action is what the user wanted and if the task is completed. '
+ f'If the task is unclear defined, you can let it pass. But if something is missing or the image does not show what was requested dont let it pass. '
+ f'Try to understand the page and help the model with suggestions like scroll, do x, ... to get the solution right. '
+ f'Task to validate: {self.task}. Return a JSON object with 2 keys: is_valid and reason. '
+ f'is_valid is a boolean that indicates if the output is correct. '
+ f'reason is a string that explains why it is valid or not.'
+ f' example: {{"is_valid": false, "reason": "The user wanted to search for "cat photos", but the agent searched for "dog photos" instead."}}'
+ )
+
+ if self.browser_context.session:
+ state = await self.browser_context.get_state()
+ content = AgentMessagePrompt(
+ state=state,
+ result=self.state.last_result,
+ include_attributes=self.settings.include_attributes,
+ )
+ msg = [SystemMessage(content=system_msg), content.get_user_message(self.settings.use_vision)]
+ else:
+ # if no browser session, we can't validate the output
+ return True
+
+ class ValidationResult(BaseModel):
+ """
+ Validation results.
+ """
+
+ is_valid: bool
+ reason: str
+
+ validator = self.llm.with_structured_output(ValidationResult, include_raw=True)
+ response: dict[str, Any] = await validator.ainvoke(msg) # type: ignore
+ parsed: ValidationResult = response['parsed']
+ is_valid = parsed.is_valid
+ if not is_valid:
+ logger.info(f'β Validator decision: {parsed.reason}')
+ msg = f'The output is not yet correct. {parsed.reason}.'
+ self.state.last_result = [ActionResult(extracted_content=msg, include_in_memory=True)]
+ else:
+ logger.info(f'β
Validator decision: {parsed.reason}')
+ return is_valid
+
+ async def log_completion(self) -> None:
+ """Log the completion of the task"""
+ logger.info('β
Task completed')
+ if self.state.history.is_successful():
+ logger.info('β
Successfully')
+ else:
+ logger.info('β Unfinished')
+
+ if self.register_done_callback:
+ await self.register_done_callback(self.state.history)
+
+    async def rerun_history(
+        self,
+        history: AgentHistoryList,
+        max_retries: int = 3,
+        skip_failures: bool = True,
+        delay_between_actions: float = 2.0,
+    ) -> list[ActionResult]:
+        """
+        Rerun a saved history of actions with error handling and retry logic.
+
+        Args:
+            history: The history to replay
+            max_retries: Maximum number of retries per action
+            skip_failures: Whether to skip failed actions or stop execution
+            delay_between_actions: Delay between actions in seconds
+
+        Returns:
+            List of action results
+        """
+        # Execute initial actions if provided
+        if self.initial_actions:
+            result = await self.multi_act(self.initial_actions)
+            self.state.last_result = result
+
+        results = []
+
+        for i, history_item in enumerate(history.history):
+            goal = history_item.model_output.current_state.next_goal if history_item.model_output else ''
+            logger.info(f'Replaying step {i + 1}/{len(history.history)}: goal: {goal}')
+
+            if (
+                not history_item.model_output
+                or not history_item.model_output.action
+                or history_item.model_output.action == [None]
+            ):
+                logger.warning(f'Step {i + 1}: No action to replay, skipping')
+                results.append(ActionResult(error='No action to replay'))
+                continue
+
+            retry_count = 0
+            while retry_count < max_retries:
+                try:
+                    result = await self._execute_history_step(history_item, delay_between_actions)
+                    results.extend(result)
+                    break
+
+                except Exception as e:
+                    retry_count += 1
+                    if retry_count == max_retries:
+                        error_msg = f'Step {i + 1} failed after {max_retries} attempts: {str(e)}'
+                        logger.error(error_msg)
+                        if not skip_failures:
+                            # abort the whole replay; the error is recorded first
+                            results.append(ActionResult(error=error_msg))
+                            raise RuntimeError(error_msg)
+                    else:
+                        logger.warning(f'Step {i + 1} failed (attempt {retry_count}/{max_retries}), retrying...')
+                        await asyncio.sleep(delay_between_actions)
+
+        return results
+
+    async def _execute_history_step(self, history_item: AgentHistory, delay: float) -> list[ActionResult]:
+        """Execute a single step from history with element validation.
+
+        Element indices are remapped against the live DOM before execution; a step
+        whose target element can no longer be found raises ValueError.
+        """
+        state = await self.browser_context.get_state()
+        if not state or not history_item.model_output:
+            raise ValueError('Invalid state or model output')
+        updated_actions = []
+        for i, action in enumerate(history_item.model_output.action):
+            updated_action = await self._update_action_indices(
+                history_item.state.interacted_element[i],
+                action,
+                state,
+            )
+            updated_actions.append(updated_action)
+
+            # raise after appending so the failing index i is reported accurately
+            if updated_action is None:
+                raise ValueError(f'Could not find matching element {i} in current page')
+
+        result = await self.multi_act(updated_actions)
+
+        await asyncio.sleep(delay)
+        return result
+
+    async def _update_action_indices(
+        self,
+        historical_element: Optional[DOMHistoryElement],
+        action: ActionModel,  # Type this properly based on your action model
+        current_state: BrowserState,
+    ) -> Optional[ActionModel]:
+        """
+        Update action indices based on current page state.
+        Returns updated action or None if element cannot be found.
+        """
+        # actions without an element target (or no DOM yet) pass through unchanged
+        if not historical_element or not current_state.element_tree:
+            return action
+
+        current_element = HistoryTreeProcessor.find_history_element_in_tree(historical_element, current_state.element_tree)
+
+        if not current_element or current_element.highlight_index is None:
+            return None
+
+        old_index = action.get_index()
+        if old_index != current_element.highlight_index:
+            # action is mutated in place, then returned for convenience
+            action.set_index(current_element.highlight_index)
+            logger.info(f'Element moved in DOM, updated index from {old_index} to {current_element.highlight_index}')
+
+        return action
+
+    async def load_and_rerun(self, history_file: Optional[str | Path] = None, **kwargs) -> list[ActionResult]:
+        """
+        Load history from file and rerun it.
+
+        Args:
+            history_file: Path to the history file (defaults to 'AgentHistory.json')
+            **kwargs: Additional arguments passed to rerun_history
+
+        Returns:
+            List of action results from the replay
+        """
+        if not history_file:
+            history_file = 'AgentHistory.json'
+        history = AgentHistoryList.load_from_file(history_file, self.AgentOutput)
+        return await self.rerun_history(history, **kwargs)
+
+    def save_history(self, file_path: Optional[str | Path] = None) -> None:
+        """Save the history to a file (defaults to 'AgentHistory.json')."""
+        if not file_path:
+            file_path = 'AgentHistory.json'
+        self.state.history.save_to_file(file_path)
+
+ def pause(self) -> None:
+ """Pause the agent before the next step"""
+ logger.info('π pausing Agent ')
+ self.state.paused = True
+
+ def resume(self) -> None:
+ """Resume the agent"""
+ logger.info('βΆοΈ Agent resuming')
+ self.state.paused = False
+
+ def stop(self) -> None:
+ """Stop the agent"""
+ logger.info('βΉοΈ Agent stopping')
+ self.state.stopped = True
+
+ def _convert_initial_actions(self, actions: List[Dict[str, Dict[str, Any]]]) -> List[ActionModel]:
+ """Convert dictionary-based actions to ActionModel instances"""
+ converted_actions = []
+ action_model = self.ActionModel
+ for action_dict in actions:
+ # Each action_dict should have a single key-value pair
+ action_name = next(iter(action_dict))
+ params = action_dict[action_name]
+
+ # Get the parameter model for this action from registry
+ action_info = self.controller.registry.registry.actions[action_name]
+ param_model = action_info.param_model
+
+ # Create validated parameters using the appropriate param model
+ validated_params = param_model(**params)
+
+ # Create ActionModel instance with the validated parameters
+ action_model = self.ActionModel(**{action_name: validated_params})
+ converted_actions.append(action_model)
+
+ return converted_actions
+
+    async def _run_planner(self) -> Optional[str]:
+        """Run the planner to analyze state and suggest next steps.
+
+        Returns the planner's raw text output (ideally JSON, but any text is
+        accepted), or None when no planner LLM is configured.
+        """
+        # Skip planning if no planner_llm is set
+        if not self.settings.planner_llm:
+            return None
+
+        # Create planner message history using full message history
+        planner_messages = [
+            PlannerPrompt(self.controller.registry.get_prompt_description()).get_system_message(),
+            *self._message_manager.get_messages()[1:],  # Use full message history except the first
+        ]
+
+        if not self.settings.use_vision_for_planner and self.settings.use_vision:
+            last_state_message: HumanMessage = planner_messages[-1]
+            # remove image from last state message (planner is text-only here)
+            new_msg = ''
+            if isinstance(last_state_message.content, list):
+                for msg in last_state_message.content:
+                    if msg['type'] == 'text':  # type: ignore
+                        new_msg += msg['text']  # type: ignore
+                    elif msg['type'] == 'image_url':  # type: ignore
+                        continue  # type: ignore
+            else:
+                new_msg = last_state_message.content
+
+            planner_messages[-1] = HumanMessage(content=new_msg)
+
+        planner_messages = convert_input_messages(planner_messages, self.planner_model_name)
+
+        # Get planner output
+        response = await self.settings.planner_llm.ainvoke(planner_messages)
+        plan = str(response.content)
+        # if deepseek-reasoner, remove think tags
+        if self.planner_model_name and ('deepseek-r1' in self.planner_model_name or 'deepseek-reasoner' in self.planner_model_name):
+            plan = self._remove_think_tags(plan)
+        try:
+            plan_json = json.loads(plan)
+            logger.info(f'Planning Analysis:\n{json.dumps(plan_json, indent=4)}')
+        except json.JSONDecodeError:
+            # non-JSON plans are logged verbatim rather than rejected
+            logger.info(f'Planning Analysis:\n{plan}')
+        except Exception as e:
+            logger.debug(f'Error parsing planning analysis: {e}')
+            logger.info(f'Plan: {plan}')
+
+        return plan
+
+    @property
+    def message_manager(self) -> MessageManager:
+        """Read-only access to the agent's internal message manager."""
+        return self._message_manager
diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md
new file mode 100644
index 0000000000000000000000000000000000000000..e70ae4952fca15c61c532960d3391c6dc423ff69
--- /dev/null
+++ b/browser_use/agent/system_prompt.md
@@ -0,0 +1,69 @@
+You are an AI agent designed to automate browser tasks. Your goal is to accomplish the ultimate task following the rules.
+
+# Input Format
+Task
+Previous steps
+Current URL
+Open Tabs
+Interactive Elements
+[index]text
+- index: Numeric identifier for interaction
+- type: HTML element type (button, input, etc.)
+- text: Element description
+Example:
+[33]Submit Form
+
+- Only elements with numeric indexes in [] are interactive
+- elements without [] provide only context
+
+# Response Rules
+1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
+{{"current_state": {{"evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Mention if something unexpected happened. Shortly state why/why not",
+"memory": "Description of what has been done and what you need to remember. Be very specific. Count here ALWAYS how many times you have done something and how many remain. E.g. 0 out of 10 websites analyzed. Continue with abc and xyz",
+"next_goal": "What needs to be done with the next immediate action"}},
+"action":[{{"one_action_name": {{// action-specific parameter}}}}, // ... more actions in sequence]}}
+
+2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence.
+Common action sequences:
+- Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}]
+- Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}]
+- Actions are executed in the given order
+- If the page changes after an action, the sequence is interrupted and you get the new state.
+- Only provide the action sequence until an action which changes the page state significantly.
+- Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page
+- only use multiple actions if it makes sense.
+
+3. ELEMENT INTERACTION:
+- Only use indexes of the interactive elements
+- Elements marked with "[]Non-interactive text" are non-interactive
+
+4. NAVIGATION & ERROR HANDLING:
+- If no suitable elements exist, use other functions to complete the task
+- If stuck, try alternative approaches - like going back to a previous page, new search, new tab etc.
+- Handle popups/cookies by accepting or closing them
+- Use scroll to find elements you are looking for
+- If you want to research something, open a new tab instead of using the current tab
+- If captcha pops up, try to solve it - else try a different approach
+- If the page is not fully loaded, use wait action
+
+5. TASK COMPLETION:
+- Use the done action as the last action as soon as the ultimate task is complete
+- Don't use "done" before you are done with everything the user asked you, unless you reach the last step of max_steps.
+- If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished set success to true. If not everything the user asked for is completed set success in done to false!
+- If you have to do something repeatedly, for example if the task says "for each", "for all", or "x times", always count inside "memory" how many times you have done it and how many remain. Don't stop until you have completed everything the task asked of you. Only call done after the last step.
+- Don't hallucinate actions
+- Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task.
+
+6. VISUAL CONTEXT:
+- When an image is provided, use it to understand the page layout
+- Bounding boxes with labels on their top right corner correspond to element indexes
+
+7. Form filling:
+- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
+
+8. Long tasks:
+- Keep track of the status and subresults in the memory.
+
+9. Extraction:
+- If your task is to find information - call extract_content on the specific pages to get and store the information.
+Your responses must always be JSON in the specified format.
\ No newline at end of file
diff --git a/browser_use/agent/tests.py b/browser_use/agent/tests.py
new file mode 100644
index 0000000000000000000000000000000000000000..15c47357da7b549b8adb0f8e38f0e2224b994da3
--- /dev/null
+++ b/browser_use/agent/tests.py
@@ -0,0 +1,197 @@
+import pytest
+
+from browser_use.agent.views import (
+ ActionResult,
+ AgentBrain,
+ AgentHistory,
+ AgentHistoryList,
+ AgentOutput,
+)
+from browser_use.browser.views import BrowserState, BrowserStateHistory, TabInfo
+from browser_use.controller.registry.service import Registry
+from browser_use.controller.views import ClickElementAction, DoneAction, ExtractPageContentAction
+from browser_use.dom.views import DOMElementNode
+
+
@pytest.fixture
def sample_browser_state():
	"""Minimal BrowserState: one tab, an empty DOM tree and an empty selector map."""
	return BrowserState(
		url='https://example.com',
		title='Example Page',
		tabs=[TabInfo(url='https://example.com', title='Example Page', page_id=1)],
		screenshot='screenshot1.png',
		element_tree=DOMElementNode(
			tag_name='root',
			is_visible=True,
			parent=None,
			xpath='',
			attributes={},
			children=[],
		),
		selector_map={},
	)
+
+
@pytest.fixture
def action_registry():
	"""Registry with the three actions used by the history fixtures, returned as a dynamic ActionModel type."""
	registry = Registry()

	# Register the actions we need for testing; the bodies are no-ops because
	# only the generated pydantic model (not execution) is under test.
	@registry.action(description='Click an element', param_model=ClickElementAction)
	def click_element(params: ClickElementAction, browser=None):
		pass

	@registry.action(
		description='Extract page content',
		param_model=ExtractPageContentAction,
	)
	def extract_page_content(params: ExtractPageContentAction, browser=None):
		pass

	@registry.action(description='Mark task as done', param_model=DoneAction)
	def done(params: DoneAction):
		pass

	# Create the dynamic ActionModel with all registered actions
	return registry.create_action_model()
+
+
@pytest.fixture
def sample_history(action_registry):
	"""Three-step history: click -> extract (with an error) -> done."""
	# Create actions with nested params structure
	click_action = action_registry(click_element={'index': 1})

	extract_action = action_registry(extract_page_content={'value': 'text'})

	done_action = action_registry(done={'text': 'Task completed'})

	histories = [
		AgentHistory(
			model_output=AgentOutput(
				current_state=AgentBrain(
					evaluation_previous_goal='None',
					memory='Started task',
					next_goal='Click button',
				),
				action=[click_action],
			),
			result=[ActionResult(is_done=False)],
			state=BrowserStateHistory(
				url='https://example.com',
				title='Page 1',
				tabs=[TabInfo(url='https://example.com', title='Page 1', page_id=1)],
				screenshot='screenshot1.png',
				interacted_element=[{'xpath': '//button[1]'}],
			),
		),
		AgentHistory(
			model_output=AgentOutput(
				current_state=AgentBrain(
					evaluation_previous_goal='Clicked button',
					memory='Button clicked',
					next_goal='Extract content',
				),
				action=[extract_action],
			),
			# The only step with an error - several tests below rely on this.
			result=[
				ActionResult(
					is_done=False,
					extracted_content='Extracted text',
					error='Failed to extract completely',
				)
			],
			state=BrowserStateHistory(
				url='https://example.com/page2',
				title='Page 2',
				tabs=[TabInfo(url='https://example.com/page2', title='Page 2', page_id=2)],
				screenshot='screenshot2.png',
				interacted_element=[{'xpath': '//div[1]'}],
			),
		),
		AgentHistory(
			model_output=AgentOutput(
				current_state=AgentBrain(
					evaluation_previous_goal='Extracted content',
					memory='Content extracted',
					next_goal='Finish task',
				),
				action=[done_action],
			),
			result=[ActionResult(is_done=True, extracted_content='Task completed', error=None)],
			state=BrowserStateHistory(
				url='https://example.com/page2',
				title='Page 2',
				tabs=[TabInfo(url='https://example.com/page2', title='Page 2', page_id=2)],
				screenshot='screenshot3.png',
				interacted_element=[{'xpath': '//div[1]'}],
			),
		),
	]
	return AgentHistoryList(history=histories)
+
+
def test_last_model_output(sample_history: AgentHistoryList):
	"""last_action() returns the params of the final action in history."""
	last_output = sample_history.last_action()
	assert last_output == {'done': {'text': 'Task completed'}}
+
+
def test_get_errors(sample_history: AgentHistoryList):
	"""errors() returns one entry per step: the step's error, or None."""
	errors = sample_history.errors()
	# Three steps, only the second one recorded an error.
	assert len(errors) == 3
	assert errors == [None, 'Failed to extract completely', None]
+
+
def test_final_result(sample_history: AgentHistoryList):
	"""The extracted content of the very last result is the final result."""
	result = sample_history.final_result()
	assert result == 'Task completed'
+
+
def test_is_done(sample_history: AgentHistoryList):
	"""A history ending in a done result reports completion."""
	# `is True` instead of `== True` (flake8 E712)
	assert sample_history.is_done() is True
+
+
def test_urls(sample_history: AgentHistoryList):
	"""Both visited URLs appear in the history's URL list."""
	visited = sample_history.urls()
	for expected in ('https://example.com', 'https://example.com/page2'):
		assert expected in visited
+
+
def test_all_screenshots(sample_history: AgentHistoryList):
	"""One screenshot per step, in chronological order."""
	shots = sample_history.screenshots()
	assert len(shots) == 3
	assert shots == [f'screenshot{i}.png' for i in (1, 2, 3)]
+
+
def test_all_model_outputs(sample_history: AgentHistoryList):
	"""model_actions() returns one dict per executed action, in order."""
	outputs = sample_history.model_actions()
	assert len(outputs) == 3
	# model_actions() also injects 'interacted_element', so compare only the
	# first key/value pair (the action name -> params entry).
	assert dict([next(iter(outputs[0].items()))]) == {'click_element': {'index': 1}}
	assert dict([next(iter(outputs[1].items()))]) == {'extract_page_content': {'value': 'text'}}
	assert dict([next(iter(outputs[2].items()))]) == {'done': {'text': 'Task completed'}}
+
+
def test_all_model_outputs_filtered(sample_history: AgentHistoryList):
	"""model_actions_filtered() keeps only actions whose name is in `include`."""
	filtered = sample_history.model_actions_filtered(include=['click_element'])
	assert len(filtered) == 1
	assert filtered[0]['click_element']['index'] == 1
+
+
def test_empty_history():
	"""All accessors degrade gracefully on an empty history."""
	empty_history = AgentHistoryList(history=[])
	assert empty_history.last_action() is None
	assert empty_history.final_result() is None
	# `is False` instead of `== False` (flake8 E712)
	assert empty_history.is_done() is False
	assert len(empty_history.urls()) == 0
+
+
# Add a test to verify action creation
def test_action_creation(action_registry):
	"""The dynamic ActionModel round-trips a single action's params."""
	click_action = action_registry(click_element={'index': 1})

	assert click_action.model_dump(exclude_none=True) == {'click_element': {'index': 1}}
+
+
+# run this with:
+# pytest browser_use/agent/tests.py
diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d8249727b90e10c71c2b2c433fa9a504fc1ae52
--- /dev/null
+++ b/browser_use/agent/views.py
@@ -0,0 +1,393 @@
+from __future__ import annotations
+
+import json
+import traceback
+import uuid
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Literal, Optional, Type
+
+from langchain_core.language_models.chat_models import BaseChatModel
+from openai import RateLimitError
+from pydantic import BaseModel, ConfigDict, Field, ValidationError, create_model
+
+from browser_use.agent.message_manager.views import MessageManagerState
+from browser_use.browser.views import BrowserStateHistory
+from browser_use.controller.registry.views import ActionModel
+from browser_use.dom.history_tree_processor.service import (
+ DOMElementNode,
+ DOMHistoryElement,
+ HistoryTreeProcessor,
+)
+from browser_use.dom.views import SelectorMap
+
+ToolCallingMethod = Literal['function_calling', 'json_mode', 'raw', 'auto']
+
+
class AgentSettings(BaseModel):
	"""Options for the agent"""

	use_vision: bool = True  # include screenshots in the main LLM's input
	use_vision_for_planner: bool = False  # if False, images are stripped from planner input (see _run_planner)
	save_conversation_path: Optional[str] = None  # if set, the LLM conversation is written here
	save_conversation_path_encoding: Optional[str] = 'utf-8'
	max_failures: int = 3  # consecutive failures tolerated before giving up
	retry_delay: int = 10  # seconds; presumably the wait between retries - confirm against step loop
	max_input_tokens: int = 128000
	validate_output: bool = False
	message_context: Optional[str] = None
	generate_gif: bool | str = False  # True to generate, or a str path to write the GIF to
	available_file_paths: Optional[list[str]] = None
	override_system_message: Optional[str] = None  # replaces the default system prompt
	extend_system_message: Optional[str] = None  # appended to the default system prompt
	# DOM attributes surfaced in element descriptions shown to the LLM
	include_attributes: list[str] = [
		'title',
		'type',
		'name',
		'role',
		'tabindex',
		'aria-label',
		'placeholder',
		'value',
		'alt',
		'aria-expanded',
	]
	max_actions_per_step: int = 10

	tool_calling_method: Optional[ToolCallingMethod] = 'auto'
	page_extraction_llm: Optional[BaseChatModel] = None
	planner_llm: Optional[BaseChatModel] = None  # planning is skipped entirely when None
	planner_interval: int = 1  # Run planner every N steps
+
+
class AgentState(BaseModel):
	"""Holds all state information for an Agent"""

	agent_id: str = Field(default_factory=lambda: str(uuid.uuid4()))  # unique id per agent run
	n_steps: int = 1  # current step counter (starts at 1)
	consecutive_failures: int = 0  # compared against AgentSettings.max_failures
	last_result: Optional[List['ActionResult']] = None  # results of the previous step's actions
	history: AgentHistoryList = Field(default_factory=lambda: AgentHistoryList(history=[]))
	last_plan: Optional[str] = None  # most recent planner output, if a planner LLM is configured
	paused: bool = False
	stopped: bool = False

	message_manager_state: MessageManagerState = Field(default_factory=MessageManagerState)
+
+
@dataclass
class AgentStepInfo:
	step_number: int
	max_steps: int

	def is_last_step(self) -> bool:
		"""Return True when no further steps remain after this one."""
		remaining = self.max_steps - 1 - self.step_number
		return remaining <= 0
+
+
class ActionResult(BaseModel):
	"""Result of executing an action"""

	is_done: Optional[bool] = False  # True once the agent considers the whole task finished
	success: Optional[bool] = None  # only meaningful when is_done is True (see AgentHistoryList.is_successful)
	extracted_content: Optional[str] = None  # content produced by the action, if any
	error: Optional[str] = None  # error message when the action failed
	include_in_memory: bool = False  # whether to include in past messages as context or not
+
+
class StepMetadata(BaseModel):
	"""Metadata for a single step including timing and token information"""

	step_start_time: float  # timestamp when the step started (presumably epoch seconds - confirm)
	step_end_time: float  # timestamp when the step ended, same clock as step_start_time
	input_tokens: int  # Approximate tokens from message manager for this step
	step_number: int

	@property
	def duration_seconds(self) -> float:
		"""Calculate step duration in seconds"""
		return self.step_end_time - self.step_start_time
+
+
class AgentBrain(BaseModel):
	"""Current state of the agent, as reported by the LLM on each step."""

	evaluation_previous_goal: str  # the LLM's judgement of the previous goal's outcome
	memory: str  # running notes on progress so far
	next_goal: str  # what the next immediate action(s) should achieve
+
+
class AgentOutput(BaseModel):
	"""Output model for agent

	@dev note: this model is extended with custom actions in AgentService. You can also use some fields that are not in this model as provided by the linter, as long as they are registered in the DynamicActions model.
	"""

	model_config = ConfigDict(arbitrary_types_allowed=True)

	current_state: AgentBrain  # the agent's reasoning state for this step
	action: list[ActionModel] = Field(
		...,
		description='List of actions to execute',
		json_schema_extra={'min_items': 1},  # Ensure at least one action is provided
	)

	@staticmethod
	def type_with_custom_actions(custom_actions: Type[ActionModel]) -> Type['AgentOutput']:
		"""Build an AgentOutput subclass whose `action` list is typed with the given
		dynamically created ActionModel subclass, so custom registered actions validate."""
		model_ = create_model(
			'AgentOutput',
			__base__=AgentOutput,
			action=(
				list[custom_actions],
				Field(..., description='List of actions to execute', json_schema_extra={'min_items': 1}),
			),
			__module__=AgentOutput.__module__,
		)
		model_.__doc__ = 'AgentOutput model with custom actions'
		return model_
+
+
class AgentHistory(BaseModel):
	"""History item for agent actions.

	Captures one agent step: the model's output, the per-action results,
	the browser state at that step, and optional timing/token metadata.
	"""

	model_output: AgentOutput | None
	result: list[ActionResult]
	state: BrowserStateHistory
	metadata: Optional[StepMetadata] = None

	model_config = ConfigDict(arbitrary_types_allowed=True, protected_namespaces=())

	@staticmethod
	def get_interacted_element(model_output: AgentOutput, selector_map: SelectorMap) -> list[DOMHistoryElement | None]:
		"""Resolve each action's element index against the selector map.

		Returns one entry per action: the matched DOM element converted to its
		history representation, or None when the action has no matching index.
		"""
		elements = []
		for action in model_output.action:
			index = action.get_index()
			# Compare against None explicitly: 0 is a valid element index and
			# would be skipped by a plain truthiness check.
			if index is not None and index in selector_map:
				el: DOMElementNode = selector_map[index]
				elements.append(HistoryTreeProcessor.convert_dom_element_to_history_element(el))
			else:
				elements.append(None)
		return elements

	def model_dump(self, **kwargs) -> Dict[str, Any]:
		"""Custom serialization handling circular references"""

		# Serialize the model output by hand so the action data survives the dump.
		model_output_dump = None
		if self.model_output:
			action_dump = [action.model_dump(exclude_none=True) for action in self.model_output.action]
			model_output_dump = {
				'current_state': self.model_output.current_state.model_dump(),
				'action': action_dump,  # This preserves the actual action data
			}

		return {
			'model_output': model_output_dump,
			'result': [r.model_dump(exclude_none=True) for r in self.result],
			'state': self.state.to_dict(),
			'metadata': self.metadata.model_dump() if self.metadata else None,
		}
+
+
class AgentHistoryList(BaseModel):
	"""List of agent history items.

	Provides aggregate accessors (durations, tokens, errors, actions, ...)
	over the per-step AgentHistory entries, plus JSON (de)serialization.
	"""

	history: list[AgentHistory]

	def total_duration_seconds(self) -> float:
		"""Get total duration of all steps in seconds"""
		return sum(h.metadata.duration_seconds for h in self.history if h.metadata)

	def total_input_tokens(self) -> int:
		"""
		Get total tokens used across all steps.
		Note: These are from the approximate token counting of the message manager.
		For accurate token counting, use tools like LangChain Smith or OpenAI's token counters.
		"""
		return sum(h.metadata.input_tokens for h in self.history if h.metadata)

	def input_token_usage(self) -> list[int]:
		"""Get token usage for each step (steps without metadata are skipped)"""
		return [h.metadata.input_tokens for h in self.history if h.metadata]

	def __str__(self) -> str:
		"""Representation of the AgentHistoryList object"""
		return f'AgentHistoryList(all_results={self.action_results()}, all_model_outputs={self.model_actions()})'

	def __repr__(self) -> str:
		"""Representation of the AgentHistoryList object"""
		return self.__str__()

	def save_to_file(self, filepath: str | Path) -> None:
		"""Save history to JSON file with proper serialization"""
		# Any I/O or serialization error propagates to the caller unchanged
		# (the previous `except Exception as e: raise e` was a no-op wrapper).
		Path(filepath).parent.mkdir(parents=True, exist_ok=True)
		data = self.model_dump()
		with open(filepath, 'w', encoding='utf-8') as f:
			json.dump(data, f, indent=2)

	def model_dump(self, **kwargs) -> Dict[str, Any]:
		"""Custom serialization that properly uses AgentHistory's model_dump"""
		return {
			'history': [h.model_dump(**kwargs) for h in self.history],
		}

	@classmethod
	def load_from_file(cls, filepath: str | Path, output_model: Type[AgentOutput]) -> 'AgentHistoryList':
		"""Load history from JSON file.

		`output_model` is the (possibly action-extended) AgentOutput type used to
		re-validate each step's model output, so custom actions round-trip.
		"""
		with open(filepath, 'r', encoding='utf-8') as f:
			data = json.load(f)
		# loop through history and validate output_model actions to enrich with custom actions
		for h in data['history']:
			if h['model_output']:
				if isinstance(h['model_output'], dict):
					h['model_output'] = output_model.model_validate(h['model_output'])
				else:
					h['model_output'] = None
			if 'interacted_element' not in h['state']:
				h['state']['interacted_element'] = None
		return cls.model_validate(data)

	def last_action(self) -> None | dict:
		"""Last action in history, or None if there is no model output yet"""
		if self.history and self.history[-1].model_output:
			return self.history[-1].model_output.action[-1].model_dump(exclude_none=True)
		return None

	def errors(self) -> list[str | None]:
		"""Get all errors from history, with None for steps without errors"""
		errors = []
		for h in self.history:
			step_errors = [r.error for r in h.result if r.error]

			# each step can have only one error
			errors.append(step_errors[0] if step_errors else None)
		return errors

	def final_result(self) -> None | str:
		"""Final result from history (the extracted content of the very last result)"""
		if self.history and self.history[-1].result[-1].extracted_content:
			return self.history[-1].result[-1].extracted_content
		return None

	def is_done(self) -> bool:
		"""Check if the agent is done"""
		if self.history and len(self.history[-1].result) > 0:
			last_result = self.history[-1].result[-1]
			return last_result.is_done is True
		return False

	def is_successful(self) -> bool | None:
		"""Check if the agent completed successfully - the agent decides in the last step if it was successful or not. None if not done yet."""
		if self.history and len(self.history[-1].result) > 0:
			last_result = self.history[-1].result[-1]
			if last_result.is_done is True:
				return last_result.success
		return None

	def has_errors(self) -> bool:
		"""Check if the agent has any non-None errors"""
		return any(error is not None for error in self.errors())

	def urls(self) -> list[str | None]:
		"""Get the URL of every step, in order (may contain duplicates or None)"""
		return [h.state.url for h in self.history]

	def screenshots(self) -> list[str | None]:
		"""Get the screenshot of every step, in order (may contain None)"""
		return [h.state.screenshot for h in self.history]

	def action_names(self) -> list[str]:
		"""Get all action names from history"""
		action_names = []
		for action in self.model_actions():
			actions = list(action.keys())
			if actions:
				action_names.append(actions[0])
		return action_names

	def model_thoughts(self) -> list[AgentBrain]:
		"""Get all thoughts from history"""
		return [h.model_output.current_state for h in self.history if h.model_output]

	def model_outputs(self) -> list[AgentOutput]:
		"""Get all model outputs from history"""
		return [h.model_output for h in self.history if h.model_output]

	def model_actions(self) -> list[dict]:
		"""Get all actions (with params and interacted element) from history"""
		outputs = []

		for h in self.history:
			if h.model_output:
				for action, interacted_element in zip(h.model_output.action, h.state.interacted_element):
					output = action.model_dump(exclude_none=True)
					output['interacted_element'] = interacted_element
					outputs.append(output)
		return outputs

	def action_results(self) -> list[ActionResult]:
		"""Get all results from history"""
		results = []
		for h in self.history:
			results.extend([r for r in h.result if r])
		return results

	def extracted_content(self) -> list[str]:
		"""Get all extracted content from history"""
		content = []
		for h in self.history:
			content.extend([r.extracted_content for r in h.result if r.extracted_content])
		return content

	def model_actions_filtered(self, include: list[str] | None = None) -> list[dict]:
		"""Get the model actions whose action name is in `include`"""
		include = include or []
		result = []
		for o in self.model_actions():
			# the action name is the first key of the dumped action dict
			if list(o.keys())[0] in include:
				result.append(o)
		return result

	def number_of_steps(self) -> int:
		"""Get the number of steps in the history"""
		return len(self.history)
+
+
class AgentError:
	"""Container for agent error handling"""

	VALIDATION_ERROR = 'Invalid model output format. Please follow the correct schema.'
	RATE_LIMIT_ERROR = 'Rate limit reached. Waiting before retry.'
	NO_VALID_ACTION = 'No valid action found'

	@staticmethod
	def format_error(error: Exception, include_trace: bool = False) -> str:
		"""Format error message based on error type and optionally include trace.

		Validation and rate-limit errors map to their canonical messages; any
		other exception is stringified (with a stacktrace when requested).
		"""
		# (removed dead local `message` that was never used)
		if isinstance(error, ValidationError):
			return f'{AgentError.VALIDATION_ERROR}\nDetails: {str(error)}'
		if isinstance(error, RateLimitError):
			return AgentError.RATE_LIMIT_ERROR
		if include_trace:
			return f'{str(error)}\nStacktrace:\n{traceback.format_exc()}'
		return f'{str(error)}'
diff --git a/browser_use/browser/browser.py b/browser_use/browser/browser.py
new file mode 100644
index 0000000000000000000000000000000000000000..9278ac34c9dffb7937897dab3af1c6eefe416226
--- /dev/null
+++ b/browser_use/browser/browser.py
@@ -0,0 +1,253 @@
+"""
+Playwright browser on steroids.
+"""
+
+import asyncio
+import gc
+import logging
+from dataclasses import dataclass, field
+
+from playwright._impl._api_structures import ProxySettings
+from playwright.async_api import Browser as PlaywrightBrowser
+from playwright.async_api import (
+ Playwright,
+ async_playwright,
+)
+
+from browser_use.browser.context import BrowserContext, BrowserContextConfig
+from browser_use.utils import time_execution_async
+
+logger = logging.getLogger(__name__)
+
+
@dataclass
class BrowserConfig:
	r"""
	Configuration for the Browser.

	Default values:
		headless: False
			Whether to run browser in headless mode

		disable_security: True
			Disable browser security features

		extra_chromium_args: []
			Extra arguments to pass to the browser

		wss_url: None
			Connect to a browser instance via WebSocket

		cdp_url: None
			Connect to a browser instance via CDP

		chrome_instance_path: None
			Path to a Chrome instance to use to connect to your normal browser
			e.g. '/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome'

		proxy: None
			Proxy settings forwarded to chromium.launch()

		new_context_config: BrowserContextConfig()
			Default configuration for contexts created by this browser
	"""

	headless: bool = False
	disable_security: bool = True
	extra_chromium_args: list[str] = field(default_factory=list)
	chrome_instance_path: str | None = None
	wss_url: str | None = None
	cdp_url: str | None = None

	proxy: ProxySettings | None = field(default=None)
	new_context_config: BrowserContextConfig = field(default_factory=BrowserContextConfig)

	# When True, close() leaves the browser/playwright processes running.
	_force_keep_browser_alive: bool = False
+
+
+# @singleton: TODO - think about if a singleton makes sense here
+# @dev By default this is a singleton, but you can create multiple instances if you need to.
class Browser:
	"""
	Playwright browser on steroids.

	This is a persistent browser factory that can spawn multiple browser contexts.
	It is recommended to use only one instance of Browser per your application (RAM usage will grow otherwise).
	"""

	def __init__(
		self,
		config: BrowserConfig = BrowserConfig(),
	):
		"""Store the config; Playwright itself is started lazily in _init()."""
		logger.debug('Initializing new browser')
		self.config = config
		self.playwright: Playwright | None = None  # set on first _init()
		self.playwright_browser: PlaywrightBrowser | None = None  # set on first _init()

		# Extra chromium flags applied when security features are disabled.
		self.disable_security_args = []
		if self.config.disable_security:
			self.disable_security_args = [
				'--disable-web-security',
				'--disable-site-isolation-trials',
				'--disable-features=IsolateOrigins,site-per-process',
			]

	async def new_context(self, config: BrowserContextConfig = BrowserContextConfig()) -> BrowserContext:
		"""Create a new BrowserContext bound to this browser."""
		return BrowserContext(config=config, browser=self)

	async def get_playwright_browser(self) -> PlaywrightBrowser:
		"""Return the underlying Playwright browser, initializing it on first use."""
		if self.playwright_browser is None:
			return await self._init()

		return self.playwright_browser

	@time_execution_async('--init (browser)')
	async def _init(self):
		"""Start Playwright and set up the configured browser (CDP, WSS, local instance or fresh launch)."""
		playwright = await async_playwright().start()
		browser = await self._setup_browser(playwright)

		self.playwright = playwright
		self.playwright_browser = browser

		return self.playwright_browser

	async def _setup_cdp(self, playwright: Playwright) -> PlaywrightBrowser:
		"""Connect to a remote browser via the Chrome DevTools Protocol (config.cdp_url)."""
		if not self.config.cdp_url:
			raise ValueError('CDP URL is required')
		logger.info(f'Connecting to remote browser via CDP {self.config.cdp_url}')
		browser = await playwright.chromium.connect_over_cdp(self.config.cdp_url)
		return browser

	async def _setup_wss(self, playwright: Playwright) -> PlaywrightBrowser:
		"""Connect to a remote Playwright server via WebSocket (config.wss_url)."""
		if not self.config.wss_url:
			raise ValueError('WSS URL is required')
		logger.info(f'Connecting to remote browser via WSS {self.config.wss_url}')
		browser = await playwright.chromium.connect(self.config.wss_url)
		return browser

	async def _setup_browser_with_instance(self, playwright: Playwright) -> PlaywrightBrowser:
		"""Connect to a local Chrome (config.chrome_instance_path), starting it if needed.

		Reuses an already-running instance on the hard-coded debug port 9222 when available.
		"""
		if not self.config.chrome_instance_path:
			raise ValueError('Chrome instance path is required')
		import subprocess

		import requests

		try:
			# Check if browser is already running
			response = requests.get('http://localhost:9222/json/version', timeout=2)
			if response.status_code == 200:
				logger.info('Reusing existing Chrome instance')
				browser = await playwright.chromium.connect_over_cdp(
					endpoint_url='http://localhost:9222',
					timeout=20000,  # 20 second timeout for connection
				)
				return browser
		except requests.ConnectionError:
			logger.debug('No existing Chrome instance found, starting a new one')

		# Start a new Chrome instance with remote debugging enabled
		subprocess.Popen(
			[
				self.config.chrome_instance_path,
				'--remote-debugging-port=9222',
			]
			+ self.config.extra_chromium_args,
			stdout=subprocess.DEVNULL,
			stderr=subprocess.DEVNULL,
		)

		# Poll the debug endpoint for up to ~10s until it responds
		for _ in range(10):
			try:
				response = requests.get('http://localhost:9222/json/version', timeout=2)
				if response.status_code == 200:
					break
			except requests.ConnectionError:
				pass
			await asyncio.sleep(1)

		# Attempt to connect again after starting a new instance
		try:
			browser = await playwright.chromium.connect_over_cdp(
				endpoint_url='http://localhost:9222',
				timeout=20000,  # 20 second timeout for connection
			)
			return browser
		except Exception as e:
			logger.error(f'Failed to start a new Chrome instance.: {str(e)}')
			raise RuntimeError(
				' To start chrome in Debug mode, you need to close all existing Chrome instances and try again otherwise we can not connect to the instance.'
			)

	async def _setup_standard_browser(self, playwright: Playwright) -> PlaywrightBrowser:
		"""Launch a fresh chromium with anti-detection and anti-throttling flags."""
		browser = await playwright.chromium.launch(
			headless=self.config.headless,
			args=[
				'--no-sandbox',
				'--disable-blink-features=AutomationControlled',  # hide automation from sites
				'--disable-infobars',
				'--disable-background-timer-throttling',
				'--disable-popup-blocking',
				'--disable-backgrounding-occluded-windows',
				'--disable-renderer-backgrounding',
				'--disable-window-activation',
				'--disable-focus-on-load',
				'--no-first-run',
				'--no-default-browser-check',
				'--no-startup-window',
				'--window-position=0,0',
				# '--window-size=1280,1000',
			]
			+ self.disable_security_args
			+ self.config.extra_chromium_args,
			proxy=self.config.proxy,
		)
		return browser

	async def _setup_browser(self, playwright: Playwright) -> PlaywrightBrowser:
		"""Dispatch to the right setup method: CDP > WSS > local instance > fresh launch."""
		try:
			if self.config.cdp_url:
				return await self._setup_cdp(playwright)
			if self.config.wss_url:
				return await self._setup_wss(playwright)
			elif self.config.chrome_instance_path:
				return await self._setup_browser_with_instance(playwright)
			else:
				return await self._setup_standard_browser(playwright)
		except Exception as e:
			logger.error(f'Failed to initialize Playwright browser: {str(e)}')
			raise

	async def close(self):
		"""Close the browser and stop Playwright (no-op if _force_keep_browser_alive is set)."""
		try:
			if not self.config._force_keep_browser_alive:
				if self.playwright_browser:
					await self.playwright_browser.close()
				del self.playwright_browser
				if self.playwright:
					await self.playwright.stop()
				del self.playwright

		except Exception as e:
			logger.debug(f'Failed to close browser properly: {e}')
		finally:
			# Always drop references so a failed close doesn't pin the objects.
			self.playwright_browser = None
			self.playwright = None

			gc.collect()

	def __del__(self):
		"""Best-effort cleanup when the object is garbage-collected."""
		try:
			if self.playwright_browser or self.playwright:
				loop = asyncio.get_running_loop()
				# NOTE(review): get_running_loop() raises RuntimeError when no loop is
				# running, which would make the asyncio.run() branch below unreachable
				# (the except swallows the RuntimeError instead) - confirm intent.
				if loop.is_running():
					loop.create_task(self.close())
				else:
					asyncio.run(self.close())
		except Exception as e:
			logger.debug(f'Failed to cleanup browser in destructor: {e}')
diff --git a/browser_use/browser/context.py b/browser_use/browser/context.py
new file mode 100644
index 0000000000000000000000000000000000000000..d005be4ea6eb5a23f2434cc0655ba0650012ba2b
--- /dev/null
+++ b/browser_use/browser/context.py
@@ -0,0 +1,1353 @@
+"""
+Playwright browser on steroids.
+"""
+
+import asyncio
+import base64
+import gc
+import json
+import logging
+import os
+import re
+import time
+import uuid
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Optional, TypedDict
+
+from playwright._impl._errors import TimeoutError
+from playwright.async_api import Browser as PlaywrightBrowser
+from playwright.async_api import (
+ BrowserContext as PlaywrightBrowserContext,
+)
+from playwright.async_api import (
+ ElementHandle,
+ FrameLocator,
+ Page,
+)
+
+from browser_use.browser.views import (
+ BrowserError,
+ BrowserState,
+ TabInfo,
+ URLNotAllowedError,
+)
+from browser_use.dom.service import DomService
+from browser_use.dom.views import DOMElementNode, SelectorMap
+from browser_use.utils import time_execution_async, time_execution_sync
+
+if TYPE_CHECKING:
+ from browser_use.browser.browser import Browser
+
+logger = logging.getLogger(__name__)
+
+
class BrowserContextWindowSize(TypedDict):
    """Pixel dimensions used for the browser window / recorded video size."""

    # Width in CSS pixels
    width: int
    # Height in CSS pixels
    height: int
+
+
@dataclass
class BrowserContextConfig:
    """
    Configuration for the BrowserContext.

    Default values:
        cookies_file: None
            Path to cookies file for persistence

        disable_security: True
            Disable browser security features

        minimum_wait_page_load_time: 0.25
            Minimum time to wait before getting page state for LLM input

        wait_for_network_idle_page_load_time: 0.5
            Time to wait for network requests to finish before getting page state.
            Lower values may result in incomplete page loads.

        maximum_wait_page_load_time: 5.0
            Maximum time to wait for page load before proceeding anyway

        wait_between_actions: 0.5
            Time to wait between multiple per step actions

        browser_window_size: {
            'width': 1280,
            'height': 1100,
        }
            Default browser window size

        no_viewport: False
            Disable viewport

        save_recording_path: None
            Path to save video recordings

        save_downloads_path: None
            Path to save downloads to

        trace_path: None
            Path to save trace files. It will auto name the file with the TRACE_PATH/{context_id}.zip

        locale: None
            Specify user locale, for example en-GB, de-DE, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. If not provided, defaults to the system default locale.

        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
            custom user agent to use.

        highlight_elements: True
            Highlight elements in the DOM on the screen

        viewport_expansion: 500
            Viewport expansion in pixels. This amount will increase the number of elements which are included in the state what the LLM will see. If set to -1, all elements will be included (this leads to high token usage). If set to 0, only the elements which are visible in the viewport will be included.

        allowed_domains: None
            List of allowed domains that can be accessed. If None, all domains are allowed.
            Example: ['example.com', 'api.example.com']

        include_dynamic_attributes: bool = True
            Include dynamic attributes in the CSS selector. If you want to reuse the css_selectors, it might be better to set this to False.
    """

    cookies_file: str | None = None
    minimum_wait_page_load_time: float = 0.25
    wait_for_network_idle_page_load_time: float = 0.5
    maximum_wait_page_load_time: float = 5
    wait_between_actions: float = 0.5

    disable_security: bool = True

    browser_window_size: BrowserContextWindowSize = field(default_factory=lambda: {'width': 1280, 'height': 1100})
    no_viewport: Optional[bool] = None

    save_recording_path: str | None = None
    save_downloads_path: str | None = None
    trace_path: str | None = None
    locale: str | None = None
    user_agent: str = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    )

    highlight_elements: bool = True
    viewport_expansion: int = 500
    allowed_domains: list[str] | None = None
    include_dynamic_attributes: bool = True

    # Internal: when True, close() leaves the Playwright context open.
    _force_keep_context_alive: bool = False
+
+
@dataclass
class BrowserSession:
    """A live Playwright context plus the most recently computed page state."""

    # The underlying Playwright browser context for this session
    context: PlaywrightBrowserContext
    # Last BrowserState produced by get_state(); None until the first update
    cached_state: BrowserState | None
+
+
@dataclass
class BrowserContextState:
    """
    State of the browser context.

    Persisted separately from the session so a context can re-attach to the
    same CDP target (tab) after re-initialization.
    """

    target_id: str | None = None  # CDP target ID
+
+
+class BrowserContext:
+ def __init__(
+ self,
+ browser: 'Browser',
+ config: BrowserContextConfig = BrowserContextConfig(),
+ state: Optional[BrowserContextState] = None,
+ ):
+ self.context_id = str(uuid.uuid4())
+ logger.debug(f'Initializing new browser context with id: {self.context_id}')
+
+ self.config = config
+ self.browser = browser
+
+ self.state = state or BrowserContextState()
+
+ # Initialize these as None - they'll be set up when needed
+ self.session: BrowserSession | None = None
+
    async def __aenter__(self):
        """Async context manager entry"""
        # Eagerly create the Playwright context/page so `async with` yields a ready object.
        await self._initialize_session()
        return self
+
    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit"""
        # Tear down regardless of whether the body raised; exceptions propagate.
        await self.close()
+
    @time_execution_async('--close')
    async def close(self):
        """Close the browser instance.

        Teardown order matters: unsubscribe the CDP page listener first, then
        persist cookies, stop tracing, and finally close the Playwright context
        (unless configured to keep it alive). References are always cleared.
        """
        logger.debug('Closing browser context')

        try:
            # Nothing to tear down if the session was never initialized.
            if self.session is None:
                return

            # Then remove CDP protocol listeners
            if self._page_event_handler and self.session.context:
                try:
                    # This actually sends a CDP command to unsubscribe
                    self.session.context.remove_listener('page', self._page_event_handler)
                except Exception as e:
                    logger.debug(f'Failed to remove CDP listener: {e}')
                self._page_event_handler = None

            # Persist cookies before the context is destroyed.
            await self.save_cookies()

            if self.config.trace_path:
                try:
                    await self.session.context.tracing.stop(path=os.path.join(self.config.trace_path, f'{self.context_id}.zip'))
                except Exception as e:
                    logger.debug(f'Failed to stop tracing: {e}')

            # This is crucial - it closes the CDP connection
            if not self.config._force_keep_context_alive:
                try:
                    await self.session.context.close()
                except Exception as e:
                    logger.debug(f'Failed to close context: {e}')

        finally:
            # Dereference everything
            self.session = None
            self._page_event_handler = None
+
    def __del__(self):
        """Cleanup when object is destroyed"""
        if not self.config._force_keep_context_alive and self.session is not None:
            logger.debug('BrowserContext was not properly closed before destruction')
            try:
                # Use sync Playwright method for force cleanup
                # NOTE(review): asyncio.run() raises if an event loop is already
                # running in this thread; the except below swallows that, so this
                # destructor cleanup is best-effort only — confirm intended.
                if hasattr(self.session.context, '_impl_obj'):
                    asyncio.run(self.session.context._impl_obj.close())

                self.session = None
                gc.collect()
            except Exception as e:
                logger.warning(f'Failed to force close browser context: {e}')
+
    @time_execution_async('--initialize_session')
    async def _initialize_session(self):
        """Initialize the browser session.

        Creates (or reuses) a Playwright context and picks an active page.
        When connected over CDP, a previously saved target ID is preferred so
        the agent re-attaches to the same tab; the chosen page's target ID is
        recorded for the next re-initialization. Returns the new BrowserSession.
        """
        logger.debug('Initializing browser context')

        playwright_browser = await self.browser.get_playwright_browser()
        context = await self._create_context(playwright_browser)
        self._page_event_handler = None

        # Get or create a page to use
        pages = context.pages

        # Session is assigned before page selection so helper methods that call
        # get_session() during init see a (partially ready) session.
        self.session = BrowserSession(
            context=context,
            cached_state=None,
        )

        active_page = None
        if self.browser.config.cdp_url:
            # If we have a saved target ID, try to find and activate it
            if self.state.target_id:
                targets = await self._get_cdp_targets()
                for target in targets:
                    if target['targetId'] == self.state.target_id:
                        # Find matching page by URL
                        for page in pages:
                            if page.url == target['url']:
                                active_page = page
                                break
                        break

        # If no target ID or couldn't find it, use existing page or create new
        if not active_page:
            if pages:
                active_page = pages[0]
                logger.debug('Using existing page')
            else:
                active_page = await context.new_page()
                logger.debug('Created new page')

            # Get target ID for the active page
            if self.browser.config.cdp_url:
                targets = await self._get_cdp_targets()
                for target in targets:
                    if target['url'] == active_page.url:
                        self.state.target_id = target['targetId']
                        break

        # Bring page to front
        await active_page.bring_to_front()
        await active_page.wait_for_load_state('load')

        return self.session
+
+ def _add_new_page_listener(self, context: PlaywrightBrowserContext):
+ async def on_page(page: Page):
+ if self.browser.config.cdp_url:
+ await page.reload() # Reload the page to avoid timeout errors
+ await page.wait_for_load_state()
+ logger.debug(f'New page opened: {page.url}')
+ if self.session is not None:
+ self.state.target_id = None
+
+ self._page_event_handler = on_page
+ context.on('page', on_page)
+
+ async def get_session(self) -> BrowserSession:
+ """Lazy initialization of the browser and related components"""
+ if self.session is None:
+ return await self._initialize_session()
+ return self.session
+
+ async def get_current_page(self) -> Page:
+ """Get the current page"""
+ session = await self.get_session()
+ return await self._get_current_page(session)
+
    async def _create_context(self, browser: PlaywrightBrowser):
        """Creates a new browser context with anti-detection measures and loads cookies if available.

        When attached over CDP or to an existing Chrome instance, the first
        pre-existing context is reused instead of creating a new one. The
        injected init script masks common automation fingerprints and forces
        shadow roots open so the DOM service can traverse them.
        """
        if self.browser.config.cdp_url and len(browser.contexts) > 0:
            context = browser.contexts[0]
        elif self.browser.config.chrome_instance_path and len(browser.contexts) > 0:
            # Connect to existing Chrome instance instead of creating new one
            context = browser.contexts[0]
        else:
            # Original code for creating new context
            context = await browser.new_context(
                viewport=self.config.browser_window_size,
                no_viewport=False,
                user_agent=self.config.user_agent,
                java_script_enabled=True,
                bypass_csp=self.config.disable_security,
                ignore_https_errors=self.config.disable_security,
                record_video_dir=self.config.save_recording_path,
                record_video_size=self.config.browser_window_size,
                locale=self.config.locale,
            )

        if self.config.trace_path:
            await context.tracing.start(screenshots=True, snapshots=True, sources=True)

        # Load cookies if they exist
        if self.config.cookies_file and os.path.exists(self.config.cookies_file):
            with open(self.config.cookies_file, 'r') as f:
                cookies = json.load(f)
                logger.info(f'Loaded {len(cookies)} cookies from {self.config.cookies_file}')
                await context.add_cookies(cookies)

        # Expose anti-detection scripts
        await context.add_init_script(
            """
            // Webdriver property
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });

            // Languages
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US']
            });

            // Plugins
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5]
            });

            // Chrome runtime
            window.chrome = { runtime: {} };

            // Permissions
            const originalQuery = window.navigator.permissions.query;
            window.navigator.permissions.query = (parameters) => (
                parameters.name === 'notifications' ?
                    Promise.resolve({ state: Notification.permission }) :
                    originalQuery(parameters)
            );
            (function () {
                const originalAttachShadow = Element.prototype.attachShadow;
                Element.prototype.attachShadow = function attachShadow(options) {
                    return originalAttachShadow.call(this, { ...options, mode: "open" });
                };
            })();
            """
        )

        return context
+
    async def _wait_for_stable_network(self):
        """Wait until page-relevant network traffic has been idle long enough.

        Attaches request/response listeners that track only requests likely to
        affect the rendered page (filtering out analytics, ads, streaming,
        websockets, prefetch, oversized responses, etc.), then polls until
        either no requests have been pending for
        `wait_for_network_idle_page_load_time` seconds, or
        `maximum_wait_page_load_time` is exceeded. Listeners are always removed.
        """
        page = await self.get_current_page()

        pending_requests = set()
        last_activity = asyncio.get_event_loop().time()

        # Define relevant resource types and content types
        RELEVANT_RESOURCE_TYPES = {
            'document',
            'stylesheet',
            'image',
            'font',
            'script',
            'iframe',
        }

        RELEVANT_CONTENT_TYPES = {
            'text/html',
            'text/css',
            'application/javascript',
            'image/',
            'font/',
            'application/json',
        }

        # Additional patterns to filter out
        IGNORED_URL_PATTERNS = {
            # Analytics and tracking
            'analytics',
            'tracking',
            'telemetry',
            'beacon',
            'metrics',
            # Ad-related
            'doubleclick',
            'adsystem',
            'adserver',
            'advertising',
            # Social media widgets
            'facebook.com/plugins',
            'platform.twitter',
            'linkedin.com/embed',
            # Live chat and support
            'livechat',
            'zendesk',
            'intercom',
            'crisp.chat',
            'hotjar',
            # Push notifications
            'push-notifications',
            'onesignal',
            'pushwoosh',
            # Background sync/heartbeat
            'heartbeat',
            'ping',
            'alive',
            # WebRTC and streaming
            'webrtc',
            'rtmp://',
            'wss://',
            # Common CDNs for dynamic content
            'cloudfront.net',
            'fastly.net',
        }

        async def on_request(request):
            # Filter by resource type
            if request.resource_type not in RELEVANT_RESOURCE_TYPES:
                return

            # Filter out streaming, websocket, and other real-time requests
            # NOTE(review): these types are already excluded by the allowlist
            # check above, so this branch appears redundant — kept as-is.
            if request.resource_type in {
                'websocket',
                'media',
                'eventsource',
                'manifest',
                'other',
            }:
                return

            # Filter out by URL patterns
            url = request.url.lower()
            if any(pattern in url for pattern in IGNORED_URL_PATTERNS):
                return

            # Filter out data URLs and blob URLs
            if url.startswith(('data:', 'blob:')):
                return

            # Filter out requests with certain headers
            headers = request.headers
            if headers.get('purpose') == 'prefetch' or headers.get('sec-fetch-dest') in [
                'video',
                'audio',
            ]:
                return

            nonlocal last_activity
            pending_requests.add(request)
            last_activity = asyncio.get_event_loop().time()
            # logger.debug(f'Request started: {request.url} ({request.resource_type})')

        async def on_response(response):
            request = response.request
            if request not in pending_requests:
                return

            # Filter by content type if available
            content_type = response.headers.get('content-type', '').lower()

            # Skip if content type indicates streaming or real-time data
            if any(
                t in content_type
                for t in [
                    'streaming',
                    'video',
                    'audio',
                    'webm',
                    'mp4',
                    'event-stream',
                    'websocket',
                    'protobuf',
                ]
            ):
                pending_requests.remove(request)
                return

            # Only process relevant content types
            if not any(ct in content_type for ct in RELEVANT_CONTENT_TYPES):
                pending_requests.remove(request)
                return

            # Skip if response is too large (likely not essential for page load)
            content_length = response.headers.get('content-length')
            if content_length and int(content_length) > 5 * 1024 * 1024:  # 5MB
                pending_requests.remove(request)
                return

            nonlocal last_activity
            pending_requests.remove(request)
            last_activity = asyncio.get_event_loop().time()
            # logger.debug(f'Request resolved: {request.url} ({content_type})')

        # Attach event listeners
        page.on('request', on_request)
        page.on('response', on_response)

        try:
            # Wait for idle time
            start_time = asyncio.get_event_loop().time()
            while True:
                await asyncio.sleep(0.1)
                now = asyncio.get_event_loop().time()
                if len(pending_requests) == 0 and (now - last_activity) >= self.config.wait_for_network_idle_page_load_time:
                    break
                if now - start_time > self.config.maximum_wait_page_load_time:
                    logger.debug(
                        f'Network timeout after {self.config.maximum_wait_page_load_time}s with {len(pending_requests)} '
                        f'pending requests: {[r.url for r in pending_requests]}'
                    )
                    break

        finally:
            # Clean up event listeners
            page.remove_listener('request', on_request)
            page.remove_listener('response', on_response)

        logger.debug(f'Network stabilized for {self.config.wait_for_network_idle_page_load_time} seconds')
+
+ async def _wait_for_page_and_frames_load(self, timeout_overwrite: float | None = None):
+ """
+ Ensures page is fully loaded before continuing.
+ Waits for either network to be idle or minimum WAIT_TIME, whichever is longer.
+ Also checks if the loaded URL is allowed.
+ """
+ # Start timing
+ start_time = time.time()
+
+ # Wait for page load
+ try:
+ await self._wait_for_stable_network()
+
+ # Check if the loaded URL is allowed
+ page = await self.get_current_page()
+ await self._check_and_handle_navigation(page)
+ except URLNotAllowedError as e:
+ raise e
+ except Exception:
+ logger.warning('Page load failed, continuing...')
+ pass
+
+ # Calculate remaining time to meet minimum WAIT_TIME
+ elapsed = time.time() - start_time
+ remaining = max((timeout_overwrite or self.config.minimum_wait_page_load_time) - elapsed, 0)
+
+ logger.debug(f'--Page loaded in {elapsed:.2f} seconds, waiting for additional {remaining:.2f} seconds')
+
+ # Sleep remaining time if needed
+ if remaining > 0:
+ await asyncio.sleep(remaining)
+
+ def _is_url_allowed(self, url: str) -> bool:
+ """Check if a URL is allowed based on the whitelist configuration."""
+ if not self.config.allowed_domains:
+ return True
+
+ try:
+ from urllib.parse import urlparse
+
+ parsed_url = urlparse(url)
+ domain = parsed_url.netloc.lower()
+
+ # Remove port number if present
+ if ':' in domain:
+ domain = domain.split(':')[0]
+
+ # Check if domain matches any allowed domain pattern
+ return any(
+ domain == allowed_domain.lower() or domain.endswith('.' + allowed_domain.lower())
+ for allowed_domain in self.config.allowed_domains
+ )
+ except Exception as e:
+ logger.error(f'Error checking URL allowlist: {str(e)}')
+ return False
+
+ async def _check_and_handle_navigation(self, page: Page) -> None:
+ """Check if current page URL is allowed and handle if not."""
+ if not self._is_url_allowed(page.url):
+ logger.warning(f'Navigation to non-allowed URL detected: {page.url}')
+ try:
+ await self.go_back()
+ except Exception as e:
+ logger.error(f'Failed to go back after detecting non-allowed URL: {str(e)}')
+ raise URLNotAllowedError(f'Navigation to non-allowed URL: {page.url}')
+
+ async def navigate_to(self, url: str):
+ """Navigate to a URL"""
+ if not self._is_url_allowed(url):
+ raise BrowserError(f'Navigation to non-allowed URL: {url}')
+
+ page = await self.get_current_page()
+ await page.goto(url)
+ await page.wait_for_load_state()
+
+ async def refresh_page(self):
+ """Refresh the current page"""
+ page = await self.get_current_page()
+ await page.reload()
+ await page.wait_for_load_state()
+
+ async def go_back(self):
+ """Navigate back in history"""
+ page = await self.get_current_page()
+ try:
+ # 10 ms timeout
+ await page.go_back(timeout=10, wait_until='domcontentloaded')
+ # await self._wait_for_page_and_frames_load(timeout_overwrite=1.0)
+ except Exception as e:
+ # Continue even if its not fully loaded, because we wait later for the page to load
+ logger.debug(f'During go_back: {e}')
+
+ async def go_forward(self):
+ """Navigate forward in history"""
+ page = await self.get_current_page()
+ try:
+ await page.go_forward(timeout=10, wait_until='domcontentloaded')
+ except Exception as e:
+ # Continue even if its not fully loaded, because we wait later for the page to load
+ logger.debug(f'During go_forward: {e}')
+
+ async def close_current_tab(self):
+ """Close the current tab"""
+ session = await self.get_session()
+ page = await self._get_current_page(session)
+ await page.close()
+
+ # Switch to the first available tab if any exist
+ if session.context.pages:
+ await self.switch_to_tab(0)
+
+ # otherwise the browser will be closed
+
+ async def get_page_html(self) -> str:
+ """Get the current page HTML content"""
+ page = await self.get_current_page()
+ return await page.content()
+
+ async def execute_javascript(self, script: str):
+ """Execute JavaScript code on the page"""
+ page = await self.get_current_page()
+ return await page.evaluate(script)
+
    async def get_page_structure(self) -> str:
        """Get a debug view of the page structure including iframes.

        Runs an in-page script that walks the DOM (up to depth 10), printing
        one indented line per element with id/class and a few useful
        attributes, and recursing into same-origin iframes. Returns the
        resulting plain-text outline.
        """
        # The entire traversal runs inside the page; only the final string
        # crosses the Playwright bridge.
        debug_script = """(() => {
        function getPageStructure(element = document, depth = 0, maxDepth = 10) {
            if (depth >= maxDepth) return '';

            const indent = '  '.repeat(depth);
            let structure = '';

            // Skip certain elements that clutter the output
            const skipTags = new Set(['script', 'style', 'link', 'meta', 'noscript']);

            // Add current element info if it's not the document
            if (element !== document) {
                const tagName = element.tagName.toLowerCase();

                // Skip uninteresting elements
                if (skipTags.has(tagName)) return '';

                const id = element.id ? `#${element.id}` : '';
                const classes = element.className && typeof element.className === 'string' ?
                    `.${element.className.split(' ').filter(c => c).join('.')}` : '';

                // Get additional useful attributes
                const attrs = [];
                if (element.getAttribute('role')) attrs.push(`role="${element.getAttribute('role')}"`);
                if (element.getAttribute('aria-label')) attrs.push(`aria-label="${element.getAttribute('aria-label')}"`);
                if (element.getAttribute('type')) attrs.push(`type="${element.getAttribute('type')}"`);
                if (element.getAttribute('name')) attrs.push(`name="${element.getAttribute('name')}"`);
                if (element.getAttribute('src')) {
                    const src = element.getAttribute('src');
                    attrs.push(`src="${src.substring(0, 50)}${src.length > 50 ? '...' : ''}"`);
                }

                // Add element info
                structure += `${indent}${tagName}${id}${classes}${attrs.length ? ' [' + attrs.join(', ') + ']' : ''}\\n`;

                // Handle iframes specially
                if (tagName === 'iframe') {
                    try {
                        const iframeDoc = element.contentDocument || element.contentWindow?.document;
                        if (iframeDoc) {
                            structure += `${indent}  [IFRAME CONTENT]:\\n`;
                            structure += getPageStructure(iframeDoc, depth + 2, maxDepth);
                        } else {
                            structure += `${indent}  [IFRAME: No access - likely cross-origin]\\n`;
                        }
                    } catch (e) {
                        structure += `${indent}  [IFRAME: Access denied - ${e.message}]\\n`;
                    }
                }
            }

            // Get all child elements
            const children = element.children || element.childNodes;
            for (const child of children) {
                if (child.nodeType === 1) { // Element nodes only
                    structure += getPageStructure(child, depth + 1, maxDepth);
                }
            }

            return structure;
        }

        return getPageStructure();
    })()"""

        page = await self.get_current_page()
        structure = await page.evaluate(debug_script)
        return structure
+
+ @time_execution_sync('--get_state') # This decorator might need to be updated to handle async
+ async def get_state(self) -> BrowserState:
+ """Get the current state of the browser"""
+ await self._wait_for_page_and_frames_load()
+ session = await self.get_session()
+ session.cached_state = await self._update_state()
+
+ # Save cookies if a file is specified
+ if self.config.cookies_file:
+ asyncio.create_task(self.save_cookies())
+
+ return session.cached_state
+
    async def _update_state(self, focus_element: int = -1) -> BrowserState:
        """Update and return state.

        Verifies the current page is still alive (switching to another open
        page if it was closed), rebuilds the clickable-element tree via the
        DOM service, takes a screenshot, and caches the assembled
        BrowserState on `self.current_state`. On failure, the last known good
        state is returned if one exists; otherwise the error propagates.

        Args:
            focus_element: highlight index of the element to focus, -1 for none.
        """
        session = await self.get_session()

        # Check if current page is still valid, if not switch to another available page
        try:
            page = await self.get_current_page()
            # Test if page is still accessible
            await page.evaluate('1')
        except Exception as e:
            logger.debug(f'Current page is no longer accessible: {str(e)}')
            # Get all available pages
            pages = session.context.pages
            if pages:
                # Drop the stale CDP target so _get_current_page re-resolves.
                self.state.target_id = None
                page = await self._get_current_page(session)
                logger.debug(f'Switched to page: {await page.title()}')
            else:
                raise BrowserError('Browser closed: no valid pages available')

        try:
            # Clear stale highlight overlays before re-scanning the DOM.
            await self.remove_highlights()
            dom_service = DomService(page)
            content = await dom_service.get_clickable_elements(
                focus_element=focus_element,
                viewport_expansion=self.config.viewport_expansion,
                highlight_elements=self.config.highlight_elements,
            )

            screenshot_b64 = await self.take_screenshot()
            pixels_above, pixels_below = await self.get_scroll_info(page)

            self.current_state = BrowserState(
                element_tree=content.element_tree,
                selector_map=content.selector_map,
                url=page.url,
                title=await page.title(),
                tabs=await self.get_tabs_info(),
                screenshot=screenshot_b64,
                pixels_above=pixels_above,
                pixels_below=pixels_below,
            )

            return self.current_state
        except Exception as e:
            logger.error(f'Failed to update state: {str(e)}')
            # Return last known good state if available
            if hasattr(self, 'current_state'):
                return self.current_state
            raise
+
+ # region - Browser Actions
+ @time_execution_async('--take_screenshot')
+ async def take_screenshot(self, full_page: bool = False) -> str:
+ """
+ Returns a base64 encoded screenshot of the current page.
+ """
+ page = await self.get_current_page()
+
+ await page.bring_to_front()
+ await page.wait_for_load_state()
+
+ screenshot = await page.screenshot(
+ full_page=full_page,
+ animations='disabled',
+ )
+
+ screenshot_b64 = base64.b64encode(screenshot).decode('utf-8')
+
+ # await self.remove_highlights()
+
+ return screenshot_b64
+
    @time_execution_async('--remove_highlights')
    async def remove_highlights(self):
        """
        Removes all highlight overlays and labels created by the highlightElement function.
        Handles cases where the page might be closed or inaccessible.
        """
        try:
            page = await self.get_current_page()
            # The in-page script is self-guarding: it catches its own errors so
            # a partially torn-down DOM never surfaces as a Python exception.
            await page.evaluate(
                """
                try {
                    // Remove the highlight container and all its contents
                    const container = document.getElementById('playwright-highlight-container');
                    if (container) {
                        container.remove();
                    }

                    // Remove highlight attributes from elements
                    const highlightedElements = document.querySelectorAll('[browser-user-highlight-id^="playwright-highlight-"]');
                    highlightedElements.forEach(el => {
                        el.removeAttribute('browser-user-highlight-id');
                    });
                } catch (e) {
                    console.error('Failed to remove highlights:', e);
                }
                """
            )
        except Exception as e:
            logger.debug(f'Failed to remove highlights (this is usually ok): {str(e)}')
            # Don't raise the error since this is not critical functionality
            pass
+
+ # endregion
+
+ # region - User Actions
+
+ @classmethod
+ def _convert_simple_xpath_to_css_selector(cls, xpath: str) -> str:
+ """Converts simple XPath expressions to CSS selectors."""
+ if not xpath:
+ return ''
+
+ # Remove leading slash if present
+ xpath = xpath.lstrip('/')
+
+ # Split into parts
+ parts = xpath.split('/')
+ css_parts = []
+
+ for part in parts:
+ if not part:
+ continue
+
+ # Handle index notation [n]
+ if '[' in part:
+ base_part = part[: part.find('[')]
+ index_part = part[part.find('[') :]
+
+ # Handle multiple indices
+ indices = [i.strip('[]') for i in index_part.split(']')[:-1]]
+
+ for idx in indices:
+ try:
+ # Handle numeric indices
+ if idx.isdigit():
+ index = int(idx) - 1
+ base_part += f':nth-of-type({index + 1})'
+ # Handle last() function
+ elif idx == 'last()':
+ base_part += ':last-of-type'
+ # Handle position() functions
+ elif 'position()' in idx:
+ if '>1' in idx:
+ base_part += ':nth-of-type(n+2)'
+ except ValueError:
+ continue
+
+ css_parts.append(base_part)
+ else:
+ css_parts.append(part)
+
+ base_selector = ' > '.join(css_parts)
+ return base_selector
+
    @classmethod
    @time_execution_sync('--enhanced_css_selector_for_element')
    def _enhanced_css_selector_for_element(cls, element: DOMElementNode, include_dynamic_attributes: bool = True) -> str:
        """
        Creates a CSS selector for a DOM element, handling various edge cases and special characters.

        Args:
            element: The DOM element to create a selector for
            include_dynamic_attributes: also use data-id/data-qa/data-cy/data-testid
                and class names; disable for selectors meant to be reused later.

        Returns:
            A valid CSS selector string
        """
        try:
            # Get base selector from XPath
            css_selector = cls._convert_simple_xpath_to_css_selector(element.xpath)

            # Handle class attributes
            if 'class' in element.attributes and element.attributes['class'] and include_dynamic_attributes:
                # Define a regex pattern for valid class names in CSS
                valid_class_name_pattern = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_-]*$')

                # Iterate through the class attribute values
                classes = element.attributes['class'].split()
                for class_name in classes:
                    # Skip empty class names
                    if not class_name.strip():
                        continue

                    # Check if the class name is valid
                    if valid_class_name_pattern.match(class_name):
                        # Append the valid class name to the CSS selector
                        css_selector += f'.{class_name}'
                    else:
                        # Skip invalid class names
                        continue

            # Expanded set of safe attributes that are stable and useful for selection
            SAFE_ATTRIBUTES = {
                # Data attributes (if they're stable in your application)
                'id',
                # Standard HTML attributes
                'name',
                'type',
                'placeholder',
                # Accessibility attributes
                'aria-label',
                'aria-labelledby',
                'aria-describedby',
                'role',
                # Common form attributes
                'for',
                'autocomplete',
                'required',
                'readonly',
                # Media attributes
                'alt',
                'title',
                'src',
                # Custom stable attributes (add any application-specific ones)
                'href',
                'target',
            }

            if include_dynamic_attributes:
                dynamic_attributes = {
                    'data-id',
                    'data-qa',
                    'data-cy',
                    'data-testid',
                }
                SAFE_ATTRIBUTES.update(dynamic_attributes)

            # Handle other attributes
            for attribute, value in element.attributes.items():
                if attribute == 'class':
                    continue

                # Skip invalid attribute names
                if not attribute.strip():
                    continue

                if attribute not in SAFE_ATTRIBUTES:
                    continue

                # Escape special characters in attribute names
                safe_attribute = attribute.replace(':', r'\:')

                # Handle different value cases
                if value == '':
                    css_selector += f'[{safe_attribute}]'
                elif any(char in value for char in '"\'<>`\n\r\t'):
                    # Use contains for values with special characters
                    # Regex-substitute *any* whitespace with a single space, then strip.
                    collapsed_value = re.sub(r'\s+', ' ', value).strip()
                    # Escape embedded double-quotes.
                    safe_value = collapsed_value.replace('"', '\\"')
                    css_selector += f'[{safe_attribute}*="{safe_value}"]'
                else:
                    css_selector += f'[{safe_attribute}="{value}"]'

            return css_selector

        except Exception:
            # Fallback to a more basic selector if something goes wrong
            tag_name = element.tag_name or '*'
            return f"{tag_name}[highlight_index='{element.highlight_index}']"
+
+ @time_execution_async('--get_locate_element')
+ async def get_locate_element(self, element: DOMElementNode) -> Optional[ElementHandle]:
+ current_frame = await self.get_current_page()
+
+ # Start with the target element and collect all parents
+ parents: list[DOMElementNode] = []
+ current = element
+ while current.parent is not None:
+ parent = current.parent
+ parents.append(parent)
+ current = parent
+
+ # Reverse the parents list to process from top to bottom
+ parents.reverse()
+
+ # Process all iframe parents in sequence
+ iframes = [item for item in parents if item.tag_name == 'iframe']
+ for parent in iframes:
+ css_selector = self._enhanced_css_selector_for_element(
+ parent,
+ include_dynamic_attributes=self.config.include_dynamic_attributes,
+ )
+ current_frame = current_frame.frame_locator(css_selector)
+
+ css_selector = self._enhanced_css_selector_for_element(
+ element, include_dynamic_attributes=self.config.include_dynamic_attributes
+ )
+
+ try:
+ if isinstance(current_frame, FrameLocator):
+ element_handle = await current_frame.locator(css_selector).element_handle()
+ return element_handle
+ else:
+ # Try to scroll into view if hidden
+ element_handle = await current_frame.query_selector(css_selector)
+ if element_handle:
+ await element_handle.scroll_into_view_if_needed()
+ return element_handle
+ return None
+ except Exception as e:
+ logger.error(f'Failed to locate element: {str(e)}')
+ return None
+
    @time_execution_async('--input_text_element_node')
    async def _input_text_element_node(self, element_node: DOMElementNode, text: str):
        """
        Input text into an element with proper error handling and state management.
        Handles different types of input fields and ensures proper element state before input.

        Raises:
            BrowserError: if the element cannot be located or typing fails.
        """
        try:
            # Highlight before typing
            # if element_node.highlight_index is not None:
            # 	await self._update_state(focus_element=element_node.highlight_index)

            element_handle = await self.get_locate_element(element_node)

            if element_handle is None:
                raise BrowserError(f'Element: {repr(element_node)} not found')

            # Ensure element is ready for input — best effort, short timeouts.
            try:
                await element_handle.wait_for_element_state('stable', timeout=1000)
                await element_handle.scroll_into_view_if_needed(timeout=1000)
            except Exception:
                pass

            # Get element properties to determine input method
            tag_handle = await element_handle.get_property("tagName")
            tag_name = (await tag_handle.json_value()).lower()
            is_contenteditable = await element_handle.get_property('isContentEditable')
            readonly_handle = await element_handle.get_property("readOnly")
            disabled_handle = await element_handle.get_property("disabled")

            # Properties may be absent on some element types — treat as editable.
            readonly = await readonly_handle.json_value() if readonly_handle else False
            disabled = await disabled_handle.json_value() if disabled_handle else False

            # Editable targets get cleared and typed into (fires key events);
            # everything else falls back to fill(), which sets the value directly.
            if (await is_contenteditable.json_value() or tag_name == 'input') and not (readonly or disabled):
                await element_handle.evaluate('el => el.textContent = ""')
                await element_handle.type(text, delay=5)
            else:
                await element_handle.fill(text)

        except Exception as e:
            logger.debug(f'Failed to input text into element: {repr(element_node)}. Error: {str(e)}')
            raise BrowserError(f'Failed to input text into index {element_node.highlight_index}')
+
@time_execution_async('--click_element_node')
async def _click_element_node(self, element_node: DOMElementNode) -> Optional[str]:
    """
    Optimized method to click an element using xpath.

    Returns the path of the saved file when the click triggered a download
    (and save_downloads_path is configured), otherwise None.
    """
    page = await self.get_current_page()

    try:
        # Highlight before clicking
        # if element_node.highlight_index is not None:
        # await self._update_state(focus_element=element_node.highlight_index)

        element_handle = await self.get_locate_element(element_node)

        if element_handle is None:
            raise Exception(f'Element: {repr(element_node)} not found')

        async def perform_click(click_func):
            """Performs the actual click, handling both download
            and navigation scenarios."""
            if self.config.save_downloads_path:
                try:
                    # Try short-timeout expect_download to detect that a file download has been triggered
                    async with page.expect_download(timeout=5000) as download_info:
                        await click_func()
                    download = await download_info.value
                    # Determine file path (avoid overwriting an existing download)
                    suggested_filename = download.suggested_filename
                    unique_filename = await self._get_unique_filename(self.config.save_downloads_path, suggested_filename)
                    download_path = os.path.join(self.config.save_downloads_path, unique_filename)
                    await download.save_as(download_path)
                    logger.debug(f'Download triggered. Saved file to: {download_path}')
                    return download_path
                except TimeoutError:
                    # NOTE(review): this must catch Playwright's TimeoutError for
                    # expect_download timeouts, not the builtin — confirm which
                    # TimeoutError is imported at the top of this file.
                    # If no download is triggered, treat as normal click
                    logger.debug('No download triggered within timeout. Checking navigation...')
                    await page.wait_for_load_state()
                    await self._check_and_handle_navigation(page)
            else:
                # Standard click logic if no download is expected
                await click_func()
                await page.wait_for_load_state()
                await self._check_and_handle_navigation(page)

        try:
            # First attempt: a normal Playwright click with a short timeout.
            return await perform_click(lambda: element_handle.click(timeout=1500))
        except URLNotAllowedError as e:
            raise e
        except Exception:
            # Fallback: dispatch the click via JS for elements Playwright
            # refuses to click (covered, off-screen, unstable, etc.).
            try:
                return await perform_click(lambda: page.evaluate('(el) => el.click()', element_handle))
            except URLNotAllowedError as e:
                raise e
            except Exception as e:
                raise Exception(f'Failed to click element: {str(e)}')

    except URLNotAllowedError as e:
        # Propagate URL-policy violations unchanged so callers can handle them.
        raise e
    except Exception as e:
        raise Exception(f'Failed to click element: {repr(element_node)}. Error: {str(e)}')
+
@time_execution_async('--get_tabs_info')
async def get_tabs_info(self) -> list[TabInfo]:
    """Collect page_id/url/title metadata for every open tab in the session context."""
    session = await self.get_session()
    return [
        TabInfo(page_id=idx, url=tab.url, title=await tab.title())
        for idx, tab in enumerate(session.context.pages)
    ]
+
@time_execution_async('--switch_to_tab')
async def switch_to_tab(self, page_id: int) -> None:
    """Bring the tab with the given page_id to the front and wait for it to load.

    Raises:
        BrowserError: if the page_id does not exist or the tab's URL is not allowed.
    """
    session = await self.get_session()
    pages = session.context.pages

    if page_id >= len(pages):
        raise BrowserError(f'No tab found with page_id: {page_id}')

    page = pages[page_id]

    # Refuse to switch into a tab whose URL violates the allow-list.
    if not self._is_url_allowed(page.url):
        raise BrowserError(f'Cannot switch to tab with non-allowed URL: {page.url}')

    # When driving the browser over CDP, keep the tracked target id in sync.
    if self.browser.config.cdp_url:
        matches = (t['targetId'] for t in await self._get_cdp_targets() if t['url'] == page.url)
        found = next(matches, None)
        if found is not None:
            self.state.target_id = found

    await page.bring_to_front()
    await page.wait_for_load_state()
+
@time_execution_async('--create_new_tab')
async def create_new_tab(self, url: str | None = None) -> None:
    """Open a fresh tab, optionally navigating it to `url` (which must be allowed).

    Raises:
        BrowserError: if `url` is given but not on the allow-list.
    """
    if url and not self._is_url_allowed(url):
        raise BrowserError(f'Cannot create new tab with non-allowed URL: {url}')

    session = await self.get_session()
    new_page = await session.context.new_page()
    await new_page.wait_for_load_state()

    if url:
        await new_page.goto(url)
        await self._wait_for_page_and_frames_load(timeout_overwrite=1)

    # When driving over CDP, record the target id of the page we just opened.
    if self.browser.config.cdp_url:
        for target in await self._get_cdp_targets():
            if target['url'] == new_page.url:
                self.state.target_id = target['targetId']
                break
+
+ # endregion
+
+ # region - Helper methods for easier access to the DOM
+ async def _get_current_page(self, session: BrowserSession) -> Page:
+ pages = session.context.pages
+
+ # Try to find page by target ID if using CDP
+ if self.browser.config.cdp_url and self.state.target_id:
+ targets = await self._get_cdp_targets()
+ for target in targets:
+ if target['targetId'] == self.state.target_id:
+ for page in pages:
+ if page.url == target['url']:
+ return page
+
+ # Fallback to last page
+ return pages[-1] if pages else await session.context.new_page()
+
async def get_selector_map(self) -> SelectorMap:
    """Return the session's cached selector map, or an empty dict when no state is cached."""
    session = await self.get_session()
    cached = session.cached_state
    return cached.selector_map if cached is not None else {}
+
async def get_element_by_index(self, index: int) -> ElementHandle | None:
    """Locate the live element handle for the DOM node stored under `index` in the selector map."""
    selector_map = await self.get_selector_map()
    return await self.get_locate_element(selector_map[index])
+
async def get_dom_element_by_index(self, index: int) -> DOMElementNode:
    """Return the cached DOM node registered under `index` in the selector map."""
    return (await self.get_selector_map())[index]
+
async def save_cookies(self):
    """Persist the context's current cookies to the configured cookies file (best effort).

    No-op when there is no session/context or no cookies_file configured;
    failures are logged as warnings rather than raised.
    """
    if not (self.session and self.session.context and self.config.cookies_file):
        return

    try:
        cookies = await self.session.context.cookies()
        logger.debug(f'Saving {len(cookies)} cookies to {self.config.cookies_file}')

        # Create the parent directory first when the file lives in a subfolder.
        dirname = os.path.dirname(self.config.cookies_file)
        if dirname:
            os.makedirs(dirname, exist_ok=True)

        with open(self.config.cookies_file, 'w') as f:
            json.dump(cookies, f)
    except Exception as e:
        logger.warning(f'Failed to save cookies: {str(e)}')
+
async def is_file_uploader(self, element_node: DOMElementNode, max_depth: int = 3, current_depth: int = 0) -> bool:
    """Check whether `element_node` (or any descendant within `max_depth`) is a file-upload input."""
    if current_depth > max_depth:
        return False

    # Non-element nodes (e.g. text nodes) can never be uploaders.
    if not isinstance(element_node, DOMElementNode):
        return False

    # An <input> counts as an uploader when it is type="file" or carries an accept filter.
    if element_node.tag_name == 'input':
        attrs = element_node.attributes
        if attrs.get('type') == 'file' or attrs.get('accept') is not None:
            return True

    # Descend into element children until the depth budget is exhausted.
    if element_node.children and current_depth < max_depth:
        for child in element_node.children:
            if not isinstance(child, DOMElementNode):
                continue
            if await self.is_file_uploader(child, max_depth, current_depth + 1):
                return True

    return False
+
async def get_scroll_info(self, page: Page) -> tuple[int, int]:
    """Get scroll position information for the current page.

    Returns:
        (pixels_above, pixels_below): document pixels scrolled out of view
        above the viewport, and pixels remaining below it.
    """
    # Read all three values in a single page.evaluate() round-trip instead of
    # three separate IPC calls (same values, one browser hop).
    scroll_y, viewport_height, total_height = await page.evaluate(
        '() => [window.scrollY, window.innerHeight, document.documentElement.scrollHeight]'
    )
    pixels_above = scroll_y
    pixels_below = total_height - (scroll_y + viewport_height)
    return pixels_above, pixels_below
+
async def reset_context(self):
    """Close all tabs and clear cached session state.

    Use this to reset state while keeping the browser context itself alive.
    """
    session = await self.get_session()

    # Close every open tab in the context.
    for tab in session.context.pages:
        await tab.close()

    # Drop the cached DOM state and any tracked CDP target.
    session.cached_state = None
    self.state.target_id = None
+
+ async def _get_unique_filename(self, directory, filename):
+ """Generate a unique filename by appending (1), (2), etc., if a file already exists."""
+ base, ext = os.path.splitext(filename)
+ counter = 1
+ new_filename = filename
+ while os.path.exists(os.path.join(directory, new_filename)):
+ new_filename = f'{base} ({counter}){ext}'
+ counter += 1
+ return new_filename
+
async def _get_cdp_targets(self) -> list[dict]:
    """Get all CDP targets directly using the CDP protocol.

    Returns:
        The 'targetInfos' list from Target.getTargets, or [] when CDP is not
        in use, no pages exist, or the query fails (failures are logged at
        debug level, never raised).
    """
    if not self.browser.config.cdp_url or not self.session:
        return []

    try:
        pages = self.session.context.pages
        if not pages:
            return []

        cdp_session = await pages[0].context.new_cdp_session(pages[0])
        try:
            result = await cdp_session.send('Target.getTargets')
        finally:
            # Always detach the throwaway session, even if send() fails,
            # so we do not leak CDP sessions on errors.
            await cdp_session.detach()
        return result.get('targetInfos', [])
    except Exception as e:
        logger.debug(f'Failed to get CDP targets: {e}')
        return []
diff --git a/browser_use/browser/tests/screenshot_test.py b/browser_use/browser/tests/screenshot_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..7255ccb615e9a89d2a2e87b4949ae9b581e5c57f
--- /dev/null
+++ b/browser_use/browser/tests/screenshot_test.py
@@ -0,0 +1,37 @@
+import base64
+
+import pytest
+
+from browser_use.browser.browser import Browser, BrowserConfig
+
+
@pytest.fixture
async def browser():
    """Yield a headless Browser instance; it is closed during fixture teardown."""
    instance = Browser(config=BrowserConfig(headless=True))
    yield instance
    await instance.close()
+
+
# @pytest.mark.skip(reason='takes too long')
def test_take_full_page_screenshot(browser):
    """Smoke test: a full-page screenshot comes back as a non-empty, decodable base64 string.

    NOTE(review): `browser.go_to_url` / `browser.take_screenshot` look like
    coroutines on an async Browser API but are called here without `await`
    from a sync test — confirm this test actually exercises the browser.
    """
    # Go to a test page
    browser.go_to_url('https://example.com')

    # Take full page screenshot
    screenshot_b64 = browser.take_screenshot(full_page=True)

    # Verify screenshot is not empty and is valid base64
    assert screenshot_b64 is not None
    assert isinstance(screenshot_b64, str)
    assert len(screenshot_b64) > 0

    # Test we can decode the base64 string
    try:
        base64.b64decode(screenshot_b64)
    except Exception as e:
        pytest.fail(f'Failed to decode base64 screenshot: {str(e)}')
+
+
# Allow running this check directly (with a visible browser) without pytest.
if __name__ == '__main__':
    test_take_full_page_screenshot(Browser(config=BrowserConfig(headless=False)))
diff --git a/browser_use/browser/tests/test_clicks.py b/browser_use/browser/tests/test_clicks.py
new file mode 100644
index 0000000000000000000000000000000000000000..98ca74354c70406a8d1d701f86c32787b725c930
--- /dev/null
+++ b/browser_use/browser/tests/test_clicks.py
@@ -0,0 +1,94 @@
+import asyncio
+import json
+
+import pytest
+
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.dom.views import DOMBaseNode, DOMElementNode, DOMTextNode
+from browser_use.utils import time_execution_sync
+
+
class ElementTreeSerializer:
    """Helpers for converting a DOM element tree into plain JSON-serializable dicts."""

    @staticmethod
    def dom_element_node_to_json(element_tree: DOMElementNode) -> dict:
        """Recursively serialize `element_tree`; unrecognized node types become {}."""

        def serialize(node: DOMBaseNode) -> dict:
            if isinstance(node, DOMTextNode):
                return {'type': 'text', 'text': node.text}
            if isinstance(node, DOMElementNode):
                return {
                    'type': 'element',
                    'tag_name': node.tag_name,
                    'attributes': node.attributes,
                    'highlight_index': node.highlight_index,
                    'children': [serialize(child) for child in node.children],
                }
            return {}

        return serialize(element_tree)
+
+
+# run with: pytest browser_use/browser/tests/test_clicks.py
@pytest.mark.asyncio
async def test_highlight_elements():
    """Interactive manual test: dump the DOM state, print clickable elements,
    and click whichever highlight index the operator types in.

    NOTE: runs an infinite input() loop with headless=False — intended for
    manual debugging at a terminal, not for CI.
    """
    browser = Browser(config=BrowserConfig(headless=False, disable_security=True))

    async with await browser.new_context() as context:
        page = await context.get_current_page()
        # Alternative pages used during development:
        # await page.goto('https://immobilienscout24.de')
        # await page.goto('https://help.sap.com/docs/sap-ai-core/sap-ai-core-service-guide/service-plans')
        # await page.goto('https://google.com/search?q=elon+musk')
        # await page.goto('https://kayak.com')
        # await page.goto('https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe')
        # await page.goto('https://dictionary.cambridge.org')
        # await page.goto('https://github.com')
        await page.goto('https://huggingface.co/')

        await asyncio.sleep(1)

        while True:
            try:
                # await asyncio.sleep(10)
                state = await context.get_state()

                # Dump the serialized element tree for offline inspection.
                with open('./tmp/page.json', 'w') as f:
                    json.dump(
                        ElementTreeSerializer.dom_element_node_to_json(state.element_tree),
                        f,
                        indent=1,
                    )

                # await time_execution_sync('highlight_selector_map_elements')(
                # browser.highlight_selector_map_elements
                # )(state.selector_map)

                # Find and print duplicate XPaths
                xpath_counts = {}
                if not state.selector_map:
                    continue
                for selector in state.selector_map.values():
                    xpath = selector.xpath
                    if xpath in xpath_counts:
                        xpath_counts[xpath] += 1
                    else:
                        xpath_counts[xpath] = 1

                print('\nDuplicate XPaths found:')
                for xpath, count in xpath_counts.items():
                    if count > 1:
                        print(f'XPath: {xpath}')
                        print(f'Count: {count}\n')

                print(list(state.selector_map.keys()), 'Selector map keys')
                print(state.element_tree.clickable_elements_to_string())
                action = input('Select next action: ')

                await time_execution_sync('remove_highlight_elements')(context.remove_highlights)()

                node_element = state.selector_map[int(action)]

                # check if index of selector map are the same as index of items in dom_items

                await context._click_element_node(node_element)

            except Exception as e:
                # Keep the interactive loop alive on errors (e.g. a bad input index).
                print(e)
diff --git a/browser_use/browser/views.py b/browser_use/browser/views.py
new file mode 100644
index 0000000000000000000000000000000000000000..3434d86e2691caba5715f4028bd3d849d07bf9e7
--- /dev/null
+++ b/browser_use/browser/views.py
@@ -0,0 +1,53 @@
+from dataclasses import dataclass, field
+from typing import Any, Optional
+
+from pydantic import BaseModel
+
+from browser_use.dom.history_tree_processor.service import DOMHistoryElement
+from browser_use.dom.views import DOMState
+
+
+# Pydantic
class TabInfo(BaseModel):
    """Represents information about a browser tab"""

    page_id: int  # index of the tab within the browser context's page list
    url: str  # URL currently loaded in the tab
    title: str  # page title at the time the snapshot was taken
+
+
@dataclass
class BrowserState(DOMState):
    """Snapshot of the browser at one step: page metadata plus the inherited DOM state."""

    url: str  # URL of the active page
    title: str  # title of the active page
    tabs: list[TabInfo]  # metadata for every open tab
    screenshot: Optional[str] = None  # base64-encoded screenshot, if one was captured
    pixels_above: int = 0  # document pixels scrolled out of view above the viewport
    pixels_below: int = 0  # document pixels remaining below the viewport
    browser_errors: list[str] = field(default_factory=list)  # non-fatal errors collected this step
+
+
@dataclass
class BrowserStateHistory:
    """Serializable record of the browser state as it was at a past step."""

    url: str
    title: str
    tabs: list[TabInfo]
    interacted_element: list[DOMHistoryElement | None] | list[None]
    screenshot: Optional[str] = None

    def to_dict(self) -> dict[str, Any]:
        """Convert to a plain dict suitable for JSON serialization."""
        return {
            'tabs': [tab.model_dump() for tab in self.tabs],
            'screenshot': self.screenshot,
            'interacted_element': [el.to_dict() if el else None for el in self.interacted_element],
            'url': self.url,
            'title': self.title,
        }
+
+
class BrowserError(Exception):
    """Base class for all browser errors raised by this package."""
+
+
class URLNotAllowedError(BrowserError):
    """Raised when navigation targets a URL outside the configured allow-list."""
diff --git a/browser_use/controller/registry/service.py b/browser_use/controller/registry/service.py
new file mode 100644
index 0000000000000000000000000000000000000000..be52a4b68074f5e8c6dffafa504b57483f1fd7ba
--- /dev/null
+++ b/browser_use/controller/registry/service.py
@@ -0,0 +1,199 @@
+import asyncio
+from inspect import iscoroutinefunction, signature
+from typing import Any, Callable, Dict, Generic, Optional, Type, TypeVar
+
+from langchain_core.language_models.chat_models import BaseChatModel
+from pydantic import BaseModel, Field, create_model
+
+from browser_use.browser.context import BrowserContext
+from browser_use.controller.registry.views import (
+ ActionModel,
+ ActionRegistry,
+ RegisteredAction,
+)
+from browser_use.telemetry.service import ProductTelemetry
+from browser_use.telemetry.views import (
+ ControllerRegisteredFunctionsTelemetryEvent,
+ RegisteredFunction,
+)
+from browser_use.utils import time_execution_async, time_execution_sync
+
+Context = TypeVar('Context')
+
+
class Registry(Generic[Context]):
    """Service for registering and managing actions the agent can execute."""

    def __init__(self, exclude_actions: list[str] | None = None):
        # Registry of all available actions, keyed by function name.
        self.registry = ActionRegistry()
        self.telemetry = ProductTelemetry()
        # Action names that must never be registered (feature gating).
        self.exclude_actions = exclude_actions if exclude_actions is not None else []

    @time_execution_sync('--create_param_model')
    def _create_param_model(self, function: Callable) -> Type[BaseModel]:
        """Create a Pydantic parameter model from a function signature.

        Framework-injected parameters ('browser', 'page_extraction_llm',
        'available_file_paths') are excluded: they are supplied by
        execute_action, not chosen by the LLM.
        """
        sig = signature(function)
        injected = {'browser', 'page_extraction_llm', 'available_file_paths'}
        params = {
            name: (param.annotation, ... if param.default == param.empty else param.default)
            for name, param in sig.parameters.items()
            if name not in injected
        }
        # TODO: make the types here work
        return create_model(
            f'{function.__name__}_parameters',
            __base__=ActionModel,
            **params,  # type: ignore
        )

    def action(
        self,
        description: str,
        param_model: Optional[Type[BaseModel]] = None,
    ):
        """Decorator for registering actions.

        Args:
            description: prompt-facing description of what the action does.
            param_model: optional explicit parameter model; derived from the
                function signature when omitted.
        """

        def decorator(func: Callable):
            # Skip registration if action is in exclude_actions
            if func.__name__ in self.exclude_actions:
                return func

            # Create param model from function if not provided
            actual_param_model = param_model or self._create_param_model(func)

            # Wrap sync functions so every registered action is awaitable;
            # asyncio.to_thread keeps blocking work off the event loop.
            if not iscoroutinefunction(func):

                async def async_wrapper(*args, **kwargs):
                    return await asyncio.to_thread(func, *args, **kwargs)

                # Copy signature and metadata so downstream introspection
                # (parameter inspection in execute_action) keeps working.
                async_wrapper.__signature__ = signature(func)
                async_wrapper.__name__ = func.__name__
                async_wrapper.__annotations__ = func.__annotations__
                wrapped_func = async_wrapper
            else:
                wrapped_func = func

            self.registry.actions[func.__name__] = RegisteredAction(
                name=func.__name__,
                description=description,
                function=wrapped_func,
                param_model=actual_param_model,
            )
            return func

        return decorator

    @time_execution_async('--execute_action')
    async def execute_action(
        self,
        action_name: str,
        params: dict,
        browser: Optional[BrowserContext] = None,
        page_extraction_llm: Optional[BaseChatModel] = None,
        sensitive_data: Optional[Dict[str, str]] = None,
        available_file_paths: Optional[list[str]] = None,
        #
        context: Context | None = None,
    ) -> Any:
        """Execute a registered action with validated parameters.

        Raises:
            ValueError: unknown action, or a declared injected dependency is missing.
            RuntimeError: the action itself failed (original exception chained).
        """
        if action_name not in self.registry.actions:
            raise ValueError(f'Action {action_name} not found')

        action = self.registry.actions[action_name]
        try:
            # Validate the raw params against the action's Pydantic model.
            validated_params = action.param_model(**params)

            # Detect the calling convention: a single Pydantic model as the
            # first parameter vs. plain keyword arguments.
            sig = signature(action.function)
            parameters = list(sig.parameters.values())
            # Guard with isinstance(..., type): annotations like Optional[X]
            # are not classes and would make issubclass() raise TypeError.
            is_pydantic = (
                bool(parameters)
                and isinstance(parameters[0].annotation, type)
                and issubclass(parameters[0].annotation, BaseModel)
            )
            parameter_names = [param.name for param in parameters]

            if sensitive_data:
                validated_params = self._replace_sensitive_data(validated_params, sensitive_data)

            # Fail fast when the action declares an injected dependency we lack.
            if 'browser' in parameter_names and not browser:
                raise ValueError(f'Action {action_name} requires browser but none provided.')
            if 'page_extraction_llm' in parameter_names and not page_extraction_llm:
                raise ValueError(f'Action {action_name} requires page_extraction_llm but none provided.')
            if 'available_file_paths' in parameter_names and not available_file_paths:
                raise ValueError(f'Action {action_name} requires available_file_paths but none provided.')
            if 'context' in parameter_names and not context:
                raise ValueError(f'Action {action_name} requires context but none provided.')

            # Inject only the dependencies the action actually declares.
            extra_args = {}
            if 'context' in parameter_names:
                extra_args['context'] = context
            if 'browser' in parameter_names:
                extra_args['browser'] = browser
            if 'page_extraction_llm' in parameter_names:
                extra_args['page_extraction_llm'] = page_extraction_llm
            if 'available_file_paths' in parameter_names:
                extra_args['available_file_paths'] = available_file_paths
            if action_name == 'input_text' and sensitive_data:
                extra_args['has_sensitive_data'] = True

            if is_pydantic:
                return await action.function(validated_params, **extra_args)
            return await action.function(**validated_params.model_dump(), **extra_args)

        except Exception as e:
            raise RuntimeError(f'Error executing action {action_name}: {str(e)}') from e

    def _replace_sensitive_data(self, params: BaseModel, sensitive_data: Dict[str, str]) -> BaseModel:
        """Substitute <secret>placeholder</secret> markers in string params with real values.

        NOTE(review): the placeholder pattern in the incoming code was garbled
        (the literal '<secret>'/'</secret>' tags were missing from both the
        regex and the replacement); restored here so placeholders actually match.
        """
        import re

        secret_pattern = re.compile(r'<secret>(.*?)</secret>')

        def replace_secrets(value):
            if isinstance(value, str):
                for placeholder in secret_pattern.findall(value):
                    if placeholder in sensitive_data:
                        value = value.replace(f'<secret>{placeholder}</secret>', sensitive_data[placeholder])
                return value
            if isinstance(value, dict):
                return {k: replace_secrets(v) for k, v in value.items()}
            if isinstance(value, list):
                return [replace_secrets(v) for v in value]
            return value

        # Rewrite the already-validated model's fields in place.
        for key, value in params.model_dump().items():
            params.__dict__[key] = replace_secrets(value)
        return params

    @time_execution_sync('--create_action_model')
    def create_action_model(self, include_actions: Optional[list[str]] = None) -> Type[ActionModel]:
        """Create a union ActionModel with one optional field per registered action."""
        fields = {
            name: (
                Optional[action.param_model],
                Field(default=None, description=action.description),
            )
            for name, action in self.registry.actions.items()
            if include_actions is None or name in include_actions
        }

        self.telemetry.capture(
            ControllerRegisteredFunctionsTelemetryEvent(
                registered_functions=[
                    RegisteredFunction(name=name, params=action.param_model.model_json_schema())
                    for name, action in self.registry.actions.items()
                    if include_actions is None or name in include_actions
                ]
            )
        )

        return create_model('ActionModel', __base__=ActionModel, **fields)  # type:ignore

    def get_prompt_description(self) -> str:
        """Get a description of all actions for the prompt"""
        return self.registry.get_prompt_description()
diff --git a/browser_use/controller/registry/views.py b/browser_use/controller/registry/views.py
new file mode 100644
index 0000000000000000000000000000000000000000..211c767a31609ee4afc46d50fa8353fc16f391de
--- /dev/null
+++ b/browser_use/controller/registry/views.py
@@ -0,0 +1,70 @@
+from typing import Callable, Dict, Type
+
+from pydantic import BaseModel, ConfigDict
+
+
class RegisteredAction(BaseModel):
    """Model for a registered action"""

    name: str  # action identifier used by the LLM
    description: str  # prompt-facing description of what the action does
    function: Callable  # async callable executed when the action runs
    param_model: Type[BaseModel]  # pydantic model validating the action's parameters

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def prompt_description(self) -> str:
        """Get a description of the action for the prompt"""
        skip_keys = ['title']
        s = f'{self.description}: \n'
        s += '{' + str(self.name) + ': '
        s += str(
            {
                k: {sub_k: sub_v for sub_k, sub_v in v.items() if sub_k not in skip_keys}
                for k, v in self.param_model.model_json_schema()['properties'].items()
                # model_json_schema() replaces .schema(), which is deprecated
                # in pydantic v2 (this module already uses v2's ConfigDict).
            }
        )
        s += '}'
        return s
+
+
class ActionModel(BaseModel):
    """Base model for dynamically created action models"""

    # Subclasses carry one optional field per registered action, e.g.
    # click_element: Optional[ClickElementParams] = None
    # done: Optional[DoneParams] = None
    model_config = ConfigDict(arbitrary_types_allowed=True)

    def get_index(self) -> int | None:
        """Get the index of the action"""
        # e.g. {'clicked_element': {'index': 5}} -> 5
        for param in self.model_dump(exclude_unset=True).values():
            if param is not None and 'index' in param:
                return param['index']
        return None

    def set_index(self, index: int):
        """Overwrite the index of the action"""
        # Resolve which action field is set, then mutate its params model.
        action_data = self.model_dump(exclude_unset=True)
        action_name = next(iter(action_data.keys()))
        action_params = getattr(self, action_name)

        # Update the index directly on the nested params model, if it has one.
        if hasattr(action_params, 'index'):
            action_params.index = index
+
+
class ActionRegistry(BaseModel):
    """Model representing the action registry"""

    actions: Dict[str, RegisteredAction] = {}

    def get_prompt_description(self) -> str:
        """Get a description of all actions for the prompt"""
        descriptions = (action.prompt_description() for action in self.actions.values())
        return '\n'.join(descriptions)
diff --git a/browser_use/controller/service.py b/browser_use/controller/service.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf468f28734c632a378df3f03d871953637b251c
--- /dev/null
+++ b/browser_use/controller/service.py
@@ -0,0 +1,532 @@
+import asyncio
+import json
+import enum
+import logging
+from typing import Dict, Generic, Optional, Type, TypeVar
+
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.prompts import PromptTemplate
+
+# from lmnr.sdk.laminar import Laminar
+from pydantic import BaseModel
+
+from browser_use.agent.views import ActionModel, ActionResult
+from browser_use.browser.context import BrowserContext
+from browser_use.controller.registry.service import Registry
+from browser_use.controller.views import (
+ ClickElementAction,
+ DoneAction,
+ GoToUrlAction,
+ InputTextAction,
+ NoParamsAction,
+ OpenTabAction,
+ ScrollAction,
+ SearchGoogleAction,
+ SendKeysAction,
+ SwitchTabAction,
+)
+from browser_use.utils import time_execution_sync
+
+logger = logging.getLogger(__name__)
+
+
+Context = TypeVar('Context')
+
+
class Controller(Generic[Context]):
	"""Registry-backed dispatcher for the browser actions available to the agent.

	On construction it registers the default action set (navigation, element
	interaction, tab management, content extraction, dropdown handling) into
	``self.registry``; :meth:`act` then executes a single ``ActionModel``
	against a ``BrowserContext``. ``Context`` is an arbitrary user-supplied
	object threaded through unchanged to actions that declare a ``context``
	parameter.
	"""

	def __init__(
		self,
		exclude_actions: list[str] = [],
		output_model: Optional[Type[BaseModel]] = None,
	):
		"""Create the registry and register all default browser actions.

		Args:
			exclude_actions: names of default actions the registry should skip.
			output_model: when given, the terminal ``done`` action returns an
				instance of this model (wrapped with a ``success`` flag) as
				JSON instead of free-form text.
		"""
		# NOTE(review): mutable default `exclude_actions=[]` is shared across
		# calls; it is only read here, but None/tuple would be safer.
		self.registry = Registry[Context](exclude_actions)

		"""Register all default browser actions"""

		if output_model is not None:
			# Create a new model that extends the output model with success parameter
			class ExtendedOutputModel(BaseModel):  # type: ignore
				success: bool = True
				data: output_model

			@self.registry.action(
				'Complete task - with return text and if the task is finished (success=True) or not yet completly finished (success=False), because last step is reached',
				param_model=ExtendedOutputModel,
			)
			async def done(params: ExtendedOutputModel):
				"""Terminal action: report completion with structured JSON output."""
				# Exclude success from the output JSON since it's an internal parameter
				output_dict = params.data.model_dump()

				# Enums are not serializable, convert to string
				for key, value in output_dict.items():
					if isinstance(value, enum.Enum):
						output_dict[key] = value.value

				return ActionResult(is_done=True, success=params.success, extracted_content=json.dumps(output_dict))
		else:

			@self.registry.action(
				'Complete task - with return text and if the task is finished (success=True) or not yet completly finished (success=False), because last step is reached',
				param_model=DoneAction,
			)
			async def done(params: DoneAction):
				"""Terminal action: report completion with free-form text."""
				return ActionResult(is_done=True, success=params.success, extracted_content=params.text)

		# Basic Navigation Actions
		@self.registry.action(
			'Search the query in Google in the current tab, the query should be a search query like humans search in Google, concrete and not vague or super long. More the single most important items. ',
			param_model=SearchGoogleAction,
		)
		async def search_google(params: SearchGoogleAction, browser: BrowserContext):
			page = await browser.get_current_page()
			# udm=14 presumably selects Google's plain "Web" results view - confirm.
			await page.goto(f'https://www.google.com/search?q={params.query}&udm=14')
			await page.wait_for_load_state()
			msg = f'π Searched for "{params.query}" in Google'
			logger.info(msg)
			return ActionResult(extracted_content=msg, include_in_memory=True)

		@self.registry.action('Navigate to URL in the current tab', param_model=GoToUrlAction)
		async def go_to_url(params: GoToUrlAction, browser: BrowserContext):
			page = await browser.get_current_page()
			await page.goto(params.url)
			await page.wait_for_load_state()
			msg = f'π Navigated to {params.url}'
			logger.info(msg)
			return ActionResult(extracted_content=msg, include_in_memory=True)

		@self.registry.action('Go back', param_model=NoParamsAction)
		async def go_back(_: NoParamsAction, browser: BrowserContext):
			await browser.go_back()
			msg = 'π Navigated back'
			logger.info(msg)
			return ActionResult(extracted_content=msg, include_in_memory=True)

		# wait for x seconds
		@self.registry.action('Wait for x seconds default 3')
		async def wait(seconds: int = 3):
			msg = f'π Waiting for {seconds} seconds'
			logger.info(msg)
			await asyncio.sleep(seconds)
			return ActionResult(extracted_content=msg, include_in_memory=True)

		# Element Interaction Actions
		@self.registry.action('Click element', param_model=ClickElementAction)
		async def click_element(params: ClickElementAction, browser: BrowserContext):
			session = await browser.get_session()

			if params.index not in await browser.get_selector_map():
				raise Exception(f'Element with index {params.index} does not exist - retry or use alternative actions')

			element_node = await browser.get_dom_element_by_index(params.index)
			# Snapshot the page count so we can detect a click that opened a new tab.
			initial_pages = len(session.context.pages)

			# if element has file uploader then dont click
			if await browser.is_file_uploader(element_node):
				msg = f'Index {params.index} - has an element which opens file upload dialog. To upload files please use a specific function to upload files '
				logger.info(msg)
				return ActionResult(extracted_content=msg, include_in_memory=True)

			msg = None

			try:
				download_path = await browser._click_element_node(element_node)
				if download_path:
					msg = f'πΎ Downloaded file to {download_path}'
				else:
					msg = f'π±οΈ Clicked button with index {params.index}: {element_node.get_all_text_till_next_clickable_element(max_depth=2)}'

				logger.info(msg)
				logger.debug(f'Element xpath: {element_node.xpath}')
				# If the click spawned a new tab, follow it.
				if len(session.context.pages) > initial_pages:
					new_tab_msg = 'New tab opened - switching to it'
					msg += f' - {new_tab_msg}'
					logger.info(new_tab_msg)
					await browser.switch_to_tab(-1)
				return ActionResult(extracted_content=msg, include_in_memory=True)
			except Exception as e:
				# Click failures are reported as a soft error instead of raising,
				# so the agent can retry with a fresh DOM snapshot.
				logger.warning(f'Element not clickable with index {params.index} - most likely the page changed')
				return ActionResult(error=str(e))

		@self.registry.action(
			'Input text into a input interactive element',
			param_model=InputTextAction,
		)
		async def input_text(params: InputTextAction, browser: BrowserContext, has_sensitive_data: bool = False):
			if params.index not in await browser.get_selector_map():
				raise Exception(f'Element index {params.index} does not exist - retry or use alternative actions')

			element_node = await browser.get_dom_element_by_index(params.index)
			await browser._input_text_element_node(element_node, params.text)
			# Keep sensitive values out of the log / memory message.
			if not has_sensitive_data:
				msg = f'β¨οΈ Input {params.text} into index {params.index}'
			else:
				msg = f'β¨οΈ Input sensitive data into index {params.index}'
			logger.info(msg)
			logger.debug(f'Element xpath: {element_node.xpath}')
			return ActionResult(extracted_content=msg, include_in_memory=True)

		# Tab Management Actions
		@self.registry.action('Switch tab', param_model=SwitchTabAction)
		async def switch_tab(params: SwitchTabAction, browser: BrowserContext):
			await browser.switch_to_tab(params.page_id)
			# Wait for tab to be ready
			page = await browser.get_current_page()
			await page.wait_for_load_state()
			msg = f'π Switched to tab {params.page_id}'
			logger.info(msg)
			return ActionResult(extracted_content=msg, include_in_memory=True)

		@self.registry.action('Open url in new tab', param_model=OpenTabAction)
		async def open_tab(params: OpenTabAction, browser: BrowserContext):
			await browser.create_new_tab(params.url)
			msg = f'π Opened new tab with {params.url}'
			logger.info(msg)
			return ActionResult(extracted_content=msg, include_in_memory=True)

		# Content Actions
		@self.registry.action(
			'Extract page content to retrieve specific information from the page, e.g. all company names, a specifc description, all information about, links with companies in structured format or simply links',
		)
		async def extract_content(goal: str, browser: BrowserContext, page_extraction_llm: BaseChatModel):
			page = await browser.get_current_page()
			# Imported lazily so the dependency is only required when this action runs.
			import markdownify

			content = markdownify.markdownify(await page.content())

			prompt = 'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. Extraction goal: {goal}, Page: {page}'
			template = PromptTemplate(input_variables=['goal', 'page'], template=prompt)
			try:
				# NOTE(review): synchronous invoke() inside an async action blocks
				# the event loop for the duration of the LLM call - confirm intended.
				output = page_extraction_llm.invoke(template.format(goal=goal, page=content))
				msg = f'π Extracted from page\n: {output.content}\n'
				logger.info(msg)
				return ActionResult(extracted_content=msg, include_in_memory=True)
			except Exception as e:
				# Best effort: fall back to returning the raw markdown content.
				logger.debug(f'Error extracting content: {e}')
				msg = f'π Extracted from page\n: {content}\n'
				logger.info(msg)
				return ActionResult(extracted_content=msg)

		@self.registry.action(
			'Scroll down the page by pixel amount - if no amount is specified, scroll down one page',
			param_model=ScrollAction,
		)
		async def scroll_down(params: ScrollAction, browser: BrowserContext):
			page = await browser.get_current_page()
			if params.amount is not None:
				await page.evaluate(f'window.scrollBy(0, {params.amount});')
			else:
				# No amount given: scroll by one viewport height.
				await page.evaluate('window.scrollBy(0, window.innerHeight);')

			amount = f'{params.amount} pixels' if params.amount is not None else 'one page'
			msg = f'π Scrolled down the page by {amount}'
			logger.info(msg)
			return ActionResult(
				extracted_content=msg,
				include_in_memory=True,
			)

		# scroll up
		@self.registry.action(
			'Scroll up the page by pixel amount - if no amount is specified, scroll up one page',
			param_model=ScrollAction,
		)
		async def scroll_up(params: ScrollAction, browser: BrowserContext):
			page = await browser.get_current_page()
			if params.amount is not None:
				await page.evaluate(f'window.scrollBy(0, -{params.amount});')
			else:
				# No amount given: scroll by one viewport height.
				await page.evaluate('window.scrollBy(0, -window.innerHeight);')

			amount = f'{params.amount} pixels' if params.amount is not None else 'one page'
			msg = f'π Scrolled up the page by {amount}'
			logger.info(msg)
			return ActionResult(
				extracted_content=msg,
				include_in_memory=True,
			)

		# send keys
		@self.registry.action(
			'Send strings of special keys like Escape,Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press. ',
			param_model=SendKeysAction,
		)
		async def send_keys(params: SendKeysAction, browser: BrowserContext):
			page = await browser.get_current_page()

			try:
				await page.keyboard.press(params.keys)
			except Exception as e:
				if 'Unknown key' in str(e):
					# loop over the keys and try to send each one
					# (fallback for when the whole string is not a valid key name)
					for key in params.keys:
						try:
							await page.keyboard.press(key)
						except Exception as e:
							logger.debug(f'Error sending key {key}: {str(e)}')
							raise e
				else:
					raise e
			msg = f'β¨οΈ Sent keys: {params.keys}'
			logger.info(msg)
			return ActionResult(extracted_content=msg, include_in_memory=True)

		@self.registry.action(
			description='If you dont find something which you want to interact with, scroll to it',
		)
		async def scroll_to_text(text: str, browser: BrowserContext):  # type: ignore
			page = await browser.get_current_page()
			try:
				# Try different locator strategies
				locators = [
					page.get_by_text(text, exact=False),
					page.locator(f'text={text}'),
					page.locator(f"//*[contains(text(), '{text}')]"),
				]

				for locator in locators:
					try:
						# First check if element exists and is visible
						if await locator.count() > 0 and await locator.first.is_visible():
							await locator.first.scroll_into_view_if_needed()
							await asyncio.sleep(0.5)  # Wait for scroll to complete
							msg = f'π Scrolled to text: {text}'
							logger.info(msg)
							return ActionResult(extracted_content=msg, include_in_memory=True)
					except Exception as e:
						# A failing strategy is not fatal; try the next locator.
						logger.debug(f'Locator attempt failed: {str(e)}')
						continue

				msg = f"Text '{text}' not found or not visible on page"
				logger.info(msg)
				return ActionResult(extracted_content=msg, include_in_memory=True)

			except Exception as e:
				msg = f"Failed to scroll to text '{text}': {str(e)}"
				logger.error(msg)
				return ActionResult(error=msg, include_in_memory=True)

		@self.registry.action(
			description='Get all options from a native dropdown',
		)
		async def get_dropdown_options(index: int, browser: BrowserContext) -> ActionResult:
			"""Get all options from a native dropdown"""
			page = await browser.get_current_page()
			selector_map = await browser.get_selector_map()
			dom_element = selector_map[index]

			try:
				# Frame-aware approach since we know it works
				all_options = []
				frame_index = 0

				for frame in page.frames:
					try:
						# Resolve the <select> by XPath inside this frame and read its options.
						options = await frame.evaluate(
							"""
							(xpath) => {
								const select = document.evaluate(xpath, document, null,
									XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
								if (!select) return null;

								return {
									options: Array.from(select.options).map(opt => ({
										text: opt.text, //do not trim, because we are doing exact match in select_dropdown_option
										value: opt.value,
										index: opt.index
									})),
									id: select.id,
									name: select.name
								};
							}
							""",
							dom_element.xpath,
						)

						if options:
							logger.debug(f'Found dropdown in frame {frame_index}')
							logger.debug(f'Dropdown ID: {options["id"]}, Name: {options["name"]}')

							formatted_options = []
							for opt in options['options']:
								# encoding ensures AI uses the exact string in select_dropdown_option
								encoded_text = json.dumps(opt['text'])
								formatted_options.append(f'{opt["index"]}: text={encoded_text}')

							all_options.extend(formatted_options)

					except Exception as frame_e:
						# Cross-origin or detached frames may fail; continue with the rest.
						logger.debug(f'Frame {frame_index} evaluation failed: {str(frame_e)}')

					frame_index += 1

				if all_options:
					msg = '\n'.join(all_options)
					msg += '\nUse the exact text string in select_dropdown_option'
					logger.info(msg)
					return ActionResult(extracted_content=msg, include_in_memory=True)
				else:
					msg = 'No options found in any frame for dropdown'
					logger.info(msg)
					return ActionResult(extracted_content=msg, include_in_memory=True)

			except Exception as e:
				logger.error(f'Failed to get dropdown options: {str(e)}')
				msg = f'Error getting options: {str(e)}'
				logger.info(msg)
				return ActionResult(extracted_content=msg, include_in_memory=True)

		@self.registry.action(
			description='Select dropdown option for interactive element index by the text of the option you want to select',
		)
		async def select_dropdown_option(
			index: int,
			text: str,
			browser: BrowserContext,
		) -> ActionResult:
			"""Select dropdown option by the text of the option you want to select"""
			page = await browser.get_current_page()
			selector_map = await browser.get_selector_map()
			dom_element = selector_map[index]

			# Validate that we're working with a select element
			if dom_element.tag_name != 'select':
				logger.error(f'Element is not a select! Tag: {dom_element.tag_name}, Attributes: {dom_element.attributes}')
				msg = f'Cannot select option: Element with index {index} is a {dom_element.tag_name}, not a select'
				return ActionResult(extracted_content=msg, include_in_memory=True)

			logger.debug(f"Attempting to select '{text}' using xpath: {dom_element.xpath}")
			logger.debug(f'Element attributes: {dom_element.attributes}')
			logger.debug(f'Element tag: {dom_element.tag_name}')

			xpath = '//' + dom_element.xpath

			try:
				frame_index = 0
				for frame in page.frames:
					try:
						logger.debug(f'Trying frame {frame_index} URL: {frame.url}')

						# First verify we can find the dropdown in this frame
						find_dropdown_js = """
							(xpath) => {
								try {
									const select = document.evaluate(xpath, document, null,
										XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
									if (!select) return null;
									if (select.tagName.toLowerCase() !== 'select') {
										return {
											error: `Found element but it's a ${select.tagName}, not a SELECT`,
											found: false
										};
									}
									return {
										id: select.id,
										name: select.name,
										found: true,
										tagName: select.tagName,
										optionCount: select.options.length,
										currentValue: select.value,
										availableOptions: Array.from(select.options).map(o => o.text.trim())
									};
								} catch (e) {
									return {error: e.toString(), found: false};
								}
							}
						"""

						dropdown_info = await frame.evaluate(find_dropdown_js, dom_element.xpath)

						if dropdown_info:
							if not dropdown_info.get('found'):
								logger.error(f'Frame {frame_index} error: {dropdown_info.get("error")}')
								continue

							logger.debug(f'Found dropdown in frame {frame_index}: {dropdown_info}')

							# "label" because we are selecting by text
							# nth(0) to disable error thrown by strict mode
							# timeout=1000 because we are already waiting for all network events, therefore ideally we don't need to wait a lot here (default 30s)
							selected_option_values = (
								await frame.locator('//' + dom_element.xpath).nth(0).select_option(label=text, timeout=1000)
							)

							msg = f'selected option {text} with value {selected_option_values}'
							logger.info(msg + f' in frame {frame_index}')

							return ActionResult(extracted_content=msg, include_in_memory=True)

					except Exception as frame_e:
						logger.error(f'Frame {frame_index} attempt failed: {str(frame_e)}')
						logger.error(f'Frame type: {type(frame)}')
						logger.error(f'Frame URL: {frame.url}')

					frame_index += 1

				msg = f"Could not select option '{text}' in any frame"
				logger.info(msg)
				return ActionResult(extracted_content=msg, include_in_memory=True)

			except Exception as e:
				msg = f'Selection failed: {str(e)}'
				logger.error(msg)
				return ActionResult(error=msg, include_in_memory=True)

	# Register ---------------------------------------------------------------

	def action(self, description: str, **kwargs):
		"""Decorator for registering custom actions.

		@param description: Describe to the LLM what the function does (better description == better function calling)
		"""
		return self.registry.action(description, **kwargs)

	# Act --------------------------------------------------------------------

	@time_execution_sync('--act')
	async def act(
		self,
		action: ActionModel,
		browser_context: BrowserContext,
		#
		page_extraction_llm: Optional[BaseChatModel] = None,
		sensitive_data: Optional[Dict[str, str]] = None,
		available_file_paths: Optional[list[str]] = None,
		#
		context: Context | None = None,
	) -> ActionResult:
		"""Execute a single action and normalize its result to an ActionResult.

		Only the first set (non-None) field of ``action`` is executed; an
		action model with no set parameters yields an empty ActionResult.
		"""

		try:
			for action_name, params in action.model_dump(exclude_unset=True).items():
				if params is not None:
					# with Laminar.start_as_current_span(
					# 	name=action_name,
					# 	input={
					# 		'action': action_name,
					# 		'params': params,
					# 	},
					# 	span_type='TOOL',
					# ):
					result = await self.registry.execute_action(
						action_name,
						params,
						browser=browser_context,
						page_extraction_llm=page_extraction_llm,
						sensitive_data=sensitive_data,
						available_file_paths=available_file_paths,
						context=context,
					)

					# Laminar.set_span_output(result)

					# Normalize the action's return value.
					if isinstance(result, str):
						return ActionResult(extracted_content=result)
					elif isinstance(result, ActionResult):
						return result
					elif result is None:
						return ActionResult()
					else:
						raise ValueError(f'Invalid action result type: {type(result)} of {result}')
			return ActionResult()
		except Exception as e:
			raise e
diff --git a/browser_use/controller/views.py b/browser_use/controller/views.py
new file mode 100644
index 0000000000000000000000000000000000000000..82995c9e311578ca1c9c133509b70c3eede2bb40
--- /dev/null
+++ b/browser_use/controller/views.py
@@ -0,0 +1,65 @@
from typing import Optional

from pydantic import BaseModel, ConfigDict, model_validator
+
+
+# Action Input Models
class SearchGoogleAction(BaseModel):
	"""Parameters for the `search_google` action."""

	# The search query to type into Google.
	query: str
+
+
class GoToUrlAction(BaseModel):
	"""Parameters for the `go_to_url` action."""

	# URL to navigate to in the current tab.
	url: str
+
+
class ClickElementAction(BaseModel):
	"""Parameters for the `click_element` action."""

	# Highlight index of the target element in the current selector map.
	index: int
	# Optional XPath of the target element; the controller looks up by index - TODO confirm how xpath is used.
	xpath: Optional[str] = None
+
+
class InputTextAction(BaseModel):
	"""Parameters for the `input_text` action."""

	# Highlight index of the input element in the current selector map.
	index: int
	# Text to type into the element.
	text: str
	# Optional XPath of the target element; the controller looks up by index - TODO confirm how xpath is used.
	xpath: Optional[str] = None
+
+
class DoneAction(BaseModel):
	"""Parameters for the terminal `done` action."""

	# Final result text reported back.
	text: str
	# Whether the task finished successfully.
	success: bool
+
+
class SwitchTabAction(BaseModel):
	"""Parameters for the `switch_tab` action."""

	# Index of the tab (page) to switch to.
	page_id: int
+
+
class OpenTabAction(BaseModel):
	"""Parameters for the `open_tab` action."""

	# URL to open in the new tab.
	url: str
+
+
class ScrollAction(BaseModel):
	"""Parameters for the `scroll_down` / `scroll_up` actions."""

	amount: Optional[int] = None  # The number of pixels to scroll. If None, scroll down/up one page
+
+
class SendKeysAction(BaseModel):
	"""Parameters for the `send_keys` action."""

	# Key or shortcut string passed to keyboard.press, e.g. 'Enter' or 'Control+o'.
	keys: str
+
+
class ExtractPageContentAction(BaseModel):
	"""Parameters for extracting page content.

	NOTE(review): not referenced by the controller's default actions in this
	file - `extract_content` takes a plain `goal: str`; confirm external usage.
	"""

	value: str
+
+
class NoParamsAction(BaseModel):
	"""
	Accepts absolutely anything in the incoming data
	and discards it, so the final parsed model is empty.
	"""

	# Silently allow unknown fields at the top level as well.
	# (pydantic v2 style; the class-based `Config` is deprecated and the rest
	# of the package already uses ConfigDict.)
	model_config = ConfigDict(extra='allow')

	@model_validator(mode='before')
	@classmethod
	def ignore_all_inputs(cls, values):
		# No matter what the user sends, discard it and return empty.
		return {}
diff --git a/browser_use/dom/__init__.py b/browser_use/dom/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/browser_use/dom/buildDomTree.js b/browser_use/dom/buildDomTree.js
new file mode 100644
index 0000000000000000000000000000000000000000..539c762259b044776229a7fe9542059af84c0ab9
--- /dev/null
+++ b/browser_use/dom/buildDomTree.js
@@ -0,0 +1,1055 @@
+(
+ args = {
+ doHighlightElements: true,
+ focusHighlightIndex: -1,
+ viewportExpansion: 0,
+ debugMode: false,
+ }
+) => {
+ const { doHighlightElements, focusHighlightIndex, viewportExpansion, debugMode } = args;
+ let highlightIndex = 0; // Reset highlight index
+
  // Add timing stack to handle recursion
  // Per-category stacks of start timestamps (see pushTiming/popTiming), so
  // nested measurements of the same category don't clobber each other.
  const TIMING_STACK = {
    nodeProcessing: [],
    treeTraversal: [],
    highlighting: [],
    current: null
  };
+
+ function pushTiming(type) {
+ TIMING_STACK[type] = TIMING_STACK[type] || [];
+ TIMING_STACK[type].push(performance.now());
+ }
+
+ function popTiming(type) {
+ const start = TIMING_STACK[type].pop();
+ const duration = performance.now() - start;
+ return duration;
+ }
+
  // Only initialize performance tracking if in debug mode
  // (null in production so the hot path pays no bookkeeping cost).
  const PERF_METRICS = debugMode ? {
    buildDomTreeCalls: 0,
    // Cumulative wall-clock time per instrumented helper, in milliseconds.
    timings: {
      buildDomTree: 0,
      highlightElement: 0,
      isInteractiveElement: 0,
      isElementVisible: 0,
      isTopElement: 0,
      isInExpandedViewport: 0,
      isTextNodeVisible: 0,
      getEffectiveScroll: 0,
    },
    // Hit/miss counters for the DOM_CACHE WeakMaps; the *HitRate fields are
    // presumably derived later when metrics are reported (not in this chunk).
    cacheMetrics: {
      boundingRectCacheHits: 0,
      boundingRectCacheMisses: 0,
      computedStyleCacheHits: 0,
      computedStyleCacheMisses: 0,
      getBoundingClientRectTime: 0,
      getComputedStyleTime: 0,
      boundingRectHitRate: 0,
      computedStyleHitRate: 0,
      overallHitRate: 0,
    },
    nodeMetrics: {
      totalNodes: 0,
      processedNodes: 0,
      skippedNodes: 0,
    },
    // Time spent in buildDomTree itself vs. in the raw DOM calls it makes.
    buildDomTreeBreakdown: {
      totalTime: 0,
      totalSelfTime: 0,
      buildDomTreeCalls: 0,
      domOperations: {
        getBoundingClientRect: 0,
        getComputedStyle: 0,
      },
      domOperationCounts: {
        getBoundingClientRect: 0,
        getComputedStyle: 0,
      }
    }
  } : null;
+
  // Simple timing helper that only runs in debug mode
  // NOTE(review): the measured `duration` is computed but never recorded or
  // returned - presumably metrics reporting was removed here; confirm whether
  // this wrapper should feed PERF_METRICS.timings.
  function measureTime(fn) {
    if (!debugMode) return fn;
    return function (...args) {
      const start = performance.now();
      const result = fn.apply(this, args);
      const duration = performance.now() - start;
      return result;
    };
  }
+
  // Helper to measure DOM operations
  // Runs `operation` and, in debug mode, accumulates its duration and call
  // count into PERF_METRICS.buildDomTreeBreakdown - but only when `name` is
  // one of the known domOperations keys.
  function measureDomOperation(operation, name) {
    if (!debugMode) return operation();

    const start = performance.now();
    const result = operation();
    const duration = performance.now() - start;

    if (PERF_METRICS && name in PERF_METRICS.buildDomTreeBreakdown.domOperations) {
      PERF_METRICS.buildDomTreeBreakdown.domOperations[name] += duration;
      PERF_METRICS.buildDomTreeBreakdown.domOperationCounts[name]++;
    }

    return result;
  }
+
  // Add caching mechanisms at the top level
  // WeakMap-backed caches for the two most frequent per-element DOM reads;
  // WeakMaps let cached entries be garbage-collected with their elements.
  const DOM_CACHE = {
    boundingRects: new WeakMap(),
    computedStyles: new WeakMap(),
    clearCache: () => {
      DOM_CACHE.boundingRects = new WeakMap();
      DOM_CACHE.computedStyles = new WeakMap();
    }
  };
+
  // Cache helper functions
  // Returns element.getBoundingClientRect(), memoized per element in
  // DOM_CACHE.boundingRects. In debug mode also counts cache hits/misses and
  // records the time spent in the raw DOM call.
  function getCachedBoundingRect(element) {
    if (!element) return null;

    if (DOM_CACHE.boundingRects.has(element)) {
      if (debugMode && PERF_METRICS) {
        PERF_METRICS.cacheMetrics.boundingRectCacheHits++;
      }
      return DOM_CACHE.boundingRects.get(element);
    }

    if (debugMode && PERF_METRICS) {
      PERF_METRICS.cacheMetrics.boundingRectCacheMisses++;
    }

    let rect;
    if (debugMode) {
      // Timed path: same DOM call, plus duration bookkeeping.
      const start = performance.now();
      rect = element.getBoundingClientRect();
      const duration = performance.now() - start;
      if (PERF_METRICS) {
        PERF_METRICS.buildDomTreeBreakdown.domOperations.getBoundingClientRect += duration;
        PERF_METRICS.buildDomTreeBreakdown.domOperationCounts.getBoundingClientRect++;
      }
    } else {
      rect = element.getBoundingClientRect();
    }

    if (rect) {
      DOM_CACHE.boundingRects.set(element, rect);
    }
    return rect;
  }
+
  // Returns window.getComputedStyle(element), memoized per element in
  // DOM_CACHE.computedStyles. In debug mode also counts cache hits/misses and
  // records the time spent in the raw DOM call.
  function getCachedComputedStyle(element) {
    if (!element) return null;

    if (DOM_CACHE.computedStyles.has(element)) {
      if (debugMode && PERF_METRICS) {
        PERF_METRICS.cacheMetrics.computedStyleCacheHits++;
      }
      return DOM_CACHE.computedStyles.get(element);
    }

    if (debugMode && PERF_METRICS) {
      PERF_METRICS.cacheMetrics.computedStyleCacheMisses++;
    }

    let style;
    if (debugMode) {
      // Timed path: same DOM call, plus duration bookkeeping.
      const start = performance.now();
      style = window.getComputedStyle(element);
      const duration = performance.now() - start;
      if (PERF_METRICS) {
        PERF_METRICS.buildDomTreeBreakdown.domOperations.getComputedStyle += duration;
        PERF_METRICS.buildDomTreeBreakdown.domOperationCounts.getComputedStyle++;
      }
    } else {
      style = window.getComputedStyle(element);
    }

    if (style) {
      DOM_CACHE.computedStyles.set(element, style);
    }
    return style;
  }
+
+ /**
+ * Hash map of DOM nodes indexed by their highlight index.
+ *
+ * @type {Object}
+ */
+ const DOM_HASH_MAP = {};
+
+ const ID = { current: 0 };
+
+ const HIGHLIGHT_CONTAINER_ID = "playwright-highlight-container";
+
+ /**
+ * Highlights an element in the DOM and returns the index of the next element.
+ */
+ function highlightElement(element, index, parentIframe = null) {
+ if (!element) return index;
+
+ try {
+ // Create or get highlight container
+ let container = document.getElementById(HIGHLIGHT_CONTAINER_ID);
+ if (!container) {
+ container = document.createElement("div");
+ container.id = HIGHLIGHT_CONTAINER_ID;
+ container.style.position = "fixed";
+ container.style.pointerEvents = "none";
+ container.style.top = "0";
+ container.style.left = "0";
+ container.style.width = "100%";
+ container.style.height = "100%";
+ container.style.zIndex = "2147483647";
+ document.body.appendChild(container);
+ }
+
+ // Get element position
+ const rect = measureDomOperation(
+ () => element.getBoundingClientRect(),
+ 'getBoundingClientRect'
+ );
+
+ if (!rect) return index;
+
+ // Generate a color based on the index
+ const colors = [
+ "#FF0000",
+ "#00FF00",
+ "#0000FF",
+ "#FFA500",
+ "#800080",
+ "#008080",
+ "#FF69B4",
+ "#4B0082",
+ "#FF4500",
+ "#2E8B57",
+ "#DC143C",
+ "#4682B4",
+ ];
+ const colorIndex = index % colors.length;
+ const baseColor = colors[colorIndex];
+ const backgroundColor = baseColor + "1A"; // 10% opacity version of the color
+
+ // Create highlight overlay
+ const overlay = document.createElement("div");
+ overlay.style.position = "fixed";
+ overlay.style.border = `2px solid ${baseColor}`;
+ overlay.style.backgroundColor = backgroundColor;
+ overlay.style.pointerEvents = "none";
+ overlay.style.boxSizing = "border-box";
+
+ // Get element position
+ let iframeOffset = { x: 0, y: 0 };
+
+ // If element is in an iframe, calculate iframe offset
+ if (parentIframe) {
+ const iframeRect = parentIframe.getBoundingClientRect();
+ iframeOffset.x = iframeRect.left;
+ iframeOffset.y = iframeRect.top;
+ }
+
+ // Calculate position
+ const top = rect.top + iframeOffset.y;
+ const left = rect.left + iframeOffset.x;
+
+ overlay.style.top = `${top}px`;
+ overlay.style.left = `${left}px`;
+ overlay.style.width = `${rect.width}px`;
+ overlay.style.height = `${rect.height}px`;
+
+ // Create and position label
+ const label = document.createElement("div");
+ label.className = "playwright-highlight-label";
+ label.style.position = "fixed";
+ label.style.background = baseColor;
+ label.style.color = "white";
+ label.style.padding = "1px 4px";
+ label.style.borderRadius = "4px";
+ label.style.fontSize = `${Math.min(12, Math.max(8, rect.height / 2))}px`;
+ label.textContent = index;
+
+ const labelWidth = 20;
+ const labelHeight = 16;
+
+ let labelTop = top + 2;
+ let labelLeft = left + rect.width - labelWidth - 2;
+
+ if (rect.width < labelWidth + 4 || rect.height < labelHeight + 4) {
+ labelTop = top - labelHeight - 2;
+ labelLeft = left + rect.width - labelWidth;
+ }
+
+ label.style.top = `${labelTop}px`;
+ label.style.left = `${labelLeft}px`;
+
+ // Add to container
+ container.appendChild(overlay);
+ container.appendChild(label);
+
+ // Update positions on scroll
+ const updatePositions = () => {
+ const newRect = element.getBoundingClientRect();
+ let newIframeOffset = { x: 0, y: 0 };
+
+ if (parentIframe) {
+ const iframeRect = parentIframe.getBoundingClientRect();
+ newIframeOffset.x = iframeRect.left;
+ newIframeOffset.y = iframeRect.top;
+ }
+
+ const newTop = newRect.top + newIframeOffset.y;
+ const newLeft = newRect.left + newIframeOffset.x;
+
+ overlay.style.top = `${newTop}px`;
+ overlay.style.left = `${newLeft}px`;
+ overlay.style.width = `${newRect.width}px`;
+ overlay.style.height = `${newRect.height}px`;
+
+ let newLabelTop = newTop + 2;
+ let newLabelLeft = newLeft + newRect.width - labelWidth - 2;
+
+ if (newRect.width < labelWidth + 4 || newRect.height < labelHeight + 4) {
+ newLabelTop = newTop - labelHeight - 2;
+ newLabelLeft = newLeft + newRect.width - labelWidth;
+ }
+
+ label.style.top = `${newLabelTop}px`;
+ label.style.left = `${newLabelLeft}px`;
+ };
+
+ window.addEventListener('scroll', updatePositions);
+ window.addEventListener('resize', updatePositions);
+
+ return index + 1;
+ } finally {
+ popTiming('highlighting');
+ }
+ }
+
  /**
   * Returns an XPath tree string for an element.
   *
   * Builds a path of `tag[n]` segments from the element up to the document
   * root, or (when `stopAtBoundary` is true) up to the nearest shadow-root /
   * iframe boundary. A positional predicate is only added when the element
   * has at least one preceding sibling with the same node name.
   */
  function getXPathTree(element, stopAtBoundary = true) {
    const segments = [];
    let currentElement = element;

    while (currentElement && currentElement.nodeType === Node.ELEMENT_NODE) {
      // Stop if we hit a shadow root or iframe
      if (
        stopAtBoundary &&
        (currentElement.parentNode instanceof ShadowRoot ||
          currentElement.parentNode instanceof HTMLIFrameElement)
      ) {
        break;
      }

      // Count preceding siblings with the same node name to get the
      // element's 0-based position among same-named siblings.
      let index = 0;
      let sibling = currentElement.previousSibling;
      while (sibling) {
        if (
          sibling.nodeType === Node.ELEMENT_NODE &&
          sibling.nodeName === currentElement.nodeName
        ) {
          index++;
        }
        sibling = sibling.previousSibling;
      }

      const tagName = currentElement.nodeName.toLowerCase();
      // XPath positions are 1-based, hence index + 1.
      const xpathIndex = index > 0 ? `[${index + 1}]` : "";
      segments.unshift(`${tagName}${xpathIndex}`);

      currentElement = currentElement.parentNode;
    }

    return segments.join("/");
  }
+
  /**
   * Checks if a text node is visible.
   *
   * A text node counts as visible when its layout rect is non-empty, it lies
   * within the viewport expanded by `viewportExpansion` pixels on each side,
   * and its parent element passes a CSS visibility check (checkVisibility()
   * when supported, otherwise a computed-style fallback).
   */
  function isTextNodeVisible(textNode) {
    try {
      // Measure the text itself via a Range (text nodes have no own box).
      const range = document.createRange();
      range.selectNodeContents(textNode);
      const rect = range.getBoundingClientRect();

      // Simple size check
      if (rect.width === 0 || rect.height === 0) {
        return false;
      }

      // Simple viewport check without scroll calculations
      const isInViewport = !(
        rect.bottom < -viewportExpansion ||
        rect.top > window.innerHeight + viewportExpansion ||
        rect.right < -viewportExpansion ||
        rect.left > window.innerWidth + viewportExpansion
      );

      // Check parent visibility
      const parentElement = textNode.parentElement;
      if (!parentElement) return false;

      try {
        return isInViewport && parentElement.checkVisibility({
          checkOpacity: true,
          checkVisibilityCSS: true,
        });
      } catch (e) {
        // Fallback if checkVisibility is not supported
        const style = window.getComputedStyle(parentElement);
        return isInViewport &&
          style.display !== 'none' &&
          style.visibility !== 'hidden' &&
          style.opacity !== '0';
      }
    } catch (e) {
      // Be conservative: treat any measurement failure as not visible.
      console.warn('Error checking text node visibility:', e);
      return false;
    }
  }
+
// Decides whether an element node should be traversed at all.
// Common containers are always accepted; metadata/non-content tags never are.
function isElementAccepted(element) {
  if (!element || !element.tagName) return false;

  const tagName = element.tagName.toLowerCase();

  // Containers that are always worth descending into.
  switch (tagName) {
    case "body":
    case "div":
    case "main":
    case "article":
    case "section":
    case "nav":
    case "header":
    case "footer":
      return true;
    default:
      break;
  }

  // Non-content / metadata elements are never accepted.
  const denyList = new Set([
    "svg", "script", "style", "link", "meta", "noscript", "template",
  ]);
  return !denyList.has(tagName);
}
+
/**
 * Reports whether an element occupies layout space and is not hidden via
 * CSS `display` or `visibility`.
 */
function isElementVisible(element) {
  // Computed style is fetched through the per-run cache.
  const style = getCachedComputedStyle(element);
  const hasSize = element.offsetWidth > 0 && element.offsetHeight > 0;
  return hasSize && style.visibility !== "hidden" && style.display !== "none";
}
+
/**
 * Heuristically determines whether an element is interactive: natively
 * clickable tags/roles, cookie-consent UI, explicit click handlers, ARIA
 * state attributes, contenteditable regions, and draggables.
 *
 * Returns true as soon as any heuristic matches; checks are ordered from
 * cheapest/most specific to most general.
 */
function isInteractiveElement(element) {
  if (!element || element.nodeType !== Node.ELEMENT_NODE) {
    return false;
  }

  // --- Cookie banner fast path -------------------------------------------
  // Consent widgets (OneTrust etc.) often hide interactivity behind generic
  // markup, so detect them explicitly before the generic checks.
  const isCookieBannerElement =
    (typeof element.closest === 'function') && (
      element.closest('[id*="onetrust"]') ||
      element.closest('[class*="onetrust"]') ||
      element.closest('[data-nosnippet="true"]') ||
      element.closest('[aria-label*="cookie"]')
    );

  if (isCookieBannerElement) {
    // Accept/reject buttons (or anything button-like) inside the banner.
    if (
      element.tagName.toLowerCase() === 'button' ||
      element.getAttribute('role') === 'button' ||
      element.onclick ||
      element.getAttribute('onclick') ||
      (element.classList && (
        element.classList.contains('ot-sdk-button') ||
        element.classList.contains('accept-button') ||
        element.classList.contains('reject-button')
      )) ||
      element.getAttribute('aria-label')?.toLowerCase().includes('accept') ||
      element.getAttribute('aria-label')?.toLowerCase().includes('reject')
    ) {
      return true;
    }
  }

  // --- Native tags and ARIA roles ----------------------------------------
  const interactiveElements = new Set([
    "a", "button", "details", "embed", "input", "menu", "menuitem",
    "object", "select", "textarea", "canvas", "summary", "dialog",
    "banner"
  ]);

  const interactiveRoles = new Set(['button-icon', 'dialog', 'button-text-icon-only', 'treeitem', 'alert', 'grid', 'progressbar', 'radio', 'checkbox', 'menuitem', 'option', 'switch', 'dropdown', 'scrollbar', 'combobox', 'a-button-text', 'button', 'region', 'textbox', 'tabpanel', 'tab', 'click', 'button-text', 'spinbutton', 'a-button-inner', 'link', 'menu', 'slider', 'listbox', 'a-dropdown-button', 'button-icon-only', 'searchbox', 'menuitemradio', 'tooltip', 'tree', 'menuitemcheckbox']);

  const tagName = element.tagName.toLowerCase();
  const role = element.getAttribute("role");
  const ariaRole = element.getAttribute("aria-role");
  const tabIndex = element.getAttribute("tabindex");

  // Site-specific classes known to mark interactive widgets.
  const hasAddressInputClass = element.classList && (
    element.classList.contains("address-input__container__input") ||
    element.classList.contains("nav-btn") ||
    element.classList.contains("pull-left")
  );

  // Bootstrap-style dropdown toggles are interactive regardless of tag.
  if (element.classList && (
    element.classList.contains('dropdown-toggle') ||
    element.getAttribute('data-toggle') === 'dropdown' ||
    element.getAttribute('aria-haspopup') === 'true'
  )) {
    return true;
  }

  // Basic role/attribute checks. A non-negative tabindex counts as
  // interactive unless the element hangs directly off <body>.
  const hasInteractiveRole =
    hasAddressInputClass ||
    interactiveElements.has(tagName) ||
    interactiveRoles.has(role) ||
    interactiveRoles.has(ariaRole) ||
    (tabIndex !== null &&
      tabIndex !== "-1" &&
      element.parentElement?.tagName.toLowerCase() !== "body") ||
    element.getAttribute("data-action") === "a-dropdown-select" ||
    element.getAttribute("data-action") === "a-dropdown-button";

  if (hasInteractiveRole) return true;

  // --- Cookie/consent containers -----------------------------------------
  const isCookieBanner =
    element.id?.toLowerCase().includes('cookie') ||
    element.id?.toLowerCase().includes('consent') ||
    element.id?.toLowerCase().includes('notice') ||
    (element.classList && (
      element.classList.contains('otCenterRounded') ||
      element.classList.contains('ot-sdk-container')
    )) ||
    element.getAttribute('data-nosnippet') === 'true' ||
    element.getAttribute('aria-label')?.toLowerCase().includes('cookie') ||
    element.getAttribute('aria-label')?.toLowerCase().includes('consent') ||
    (element.tagName.toLowerCase() === 'div' && (
      element.id?.includes('onetrust') ||
      (element.classList && (
        element.classList.contains('onetrust') ||
        element.classList.contains('cookie') ||
        element.classList.contains('consent')
      ))
    ));

  if (isCookieBanner) return true;

  // Button-like descendants of any cookie/consent container.
  const isInCookieBanner = typeof element.closest === 'function' && element.closest(
    '[id*="cookie"],[id*="consent"],[class*="cookie"],[class*="consent"],[id*="onetrust"]'
  );

  if (isInCookieBanner && (
    element.tagName.toLowerCase() === 'button' ||
    element.getAttribute('role') === 'button' ||
    (element.classList && element.classList.contains('button')) ||
    element.onclick ||
    element.getAttribute('onclick')
  )) {
    return true;
  }

  // NOTE: a dead `window.getComputedStyle(element)` call that was never used
  // was removed here — it only cost a style recalculation per element.

  // --- Explicit click handlers -------------------------------------------
  // Inline handlers plus common framework attributes (Angular/Vue).
  const hasClickHandler =
    element.onclick !== null ||
    element.getAttribute("onclick") !== null ||
    element.hasAttribute("ng-click") ||
    element.hasAttribute("@click") ||
    element.hasAttribute("v-on:click");

  // Helper function to safely get event listeners.
  // window.getEventListeners only exists in DevTools consoles; fall back to
  // probing the element's on<event> properties.
  function getEventListeners(el) {
    try {
      return window.getEventListeners?.(el) || {};
    } catch (e) {
      const listeners = {};
      const eventTypes = [
        "click",
        "mousedown",
        "mouseup",
        "touchstart",
        "touchend",
        "keydown",
        "keyup",
        "focus",
        "blur",
      ];

      for (const type of eventTypes) {
        const handler = el[`on${type}`];
        if (handler) {
          listeners[type] = [{ listener: handler, useCapture: false }];
        }
      }
      return listeners;
    }
  }

  // Check for click-related events
  const listeners = getEventListeners(element);
  const hasClickListeners =
    listeners &&
    (listeners.click?.length > 0 ||
      listeners.mousedown?.length > 0 ||
      listeners.mouseup?.length > 0 ||
      listeners.touchstart?.length > 0 ||
      listeners.touchend?.length > 0);

  // ARIA state attributes imply a widget that reacts to input.
  const hasAriaProps =
    element.hasAttribute("aria-expanded") ||
    element.hasAttribute("aria-pressed") ||
    element.hasAttribute("aria-selected") ||
    element.hasAttribute("aria-checked");

  // Editable regions, including TinyMCE's conventions.
  const isContentEditable = element.getAttribute("contenteditable") === "true" ||
    element.isContentEditable ||
    element.id === "tinymce" ||
    element.classList.contains("mce-content-body") ||
    (element.tagName.toLowerCase() === "body" && element.getAttribute("data-id")?.startsWith("mce_"));

  // Check if element is draggable
  const isDraggable =
    element.draggable || element.getAttribute("draggable") === "true";

  return (
    hasAriaProps ||
    hasClickHandler ||
    hasClickListeners ||
    isDraggable ||
    isContentEditable
  );
}
+
/**
 * Checks if an element is the topmost element at its position, i.e. not
 * covered by another element (overlay, modal, sticky header, ...) at its
 * center point. Used to avoid highlighting elements that cannot actually
 * receive a click.
 */
function isTopElement(element) {
  const rect = getCachedBoundingRect(element);

  // If element is not in viewport, consider it top: elementFromPoint only
  // works for on-screen coordinates, so off-screen elements can't be tested.
  const isInViewport = (
    rect.left < window.innerWidth &&
    rect.right > 0 &&
    rect.top < window.innerHeight &&
    rect.bottom > 0
  );

  if (!isInViewport) {
    return true;
  }

  // Find the correct document context and root element
  let doc = element.ownerDocument;

  // If we're in an iframe, elements are considered top by default
  // (cross-document hit-testing is not attempted).
  if (doc !== window.document) {
    return true;
  }

  // For shadow DOM, we need to check within its own root context
  const shadowRoot = element.getRootNode();
  if (shadowRoot instanceof ShadowRoot) {
    const centerX = rect.left + rect.width / 2;
    const centerY = rect.top + rect.height / 2;

    try {
      // Hit-test inside the shadow root, then walk up from the hit element
      // to see whether `element` is on the hit chain.
      const topEl = measureDomOperation(
        () => shadowRoot.elementFromPoint(centerX, centerY),
        'elementFromPoint'
      );
      if (!topEl) return false;

      let current = topEl;
      while (current && current !== shadowRoot) {
        if (current === element) return true;
        current = current.parentElement;
      }
      return false;
    } catch (e) {
      // If hit-testing fails, keep the element rather than dropping it.
      return true;
    }
  }

  // For elements in viewport, check if they're topmost. The hit element may
  // be a descendant of `element`, so walk the ancestor chain upward.
  const centerX = rect.left + rect.width / 2;
  const centerY = rect.top + rect.height / 2;

  try {
    const topEl = document.elementFromPoint(centerX, centerY);
    if (!topEl) return false;

    let current = topEl;
    while (current && current !== document.documentElement) {
      if (current === element) return true;
      current = current.parentElement;
    }
    return false;
  } catch (e) {
    return true;
  }
}
+
/**
 * Whether the element's bounding rect intersects the viewport inflated by
 * `viewportExpansion` pixels on every side. -1 disables the check entirely.
 */
function isInExpandedViewport(element, viewportExpansion) {
  // -1 means "treat everything as in view".
  if (viewportExpansion === -1) return true;

  const rect = getCachedBoundingRect(element);

  const aboveViewport = rect.bottom < -viewportExpansion;
  const belowViewport = rect.top > window.innerHeight + viewportExpansion;
  const leftOfViewport = rect.right < -viewportExpansion;
  const rightOfViewport = rect.left > window.innerWidth + viewportExpansion;

  return !(aboveViewport || belowViewport || leftOfViewport || rightOfViewport);
}
+
// Accumulates the scroll offsets of every scrollable ancestor of `element`
// plus the window itself, timed under the 'scrollOperations' bucket.
function getEffectiveScroll(element) {
  return measureDomOperation(() => {
    let ancestor = element;
    let scrollX = 0;
    let scrollY = 0;

    // Sum scroll positions up to (but excluding) the document element.
    while (ancestor && ancestor !== document.documentElement) {
      if (ancestor.scrollLeft || ancestor.scrollTop) {
        scrollX += ancestor.scrollLeft;
        scrollY += ancestor.scrollTop;
      }
      ancestor = ancestor.parentElement;
    }

    // Finally add the window-level scroll.
    return {
      scrollX: scrollX + window.scrollX,
      scrollY: scrollY + window.scrollY,
    };
  }, 'scrollOperations');
}
+
// Fast pre-filter: is this element *likely* to be interactive?
// Cheap tag/attribute probes only — the full isInteractiveElement() check
// runs later for candidates. Used to decide whether attributes are collected.
function isInteractiveCandidate(element) {
  if (!element || element.nodeType !== Node.ELEMENT_NODE) return false;

  const tagName = element.tagName.toLowerCase();

  // Fast-path for natively interactive elements.
  const interactiveElements = new Set([
    "a", "button", "input", "select", "textarea", "details", "summary"
  ]);

  if (interactiveElements.has(tagName)) return true;

  // Quick attribute checks without full attribute-list scans.
  if (
    element.hasAttribute("onclick") ||
    element.hasAttribute("role") ||
    element.hasAttribute("tabindex") ||
    element.hasAttribute("data-action")
  ) {
    return true;
  }

  // BUGFIX: the original called hasAttribute("aria-"), which only matches an
  // attribute literally named "aria-" (which never exists) — the intent was
  // "any aria-* attribute", so scan the attribute names for that prefix.
  const attributeNames = element.getAttributeNames?.() || [];
  return attributeNames.some((name) => name.startsWith("aria-"));
}
+
// Cheap visibility pre-check that avoids the expensive getComputedStyle:
// layout size, the `hidden` attribute, and inline display/visibility only.
function quickVisibilityCheck(element) {
  if (element.offsetWidth <= 0 || element.offsetHeight <= 0) return false;
  if (element.hasAttribute("hidden")) return false;
  const inlineStyle = element.style;
  return inlineStyle.display !== "none" && inlineStyle.visibility !== "hidden";
}
+
/**
 * Recursively creates a node data object for a given node and its
 * descendants. Each processed node is stored in DOM_HASH_MAP under a fresh
 * numeric string id; the function returns that id, or null when the node is
 * filtered out. `parentIframe` is the nearest ancestor iframe element,
 * forwarded so highlights can be positioned in the correct frame.
 */
function buildDomTree(node, parentIframe = null) {
  if (debugMode) PERF_METRICS.nodeMetrics.totalNodes++;

  // Skip missing nodes and our own highlight overlay container.
  if (!node || node.id === HIGHLIGHT_CONTAINER_ID) {
    if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++;
    return null;
  }

  // Special handling for root node (body): always processed, fixed xpath,
  // and no visibility/interactivity flags are computed for it.
  if (node === document.body) {
    const nodeData = {
      tagName: 'body',
      attributes: {},
      xpath: '/body',
      children: [],
    };

    // Process children of body
    for (const child of node.childNodes) {
      const domElement = buildDomTree(child, parentIframe);
      if (domElement) nodeData.children.push(domElement);
    }

    const id = `${ID.current++}`;
    DOM_HASH_MAP[id] = nodeData;
    if (debugMode) PERF_METRICS.nodeMetrics.processedNodes++;
    return id;
  }

  // Early bailout for non-element nodes except text (comments, CDATA, ...).
  if (node.nodeType !== Node.ELEMENT_NODE && node.nodeType !== Node.TEXT_NODE) {
    if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++;
    return null;
  }

  // Process text nodes: keep only non-whitespace text with a usable parent.
  if (node.nodeType === Node.TEXT_NODE) {
    const textContent = node.textContent.trim();
    if (!textContent) {
      if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++;
      return null;
    }

    // Only check visibility for text nodes that might be visible
    const parentElement = node.parentElement;
    if (!parentElement || parentElement.tagName.toLowerCase() === 'script') {
      if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++;
      return null;
    }

    const id = `${ID.current++}`;
    DOM_HASH_MAP[id] = {
      type: "TEXT_NODE",
      text: textContent,
      isVisible: isTextNodeVisible(node),
    };
    if (debugMode) PERF_METRICS.nodeMetrics.processedNodes++;
    return id;
  }

  // Quick checks for element nodes (drops script/style/meta/etc.).
  if (node.nodeType === Node.ELEMENT_NODE && !isElementAccepted(node)) {
    if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++;
    return null;
  }

  // Early viewport check - only filter out elements clearly outside viewport.
  // Zero-size and fixed/sticky elements are kept: zero-size nodes may contain
  // visible descendants, and fixed/sticky nodes can appear anywhere.
  if (viewportExpansion !== -1) {
    const rect = getCachedBoundingRect(node);
    const style = getCachedComputedStyle(node);

    // Skip viewport check for fixed/sticky elements as they may appear anywhere
    const isFixedOrSticky = style && (style.position === 'fixed' || style.position === 'sticky');

    // Check if element has actual dimensions
    const hasSize = node.offsetWidth > 0 || node.offsetHeight > 0;

    if (!rect || (!isFixedOrSticky && !hasSize && (
      rect.bottom < -viewportExpansion ||
      rect.top > window.innerHeight + viewportExpansion ||
      rect.right < -viewportExpansion ||
      rect.left > window.innerWidth + viewportExpansion
    ))) {
      if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++;
      return null;
    }
  }

  // Process element node
  const nodeData = {
    tagName: node.tagName.toLowerCase(),
    attributes: {},
    xpath: getXPathTree(node, true),
    children: [],
  };

  // Get attributes for interactive elements or potential text containers;
  // other elements keep an empty attributes map to save work.
  if (isInteractiveCandidate(node) || node.tagName.toLowerCase() === 'iframe' || node.tagName.toLowerCase() === 'body') {
    const attributeNames = node.getAttributeNames?.() || [];
    for (const name of attributeNames) {
      nodeData.attributes[name] = node.getAttribute(name);
    }
  }

  // Check interactivity. Each flag is computed only when the previous one
  // holds (visible -> topmost -> interactive), avoiding expensive DOM work
  // for elements that already failed a cheaper check.
  if (node.nodeType === Node.ELEMENT_NODE) {
    nodeData.isVisible = isElementVisible(node);
    if (nodeData.isVisible) {
      nodeData.isTopElement = isTopElement(node);
      if (nodeData.isTopElement) {
        nodeData.isInteractive = isInteractiveElement(node);
        if (nodeData.isInteractive) {
          nodeData.isInViewport = true;
          nodeData.highlightIndex = highlightIndex++;

          if (doHighlightElements) {
            // When a focus index is set, only that one element is highlighted.
            if (focusHighlightIndex >= 0) {
              if (focusHighlightIndex === nodeData.highlightIndex) {
                highlightElement(node, nodeData.highlightIndex, parentIframe);
              }
            } else {
              highlightElement(node, nodeData.highlightIndex, parentIframe);
            }
          }
        }
      }
    }
  }

  // Process children, with special handling for iframes and rich text editors
  if (node.tagName) {
    const tagName = node.tagName.toLowerCase();

    // Handle iframes: descend into the frame document. Cross-origin frames
    // throw on access and are skipped with a warning. Note the iframe itself
    // becomes the `parentIframe` for its children.
    if (tagName === "iframe") {
      try {
        const iframeDoc = node.contentDocument || node.contentWindow?.document;
        if (iframeDoc) {
          for (const child of iframeDoc.childNodes) {
            const domElement = buildDomTree(child, node);
            if (domElement) nodeData.children.push(domElement);
          }
        }
      } catch (e) {
        console.warn("Unable to access iframe:", e);
      }
    }
    // Handle rich text editors and contenteditable elements (incl. TinyMCE).
    else if (
      node.isContentEditable ||
      node.getAttribute("contenteditable") === "true" ||
      node.id === "tinymce" ||
      node.classList.contains("mce-content-body") ||
      (tagName === "body" && node.getAttribute("data-id")?.startsWith("mce_"))
    ) {
      // Process all child nodes to capture formatted text
      for (const child of node.childNodes) {
        const domElement = buildDomTree(child, parentIframe);
        if (domElement) nodeData.children.push(domElement);
      }
    }
    // Handle shadow DOM: descend into the shadow root and mark the host.
    else if (node.shadowRoot) {
      nodeData.shadowRoot = true;
      for (const child of node.shadowRoot.childNodes) {
        const domElement = buildDomTree(child, parentIframe);
        if (domElement) nodeData.children.push(domElement);
      }
    }
    // Handle regular elements
    else {
      for (const child of node.childNodes) {
        const domElement = buildDomTree(child, parentIframe);
        if (domElement) nodeData.children.push(domElement);
      }
    }
  }

  // Skip empty anchor tags: no children and no href means nothing to do.
  if (nodeData.tagName === 'a' && nodeData.children.length === 0 && !nodeData.attributes.href) {
    if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++;
    return null;
  }

  const id = `${ID.current++}`;
  DOM_HASH_MAP[id] = nodeData;
  if (debugMode) PERF_METRICS.nodeMetrics.processedNodes++;
  return id;
}
+
// After all functions are defined, wrap them with performance measurement.
// buildDomTree itself is excluded here because it is measured separately
// (per-call breakdown in PERF_METRICS.buildDomTreeBreakdown).
highlightElement = measureTime(highlightElement);
isInteractiveElement = measureTime(isInteractiveElement);
isElementVisible = measureTime(isElementVisible);
isTopElement = measureTime(isTopElement);
isInExpandedViewport = measureTime(isInExpandedViewport);
isTextNodeVisible = measureTime(isTextNodeVisible);
getEffectiveScroll = measureTime(getEffectiveScroll);

// Walk the whole document starting at <body>.
const rootId = buildDomTree(document.body);

// Drop the per-run caches (bounding rects / computed styles) now that the
// traversal is finished, so stale entries never leak into a later run.
DOM_CACHE.clearCache();

// Only process metrics in debug mode
if (debugMode && PERF_METRICS) {
  // Convert timings from milliseconds to seconds.
  Object.keys(PERF_METRICS.timings).forEach(key => {
    PERF_METRICS.timings[key] = PERF_METRICS.timings[key] / 1000;
  });

  Object.keys(PERF_METRICS.buildDomTreeBreakdown).forEach(key => {
    if (typeof PERF_METRICS.buildDomTreeBreakdown[key] === 'number') {
      PERF_METRICS.buildDomTreeBreakdown[key] = PERF_METRICS.buildDomTreeBreakdown[key] / 1000;
    }
  });

  // Add some useful derived metrics
  if (PERF_METRICS.buildDomTreeBreakdown.buildDomTreeCalls > 0) {
    PERF_METRICS.buildDomTreeBreakdown.averageTimePerNode =
      PERF_METRICS.buildDomTreeBreakdown.totalTime / PERF_METRICS.buildDomTreeBreakdown.buildDomTreeCalls;
  }

  // Recursive-call time = total minus the time spent in each call's own body.
  PERF_METRICS.buildDomTreeBreakdown.timeInChildCalls =
    PERF_METRICS.buildDomTreeBreakdown.totalTime - PERF_METRICS.buildDomTreeBreakdown.totalSelfTime;

  // Add average time per operation to the metrics
  Object.keys(PERF_METRICS.buildDomTreeBreakdown.domOperations).forEach(op => {
    const time = PERF_METRICS.buildDomTreeBreakdown.domOperations[op];
    const count = PERF_METRICS.buildDomTreeBreakdown.domOperationCounts[op];
    if (count > 0) {
      PERF_METRICS.buildDomTreeBreakdown.domOperations[`${op}Average`] = time / count;
    }
  });

  // Calculate cache hit rates
  const boundingRectTotal = PERF_METRICS.cacheMetrics.boundingRectCacheHits + PERF_METRICS.cacheMetrics.boundingRectCacheMisses;
  const computedStyleTotal = PERF_METRICS.cacheMetrics.computedStyleCacheHits + PERF_METRICS.cacheMetrics.computedStyleCacheMisses;

  if (boundingRectTotal > 0) {
    PERF_METRICS.cacheMetrics.boundingRectHitRate = PERF_METRICS.cacheMetrics.boundingRectCacheHits / boundingRectTotal;
  }

  if (computedStyleTotal > 0) {
    PERF_METRICS.cacheMetrics.computedStyleHitRate = PERF_METRICS.cacheMetrics.computedStyleCacheHits / computedStyleTotal;
  }

  if ((boundingRectTotal + computedStyleTotal) > 0) {
    PERF_METRICS.cacheMetrics.overallHitRate =
      (PERF_METRICS.cacheMetrics.boundingRectCacheHits + PERF_METRICS.cacheMetrics.computedStyleCacheHits) /
      (boundingRectTotal + computedStyleTotal);
  }
}

// The perf metrics are only attached when the caller asked for debug output.
return debugMode ?
  { rootId, map: DOM_HASH_MAP, perfMetrics: PERF_METRICS } :
  { rootId, map: DOM_HASH_MAP };
+};
diff --git a/browser_use/dom/history_tree_processor/service.py b/browser_use/dom/history_tree_processor/service.py
new file mode 100644
index 0000000000000000000000000000000000000000..fee43125c2786d01114b94f5bf6643376de4c2be
--- /dev/null
+++ b/browser_use/dom/history_tree_processor/service.py
@@ -0,0 +1,107 @@
+import hashlib
+from typing import Optional
+
+from browser_use.dom.history_tree_processor.view import DOMHistoryElement, HashedDomElement
+from browser_use.dom.views import DOMElementNode
+
+
class HistoryTreeProcessor:
	"""
	Operations on DOM elements for persisting them and re-locating them across page states.

	@dev be careful - text nodes can change even if elements stay the same
	"""

	@staticmethod
	def convert_dom_element_to_history_element(dom_element: DOMElementNode) -> DOMHistoryElement:
		"""Snapshot a live DOM node into a serializable DOMHistoryElement."""
		# Local import to avoid a circular dependency with browser.context.
		from browser_use.browser.context import BrowserContext

		parent_branch_path = HistoryTreeProcessor._get_parent_branch_path(dom_element)
		css_selector = BrowserContext._enhanced_css_selector_for_element(dom_element)
		return DOMHistoryElement(
			dom_element.tag_name,
			dom_element.xpath,
			dom_element.highlight_index,
			parent_branch_path,
			dom_element.attributes,
			dom_element.shadow_root,
			css_selector=css_selector,
			page_coordinates=dom_element.page_coordinates,
			viewport_coordinates=dom_element.viewport_coordinates,
			viewport_info=dom_element.viewport_info,
		)

	@staticmethod
	def find_history_element_in_tree(dom_history_element: DOMHistoryElement, tree: DOMElementNode) -> Optional[DOMElementNode]:
		"""Depth-first search of `tree` for the node whose hash matches the history element."""
		hashed_dom_history_element = HistoryTreeProcessor._hash_dom_history_element(dom_history_element)

		def process_node(node: DOMElementNode) -> Optional[DOMElementNode]:
			# Only highlighted (interactive) nodes are hashed and compared.
			if node.highlight_index is not None:
				hashed_node = HistoryTreeProcessor._hash_dom_element(node)
				if hashed_node == hashed_dom_history_element:
					return node
			for child in node.children:
				if isinstance(child, DOMElementNode):
					result = process_node(child)
					if result is not None:
						return result
			return None

		return process_node(tree)

	@staticmethod
	def compare_history_element_and_dom_element(dom_history_element: DOMHistoryElement, dom_element: DOMElementNode) -> bool:
		"""True when the history element and the live DOM element hash to the same identity."""
		hashed_dom_history_element = HistoryTreeProcessor._hash_dom_history_element(dom_history_element)
		hashed_dom_element = HistoryTreeProcessor._hash_dom_element(dom_element)

		return hashed_dom_history_element == hashed_dom_element

	@staticmethod
	def _hash_dom_history_element(dom_history_element: DOMHistoryElement) -> HashedDomElement:
		"""Hash a stored history element over (branch path, attributes, xpath)."""
		branch_path_hash = HistoryTreeProcessor._parent_branch_path_hash(dom_history_element.entire_parent_branch_path)
		attributes_hash = HistoryTreeProcessor._attributes_hash(dom_history_element.attributes)
		xpath_hash = HistoryTreeProcessor._xpath_hash(dom_history_element.xpath)

		return HashedDomElement(branch_path_hash, attributes_hash, xpath_hash)

	@staticmethod
	def _hash_dom_element(dom_element: DOMElementNode) -> HashedDomElement:
		"""Hash a live DOM element the same way as _hash_dom_history_element so they compare equal."""
		parent_branch_path = HistoryTreeProcessor._get_parent_branch_path(dom_element)
		branch_path_hash = HistoryTreeProcessor._parent_branch_path_hash(parent_branch_path)
		attributes_hash = HistoryTreeProcessor._attributes_hash(dom_element.attributes)
		xpath_hash = HistoryTreeProcessor._xpath_hash(dom_element.xpath)
		# text_hash = DomTreeProcessor._text_hash(dom_element)

		return HashedDomElement(branch_path_hash, attributes_hash, xpath_hash)

	@staticmethod
	def _get_parent_branch_path(dom_element: DOMElementNode) -> list[str]:
		"""Tag names from the topmost ancestor (root excluded) down to the element itself."""
		parents: list[DOMElementNode] = []
		current_element: DOMElementNode = dom_element
		while current_element.parent is not None:
			parents.append(current_element)
			current_element = current_element.parent

		parents.reverse()

		return [parent.tag_name for parent in parents]

	@staticmethod
	def _parent_branch_path_hash(parent_branch_path: list[str]) -> str:
		"""sha256 of the '/'-joined branch path."""
		parent_branch_path_string = '/'.join(parent_branch_path)
		return hashlib.sha256(parent_branch_path_string.encode()).hexdigest()

	@staticmethod
	def _attributes_hash(attributes: dict[str, str]) -> str:
		"""sha256 over concatenated key=value pairs (sensitive to dict insertion order)."""
		attributes_string = ''.join(f'{key}={value}' for key, value in attributes.items())
		return hashlib.sha256(attributes_string.encode()).hexdigest()

	@staticmethod
	def _xpath_hash(xpath: str) -> str:
		"""sha256 of the raw xpath string."""
		return hashlib.sha256(xpath.encode()).hexdigest()

	@staticmethod
	def _text_hash(dom_element: DOMElementNode) -> str:
		"""sha256 of the element's visible text (currently unused — see _hash_dom_element)."""
		text_string = dom_element.get_all_text_till_next_clickable_element()
		return hashlib.sha256(text_string.encode()).hexdigest()
diff --git a/browser_use/dom/history_tree_processor/view.py b/browser_use/dom/history_tree_processor/view.py
new file mode 100644
index 0000000000000000000000000000000000000000..e970ad5b53af7f340d93f2f21773a7651fd3a8d7
--- /dev/null
+++ b/browser_use/dom/history_tree_processor/view.py
@@ -0,0 +1,70 @@
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+
+from pydantic import BaseModel
+
+
@dataclass
class HashedDomElement:
	"""
	Hash of the dom element to be used as a unique identifier
	"""

	branch_path_hash: str  # sha256 of the '/'-joined ancestor tag names
	attributes_hash: str  # sha256 of the concatenated key=value attribute pairs
	xpath_hash: str  # sha256 of the element's xpath
	# text_hash: str
+
+
class Coordinates(BaseModel):
	"""A single point; presumably CSS pixels in page/viewport space — TODO confirm."""

	x: int
	y: int
+
+
class CoordinateSet(BaseModel):
	"""Bounding box of an element: four corners, center point, and size."""

	top_left: Coordinates
	top_right: Coordinates
	bottom_left: Coordinates
	bottom_right: Coordinates
	center: Coordinates
	width: int
	height: int
+
+
class ViewportInfo(BaseModel):
	"""Viewport scroll position and size at capture time.

	NOTE(review): a distinct dataclass also named ViewportInfo (width/height
	only) exists in browser_use/dom/service.py — verify the duplication is
	intentional.
	"""

	scroll_x: int
	scroll_y: int
	width: int
	height: int
+
+
@dataclass
class DOMHistoryElement:
	"""Serializable snapshot of a DOM element as captured into agent history."""

	tag_name: str
	xpath: str
	highlight_index: Optional[int]
	entire_parent_branch_path: list[str]
	attributes: dict[str, str]
	shadow_root: bool = False
	css_selector: Optional[str] = None
	page_coordinates: Optional[CoordinateSet] = None
	viewport_coordinates: Optional[CoordinateSet] = None
	viewport_info: Optional[ViewportInfo] = None

	def to_dict(self) -> dict:
		"""Return a plain-dict representation; nested pydantic models are dumped."""

		def dump(model):
			# Pydantic models serialize via model_dump(); absent values stay None.
			return model.model_dump() if model else None

		return {
			'tag_name': self.tag_name,
			'xpath': self.xpath,
			'highlight_index': self.highlight_index,
			'entire_parent_branch_path': self.entire_parent_branch_path,
			'attributes': self.attributes,
			'shadow_root': self.shadow_root,
			'css_selector': self.css_selector,
			'page_coordinates': dump(self.page_coordinates),
			'viewport_coordinates': dump(self.viewport_coordinates),
			'viewport_info': dump(self.viewport_info),
		}
diff --git a/browser_use/dom/service.py b/browser_use/dom/service.py
new file mode 100644
index 0000000000000000000000000000000000000000..d03fbecfbf5abdc5db805d67764afdcc3868de87
--- /dev/null
+++ b/browser_use/dom/service.py
@@ -0,0 +1,169 @@
+import gc
+import json
+import logging
+from dataclasses import dataclass
+from importlib import resources
+from typing import TYPE_CHECKING, Optional
+
+if TYPE_CHECKING:
+ from playwright.async_api import Page
+
+from browser_use.dom.views import (
+ DOMBaseNode,
+ DOMElementNode,
+ DOMState,
+ DOMTextNode,
+ SelectorMap,
+)
+from browser_use.utils import time_execution_async
+
+logger = logging.getLogger(__name__)
+
+
@dataclass
class ViewportInfo:
	# Viewport dimensions passed around by DomService.
	# NOTE(review): a pydantic model also named ViewportInfo (with scroll
	# fields) exists in dom/history_tree_processor/view.py — verify the
	# name collision is intentional.
	width: int
	height: int
+
+
class DomService:
	"""Builds a Python-side DOM tree of a Playwright page by running buildDomTree.js in it."""

	def __init__(self, page: 'Page'):
		self.page = page
		self.xpath_cache = {}

		# Load the JS payload shipped with the package. `resources.files` is
		# the non-deprecated replacement for `resources.read_text` (which was
		# deprecated in Python 3.11).
		self.js_code = resources.files('browser_use.dom').joinpath('buildDomTree.js').read_text(encoding='utf-8')

	# region - Clickable elements
	@time_execution_async('--get_clickable_elements')
	async def get_clickable_elements(
		self,
		highlight_elements: bool = True,
		focus_element: int = -1,
		viewport_expansion: int = 0,
	) -> DOMState:
		"""Return the page's element tree plus a highlight-index -> node selector map.

		Args:
			highlight_elements: draw highlight overlays in the page.
			focus_element: highlight only this index (-1 highlights all).
			viewport_expansion: pixels to inflate the viewport by when deciding
				which elements count as visible (-1 disables the filter).
		"""
		element_tree, selector_map = await self._build_dom_tree(highlight_elements, focus_element, viewport_expansion)
		return DOMState(element_tree=element_tree, selector_map=selector_map)

	@time_execution_async('--build_dom_tree')
	async def _build_dom_tree(
		self,
		highlight_elements: bool,
		focus_element: int,
		viewport_expansion: int,
	) -> tuple[DOMElementNode, SelectorMap]:
		"""Execute buildDomTree.js in the page and convert its output to Python nodes."""
		# Sanity check that the page can evaluate JS at all before shipping
		# the (large) DOM-walking payload.
		if await self.page.evaluate('1+1') != 2:
			raise ValueError('The page cannot evaluate javascript code properly')

		# NOTE: We execute JS code in the browser to extract important DOM information.
		# The returned hash map contains information about the DOM tree and the
		# relationship between the DOM elements.
		debug_mode = logger.getEffectiveLevel() == logging.DEBUG
		args = {
			'doHighlightElements': highlight_elements,
			'focusHighlightIndex': focus_element,
			'viewportExpansion': viewport_expansion,
			'debugMode': debug_mode,
		}

		try:
			eval_page = await self.page.evaluate(self.js_code, args)
		except Exception as e:
			logger.error('Error evaluating JavaScript: %s', e)
			raise

		# Only log performance metrics in debug mode (the JS side gathers
		# them only when debugMode is set).
		if debug_mode and 'perfMetrics' in eval_page:
			logger.debug('DOM Tree Building Performance Metrics:\n%s', json.dumps(eval_page['perfMetrics'], indent=2))

		return await self._construct_dom_tree(eval_page)

	@time_execution_async('--construct_dom_tree')
	async def _construct_dom_tree(
		self,
		eval_page: dict,
	) -> tuple[DOMElementNode, SelectorMap]:
		"""Rebuild the node tree from the flat id -> node-data map returned by JS.

		Raises:
			ValueError: when the root node is missing or is not an element node.
		"""
		js_node_map = eval_page['map']
		js_root_id = eval_page['rootId']

		selector_map = {}
		node_map = {}

		# Renamed loop variable from `id` to avoid shadowing the builtin.
		for node_id, node_data in js_node_map.items():
			node, children_ids = self._parse_node(node_data)
			if node is None:
				continue

			node_map[node_id] = node

			if isinstance(node, DOMElementNode) and node.highlight_index is not None:
				selector_map[node.highlight_index] = node

			# NOTE: We know that we are building the tree bottom up
			# and all children are already processed.
			if isinstance(node, DOMElementNode):
				for child_id in children_ids:
					if child_id not in node_map:
						continue

					child_node = node_map[child_id]

					child_node.parent = node
					node.children.append(child_node)

		# Use .get so a missing/filtered root triggers the explicit ValueError
		# below instead of an opaque KeyError.
		root_node = node_map.get(str(js_root_id))

		del node_map
		del js_node_map
		del js_root_id

		# The intermediate maps can be large on big pages; reclaim eagerly.
		gc.collect()

		if root_node is None or not isinstance(root_node, DOMElementNode):
			raise ValueError('Failed to parse HTML to dictionary')

		return root_node, selector_map

	def _parse_node(
		self,
		node_data: dict,
	) -> tuple[Optional[DOMBaseNode], list[str]]:
		"""Convert one JS node-data dict into a DOM node.

		Returns the node (or None for empty input) and the list of child ids —
		string keys into the JS node map (the annotation was previously
		`list[int]`, but the JS side emits string ids). Text nodes have no
		children.
		"""
		if not node_data:
			return None, []

		# Process text nodes immediately
		if node_data.get('type') == 'TEXT_NODE':
			text_node = DOMTextNode(
				text=node_data['text'],
				is_visible=node_data['isVisible'],
				parent=None,
			)
			return text_node, []

		# Process viewport metadata if present for element nodes
		viewport_info = None

		if 'viewport' in node_data:
			viewport_info = ViewportInfo(
				width=node_data['viewport']['width'],
				height=node_data['viewport']['height'],
			)

		element_node = DOMElementNode(
			tag_name=node_data['tagName'],
			xpath=node_data['xpath'],
			attributes=node_data.get('attributes', {}),
			children=[],
			is_visible=node_data.get('isVisible', False),
			is_interactive=node_data.get('isInteractive', False),
			is_top_element=node_data.get('isTopElement', False),
			is_in_viewport=node_data.get('isInViewport', False),
			highlight_index=node_data.get('highlightIndex'),
			shadow_root=node_data.get('shadowRoot', False),
			parent=None,
			viewport_info=viewport_info,
		)

		children_ids = node_data.get('children', [])

		return element_node, children_ids
diff --git a/browser_use/dom/tests/debug_page_structure.py b/browser_use/dom/tests/debug_page_structure.py
new file mode 100644
index 0000000000000000000000000000000000000000..49e99bbc2599a6280bab6dd156a4639938efc8ff
--- /dev/null
+++ b/browser_use/dom/tests/debug_page_structure.py
@@ -0,0 +1,123 @@
+import asyncio
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext
+
+
+async def analyze_page_structure(url: str):
+ """Analyze and print the structure of a webpage with enhanced debugging.
+
+ Opens a (non-headless) browser, reports viewport metrics, dumps cookie/consent
+ overlays and fixed/sticky elements, then prints the processed page structure.
+ Blocks on input() at the end so the browser can be inspected manually.
+ """
+ browser = Browser(
+ config=BrowserConfig(
+ headless=False, # Set to True if you don't need to see the browser
+ )
+ )
+
+ context = BrowserContext(browser=browser)
+
+ try:
+ async with context as ctx:
+ # Navigate to the URL
+ page = await ctx.get_current_page()
+ await page.goto(url)
+ await page.wait_for_load_state('networkidle')
+
+ # Get viewport dimensions
+ viewport_info = await page.evaluate("""() => {
+ return {
+ viewport: {
+ width: window.innerWidth,
+ height: window.innerHeight,
+ scrollX: window.scrollX,
+ scrollY: window.scrollY
+ }
+ }
+ }""")
+
+ print('\nViewport Information:')
+ print(f'Width: {viewport_info["viewport"]["width"]}')
+ print(f'Height: {viewport_info["viewport"]["height"]}')
+ print(f'ScrollX: {viewport_info["viewport"]["scrollX"]}')
+ print(f'ScrollY: {viewport_info["viewport"]["scrollY"]}')
+
+ # Enhanced debug information for cookie consent and fixed position elements
+ debug_info = await page.evaluate("""() => {
+ function getElementInfo(element) {
+ const rect = element.getBoundingClientRect();
+ const style = window.getComputedStyle(element);
+ return {
+ tag: element.tagName.toLowerCase(),
+ id: element.id,
+ className: element.className,
+ position: style.position,
+ rect: {
+ top: rect.top,
+ right: rect.right,
+ bottom: rect.bottom,
+ left: rect.left,
+ width: rect.width,
+ height: rect.height
+ },
+ isFixed: style.position === 'fixed',
+ isSticky: style.position === 'sticky',
+ zIndex: style.zIndex,
+ visibility: style.visibility,
+ display: style.display,
+ opacity: style.opacity
+ };
+ }
+
+ // Find cookie-related elements
+ const cookieElements = Array.from(document.querySelectorAll('[id*="cookie"], [id*="consent"], [class*="cookie"], [class*="consent"]'));
+ const fixedElements = Array.from(document.querySelectorAll('*')).filter(el => {
+ const style = window.getComputedStyle(el);
+ return style.position === 'fixed' || style.position === 'sticky';
+ });
+
+ return {
+ cookieElements: cookieElements.map(getElementInfo),
+ fixedElements: fixedElements.map(getElementInfo)
+ };
+ }""")
+
+ print('\nCookie-related Elements:')
+ for elem in debug_info['cookieElements']:
+ print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}')
+ print(f'Position: {elem["position"]}')
+ print(f'Rect: {elem["rect"]}')
+ print(f'Z-Index: {elem["zIndex"]}')
+ print(f'Visibility: {elem["visibility"]}')
+ print(f'Display: {elem["display"]}')
+ print(f'Opacity: {elem["opacity"]}')
+
+ print('\nFixed/Sticky Position Elements:')
+ for elem in debug_info['fixedElements']:
+ print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}')
+ print(f'Position: {elem["position"]}')
+ print(f'Rect: {elem["rect"]}')
+ print(f'Z-Index: {elem["zIndex"]}')
+
+ print(f'\nPage Structure for {url}:\n')
+ structure = await ctx.get_page_structure()
+ print(structure)
+
+ input('Press Enter to close the browser...')
+ finally:
+ # Always release the browser, even if analysis fails midway.
+ await browser.close()
+
+
+if __name__ == '__main__':
+ # You can modify this URL to analyze different pages
+
+ urls = [
+ 'https://www.mlb.com/yankees/stats/',
+ 'https://immobilienscout24.de',
+ 'https://www.zeiss.com/career/en/job-search.html?page=1',
+ 'https://www.zeiss.com/career/en/job-search.html?page=1',
+ 'https://reddit.com',
+ ]
+ for url in urls:
+ asyncio.run(analyze_page_structure(url))
diff --git a/browser_use/dom/tests/extraction_test.py b/browser_use/dom/tests/extraction_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a4dce638c820a213cbcb43ddd7ab42ef533afec
--- /dev/null
+++ b/browser_use/dom/tests/extraction_test.py
@@ -0,0 +1,147 @@
+import asyncio
+import time
+
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext, BrowserContextConfig
+from browser_use.dom.service import DomService
+from browser_use.utils import time_execution_sync
+
+
+async def test_process_html_file():
+ """Sweep viewport_expansion values across several sites and compare element/token counts."""
+ config = BrowserContextConfig(
+ cookies_file='cookies3.json',
+ disable_security=True,
+ wait_for_network_idle_page_load_time=2,
+ )
+
+ browser = Browser(
+ config=BrowserConfig(
+ # chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
+ )
+ )
+ context = BrowserContext(browser=browser, config=config) # noqa: F821
+
+ websites = [
+ 'https://kayak.com/flights',
+ 'https://immobilienscout24.de',
+ 'https://google.com',
+ 'https://amazon.com',
+ 'https://github.com',
+ ]
+
+ async with context as context:
+ page = await context.get_current_page()
+ dom_service = DomService(page)
+
+ for website in websites:
+ print(f'\n{"=" * 50}\nTesting {website}\n{"=" * 50}')
+ await page.goto(website)
+ # NOTE(review): time.sleep blocks the event loop inside async code;
+ # asyncio.sleep was presumably intended — confirm.
+ time.sleep(2) # Additional wait for dynamic content
+
+ async def test_viewport(expansion: int, description: str):
+ # Collect element count and token count for one viewport_expansion setting.
+ print(f'\n{description}:')
+ dom_state = await time_execution_sync(f'get_clickable_elements ({description})')(
+ dom_service.get_clickable_elements
+ )(highlight_elements=True, viewport_expansion=expansion)
+
+ elements = dom_state.element_tree
+ selector_map = dom_state.selector_map
+ element_count = len(selector_map.keys())
+ # NOTE(review): count_string_tokens is never imported in this file —
+ # as written this raises NameError at runtime; confirm the intended import.
+ token_count = count_string_tokens(elements.clickable_elements_to_string(), model='gpt-4o')
+
+ print(f'Number of elements: {element_count}')
+ print(f'Token count: {token_count}')
+ return element_count, token_count
+
+ expansions = [0, 100, 200, 300, 400, 500, 600, 1000, -1, -200]
+ results = []
+
+ for i, expansion in enumerate(expansions):
+ description = (
+ f'{i + 1}. Expansion {expansion}px' if expansion >= 0 else f'{i + 1}. All elements ({expansion} expansion)'
+ )
+ count, tokens = await test_viewport(expansion, description)
+ results.append((count, tokens))
+ input('Press Enter to continue...')
+ await page.evaluate('document.getElementById("playwright-highlight-container")?.remove()')
+
+ # Print comparison summary
+ print('\nComparison Summary:')
+ for i, (count, tokens) in enumerate(results):
+ expansion = expansions[i]
+ description = f'Expansion {expansion}px' if expansion >= 0 else 'All elements (-1)'
+ initial_count, initial_tokens = results[0]
+ print(f'{description}: {count} elements (+{count - initial_count}), {tokens} tokens')
+
+ input('\nPress Enter to continue to next website...')
+
+ # Clear highlights before next website
+ await page.evaluate('document.getElementById("playwright-highlight-container")?.remove()')
+
+
+async def test_focus_vs_all_elements():
+ config = BrowserContextConfig(
+ cookies_file='cookies3.json',
+ disable_security=True,
+ wait_for_network_idle_page_load_time=2,
+ )
+
+ browser = Browser(
+ config=BrowserConfig(
+ # chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
+ )
+ )
+ context = BrowserContext(browser=browser, config=config) # noqa: F821
+
+ websites = [
+ 'https://immobilienscout24.de',
+ 'https://www.zeiss.com/career/en/job-search.html?page=1',
+ 'https://www.mlb.com/yankees/stats/',
+ 'https://www.amazon.com/s?k=laptop&s=review-rank&crid=1RZCEJ289EUSI&qid=1740202453&sprefix=laptop%2Caps%2C166&ref=sr_st_review-rank&ds=v1%3A4EnYKXVQA7DIE41qCvRZoNB4qN92Jlztd3BPsTFXmxU',
+ 'https://codepen.io/geheimschriftstift/pen/mPLvQz',
+ 'https://reddit.com',
+ 'https://www.google.com/search?q=google+hi&oq=google+hi&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIGCAEQRRhA0gEIMjI2NmowajSoAgCwAgE&sourceid=chrome&ie=UTF-8',
+ 'https://kayak.com/flights',
+ 'https://google.com',
+ 'https://amazon.com',
+ 'https://github.com',
+ ]
+
+ async with context as context:
+ page = await context.get_current_page()
+ dom_service = DomService(page)
+
+ for website in websites:
+ # sleep 2
+ await page.goto(website)
+ time.sleep(2)
+
+ while True:
+ try:
+ print(f'\n{"=" * 50}\nTesting {website}\n{"=" * 50}')
+ # time.sleep(2) # Additional wait for dynamic content
+
+ # First get all elements
+ print('\nGetting all elements:')
+ all_elements_state = await time_execution_sync('get_all_elements')(dom_service.get_clickable_elements)(
+ highlight_elements=True, viewport_expansion=100
+ )
+
+ selector_map = all_elements_state.selector_map
+ total_elements = len(selector_map.keys())
+ print(f'Total number of elements: {total_elements}')
+
+ answer = input('Press Enter to clear highlights and continue...')
+ if answer == 'q':
+ break
+
+ await page.evaluate('document.getElementById("playwright-highlight-container")?.remove()')
+
+ except Exception as e:
+ print(f'Error: {e}')
+ pass
+
+
+if __name__ == '__main__':
+ # Run the interactive focus test first, then the viewport-expansion sweep.
+ asyncio.run(test_focus_vs_all_elements())
+ asyncio.run(test_process_html_file())
diff --git a/browser_use/dom/tests/process_dom_test.py b/browser_use/dom/tests/process_dom_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..39bd2432885c1b961da9086de70dd658e5a1f840
--- /dev/null
+++ b/browser_use/dom/tests/process_dom_test.py
@@ -0,0 +1,40 @@
+import json
+import os
+import time
+
+from browser_use.browser.browser import Browser, BrowserConfig
+
+
+async def test_process_dom():
+ """Time the raw buildDomTree.js evaluation on a live page and dump the result to ./tmp/dom.json.
+
+ NOTE(review): this coroutine has no `__main__` runner in this file — presumably
+ executed via an async-aware test runner; confirm.
+ """
+ browser = Browser(config=BrowserConfig(headless=False))
+
+ async with await browser.new_context() as context:
+ page = await context.get_current_page()
+ await page.goto('https://kayak.com/flights')
+ # await page.goto('https://google.com/flights')
+ # await page.goto('https://immobilienscout24.de')
+ # await page.goto('https://seleniumbase.io/w3schools/iframes')
+
+ # NOTE(review): time.sleep blocks the event loop; asyncio.sleep is presumably intended here.
+ time.sleep(3)
+
+ with open('browser_use/dom/buildDomTree.js', 'r') as f:
+ js_code = f.read()
+
+ start = time.time()
+ dom_tree = await page.evaluate(js_code)
+ end = time.time()
+
+ # print(dom_tree)
+ print(f'Time: {end - start:.2f}s')
+
+ os.makedirs('./tmp', exist_ok=True)
+ with open('./tmp/dom.json', 'w') as f:
+ json.dump(dom_tree, f, indent=1)
+
+ # both of these work for immobilienscout24.de
+ # await page.click('.sc-dcJsrY.ezjNCe')
+ # await page.click(
+ # 'div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div > div > button:nth-of-type(2)'
+ # )
+
+ input('Press Enter to continue...')
diff --git a/browser_use/dom/views.py b/browser_use/dom/views.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0acd9f86a7fe3f2e5b165ab4b904be27261c267
--- /dev/null
+++ b/browser_use/dom/views.py
@@ -0,0 +1,196 @@
+from dataclasses import dataclass
+from functools import cached_property
+from typing import TYPE_CHECKING, Dict, List, Optional
+
+from browser_use.dom.history_tree_processor.view import CoordinateSet, HashedDomElement, ViewportInfo
+from browser_use.utils import time_execution_sync
+
+# Avoid circular import issues
+if TYPE_CHECKING:
+ from .views import DOMElementNode
+
+
+@dataclass(frozen=False)
+class DOMBaseNode:
+ """Common base for DOM tree nodes (element and text nodes)."""
+
+ is_visible: bool
+ # Use None as default and set parent later to avoid circular reference issues
+ parent: Optional['DOMElementNode']
+
+
+@dataclass(frozen=False)
+class DOMTextNode(DOMBaseNode):
+ """Leaf text node; viewport/top-element context is derived from its parent element."""
+
+ text: str
+ type: str = 'TEXT_NODE'
+
+ def has_parent_with_highlight_index(self) -> bool:
+ """Return True if any ancestor element carries a highlight index."""
+ current = self.parent
+ while current is not None:
+ # stop if the element has a highlight index (will be handled separately)
+ if current.highlight_index is not None:
+ return True
+
+ current = current.parent
+ return False
+
+ def is_parent_in_viewport(self) -> bool:
+ # Text without a parent element cannot be in the viewport.
+ if self.parent is None:
+ return False
+ return self.parent.is_in_viewport
+
+ def is_parent_top_element(self) -> bool:
+ if self.parent is None:
+ return False
+ return self.parent.is_top_element
+
+
+@dataclass(frozen=False)
+class DOMElementNode(DOMBaseNode):
+ """
+ xpath: the xpath of the element from the last root node (shadow root or iframe OR document if no shadow root or iframe).
+ To properly reference the element we need to recursively switch the root node until we find the element (work you way up the tree with `.parent`)
+ """
+
+ tag_name: str
+ xpath: str
+ attributes: Dict[str, str]
+ children: List[DOMBaseNode]
+ is_interactive: bool = False
+ is_top_element: bool = False
+ is_in_viewport: bool = False
+ shadow_root: bool = False
+ highlight_index: Optional[int] = None
+ viewport_coordinates: Optional[CoordinateSet] = None
+ page_coordinates: Optional[CoordinateSet] = None
+ viewport_info: Optional[ViewportInfo] = None
+
+ def __repr__(self) -> str:
+ # Compact HTML-ish representation plus bracketed state flags, for debugging.
+ tag_str = f'<{self.tag_name}'
+
+ # Add attributes
+ for key, value in self.attributes.items():
+ tag_str += f' {key}="{value}"'
+ tag_str += '>'
+
+ # Add extra info
+ extras = []
+ if self.is_interactive:
+ extras.append('interactive')
+ if self.is_top_element:
+ extras.append('top')
+ if self.shadow_root:
+ extras.append('shadow-root')
+ if self.highlight_index is not None:
+ extras.append(f'highlight:{self.highlight_index}')
+ if self.is_in_viewport:
+ extras.append('in-viewport')
+
+ if extras:
+ tag_str += f' [{", ".join(extras)}]'
+
+ return tag_str
+
+ @cached_property
+ def hash(self) -> HashedDomElement:
+ # Imported locally to avoid a circular import (see module-level note).
+ from browser_use.dom.history_tree_processor.service import (
+ HistoryTreeProcessor,
+ )
+
+ return HistoryTreeProcessor._hash_dom_element(self)
+
+ def get_all_text_till_next_clickable_element(self, max_depth: int = -1) -> str:
+ """Collect descendant text, stopping at nested highlighted elements; max_depth=-1 means unlimited."""
+ text_parts = []
+
+ def collect_text(node: DOMBaseNode, current_depth: int) -> None:
+ if max_depth != -1 and current_depth > max_depth:
+ return
+
+ # Skip this branch if we hit a highlighted element (except for the current node)
+ if isinstance(node, DOMElementNode) and node != self and node.highlight_index is not None:
+ return
+
+ if isinstance(node, DOMTextNode):
+ text_parts.append(node.text)
+ elif isinstance(node, DOMElementNode):
+ for child in node.children:
+ collect_text(child, current_depth + 1)
+
+ collect_text(self, 0)
+ return '\n'.join(text_parts).strip()
+
+ @time_execution_sync('--clickable_elements_to_string')
+ def clickable_elements_to_string(self, include_attributes: list[str] | None = None) -> str:
+ """Convert the processed DOM content to HTML."""
+ formatted_text = []
+
+ def process_node(node: DOMBaseNode, depth: int) -> None:
+ if isinstance(node, DOMElementNode):
+ # Add element with highlight_index
+ if node.highlight_index is not None:
+ attributes_str = ''
+ text = node.get_all_text_till_next_clickable_element()
+ if include_attributes:
+ # De-duplicate attribute values and drop ones that merely repeat
+ # the tag name or the visible text (no extra information).
+ attributes = list(
+ set(
+ [
+ str(value)
+ for key, value in node.attributes.items()
+ if key in include_attributes and value != node.tag_name
+ ]
+ )
+ )
+ if text in attributes:
+ attributes.remove(text)
+ attributes_str = ';'.join(attributes)
+ line = f'[{node.highlight_index}]<{node.tag_name} '
+ if attributes_str:
+ line += f'{attributes_str}'
+ if text:
+ if attributes_str:
+ line += f'>{text}'
+ else:
+ line += f'{text}'
+ line += '/>'
+ formatted_text.append(line)
+
+ # Process children regardless
+ for child in node.children:
+ process_node(child, depth + 1)
+
+ elif isinstance(node, DOMTextNode):
+ # Add text only if it doesn't have a highlighted parent
+ if not node.has_parent_with_highlight_index() and node.is_visible: # and node.is_parent_top_element()
+ formatted_text.append(f'{node.text}')
+
+ process_node(self, 0)
+ return '\n'.join(formatted_text)
+
+ def get_file_upload_element(self, check_siblings: bool = True) -> Optional['DOMElementNode']:
+ """Find the nearest <input type="file"> in self, its descendants, or (initially) its siblings."""
+ # Check if current element is a file input
+ if self.tag_name == 'input' and self.attributes.get('type') == 'file':
+ return self
+
+ # Check children
+ for child in self.children:
+ if isinstance(child, DOMElementNode):
+ result = child.get_file_upload_element(check_siblings=False)
+ if result:
+ return result
+
+ # Check siblings only for the initial call
+ if check_siblings and self.parent:
+ for sibling in self.parent.children:
+ if sibling is not self and isinstance(sibling, DOMElementNode):
+ result = sibling.get_file_upload_element(check_siblings=False)
+ if result:
+ return result
+
+ return None
+
+
+# Maps a highlight index to its interactive element.
+SelectorMap = dict[int, DOMElementNode]
+
+
+@dataclass
+class DOMState:
+ """Snapshot of a processed page: the element tree plus the highlight-index lookup."""
+
+ element_tree: DOMElementNode
+ selector_map: SelectorMap
diff --git a/browser_use/logging_config.py b/browser_use/logging_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..043252bd78d0ab711c206be30b96f1a9ff36537f
--- /dev/null
+++ b/browser_use/logging_config.py
@@ -0,0 +1,132 @@
+import logging
+import os
+import sys
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+def addLoggingLevel(levelName, levelNum, methodName=None):
+ """
+ Comprehensively adds a new logging level to the `logging` module and the
+ currently configured logging class.
+
+ `levelName` becomes an attribute of the `logging` module with the value
+ `levelNum`. `methodName` becomes a convenience method for both `logging`
+ itself and the class returned by `logging.getLoggerClass()` (usually just
+ `logging.Logger`). If `methodName` is not specified, `levelName.lower()` is
+ used.
+
+ To avoid accidental clobberings of existing attributes, this method will
+ raise an `AttributeError` if the level name is already an attribute of the
+ `logging` module or if the method name is already present
+
+ Example
+ -------
+ >>> addLoggingLevel('TRACE', logging.DEBUG - 5)
+ >>> logging.getLogger(__name__).setLevel('TRACE')
+ >>> logging.getLogger(__name__).trace('that worked')
+ >>> logging.trace('so did this')
+ >>> logging.TRACE
+ 5
+
+ """
+ if not methodName:
+ methodName = levelName.lower()
+
+ if hasattr(logging, levelName):
+ raise AttributeError('{} already defined in logging module'.format(levelName))
+ if hasattr(logging, methodName):
+ raise AttributeError('{} already defined in logging module'.format(methodName))
+ if hasattr(logging.getLoggerClass(), methodName):
+ raise AttributeError('{} already defined in logger class'.format(methodName))
+
+ # This method was inspired by the answers to Stack Overflow post
+ # http://stackoverflow.com/q/2183233/2988730, especially
+ # http://stackoverflow.com/a/13638084/2988730
+ def logForLevel(self, message, *args, **kwargs):
+ # Mirror Logger behaviour: only emit when the level is enabled.
+ if self.isEnabledFor(levelNum):
+ self._log(levelNum, message, args, **kwargs)
+
+ def logToRoot(message, *args, **kwargs):
+ logging.log(levelNum, message, *args, **kwargs)
+
+ # Register the level name and install the convenience methods on both the
+ # logging module and the active Logger class.
+ logging.addLevelName(levelNum, levelName)
+ setattr(logging, levelName, levelNum)
+ setattr(logging.getLoggerClass(), methodName, logForLevel)
+ setattr(logging, methodName, logToRoot)
+
+
+def setup_logging():
+ # Try to add RESULT level, but ignore if it already exists
+ try:
+ addLoggingLevel('RESULT', 35) # This allows ERROR, FATAL and CRITICAL
+ except AttributeError:
+ pass # Level already exists, which is fine
+
+ log_type = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower()
+
+ # Check if handlers are already set up
+ if logging.getLogger().hasHandlers():
+ return
+
+ # Clear existing handlers
+ root = logging.getLogger()
+ root.handlers = []
+
+ class BrowserUseFormatter(logging.Formatter):
+ def format(self, record):
+ if type(record.name) == str and record.name.startswith('browser_use.'):
+ record.name = record.name.split('.')[-2]
+ return super().format(record)
+
+ # Setup single handler for all loggers
+ console = logging.StreamHandler(sys.stdout)
+
+ # adittional setLevel here to filter logs
+ if log_type == 'result':
+ console.setLevel('RESULT')
+ console.setFormatter(BrowserUseFormatter('%(message)s'))
+ else:
+ console.setFormatter(BrowserUseFormatter('%(levelname)-8s [%(name)s] %(message)s'))
+
+ # Configure root logger only
+ root.addHandler(console)
+
+ # switch cases for log_type
+ if log_type == 'result':
+ root.setLevel('RESULT') # string usage to avoid syntax error
+ elif log_type == 'debug':
+ root.setLevel(logging.DEBUG)
+ else:
+ root.setLevel(logging.INFO)
+
+ # Configure browser_use logger
+ browser_use_logger = logging.getLogger('browser_use')
+ browser_use_logger.propagate = False # Don't propagate to root logger
+ browser_use_logger.addHandler(console)
+ browser_use_logger.setLevel(root.level) # Set same level as root logger
+
+ logger = logging.getLogger('browser_use')
+ logger.info('BrowserUse logging setup complete with level %s', log_type)
+ # Silence third-party loggers
+ for logger in [
+ 'WDM',
+ 'httpx',
+ 'selenium',
+ 'playwright',
+ 'urllib3',
+ 'asyncio',
+ 'langchain',
+ 'openai',
+ 'httpcore',
+ 'charset_normalizer',
+ 'anthropic._base_client',
+ 'PIL.PngImagePlugin',
+ 'trafilatura.htmlprocessing',
+ 'trafilatura',
+ ]:
+ third_party = logging.getLogger(logger)
+ third_party.setLevel(logging.ERROR)
+ third_party.propagate = False
diff --git a/browser_use/telemetry/service.py b/browser_use/telemetry/service.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a2e82e45801516de284816f38de79bfd67a0f84
--- /dev/null
+++ b/browser_use/telemetry/service.py
@@ -0,0 +1,105 @@
+import logging
+import os
+import uuid
+from pathlib import Path
+
+from dotenv import load_dotenv
+from posthog import Posthog
+
+from browser_use.telemetry.views import BaseTelemetryEvent
+from browser_use.utils import singleton
+
+load_dotenv()
+
+
+logger = logging.getLogger(__name__)
+
+
+POSTHOG_EVENT_SETTINGS = {
+ 'process_person_profile': True,
+}
+
+
+@singleton
+class ProductTelemetry:
+ """
+ Service for capturing anonymized telemetry data.
+
+ If the environment variable `ANONYMIZED_TELEMETRY=False`, anonymized telemetry will be disabled.
+ """
+
+ USER_ID_PATH = str(Path.home() / '.cache' / 'browser_use' / 'telemetry_user_id')
+ PROJECT_API_KEY = 'phc_F8JMNjW1i2KbGUTaW1unnDdLSPCoyc52SGRU0JecaUh'
+ HOST = 'https://eu.i.posthog.com'
+ UNKNOWN_USER_ID = 'UNKNOWN'
+
+ _curr_user_id = None
+
+ def __init__(self) -> None:
+ telemetry_disabled = os.getenv('ANONYMIZED_TELEMETRY', 'true').lower() == 'false'
+ self.debug_logging = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower() == 'debug'
+
+ if telemetry_disabled:
+ self._posthog_client = None
+ else:
+ logging.info(
+ 'Anonymized telemetry enabled. See https://docs.browser-use.com/development/telemetry for more information.'
+ )
+ self._posthog_client = Posthog(
+ project_api_key=self.PROJECT_API_KEY,
+ host=self.HOST,
+ disable_geoip=False,
+ )
+
+ # Silence posthog's logging
+ if not self.debug_logging:
+ posthog_logger = logging.getLogger('posthog')
+ posthog_logger.disabled = True
+
+ if self._posthog_client is None:
+ logger.debug('Telemetry disabled')
+
+ def capture(self, event: BaseTelemetryEvent) -> None:
+ if self._posthog_client is None:
+ return
+
+ if self.debug_logging:
+ logger.debug(f'Telemetry event: {event.name} {event.properties}')
+ self._direct_capture(event)
+
+ def _direct_capture(self, event: BaseTelemetryEvent) -> None:
+ """
+ Should not be thread blocking because posthog magically handles it
+ """
+ if self._posthog_client is None:
+ return
+
+ try:
+ self._posthog_client.capture(
+ self.user_id,
+ event.name,
+ {**event.properties, **POSTHOG_EVENT_SETTINGS},
+ )
+ except Exception as e:
+ logger.error(f'Failed to send telemetry event {event.name}: {e}')
+
+ @property
+ def user_id(self) -> str:
+ if self._curr_user_id:
+ return self._curr_user_id
+
+ # File access may fail due to permissions or other reasons. We don't want to
+ # crash so we catch all exceptions.
+ try:
+ if not os.path.exists(self.USER_ID_PATH):
+ os.makedirs(os.path.dirname(self.USER_ID_PATH), exist_ok=True)
+ with open(self.USER_ID_PATH, 'w') as f:
+ new_user_id = str(uuid.uuid4())
+ f.write(new_user_id)
+ self._curr_user_id = new_user_id
+ else:
+ with open(self.USER_ID_PATH, 'r') as f:
+ self._curr_user_id = f.read()
+ except Exception:
+ self._curr_user_id = 'UNKNOWN_USER_ID'
+ return self._curr_user_id
diff --git a/browser_use/telemetry/views.py b/browser_use/telemetry/views.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdba27303109e6ec12f763b537b2bdee9dca9d95
--- /dev/null
+++ b/browser_use/telemetry/views.py
@@ -0,0 +1,63 @@
+from abc import ABC, abstractmethod
+from dataclasses import asdict, dataclass
+from typing import Any, Dict, Sequence
+
+
+@dataclass
+class BaseTelemetryEvent(ABC):
+ """Abstract base for telemetry events: subclasses set `name`; dataclass fields become event properties."""
+
+ @property
+ @abstractmethod
+ def name(self) -> str:
+ pass
+
+ @property
+ def properties(self) -> Dict[str, Any]:
+ # Every dataclass field except 'name' is reported as an event property.
+ return {k: v for k, v in asdict(self).items() if k != 'name'}
+
+
+@dataclass
+class RegisteredFunction:
+ """Descriptor for one action registered on the controller."""
+
+ name: str
+ params: dict[str, Any]
+
+
+@dataclass
+class ControllerRegisteredFunctionsTelemetryEvent(BaseTelemetryEvent):
+ """Emitted with the set of actions registered on the controller."""
+
+ registered_functions: list[RegisteredFunction]
+ name: str = 'controller_registered_functions'
+
+
+@dataclass
+class AgentStepTelemetryEvent(BaseTelemetryEvent):
+ """Emitted per agent step with its errors and chosen actions."""
+
+ agent_id: str
+ step: int
+ step_error: list[str]
+ consecutive_failures: int
+ actions: list[dict]
+ name: str = 'agent_step'
+
+
+@dataclass
+class AgentRunTelemetryEvent(BaseTelemetryEvent):
+ """Emitted when an agent run starts."""
+
+ agent_id: str
+ use_vision: bool
+ task: str
+ model_name: str
+ chat_model_library: str
+ version: str
+ source: str
+ name: str = 'agent_run'
+
+
+@dataclass
+class AgentEndTelemetryEvent(BaseTelemetryEvent):
+ """Emitted when an agent run finishes (done, failed, or max steps reached)."""
+
+ agent_id: str
+ steps: int
+ max_steps_reached: bool
+ is_done: bool
+ success: bool | None
+ total_input_tokens: int
+ total_duration_seconds: float
+
+ errors: Sequence[str | None]
+ name: str = 'agent_end'
diff --git a/browser_use/utils.py b/browser_use/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..860b35a320d83f924483dbba53ed80bfcea27ff2
--- /dev/null
+++ b/browser_use/utils.py
@@ -0,0 +1,54 @@
+import logging
+import time
+from functools import wraps
+from typing import Any, Callable, Coroutine, ParamSpec, TypeVar
+
+logger = logging.getLogger(__name__)
+
+
+# Define generic type variables for return type and parameters
+R = TypeVar('R')
+P = ParamSpec('P')
+
+
+def time_execution_sync(additional_text: str = '') -> Callable[[Callable[P, R]], Callable[P, R]]:
+ def decorator(func: Callable[P, R]) -> Callable[P, R]:
+ @wraps(func)
+ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
+ start_time = time.time()
+ result = func(*args, **kwargs)
+ execution_time = time.time() - start_time
+ logger.debug(f'{additional_text} Execution time: {execution_time:.2f} seconds')
+ return result
+
+ return wrapper
+
+ return decorator
+
+
+def time_execution_async(
+ additional_text: str = '',
+) -> Callable[[Callable[P, Coroutine[Any, Any, R]]], Callable[P, Coroutine[Any, Any, R]]]:
+ def decorator(func: Callable[P, Coroutine[Any, Any, R]]) -> Callable[P, Coroutine[Any, Any, R]]:
+ @wraps(func)
+ async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
+ start_time = time.time()
+ result = await func(*args, **kwargs)
+ execution_time = time.time() - start_time
+ logger.debug(f'{additional_text} Execution time: {execution_time:.2f} seconds')
+ return result
+
+ return wrapper
+
+ return decorator
+
+
+def singleton(cls):
+ """Class decorator: construct the class at most once and return the cached instance.
+
+ The constructor arguments of the *first* call win; later arguments are
+ silently ignored. Instantiation is not guarded by a lock, so concurrent
+ first calls from multiple threads could construct twice.
+ """
+ instance = [None]
+
+ def wrapper(*args, **kwargs):
+ if instance[0] is None:
+ instance[0] = cls(*args, **kwargs)
+ return instance[0]
+
+ return wrapper
diff --git a/codebeaver.yml b/codebeaver.yml
new file mode 100644
index 0000000000000000000000000000000000000000..c6a0739f0ff1fa9362996665ef33c5ec2ed9edaa
--- /dev/null
+++ b/codebeaver.yml
@@ -0,0 +1,4 @@
+environment:
+- OPENAI_API_KEY=empty
+- AZURE_OPENAI_API_KEY=empty
+from: pytest
diff --git a/conftest.py b/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..297403f266c035835c5c85c31dd66fabe3dd8b0b
--- /dev/null
+++ b/conftest.py
@@ -0,0 +1,10 @@
+import os
+import sys
+
+from browser_use.logging_config import setup_logging
+
+# Get the absolute path to the project root
+project_root = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, project_root)
+
+setup_logging()
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..10f09abffd8d6d4cce4455f33152a4f616312dcb
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,17 @@
+# Docs
+
+The official documentation for Browser Use. The docs are published to [Browser Use Docs](https://docs.browser-use.com).
+
+### Development
+
+Install the [Mintlify CLI](https://www.npmjs.com/package/mintlify) to preview the documentation changes locally. To install, use the following command
+
+```
+npm i -g mintlify
+```
+
+Run the following command at the root of your documentation (where mint.json is)
+
+```
+mintlify dev
+```
diff --git a/docs/cloud/implementation.mdx b/docs/cloud/implementation.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..8c894017709f692c9bf319e561318dfd5d0dfdf4
--- /dev/null
+++ b/docs/cloud/implementation.mdx
@@ -0,0 +1,107 @@
+---
+title: "Implementing the API"
+description: "Learn how to implement the Browser Use API in Python"
+icon: "code"
+---
+
+This guide shows how to implement common API patterns using Python. We'll create a complete example that creates and monitors a browser automation task.
+
+## Basic Implementation
+
+For all settings see [Run Task](/cloud/api-v1/run-task).
+
+Here's a simple implementation using Python's `requests` library to stream the task steps:
+
+```python
+import json
+import time
+
+import requests
+
+API_KEY = 'your_api_key_here'
+BASE_URL = 'https://api.browser-use.com/api/v1'
+HEADERS = {'Authorization': f'Bearer {API_KEY}'}
+
+
+def create_task(instructions: str):
+ """Create a new browser automation task"""
+ response = requests.post(f'{BASE_URL}/run-task', headers=HEADERS, json={'task': instructions})
+ return response.json()['id']
+
+
+def get_task_status(task_id: str):
+ """Get current task status"""
+ response = requests.get(f'{BASE_URL}/task/{task_id}/status', headers=HEADERS)
+ return response.json()
+
+
+def get_task_details(task_id: str):
+ """Get full task details including output"""
+ response = requests.get(f'{BASE_URL}/task/{task_id}', headers=HEADERS)
+ return response.json()
+
+
+def wait_for_completion(task_id: str, poll_interval: int = 2):
+ """Poll task status until completion"""
+ count = 0
+ unique_steps = []
+ while True:
+ details = get_task_details(task_id)
+ new_steps = details['steps']
+ # use only the new steps that are not in unique_steps.
+ if new_steps != unique_steps:
+ for step in new_steps:
+ if step not in unique_steps:
+ print(json.dumps(step, indent=4))
+ unique_steps = new_steps
+ count += 1
+ status = details['status']
+
+ if status in ['finished', 'failed', 'stopped']:
+ return details
+ time.sleep(poll_interval)
+
+
+def main():
+ task_id = create_task('Open https://www.google.com and search for openai')
+ print(f'Task created with ID: {task_id}')
+ task_details = wait_for_completion(task_id)
+ print(f"Final output: {task_details['output']}")
+
+
+if __name__ == '__main__':
+ main()
+
+```
+
+## Task Control Example
+
+Here's how to implement task control with pause/resume functionality:
+
+```python
+def control_task():
+ # Create a new task
+ task_id = create_task("Go to google.com and search for Browser Use")
+
+ # Wait for 5 seconds
+ time.sleep(5)
+
+ # Pause the task
+ requests.put(f"{BASE_URL}/pause-task?task_id={task_id}", headers=HEADERS)
+ print("Task paused! Check the live preview.")
+
+ # Wait for user input
+ input("Press Enter to resume...")
+
+ # Resume the task
+ requests.put(f"{BASE_URL}/resume-task?task_id={task_id}", headers=HEADERS)
+
+ # Wait for completion
+ result = wait_for_completion(task_id)
+ print(f"Task completed with output: {result['output']}")
+```
+
+
+ Remember to handle your API key securely and implement proper error handling
+ in production code.
+
diff --git a/docs/cloud/quickstart.mdx b/docs/cloud/quickstart.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..68ef795f9f7e0795afb5bedfabba5d09820b533c
--- /dev/null
+++ b/docs/cloud/quickstart.mdx
@@ -0,0 +1,103 @@
+---
+title: "Quickstart"
+description: "Learn how to get started with the Browser Use Cloud API"
+icon: "cloud"
+---
+
+The Browser Use Cloud API lets you create and manage browser automation agents programmatically. Each agent can execute tasks and provide real-time feedback through a live preview URL.
+
+## Prerequisites
+
+
+ You need an active subscription and an API key from
+ [cloud.browser-use.com/billing](https://cloud.browser-use.com/billing)
+
+
+## Pricing
+
+The Browser Use Cloud API is priced at $0.05 per step that the agent executes.
+
+
+ Since Browser Use can execute multiple steps at the same time,
+ the price for filling out forms is much lower than other services.
+
+
+## Creating Your First Agent
+
+Create a new browser automation task by providing instructions in natural language:
+
+```bash
+curl -X POST https://api.browser-use.com/api/v1/run-task \
+ -H "Authorization: Bearer your_api_key_here" \
+ -H "Content-Type: application/json" \
+ -d '{
+ "task": "Go to google.com and search for Browser Use"
+ }'
+```
+
+The API returns a task ID that you can use to manage the task and check the live preview URL.
+
+
+ The task response includes a `live_url` that you can embed in an iframe to
+ watch and control the agent in real-time.
+
+
+## Managing Tasks
+
+Control running tasks with these operations:
+
+
+
+ Temporarily pause task execution with [`/api/v1/pause-task`](/cloud/api-v1/pause-task) and resume with
+ [`/api/v1/resume-task`](/cloud/api-v1/resume-task). Useful for manual inspection or intervention.
+
+
+
+ Permanently stop a task using [`/api/v1/stop-task`](/cloud/api-v1/stop-task). The task cannot be
+ resumed after being stopped.
+
+
+
+For detailed API documentation, see the tabs on the left, which include the full coverage of the API.
+
+## Building your own client (OpenAPI)
+
+
+ We recommend this only if you don't need control and only need to run simple
+ tasks.
+
+
+The best way to build your own client is to use our [OpenAPI specification](http://api.browser-use.com/openapi.json) to generate a type-safe client library.
+
+### Python
+
+Use [openapi-python-client](https://github.com/openapi-generators/openapi-python-client) to generate a modern Python client:
+
+```bash
+# Install the generator
+pipx install openapi-python-client --include-deps
+
+# Generate the client
+openapi-python-client generate --url http://api.browser-use.com/openapi.json
+```
+
+This will create a Python package with full type hints, modern dataclasses, and async support.
+
+### TypeScript/JavaScript
+
+For TypeScript projects, use [openapi-typescript](https://www.npmjs.com/package/openapi-typescript) to generate type definitions:
+
+```bash
+# Install the generator
+npm install -D openapi-typescript
+
+# Generate the types
+npx openapi-typescript http://api.browser-use.com/openapi.json -o browser-use-api.ts
+```
+
+This will create TypeScript definitions you can use with your preferred HTTP client.
+
+
+ Need help? Contact our support team at support@browser-use.com or join our
+ [Discord community](https://link.browser-use.com/discord)
+
diff --git a/docs/customize/agent-settings.mdx b/docs/customize/agent-settings.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..58371705b09fa69955709bf65528bbba3666ab75
--- /dev/null
+++ b/docs/customize/agent-settings.mdx
@@ -0,0 +1,215 @@
+---
+title: "Agent Settings"
+description: "Learn how to configure the agent"
+icon: "gear"
+---
+
+## Overview
+
+The `Agent` class is the core component of Browser Use that handles browser automation. Here are the main configuration options you can use when initializing an agent.
+
+## Basic Settings
+
+```python
+from browser_use import Agent
+from langchain_openai import ChatOpenAI
+
+agent = Agent(
+ task="Search for latest news about AI",
+ llm=ChatOpenAI(model="gpt-4o"),
+)
+```
+
+### Required Parameters
+
+- `task`: The instruction for the agent to execute
+- `llm`: A LangChain chat model instance. See LangChain Models for supported models.
+
+## Agent Behavior
+
+Control how the agent operates:
+
+```python
+agent = Agent(
+ task="your task",
+ llm=llm,
+ controller=custom_controller, # For custom tool calling
+ use_vision=True, # Enable vision capabilities
+ save_conversation_path="logs/conversation" # Save chat logs
+)
+```
+
+### Behavior Parameters
+
+- `controller`: Registry of functions the agent can call. Defaults to base Controller. See Custom Functions for details.
+- `use_vision`: Enable/disable vision capabilities. Defaults to `True`.
+ - When enabled, the model processes visual information from web pages
+ - Disable to reduce costs or use models without vision support
+ - For GPT-4o, image processing costs approximately 800-1000 tokens (~$0.002 USD) per image (but this depends on the defined screen size)
+- `save_conversation_path`: Path to save the complete conversation history. Useful for debugging.
+- `system_prompt_class`: Custom system prompt class. See System Prompt for customization options.
+
+
+ Vision capabilities are recommended for better web interaction understanding,
+ but can be disabled to reduce costs or when using models without vision
+ support.
+
+
+## (Reuse) Browser Configuration
+
+You can configure how the agent interacts with the browser. To see more `Browser` options refer to the Browser Settings documentation.
+
+### Reuse Existing Browser
+
+`browser`: A Browser Use Browser instance. When provided, the agent will reuse this browser instance and automatically create new contexts for each `run()`.
+
+```python
+from browser_use import Agent, Browser
+from browser_use.browser.context import BrowserContext
+
+# Reuse existing browser
+browser = Browser()
+agent = Agent(
+ task=task1,
+ llm=llm,
+ browser=browser # Browser instance will be reused
+)
+
+await agent.run()
+
+# Manually close the browser
+await browser.close()
+```
+
+
+ Remember: in this scenario the `Browser` will not be closed automatically.
+
+
+### Reuse Existing Browser Context
+
+`browser_context`: A Playwright browser context. Useful for maintaining persistent sessions. See Persistent Browser for more details.
+
+```python
+from browser_use import Agent, Browser
+from playwright.async_api import BrowserContext
+
+# Use specific browser context (preferred method)
+async with await browser.new_context() as context:
+ agent = Agent(
+ task=task2,
+ llm=llm,
+ browser_context=context # Use persistent context
+ )
+
+ # Run the agent
+ await agent.run()
+
+ # Pass the context to the next agent
+ next_agent = Agent(
+ task=task2,
+ llm=llm,
+ browser_context=context
+ )
+
+ ...
+
+await browser.close()
+```
+
+For more information about how browser context works, refer to the [Playwright
+documentation](https://playwright.dev/docs/api/class-browsercontext).
+
+
+ You can reuse the same context for multiple agents. If you do nothing, the
+ browser will be automatically created and closed on `run()` completion.
+
+
+## Running the Agent
+
+The agent is executed using the async `run()` method:
+
+- `max_steps` (default: `100`)
+ Maximum number of steps the agent can take during execution. This prevents infinite loops and helps control execution time.
+
+## Agent History
+
+The method returns an `AgentHistoryList` object containing the complete execution history. This history is invaluable for debugging, analysis, and creating reproducible scripts.
+
+```python
+# Example of accessing history
+history = await agent.run()
+
+# Access (some) useful information
+history.urls() # List of visited URLs
+history.screenshots() # List of screenshot paths
+history.action_names() # Names of executed actions
+history.extracted_content() # Content extracted during execution
+history.errors() # Any errors that occurred
+history.model_actions() # All actions with their parameters
+```
+
+The `AgentHistoryList` provides many helper methods to analyze the execution:
+
+- `final_result()`: Get the final extracted content
+- `is_done()`: Check if the agent completed successfully
+- `has_errors()`: Check if any errors occurred
+- `model_thoughts()`: Get the agent's reasoning process
+- `action_results()`: Get results of all actions
+
+
+ For a complete list of helper methods and detailed history analysis
+ capabilities, refer to the [AgentHistoryList source
+ code](https://github.com/browser-use/browser-use/blob/main/browser_use/agent/views.py#L111).
+
+
+## Run initial actions without LLM
+With [this example](https://github.com/browser-use/browser-use/blob/main/examples/features/initial_actions.py) you can run initial actions without the LLM.
+Specify the action as a dictionary where the key is the action name and the value is the action parameters. You can find all our actions in the [Controller](https://github.com/browser-use/browser-use/blob/main/browser_use/controller/service.py) source code.
+```python
+
+initial_actions = [
+ {'open_tab': {'url': 'https://www.google.com'}},
+ {'open_tab': {'url': 'https://en.wikipedia.org/wiki/Randomness'}},
+ {'scroll_down': {'amount': 1000}},
+]
+agent = Agent(
+ task='What theories are displayed on the page?',
+ initial_actions=initial_actions,
+ llm=llm,
+)
+```
+
+## Run with planner model
+
+You can configure the agent to use a separate planner model for high-level task planning:
+
+```python
+from langchain_openai import ChatOpenAI
+
+# Initialize models
+llm = ChatOpenAI(model='gpt-4o')
+planner_llm = ChatOpenAI(model='o3-mini')
+
+agent = Agent(
+ task="your task",
+ llm=llm,
+ planner_llm=planner_llm, # Separate model for planning
+ use_vision_for_planner=False, # Disable vision for planner
+ planner_interval=4 # Plan every 4 steps
+)
+```
+
+### Planner Parameters
+
+- `planner_llm`: A LangChain chat model instance used for high-level task planning. Can be a smaller/cheaper model than the main LLM.
+- `use_vision_for_planner`: Enable/disable vision capabilities for the planner model. Defaults to `True`.
+- `planner_interval`: Number of steps between planning phases. Defaults to `1`.
+
+Using a separate planner model can help:
+- Reduce costs by using a smaller model for high-level planning
+- Improve task decomposition and strategic thinking
+- Better handle complex, multi-step tasks
+
+
+ The planner model is optional. If not specified, the agent will not use the planner model.
+
diff --git a/docs/customize/browser-settings.mdx b/docs/customize/browser-settings.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..41995f9e1eb0c9b3de9abd8d99e65d7e18da53a6
--- /dev/null
+++ b/docs/customize/browser-settings.mdx
@@ -0,0 +1,179 @@
+---
+title: "Browser Settings"
+description: "Configure browser behavior and context settings"
+icon: "globe"
+---
+
+Browser Use allows you to customize the browser's behavior through two main configuration classes: `BrowserConfig` and `BrowserContextConfig`. These settings control everything from headless mode to proxy settings and page load behavior.
+
+
+ We are currently working on improving how browser contexts are managed. The
+ system will soon transition to a "1 agent, 1 browser, 1 context" model for
+ better stability and developer experience.
+
+
+# Browser Configuration
+
+The `BrowserConfig` class controls the core browser behavior and connection settings.
+
+```python
+from browser_use import BrowserConfig
+
+# Basic configuration
+config = BrowserConfig(
+ headless=False,
+ disable_security=True
+)
+
+browser = Browser(config=config)
+
+agent = Agent(
+ browser=browser,
+ # ...
+)
+```
+
+## Core Settings
+
+- **headless** (default: `False`)
+ Runs the browser without a visible UI. Note that some websites may detect headless mode.
+
+- **disable_security** (default: `True`)
+ Disables browser security features. While this can fix certain functionality issues (like cross-site iFrames), it should be used cautiously, especially when visiting untrusted websites.
+
+### Additional Settings
+
+- **extra_chromium_args** (default: `[]`)
+ Additional arguments are passed to the browser at launch. See the [full list of available arguments](https://github.com/browser-use/browser-use/blob/main/browser_use/browser/browser.py#L180).
+
+- **proxy** (default: `None`)
+ Standard Playwright proxy settings for using external proxy services.
+
+- **new_context_config** (default: `BrowserContextConfig()`)
+ Default settings for new browser contexts. See Context Configuration below.
+
+
+ For web scraping tasks on sites that restrict automated access, we recommend
+ using external browser or proxy providers for better reliability.
+
+
+## Alternative Initialization
+
+These settings allow you to connect to external browser providers or use a local Chrome instance.
+
+### External Browser Provider (wss)
+
+Connect to cloud-based browser services for enhanced reliability and proxy capabilities.
+
+```python
+config = BrowserConfig(
+ wss_url="wss://your-browser-provider.com/ws"
+)
+```
+
+- **wss_url** (default: `None`)
+ WebSocket URL for connecting to external browser providers (e.g., anchorbrowser.com, steel.dev, browserbase.com, browserless.io).
+
+
+ This overrides local browser settings and uses the provider's configuration.
+ Refer to their documentation for settings.
+
+
+### External Browser Provider (cdp)
+
+Connect to cloud or local Chrome instances using Chrome DevTools Protocol (CDP) for use with tools like `headless-shell` or `browserless`.
+
+```python
+config = BrowserConfig(
+ cdp_url="http://localhost:9222"
+)
+```
+
+- **cdp_url** (default: `None`)
+ URL for connecting to a Chrome instance via CDP. Commonly used for debugging or connecting to locally running Chrome instances.
+
+### Local Chrome Instance (binary)
+
+Connect to your existing Chrome installation to access saved states and cookies.
+
+```python
+config = BrowserConfig(
+ chrome_instance_path="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
+)
+```
+
+- **chrome_instance_path** (default: `None`)
+ Path to connect to an existing Chrome installation. Particularly useful for workflows requiring existing login states or browser preferences.
+
+This will overwrite other browser settings.
+
+# Context Configuration
+
+The `BrowserContextConfig` class controls settings for individual browser contexts.
+
+```python
+from browser_use.browser.context import BrowserContextConfig
+
+config = BrowserContextConfig(
+ cookies_file="path/to/cookies.json",
+ wait_for_network_idle_page_load_time=3.0,
+ browser_window_size={'width': 1280, 'height': 1100},
+ locale='en-US',
+ user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
+ highlight_elements=True,
+ viewport_expansion=500,
+ allowed_domains=['google.com', 'wikipedia.org'],
+)
+
+browser = Browser()
+context = BrowserContext(browser=browser, config=config)
+
+
+async def run_search():
+ agent = Agent(
+ browser_context=context,
+ task='Your task',
+ llm=llm)
+```
+
+## Configuration Options
+
+### Page Load Settings
+
+- **minimum_wait_page_load_time** (default: `0.5`)
+ Minimum time to wait before capturing page state for LLM input.
+
+- **wait_for_network_idle_page_load_time** (default: `1.0`)
+ Time to wait for network activity to cease. Increase to 3-5s for slower websites. This tracks essential content loading, not dynamic elements like videos.
+
+- **maximum_wait_page_load_time** (default: `5.0`)
+ Maximum time to wait for page load before proceeding.
+
+### Display Settings
+
+- **browser_window_size** (default: `{'width': 1280, 'height': 1100}`)
+ Browser window dimensions. The default size is optimized for general use cases and interaction with common UI elements like cookie banners.
+
+- **locale** (default: `None`)
+ Specify user locale, for example en-GB, de-DE, etc. Locale will affect the `navigator.language` value, the Accept-Language request header value, as well as number and date formatting rules. If not provided, defaults to the system default locale.
+
+- **highlight_elements** (default: `True`)
+ Highlight interactive elements on the screen with colorful bounding boxes.
+
+- **viewport_expansion** (default: `500`)
+ Viewport expansion in pixels. With this you can control how much of the page is included in the context of the LLM. If set to -1, all elements from the entire page will be included (this leads to high token usage). If set to 0, only the elements which are visible in the viewport will be included.
+ Default is 500 pixels, which means that we include a little bit more than the visible viewport inside the context.
+
+### Restrict URLs
+
+- **allowed_domains** (default: `None`)
+ List of allowed domains that the agent can access. If None, all domains are allowed.
+ Example: ['google.com', 'wikipedia.org'] - Here the agent will only be able to access google and wikipedia.
+
+### Debug and Recording
+
+- **save_recording_path** (default: `None`)
+ Directory path for saving video recordings.
+
+- **trace_path** (default: `None`)
+ Directory path for saving trace files. Files are automatically named as `{trace_path}/{context_id}.zip`.
diff --git a/docs/customize/custom-functions.mdx b/docs/customize/custom-functions.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..5e3ceb95c263d8714ad095bc278c7faae691e3a0
--- /dev/null
+++ b/docs/customize/custom-functions.mdx
@@ -0,0 +1,128 @@
+---
+title: "Custom Functions"
+description: "Extend default agent and write custom function calls"
+icon: "function"
+---
+
+## Basic Function Registration
+
+Functions can be either `sync` or `async`. Keep them focused and single-purpose.
+
+```python
+from browser_use import Controller, ActionResult
+# Initialize the controller
+controller = Controller()
+
+@controller.action('Ask user for information')
+def ask_human(question: str) -> str:
+ answer = input(f'\n{question}\nInput: ')
+ return ActionResult(extracted_content=answer)
+```
+
+
+ Basic `Controller` has all basic functionality you might need to interact with
+ the browser already implemented.
+
+
+```python
+# ... then pass controller to the agent
+agent = Agent(
+ task=task,
+ llm=llm,
+ controller=controller
+)
+```
+
+
+ Keep the function name and description short and concise. The Agent uses the
+ function solely based on the name and description. The stringified output of
+ the action is passed to the Agent.
+
+
+## Browser-Aware Functions
+
+For actions that need browser access, simply add the `browser` parameter inside the function parameters:
+
+```python
+from browser_use import Browser, Controller, ActionResult
+
+controller = Controller()
+@controller.action('Open website')
+async def open_website(url: str, browser: Browser):
+ page = browser.get_current_page()
+ await page.goto(url)
+ return ActionResult(extracted_content='Website opened')
+```
+
+## Structured Parameters with Pydantic
+
+For complex actions, you can define parameter schemas using Pydantic models:
+
+```python
+from pydantic import BaseModel
+from typing import Optional
+from browser_use import Controller, ActionResult, Browser
+
+controller = Controller()
+
+class JobDetails(BaseModel):
+ title: str
+ company: str
+ job_link: str
+ salary: Optional[str] = None
+
+@controller.action(
+ 'Save job details which you found on page',
+ param_model=JobDetails
+)
+async def save_job(params: JobDetails, browser: Browser):
+ print(f"Saving job: {params.title} at {params.company}")
+
+ # Access browser if needed
+ page = browser.get_current_page()
+ await page.goto(params.job_link)
+```
+
+## Using Custom Actions with multiple agents
+
+You can use the same controller for multiple agents.
+
+```python
+controller = Controller()
+
+# ... register actions to the controller
+
+agent = Agent(
+ task="Go to website X and find the latest news",
+ llm=llm,
+ controller=controller
+)
+
+# Run the agent
+await agent.run()
+
+agent2 = Agent(
+ task="Go to website Y and find the latest news",
+ llm=llm,
+ controller=controller
+)
+
+await agent2.run()
+```
+
+
+ The controller is stateless and can be used to register multiple actions and
+ multiple agents.
+
+
+
+
+## Exclude functions
+If you want fewer actions to be used by the agent, you can exclude them from the controller.
+```python
+controller = Controller(exclude_actions=['open_tab', 'search_google'])
+```
+
+
+For more examples like file upload or notifications, visit [examples/custom-functions](https://github.com/browser-use/browser-use/tree/main/examples/custom-functions).
+
diff --git a/docs/customize/output-format.mdx b/docs/customize/output-format.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..d893a7513b9ae0bf1f1957e84d169512df22b350
--- /dev/null
+++ b/docs/customize/output-format.mdx
@@ -0,0 +1,50 @@
+---
+title: "Output Format"
+description: "The default is text. But you can define a structured output format to make post-processing easier."
+icon: "code"
+---
+
+## Custom output format
+With [this example](https://github.com/browser-use/browser-use/blob/main/examples/features/custom_output.py) you can define what output format the agent should return to you.
+
+```python
+from pydantic import BaseModel
+# Define the output format as a Pydantic model
+class Post(BaseModel):
+ post_title: str
+ post_url: str
+ num_comments: int
+ hours_since_post: int
+
+
+class Posts(BaseModel):
+ posts: List[Post]
+
+
+controller = Controller(output_model=Posts)
+
+
+async def main():
+ task = 'Go to hackernews show hn and give me the first 5 posts'
+ model = ChatOpenAI(model='gpt-4o')
+ agent = Agent(task=task, llm=model, controller=controller)
+
+ history = await agent.run()
+
+ result = history.final_result()
+ if result:
+ parsed: Posts = Posts.model_validate_json(result)
+
+ for post in parsed.posts:
+ print('\n--------------------------------')
+ print(f'Title: {post.post_title}')
+ print(f'URL: {post.post_url}')
+ print(f'Comments: {post.num_comments}')
+ print(f'Hours since post: {post.hours_since_post}')
+ else:
+ print('No result')
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
+```
\ No newline at end of file
diff --git a/docs/customize/real-browser.mdx b/docs/customize/real-browser.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..aafb92f9129d80908c33161403eae822d9a88d0b
--- /dev/null
+++ b/docs/customize/real-browser.mdx
@@ -0,0 +1,53 @@
+---
+title: "Connect to your Browser"
+description: "With this you can connect to your real browser, where you are logged in with all your accounts."
+icon: "computer"
+---
+
+## Overview
+
+You can connect the agent to your real Chrome browser instance, allowing it to access your existing browser profile with all your logged-in accounts and settings. This is particularly useful when you want the agent to interact with services where you're already authenticated.
+
+
+ First make sure to close all running Chrome instances.
+
+
+## Basic Configuration
+
+To connect to your real Chrome browser, you'll need to specify the path to your Chrome executable when creating the Browser instance:
+
+```python
+from browser_use import Agent, Browser, BrowserConfig
+from langchain_openai import ChatOpenAI
+import asyncio
+# Configure the browser to connect to your Chrome instance
+browser = Browser(
+ config=BrowserConfig(
+ # Specify the path to your Chrome executable
+ chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', # macOS path
+ # For Windows, typically: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
+ # For Linux, typically: '/usr/bin/google-chrome'
+ )
+)
+
+# Create the agent with your configured browser
+agent = Agent(
+ task="Your task here",
+ llm=ChatOpenAI(model='gpt-4o'),
+ browser=browser,
+)
+
+async def main():
+ await agent.run()
+
+ input('Press Enter to close the browser...')
+ await browser.close()
+
+if __name__ == '__main__':
+ asyncio.run(main())
+```
+
+
+
+ When using your real browser, the agent will have access to all your logged-in sessions. Make sure to ALWAYS review the task you're giving to the agent and ensure it aligns with your security requirements!
+
diff --git a/docs/customize/sensitive-data.mdx b/docs/customize/sensitive-data.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..4130eff4d110498dcf733bdc651d20aac04f21e7
--- /dev/null
+++ b/docs/customize/sensitive-data.mdx
@@ -0,0 +1,50 @@
+---
+title: "Sensitive Data"
+description: "Handle sensitive information securely by preventing the model from seeing actual passwords."
+icon: "shield"
+---
+
+## Handling Sensitive Data
+
+When working with sensitive information like passwords, you can use the `sensitive_data` parameter to prevent the model from seeing the actual values while still allowing it to reference them in its actions.
+
+Here's an example of how to use sensitive data:
+
+```python
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+from browser_use import Agent
+
+load_dotenv()
+
+# Initialize the model
+llm = ChatOpenAI(
+ model='gpt-4o',
+ temperature=0.0,
+)
+
+# Define sensitive data
+# The model will only see the keys (x_name, x_password) but never the actual values
+sensitive_data = {'x_name': 'magnus', 'x_password': '12345678'}
+
+# Use the placeholder names in your task description
+task = 'go to x.com and login with x_name and x_password then write a post about the meaning of life'
+
+# Pass the sensitive data to the agent
+agent = Agent(task=task, llm=llm, sensitive_data=sensitive_data)
+
+async def main():
+ await agent.run()
+
+if __name__ == '__main__':
+ asyncio.run(main())
+```
+
+In this example:
+1. The model only sees `x_name` and `x_password` as placeholders.
+2. When the model wants to use your password it outputs x_password - and we replace it with the actual value.
+3. When your password is visible on the current page, we replace it in the LLM input - so that the model never has it in its state.
+
+Warning: Vision models still see the image of the page - where the sensitive data might be visible.
+
+This approach ensures that sensitive information remains secure while still allowing the agent to perform tasks that require authentication.
\ No newline at end of file
diff --git a/docs/customize/supported-models.mdx b/docs/customize/supported-models.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..1798cfbfbc3dfd3828cf128c02d48c7cf98c448c
--- /dev/null
+++ b/docs/customize/supported-models.mdx
@@ -0,0 +1,223 @@
+---
+title: "Supported Models"
+description: "Guide to using different LangChain chat models with Browser Use"
+icon: "robot"
+---
+
+## Overview
+
+Browser Use supports various LangChain chat models. Here's how to configure and use the most popular ones. The full list is available in the [LangChain documentation](https://python.langchain.com/docs/integrations/chat/).
+
+## Model Recommendations
+
+We have yet to test performance across all models. Currently, we achieve the best results using GPT-4o with an 89% accuracy on the [WebVoyager Dataset](https://browser-use.com/posts/sota-technical-report). DeepSeek-V3 is 30 times cheaper than GPT-4o. Gemini-2.0-exp is also gaining popularity in the community because it is currently free.
+We also support local models, like Qwen 2.5, but be aware that small models often return the wrong output structure, which leads to parsing errors. We believe that local models will improve significantly this year.
+
+
+
+ All models require their respective API keys. Make sure to set them in your
+ environment variables before running the agent.
+
+
+## Supported Models
+
+All LangChain chat models that support tool-calling are available. We will document the most popular ones here.
+
+### OpenAI
+
+OpenAI's GPT-4o models are recommended for best performance.
+
+```python
+from langchain_openai import ChatOpenAI
+from browser_use import Agent
+
+# Initialize the model
+llm = ChatOpenAI(
+ model="gpt-4o",
+ temperature=0.0,
+)
+
+# Create agent with the model
+agent = Agent(
+ task="Your task here",
+ llm=llm
+)
+```
+
+Required environment variables:
+
+```bash .env
+OPENAI_API_KEY=
+```
+
+### Anthropic
+
+
+```python
+from langchain_anthropic import ChatAnthropic
+from browser_use import Agent
+
+# Initialize the model
+llm = ChatAnthropic(
+ model_name="claude-3-5-sonnet-20240620",
+ temperature=0.0,
+ timeout=100, # Increase for complex tasks
+)
+
+# Create agent with the model
+agent = Agent(
+ task="Your task here",
+ llm=llm
+)
+```
+
+And add the variable:
+
+```bash .env
+ANTHROPIC_API_KEY=
+```
+
+### Azure OpenAI
+
+```python
+from langchain_openai import AzureChatOpenAI
+from browser_use import Agent
+from pydantic import SecretStr
+import os
+
+# Initialize the model
+llm = AzureChatOpenAI(
+ model="gpt-4o",
+ api_version='2024-10-21',
+ azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''),
+ api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')),
+)
+
+# Create agent with the model
+agent = Agent(
+ task="Your task here",
+ llm=llm
+)
+```
+
+Required environment variables:
+
+```bash .env
+AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/
+AZURE_OPENAI_KEY=
+```
+
+
+### Gemini
+
+```python
+from langchain_google_genai import ChatGoogleGenerativeAI
+from browser_use import Agent
+from pydantic import SecretStr
+import os
+from dotenv import load_dotenv
+load_dotenv()
+
+api_key = os.getenv("GEMINI_API_KEY")
+
+# Initialize the model
+llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(os.getenv('GEMINI_API_KEY')))
+
+# Create agent with the model
+agent = Agent(
+ task="Your task here",
+ llm=llm
+)
+```
+
+Required environment variables:
+
+```bash .env
+GEMINI_API_KEY=
+```
+
+
+### DeepSeek-V3
+The community likes DeepSeek-V3 for its low price, no rate limits, open-source nature, and good performance.
+The example is available [here](https://github.com/browser-use/browser-use/blob/main/examples/models/deepseek.py).
+
+```python
+from langchain_openai import ChatOpenAI
+from browser_use import Agent
+from pydantic import SecretStr
+
+
+# Initialize the model
+llm=ChatOpenAI(base_url='https://api.deepseek.com/v1', model='deepseek-chat', api_key=SecretStr(api_key))
+
+# Create agent with the model
+agent = Agent(
+ task="Your task here",
+ llm=llm,
+ use_vision=False
+)
+```
+
+Required environment variables:
+
+```bash .env
+DEEPSEEK_API_KEY=
+```
+
+### DeepSeek-R1
+We support DeepSeek-R1. It's not fully tested yet; more and more functionality will be added, e.g. the output of its reasoning content.
+The example is available [here](https://github.com/browser-use/browser-use/blob/main/examples/models/deepseek-r1.py).
+It does not support vision. The model is open-source so you could also use it with Ollama, but we have not tested it.
+```python
+from langchain_openai import ChatOpenAI
+from browser_use import Agent
+from pydantic import SecretStr
+
+
+# Initialize the model
+llm=ChatOpenAI(base_url='https://api.deepseek.com/v1', model='deepseek-reasoner', api_key=SecretStr(api_key))
+
+# Create agent with the model
+agent = Agent(
+ task="Your task here",
+ llm=llm,
+ use_vision=False
+)
+```
+
+Required environment variables:
+
+```bash .env
+DEEPSEEK_API_KEY=
+```
+
+### Ollama
+Many users asked for local models. Here they are.
+
+1. Download Ollama from [here](https://ollama.ai/download)
+2. Run `ollama pull model_name`. Pick a model which supports tool-calling from [here](https://ollama.com/search?c=tools)
+3. Run `ollama start`
+
+```python
+from langchain_ollama import ChatOllama
+from browser_use import Agent
+from pydantic import SecretStr
+
+
+# Initialize the model
+llm=ChatOllama(model="qwen2.5", num_ctx=32000)
+
+# Create agent with the model
+agent = Agent(
+ task="Your task here",
+ llm=llm
+)
+```
+
+Required environment variables: None!
+
+## Coming soon
+(We are working on it)
+- Groq
+- Github
+- Fine-tuned models
diff --git a/docs/customize/system-prompt.mdx b/docs/customize/system-prompt.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..ef7b5343a021eb928aa32fc53a614cb1c4167a87
--- /dev/null
+++ b/docs/customize/system-prompt.mdx
@@ -0,0 +1,66 @@
+---
+title: "System Prompt"
+description: "Customize the system prompt to control agent behavior and capabilities"
+icon: "message"
+---
+
+## Overview
+
+You can customize the system prompt in two ways:
+
+1. Extend the default system prompt with additional instructions
+2. Override the default system prompt entirely
+
+
+ Custom system prompts allow you to modify the agent's behavior at a
+ fundamental level. Use this feature carefully as it can significantly impact
+ the agent's performance and reliability.
+
+
+### Extend System Prompt (recommended)
+
+To add additional instructions to the default system prompt:
+
+```python
+from browser_use import Agent
+from langchain_openai import ChatOpenAI
+
+# Add your custom instructions
+extend_system_message = """
+REMEMBER the most important RULE:
+ALWAYS open first a new tab and go first to url wikipedia.com no matter the task!!!
+"""
+
+# Create agent with extended system prompt
+agent = Agent(
+ task="Your task here",
+ llm=ChatOpenAI(model='gpt-4'),
+ extend_system_message=extend_system_message
+)
+```
+
+### Override System Prompt
+
+
+ Not recommended! If you must override the [default system
+ prompt](https://github.com/browser-use/browser-use/blob/main/browser_use/agent/system_prompt.md),
+ make sure to test the agent yourself.
+
+
+Anyway, to override the default system prompt:
+
+```python
+# Define your complete custom prompt
+override_system_message = """
+You are an AI agent that helps users with web browsing tasks.
+
+[Your complete custom instructions here...]
+"""
+
+# Create agent with custom system prompt
+agent = Agent(
+ task="Your task here",
+ llm=ChatOpenAI(model='gpt-4'),
+ override_system_message=override_system_message
+)
+```
diff --git a/docs/development.mdx b/docs/development.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..c2f2ccdae147db3df4b46d491b3fc6d4fcb2d2f5
--- /dev/null
+++ b/docs/development.mdx
@@ -0,0 +1,106 @@
+---
+title: 'Development'
+description: 'Preview changes locally to update your docs'
+---
+
+
+ **Prerequisite**: Please install Node.js (version 19 or higher) before proceeding.
+
+
+Follow these steps to install and run Mintlify on your operating system:
+
+**Step 1**: Install Mintlify:
+
+
+
+ ```bash npm
+ npm i -g mintlify
+ ```
+
+```bash yarn
+yarn global add mintlify
+```
+
+
+
+**Step 2**: Navigate to the docs directory (where the `mint.json` file is located) and execute the following command:
+
+```bash
+mintlify dev
+```
+
+A local preview of your documentation will be available at `http://localhost:3000`.
+
+### Custom Ports
+
+By default, Mintlify uses port 3000. You can customize the port Mintlify runs on by using the `--port` flag. To run Mintlify on port 3333, for instance, use this command:
+
+```bash
+mintlify dev --port 3333
+```
+
+If you attempt to run Mintlify on a port that's already in use, it will use the next available port:
+
+```md
+Port 3000 is already in use. Trying 3001 instead.
+```
+
+## Mintlify Versions
+
+Please note that each CLI release is associated with a specific version of Mintlify. If your local website doesn't align with the production version, please update the CLI:
+
+
+
+```bash npm
+npm i -g mintlify@latest
+```
+
+```bash yarn
+yarn global upgrade mintlify
+```
+
+
+
+## Validating Links
+
+The CLI can assist with validating reference links made in your documentation. To identify any broken links, use the following command:
+
+```bash
+mintlify broken-links
+```
+
+## Deployment
+
+
+ Unlimited editors available under the [Pro
+ Plan](https://mintlify.com/pricing) and above.
+
+
+If the deployment is successful, you should see the following:
+
+
+
+
+
+## Code Formatting
+
+We suggest using extensions on your IDE to recognize and format MDX. If you're a VSCode user, consider the [MDX VSCode extension](https://marketplace.visualstudio.com/items?itemName=unifiedjs.vscode-mdx) for syntax highlighting, and [Prettier](https://marketplace.visualstudio.com/items?itemName=esbenp.prettier-vscode) for code formatting.
+
+## Troubleshooting
+
+
+
+
+ This may be due to an outdated version of node. Try the following:
+ 1. Remove the currently-installed version of mintlify: `npm remove -g mintlify`
+ 2. Upgrade to Node v19 or higher.
+ 3. Reinstall mintlify: `npm install -g mintlify`
+
+
+
+
+ Solution: Go to the root of your device and delete the \~/.mintlify folder. Afterwards, run `mintlify dev` again.
+
+
+
+Curious about what changed in the CLI version? [Check out the CLI changelog.](https://www.npmjs.com/package/mintlify?activeTab=versions)
diff --git a/docs/development/contribution-guide.mdx b/docs/development/contribution-guide.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..37b551953577b1168c41086f3a381f4214fec9c8
--- /dev/null
+++ b/docs/development/contribution-guide.mdx
@@ -0,0 +1,7 @@
+---
+title: "Contribution Guide"
+description: "Learn how to contribute to Browser Use"
+icon: "code-pull-request"
+---
+
+Working on it!
diff --git a/docs/development/local-setup.mdx b/docs/development/local-setup.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..da2bc3f7118fd1a7f3b468a7d63c65fa4d8d96af
--- /dev/null
+++ b/docs/development/local-setup.mdx
@@ -0,0 +1,82 @@
+---
+title: "Local Setup"
+description: "Set up Browser Use development environment locally"
+icon: "laptop-code"
+---
+
+## Prerequisites
+
+Browser Use requires Python 3.11 or higher. We recommend using [uv](https://docs.astral.sh/uv/) for Python environment management.
+
+## Clone the Repository
+
+First, clone the Browser Use repository:
+
+```bash
+git clone https://github.com/browser-use/browser-use
+cd browser-use
+```
+
+## Environment Setup
+
+1. Create a virtual environment:
+
+```bash
+uv venv --python 3.11
+```
+
+2. Install dependencies:
+
+```bash
+# Install the package in editable mode with all development dependencies
+uv pip install -e ".[dev]"
+```
+
+
+ The `-e` flag installs the package in "editable" mode, which means your local code changes
+ will be reflected immediately without requiring reinstallation. The `[dev]` part installs
+ additional dependencies needed for development.
+
+
+## Configuration
+
+Set up your environment variables:
+
+```bash
+# Copy the example environment file
+cp .env.example .env
+```
+
+Or manually create a `.env` file with your API keys:
+
+```bash .env
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
+```
+
+
+ You can use any LLM model supported by LangChain. See [LangChain
+ Models](/customize/supported-models) for available options and their specific
+ API key requirements.
+
+
+## Development
+
+After setup, you can:
+
+- Run tests with `pytest`
+- Build the package with `hatch build`
+- Try the examples in the `examples/` directory
+
+## Getting Help
+
+If you run into any issues:
+
+1. Check our [GitHub Issues](https://github.com/browser-use/browser-use/issues)
+2. Join our [Discord community](https://link.browser-use.com/discord) for support
+
+
+ We welcome contributions! See our [Contribution
+ Guide](/development/contribution-guide) for guidelines on how to help improve
+ Browser Use.
+
diff --git a/docs/development/observability.mdx b/docs/development/observability.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..955e0388eb2669a85fc5f0526ba96cedca55d82d
--- /dev/null
+++ b/docs/development/observability.mdx
@@ -0,0 +1,66 @@
+---
+title: "Observability"
+description: "Trace Browser Use's agent execution steps and browser sessions"
+icon: "eye"
+---
+
+## Overview
+
+Browser Use has a native integration with [Laminar](https://lmnr.ai) - open-source platform for tracing, evals and labeling of AI agents.
+Read more about Laminar in the [Laminar docs](https://docs.lmnr.ai).
+
+
+ Laminar excels at tracing browser agents by providing unified visibility into both browser session recordings and agent execution steps.
+
+
+## Setup
+
+To setup Laminar, you need to install the `lmnr` package and set the `LMNR_PROJECT_API_KEY` environment variable.
+
+To get your project API key, you can either:
+- Register on [Laminar Cloud](https://lmnr.ai) and get the key from your project settings
+- Or spin up a local Laminar instance and get the key from the settings page
+
+```bash
+pip install 'lmnr[all]'
+export LMNR_PROJECT_API_KEY=
+```
+
+## Usage
+
+Then, simply initialize Laminar at the top of your project, and both Browser Use and session recordings will be automatically traced.
+
+```python {5-8}
+from langchain_openai import ChatOpenAI
+from browser_use import Agent
+import asyncio
+
+from lmnr import Laminar
+# this line auto-instruments Browser Use and any browser you use (local or remote)
+Laminar.initialize(project_api_key="...") # you can also pass project api key here
+
+async def main():
+ agent = Agent(
+ task="open google, search Laminar AI",
+ llm=ChatOpenAI(model="gpt-4o-mini"),
+ )
+ result = await agent.run()
+ print(result)
+
+asyncio.run(main())
+```
+
+## Viewing Traces
+
+You can view traces in the Laminar UI by going to the traces tab in your project.
+When you select a trace, you can see both the browser session recording and the agent execution steps.
+
+The timeline of the browser session is synced with the agent execution steps; timeline highlights indicate the agent's current step within the browser session.
+In the trace view, you can also see the agent's current step, the tool it's using, and that tool's input and output. Tools are highlighted in the timeline in yellow.
+
+
+
+
+## Laminar
+
+To learn more about tracing and evaluating your browser agents, check out the [Laminar docs](https://docs.lmnr.ai).
\ No newline at end of file
diff --git a/docs/development/roadmap.mdx b/docs/development/roadmap.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..34f05f5a407acbb052acf92966e1fc586acfb634
--- /dev/null
+++ b/docs/development/roadmap.mdx
@@ -0,0 +1,7 @@
+---
+title: "Roadmap"
+description: "Future plans and upcoming features for Browser Use"
+icon: "road"
+---
+
+Big things coming soon!
diff --git a/docs/development/telemetry.mdx b/docs/development/telemetry.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..fe4f7cb54c5c3095ebeccc77b6161a78bf3d96f4
--- /dev/null
+++ b/docs/development/telemetry.mdx
@@ -0,0 +1,39 @@
+---
+title: "Telemetry"
+description: "Understanding Browser Use's telemetry and privacy settings"
+icon: "chart-mixed"
+---
+
+## Overview
+
+Browser Use collects anonymous usage data to help us understand how the library is being used and to improve the user experience. It also helps us fix bugs faster and prioritize feature development.
+
+## Data Collection
+
+We use [PostHog](https://posthog.com) for telemetry collection. The data is completely anonymized and contains no personally identifiable information.
+
+
+ We never collect personal information, credentials, or specific content from
+ your browser automation tasks.
+
+
+## Opting Out
+
+You can disable telemetry by setting an environment variable:
+
+```bash .env
+ANONYMIZED_TELEMETRY=false
+```
+
+Or in your Python code:
+
+```python
+import os
+os.environ["ANONYMIZED_TELEMETRY"] = "false"
+```
+
+
+ Even when enabled, telemetry has zero impact on the library's performance or
+ functionality. Code is available in [Telemetry
+ Service](https://github.com/browser-use/browser-use/tree/main/browser_use/telemetry).
+
diff --git a/docs/favicon.svg b/docs/favicon.svg
new file mode 100644
index 0000000000000000000000000000000000000000..59f98742e385191e7338bfb7a5bbd70a24784865
--- /dev/null
+++ b/docs/favicon.svg
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/docs/images/browser-use.png b/docs/images/browser-use.png
new file mode 100644
index 0000000000000000000000000000000000000000..54685c4f4b3c721f74bbb09cfc54c6645216ae70
Binary files /dev/null and b/docs/images/browser-use.png differ
diff --git a/docs/images/checks-passed.png b/docs/images/checks-passed.png
new file mode 100644
index 0000000000000000000000000000000000000000..97603943defbd694eacebf9559b61ddc13020f60
--- /dev/null
+++ b/docs/images/checks-passed.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93add382731d1e6d443b128bbe1ac747b62d0efa1b8372ee3fcd37a59d86da30
+size 160724
diff --git a/docs/images/laminar.png b/docs/images/laminar.png
new file mode 100644
index 0000000000000000000000000000000000000000..8ec7136ea6e0d344b4b9054220c48216011ed813
--- /dev/null
+++ b/docs/images/laminar.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3852ced4a5dee49a567fac0afa8e8ee7a843d5b0f0729cb77d0578cde76d0956
+size 979079
diff --git a/docs/introduction.mdx b/docs/introduction.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..72d7b463d637a121dd07b1cd7a03c8900b77173c
--- /dev/null
+++ b/docs/introduction.mdx
@@ -0,0 +1,101 @@
+---
+title: "Introduction"
+description: "Welcome to Browser Use - We enable AI to control your browser"
+icon: "book-open"
+---
+
+
+
+## Overview
+
+Browser Use is the easiest way to connect your AI agents with the browser. It makes websites accessible for AI agents by providing a powerful, yet simple interface for browser automation.
+
+
+ If you have used Browser Use for your project, feel free to show it off in our
+ [Discord community](https://link.browser-use.com/discord)!
+
+
+## Getting Started
+
+
+
+ Get up and running with Browser Use in minutes
+
+
+ Configure different LLMs for your agents
+
+
+ Learn how to configure and customize your agents
+
+
+ Extend functionality with custom actions
+
+
+
+## Fancy Demos
+
+### Writing in Google Docs
+
+Task: Write a letter in Google Docs to my Papa, thanking him for everything, and save the document as a PDF.
+
+
+
+
+
+### Job Applications
+
+Task: Read my CV & find ML jobs, save them to a file, and then start applying for them in new tabs.
+
+
+
+
+
+### Flight Search
+
+Task: Find flights on kayak.com from Zurich to Beijing.
+
+
+
+
+
+### Data Collection
+
+Task: Look up models with a license of cc-by-sa-4.0 and sort by most likes on Hugging Face, save top 5 to file.
+
+
+
+
+
+## Community & Support
+
+
+
+ Join our community for support and showcases
+
+
+ Star us on GitHub and contribute to development
+
+
+
+
+ Browser Use is MIT licensed and actively maintained. We welcome contributions
+ and feedback from the community!
+
diff --git a/docs/logo/dark.svg b/docs/logo/dark.svg
new file mode 100644
index 0000000000000000000000000000000000000000..02e2e4fcb9304bb3ea90558ecba2a181aac341a5
--- /dev/null
+++ b/docs/logo/dark.svg
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/docs/logo/light.svg b/docs/logo/light.svg
new file mode 100644
index 0000000000000000000000000000000000000000..f01b2e2502add4e60e5547d6a8bbcab7de79f1b5
--- /dev/null
+++ b/docs/logo/light.svg
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
diff --git a/docs/quickstart.mdx b/docs/quickstart.mdx
new file mode 100644
index 0000000000000000000000000000000000000000..451bbcf63c76314b535fb8bfe17e18c159f80815
--- /dev/null
+++ b/docs/quickstart.mdx
@@ -0,0 +1,76 @@
+---
+title: "Quickstart"
+description: "Start using Browser Use with this quickstart guide"
+icon: "rocket"
+---
+
+{/* You can install Browser Use from PyPI or clone it from Github. */}
+
+## Prepare the environment
+
+Browser Use requires Python 3.11 or higher.
+
+First, we recommend using [uv](https://docs.astral.sh/uv/) to setup the Python environment.
+
+```bash
+uv venv --python 3.11
+```
+
+and activate it with:
+
+```bash
+# For Mac/Linux:
+source .venv/bin/activate
+
+# For Windows:
+.venv\Scripts\activate
+```
+
+Install the dependencies:
+
+```bash
+uv pip install browser-use
+```
+
+Then install playwright:
+
+```bash
+playwright install
+```
+
+## Create an agent
+
+Then you can use the agent as follows:
+
+```python agent.py
+from langchain_openai import ChatOpenAI
+from browser_use import Agent
+from dotenv import load_dotenv
+load_dotenv()
+
+import asyncio
+
+llm = ChatOpenAI(model="gpt-4o")
+
+async def main():
+ agent = Agent(
+ task="Compare the price of gpt-4o and DeepSeek-V3",
+ llm=llm,
+ )
+ result = await agent.run()
+ print(result)
+
+asyncio.run(main())
+```
+
+## Set up your LLM API keys
+
+`ChatOpenAI` and other Langchain chat models require API keys. You should store these in your `.env` file. For example, for OpenAI and Anthropic, you can set the API keys in your `.env` file, such as:
+
+
+```bash .env
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
+```
+
+For other LLM models you can refer to the [Langchain documentation](https://python.langchain.com/docs/integrations/chat/) to find how to set them up with their specific API keys.
diff --git a/eval/claude.py b/eval/claude.py
new file mode 100644
index 0000000000000000000000000000000000000000..b34c668c49d0c5f4a3c239e7dc924ddcf3fca393
--- /dev/null
+++ b/eval/claude.py
@@ -0,0 +1,18 @@
+from dotenv import load_dotenv
+from langchain_anthropic import ChatAnthropic
+
+from browser_use import Agent
+
+load_dotenv()
+
+
+async def run_agent(task: str, max_steps: int = 38):
+ llm = ChatAnthropic(
+ model_name='claude-3-5-sonnet-20240620',
+ temperature=0.0,
+ timeout=100,
+ stop=None,
+ )
+ agent = Agent(task=task, llm=llm)
+ result = await agent.run(max_steps=max_steps)
+ return result
diff --git a/eval/deepseek-r1.py b/eval/deepseek-r1.py
new file mode 100644
index 0000000000000000000000000000000000000000..03da9edfc619749d12c8c30880e929181047b7c4
--- /dev/null
+++ b/eval/deepseek-r1.py
@@ -0,0 +1,24 @@
+import os
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+from pydantic import SecretStr
+
+from browser_use import Agent
+
+load_dotenv()
+
+api_key_deepseek = os.getenv('DEEPSEEK_API_KEY', '')
+if not api_key_deepseek:
+ raise ValueError('DEEPSEEK_API_KEY is not set')
+
+
+async def run_agent(task: str, max_steps: int = 38):
+ llm = ChatOpenAI(
+ base_url='https://api.deepseek.com/v1',
+ model='deepseek-reasoner',
+ api_key=SecretStr(api_key_deepseek),
+ )
+ agent = Agent(task=task, llm=llm, use_vision=False)
+ result = await agent.run(max_steps=max_steps)
+ return result
diff --git a/eval/deepseek.py b/eval/deepseek.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ec8289bc2be0c10c8dad2f4349684df8d791a08
--- /dev/null
+++ b/eval/deepseek.py
@@ -0,0 +1,24 @@
+import os
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+from pydantic import SecretStr
+
+from browser_use import Agent
+
+load_dotenv()
+
+api_key_deepseek = os.getenv('DEEPSEEK_API_KEY', '')
+if not api_key_deepseek:
+ raise ValueError('DEEPSEEK_API_KEY is not set')
+
+
+async def run_agent(task: str, max_steps: int = 38):
+ llm = ChatOpenAI(
+ base_url='https://api.deepseek.com/v1',
+ model='deepseek-chat',
+ api_key=SecretStr(api_key_deepseek),
+ )
+ agent = Agent(task=task, llm=llm, use_vision=False)
+ result = await agent.run(max_steps=max_steps)
+ return result
diff --git a/eval/gemini-1.5-flash.py b/eval/gemini-1.5-flash.py
new file mode 100644
index 0000000000000000000000000000000000000000..051f85d3c7fdb12b00714fc081c3453e2eafcda4
--- /dev/null
+++ b/eval/gemini-1.5-flash.py
@@ -0,0 +1,20 @@
+import os
+
+from dotenv import load_dotenv
+from langchain_google_genai import ChatGoogleGenerativeAI
+from pydantic import SecretStr
+
+from browser_use import Agent
+
+load_dotenv()
+
+api_key = os.getenv('GEMINI_API_KEY', '')
+if not api_key:
+ raise ValueError('GEMINI_API_KEY is not set')
+
+
+async def run_agent(task: str, max_steps: int = 38):
+ llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash-latest', api_key=SecretStr(api_key))
+ agent = Agent(task=task, llm=llm)
+ result = await agent.run(max_steps=max_steps)
+ return result
diff --git a/eval/gemini-2.0-flash.py b/eval/gemini-2.0-flash.py
new file mode 100644
index 0000000000000000000000000000000000000000..803895c7735c273b12925e8a7a6cb1f5cd439154
--- /dev/null
+++ b/eval/gemini-2.0-flash.py
@@ -0,0 +1,20 @@
+import os
+
+from dotenv import load_dotenv
+from langchain_google_genai import ChatGoogleGenerativeAI
+from pydantic import SecretStr
+
+from browser_use import Agent
+
+load_dotenv()
+
+api_key = os.getenv('GEMINI_API_KEY', '')
+if not api_key:
+ raise ValueError('GEMINI_API_KEY is not set')
+
+
+async def run_agent(task: str, max_steps: int = 38):
+ llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))
+ agent = Agent(task=task, llm=llm)
+ result = await agent.run(max_steps=max_steps)
+ return result
diff --git a/eval/gpt-4o-no-boundingbox.py b/eval/gpt-4o-no-boundingbox.py
new file mode 100644
index 0000000000000000000000000000000000000000..6343824147f59689ab751ed456f8330b419e16b5
--- /dev/null
+++ b/eval/gpt-4o-no-boundingbox.py
@@ -0,0 +1,32 @@
+import asyncio
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent, BrowserConfig
+from browser_use.browser.browser import Browser
+from browser_use.browser.context import BrowserContextConfig
+
+load_dotenv()
+
+
+async def run_agent(task: str, max_steps: int = 38):
+ browser = Browser(
+ config=BrowserConfig(
+ new_context_config=BrowserContextConfig(
+ highlight_elements=False,
+ ),
+ ),
+ )
+ llm = ChatOpenAI(
+ model='gpt-4o',
+ temperature=0.0,
+ )
+ agent = Agent(task=task, llm=llm, browser=browser)
+ result = await agent.run(max_steps=max_steps)
+ return result
+
+
+if __name__ == '__main__':
+ task = 'Open 1 random Wikipedia pages in new tab'
+ result = asyncio.run(run_agent(task))
diff --git a/eval/gpt-4o-no-vision.py b/eval/gpt-4o-no-vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..47a5c21fe02500bfff83701f79cd055e56a2f34e
--- /dev/null
+++ b/eval/gpt-4o-no-vision.py
@@ -0,0 +1,16 @@
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent
+
+load_dotenv()
+
+
+async def run_agent(task: str, max_steps: int = 38):
+ llm = ChatOpenAI(
+ model='gpt-4o',
+ temperature=0.0,
+ )
+ agent = Agent(task=task, llm=llm, use_vision=False)
+ result = await agent.run(max_steps=max_steps)
+ return result
diff --git a/eval/gpt-4o-viewport-0.py b/eval/gpt-4o-viewport-0.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb3ada91b285826da6c62decf36d84f361c408ad
--- /dev/null
+++ b/eval/gpt-4o-viewport-0.py
@@ -0,0 +1,33 @@
+import asyncio
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent, BrowserConfig
+from browser_use.browser.browser import Browser
+from browser_use.browser.context import BrowserContextConfig
+
+load_dotenv()
+
+
+async def run_agent(task: str, max_steps: int = 38):
+ llm = ChatOpenAI(
+ model='gpt-4o',
+ temperature=0.0,
+ )
+ browser = Browser(
+ config=BrowserConfig(
+ new_context_config=BrowserContextConfig(
+ viewport_expansion=0,
+ ),
+ ),
+ )
+ agent = Agent(task=task, llm=llm, browser=browser)
+ result = await agent.run(max_steps=max_steps)
+ return result
+
+
+if __name__ == '__main__':
+ task = 'Go to https://www.google.com and search for "python" and click on the first result'
+ result = asyncio.run(run_agent(task))
+ print(result)
diff --git a/eval/gpt-4o.py b/eval/gpt-4o.py
new file mode 100644
index 0000000000000000000000000000000000000000..3cdcbd64d4859dbce607878a529749a9bf07be76
--- /dev/null
+++ b/eval/gpt-4o.py
@@ -0,0 +1,24 @@
+import asyncio
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent
+
+load_dotenv()
+
+
+async def run_agent(task: str, max_steps: int = 38):
+ llm = ChatOpenAI(
+ model='gpt-4o',
+ temperature=0.0,
+ )
+ agent = Agent(task=task, llm=llm)
+ result = await agent.run(max_steps=max_steps)
+ return result
+
+
+if __name__ == '__main__':
+ task = 'Go to https://www.google.com and search for "python" and click on the first result'
+ result = asyncio.run(run_agent(task))
+ print(result)
diff --git a/examples/browser/real_browser.py b/examples/browser/real_browser.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bd255ae885176c14d5659e76196d6a2f9da8b24
--- /dev/null
+++ b/examples/browser/real_browser.py
@@ -0,0 +1,38 @@
+import os
+import sys
+from pathlib import Path
+
+from browser_use.agent.views import ActionResult
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import asyncio
+
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent, Controller
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext
+
+browser = Browser(
+ config=BrowserConfig(
+ # NOTE: you need to close your chrome browser - so that this can open your browser in debug mode
+ chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
+ )
+)
+
+
+async def main():
+ agent = Agent(
+ task='In docs.google.com write my Papa a quick letter',
+ llm=ChatOpenAI(model='gpt-4o'),
+ browser=browser,
+ )
+
+ await agent.run()
+ await browser.close()
+
+ input('Press Enter to close...')
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/browser/using_cdp.py b/examples/browser/using_cdp.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0c877b27c7538209e8e6ab976a1a90fb1ac63e7
--- /dev/null
+++ b/examples/browser/using_cdp.py
@@ -0,0 +1,60 @@
+"""
+Simple demonstration of the CDP feature.
+
+To test this locally, follow these steps:
+1. Create a shortcut for the executable Chrome file.
+2. Add the following argument to the shortcut:
+ - On Windows: `--remote-debugging-port=9222`
+3. Open a web browser and navigate to `http://localhost:9222/json/version` to verify that the Remote Debugging Protocol (CDP) is running.
+4. Launch this example.
+
+@dev You need to set the `GEMINI_API_KEY` environment variable before proceeding.
+"""
+
+import os
+import sys
+
+from dotenv import load_dotenv
+from pydantic import SecretStr
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import asyncio
+
+from langchain_google_genai import ChatGoogleGenerativeAI
+
+from browser_use import Agent, Controller
+from browser_use.browser.browser import Browser, BrowserConfig
+
+load_dotenv()
+api_key = os.getenv('GEMINI_API_KEY')
+if not api_key:
+ raise ValueError('GEMINI_API_KEY is not set')
+
+browser = Browser(
+ config=BrowserConfig(
+ headless=False,
+ cdp_url='http://localhost:9222',
+ )
+)
+controller = Controller()
+
+
+async def main():
+ task = 'In docs.google.com write my Papa a quick thank you for everything letter \n - Magnus'
+ task += ' and save the document as pdf'
+ model = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(str(api_key)))
+ agent = Agent(
+ task=task,
+ llm=model,
+ controller=controller,
+ browser=browser,
+ )
+
+ await agent.run()
+ await browser.close()
+
+ input('Press Enter to close...')
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/custom-functions/advanced_search.py b/examples/custom-functions/advanced_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..f11f6ec6be186c45272131e8184db164fe042d30
--- /dev/null
+++ b/examples/custom-functions/advanced_search.py
@@ -0,0 +1,98 @@
+import json
+import os
+import sys
+
+import requests
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import asyncio
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel
+
+from browser_use import ActionResult, Agent, Controller
+
+load_dotenv()
+
+
+class Person(BaseModel):
+ name: str
+ email: str | None = None
+
+
+class PersonList(BaseModel):
+ people: list[Person]
+
+
+controller = Controller(exclude_actions=['search_google'], output_model=PersonList)
+BEARER_TOKEN = os.getenv('BEARER_TOKEN')
+
+if not BEARER_TOKEN:
+ # use the api key for ask tessa
+ # you can also use other apis like exa, xAI, perplexity, etc.
+ raise ValueError('BEARER_TOKEN is not set - go to https://www.heytessa.ai/ and create an api key')
+
+
+@controller.registry.action('Search the web for a specific query')
+async def search_web(query: str):
+ keys_to_use = ['url', 'title', 'content', 'author', 'score']
+ headers = {'Authorization': f'Bearer {BEARER_TOKEN}'}
+ response = requests.post('https://asktessa.ai/api/search', headers=headers, json={'query': query})
+
+ final_results = [
+ {key: source[key] for key in keys_to_use if key in source}
+ for source in response.json()['sources']
+ if source['score'] >= 0.8
+ ]
+ # print(json.dumps(final_results, indent=4))
+ result_text = json.dumps(final_results, indent=4)
+ print(result_text)
+ return ActionResult(extracted_content=result_text, include_in_memory=True)
+
+
+names = [
+ 'Ruedi Aebersold',
+ 'Bernd Bodenmiller',
+ 'Eugene Demler',
+ 'Erich Fischer',
+ 'Pietro Gambardella',
+ 'Matthias Huss',
+ 'Reto Knutti',
+ 'Maksym Kovalenko',
+ 'Antonio Lanzavecchia',
+ 'Maria Lukatskaya',
+ 'Jochen Markard',
+ 'Javier PΓ©rez-RamΓrez',
+ 'Federica Sallusto',
+ 'Gisbert Schneider',
+ 'Sonia I. Seneviratne',
+ 'Michael Siegrist',
+ 'Johan Six',
+ 'Tanja Stadler',
+ 'Shinichi Sunagawa',
+ 'Michael Bruce Zimmermann',
+]
+
+
+async def main():
+ task = 'use search_web with "find email address of the following ETH professor:" for each of the following persons in a list of actions. Finally return the list with name and email if provided'
+ task += '\n' + '\n'.join(names)
+ model = ChatOpenAI(model='gpt-4o')
+ agent = Agent(task=task, llm=model, controller=controller, max_actions_per_step=20)
+
+ history = await agent.run()
+
+ result = history.final_result()
+ if result:
+ parsed: PersonList = PersonList.model_validate_json(result)
+
+ for person in parsed.people:
+ print(f'{person.name} - {person.email}')
+ else:
+ print('No result')
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/custom-functions/clipboard.py b/examples/custom-functions/clipboard.py
new file mode 100644
index 0000000000000000000000000000000000000000..e06d3e5de1ac0f4d5aa7cec045ec984e2838cf9e
--- /dev/null
+++ b/examples/custom-functions/clipboard.py
@@ -0,0 +1,58 @@
+import os
+import sys
+from pathlib import Path
+
+from browser_use.agent.views import ActionResult
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import asyncio
+
+import pyperclip
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent, Controller
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext
+
+browser = Browser(
+ config=BrowserConfig(
+ headless=False,
+ )
+)
+controller = Controller()
+
+
+@controller.registry.action('Copy text to clipboard')
+def copy_to_clipboard(text: str):
+ pyperclip.copy(text)
+ return ActionResult(extracted_content=text)
+
+
+@controller.registry.action('Paste text from clipboard')
+async def paste_from_clipboard(browser: BrowserContext):
+ text = pyperclip.paste()
+ # send text to browser
+ page = await browser.get_current_page()
+ await page.keyboard.type(text)
+
+ return ActionResult(extracted_content=text)
+
+
+async def main():
+ task = f'Copy the text "Hello, world!" to the clipboard, then go to google.com and paste the text'
+ model = ChatOpenAI(model='gpt-4o')
+ agent = Agent(
+ task=task,
+ llm=model,
+ controller=controller,
+ browser=browser,
+ )
+
+ await agent.run()
+ await browser.close()
+
+ input('Press Enter to close...')
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/custom-functions/file_upload.py b/examples/custom-functions/file_upload.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1efdf6c70952a8f31b9b705074df71892ea951f
--- /dev/null
+++ b/examples/custom-functions/file_upload.py
@@ -0,0 +1,108 @@
+import os
+import sys
+from pathlib import Path
+
+from browser_use.agent.views import ActionResult
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import asyncio
+import logging
+
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent, Controller
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext
+
+logger = logging.getLogger(__name__)
+
+# Initialize browser and controller first
+browser = Browser(
+ config=BrowserConfig(
+ headless=False,
+ chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
+ )
+)
+controller = Controller()
+
+
+@controller.action(
+ 'Upload file to interactive element with file path ',
+)
+async def upload_file(index: int, path: str, browser: BrowserContext, available_file_paths: list[str]):
+ if path not in available_file_paths:
+ return ActionResult(error=f'File path {path} is not available')
+
+ if not os.path.exists(path):
+ return ActionResult(error=f'File {path} does not exist')
+
+ dom_el = await browser.get_dom_element_by_index(index)
+
+ file_upload_dom_el = dom_el.get_file_upload_element()
+
+ if file_upload_dom_el is None:
+ msg = f'No file upload element found at index {index}'
+ logger.info(msg)
+ return ActionResult(error=msg)
+
+ file_upload_el = await browser.get_locate_element(file_upload_dom_el)
+
+ if file_upload_el is None:
+ msg = f'No file upload element found at index {index}'
+ logger.info(msg)
+ return ActionResult(error=msg)
+
+ try:
+ await file_upload_el.set_input_files(path)
+ msg = f'Successfully uploaded file to index {index}'
+ logger.info(msg)
+ return ActionResult(extracted_content=msg, include_in_memory=True)
+ except Exception as e:
+ msg = f'Failed to upload file to index {index}: {str(e)}'
+ logger.info(msg)
+ return ActionResult(error=msg)
+
+
+@controller.action('Read the file content of a file given a path')
+async def read_file(path: str, available_file_paths: list[str]):
+ if path not in available_file_paths:
+ return ActionResult(error=f'File path {path} is not available')
+
+ with open(path, 'r') as f:
+ content = f.read()
+ msg = f'File content: {content}'
+ logger.info(msg)
+ return ActionResult(extracted_content=msg, include_in_memory=True)
+
+
+def create_file(file_type: str = 'txt'):
+ with open(f'tmp.{file_type}', 'w') as f:
+ f.write('test')
+ file_path = Path.cwd() / f'tmp.{file_type}'
+ logger.info(f'Created file: {file_path}')
+ return str(file_path)
+
+
+async def main():
+ task = f'Go to https://kzmpmkh2zfk1ojnpxfn1.lite.vusercontent.net/ and - read the file content and upload them to fields'
+
+ available_file_paths = [create_file('txt'), create_file('pdf'), create_file('csv')]
+
+ model = ChatOpenAI(model='gpt-4o')
+ agent = Agent(
+ task=task,
+ llm=model,
+ controller=controller,
+ browser=browser,
+ available_file_paths=available_file_paths,
+ )
+
+ await agent.run()
+
+ await browser.close()
+
+ input('Press Enter to close...')
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/custom-functions/notification.py b/examples/custom-functions/notification.py
new file mode 100644
index 0000000000000000000000000000000000000000..56a8bfedef9f125ad336ed988db652adf7603650
--- /dev/null
+++ b/examples/custom-functions/notification.py
@@ -0,0 +1,46 @@
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import asyncio
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel
+
+from browser_use import ActionResult, Agent, Controller
+
+load_dotenv()
+
+controller = Controller()
+
+
+@controller.registry.action('Done with task ')
+async def done(text: str):
+ import yagmail
+
+ # To send emails use
+ # STEP 1: go to https://support.google.com/accounts/answer/185833
+ # STEP 2: Create an app password (you cant use here your normal gmail password)
+ # STEP 3: Use the app password in the code below for the password
+ yag = yagmail.SMTP('your_email@gmail.com', 'your_app_password')
+ yag.send(
+ to='recipient@example.com',
+ subject='Test Email',
+ contents=f'result\n: {text}',
+ )
+
+ return ActionResult(is_done=True, extracted_content='Email sent!')
+
+
+async def main():
+ task = 'go to brower-use.com and then done'
+ model = ChatOpenAI(model='gpt-4o')
+ agent = Agent(task=task, llm=model, controller=controller)
+
+ await agent.run()
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/custom-functions/save_to_file_hugging_face.py b/examples/custom-functions/save_to_file_hugging_face.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c332a408d6826e569ffa661c91c548634bd46d8
--- /dev/null
+++ b/examples/custom-functions/save_to_file_hugging_face.py
@@ -0,0 +1,48 @@
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import asyncio
+from typing import List, Optional
+
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel
+
+from browser_use.agent.service import Agent
+from browser_use.controller.service import Controller
+
+# Initialize controller first
+controller = Controller()
+
+
+class Model(BaseModel):
+ title: str
+ url: str
+ likes: int
+ license: str
+
+
+class Models(BaseModel):
+ models: List[Model]
+
+
+@controller.action('Save models', param_model=Models)
+def save_models(params: Models):
+ with open('models.txt', 'a') as f:
+ for model in params.models:
+ f.write(f'{model.title} ({model.url}): {model.likes} likes, {model.license}\n')
+
+
+# video: https://preview.screen.studio/share/EtOhIk0P
+async def main():
+ task = f'Look up models with a license of cc-by-sa-4.0 and sort by most likes on Hugging face, save top 5 to file.'
+
+ model = ChatOpenAI(model='gpt-4o')
+ agent = Agent(task=task, llm=model, controller=controller)
+
+ await agent.run()
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/features/custom_output.py b/examples/features/custom_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf76d9dcc8a3034afad50a984db2ccf026b53f6c
--- /dev/null
+++ b/examples/features/custom_output.py
@@ -0,0 +1,60 @@
+"""
+Show how to use custom outputs.
+
+@dev You need to add OPENAI_API_KEY to your environment variables.
+"""
+
+import os
+import sys
+from typing import List
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import asyncio
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel
+
+from browser_use import ActionResult, Agent, Controller
+
+load_dotenv()
+
+
+class Post(BaseModel):
+ post_title: str
+ post_url: str
+ num_comments: int
+ hours_since_post: int
+
+
+class Posts(BaseModel):
+ posts: List[Post]
+
+
+controller = Controller(output_model=Posts)
+
+
+async def main():
+ task = 'Go to hackernews show hn and give me the first 5 posts'
+ model = ChatOpenAI(model='gpt-4o')
+ agent = Agent(task=task, llm=model, controller=controller)
+
+ history = await agent.run()
+
+ result = history.final_result()
+ if result:
+ parsed: Posts = Posts.model_validate_json(result)
+
+ for post in parsed.posts:
+ print('\n--------------------------------')
+ print(f'Title: {post.post_title}')
+ print(f'URL: {post.post_url}')
+ print(f'Comments: {post.num_comments}')
+ print(f'Hours since post: {post.hours_since_post}')
+ else:
+ print('No result')
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/features/custom_system_prompt.py b/examples/features/custom_system_prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..e873631bdd9a1057be76c25baf52e79548b347f6
--- /dev/null
+++ b/examples/features/custom_system_prompt.py
@@ -0,0 +1,36 @@
+import json
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import asyncio
+
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent
+
+extend_system_message = (
+ 'REMEMBER the most important RULE: ALWAYS open first a new tab and go first to url wikipedia.com no matter the task!!!'
+)
+
+# or use override_system_message to completely override the system prompt
+
+
+async def main():
+ task = "do google search to find images of Elon Musk's wife"
+ model = ChatOpenAI(model='gpt-4o')
+ agent = Agent(task=task, llm=model, extend_system_message=extend_system_message)
+
+ print(
+ json.dumps(
+ agent.message_manager.system_prompt.model_dump(exclude_unset=True),
+ indent=4,
+ )
+ )
+
+ await agent.run()
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/features/custom_user_agent.py b/examples/features/custom_user_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..f832d92ade9bf2b7dcafaf327466fb3bd7318577
--- /dev/null
+++ b/examples/features/custom_user_agent.py
@@ -0,0 +1,76 @@
+import os
+import sys
+
+from langchain_anthropic import ChatAnthropic
+from langchain_openai import ChatOpenAI
+
+from browser_use.browser.context import BrowserContext, BrowserContextConfig
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import argparse
+import asyncio
+
+from browser_use import Agent
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.controller.service import Controller
+
+
+def get_llm(provider: str):
+ if provider == 'anthropic':
+ return ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None, temperature=0.0)
+ elif provider == 'openai':
+ return ChatOpenAI(model='gpt-4o', temperature=0.0)
+
+ else:
+ raise ValueError(f'Unsupported provider: {provider}')
+
+
+# NOTE: This example is to find your current user agent string to use it in the browser_context
+task = 'go to https://whatismyuseragent.com and find the current user agent string '
+
+
+controller = Controller()
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--query', type=str, help='The query to process', default=task)
+parser.add_argument(
+ '--provider',
+ type=str,
+ choices=['openai', 'anthropic'],
+ default='openai',
+ help='The model provider to use (default: openai)',
+)
+
+args = parser.parse_args()
+
+llm = get_llm(args.provider)
+
+
+browser = Browser(
+ config=BrowserConfig(
+ # chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
+ )
+)
+
+browser_context = BrowserContext(config=BrowserContextConfig(user_agent='foobarfoo'), browser=browser)
+
+agent = Agent(
+ task=args.query,
+ llm=llm,
+ controller=controller,
+ # browser=browser,
+ browser_context=browser_context,
+ use_vision=True,
+ max_actions_per_step=1,
+)
+
+
+async def main():
+ await agent.run(max_steps=25)
+
+ input('Press Enter to close the browser...')
+ await browser_context.close()
+
+
+asyncio.run(main())
diff --git a/examples/features/download_file.py b/examples/features/download_file.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e2782cdd1f3400e15c397908dfe5791e96ab61e
--- /dev/null
+++ b/examples/features/download_file.py
@@ -0,0 +1,37 @@
+import asyncio
+import os
+
+from dotenv import load_dotenv
+from langchain_google_genai import ChatGoogleGenerativeAI
+from pydantic import SecretStr
+
+from browser_use import Agent
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContextConfig
+
+load_dotenv()
+api_key = os.getenv('GEMINI_API_KEY')
+if not api_key:
+ raise ValueError('GEMINI_API_KEY is not set')
+llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))
+browser = Browser(
+ config=BrowserConfig(
+ new_context_config=BrowserContextConfig(save_downloads_path=os.path.join(os.path.expanduser('~'), 'downloads'))
+ )
+)
+
+
+async def run_download():
+ agent = Agent(
+ task=('Go to "https://file-examples.com/" and download the smallest doc file.'),
+ llm=llm,
+ max_actions_per_step=8,
+ use_vision=True,
+ browser=browser,
+ )
+ await agent.run(max_steps=25)
+ await browser.close()
+
+
+if __name__ == '__main__':
+ asyncio.run(run_download())
diff --git a/examples/features/follow_up_tasks.py b/examples/features/follow_up_tasks.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa326691a1a41489c6c1bbf185b32fd42c7475fe
--- /dev/null
+++ b/examples/features/follow_up_tasks.py
@@ -0,0 +1,37 @@
+import asyncio
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent
+from browser_use.agent.views import ActionResult
+from browser_use.controller.service import Controller
+
+load_dotenv()
+
+# Initialize the model
+llm = ChatOpenAI(
+ model='gpt-4o',
+ temperature=0.0,
+)
+controller = Controller()
+
+
+task = 'Find the founders of browser-use and draft them a short personalized message'
+
+agent = Agent(task=task, llm=llm, controller=controller)
+
+
+async def main():
+ await agent.run()
+
+ # new_task = input('Type in a new task: ')
+ new_task = 'Find an image of the founders'
+
+ agent.add_new_task(new_task)
+
+ await agent.run()
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/features/initial_actions.py b/examples/features/initial_actions.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc24a2acc64d20ffa8a6f68b118bb22a84541382
--- /dev/null
+++ b/examples/features/initial_actions.py
@@ -0,0 +1,28 @@
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent
+
+load_dotenv()
+llm = ChatOpenAI(model='gpt-4o')
+
+initial_actions = [
+ {'open_tab': {'url': 'https://www.google.com'}},
+ {'open_tab': {'url': 'https://en.wikipedia.org/wiki/Randomness'}},
+ {'scroll_down': {'amount': 1000}},
+]
+agent = Agent(
+ task='What theories are displayed on the page?',
+ initial_actions=initial_actions,
+ llm=llm,
+)
+
+
+async def main():
+ await agent.run(max_steps=10)
+
+
+if __name__ == '__main__':
+ import asyncio
+
+ asyncio.run(main())
diff --git a/examples/features/multi-tab_handling.py b/examples/features/multi-tab_handling.py
new file mode 100644
index 0000000000000000000000000000000000000000..c86fabef1d317d6438df14305030dee9215863d7
--- /dev/null
+++ b/examples/features/multi-tab_handling.py
@@ -0,0 +1,30 @@
+"""
+Simple try of the agent.
+
+@dev You need to add OPENAI_API_KEY to your environment variables.
+"""
+
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import asyncio
+
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent
+
+# video: https://preview.screen.studio/share/clenCmS6
+llm = ChatOpenAI(model='gpt-4o')
+agent = Agent(
+ task='open 3 tabs with elon musk, trump, and steve jobs, then go back to the first and stop',
+ llm=llm,
+)
+
+
+async def main():
+ await agent.run()
+
+
+asyncio.run(main())
diff --git a/examples/features/multiple_agents_same_browser.py b/examples/features/multiple_agents_same_browser.py
new file mode 100644
index 0000000000000000000000000000000000000000..013b33ff3f850e087b6a7b588b71ecc60a9d6e0a
--- /dev/null
+++ b/examples/features/multiple_agents_same_browser.py
@@ -0,0 +1,64 @@
+import os
+import sys
+
+from langchain_openai import ChatOpenAI
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import asyncio
+
+from browser_use import Agent, Browser, Controller
+
+
+# Video: https://preview.screen.studio/share/8Elaq9sm
+async def main():
+ # Persist the browser state across agents
+
+ browser = Browser()
+ async with await browser.new_context() as context:
+ model = ChatOpenAI(model='gpt-4o')
+ current_agent = None
+
+ async def get_input():
+ return await asyncio.get_event_loop().run_in_executor(
+ None, lambda: input('Enter task (p: pause current agent, r: resume, b: break): ')
+ )
+
+ while True:
+ task = await get_input()
+
+ if task.lower() == 'p':
+ # Pause the current agent if one exists
+ if current_agent:
+ current_agent.pause()
+ continue
+ elif task.lower() == 'r':
+ # Resume the current agent if one exists
+ if current_agent:
+ current_agent.resume()
+ continue
+ elif task.lower() == 'b':
+ # Break the current agent's execution if one exists
+ if current_agent:
+ current_agent.stop()
+ current_agent = None
+ continue
+
+ # If there's a current agent running, pause it before starting new one
+ if current_agent:
+ current_agent.pause()
+
+ # Create and run new agent with the task
+ current_agent = Agent(
+ task=task,
+ llm=model,
+ browser_context=context,
+ )
+
+ # Run the agent asynchronously without blocking
+ asyncio.create_task(current_agent.run())
+
+
+asyncio.run(main())
+
+# Now add the cheapest to the cart
diff --git a/examples/features/outsource_state.py b/examples/features/outsource_state.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7bedc04c691d26c5685f58b376119b42f9b3164
--- /dev/null
+++ b/examples/features/outsource_state.py
@@ -0,0 +1,70 @@
+"""
+Show how to use custom outputs.
+
+@dev You need to add OPENAI_API_KEY to your environment variables.
+"""
+
+import os
+import sys
+
+from browser_use.agent.views import AgentState
+from browser_use.browser.browser import Browser, BrowserConfig
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import asyncio
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent
+
+load_dotenv()
+
+
+async def main():
+ task = 'Go to hackernews show hn and give me the first 5 posts'
+
+ browser = Browser(
+ config=BrowserConfig(
+ headless=True,
+ )
+ )
+
+ browser_context = await browser.new_context()
+
+ agent_state = AgentState()
+
+ for i in range(10):
+ agent = Agent(
+ task=task,
+ llm=ChatOpenAI(model='gpt-4o'),
+ browser=browser,
+ browser_context=browser_context,
+ injected_agent_state=agent_state,
+ page_extraction_llm=ChatOpenAI(model='gpt-4o-mini'),
+ )
+
+ done, valid = await agent.take_step()
+ print(f'Step {i}: Done: {done}, Valid: {valid}')
+
+ if done and valid:
+ break
+
+ agent_state.history.history = []
+
+ # Save state to file
+ with open('agent_state.json', 'w') as f:
+ serialized = agent_state.model_dump_json(exclude={'history'})
+ f.write(serialized)
+
+ # Load state back from file
+ with open('agent_state.json', 'r') as f:
+ loaded_json = f.read()
+ agent_state = AgentState.model_validate_json(loaded_json)
+
+ break
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/features/parallel_agents.py b/examples/features/parallel_agents.py
new file mode 100644
index 0000000000000000000000000000000000000000..e179b4478cc131e2840822e5d6bfd86ee9469ca5
--- /dev/null
+++ b/examples/features/parallel_agents.py
@@ -0,0 +1,55 @@
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import asyncio
+
+from langchain_openai import ChatOpenAI
+
+from browser_use.agent.service import Agent
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContextConfig
+
+browser = Browser(
+ config=BrowserConfig(
+ disable_security=True,
+ headless=False,
+ new_context_config=BrowserContextConfig(save_recording_path='./tmp/recordings'),
+ )
+)
+llm = ChatOpenAI(model='gpt-4o')
+
+
+async def main():
+ agents = [
+ Agent(task=task, llm=llm, browser=browser)
+ for task in [
+ 'Search Google for weather in Tokyo',
+ 'Check Reddit front page title',
+ 'Look up Bitcoin price on Coinbase',
+ 'Find NASA image of the day',
+ # 'Check top story on CNN',
+ # 'Search latest SpaceX launch date',
+ # 'Look up population of Paris',
+ # 'Find current time in Sydney',
+ # 'Check who won last Super Bowl',
+ # 'Search trending topics on Twitter',
+ ]
+ ]
+
+ await asyncio.gather(*[agent.run() for agent in agents])
+
+ # async with await browser.new_context() as context:
+ agentX = Agent(
+ task='Go to apple.com and return the title of the page',
+ llm=llm,
+ browser=browser,
+ # browser_context=context,
+ )
+ await agentX.run()
+
+ await browser.close()
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/features/pause_agent.py b/examples/features/pause_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..e021224cfb4b79125ed1d654f0cbeba94aae3120
--- /dev/null
+++ b/examples/features/pause_agent.py
@@ -0,0 +1,98 @@
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import asyncio
+import threading
+import time
+
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent
+
+
+class AgentController:
+ def __init__(self):
+ llm = ChatOpenAI(model='gpt-4o')
+ self.agent = Agent(
+ task='open in one action https://www.google.com, https://www.wikipedia.org, https://www.youtube.com, https://www.github.com, https://amazon.com',
+ llm=llm,
+ )
+ self.running = False
+
+ async def run_agent(self):
+ """Run the agent"""
+ self.running = True
+ await self.agent.run()
+
+ def start(self):
+ """Start the agent in a separate thread"""
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)
+ loop.run_until_complete(self.run_agent())
+
+ def pause(self):
+ """Pause the agent"""
+ self.agent.pause()
+
+ def resume(self):
+ """Resume the agent"""
+ self.agent.resume()
+
+ def stop(self):
+ """Stop the agent"""
+ self.agent.stop()
+ self.running = False
+
+
+def print_menu():
+ print('\nAgent Control Menu:')
+ print('1. Start')
+ print('2. Pause')
+ print('3. Resume')
+ print('4. Stop')
+ print('5. Exit')
+
+
+def main():
+ controller = AgentController()
+ agent_thread = None
+
+ while True:
+ print_menu()
+ choice = input('Enter your choice (1-5): ')
+
+ if choice == '1' and not agent_thread:
+ print('Starting agent...')
+ agent_thread = threading.Thread(target=controller.start)
+ agent_thread.start()
+
+ elif choice == '2':
+ print('Pausing agent...')
+ controller.pause()
+
+ elif choice == '3':
+ print('Resuming agent...')
+ controller.resume()
+
+ elif choice == '4':
+ print('Stopping agent...')
+ controller.stop()
+ if agent_thread:
+ agent_thread.join()
+ agent_thread = None
+
+ elif choice == '5':
+ print('Exiting...')
+ if controller.running:
+ controller.stop()
+ if agent_thread:
+ agent_thread.join()
+ break
+
+ time.sleep(0.1) # Small delay to prevent CPU spinning
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/features/planner.py b/examples/features/planner.py
new file mode 100644
index 0000000000000000000000000000000000000000..37f595375c4fed5bb5c82df4c91b65537ee7a8f6
--- /dev/null
+++ b/examples/features/planner.py
@@ -0,0 +1,23 @@
+import asyncio
+
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent
+
+
+llm = ChatOpenAI(model='gpt-4o', temperature=0.0)
+planner_llm = ChatOpenAI(
+ model='o3-mini',
+)
+task = 'your task'
+
+
+agent = Agent(task=task, llm=llm, planner_llm=planner_llm, use_vision_for_planner=False, planner_interval=1)
+
+
+async def main():
+ await agent.run()
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/features/restrict_urls.py b/examples/features/restrict_urls.py
new file mode 100644
index 0000000000000000000000000000000000000000..398c785596a4785cd9327060dd98557d0e6a528a
--- /dev/null
+++ b/examples/features/restrict_urls.py
@@ -0,0 +1,47 @@
+import os
+import sys
+
+from langchain_anthropic import ChatAnthropic
+from langchain_openai import ChatOpenAI
+
+from browser_use.browser.context import BrowserContextConfig
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import argparse
+import asyncio
+
+from browser_use import Agent
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.controller.service import Controller
+
+llm = ChatOpenAI(model='gpt-4o', temperature=0.0)
+task = (
+ 'go to google.com and search for openai.com and click on the first link then extract content and scroll down - whats there?'
+)
+
+allowed_domains = ['google.com']
+
+browser = Browser(
+ config=BrowserConfig(
+ chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
+ new_context_config=BrowserContextConfig(
+ allowed_domains=allowed_domains,
+ ),
+ ),
+)
+
+agent = Agent(
+ task=task,
+ llm=llm,
+ browser=browser,
+)
+
+
+async def main():
+ await agent.run(max_steps=25)
+
+ input('Press Enter to close the browser...')
+ await browser.close()
+
+
+asyncio.run(main())
diff --git a/examples/features/result_processing.py b/examples/features/result_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..53177f4eee06bc25f85b5244a2574fdeb537c128
--- /dev/null
+++ b/examples/features/result_processing.py
@@ -0,0 +1,63 @@
+import os
+import sys
+from pprint import pprint
+
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import (
+ BrowserContext,
+ BrowserContextConfig,
+ BrowserContextWindowSize,
+)
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import asyncio
+
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent
+from browser_use.agent.views import AgentHistoryList
+from browser_use.controller.service import Controller
+
+llm = ChatOpenAI(model='gpt-4o')
+browser = Browser(
+ config=BrowserConfig(
+ headless=False,
+ disable_security=True,
+ extra_chromium_args=['--window-size=2000,2000'],
+ )
+)
+
+
+async def main():
+ async with await browser.new_context(
+ config=BrowserContextConfig(
+ trace_path='./tmp/result_processing',
+ no_viewport=False,
+ browser_window_size=BrowserContextWindowSize(width=1280, height=1000),
+ )
+ ) as browser_context:
+ agent = Agent(
+ task="go to google.com and type 'OpenAI' click search and give me the first url",
+ llm=llm,
+ browser_context=browser_context,
+ )
+ history: AgentHistoryList = await agent.run(max_steps=3)
+
+ print('Final Result:')
+ pprint(history.final_result(), indent=4)
+
+ print('\nErrors:')
+ pprint(history.errors(), indent=4)
+
+ # e.g. xPaths the model clicked on
+ print('\nModel Outputs:')
+ pprint(history.model_actions(), indent=4)
+
+ print('\nThoughts:')
+ pprint(history.model_thoughts(), indent=4)
+ # close browser
+ await browser.close()
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/features/save_trace.py b/examples/features/save_trace.py
new file mode 100644
index 0000000000000000000000000000000000000000..f2b4bb677f257b4590a4766901c5eaa5f765fd30
--- /dev/null
+++ b/examples/features/save_trace.py
@@ -0,0 +1,32 @@
+import os
+import sys
+
+from langchain_openai import ChatOpenAI
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import asyncio
+
+from browser_use.agent.service import Agent
+from browser_use.browser.browser import Browser
+from browser_use.browser.context import BrowserContextConfig
+
+llm = ChatOpenAI(model='gpt-4o', temperature=0.0)
+
+
+async def main():
+ browser = Browser()
+
+ async with await browser.new_context(
+ config=BrowserContextConfig(trace_path='./tmp/traces/')
+ ) as context:
+ agent = Agent(
+ task='Go to hackernews, then go to apple.com and return all titles of open tabs',
+ llm=llm,
+ browser_context=context,
+ )
+ await agent.run()
+
+ await browser.close()
+
+
+asyncio.run(main())
diff --git a/examples/features/sensitive_data.py b/examples/features/sensitive_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..52a8a5bc4004f33cbb92f6176336273c89503aed
--- /dev/null
+++ b/examples/features/sensitive_data.py
@@ -0,0 +1,27 @@
+import asyncio
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent
+
+load_dotenv()
+
+# Initialize the model
+llm = ChatOpenAI(
+ model='gpt-4o',
+ temperature=0.0,
+)
+# the model will see x_name and x_password, but never the actual values.
+sensitive_data = {'x_name': 'my_x_name', 'x_password': 'my_x_password'}
+task = 'go to x.com and login with x_name and x_password then find interesting posts and like them'
+
+agent = Agent(task=task, llm=llm, sensitive_data=sensitive_data)
+
+
+async def main():
+ await agent.run()
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/features/small_model_for_extraction.py b/examples/features/small_model_for_extraction.py
new file mode 100644
index 0000000000000000000000000000000000000000..e86e0235bd90f54bb9dd672ad75ddb8844119c0e
--- /dev/null
+++ b/examples/features/small_model_for_extraction.py
@@ -0,0 +1,22 @@
+import asyncio
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent
+from browser_use.controller.service import Controller
+
+load_dotenv()
+
+llm = ChatOpenAI(model='gpt-4o', temperature=0.0)
+small_llm = ChatOpenAI(model='gpt-4o-mini', temperature=0.0)
+task = 'Find the founders of browser-use in ycombinator, extract all links and open the links one by one'
+agent = Agent(task=task, llm=llm, page_extraction_llm=small_llm)
+
+
+async def main():
+ await agent.run()
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/features/validate_output.py b/examples/features/validate_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..afe96083ed899b9df4d30373cb00684e6843b1f0
--- /dev/null
+++ b/examples/features/validate_output.py
@@ -0,0 +1,49 @@
+"""
+Demonstrate the output validator.
+
+@dev You need to add OPENAI_API_KEY to your environment variables.
+"""
+
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import asyncio
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel
+
+from browser_use import ActionResult, Agent, Controller
+
+load_dotenv()
+
+controller = Controller()
+
+
+class DoneResult(BaseModel):
+ title: str
+ comments: str
+ hours_since_start: int
+
+
+# we overwrite done() in this example to demonstrate the validator
+@controller.registry.action('Done with task', param_model=DoneResult)
+async def done(params: DoneResult):
+ result = ActionResult(is_done=True, extracted_content=params.model_dump_json())
+ print(result)
+ # NOTE: this is clearly wrong - to demonstrate the validator
+ return 'blablabla'
+
+
+async def main():
+ task = 'Go to hackernews hn and give me the top 1 post'
+ model = ChatOpenAI(model='gpt-4o')
+ agent = Agent(task=task, llm=model, controller=controller, validate_output=True)
+ # NOTE: this should fail to demonstrate the validator
+ await agent.run(max_steps=5)
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/integrations/discord/discord_api.py b/examples/integrations/discord/discord_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..32ee5c0b08b46852a56bbdd71beb470bb5ae6716
--- /dev/null
+++ b/examples/integrations/discord/discord_api.py
@@ -0,0 +1,123 @@
+import discord
+from discord.ext import commands
+from dotenv import load_dotenv
+from langchain_core.language_models.chat_models import BaseChatModel
+
+from browser_use import BrowserConfig
+from browser_use.agent.service import Agent, Browser
+
+load_dotenv()
+
+
+class DiscordBot(commands.Bot):
+ """Discord bot implementation for Browser-Use tasks.
+
+ This bot allows users to run browser automation tasks through Discord messages.
+ Processes tasks asynchronously and sends the result back to the user in response to the message.
+ Messages must start with the configured prefix (default: "$bu") followed by the task description.
+
+ Args:
+ llm (BaseChatModel): Language model instance to use for task processing
+ prefix (str, optional): Command prefix for triggering browser tasks. Defaults to "$bu"
+ ack (bool, optional): Whether to acknowledge task receipt with a message. Defaults to False
+ browser_config (BrowserConfig, optional): Browser configuration settings.
+ Defaults to headless mode
+
+ Usage:
+ ```python
+ from langchain_openai import ChatOpenAI
+
+ llm = ChatOpenAI()
+ bot = DiscordBot(llm=llm, prefix='$bu', ack=True)
+ bot.run('YOUR_DISCORD_TOKEN')
+ ```
+
+ Discord Usage:
+ Send messages starting with the prefix:
+ "$bu search for python tutorials"
+ """
+
+ def __init__(
+ self,
+ llm: BaseChatModel,
+ prefix: str = '$bu',
+ ack: bool = False,
+ browser_config: BrowserConfig = BrowserConfig(headless=True),
+ ):
+ self.llm = llm
+ self.prefix = prefix.strip()
+ self.ack = ack
+ self.browser_config = browser_config
+
+ # Define intents.
+ intents = discord.Intents.default()
+ intents.message_content = True # Enable message content intent
+ intents.members = True # Enable members intent for user info
+
+ # Initialize the bot with a command prefix and intents.
+ super().__init__(
+ command_prefix='!', intents=intents
+ ) # You may not need prefix, just here for flexibility
+
+ # self.tree = app_commands.CommandTree(self) # Initialize command tree for slash commands.
+
+ async def on_ready(self):
+ """Called when the bot is ready."""
+ try:
+ print(f'We have logged in as {self.user}')
+ cmds = await self.tree.sync() # Sync the command tree with discord
+
+ except Exception as e:
+ print(f'Error during bot startup: {e}')
+
+ async def on_message(self, message):
+ """Called when a message is received."""
+ try:
+ if message.author == self.user: # Ignore the bot's messages
+ return
+ if message.content.strip().startswith(f'{self.prefix} '):
+ if self.ack:
+ try:
+ await message.reply(
+ 'Starting browser use task...',
+ mention_author=True, # Mention the user in the acknowledgement reply
+ )
+ except Exception as e:
+ print(f'Error sending start message: {e}')
+
+ try:
+ agent_message = await self.run_agent(
+ message.content.replace(f'{self.prefix} ', '').strip()
+ )
+ await message.channel.send(
+ content=f'{agent_message}', reference=message, mention_author=True
+ )
+ except Exception as e:
+ await message.channel.send(
+ content=f'Error during task execution: {str(e)}',
+ reference=message,
+ mention_author=True,
+ )
+
+ except Exception as e:
+ print(f'Error in message handling: {e}')
+
+ # await self.process_commands(message) # Needed to process bot commands
+
+ async def run_agent(self, task: str) -> str:
+ try:
+ browser = Browser(config=self.browser_config)
+ agent = Agent(task=(task), llm=self.llm, browser=browser)
+ result = await agent.run()
+
+ agent_message = None
+ if result.is_done():
+ agent_message = result.history[-1].result[0].extracted_content
+
+ if agent_message is None:
+ agent_message = 'Oops! Something went wrong while running Browser-Use.'
+
+ return agent_message
+
+ except Exception as e:
+ raise Exception(f'Browser-use task failed: {str(e)}')
diff --git a/examples/integrations/discord/discord_example.py b/examples/integrations/discord/discord_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..259e68cc612e2f44369728868b6cc50dee440f5e
--- /dev/null
+++ b/examples/integrations/discord/discord_example.py
@@ -0,0 +1,68 @@
+"""
+This examples requires you to have a Discord bot token and the bot already added to a server.
+
+Steps to create and invite a Discord bot:
+
+1. Create a Discord Application:
+ * Go to the Discord Developer Portal: https://discord.com/developers/applications
+ * Log in to the Discord website.
+ * Click on "New Application".
+ * Give the application a name and click "Create".
+2. Configure the Bot:
+ * Navigate to the "Bot" tab on the left side of the screen.
+ * Make sure "Public Bot" is ticked if you want others to invite your bot.
+ * Generate your bot token by clicking on "Reset Token", Copy the token and save it securely.
+ * Do not share the bot token. Treat it like a password. If the token is leaked, regenerate it.
+3. Enable Privileged Intents:
+ * Scroll down to the "Privileged Gateway Intents" section.
+ * Enable the necessary intents (e.g., "Server Members Intent" and "Message Content Intent").
+ --> Note: Enabling privileged intents for bots in over 100 guilds requires bot verification. You may need to contact Discord support to enable privileged intents for verified bots.
+4. Generate Invite URL:
+ * Go to "OAuth2" tab and "OAuth2 URL Generator" section.
+ * Under "scopes", tick the "bot" checkbox.
+ * Tick the permissions required for your bot to function under "Bot Permissions".
+ * e.g. "Send Messages", "Send Messages in Threads", "Read Message History", "Mention Everyone".
+ * Copy the generated URL under the "GENERATED URL" section at the bottom.
+5. Invite the Bot:
+ * Paste the URL into your browser.
+ * Choose a server to invite the bot to.
+ * Click "Authorize".
+ --> Note: The person adding the bot needs "Manage Server" permissions.
+6. Run the code below to start the bot with your bot token.
+7. Write e.g. "$bu whats the weather in Tokyo?" to start a browser-use task and get a response inside the Discord channel.
+"""
+
+import os
+
+from dotenv import load_dotenv
+from langchain_google_genai import ChatGoogleGenerativeAI
+from pydantic import SecretStr
+
+from browser_use import BrowserConfig
+from examples.integrations.discord.discord_api import DiscordBot
+
+load_dotenv()
+
+# load credentials from environment variables
+bot_token = os.getenv('DISCORD_BOT_TOKEN')
+if not bot_token:
+ raise ValueError('Discord bot token not found in .env file.')
+
+api_key = os.getenv('GEMINI_API_KEY')
+if not api_key:
+ raise ValueError('GEMINI_API_KEY is not set')
+
+llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))
+
+bot = DiscordBot(
+ llm=llm, # required; instance of BaseChatModel
+ prefix='$bu', # optional; prefix of messages to trigger browser-use, defaults to "$bu"
+ ack=True, # optional; whether to acknowledge task receipt with a message, defaults to False
+ browser_config=BrowserConfig(
+ headless=False
+ ), # optional; useful for changing headless mode or other browser configs, defaults to headless mode
+)
+
+bot.run(
+ token=bot_token, # required; Discord bot token
+)
diff --git a/examples/integrations/slack/README.md b/examples/integrations/slack/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..af985930467a98afa3cd30484c51436952080ce3
--- /dev/null
+++ b/examples/integrations/slack/README.md
@@ -0,0 +1,76 @@
+# Slack Integration
+
+Steps to create and configure a Slack bot:
+
+1. Create a Slack App:
+ * Go to the Slack API: https://api.slack.com/apps
+ * Click on "Create New App".
+ * Choose "From scratch" and give your app a name and select the workspace.
+ * Provide a name and description for your bot (these are required fields).
+2. Configure the Bot:
+ * Navigate to the "OAuth & Permissions" tab on the left side of the screen.
+ * Under "Scopes", add the necessary bot token scopes (add these "chat:write", "channels:history", "im:history").
+3. Enable Event Subscriptions:
+ * Navigate to the "Event Subscriptions" tab.
+ * Enable events and add the necessary bot events (add these "message.channels", "message.im").
+ * Add your request URL (you can use ngrok to expose your local server if needed). [See how to set up ngrok](#installing-and-starting-ngrok).
+ * **Note:** The URL provided by ngrok is ephemeral and will change each time ngrok is started. You will need to update the request URL in the bot's settings each time you restart ngrok. [See how to update the request URL](#updating-the-request-url-in-bots-settings).
+4. Add the bot to your Slack workspace:
+ * Navigate to the "OAuth & Permissions" tab.
+ * Under "OAuth Tokens for Your Workspace", click on "Install App to Workspace".
+ * Follow the prompts to authorize the app and add it to your workspace.
+5. Set up environment variables:
+ * Obtain the `SLACK_SIGNING_SECRET`:
+ * Go to the Slack API: https://api.slack.com/apps
+ * Select your app.
+ * Navigate to the "Basic Information" tab.
+ * Copy the "Signing Secret".
+ * Obtain the `SLACK_BOT_TOKEN`:
+ * Go to the Slack API: https://api.slack.com/apps
+ * Select your app.
+ * Navigate to the "OAuth & Permissions" tab.
+ * Copy the "Bot User OAuth Token".
+ * Create a `.env` file in the root directory of your project and add the following lines:
+ ```env
+ SLACK_SIGNING_SECRET=your-signing-secret
+ SLACK_BOT_TOKEN=your-bot-token
+ ```
+6. Invite the bot to a channel:
+ * Use the `/invite @your-bot-name` command in the Slack channel where you want the bot to be active.
+7. Run the code in `examples/slack_example.py` to start the bot with your bot token and signing secret.
+8. Write e.g. "$bu whats the weather in Tokyo?" to start a browser-use task and get a response inside the Slack channel.
+
+## Installing and Starting ngrok
+
+To expose your local server to the internet, you can use ngrok. Follow these steps to install and start ngrok:
+
+1. Download ngrok from the official website: https://ngrok.com/download
+2. Create a free account and follow the official steps to install ngrok.
+3. Start ngrok by running the following command in your terminal:
+ ```sh
+ ngrok http 3000
+ ```
+ Replace `3000` with the port number your local server is running on.
+
+## Updating the Request URL in Bot's Settings
+
+If you need to update the request URL (e.g., when the ngrok URL changes), follow these steps:
+
+1. Go to the Slack API: https://api.slack.com/apps
+2. Select your app.
+3. Navigate to the "Event Subscriptions" tab.
+4. Update the "Request URL" field with the new ngrok URL. The URL should be something like: `https://<your-ngrok-id>.ngrok-free.app/slack/events`
+5. Save the changes.
+
+## Installing Required Packages
+
+To run this example, you need to install the following packages:
+
+- `fastapi`
+- `uvicorn`
+- `slack_sdk`
+
+You can install these packages using pip:
+
+```sh
+pip install fastapi uvicorn slack_sdk
\ No newline at end of file
diff --git a/examples/integrations/slack/slack_api.py b/examples/integrations/slack/slack_api.py
new file mode 100644
index 0000000000000000000000000000000000000000..3df930a890692bfafe27fdd869eb4b7d5a6649a4
--- /dev/null
+++ b/examples/integrations/slack/slack_api.py
@@ -0,0 +1,111 @@
+import logging
+from browser_use import BrowserConfig
+from fastapi import FastAPI, Request, HTTPException, Depends
+from dotenv import load_dotenv
+from slack_sdk.web.async_client import AsyncWebClient
+from slack_sdk.errors import SlackApiError
+from slack_sdk.signature import SignatureVerifier
+from browser_use.agent.service import Agent, Browser
+from langchain_core.language_models.chat_models import BaseChatModel
+from browser_use.logging_config import setup_logging
+
+load_dotenv()
+
+setup_logging()
+logger = logging.getLogger('slack')
+
+app = FastAPI()
+
+class SlackBot:
+ def __init__(self, llm: BaseChatModel, bot_token: str, signing_secret: str, ack: bool = False, browser_config: BrowserConfig = BrowserConfig(headless=True)):
+ if not bot_token or not signing_secret:
+ raise ValueError("Bot token and signing secret must be provided")
+
+ self.llm = llm
+ self.ack = ack
+ self.browser_config = browser_config
+ self.client = AsyncWebClient(token=bot_token)
+ self.signature_verifier = SignatureVerifier(signing_secret)
+ self.processed_events = set()
+ logger.info("SlackBot initialized")
+
+ async def handle_event(self, event, event_id):
+ try:
+ logger.info(f"Received event id: {event_id}")
+ if not event_id:
+ logger.warning("Event ID missing in event data")
+ return
+
+ if event_id in self.processed_events:
+ logger.info(f"Event {event_id} already processed")
+ return
+ self.processed_events.add(event_id)
+
+ if 'subtype' in event and event['subtype'] == 'bot_message':
+ return
+
+ text = event.get('text')
+ user_id = event.get('user')
+ if text and text.startswith('$bu '):
+ task = text[len('$bu '):].strip()
+ if self.ack:
+ try:
+ await self.send_message(event['channel'], f'<@{user_id}> Starting browser use task...', thread_ts=event.get('ts'))
+ except Exception as e:
+ logger.error(f"Error sending start message: {e}")
+
+ try:
+ agent_message = await self.run_agent(task)
+ await self.send_message(event['channel'], f'<@{user_id}> {agent_message}', thread_ts=event.get('ts'))
+ except Exception as e:
+ await self.send_message(event['channel'], f'Error during task execution: {str(e)}', thread_ts=event.get('ts'))
+ except Exception as e:
+ logger.error(f"Error in handle_event: {str(e)}")
+
+ async def run_agent(self, task: str) -> str:
+ try:
+ browser = Browser(config=self.browser_config)
+ agent = Agent(task=task, llm=self.llm, browser=browser)
+ result = await agent.run()
+
+ agent_message = None
+ if result.is_done():
+ agent_message = result.history[-1].result[0].extracted_content
+
+ if agent_message is None:
+ agent_message = 'Oops! Something went wrong while running Browser-Use.'
+
+ return agent_message
+
+ except Exception as e:
+ logger.error(f"Error during task execution: {str(e)}")
+ return f'Error during task execution: {str(e)}'
+
+ async def send_message(self, channel, text, thread_ts=None):
+ try:
+ await self.client.chat_postMessage(channel=channel, text=text, thread_ts=thread_ts)
+ except SlackApiError as e:
+ logger.error(f"Error sending message: {e.response['error']}")
+
+@app.post("/slack/events")
+async def slack_events(request: Request, slack_bot: SlackBot = Depends()):
+ try:
+ if not slack_bot.signature_verifier.is_valid_request(await request.body(), dict(request.headers)):
+ logger.warning("Request verification failed")
+ raise HTTPException(status_code=400, detail="Request verification failed")
+
+ event_data = await request.json()
+ logger.info(f"Received event data: {event_data}")
+ if 'challenge' in event_data:
+ return {"challenge": event_data['challenge']}
+
+ if 'event' in event_data:
+ try:
+ await slack_bot.handle_event(event_data.get('event'), event_data.get('event_id'))
+ except Exception as e:
+ logger.error(f"Error handling event: {str(e)}")
+
+ return {}
+ except Exception as e:
+ logger.error(f"Error in slack_events: {str(e)}")
+ raise HTTPException(status_code=500, detail="Internal Server Error")
\ No newline at end of file
diff --git a/examples/integrations/slack/slack_example.py b/examples/integrations/slack/slack_example.py
new file mode 100644
index 0000000000000000000000000000000000000000..80dfc184bac1610151d4a539876e52a7cb65cc29
--- /dev/null
+++ b/examples/integrations/slack/slack_example.py
@@ -0,0 +1,42 @@
+import os
+
+from dotenv import load_dotenv
+from langchain_google_genai import ChatGoogleGenerativeAI
+from pydantic import SecretStr
+
+from browser_use import BrowserConfig
+from examples.integrations.slack.slack_api import SlackBot, app
+
+load_dotenv()
+
+# load credentials from environment variables
+bot_token = os.getenv('SLACK_BOT_TOKEN')
+if not bot_token:
+ raise ValueError('Slack bot token not found in .env file.')
+
+signing_secret = os.getenv('SLACK_SIGNING_SECRET')
+if not signing_secret:
+ raise ValueError('Slack signing secret not found in .env file.')
+
+api_key = os.getenv('GEMINI_API_KEY')
+if not api_key:
+ raise ValueError('GEMINI_API_KEY is not set')
+
+llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))
+
+slack_bot = SlackBot(
+ llm=llm, # required; instance of BaseChatModel
+ bot_token=bot_token, # required; Slack bot token
+ signing_secret=signing_secret, # required; Slack signing secret
+ ack=True, # optional; whether to acknowledge task receipt with a message, defaults to False
+ browser_config=BrowserConfig(
+ headless=True
+ ), # optional; useful for changing headless mode or other browser configs, defaults to headless mode
+)
+
+app.dependency_overrides[SlackBot] = lambda: slack_bot
+
+if __name__ == '__main__':
+ import uvicorn
+
+ uvicorn.run('examples.integrations.slack.slack_api:app', host='0.0.0.0', port=3000)
diff --git a/examples/models/README.md b/examples/models/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5308e8a838c94f2ac0db7bf58504c4870d89e0fc
--- /dev/null
+++ b/examples/models/README.md
@@ -0,0 +1,2 @@
+# Gemini
+Detailed video on how to integrate browser-use with Gemini: https://www.youtube.com/watch?v=JluZiWBV_Tc
diff --git a/examples/models/azure_openai.py b/examples/models/azure_openai.py
new file mode 100644
index 0000000000000000000000000000000000000000..854195b6370cf72059280691c3148c8fe3e55f5e
--- /dev/null
+++ b/examples/models/azure_openai.py
@@ -0,0 +1,42 @@
+"""
+Simple try of the agent.
+
+@dev You need to add AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT to your environment variables.
+"""
+
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import asyncio
+
+from langchain_openai import AzureChatOpenAI
+
+from browser_use import Agent
+
+# Retrieve Azure-specific environment variables
+azure_openai_api_key = os.environ.get('AZURE_OPENAI_API_KEY')
+azure_openai_endpoint = os.environ.get('AZURE_OPENAI_ENDPOINT')
+
+# Initialize the Azure OpenAI client
+llm = AzureChatOpenAI(
+ model_name='gpt-4o',
+ openai_api_key=azure_openai_api_key,
+ azure_endpoint=azure_openai_endpoint, # Corrected to use azure_endpoint instead of openai_api_base
+ deployment_name='gpt-4o', # Use deployment_name for Azure models
+ api_version='2024-08-01-preview' # Explicitly set the API version here
+)
+
+agent = Agent(
+ task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result',
+ llm=llm,
+)
+
+
+async def main():
+ await agent.run(max_steps=10)
+ input('Press Enter to continue...')
+
+
+asyncio.run(main())
\ No newline at end of file
diff --git a/examples/models/bedrock_claude.py b/examples/models/bedrock_claude.py
new file mode 100644
index 0000000000000000000000000000000000000000..eaf4e204778952e0bc78b678576f04d41395dcfe
--- /dev/null
+++ b/examples/models/bedrock_claude.py
@@ -0,0 +1,59 @@
+"""
+Automated news analysis and sentiment scoring using Bedrock.
+
+@dev Ensure AWS environment variables are set correctly for Bedrock access.
+"""
+
+import os
+import sys
+
+from langchain_aws import ChatBedrock
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import argparse
+import asyncio
+
+from browser_use import Agent
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.controller.service import Controller
+
+
+def get_llm():
+ return ChatBedrock(
+ model_id="us.anthropic.claude-3-5-sonnet-20241022-v2:0",
+ temperature=0.0,
+ max_tokens=None,
+ )
+
+
+# Define the task for the agent
+task = (
+ "Visit cnn.com, navigate to the 'World News' section, and identify the latest headline. "
+ "Open the first article and summarize its content in 3-4 sentences. "
+ "Additionally, analyze the sentiment of the article (positive, neutral, or negative) "
+ "and provide a confidence score for the sentiment. Present the result in a tabular format."
+)
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--query', type=str, help='The query for the agent to execute', default=task)
+args = parser.parse_args()
+
+llm = get_llm()
+
+browser = Browser(
+ config=BrowserConfig(
+ # chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
+ )
+)
+
+agent = Agent(
+ task=args.query, llm=llm, controller=Controller(), browser=browser, validate_output=True,
+)
+
+
+async def main():
+ await agent.run(max_steps=30)
+ await browser.close()
+
+
+asyncio.run(main())
diff --git a/examples/models/claude-3.7-sonnet.py b/examples/models/claude-3.7-sonnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2b8f4f0bcb711f4391e60d185248e31a019e8e4
--- /dev/null
+++ b/examples/models/claude-3.7-sonnet.py
@@ -0,0 +1,32 @@
+"""
+Simple script that runs the task of opening amazon and searching.
+@dev Ensure we have a `ANTHROPIC_API_KEY` variable in our `.env` file.
+"""
+
+import os
+import sys
+from dotenv import load_dotenv
+from langchain_anthropic import ChatAnthropic
+
+# Load environment variables from .env file
+load_dotenv()
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import asyncio
+
+from browser_use import Agent
+
+llm = ChatAnthropic(model_name='claude-3-7-sonnet-20250219', temperature=0.0, timeout=30, stop=None)
+
+agent = Agent(
+ task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result',
+ llm=llm,
+)
+
+
+async def main():
+ await agent.run(max_steps=10)
+
+
+asyncio.run(main())
diff --git a/examples/models/deepseek-r1.py b/examples/models/deepseek-r1.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a8808a3894ef50c851988d1906f6fd84838a1fd
--- /dev/null
+++ b/examples/models/deepseek-r1.py
@@ -0,0 +1,35 @@
+import asyncio
+import os
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+from pydantic import SecretStr
+
+from browser_use import Agent
+
+# dotenv
+load_dotenv()
+
+api_key = os.getenv('DEEPSEEK_API_KEY', '')
+if not api_key:
+ raise ValueError('DEEPSEEK_API_KEY is not set')
+
+
+async def run_search():
+ agent = Agent(
+ task=('go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result'),
+ llm=ChatOpenAI(
+ base_url='https://api.deepseek.com/v1',
+ model='deepseek-reasoner',
+ api_key=SecretStr(api_key),
+ ),
+ use_vision=False,
+ max_failures=2,
+ max_actions_per_step=1,
+ )
+
+ await agent.run()
+
+
+if __name__ == '__main__':
+ asyncio.run(run_search())
diff --git a/examples/models/deepseek.py b/examples/models/deepseek.py
new file mode 100644
index 0000000000000000000000000000000000000000..8763228b0ef53b893b5bc0a902f377191fd7e7cb
--- /dev/null
+++ b/examples/models/deepseek.py
@@ -0,0 +1,38 @@
+import asyncio
+import os
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+from pydantic import SecretStr
+
+from browser_use import Agent
+
+# dotenv
+load_dotenv()
+
+api_key = os.getenv('DEEPSEEK_API_KEY', '')
+if not api_key:
+ raise ValueError('DEEPSEEK_API_KEY is not set')
+
+
+async def run_search():
+ agent = Agent(
+ task=(
+ '1. Go to https://www.reddit.com/r/LocalLLaMA '
+ "2. Search for 'browser use' in the search bar"
+ '3. Click on first result'
+ '4. Return the first comment'
+ ),
+ llm=ChatOpenAI(
+ base_url='https://api.deepseek.com/v1',
+ model='deepseek-chat',
+ api_key=SecretStr(api_key),
+ ),
+ use_vision=False,
+ )
+
+ await agent.run()
+
+
+if __name__ == '__main__':
+ asyncio.run(run_search())
diff --git a/examples/models/gemini.py b/examples/models/gemini.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b12f2ad7478a33c89cfb02bb6a35ef3cea741a3
--- /dev/null
+++ b/examples/models/gemini.py
@@ -0,0 +1,40 @@
+import asyncio
+import os
+
+from dotenv import load_dotenv
+from langchain_google_genai import ChatGoogleGenerativeAI
+from pydantic import SecretStr
+
+from browser_use import Agent, BrowserConfig
+from browser_use.browser.browser import Browser
+from browser_use.browser.context import BrowserContextConfig
+
+load_dotenv()
+api_key = os.getenv('GEMINI_API_KEY')
+if not api_key:
+ raise ValueError('GEMINI_API_KEY is not set')
+
+llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))
+
+browser = Browser(
+ config=BrowserConfig(
+ new_context_config=BrowserContextConfig(
+ viewport_expansion=0,
+ )
+ )
+)
+
+
+async def run_search():
+ agent = Agent(
+ task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result',
+ llm=llm,
+ max_actions_per_step=4,
+ browser=browser,
+ )
+
+ await agent.run(max_steps=25)
+
+
+if __name__ == '__main__':
+ asyncio.run(run_search())
diff --git a/examples/models/gpt-4o.py b/examples/models/gpt-4o.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc9bb2b03c16877fd153dc825a90c9d91247ab1a
--- /dev/null
+++ b/examples/models/gpt-4o.py
@@ -0,0 +1,30 @@
+"""
+Simple try of the agent.
+
+@dev You need to add OPENAI_API_KEY to your environment variables.
+"""
+
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import asyncio
+
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent
+
+llm = ChatOpenAI(model='gpt-4o')
+agent = Agent(
+ task='Go to amazon.com, search for laptop, sort by best rating, and give me the price of the first result',
+ llm=llm,
+)
+
+
+async def main():
+ await agent.run(max_steps=10)
+ input('Press Enter to continue...')
+
+
+asyncio.run(main())
diff --git a/examples/models/ollama.py b/examples/models/ollama.py
new file mode 100644
index 0000000000000000000000000000000000000000..a824cb405c14c2a0771302792afb2e377a7bad40
--- /dev/null
+++ b/examples/models/ollama.py
@@ -0,0 +1,34 @@
+# import os
+
+# Optional: Disable telemetry
+# os.environ["ANONYMIZED_TELEMETRY"] = "false"
+
+# Optional: Set the OLLAMA host to a remote server
+# os.environ["OLLAMA_HOST"] = "http://x.x.x.x:11434"
+
+import asyncio
+from browser_use import Agent
+from browser_use.agent.views import AgentHistoryList
+from langchain_ollama import ChatOllama
+
+
+async def run_search() -> AgentHistoryList:
+ agent = Agent(
+ task="Search for a 'browser use' post on the r/LocalLLaMA subreddit and open it.",
+ llm=ChatOllama(
+ model="qwen2.5:32b-instruct-q4_K_M",
+ num_ctx=32000,
+ ),
+ )
+
+ result = await agent.run()
+ return result
+
+
+async def main():
+ result = await run_search()
+ print("\n\n", result)
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/examples/models/qwen.py b/examples/models/qwen.py
new file mode 100644
index 0000000000000000000000000000000000000000..e31316fc86ffbfb998866710f0c54779ca5c916e
--- /dev/null
+++ b/examples/models/qwen.py
@@ -0,0 +1,30 @@
+import asyncio
+import os
+
+from langchain_ollama import ChatOllama
+
+from browser_use import Agent
+
+
+async def run_search():
+ agent = Agent(
+ task=(
+ '1. Go to https://www.reddit.com/r/LocalLLaMA'
+ "2. Search for 'browser use' in the search bar"
+ '3. Click search'
+ '4. Call done'
+ ),
+ llm=ChatOllama(
+ # model='qwen2.5:32b-instruct-q4_K_M',
+ # model='qwen2.5:14b',
+ model='qwen2.5:latest',
+ num_ctx=128000,
+ ),
+ max_actions_per_step=1,
+ )
+
+ await agent.run()
+
+
+if __name__ == '__main__':
+ asyncio.run(run_search())
diff --git a/examples/notebook/agent_browsing.ipynb b/examples/notebook/agent_browsing.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..2b5ae837de0c3bd77ec37f9136ab348055ea7e09
--- /dev/null
+++ b/examples/notebook/agent_browsing.ipynb
@@ -0,0 +1,760 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "ZRGlUb8O4fPV"
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+ "%pip install -U langgraph langchain_google_genai langchain_community langgraph-checkpoint-postgres openai langchain_groq"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "%%capture --no-stderr\n",
+ "%pip install --upgrade --quiet playwright > /dev/null\n",
+ "%pip install --upgrade --quiet lxml browser-use langchain_openai"
+ ],
+ "metadata": {
+ "id": "cMfPUmHIxqTi"
+ },
+ "execution_count": 3,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!playwright install"
+ ],
+ "metadata": {
+ "id": "kkZ7jVUOUV7Q"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install \"anyio<4\""
+ ],
+ "metadata": {
+ "id": "-_T1MhnGUl2q"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# This import is required only for jupyter notebooks, since they have their own eventloop\n",
+ "import nest_asyncio\n",
+ "\n",
+ "nest_asyncio.apply()"
+ ],
+ "metadata": {
+ "id": "yARYirp1UhDR"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from langchain_openai import ChatOpenAI\n",
+ "from google.colab import userdata\n",
+ "\n",
+ "\n",
+ "llm = ChatOpenAI(model=\"gpt-4o-mini\", temperature=0, api_key=userdata.get('Open_api_key'))\n",
+ "\n",
+ "\n",
+ "\n"
+ ],
+ "metadata": {
+ "id": "jyVP10O_5Qck"
+ },
+ "execution_count": 4,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "llm.invoke(\"hi\")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "e9duizdv5cOH",
+ "outputId": "a07b1702-d485-4641-c307-601e6ab34b9b"
+ },
+ "execution_count": 5,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "AIMessage(content='Hello! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 10, 'prompt_tokens': 8, 'total_tokens': 18, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_bd83329f63', 'finish_reason': 'stop', 'logprobs': None}, id='run-28a9088f-7539-412a-aa80-1663be40e74f-0', usage_metadata={'input_tokens': 8, 'output_tokens': 10, 'total_tokens': 18, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 5
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from browser_use import Agent, Browser\n",
+ "from browser_use import BrowserConfig\n",
+ "from langchain_openai import ChatOpenAI\n",
+ "import asyncio\n",
+ "\n",
+ "# Basic configuration for the browser\n",
+ "config = BrowserConfig(\n",
+ " headless=True, # Run in headless mode\n",
+ " # disable_security=True # Uncomment if you want to disable security\n",
+ ")\n",
+ "\n",
+ "# Initialize the browser with the specified configuration\n",
+ "browser = Browser(config=config)\n",
+ "\n",
+ "async def main():\n",
+ " # Initialize the agent with the task and language model\n",
+ " agent = Agent(\n",
+ " task=\"What is Langgraph\",\n",
+ " llm=llm, # Replace with your LLM configuration\n",
+ " browser=browser,\n",
+ " generate_gif=False # Disable GIF generation\n",
+ " )\n",
+ "\n",
+ " # Run the agent and get results asynchronously\n",
+ " result = await agent.run()\n",
+ "\n",
+ " # Process results token-wise\n",
+ " for action in result.action_results():\n",
+ " print(action.extracted_content,end=\"\\r\",flush=True)\n",
+ " print(\"\\n\\n\")\n",
+ " # if action.is_done:\n",
+ " # print(action.extracted_content)\n",
+ "\n",
+ " # Close the browser after completion\n",
+ " await browser.close()\n",
+ "\n",
+ "# Run the asynchronous main function\n",
+ "asyncio.run(main())\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "wS8ouhiVQ2dL",
+ "outputId": "653879a8-b3ac-4178-edee-5cd834e3404a"
+ },
+ "execution_count": 32,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "π Searched for \"What is Langgraph?\" in Google\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "π Extracted page as markdown\n",
+ ": \n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "## Controllable cognitive architecture for any task\n",
+ "\n",
+ "LangGraph's flexible framework supports diverse control flows β single agent,\n",
+ "multi-agent, hierarchical, sequential β and robustly handles realistic,\n",
+ "complex scenarios. \n",
+ " \n",
+ "Ensure reliability with easy-to-add moderation and quality loops that prevent\n",
+ "agents from veering off course. \n",
+ " \n",
+ "Use LangGraph Platform to templatize your cognitive architecture so that\n",
+ "tools, prompts, and models are easily configurable with LangGraph Platform\n",
+ "Assistants.\n",
+ "\n",
+ "[See the docs ](https://langchain-ai.github.io/langgraph/)\n",
+ "\n",
+ "## Designed for human-agent collaboration\n",
+ "\n",
+ "With built-in statefulness, LangGraph agents seamlessly collaborate with\n",
+ "humans by writing drafts for review and awaiting approval before acting.\n",
    "Easily inspect the agent's actions and \"time-travel\" to roll back and take a\n",
+ "different action to correct course.\n",
+ "\n",
+ "[Read a conceptual guide ](https://langchain-\n",
+ "ai.github.io/langgraph/concepts/agentic_concepts/#human-in-the-loop)\n",
+ "\n",
+ ".gif)\n",
+ "\n",
+ "\n",
+ "\n",
+ "## First class streaming support for better UX design\n",
+ "\n",
+ "Bridge user expectations and agent capabilities with native token-by-token\n",
+ "streaming and streaming of intermediate steps, helpful for showing agent\n",
+ "reasoning and actions back to the user as they happen. Use LangGraph\n",
+ "Platform's API to deliver dynamic and interactive user experiences.\n",
+ "\n",
+ "[Learn more ](https://langchain-ai.github.io/langgraph/how-tos/streaming-\n",
+ "tokens/)\n",
+ "\n",
+ "## Why choose LangGraph?\n",
+ "\n",
    "### Control, moderate, and guide your agent's actions.\n",
+ "\n",
+ "Prevent agents from veering off course and ensure reliability with easy-to-add\n",
+ "moderation and quality loops. Add human-in-the-loop to steer and approve agent\n",
+ "actions.\n",
+ "\n",
+ "### Expressive and customizable agent and multi-agent workflows.\n",
+ "\n",
+ "LangGraphβs low level abstractions offer the flexibility needed to create\n",
+ "sophisticated agents. Design diverse control flows β single, multi-agent,\n",
+ "hierarchical, sequential β all with one framework.\n",
+ "\n",
+ "### Persisted context for long-term interactions.\n",
+ "\n",
+ "With its stateful design, LangGraph stores conversation histories and session\n",
+ "data to maintain context over time and ensure smooth handoffs in agentic\n",
+ "systems.\n",
+ "\n",
+ "### First-class streaming support for better UX design.\n",
+ "\n",
+ "Bridge user expectations and agent capabilities with native token-by-token\n",
+ "streaming of intermediate steps, helpful for showing agent reasoning and\n",
+ "actions back to the user as they happen.\n",
+ "\n",
+ "## LangGraph Platform: \n",
+ "Deploy & develop agents at scale\n",
+ "\n",
+ "Craft agent-appropriate UXs using LangGraph Platform's APIs. Quickly deploy\n",
+ "and scale your agent with purpose-built infrastructure. Choose from multiple\n",
+ "deployment options.\n",
+ "\n",
+ "\n",
+ "\n",
+ "## Dynamic APIs for designing agent UXs.\n",
+ "\n",
+ "Craft personalized experiences with the long-term memory API to recall\n",
+ "information across conversation sessions. Expose, update, and rewind your\n",
+ "app's state for better user visibility, steering, and interaction. Kick off\n",
+ "long-running background jobs for research-style or multi-step work.\n",
+ "\n",
+ "[See the docs ](https://langchain-ai.github.io/langgraph/how-tos/streaming-\n",
+ "tokens/)\n",
+ "\n",
+ "\n",
+ "\n",
+ "## Fault-tolerant scalability.\n",
+ "\n",
+ "Handle large workloads gracefully with horizontally-scaling servers, task\n",
+ "queues, and built-in persistence. Enhance resilience with intelligent caching\n",
+ "and automated retries.\n",
+ "\n",
+ "[Learn more in the blog ](https://langchain-ai.github.io/langgraph/how-\n",
+ "tos/streaming-tokens/)\n",
+ "\n",
+ ".gif)\n",
+ "\n",
+ "## An end-to-end agent experience.\n",
+ "\n",
+ "Simplify prototyping, debugging, and sharing of agents in our visual LangGraph\n",
+ "Studio. Deploy your application with 1-click deploy with our SaaS offering or\n",
+ "within your own VPC. Then, monitor app performance with LangSmith.\n",
+ "\n",
+ "[Discover LangGraph Studio ](https://langchain-ai.github.io/langgraph/how-\n",
+ "tos/streaming-tokens/)\n",
+ "\n",
+ "\n",
+ "\n",
+ "## Introduction to LangGraph\n",
+ "\n",
+ "Learn the basics of LangGraph in this LangChain Academy Course. You'll learn\n",
+ "how to build agents that automate real-world tasks with LangGraph\n",
+ "orchestration.\n",
+ "\n",
+ "[Enroll for free](https://academy.langchain.com/courses/intro-to-\n",
+ "langgraph)[Book enterprise\n",
+ "training](https://airtable.com/appGjCAN6126Jm7K8/pagNAp7niHQzRH8zk/form)\n",
+ "\n",
+ "\n",
+ "\n",
+ "## Deploy agents at scale, monitor carefully, iterate boldly\n",
+ "\n",
+ "Design agent-driven user experiences with LangGraph Platform's APIs. Quickly\n",
+ "deploy and scale your application with infrastructure built for agents. Choose\n",
+ "from multiple deployment options.\n",
+ "\n",
+ "### Fault-tolerant scalability\n",
+ "\n",
+ "Handle large workloads gracefully with horizontally-scaling servers, task\n",
+ "queues, and built-in persistence. Enhance resilience with intelligent caching\n",
+ "and automated retries.\n",
+ "\n",
+ "### Dynamic APIs for designing agent experience\n",
+ "\n",
+ "Craft personalized user experiences with APIs featuring long-term memory to\n",
+ "recall information across conversation sessions. Track, update, and rewind\n",
+ "your app's state for easy human steering and interaction. Kick off long-\n",
+ "running background jobs for research-style or multi-step work.\n",
+ "\n",
+ "### Integrated developer experience\n",
+ "\n",
+ "Simplify prototyping, debugging, and sharing of agents in our visual LangGraph\n",
+ "Studio. Deploy your application with 1-click deploy with our SaaS offering or\n",
+ "within your own VPC. Then, monitor app performance with LangSmith.\n",
+ "\n",
+ "### Trusted by companies taking agency in AI innovation:\n",
+ "\n",
+ "LangGraph helps teams of all sizes, across all industries, from ambitious\n",
+ "startups to established enterprises.\n",
+ "\n",
+ "\n",
+ "\n",
+ "βLangChain is streets ahead with what they've put forward with LangGraph.\n",
+ "LangGraph sets the foundation for how we can build and scale AI workloads β\n",
+ "from conversational agents, complex task automation, to custom LLM-backed\n",
+ "experiences that 'just work'. The next chapter in building complex production-\n",
+ "ready features with LLMs is agentic, and with LangGraph and LangSmith,\n",
+ "LangChain delivers an out-of-the-box solution to iterate quickly, debug\n",
+ "immediately, and scale effortlessly.β\n",
+ "\n",
+ "\n",
+ "\n",
+ "Garrett Spong\n",
+ "\n",
+ "Principal SWE\n",
+ "\n",
+ "\n",
+ "\n",
+ "βLangGraph has been instrumental for our AI development. Its robust framework\n",
+ "for building stateful, multi-actor applications with LLMs has transformed how\n",
+ "we evaluate and optimize the performance of our AI guest-facing solutions.\n",
+ "LangGraph enables granular control over the agent's thought process, which has\n",
+ "empowered us to make data-driven and deliberate decisions to meet the diverse\n",
+ "needs of our guests.β\n",
+ "\n",
+ "\n",
+ "\n",
+ "Andres Torres\n",
+ "\n",
+ "Sr. Solutions Architect\n",
+ "\n",
+ "\n",
+ "\n",
+ "βIt's easy to build the prototype of a coding agent, but deceptively hard to\n",
+ "improve its reliability. Replit wants to give a coding agent to millions of\n",
+ "users β reliability is our top priority, and will remain so for a long time.\n",
+ "LangGraph is giving us the control and ergonomics we need to build and ship\n",
+ "powerful coding agents.β\n",
+ "\n",
+ "βAs Ally advances its exploration of Generative AI,\n",
+ "\n",
+ "%201.webp)\n",
+ "\n",
+ "Michele Catasta\n",
+ "\n",
+ "President\n",
+ "\n",
+ "\n",
+ "\n",
+ "βAs Ally advances its exploration of Generative AI, our tech labs is excited\n",
+ "by LangGraph, the new library from LangChain, which is central to our\n",
+ "experiments with multi-actor agentic workflows. We are committed to deepening\n",
+ "our partnership with LangChain.β\n",
+ "\n",
+ "βAs Ally advances its exploration of Generative AI,\n",
+ "\n",
+ "\n",
+ "\n",
+ "Sathish Muthukrishnan\n",
+ "\n",
+ "Chief Information, Data and Digital Officer\n",
+ "\n",
+ "\n",
+ "\n",
+ "βLangChain is streets ahead with what they've put forward with LangGraph.\n",
+ "LangGraph sets the foundation for how we can build and scale AI workloads β\n",
+ "from conversational agents, complex task automation, to custom LLM-backed\n",
+ "experiences that 'just work'. The next chapter in building complex production-\n",
+ "ready features with LLMs is agentic, and with LangGraph and LangSmith,\n",
+ "LangChain delivers an out-of-the-box solution to iterate quickly, debug\n",
+ "immediately, and scale effortlessly.β\n",
+ "\n",
+ "\n",
+ "\n",
+ "Garrett Spong\n",
+ "\n",
+ "Principal SWE\n",
+ "\n",
+ "\n",
+ "\n",
+ "βLangGraph has been instrumental for our AI development. Its robust framework\n",
+ "for building stateful, multi-actor applications with LLMs has transformed how\n",
+ "we evaluate and optimize the performance of our AI guest-facing solutions.\n",
+ "LangGraph enables granular control over the agent's thought process, which has\n",
+ "empowered us to make data-driven and deliberate decisions to meet the diverse\n",
+ "needs of our guests.β\n",
+ "\n",
+ "\n",
+ "\n",
+ "Andres Torres\n",
+ "\n",
+ "Sr. Solutions Architect\n",
+ "\n",
+ "\n",
+ "\n",
+ "βIt's easy to build the prototype of a coding agent, but deceptively hard to\n",
+ "improve its reliability. Replit wants to give a coding agent to millions of\n",
+ "users β reliability is our top priority, and will remain so for a long time.\n",
+ "LangGraph is giving us the control and ergonomics we need to build and ship\n",
+ "powerful coding agents.β\n",
+ "\n",
+ "βAs Ally advances its exploration of Generative AI,\n",
+ "\n",
+ "%201.webp)\n",
+ "\n",
+ "Michele Catasta\n",
+ "\n",
+ "President\n",
+ "\n",
+ "\n",
+ "\n",
+ "βAs Ally advances its exploration of Generative AI, our tech labs is excited\n",
+ "by LangGraph, the new library from LangChain, which is central to our\n",
+ "experiments with multi-actor agentic workflows. We are committed to deepening\n",
+ "our partnership with LangChain.β\n",
+ "\n",
+ "βAs Ally advances its exploration of Generative AI,\n",
+ "\n",
+ "\n",
+ "\n",
+ "Sathish Muthukrishnan\n",
+ "\n",
+ "Chief Information, Data and Digital Officer\n",
+ "\n",
+ "## LangGraph FAQs\n",
+ "\n",
    "Do I need to use LangChain to use LangGraph? What's the difference?\n",
+ "\n",
+ "No. LangGraph is an orchestration framework for complex agentic systems and is\n",
+ "more low-level and controllable than LangChain agents. LangChain provides a\n",
+ "standard interface to interact with models and other components, useful for\n",
+ "straight-forward chains and retrieval flows.\n",
+ "\n",
+ "How is LangGraph different from other agent frameworks?\n",
+ "\n",
+ "Other agentic frameworks can work for simple, generic tasks but fall short for\n",
+ "complex tasks bespoke to a companyβs needs. LangGraph provides a more\n",
+ "expressive framework to handle companiesβ unique tasks without restricting\n",
+ "users to a single black-box cognitive architecture.\n",
+ "\n",
+ "Does LangGraph impact the performance of my app?\n",
+ "\n",
+ "LangGraph will not add any overhead to your code and is specifically designed\n",
+ "with streaming workflows in mind.\n",
+ "\n",
+ "Is LangGraph open source? Is it free?\n",
+ "\n",
+ "Yes. LangGraph is an MIT-licensed open-source library and is free to use.\n",
+ "\n",
+ "How are LangGraph and LangGraph Platform different?\n",
+ "\n",
+ "LangGraph is a stateful, orchestration framework that brings added control to\n",
+ "agent workflows. LangGraph Platform is a service for deploying and scaling\n",
+ "LangGraph applications, with an opinionated API for building agent UXs, plus\n",
+ "an integrated developer studio.\n",
+ "\n",
+ "LangGraph (open source)\n",
+ "\n",
+ "LangGraph Platform\n",
+ "\n",
+ "Features\n",
+ "\n",
+ "Stateful orchestration framework for agentic applications\n",
+ "\n",
+ "Scalable infrastructure for deploying LangGraph applications \n",
+ "\n",
+ "Python and JavaScript\n",
+ "\n",
+ "Python and JavaScript \n",
+ "\n",
+ "None\n",
+ "\n",
+ "Yes - useful for retrieving & updating state or long-term memory, or creating\n",
+ "a configurable assistant \n",
+ "\n",
+ "Basic\n",
+ "\n",
+ "Dedicated mode for token-by-token messages \n",
+ "\n",
+ "Community contributed\n",
+ "\n",
+ "Supported out-of-the-box \n",
+ "\n",
+ "Self-managed\n",
+ "\n",
+ "Managed Postgres with efficient storage \n",
+ "\n",
+ "Self-managed\n",
+ "\n",
+ "\\- Cloud SaaS \n",
+ "\\- Free self-hosted \n",
+ "\\- Enterprise \n",
+ "(BYOC or paid self-hosted) \n",
+ "\n",
+ "Self-managed\n",
+ "\n",
+ "Auto-scaling of task queues and servers \n",
+ "\n",
+ "Self-managed\n",
+ "\n",
+ "Automated retries \n",
+ "\n",
+ "Simple threading\n",
+ "\n",
+ "Supports double-texting \n",
+ "\n",
+ "None\n",
+ "\n",
+ "Cron scheduling \n",
+ "\n",
+ "None\n",
+ "\n",
+ "Integrated with LangSmith for observability \n",
+ "\n",
+ "LangGraph Studio for Desktop\n",
+ "\n",
+ "LangGraph Studio for Desktop & Cloud \n",
+ "\n",
+ "What are my deployment options for LangGraph Platform?\n",
+ "\n",
+ "We currently have the following deployment options for LangGraph applications: \n",
+ " \n",
+ "β**Self-Hosted Lite** : A free (up to 1M nodes executed), limited version of\n",
+ "LangGraph Platform that you can run locally or in a self-hosted manner. This\n",
+ "version requires a LangSmith API key and logs all usage to LangSmith. Fewer\n",
+ "features are available than in paid plans. \n",
+ "β**Cloud SaaS:** Fully managed and hosted as part of LangSmith, with automatic\n",
+ "updates and zero maintenance. \n",
+ "β**Bring Your Own Cloud (BYOC):** Deploy LangGraph Platform within your VPC,\n",
+ "provisioned and run as a service. Keep data in your environment while\n",
+ "outsourcing the management of the service. \n",
+ "**Self-Hosted Enterprise:** Deploy LangGraph entirely on your own\n",
+ "infrastructure.\n",
+ "\n",
+ "Is LangGraph Platform open source?\n",
+ "\n",
+ "No. LangGraph Platform is proprietary software. \n",
+ " \n",
+ "There is a free, self-hosted version of LangGraph Platform with access to\n",
+ "basic features. The Cloud SaaS deployment option is free while in beta, but\n",
+ "will eventually be a paid service. We will always give ample notice before\n",
+ "charging for a service and reward our early adopters with preferential\n",
+ "pricing. The Bring Your Own Cloud (BYOC) and Self-Hosted Enterprise options\n",
+ "are also paid services. [Contact our sales team](/contact-sales) to learn\n",
+ "more. \n",
+ " \n",
+ "For more information, see our [LangGraph Platform pricing page](/pricing-\n",
+ "langgraph-platform).\n",
+ "\n",
+ "## Ready to start shipping reliable GenAI apps faster?\n",
+ "\n",
+ "Get started with LangChain, LangSmith, and LangGraph to enhance your LLM app\n",
+ "development, from prototype to production.\n",
+ "\n",
+ "[Contact Us](/contact-sales)[Sign Up](https://smith.langchain.com/)\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "LangGraph is a flexible framework designed for building and scaling agentic applications. It allows for complex task handling and human-agent collaboration, supporting various control flows such as single-agent, multi-agent, hierarchical, and sequential. Key features include:\n",
+ "\n",
+ "- **Statefulness**: LangGraph agents maintain context over time, enabling smooth interactions.\n",
+ "- **Streaming Support**: It provides native token-by-token streaming for better user experience.\n",
+ "- **Moderation and Quality Loops**: These features ensure agents remain reliable and on course.\n",
+ "- **Dynamic APIs**: LangGraph offers APIs for crafting personalized user experiences and managing long-term memory.\n",
+ "- **Deployment Options**: It supports various deployment methods, including self-hosted and cloud solutions.\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "from browser_use import Agent, Browser\n",
+ "from playwright.async_api import BrowserContext\n",
+ "from browser_use import BrowserConfig\n",
+ "from langchain_openai import ChatOpenAI\n",
+ "# from browser_use import Agent\n",
+ "import asyncio\n",
+ "# Basic configuration\n",
+ "config = BrowserConfig(\n",
+ " headless=True,\n",
+ "\n",
+ " # disable_security=True\n",
+ ")\n",
+ "# Reuse existing browser\n",
+ "browser = Browser(config=config)\n",
+ "# async def main():\n",
+ "agent = Agent(\n",
+ " task=\"what is langchain\",\n",
+ " llm=llm,\n",
+ " browser=browser,\n",
+ " generate_gif = False # Browser instance will be reused\n",
+ " )\n",
+ "\n",
+ "result = await agent.run()\n",
+ "print(result)\n",
+ "# Manually close the browser\n",
+ "# asyncio.run(main())\n",
+ "await browser.close()\n"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "TFK-fNoLDFcF",
+ "outputId": "d78fbeae-c8f0-4c26-e0e3-7a0a683d3fc1"
+ },
+ "execution_count": 11,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "AgentHistoryList(all_results=[ActionResult(is_done=False, extracted_content='π Searched for \"What is LangChain?\" in Google', error=None, include_in_memory=True), ActionResult(is_done=False, extracted_content=\"π Extracted page as markdown\\n: # Filters and Topics\\n\\n[All](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&source=lnms&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ0pQJegQIEhAB)\\n\\n[Images](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&udm=2&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQtKgLegQIExAB)\\n\\n[Videos](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&udm=7&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQtKgLegQIERAB)\\n\\n[Forums](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&udm=18&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQs6gLegQIDxAB)\\n\\nWeb\\n\\n[Flights](/travel/flights?sca_esv=4c6b8dc13bab3e46&output=search&q=What+is+LangChain%3F&source=lnms&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6qu
B-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&ved=1t:200715&ictx=111)\\n\\n[Finance](/finance?sca_esv=4c6b8dc13bab3e46&output=search&q=What+is+LangChain%3F&source=lnms&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ0pQJegQIDBAB)\\n\\nMore\\n\\n[Books](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&udm=36&source=lnms&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ0pQJegQINxAB)\\n\\n[News](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&tbm=nws&source=lnms&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ0pQJegQINhAB)\\n\\n[Shopping](/search?sca_esv=4c6b8dc13bab3e46&q=What+is+LangChain%3F&udm=28&fbs=AEQNm0Aa4sjWe7Rqy32pFwRj0UkWd8nbOJfsBGGB5IQQO6L3JyWp6w6_rxLPe8F8fpm5a55blYtaduielx1say4YCS0EIyvBb6VkaLhDZSOnSC94tp-\\nJuFEDkvqUl_u6quB-Is11hrT6R6Y6jGPIGI0MqGRIdRYfHHK4Fm5f9UNWxYphEnPjChpmH-\\nusjmkJN6Sk444PHRuqJvihdKgoqwGrUjYjqVvmxA&ved=1t:220175&ictx=111)\\n\\nTools\\n\\nAny time\\n\\nAny time\\n\\n[Past\\nhour](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=qdr:h&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEAc)\\n\\n[Past 
24\\nhours](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=qdr:d&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEAg)\\n\\n[Past\\nweek](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=qdr:w&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEAk)\\n\\n[Past\\nmonth](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=qdr:m&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEAo)\\n\\n[Past\\nyear](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=qdr:y&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEAs)\\n\\nCustom range...\\n\\nCustom date range\\n\\nFromTo\\n\\nGo\\n\\nAll results\\n\\nAll results\\n\\n[Verbatim](/search?q=What+is+LangChain%3F&sca_esv=4c6b8dc13bab3e46&udm=14&source=lnt&tbs=li:1&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQpwV6BAgGEBM)\\n\\n[ Advanced Search\\n](https://www.google.com/advanced_search?q=What+is+LangChain%3F&udm=14)\\n\\nCtrl+Shift+X to select\\n\\n\\n\\n# Search settings\\n\\n[Search CustomizationOff](/history/optout?hl=en)\\n\\n[SafeSearchBlurring\\non](/safesearch?prev=https://www.google.com/search?q%3DWhat%2Bis%2BLangChain?%26udm%3D14&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8JsIegQIChAH)\\n\\n[LanguageEnglish](/preferences?lang=1&hl=en&prev=https://www.google.com/search?q%3DWhat%2Bis%2BLangChain%253F%26sca_esv%3D4c6b8dc13bab3e46%26udm%3D14#languages)\\n\\n[Dark themeDevice\\ndefault](/setprefs?hl=en&prev=https://www.google.com/search?q%3DWhat%2Bis%2BLangChain?%26udm%3D14%26pccc%3D1&sig=0_jfSkJcafppJyKAIkCWZpHFXzfrs%3D&cs=2&sa=X&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQqsEHegQIChAJ&ictx=1)\\n\\n[More\\nsettings](/preferences?hl=en&prev=https://www.google.com/search?q%3DWhat%2Bis%2BLangChain%253F%26sca_esv%3D4c6b8dc13bab3e46%26udm%3D14)\\n\\nSend feedback\\n\\n[Help](https://support.google.com/websearch/?p=dsrp_search_hc&hl=en) β’\\n[Privacy](https://policies.google.com/privacy?hl=en&fg=1) 
β’\\n[Terms](https://policies.google.com/terms?hl=en&fg=1)\\n\\n# Search Results\\n\\n[ \\nLangChainLangChainhttps://www.langchain.com](https://www.langchain.com/)\\n\\nLangChain\\n\\nhttps://www.langchain.com\\n\\n _LangChain_ is a composable framework to build with LLMs. LangGraph is the\\norchestration framework for controllable agentic workflows. Run.\\n\\n\\u200e[Docs](https://python.langchain.com/docs/introduction/) Β·\\n\\u200e[Products](https://www.langchain.com/langchain) Β· \\u200e[LangChain\\nAcademy](https://academy.langchain.com/) Β· \\u200e[Join the LangChain\\nCommunity](https://www.langchain.com/join-community)\\n\\n[ \\nWhat is\\nLangChain?Amazon\\nWeb Serviceshttps://aws.amazon.com βΊ ... βΊ Generative\\nAI](https://aws.amazon.com/what-is/langchain/)\\n\\nAmazon Web Services\\n\\nhttps://aws.amazon.com βΊ ... βΊ Generative AI\\n\\nLangChain _provides AI developers with tools to connect language models with\\nexternal data sources_. It is open-source and supported by an active\\ncommunity.\\n\\n[ \\nWhat Is LangChain and How to Use It: A\\nGuideTechTargethttps://www.techtarget.com\\nβΊ definition βΊ\\nLangChain](https://www.techtarget.com/searchenterpriseai/definition/LangChain)\\n\\nTechTarget\\n\\nhttps://www.techtarget.com βΊ definition βΊ LangChain\\n\\n _LangChain is an open source framework_ that enables software developers\\nworking with artificial intelligence (AI) and its machine learning subset to\\ncombine ...\\n\\n[ \\nIntroduction | π¦οΈ LangChainLangChainhttps://python.langchain.com βΊ docs βΊ introduction](https://python.langchain.com/docs/introduction/)\\n\\nLangChain\\n\\nhttps://python.langchain.com βΊ docs βΊ introduction\\n\\n _LangChain_ is a framework for developing applications powered by large\\nlanguage models (LLMs). 
LangChain simplifies every stage of the LLM\\napplication lifecycle.\\n\\n\\u200e[Introduction](https://python.langchain.com/v0.1/docs/get_started/introduction/)\\nΒ·\\n\\u200e[Langchain.agents...](https://api.python.langchain.com/en/latest/agents/langchain.agents.tool_calling_agent.base.create_tool_calling_agent.html)\\nΒ· \\u200e[LangChain v0.3](https://python.langchain.com/docs/versions/v0_3/) Β·\\n\\u200e[Langchain_core.tools.](https://api.python.langchain.com/en/latest/tools/langchain_core.tools.tool.html)\\n\\n[ \\nWhat Is\\nLangChain?IBMhttps://www.ibm.com\\nβΊ think βΊ topics βΊ langchain](https://www.ibm.com/think/topics/langchain)\\n\\nIBM\\n\\nhttps://www.ibm.com βΊ think βΊ topics βΊ langchain\\n\\nLangChain is essentially _a library of abstractions for Python and Javascript_\\n, representing common steps and concepts necessary to work with language\\nmodels.\\n\\n[ \\nWhat is\\nLangChain?YouTube\\nΒ· IBM Technology287.6K+ views Β· 10 months\\nago](https://www.youtube.com/watch?v=1bUy-1hGZpI)\\n\\nYouTube Β· IBM Technology\\n\\n287.6K+ views Β· 10 months ago\\n\\nLang chain is _an open-source orchestration framework_ for the development of\\napplications that use large language models.\\n\\n[ \\nWhat is Langchain and why should I care as a\\ndeveloper?Medium\\nΒ· Logan Kilpatrick370+ likes Β· 1 year ago](https://medium.com/around-the-\\nprompt/what-is-langchain-and-why-should-i-care-as-a-developer-b2d952c42b28)\\n\\nMedium Β· Logan Kilpatrick\\n\\n370+ likes Β· 1 year ago\\n\\n _Langchain_ makes creating agents using large language models simple through\\ntheir agents API. 
Developers can use OpenAI functions or other means ...\\n\\n[ \\nLangChainWikipediahttps://en.wikipedia.org\\nβΊ wiki βΊ LangChain](https://en.wikipedia.org/wiki/LangChain)\\n\\nWikipedia\\n\\nhttps://en.wikipedia.org βΊ wiki βΊ LangChain\\n\\nLangChain is a software framework that helps facilitate the integration of\\nlarge language models (LLMs) into applications.\\n\\n\\u200e[History](https://en.wikipedia.org/wiki/LangChain#History) Β·\\n\\u200e[Capabilities](https://en.wikipedia.org/wiki/LangChain#Capabilities) Β·\\n\\u200e[LangChain tools](https://en.wikipedia.org/wiki/LangChain#LangChain_tools)\\n\\n[ \\nWhat Is LangChain? A Complete Comprehensive\\nOverviewDataStaxhttps://www.datastax.com\\nβΊ guides βΊ what-is-langchain](https://www.datastax.com/guides/what-is-\\nlangchain)\\n\\nDataStax\\n\\nhttps://www.datastax.com βΊ guides βΊ what-is-langchain\\n\\nNov 9, 2023 β LangChain is _a Python framework designed to streamline AI\\napplication development_ , focusing on real-time data processing and\\nintegration with ...\\n\\n[ \\nWhat Is\\nLangChain?Google\\nCloudhttps://cloud.google.com βΊ use-cases βΊ\\nlangchain](https://cloud.google.com/use-cases/langchain)\\n\\nGoogle Cloud\\n\\nhttps://cloud.google.com βΊ use-cases βΊ langchain\\n\\n _LangChain_ is a programming language platform that lets developers construct\\nand connect models to access, transform, and share data seamlessly.\\n\\n\\u200e[Langchain And Ai](https://cloud.google.com/use-\\ncases/langchain#:~:text=LangChain%20and%20AI) Β· \\u200e[How Does Langchain\\nWork?](https://cloud.google.com/use-\\ncases/langchain#:~:text=How%20does%20LangChain%20work%3F) Β· \\u200e[Key Features Of\\nLangchain](https://cloud.google.com/use-\\ncases/langchain#:~:text=Key%20features%20of%20LangChain)\\n\\n# Page Navigation\\n\\n| 
1|\\n[2](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=10&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAE)|\\n[3](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=20&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAG)|\\n[4](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=30&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAI)|\\n[5](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=40&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAK)|\\n[6](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=50&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAM)|\\n[7](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=60&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAO)|\\n[8](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=70&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAQ)|\\n[9](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP
6LmQGQ&start=80&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAS)|\\n[10](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=90&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8tMDegQICBAU)|\\n[Next](/search?q=What+is+LangChain?&sca_esv=4c6b8dc13bab3e46&udm=14&ei=e8iJZ425Mabg0PEP6LmQGQ&start=10&sa=N&sstk=ATObxK4t7c6xZe8J3zQzlUfrNV-\\nBchujCI0GxH83wgy_vu9jEqYrHuTxd0wVBzubCa-bn_k1uK_Zn1BBIfr2yh6eyUzMdvUxFJ-\\nmCw&ved=2ahUKEwjN4oy74vuKAxUmMDQIHegcJAMQ8NMDegQICBAW) \\n---|---|---|---|---|---|---|---|---|---|---|--- \\n \\n# Footer Links\\n\\nWasco County, Oregon \\\\- From your IP address\\n\\n\\\\-\\n\\nUpdate location\\n\\nCan't update your locationLearn more\\n\\nUpdating location...\\n\\n[Help](https://support.google.com/websearch/?p=ws_results_help&hl=en&fg=1)Send\\nfeedback[Privacy](https://policies.google.com/privacy?hl=en&fg=1)[Terms](https://policies.google.com/terms?hl=en&fg=1)\\n\\n\\n\", error=None, include_in_memory=False), ActionResult(is_done=True, extracted_content='LangChain is a composable framework designed for building applications with large language models (LLMs). It simplifies the integration of language models with external data sources and is open-source, supported by an active community. LangChain provides tools for developers to streamline the application lifecycle of LLMs.', error=None, include_in_memory=False)], all_model_outputs=[{'search_google': {'query': 'What is LangChain?'}}, {'extract_content': {'include_links': True}}, {'done': {'text': 'LangChain is a composable framework designed for building applications with large language models (LLMs). 
It simplifies the integration of language models with external data sources and is open-source, supported by an active community. LangChain provides tools for developers to streamline the application lifecycle of LLMs.'}}])\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# display(result.action_results())\n",
+ "for action in result.action_results():\n",
+ " if action.is_done:\n",
+ " print(action.extracted_content)\n",
+ ""
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "nKGC936xODry",
+ "outputId": "de70d715-c30a-4d5b-9d25-40bd79d410de"
+ },
+ "execution_count": 27,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "LangChain is a composable framework designed for building applications with large language models (LLMs). It simplifies the integration of language models with external data sources and is open-source, supported by an active community. LangChain provides tools for developers to streamline the application lifecycle of LLMs.\n"
+ ]
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/examples/simple.py b/examples/simple.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d586b8bfe007c042dfc9270572da175abd432ec
--- /dev/null
+++ b/examples/simple.py
@@ -0,0 +1,25 @@
+import asyncio
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent
+
+load_dotenv()
+
# Initialize the model
llm = ChatOpenAI(
	model='gpt-4o',
	temperature=0.0,  # deterministic output suits a repeatable demo
)
task = 'Find the founders of browser-use and draft them a short personalized message'

agent = Agent(task=task, llm=llm)


async def main():
	"""Run the agent until it completes (or abandons) the task."""
	await agent.run()


if __name__ == '__main__':
	asyncio.run(main())
diff --git a/examples/ui/README.md b/examples/ui/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6e1bcb0349e8e1ff0cfc6b0436c84ea7f4b45917
--- /dev/null
+++ b/examples/ui/README.md
@@ -0,0 +1,7 @@
+# **User Interfaces of Browser-Use**
+
+| **File Name** | **User Interface** | **Description** | **Example Usage** |
+|------------------------|-------------------|-------------------------------------------|-------------------------------------------|
+| `command_line.py` | **Terminal** | Parses arguments for command-line execution. | `python command_line.py` |
+| `gradio_demo.py` | **Gradio** | Provides a Gradio-based interactive UI. | `python gradio_demo.py` |
+| `streamlit_demo.py` | **Streamlit** | Runs a Streamlit-based web interface. | `python -m streamlit run streamlit_demo.py` |
\ No newline at end of file
diff --git a/examples/ui/command_line.py b/examples/ui/command_line.py
new file mode 100644
index 0000000000000000000000000000000000000000..715bb1f09dc66ae61f8dc5eb2c4956d4e60bb8d4
--- /dev/null
+++ b/examples/ui/command_line.py
@@ -0,0 +1,97 @@
+"""
+To Use It:
+
+Example 1: Using OpenAI (default), with default task: 'go to reddit and search for posts about browser-use'
+python command_line.py
+
+Example 2: Using OpenAI with a Custom Query
+python command_line.py --query "go to google and search for browser-use"
+
+Example 3: Using Anthropic's Claude Model with a Custom Query
+python command_line.py --query "find latest Python tutorials on Medium" --provider anthropic
+
+"""
+import os
+import sys
+import argparse
+import asyncio
+
+# Ensure local repository (browser_use) is accessible
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from dotenv import load_dotenv
+
+from browser_use import Agent
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.controller.service import Controller
+
+
+load_dotenv()
+
def get_llm(provider: str):
	"""Return a chat model for the requested provider.

	Provider SDKs are imported lazily so only the selected backend needs
	to be installed.

	Args:
		provider: 'openai' or 'anthropic'.

	Raises:
		ValueError: If the provider's API key is missing or the provider
			name is not supported.
	"""
	if provider == 'anthropic':
		from langchain_anthropic import ChatAnthropic

		if not os.getenv('ANTHROPIC_API_KEY'):
			raise ValueError("Error: ANTHROPIC_API_KEY is not set. Please provide a valid API key.")
		return ChatAnthropic(
			model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None, temperature=0.0
		)

	if provider == 'openai':
		from langchain_openai import ChatOpenAI

		if not os.getenv('OPENAI_API_KEY'):
			raise ValueError("Error: OPENAI_API_KEY is not set. Please provide a valid API key.")
		return ChatOpenAI(model='gpt-4o', temperature=0.0)

	raise ValueError(f'Unsupported provider: {provider}')
+
def parse_arguments():
	"""Parse command-line arguments.

	Returns:
		argparse.Namespace with `query` (the task text) and `provider`
		('openai' or 'anthropic', defaulting to 'openai').
	"""
	cli = argparse.ArgumentParser(description="Automate browser tasks using an LLM agent.")
	cli.add_argument(
		'--query',
		type=str,
		default='go to reddit and search for posts about browser-use',
		help='The query to process',
	)
	cli.add_argument(
		'--provider',
		type=str,
		default='openai',
		choices=['openai', 'anthropic'],
		help='The model provider to use (default: openai)',
	)
	return cli.parse_args()
+
def initialize_agent(query: str, provider: str):
	"""Initialize the browser agent with the given query and provider.

	Returns:
		(agent, browser): the configured Agent plus the Browser instance,
		so the caller can close the browser explicitly after the run.
	"""
	llm = get_llm(provider)
	controller = Controller()
	browser = Browser(config=BrowserConfig())

	return Agent(
		task=query,
		llm=llm,
		controller=controller,
		browser=browser,
		use_vision=True,  # let the model see page screenshots
		max_actions_per_step=1,
	), browser
+
async def main():
	"""Main async function to run the agent."""
	args = parse_arguments()
	agent, browser = initialize_agent(args.query, args.provider)

	await agent.run(max_steps=25)

	# Keep the window open for inspection; input() blocks the event loop,
	# which is acceptable here because the agent has already finished.
	input('Press Enter to close the browser...')
	await browser.close()


if __name__ == "__main__":
	asyncio.run(main())
\ No newline at end of file
diff --git a/examples/ui/gradio_demo.py b/examples/ui/gradio_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..b67a88da160f3659c505dcd2a67e4923bfbfbdf7
--- /dev/null
+++ b/examples/ui/gradio_demo.py
@@ -0,0 +1,108 @@
+import os
+import asyncio
+from dataclasses import dataclass
+from typing import List, Optional
+
+# Third-party imports
+import gradio as gr
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+from rich.console import Console
+from rich.panel import Panel
+from rich.text import Text
+
+# Local module imports
+from browser_use import Agent
+
+load_dotenv()
+
+
@dataclass
class ActionResult:
	"""Local mirror of browser_use's ActionResult shape, used when parsing
	a stringified agent history for display."""

	is_done: bool
	extracted_content: Optional[str]
	error: Optional[str]
	include_in_memory: bool
+
+
@dataclass
class AgentHistoryList:
	"""Local mirror of browser_use's history container: per-action results
	plus the raw model outputs."""

	all_results: List[ActionResult]
	all_model_outputs: List[dict]
+
+
def parse_agent_history(history_str: str) -> None:
	"""Pretty-print the extracted_content of each step in a stringified history.

	NOTE(review): this parses the repr format of ActionResult with naive
	string splitting — content containing a comma or quote will be
	truncated. Best-effort display only.
	"""
	console = Console()

	# Split the content into sections based on ActionResult entries
	sections = history_str.split('ActionResult(')

	for i, section in enumerate(sections[1:], 1):  # Skip first empty section
		# Extract relevant information
		content = ''
		if 'extracted_content=' in section:
			# Take the text up to the first comma after the field name.
			content = section.split('extracted_content=')[1].split(',')[0].strip("'")

		if content:
			header = Text(f'Step {i}', style='bold blue')
			panel = Panel(content, title=header, border_style='blue')
			console.print(panel)
			console.print()
+
+
async def run_browser_task(
	task: str,
	api_key: str,
	model: str = 'gpt-4o',
	headless: bool = True,
) -> str:
	"""Run a browser-use agent for `task` and return its result.

	Args:
		task: Natural-language description of what the agent should do.
		api_key: OpenAI API key; exported to the environment for the SDK.
		model: Chat model name selected in the UI.
		headless: Selected in the UI but not yet wired into the browser
			configuration.  # TODO: honor this flag via BrowserConfig.

	Returns:
		The agent's run result, or a validation/error message string.
	"""
	if not api_key.strip():
		return 'Please provide an API key'

	os.environ['OPENAI_API_KEY'] = api_key

	try:
		agent = Agent(
			task=task,
			# Bug fix: honor the model chosen in the UI instead of a
			# hard-coded 'gpt-4o'.
			llm=ChatOpenAI(model=model),
		)
		result = await agent.run()
		# TODO: The result could be parsed better
		return result
	except Exception as e:
		return f'Error: {str(e)}'
+
+
def create_ui():
	"""Build the Gradio interface for submitting browser-automation tasks."""
	with gr.Blocks(title='Browser Use GUI') as interface:
		gr.Markdown('# Browser Use Task Automation')

		with gr.Row():
			with gr.Column():
				api_key = gr.Textbox(label='OpenAI API Key', placeholder='sk-...', type='password')
				task = gr.Textbox(
					label='Task Description',
					placeholder='E.g., Find flights from New York to London for next week',
					lines=3,
				)
				model = gr.Dropdown(
					choices=['gpt-4', 'gpt-3.5-turbo'], label='Model', value='gpt-4'
				)
				headless = gr.Checkbox(label='Run Headless', value=True)
				submit_btn = gr.Button('Run Task')

			with gr.Column():
				output = gr.Textbox(label='Output', lines=10, interactive=False)

		# Gradio callbacks are sync here, so bridge into asyncio per click.
		# Input order must match run_browser_task(task, api_key, model, headless).
		submit_btn.click(
			fn=lambda *args: asyncio.run(run_browser_task(*args)),
			inputs=[task, api_key, model, headless],
			outputs=output,
		)

	return interface
+
+
if __name__ == '__main__':
	# Launch the Gradio app locally (blocks until the server stops).
	demo = create_ui()
	demo.launch()
\ No newline at end of file
diff --git a/examples/ui/streamlit_demo.py b/examples/ui/streamlit_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..9948f2fd5e09b4bfb253d94766eed08ed2ca3eae
--- /dev/null
+++ b/examples/ui/streamlit_demo.py
@@ -0,0 +1,80 @@
+"""
+To use it, you'll need to install streamlit, and run with:
+
+python -m streamlit run streamlit_demo.py
+
+"""
+
+import os
+import sys
+import asyncio
+import streamlit as st
+from dotenv import load_dotenv
+
+# Ensure local repository (browser_use) is accessible
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from browser_use import Agent
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.controller.service import Controller
+
+# Load environment variables
+load_dotenv()
+
# Function to get the LLM based on provider
def get_llm(provider: str):
	"""Return a chat model for `provider`, halting the Streamlit app on
	missing keys or an unknown provider."""
	if provider == 'anthropic':
		from langchain_anthropic import ChatAnthropic

		if not os.getenv('ANTHROPIC_API_KEY'):
			st.error("Error: ANTHROPIC_API_KEY is not set. Please provide a valid API key.")
			st.stop()
		return ChatAnthropic(
			model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None, temperature=0.0
		)

	if provider == 'openai':
		from langchain_openai import ChatOpenAI

		if not os.getenv('OPENAI_API_KEY'):
			st.error("Error: OPENAI_API_KEY is not set. Please provide a valid API key.")
			st.stop()
		return ChatOpenAI(model='gpt-4o', temperature=0.0)

	st.error(f'Unsupported provider: {provider}')
	st.stop()
+
# Function to initialize the agent
def initialize_agent(query: str, provider: str):
	"""Return (agent, browser) configured for `query` using `provider`'s LLM;
	the browser is returned so the caller can close it later."""
	llm = get_llm(provider)
	controller = Controller()
	browser = Browser(config=BrowserConfig())

	return Agent(
		task=query,
		llm=llm,
		controller=controller,
		browser=browser,
		use_vision=True,  # let the model see page screenshots
		max_actions_per_step=1,
	), browser
+
# Streamlit UI
st.title("Automated Browser Agent with LLMs 🤖")

query = st.text_input("Enter your query:", "go to reddit and search for posts about browser-use")
provider = st.radio("Select LLM Provider:", ["openai", "anthropic"], index=0)

if st.button("Run Agent"):
	st.write("Initializing agent...")
	agent, browser = initialize_agent(query, provider)

	async def run_agent():
		# Show a spinner while the (blocking) automation runs.
		with st.spinner("Running automation..."):
			await agent.run(max_steps=25)
		st.success("Task completed! 🎉")

	asyncio.run(run_agent())

	# NOTE(review): Streamlit reruns the whole script on every interaction,
	# so this nested button (and the `browser` it captures) only exists on
	# the rerun triggered by "Run Agent" — confirm the close actually fires.
	st.button("Close Browser", on_click=lambda: asyncio.run(browser.close()))
diff --git a/examples/use-cases/README.md b/examples/use-cases/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..90ec4c4f43d5d8c1adc66da2923d40200c0649ee
--- /dev/null
+++ b/examples/use-cases/README.md
@@ -0,0 +1,15 @@
+# Use Cases of Browser-Use
+
+| File Name | Description |
+|-----------|------------|
+| `captcha.py` | Automates CAPTCHA solving on a demo website. |
+| `check_appointment.py` | Checks for available visa appointment slots on the Greece MFA website. |
+| `find_and_apply_to_jobs.py` | Searches for job listings, evaluates relevance based on a CV, and applies automatically. |
+| `find_influencer_profiles.py` | Extracts a TikTok username from a video URL and searches the web for the creator's other social media profiles. |
+| `online_coding_agent.py` | Implements a multi-agent system for online code editors, with separate agents for coding and execution. |
+| `post-twitter.py` | Provides a template for automated posting on X (Twitter), including new tweets, tagging, and replies. |
+| `scrolling_page.py` | Automates webpage scrolling with various scrolling actions and text search functionality. |
+| `shopping.py` | Automates an online grocery order on Migros Online, from product search through checkout. |
+| `twitter_post_using_cookies.py` | Automates posting on X (Twitter) using stored authentication cookies. |
+
+
+
diff --git a/examples/use-cases/captcha.py b/examples/use-cases/captcha.py
new file mode 100644
index 0000000000000000000000000000000000000000..784eb04aec79d4f5c50a7ee5ffedb96ea3888043
--- /dev/null
+++ b/examples/use-cases/captcha.py
@@ -0,0 +1,36 @@
+"""
+Goal: Automates CAPTCHA solving on a demo website.
+
+
+Simple try of the agent.
+@dev You need to add OPENAI_API_KEY to your environment variables.
+NOTE: captchas are hard. For this example it works. But e.g. for iframes it does not.
+for this example it helps to zoom in.
+"""
+
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import asyncio
+from langchain_openai import ChatOpenAI
+from browser_use import Agent
+from dotenv import load_dotenv
+
# Load environment variables
load_dotenv()
if not os.getenv('OPENAI_API_KEY'):
	raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.')

async def main():
	"""Drive the agent through a CAPTCHA demo page, then wait before exit."""
	llm = ChatOpenAI(model='gpt-4o')
	agent = Agent(
		task='go to https://captcha.com/demos/features/captcha-demo.aspx and solve the captcha',
		llm=llm,
	)
	await agent.run()
	# Keep the browser open so the solved captcha can be inspected manually.
	input('Press Enter to exit')

if __name__ == "__main__":
	asyncio.run(main())
\ No newline at end of file
diff --git a/examples/use-cases/check_appointment.py b/examples/use-cases/check_appointment.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e010f01496c390e21eb59089b98e840c45c201c
--- /dev/null
+++ b/examples/use-cases/check_appointment.py
@@ -0,0 +1,47 @@
+# Goal: Checks for available visa appointment slots on the Greece MFA website.
+
+import asyncio
+import os
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel, SecretStr
+
+from browser_use.agent.service import Agent
+from browser_use.controller.service import Controller
+
+# Load environment variables
+load_dotenv()
+if not os.getenv('OPENAI_API_KEY'):
+ raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.')
+
+controller = Controller()
+
+
class WebpageInfo(BaseModel):
	"""Model for webpage link."""
	# Default target: Greece MFA appointment page (Dublin consulate).
	link: str = 'https://appointment.mfa.gr/en/reservations/aero/ireland-grcon-dub/'
+
+
@controller.action('Go to the webpage', param_model=WebpageInfo)
def go_to_webpage(webpage_info: WebpageInfo):
	"""Returns the webpage link for the agent to navigate to."""
	return webpage_info.link
+
+
async def main():
	"""Main function to execute the agent task."""
	# Bug fix: adjacent string literals concatenate with no separator, so
	# the original prompt read "...provided you.Check the visa..." — keep
	# an explicit trailing space on each fragment.
	task = (
		'Go to the Greece MFA webpage via the link I provided you. '
		'Check the visa appointment dates. If there is no available date in this month, check the next month. '
		'If there is no available date in both months, tell me there is no available date.'
	)

	model = ChatOpenAI(model='gpt-4o-mini', api_key=SecretStr(os.getenv('OPENAI_API_KEY', '')))
	agent = Agent(task, model, controller=controller, use_vision=True)

	await agent.run()


if __name__ == "__main__":
	asyncio.run(main())
diff --git a/examples/use-cases/find_and_apply_to_jobs.py b/examples/use-cases/find_and_apply_to_jobs.py
new file mode 100644
index 0000000000000000000000000000000000000000..daf65897bdd25af7e463aa5a51dd636435001adb
--- /dev/null
+++ b/examples/use-cases/find_and_apply_to_jobs.py
@@ -0,0 +1,160 @@
+"""
+Goal: Searches for job listings, evaluates relevance based on a CV, and applies
+
+@dev You need to add OPENAI_API_KEY to your environment variables.
+Also you have to install PyPDF2 to read pdf files: pip install PyPDF2
+"""
+
+import csv
+import os
+import sys
+from pathlib import Path
+import logging
+from typing import List, Optional
+import asyncio
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from dotenv import load_dotenv
+from PyPDF2 import PdfReader
+from langchain_openai import AzureChatOpenAI, ChatOpenAI
+from pydantic import BaseModel, SecretStr
+
+from browser_use import ActionResult, Agent, Controller
+from browser_use.browser.context import BrowserContext
+from browser_use.browser.browser import Browser, BrowserConfig
+
+# Validate required environment variables
+load_dotenv()
+required_env_vars = ["AZURE_OPENAI_KEY", "AZURE_OPENAI_ENDPOINT"]
+for var in required_env_vars:
+ if not os.getenv(var):
+ raise ValueError(f"{var} is not set. Please add it to your environment variables.")
+
+logger = logging.getLogger(__name__)
+# full screen mode
+controller = Controller()
+
+# NOTE: This is the path to your cv file
+CV = Path.cwd() / 'cv_04_24.pdf'
+
+if not CV.exists():
+ raise FileNotFoundError(f'You need to set the path to your cv file in the CV variable. CV file not found at {CV}')
+
+
+class Job(BaseModel):
+ title: str
+ link: str
+ company: str
+ fit_score: float
+ location: Optional[str] = None
+ salary: Optional[str] = None
+
+
@controller.action('Save jobs to file - with a score how well it fits to my profile', param_model=Job)
def save_jobs(job: Job):
	"""Append one job row to jobs.csv.

	Bug fix: the action description promises a fit score, but fit_score was
	collected on the model and never written — it is now the last column.
	"""
	with open('jobs.csv', 'a', newline='') as f:
		writer = csv.writer(f)
		writer.writerow([job.title, job.company, job.link, job.salary, job.location, job.fit_score])

	return 'Saved job to file'
+
+
@controller.action('Read jobs from file')
def read_jobs():
	"""Return the raw contents of the saved jobs CSV."""
	return Path('jobs.csv').read_text()
+
+
@controller.action('Read my cv for context to fill forms')
def read_cv():
	"""Extract all text from the CV PDF and keep it in the agent's memory."""
	pdf = PdfReader(CV)
	text = ''
	for page in pdf.pages:
		# extract_text() can return None for image-only pages
		text += page.extract_text() or ''
	logger.info(f'Read cv with {len(text)} characters')
	return ActionResult(extracted_content=text, include_in_memory=True)
+
+
@controller.action(
	'Upload cv to element - call this function to upload if element is not found, try with different index of the same upload element',
)
async def upload_cv(index: int, browser: BrowserContext):
	"""Upload the CV file to the file-input element at `index`.

	Walks from the indexed DOM element to its underlying file-upload
	element, resolves it to a locator, and sets the file path. Returns an
	ActionResult describing success or the specific failure.
	"""
	path = str(CV.absolute())
	dom_el = await browser.get_dom_element_by_index(index)

	if dom_el is None:
		return ActionResult(error=f'No element found at index {index}')

	file_upload_dom_el = dom_el.get_file_upload_element()

	if file_upload_dom_el is None:
		logger.info(f'No file upload element found at index {index}')
		return ActionResult(error=f'No file upload element found at index {index}')

	file_upload_el = await browser.get_locate_element(file_upload_dom_el)

	if file_upload_el is None:
		logger.info(f'No file upload element found at index {index}')
		return ActionResult(error=f'No file upload element found at index {index}')

	try:
		await file_upload_el.set_input_files(path)
		msg = f'Successfully uploaded file "{path}" to index {index}'
		logger.info(msg)
		return ActionResult(extracted_content=msg)
	except Exception as e:
		# Log details at debug level but return a concise error to the agent.
		logger.debug(f'Error in set_input_files: {str(e)}')
		return ActionResult(error=f'Failed to upload file to index {index}')
+
+
# Shared browser attached to a real Chrome install.
# NOTE(review): chrome_instance_path is macOS-specific — adjust for
# Linux/Windows before running elsewhere.
browser = Browser(
	config=BrowserConfig(
		chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
		disable_security=True,
	)
)
+
+
async def main():
	"""Create one agent per target company and run them concurrently."""
	# ground_task = (
	# 	'You are a professional job finder. '
	# 	'1. Read my cv with read_cv'
	# 	'2. Read the saved jobs file '
	# 	'3. start applying to the first link of Amazon '
	# 	'You can navigate through pages e.g. by scrolling '
	# 	'Make sure to be on the english version of the page'
	# )
	# Bug fix: adjacent string literals concatenate with no separator — the
	# original prompt read '...read_cvfind ml internships...'. Keep a
	# trailing space on each fragment.
	ground_task = (
		'You are a professional job finder. '
		'1. Read my cv with read_cv '
		'find ml internships in and save them to a file '
		'search at company:'
	)
	tasks = [
		ground_task + '\n' + 'Google',
		# ground_task + '\n' + 'Amazon',
		# ground_task + '\n' + 'Apple',
		# ground_task + '\n' + 'Microsoft',
		# ground_task
		# + '\n'
		# + 'go to https://nvidia.wd5.myworkdayjobs.com/en-US/NVIDIAExternalCareerSite/job/Taiwan%2C-Remote/Fulfillment-Analyst---New-College-Graduate-2025_JR1988949/apply/autofillWithResume?workerSubType=0c40f6bd1d8f10adf6dae42e46d44a17&workerSubType=ab40a98049581037a3ada55b087049b7 NVIDIA',
		# ground_task + '\n' + 'Meta',
	]
	model = AzureChatOpenAI(
		model='gpt-4o',
		api_version='2024-10-21',
		azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''),
		api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')),
	)

	agents = []
	for task in tasks:
		agent = Agent(task=task, llm=model, controller=controller, browser=browser)
		agents.append(agent)

	# Run all company-specific agents in parallel against the shared browser.
	await asyncio.gather(*[agent.run() for agent in agents])


if __name__ == "__main__":
	asyncio.run(main())
\ No newline at end of file
diff --git a/examples/use-cases/find_influencer_profiles.py b/examples/use-cases/find_influencer_profiles.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4a40f17bf4aef2e7ca407e045af28b65016f586
--- /dev/null
+++ b/examples/use-cases/find_influencer_profiles.py
@@ -0,0 +1,88 @@
+"""
+Show how to use custom outputs.
+
+@dev You need to add OPENAI_API_KEY to your environment variables.
+"""
+
+import json
+import os
+import sys
+from typing import List
+
+import requests
+
+from browser_use.agent.views import ActionResult
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+import asyncio
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+from pydantic import BaseModel
+
+from browser_use import Agent, Controller
+
+load_dotenv()
+
+
class Profile(BaseModel):
	"""A single social-media profile found for the creator."""
	platform: str
	profile_url: str
+
+
class Profiles(BaseModel):
	"""Structured agent output: all profiles discovered for the creator."""
	profiles: List[Profile]
+
+
+controller = Controller(exclude_actions=['search_google'], output_model=Profiles)
+BEARER_TOKEN = os.getenv('BEARER_TOKEN')
+
+if not BEARER_TOKEN:
+ # use the api key for ask tessa
+ # you can also use other apis like exa, xAI, perplexity, etc.
+ raise ValueError('BEARER_TOKEN is not set - go to https://www.heytessa.ai/ and create an api key')
+
+
@controller.registry.action('Search the web for a specific query')
async def search_web(query: str):
	"""Search the web via the Tessa API and return high-scoring sources.

	Returns an ActionResult whose extracted_content is a JSON array of
	sources (url/title/content/author/score) with score >= 0.2.
	"""
	keys_to_use = ['url', 'title', 'content', 'author', 'score']
	headers = {'Authorization': f'Bearer {BEARER_TOKEN}'}
	# Bug fix: a request without a timeout can hang the agent forever.
	response = requests.post(
		'https://asktessa.ai/api/search', headers=headers, json={'query': query}, timeout=30
	)
	# Fail loudly on HTTP errors instead of raising a confusing KeyError below.
	response.raise_for_status()

	final_results = [
		{key: source[key] for key in keys_to_use if key in source}
		for source in response.json()['sources']
		if source['score'] >= 0.2
	]
	result_text = json.dumps(final_results, indent=4)
	print(result_text)
	return ActionResult(extracted_content=result_text, include_in_memory=True)
+
+
async def main():
	"""Run the agent, then parse and print the structured Profiles output."""
	task = (
		'Go to this tiktok video url, open it and extract the @username from the resulting url. Then do a websearch for this username to find all his social media profiles. Return me the links to the social media profiles with the platform name.'
		' https://www.tiktokv.com/share/video/7470981717659110678/ '
	)
	model = ChatOpenAI(model='gpt-4o')
	agent = Agent(task=task, llm=model, controller=controller)

	history = await agent.run()

	result = history.final_result()
	if result:
		# The controller's output_model constrains the result to Profiles JSON.
		parsed: Profiles = Profiles.model_validate_json(result)

		for profile in parsed.profiles:
			print('\n--------------------------------')
			print(f'Platform: {profile.platform}')
			print(f'Profile URL: {profile.profile_url}')

	else:
		print('No result')


if __name__ == '__main__':
	asyncio.run(main())
diff --git a/examples/use-cases/online_coding_agent.py b/examples/use-cases/online_coding_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..390adc0bc88986cd1413190a88d97070abdbb14d
--- /dev/null
+++ b/examples/use-cases/online_coding_agent.py
@@ -0,0 +1,46 @@
+# Goal: Implements a multi-agent system for online code editors, with separate agents for coding and execution.
+
+import os
+import sys
+import asyncio
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from langchain_openai import ChatOpenAI
+from dotenv import load_dotenv
+
+from browser_use import Agent, Browser
+
+# Load environment variables
+load_dotenv()
+if not os.getenv('OPENAI_API_KEY'):
+ raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.')
+
async def main():
	"""Run a three-agent flow in one shared browser context:

	1. agent1 opens the online editor,
	2. coder writes the program,
	3. executor runs it and suggests fixes.
	"""
	browser = Browser()
	async with await browser.new_context() as context:
		model = ChatOpenAI(model='gpt-4o')

		# Initialize browser agent
		agent1 = Agent(
			task='Open an online code editor programiz.',
			llm=model,
			browser_context=context,
		)
		executor = Agent(
			task='Executor. Execute the code written by the coder and suggest some updates if there are errors.',
			llm=model,
			browser_context=context,
		)

		coder = Agent(
			task='Coder. Your job is to write and complete code. You are an expert coder. Code a simple calculator. Write the code on the coding interface after agent1 has opened the link.',
			llm=model,
			browser_context=context,
		)
		await agent1.run()
		# Bug fix: the coder must write the code before the executor runs it;
		# the original ran the executor second, before any code existed.
		await coder.run()
		await executor.run()

if __name__ == "__main__":
	asyncio.run(main())
\ No newline at end of file
diff --git a/examples/use-cases/post-twitter.py b/examples/use-cases/post-twitter.py
new file mode 100644
index 0000000000000000000000000000000000000000..86caef1a0e26083112b05eecda8fb67fe48f525e
--- /dev/null
+++ b/examples/use-cases/post-twitter.py
@@ -0,0 +1,127 @@
+"""
+Goal: Provides a template for automated posting on X (Twitter), including new tweets, tagging, and replies.
+
+X Posting Template using browser-use
+----------------------------------------
+
+This template allows you to automate posting on X using browser-use.
+It supports:
+- Posting new tweets
+- Tagging users
+- Replying to tweets
+
+Add your target user and message in the config section.
+
+target_user="XXXXX"
+message="XXXXX"
+reply_url="XXXXX"
+
+Any issues, contact me on X @defichemist95
+"""
+
+import os
+import sys
+from typing import Optional
+import asyncio
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from dataclasses import dataclass
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use import Agent, Controller
+
+# Load environment variables
+load_dotenv()
+if not os.getenv('OPENAI_API_KEY'):
+ raise ValueError('OPENAI_API_KEY is not set. Please add it to your environment variables.')
+
+# ============ Configuration Section ============
@dataclass
class TwitterConfig:
	"""Configuration for Twitter posting"""

	openai_api_key: str
	chrome_path: str  # path to the Chrome binary browser-use attaches to
	target_user: str  # Twitter handle without @
	message: str  # text to post, prefixed with @target_user
	reply_url: str  # tweet URL the agent replies to in step 6
	headless: bool = False
	model: str = "gpt-4o-mini"
	base_url: str = "https://x.com/home"
+
+
# Customize these settings
# NOTE(review): os.getenv may return None when OPENAI_API_KEY is unset while
# the field is typed str — consider failing fast here instead.
config = TwitterConfig(
	openai_api_key=os.getenv("OPENAI_API_KEY"),
	chrome_path="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",  # This is for MacOS (Chrome)
	target_user="XXXXX",
	message="XXXXX",
	reply_url="XXXXX",
	headless=False,
)
+
+
def create_twitter_agent(config: TwitterConfig) -> Agent:
	"""Build an Agent attached to a real Chrome instance, with step-by-step
	posting instructions rendered from `config`."""
	llm = ChatOpenAI(model=config.model, api_key=config.openai_api_key)

	browser = Browser(
		config=BrowserConfig(
			headless=config.headless,
			chrome_instance_path=config.chrome_path,
		)
	)

	controller = Controller()

	# Construct the full message with tag
	full_message = f"@{config.target_user} {config.message}"

	# Create the agent with detailed instructions
	return Agent(
		task=f"""Navigate to Twitter and create a post and reply to a tweet.

        Here are the specific steps:

        1. Go to {config.base_url}. See the text input field at the top of the page that says "What's happening?"
        2. Look for the text input field at the top of the page that says "What's happening?"
        3. Click the input field and type exactly this message:
        "{full_message}"
        4. Find and click the "Post" button (look for attributes: 'button' and 'data-testid="tweetButton"')
        5. Do not click on the '+' button which will add another tweet.

        6. Navigate to {config.reply_url}
        7. Before replying, understand the context of the tweet by scrolling down and reading the comments.
        8. Reply to the tweet under 50 characters.

        Important:
        - Wait for each element to load before interacting
        - Make sure the message is typed exactly as shown
        - Verify the post button is clickable before clicking
        - Do not click on the '+' button which will add another tweet
        """,
		llm=llm,
		controller=controller,
		browser=browser,
	)
+
+
async def post_tweet(agent: Agent):
	"""Run the agent to post the tweet, then save a GIF of the run."""
	try:
		await agent.run(max_steps=100)
		agent.create_history_gif()
		print("Tweet posted successfully!")
	except Exception as e:
		# Broad catch is deliberate: report any failure without crashing the demo.
		print(f"Error posting tweet: {str(e)}")
+
+
async def main():
	"""Create the configured agent and post the tweet."""
	agent = create_twitter_agent(config)
	# Bug fix: post_tweet was defined but never called, so the history GIF
	# and success/error reporting were dead code.
	await post_tweet(agent)

if __name__ == "__main__":
	asyncio.run(main())
\ No newline at end of file
diff --git a/examples/use-cases/scrolling_page.py b/examples/use-cases/scrolling_page.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bc358e1ecd4c2a50d9f356d620d2186dfef91d2
--- /dev/null
+++ b/examples/use-cases/scrolling_page.py
@@ -0,0 +1,42 @@
+# Goal: Automates webpage scrolling with various scrolling actions and text search functionality.
+
+import os
+import sys
+import asyncio
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from langchain_openai import ChatOpenAI
+from browser_use import Agent
+from dotenv import load_dotenv
+
+from browser_use.browser.browser import Browser, BrowserConfig
+
+# Load environment variables
+load_dotenv()
+if not os.getenv('OPENAI_API_KEY'):
+ raise ValueError('OPENAI_API_KEY is not set')
+
+"""
+Example: Using the 'Scroll down' action.
+
+This script demonstrates how the agent can navigate to a webpage and scroll down the content.
+If no amount is specified, the agent will scroll down by one page height.
+"""
+
+llm = ChatOpenAI(model='gpt-4o')
+
+agent = Agent(
+ # task="Navigate to 'https://en.wikipedia.org/wiki/Internet' and scroll down by one page - then scroll up by 100 pixels - then scroll down by 100 pixels - then scroll down by 10000 pixels.",
+ task="Navigate to 'https://en.wikipedia.org/wiki/Internet' and scroll to the string 'The vast majority of computer'",
+ llm=llm,
+ browser=Browser(config=BrowserConfig(headless=False)),
+)
+
+
+async def main():
+ await agent.run()
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/examples/use-cases/shopping.py b/examples/use-cases/shopping.py
new file mode 100644
index 0000000000000000000000000000000000000000..1328aabd846f6e98c2d64c792a244fb522d0843e
--- /dev/null
+++ b/examples/use-cases/shopping.py
@@ -0,0 +1,119 @@
+from langchain_openai import ChatOpenAI
+from browser_use import Agent, Browser, BrowserConfig
+from dotenv import load_dotenv
+load_dotenv()
+
+import asyncio
+
+task="""
+ ### Prompt for Shopping Agent – Migros Online Grocery Order
+
+**Objective:**
+Visit [Migros Online](https://www.migros.ch/en), search for the required grocery items, add them to the cart, select an appropriate delivery window, and complete the checkout process using TWINT.
+
+**Important:**
+- Make sure that you don't buy more than is needed for each article.
+- After your search, if you click the "+" button, it adds the item to the basket.
+- If you open the basket side-window menu, you can close it by clicking the X button on the top right. This will help you navigate more easily.
+---
+
+### Step 1: Navigate to the Website
+- Open [Migros Online](https://www.migros.ch/en).
+- You should be logged in as Nikolaos Kaliorakis
+
+---
+
+### Step 2: Add Items to the Basket
+
+#### Shopping List:
+
+**Meat & Dairy:**
+- Beef Minced meat (1 kg)
+- Gruyère cheese (grated preferably)
+- 2 liters full-fat milk
+- Butter (cheapest available)
+
+**Vegetables:**
+- Carrots (1kg pack)
+- Celery
+- Leeks (1 piece)
+- 1 kg potatoes
+
+At this stage, check the basket on the top right (indicates the price) and check if you bought the right items.
+
+**Fruits:**
+- 2 lemons
+- Oranges (for snacking)
+
+**Pantry Items:**
+- Lasagna sheets
+- Tahini
+- Tomato paste (below CHF2)
+- Black pepper refill (not with the mill)
+- 2x 1L Oatly Barista(oat milk)
+- 1 pack of eggs (10 egg package)
+
+#### Ingredients I already have (DO NOT purchase):
+- Olive oil, garlic, canned tomatoes, dried oregano, bay leaves, salt, chili flakes, flour, nutmeg, cumin.
+
+---
+
+### Step 3: Handling Unavailable Items
+- If an item is **out of stock**, find the best alternative.
+- Use the following recipe contexts to choose substitutions:
+ - **Pasta Bolognese & Lasagna:** Minced meat, tomato paste, lasagna sheets, milk (for béchamel), Gruyère cheese.
+ - **Hummus:** Tahini, chickpeas, lemon juice, olive oil.
+ - **Chickpea Curry Soup:** Chickpeas, leeks, curry, lemons.
+ - **Crispy Slow-Cooked Pork Belly with Vegetables:** Potatoes, butter.
+- Example substitutions:
+ - If Gruyère cheese is unavailable, select another semi-hard cheese.
+ - If Tahini is unavailable, a sesame-based alternative may work.
+
+---
+
+### Step 4: Adjusting for Minimum Order Requirement
+- If the total order **is below CHF 99**, add **a liquid soap refill** to reach the minimum. If it's still below, you can buy some bread or dark chocolate.
+- At this step, check if you have bought MORE items than needed. If the price is more than CHF 200, you MUST remove items.
+- If an item is not available, choose an alternative.
+- If age verification is needed, remove alcoholic products; we haven't verified our age yet.
+
+---
+
+### Step 5: Select Delivery Window
+- Choose a **delivery window within the current week**. It's ok to pay up to CHF 2 for the window selection.
+- Preferably select a slot within the workweek.
+
+---
+
+### Step 6: Checkout
+- Proceed to checkout.
+- Select **TWINT** as the payment method.
+- Check out.
+-
+- If it's needed, the username is: nikoskalio.dev@gmail.com
+- and the password is : TheCircuit.Migros.dev!
+---
+
+### Step 7: Confirm Order & Output Summary
+- Once the order is placed, output a summary including:
+ - **Final list of items purchased** (including any substitutions).
+ - **Total cost**.
+ - **Chosen delivery time**.
+
+**Important:** Ensure efficiency and accuracy throughout the process."""
+
+browser = Browser()
+
+agent = Agent(
+ task=task,
+ llm=ChatOpenAI(model="gpt-4o"),
+ browser=browser,
+ )
+
+async def main():
+ await agent.run()
+ input("Press Enter to close the browser...")
+ await browser.close()
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/examples/use-cases/twitter_post_using_cookies.py b/examples/use-cases/twitter_post_using_cookies.py
new file mode 100644
index 0000000000000000000000000000000000000000..72ac98cea241a9508f17755108d971bbddb0c58a
--- /dev/null
+++ b/examples/use-cases/twitter_post_using_cookies.py
@@ -0,0 +1,43 @@
+# Goal: Automates posting on X (Twitter) using stored authentication cookies.
+
+import asyncio
+import os
+
+from dotenv import load_dotenv
+from langchain_google_genai import ChatGoogleGenerativeAI
+from pydantic import SecretStr
+
+from browser_use import Agent
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext, BrowserContextConfig
+
+load_dotenv()
+api_key = os.getenv('GEMINI_API_KEY')
+if not api_key:
+ raise ValueError('GEMINI_API_KEY is not set')
+
+llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(api_key))
+
+
+browser = Browser(
+ config=BrowserConfig(
+ # chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
+ )
+)
+file_path = os.path.join(os.path.dirname(__file__), 'twitter_cookies.txt')
+context = BrowserContext(browser=browser, config=BrowserContextConfig(cookies_file=file_path))
+
+
+async def main():
+ agent = Agent(
+ browser_context=context,
+ task=('go to https://x.com. write a new post with the text "browser-use ftw", and submit it'),
+ llm=llm,
+ max_actions_per_step=4,
+ )
+ await agent.run(max_steps=25)
+ input('Press Enter to close the browser...')
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/examples/use-cases/web_voyager_agent.py b/examples/use-cases/web_voyager_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b0de90420e7b2ae1421b1f528e81c7eb1c2bc2b
--- /dev/null
+++ b/examples/use-cases/web_voyager_agent.py
@@ -0,0 +1,72 @@
+# Goal: A general-purpose web navigation agent for tasks like flight booking and course searching.
+
+import os
+import sys
+import asyncio
+
+# Adjust Python path
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from dotenv import load_dotenv
+from pydantic import SecretStr
+from langchain_openai import AzureChatOpenAI
+
+from browser_use.agent.service import Agent
+from browser_use.browser.browser import Browser, BrowserConfig, BrowserContextConfig
+
+# Load environment variables
+load_dotenv()
+
+# Validate required environment variables
+required_env_vars = ["AZURE_OPENAI_KEY", "AZURE_OPENAI_ENDPOINT"]
+for var in required_env_vars:
+ if not os.getenv(var):
+ raise ValueError(f"{var} is not set. Please add it to your environment variables.")
+
+browser = Browser(
+ config=BrowserConfig(
+ headless=False, # This is True in production
+ disable_security=True,
+ new_context_config=BrowserContextConfig(
+ disable_security=True,
+ minimum_wait_page_load_time=1, # 3 on prod
+ maximum_wait_page_load_time=10, # 20 on prod
+ # no_viewport=True,
+ browser_window_size={
+ 'width': 1280,
+ 'height': 1100,
+ },
+ # trace_path='./tmp/web_voyager_agent',
+ ),
+ )
+)
+llm = AzureChatOpenAI(
+ model='gpt-4o',
+ api_version='2024-10-21',
+ azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''),
+ api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')),
+)
+
+# TASK = """
+# Find the lowest-priced one-way flight from Cairo to Montreal on February 21, 2025, including the total travel time and number of stops. on https://www.google.com/travel/flights/
+# """
+# TASK = """
+# Browse Coursera, which universities offer Master of Advanced Study in Engineering degrees? Tell me what is the latest application deadline for this degree? on https://www.coursera.org/"""
+TASK = """
+Find and book a hotel in Paris with suitable accommodations for a family of four (two adults and two children) offering free cancellation for the dates of February 14-21, 2025. on https://www.booking.com/
+"""
+
+
+async def main():
+ agent = Agent(
+ task=TASK,
+ llm=llm,
+ browser=browser,
+ validate_output=True,
+ )
+ history = await agent.run(max_steps=50)
+ history.save_to_file('./tmp/history.json')
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/examples/use-cases/wikipedia_banana_to_quantum.py b/examples/use-cases/wikipedia_banana_to_quantum.py
new file mode 100644
index 0000000000000000000000000000000000000000..23c1b8d315407263198baaca90049faa54ff05ef
--- /dev/null
+++ b/examples/use-cases/wikipedia_banana_to_quantum.py
@@ -0,0 +1,34 @@
+import asyncio
+
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent
+from browser_use.browser.browser import Browser, BrowserConfig, BrowserContextConfig
+
+load_dotenv()
+
+# video https://preview.screen.studio/share/vuq91Ej8
+llm = ChatOpenAI(
+ model='gpt-4o',
+ temperature=0.0,
+)
+task = 'go to https://en.wikipedia.org/wiki/Banana and click on buttons on the wikipedia page to go as fast as possible from banana to Quantum mechanics'
+
+browser = Browser(
+ config=BrowserConfig(
+ new_context_config=BrowserContextConfig(
+ viewport_expansion=-1,
+ highlight_elements=False,
+ ),
+ ),
+)
+agent = Agent(task=task, llm=llm, browser=browser, use_vision=False)
+
+
+async def main():
+ await agent.run()
+
+
+if __name__ == '__main__':
+ asyncio.run(main())
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..f39231da02d41fe05a5aa71889c5a892db830271
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,57 @@
+[project]
+name = "browser-use"
+description = "Make websites accessible for AI agents"
+authors = [{ name = "Gregor Zunic" }]
+version = "0.1.40"
+readme = "README.md"
+requires-python = ">=3.11,<4.0"
+classifiers = [
+ "Programming Language :: Python :: 3",
+ "License :: OSI Approved :: MIT License",
+ "Operating System :: OS Independent",
+]
+dependencies = [
+ "httpx>=0.27.2",
+ "pydantic>=2.10.4",
+ "python-dotenv>=1.0.1",
+ "requests>=2.32.3",
+ "posthog>=3.7.0",
+ "playwright>=1.49.0",
+ "setuptools>=75.8.0",
+ "markdownify==0.14.1",
+ "langchain-core>=0.3.35",
+ "langchain-openai==0.3.1",
+ "langchain-anthropic==0.3.3",
+ "langchain-ollama==0.2.2",
+]
+urls = { "Repository" = "https://github.com/browser-use/browser-use" }
+
+[project.optional-dependencies]
+dev = [
+ "tokencost>=0.1.16",
+ "hatch>=1.13.0",
+ "build>=1.2.2",
+ "pytest>=8.3.3",
+ "pytest-asyncio>=0.24.0",
+ "fastapi>=0.115.8",
+ "inngest>=0.4.19",
+ "uvicorn>=0.34.0",
+ "langchain>=0.3.18",
+ "langchain-aws>=0.2.11",
+ "langchain-fireworks>=0.2.6",
+ "langchain-google-genai==2.0.8",
+]
+
+[tool.ruff]
+line-length = 130
+select = ["E", "F", "I"]
+fix = true
+
+[tool.ruff.format]
+quote-style = "single"
+indent-style = "tab"
+docstring-code-format = true
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000000000000000000000000000000000000..8715c46f8ba9a74025a31d45c397ff8596b1ec8b
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,29 @@
+[pytest]
+markers =
+ slow: marks tests as slow (deselect with '-m "not slow"')
+ integration: marks tests as integration tests
+ unit: marks tests as unit tests
+ asyncio: mark tests as async tests
+
+testpaths =
+ tests
+
+python_files =
+ test_*.py
+ *_test.py
+
+addopts =
+ -v
+ --strict-markers
+ --tb=short
+
+asyncio_mode = auto
+asyncio_default_fixture_loop_scope = function
+log_cli = true
+; log_cli_level = DEBUG
+log_cli_format = %(levelname)-8s [%(name)s] %(message)s
+filterwarnings =
+ ignore::pytest.PytestDeprecationWarning
+ ignore::DeprecationWarning
+
+log_level = INFO
diff --git a/static/browser-use-dark.png b/static/browser-use-dark.png
new file mode 100644
index 0000000000000000000000000000000000000000..1b84ab87956327263d49cd7f77a09c91be6f8bca
Binary files /dev/null and b/static/browser-use-dark.png differ
diff --git a/static/browser-use.png b/static/browser-use.png
new file mode 100644
index 0000000000000000000000000000000000000000..54685c4f4b3c721f74bbb09cfc54c6645216ae70
Binary files /dev/null and b/static/browser-use.png differ
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..663966c2740534d8ccd52b5e62b19c75bb9346e5
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,59 @@
+"""
+Test configuration for browser-use.
+"""
+
+import logging
+import os
+import sys
+
+import pytest
+from langchain_openai import ChatOpenAI
+from pydantic import SecretStr
+
+# Ensure the project root is in the Python path
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+# Load environment variables
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# Configure logging
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext
+
+
+@pytest.fixture(scope='session')
+def llm():
+ """
+ Fixture to provide a ChatOpenAI instance or a mock for testing.
+ Uses a mock if OPENAI_API_KEY is not set.
+ """
+ api_key = os.getenv('OPENAI_API_KEY')
+ logger.debug(f'API Key present: {bool(api_key)}')
+ logger.debug('Using actual ChatOpenAI model')
+ return ChatOpenAI(model='gpt-4o', api_key=SecretStr(api_key) if api_key else None)
+
+
+@pytest.fixture(scope='session')
+def browser():
+ """
+ Fixture to provide a Browser instance for testing.
+ """
+ logger.debug('Creating Browser instance for testing')
+ return Browser(config=BrowserConfig(headless=True, disable_security=True))
+
+
+@pytest.fixture(scope='function')
+async def browser_context(browser):
+ """
+ Fixture to provide a BrowserContext instance for testing.
+ """
+ logger.debug('Creating BrowserContext instance for testing')
+ context = BrowserContext(browser=browser)
+ yield context
+ await context.close()
diff --git a/tests/test_agent_actions.py b/tests/test_agent_actions.py
new file mode 100644
index 0000000000000000000000000000000000000000..14d8dbacc7882065f7339bb27abf7e62d06b6358
--- /dev/null
+++ b/tests/test_agent_actions.py
@@ -0,0 +1,224 @@
+import asyncio
+import os
+
+import pytest
+from langchain_openai import AzureChatOpenAI
+from pydantic import BaseModel, SecretStr
+
+from browser_use.agent.service import Agent
+from browser_use.agent.views import AgentHistoryList
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.views import BrowserState
+
+
+@pytest.fixture
+def llm():
+ """Initialize language model for testing"""
+
+ # return ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None)
+ return AzureChatOpenAI(
+ model='gpt-4o',
+ api_version='2024-10-21',
+ azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''),
+ api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')),
+ )
+ # return ChatOpenAI(model='gpt-4o-mini')
+
+
+@pytest.fixture(scope='session')
+def event_loop():
+ """Create an instance of the default event loop for each test case."""
+ loop = asyncio.get_event_loop_policy().new_event_loop()
+ yield loop
+ loop.close()
+
+
+@pytest.fixture(scope='session')
+async def browser(event_loop):
+ browser_instance = Browser(
+ config=BrowserConfig(
+ headless=True,
+ )
+ )
+ yield browser_instance
+ await browser_instance.close()
+
+
+@pytest.fixture
+async def context(browser):
+ async with await browser.new_context() as context:
+ yield context
+ # Clean up automatically happens with __aexit__
+
+
+# pytest tests/test_agent_actions.py -v -k "test_ecommerce_interaction" --capture=no
+# @pytest.mark.asyncio
+@pytest.mark.skip(reason='Kinda expensive to run')
+async def test_ecommerce_interaction(llm, context):
+ """Test complex ecommerce interaction sequence"""
+ agent = Agent(
+ task="Go to amazon.com, search for 'laptop', filter by 4+ stars, and find the price of the first result",
+ llm=llm,
+ browser_context=context,
+ save_conversation_path='tmp/test_ecommerce_interaction/conversation',
+ )
+
+ history: AgentHistoryList = await agent.run(max_steps=20)
+
+ # Verify sequence of actions
+ action_sequence = []
+ for action in history.model_actions():
+ action_name = list(action.keys())[0]
+ if action_name in ['go_to_url', 'open_tab']:
+ action_sequence.append('navigate')
+ elif action_name == 'input_text':
+ action_sequence.append('input')
+ # Check that the input is 'laptop'
+ inp = action['input_text']['text'].lower() # type: ignore
+ if inp == 'laptop':
+ action_sequence.append('input_exact_correct')
+ elif 'laptop' in inp:
+ action_sequence.append('correct_in_input')
+ else:
+ action_sequence.append('incorrect_input')
+ elif action_name == 'click_element':
+ action_sequence.append('click')
+
+ # Verify essential steps were performed
+ assert 'navigate' in action_sequence # Navigated to Amazon
+ assert 'input' in action_sequence # Entered search term
+ assert 'click' in action_sequence # Clicked search/filter
+ assert 'input_exact_correct' in action_sequence or 'correct_in_input' in action_sequence
+
+
+# @pytest.mark.asyncio
+async def test_error_recovery(llm, context):
+ """Test agent's ability to recover from errors"""
+ agent = Agent(
+ task='Navigate to nonexistent-site.com and then recover by going to google.com ',
+ llm=llm,
+ browser_context=context,
+ )
+
+ history: AgentHistoryList = await agent.run(max_steps=10)
+
+ actions_names = history.action_names()
+ actions = history.model_actions()
+ assert (
+ 'go_to_url' in actions_names or 'open_tab' in actions_names
+ ), f'{actions_names} does not contain go_to_url or open_tab'
+ for action in actions:
+ if 'go_to_url' in action:
+ assert 'url' in action['go_to_url'], 'url is not in go_to_url'
+ assert action['go_to_url']['url'].endswith(
+ 'google.com'
+ ), 'url does not end with google.com'
+ break
+
+
+# @pytest.mark.asyncio
+async def test_find_contact_email(llm, context):
+ """Test agent's ability to find contact email on a website"""
+ agent = Agent(
+ task='Go to https://browser-use.com/ and find out the contact email',
+ llm=llm,
+ browser_context=context,
+ )
+
+ history: AgentHistoryList = await agent.run(max_steps=10)
+
+ # Verify the agent found the contact email
+ extracted_content = history.extracted_content()
+ email = 'info@browser-use.com'
+ for content in extracted_content:
+ if email in content:
+ break
+ else:
+ pytest.fail(f'{extracted_content} does not contain {email}')
+
+
+# @pytest.mark.asyncio
+async def test_agent_finds_installation_command(llm, context):
+ """Test agent's ability to find the pip installation command for browser-use on the web"""
+ agent = Agent(
+ task='Find the pip installation command for the browser-use repo',
+ llm=llm,
+ browser_context=context,
+ )
+
+ history: AgentHistoryList = await agent.run(max_steps=10)
+
+ # Verify the agent found the correct installation command
+ extracted_content = history.extracted_content()
+ install_command = 'pip install browser-use'
+ for content in extracted_content:
+ if install_command in content:
+ break
+ else:
+ pytest.fail(f'{extracted_content} does not contain {install_command}')
+
+
+class CaptchaTest(BaseModel):
+ name: str
+ url: str
+ success_text: str
+ additional_text: str | None = None
+
+
+# run 3 test: python -m pytest tests/test_agent_actions.py -v -k "test_captcha_solver" --capture=no --log-cli-level=INFO
+# pytest tests/test_agent_actions.py -v -k "test_captcha_solver" --capture=no --log-cli-level=INFO
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+ 'captcha',
+ [
+ CaptchaTest(
+ name='Text Captcha',
+ url='https://2captcha.com/demo/text',
+ success_text='Captcha is passed successfully!',
+ ),
+ CaptchaTest(
+ name='Basic Captcha',
+ url='https://captcha.com/demos/features/captcha-demo.aspx',
+ success_text='Correct!',
+ ),
+ CaptchaTest(
+ name='Rotate Captcha',
+ url='https://2captcha.com/demo/rotatecaptcha',
+ success_text='Captcha is passed successfully',
+ additional_text='Use multiple clicks at once. click done when image is exact correct position.',
+ ),
+ CaptchaTest(
+ name='MT Captcha',
+ url='https://2captcha.com/demo/mtcaptcha',
+ success_text='Verified Successfully',
+ additional_text='Stop when you solved it successfully.',
+ ),
+ ],
+)
+async def test_captcha_solver(llm, context, captcha: CaptchaTest):
+ """Test agent's ability to solve different types of captchas"""
+ agent = Agent(
+ task=f'Go to {captcha.url} and solve the captcha. {captcha.additional_text}',
+ llm=llm,
+ browser_context=context,
+ )
+ from browser_use.agent.views import AgentHistoryList
+
+ history: AgentHistoryList = await agent.run(max_steps=7)
+
+ state: BrowserState = await context.get_state()
+
+ all_text = state.element_tree.get_all_text_till_next_clickable_element()
+
+ if not all_text:
+ all_text = ''
+
+ if not isinstance(all_text, str):
+ all_text = str(all_text)
+
+ solved = captcha.success_text in all_text
+ assert solved, f'Failed to solve {captcha.name}'
+
+ # python -m pytest tests/test_agent_actions.py -v --capture=no
+
+ # pytest tests/test_agent_actions.py -v -k "test_captcha_solver" --capture=no --log-cli-level=INFO
diff --git a/tests/test_attach_chrome.py b/tests/test_attach_chrome.py
new file mode 100644
index 0000000000000000000000000000000000000000..2271f3f07b98d7e5d2254a9460334448160b66c2
--- /dev/null
+++ b/tests/test_attach_chrome.py
@@ -0,0 +1,69 @@
+import asyncio
+
+from playwright.async_api import async_playwright
+
+
+async def test_full_screen(start_fullscreen: bool, maximize: bool):
+ async with async_playwright() as p:
+ try:
+ print('Attempting to connect to Chrome...')
+ # run in terminal: /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --remote-debugging-port=9222 --no-first-run
+ browser = await p.chromium.connect_over_cdp(
+ 'http://localhost:9222',
+ timeout=20000, # 20 second timeout for connection
+ )
+ print('Connected to Chrome successfully')
+
+ # Get the first context and page, or create new ones if needed
+ if len(browser.contexts) == 0:
+ context = await browser.new_context(ignore_https_errors=True)
+ else:
+ context = browser.contexts[0]
+
+ if len(context.pages) == 0:
+ page = await context.new_page()
+ else:
+ page = context.pages[0]
+
+ print('Attempting to navigate to Gmail...')
+ try:
+ # First try with a shorter timeout
+ await page.goto(
+ 'https://mail.google.com',
+ wait_until='load', # Changed from domcontentloaded
+ timeout=10000,
+ )
+ except Exception as e:
+ print(f'First navigation attempt failed: {e}')
+ print('Trying again with different settings...')
+ # If that fails, try again with different settings
+ await page.goto(
+ 'https://mail.google.com',
+ wait_until='commit', # Less strict wait condition
+ timeout=30000,
+ )
+
+ # Wait for the page to stabilize
+ await asyncio.sleep(2)
+
+ print(f'Current page title: {await page.title()}')
+
+ # Optional: wait for specific Gmail elements
+ try:
+ await page.wait_for_selector('div[role="main"]', timeout=5000)
+ print('Gmail interface detected')
+ except Exception as e:
+ print(f'Note: Gmail interface not detected: {e}')
+
+ await asyncio.sleep(30)
+ except Exception as e:
+ print(f'An error occurred: {e}')
+ import traceback
+
+ traceback.print_exc()
+ finally:
+ await browser.close()
+
+
+if __name__ == '__main__':
+ asyncio.run(test_full_screen(False, False))
diff --git a/tests/test_browser.py b/tests/test_browser.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3acf344f617552ec376e2e8476a94cfdf42fe6b
--- /dev/null
+++ b/tests/test_browser.py
@@ -0,0 +1,306 @@
+import asyncio
+import pytest
+import requests
+import subprocess
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext, BrowserContextConfig
+from playwright._impl._api_structures import ProxySettings
+
+@pytest.mark.asyncio
+async def test_standard_browser_launch(monkeypatch):
+ """
+ Test that the standard browser is launched correctly:
+ When no remote (cdp or wss) or chrome instance is provided, the Browser class uses _setup_standard_browser.
+ This test monkeypatches async_playwright to return dummy objects, and asserts that get_playwright_browser returns the expected DummyBrowser.
+ """
+ class DummyBrowser:
+ pass
+ class DummyChromium:
+ async def launch(self, headless, args, proxy=None):
+ return DummyBrowser()
+ class DummyPlaywright:
+ def __init__(self):
+ self.chromium = DummyChromium()
+ async def stop(self):
+ pass
+ class DummyAsyncPlaywrightContext:
+ async def start(self):
+ return DummyPlaywright()
+ monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext())
+ config = BrowserConfig(headless=True, disable_security=False, extra_chromium_args=["--test"])
+ browser_obj = Browser(config=config)
+ result_browser = await browser_obj.get_playwright_browser()
+ assert isinstance(result_browser, DummyBrowser), "Expected DummyBrowser from _setup_standard_browser"
+ await browser_obj.close()
+@pytest.mark.asyncio
+async def test_cdp_browser_launch(monkeypatch):
+ """
+ Test that when a CDP URL is provided in the configuration, the Browser uses _setup_cdp
+ and returns the expected DummyBrowser.
+ """
+ class DummyBrowser:
+ pass
+ class DummyChromium:
+ async def connect_over_cdp(self, endpoint_url, timeout=20000):
+ assert endpoint_url == "ws://dummy-cdp-url", "The endpoint URL should match the configuration."
+ return DummyBrowser()
+ class DummyPlaywright:
+ def __init__(self):
+ self.chromium = DummyChromium()
+ async def stop(self):
+ pass
+ class DummyAsyncPlaywrightContext:
+ async def start(self):
+ return DummyPlaywright()
+ monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext())
+ config = BrowserConfig(cdp_url="ws://dummy-cdp-url")
+ browser_obj = Browser(config=config)
+ result_browser = await browser_obj.get_playwright_browser()
+ assert isinstance(result_browser, DummyBrowser), "Expected DummyBrowser from _setup_cdp"
+ await browser_obj.close()
+@pytest.mark.asyncio
+async def test_wss_browser_launch(monkeypatch):
+ """
+ Test that when a WSS URL is provided in the configuration,
+ the Browser uses _setup_wss and returns the expected DummyBrowser.
+ """
+ class DummyBrowser:
+ pass
+ class DummyChromium:
+ async def connect(self, wss_url):
+ assert wss_url == "ws://dummy-wss-url", "WSS URL should match the configuration."
+ return DummyBrowser()
+ class DummyPlaywright:
+ def __init__(self):
+ self.chromium = DummyChromium()
+ async def stop(self):
+ pass
+ class DummyAsyncPlaywrightContext:
+ async def start(self):
+ return DummyPlaywright()
+ monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext())
+ config = BrowserConfig(wss_url="ws://dummy-wss-url")
+ browser_obj = Browser(config=config)
+ result_browser = await browser_obj.get_playwright_browser()
+ assert isinstance(result_browser, DummyBrowser), "Expected DummyBrowser from _setup_wss"
+ await browser_obj.close()
+@pytest.mark.asyncio
+async def test_chrome_instance_browser_launch(monkeypatch):
+ """
+ Test that when a chrome instance path is provided the Browser class uses
+ _setup_browser_with_instance branch and returns the expected DummyBrowser object
+ by reusing an existing Chrome instance.
+ """
+ # Dummy response for requests.get when checking chrome debugging endpoint.
+ class DummyResponse:
+ status_code = 200
+ def dummy_get(url, timeout):
+ if url == "http://localhost:9222/json/version":
+ return DummyResponse()
+ raise requests.ConnectionError("Connection failed")
+ monkeypatch.setattr(requests, "get", dummy_get)
+ class DummyBrowser:
+ pass
+ class DummyChromium:
+ async def connect_over_cdp(self, endpoint_url, timeout=20000):
+ assert endpoint_url == "http://localhost:9222", "Endpoint URL must be 'http://localhost:9222'"
+ return DummyBrowser()
+ class DummyPlaywright:
+ def __init__(self):
+ self.chromium = DummyChromium()
+ async def stop(self):
+ pass
+ class DummyAsyncPlaywrightContext:
+ async def start(self):
+ return DummyPlaywright()
+ monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext())
+ config = BrowserConfig(chrome_instance_path="dummy/chrome", extra_chromium_args=["--dummy-arg"])
+ browser_obj = Browser(config=config)
+ result_browser = await browser_obj.get_playwright_browser()
+ assert isinstance(result_browser, DummyBrowser), "Expected DummyBrowser from _setup_browser_with_instance"
+ await browser_obj.close()
+@pytest.mark.asyncio
+async def test_standard_browser_disable_security_args(monkeypatch):
+ """
+ Test that the standard browser launch includes disable-security arguments when disable_security is True.
+ This verifies that _setup_standard_browser correctly appends the security disabling arguments along with
+ the base arguments and any extra arguments provided.
+ """
+ # These are the base arguments defined in _setup_standard_browser.
+ base_args = [
+ '--no-sandbox',
+ '--disable-blink-features=AutomationControlled',
+ '--disable-infobars',
+ '--disable-background-timer-throttling',
+ '--disable-popup-blocking',
+ '--disable-backgrounding-occluded-windows',
+ '--disable-renderer-backgrounding',
+ '--disable-window-activation',
+ '--disable-focus-on-load',
+ '--no-first-run',
+ '--no-default-browser-check',
+ '--no-startup-window',
+ '--window-position=0,0',
+ ]
+ # When disable_security is True, these arguments should be added.
+ disable_security_args = [
+ '--disable-web-security',
+ '--disable-site-isolation-trials',
+ '--disable-features=IsolateOrigins,site-per-process'
+ ]
+ # Additional arbitrary argument for testing extra args
+ extra_args = ["--dummy-extra"]
+ class DummyBrowser:
+ pass
+ class DummyChromium:
+ async def launch(self, headless, args, proxy=None):
+ # Expected args is the base args plus disable security args and the extra args.
+ expected_args = base_args + disable_security_args + extra_args
+ assert headless is True, "Expected headless to be True"
+ assert args == expected_args, f"Expected args {expected_args}, but got {args}"
+ assert proxy is None, "Expected proxy to be None"
+ return DummyBrowser()
+ class DummyPlaywright:
+ def __init__(self):
+ self.chromium = DummyChromium()
+ async def stop(self):
+ pass
+ class DummyAsyncPlaywrightContext:
+ async def start(self):
+ return DummyPlaywright()
+ monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext())
+ config = BrowserConfig(headless=True, disable_security=True, extra_chromium_args=extra_args)
+ browser_obj = Browser(config=config)
+ result_browser = await browser_obj.get_playwright_browser()
+ assert isinstance(result_browser, DummyBrowser), "Expected DummyBrowser from _setup_standard_browser with disable_security active"
+ await browser_obj.close()
+@pytest.mark.asyncio
+async def test_new_context_creation():
+ """
+ Test that the new_context method returns a BrowserContext with the correct attributes.
+ This verifies that the BrowserContext is initialized with the provided Browser instance and configuration.
+ """
+ config = BrowserConfig()
+ browser_obj = Browser(config=config)
+ custom_context_config = BrowserContextConfig()
+ context = await browser_obj.new_context(custom_context_config)
+ assert isinstance(context, BrowserContext), "Expected new_context to return an instance of BrowserContext"
+ assert context.browser is browser_obj, "Expected the context's browser attribute to be the Browser instance"
+ assert context.config == custom_context_config, "Expected the context's config attribute to be the provided config"
+ await browser_obj.close()
+@pytest.mark.asyncio
+async def test_chrome_instance_browser_launch_failure(monkeypatch):
+ """
+ Test that when a Chrome instance cannot be started or connected to,
+ the Browser._setup_browser_with_instance branch eventually raises a RuntimeError.
+ We simulate failure by:
+ - Forcing requests.get to always raise a ConnectionError (so no existing instance is found).
+ - Monkeypatching subprocess.Popen to do nothing.
+ - Replacing asyncio.sleep to avoid delays.
+ - Having the dummy playwright's connect_over_cdp method always raise an Exception.
+ """
+ def dummy_get(url, timeout):
+ raise requests.ConnectionError("Simulated connection failure")
+ monkeypatch.setattr(requests, "get", dummy_get)
+ monkeypatch.setattr(subprocess, "Popen", lambda args, stdout, stderr: None)
+ async def fake_sleep(seconds):
+ return
+ monkeypatch.setattr(asyncio, "sleep", fake_sleep)
+ class DummyChromium:
+ async def connect_over_cdp(self, endpoint_url, timeout=20000):
+ raise Exception("Connection failed simulation")
+ class DummyPlaywright:
+ def __init__(self):
+ self.chromium = DummyChromium()
+ async def stop(self):
+ pass
+ class DummyAsyncPlaywrightContext:
+ async def start(self):
+ return DummyPlaywright()
+ monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext())
+ config = BrowserConfig(chrome_instance_path="dummy/chrome", extra_chromium_args=["--dummy-arg"])
+ browser_obj = Browser(config=config)
+ with pytest.raises(RuntimeError, match="To start chrome in Debug mode"):
+ await browser_obj.get_playwright_browser()
+ await browser_obj.close()
+@pytest.mark.asyncio
+async def test_get_playwright_browser_caching(monkeypatch):
+ """
+ Test that get_playwright_browser returns a cached browser instance.
+ On the first call, the browser is initialized; on subsequent calls,
+ the same instance is returned.
+ """
+ class DummyBrowser:
+ pass
+ class DummyChromium:
+ async def launch(self, headless, args, proxy=None):
+ return DummyBrowser()
+ class DummyPlaywright:
+ def __init__(self):
+ self.chromium = DummyChromium()
+ async def stop(self):
+ pass
+ class DummyAsyncPlaywrightContext:
+ async def start(self):
+ return DummyPlaywright()
+ monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext())
+ config = BrowserConfig(headless=True, disable_security=False, extra_chromium_args=["--test"])
+ browser_obj = Browser(config=config)
+ first_browser = await browser_obj.get_playwright_browser()
+ second_browser = await browser_obj.get_playwright_browser()
+ assert first_browser is second_browser, "Expected the browser to be cached and reused across calls."
+ await browser_obj.close()
+@pytest.mark.asyncio
+async def test_close_error_handling(monkeypatch):
+ """
+ Test that the close method properly handles exceptions thrown by
+ playwright_browser.close() and playwright.stop(), ensuring that the
+ browser's attributes are set to None even if errors occur.
+ """
+ class DummyBrowserWithError:
+ async def close(self):
+ raise Exception("Close error simulation")
+ class DummyPlaywrightWithError:
+ async def stop(self):
+ raise Exception("Stop error simulation")
+ config = BrowserConfig()
+ browser_obj = Browser(config=config)
+ browser_obj.playwright_browser = DummyBrowserWithError()
+ browser_obj.playwright = DummyPlaywrightWithError()
+ await browser_obj.close()
+ assert browser_obj.playwright_browser is None, "Expected playwright_browser to be None after close"
+ assert browser_obj.playwright is None, "Expected playwright to be None after close"
+@pytest.mark.asyncio
+async def test_standard_browser_launch_with_proxy(monkeypatch):
+ """
+ Test that when a proxy is provided in the BrowserConfig, the _setup_standard_browser method
+ correctly passes the proxy parameter to the playwright.chromium.launch method.
+ This test sets up a dummy async_playwright context and verifies that the dummy proxy is received.
+ """
+ class DummyBrowser:
+ pass
+ # Create a dummy proxy settings instance.
+ dummy_proxy = ProxySettings(server="http://dummy.proxy")
+ class DummyChromium:
+ async def launch(self, headless, args, proxy=None):
+ # Assert that the proxy passed equals the dummy proxy provided in the configuration.
+ assert proxy == dummy_proxy, f"Expected proxy {dummy_proxy} but got {proxy}"
+ # Other launch parameters (headless, args) could be asserted here, but this test focuses on the proxy.
+ return DummyBrowser()
+ class DummyPlaywright:
+ def __init__(self):
+ self.chromium = DummyChromium()
+ async def stop(self):
+ pass
+ class DummyAsyncPlaywrightContext:
+ async def start(self):
+ return DummyPlaywright()
+ # Monkeypatch async_playwright to return our dummy async playwright context.
+ monkeypatch.setattr("browser_use.browser.browser.async_playwright", lambda: DummyAsyncPlaywrightContext())
+ # Create a BrowserConfig with the dummy proxy.
+ config = BrowserConfig(headless=False, disable_security=False, proxy=dummy_proxy)
+ browser_obj = Browser(config=config)
+ # Call get_playwright_browser and verify that the returned browser is as expected.
+ result_browser = await browser_obj.get_playwright_browser()
+ assert isinstance(result_browser, DummyBrowser), "Expected DummyBrowser from _setup_standard_browser with proxy provided"
+ await browser_obj.close()
\ No newline at end of file
diff --git a/tests/test_context.py b/tests/test_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..385f70f48beae07f545c8b3c5a841b64712441ce
--- /dev/null
+++ b/tests/test_context.py
@@ -0,0 +1,326 @@
+import asyncio
+import base64
+import os
+import pytest
+from browser_use.browser.context import BrowserContext, BrowserContextConfig
+from browser_use.browser.views import BrowserState
+from browser_use.dom.views import DOMElementNode
+from unittest.mock import Mock
+
+def test_is_url_allowed():
+ """
+ Test the _is_url_allowed method to verify that it correctly checks URLs against
+ the allowed domains configuration.
+ Scenario 1: When allowed_domains is None, all URLs should be allowed.
+ Scenario 2: When allowed_domains is a list, only URLs matching the allowed domain(s) are allowed.
+ Scenario 3: When the URL is malformed, it should return False.
+ """
+ # Create a dummy Browser mock. Only the 'config' attribute is needed for _is_url_allowed.
+ dummy_browser = Mock()
+ # Set an empty config for dummy_browser; it won't be used in _is_url_allowed.
+ dummy_browser.config = Mock()
+ # Scenario 1: allowed_domains is None, any URL should be allowed.
+ config1 = BrowserContextConfig(allowed_domains=None)
+ context1 = BrowserContext(browser=dummy_browser, config=config1)
+ assert context1._is_url_allowed("http://anydomain.com") is True
+ assert context1._is_url_allowed("https://anotherdomain.org/path") is True
+ # Scenario 2: allowed_domains is provided.
+ allowed = ["example.com", "mysite.org"]
+ config2 = BrowserContextConfig(allowed_domains=allowed)
+ context2 = BrowserContext(browser=dummy_browser, config=config2)
+ # URL exactly matching
+ assert context2._is_url_allowed("http://example.com") is True
+ # URL with subdomain (should be allowed)
+ assert context2._is_url_allowed("http://sub.example.com/path") is True
+ # URL with different domain (should not be allowed)
+ assert context2._is_url_allowed("http://notexample.com") is False
+ # URL that matches second allowed domain
+ assert context2._is_url_allowed("https://mysite.org/page") is True
+ # URL with port number, still allowed (port is stripped)
+ assert context2._is_url_allowed("http://example.com:8080") is True
+ # Scenario 3: Malformed URL or empty domain
+ # urlparse will return an empty netloc for some malformed URLs.
+ assert context2._is_url_allowed("notaurl") is False
+def test_convert_simple_xpath_to_css_selector():
+ """
+ Test the _convert_simple_xpath_to_css_selector method of BrowserContext.
+ This verifies that simple XPath expressions (with and without indices) are correctly converted to CSS selectors.
+ """
+ # Test empty xpath returns empty string
+ assert BrowserContext._convert_simple_xpath_to_css_selector('') == ''
+ # Test a simple xpath without indices
+ xpath = "/html/body/div/span"
+ expected = "html > body > div > span"
+ result = BrowserContext._convert_simple_xpath_to_css_selector(xpath)
+ assert result == expected
+ # Test xpath with an index on one element: [2] should translate to :nth-of-type(2)
+ xpath = "/html/body/div[2]/span"
+ expected = "html > body > div:nth-of-type(2) > span"
+ result = BrowserContext._convert_simple_xpath_to_css_selector(xpath)
+ assert result == expected
+ # Test xpath with indices on multiple elements:
+ # For "li[3]" -> li:nth-of-type(3) and for "a[1]" -> a:nth-of-type(1)
+ xpath = "/ul/li[3]/a[1]"
+ expected = "ul > li:nth-of-type(3) > a:nth-of-type(1)"
+ result = BrowserContext._convert_simple_xpath_to_css_selector(xpath)
+ assert result == expected
+def test_get_initial_state():
+ """
+ Test the _get_initial_state method to verify it returns the correct initial BrowserState.
+ The test checks that when a dummy page with a URL is provided,
+ the returned state contains that URL and other default values.
+ """
+ # Create a dummy browser since only its existence is needed.
+ dummy_browser = Mock()
+ dummy_browser.config = Mock()
+ context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig())
+ # Define a dummy page with a 'url' attribute.
+ class DummyPage:
+ url = "http://dummy.com"
+ dummy_page = DummyPage()
+ # Call _get_initial_state with a page: URL should be set from page.url.
+ state_with_page = context._get_initial_state(page=dummy_page)
+ assert state_with_page.url == dummy_page.url
+ # Verify that the element_tree is initialized with tag 'root'
+ assert state_with_page.element_tree.tag_name == 'root'
+ # Call _get_initial_state without a page: URL should be empty.
+ state_without_page = context._get_initial_state()
+ assert state_without_page.url == ""
+@pytest.mark.asyncio
+async def test_execute_javascript():
+ """
+ Test the execute_javascript method by mocking the current page's evaluate function.
+ This ensures that when execute_javascript is called, it correctly returns the value
+ from the page's evaluate method.
+ """
+ # Define a dummy page with an async evaluate method.
+ class DummyPage:
+ async def evaluate(self, script):
+ return "dummy_result"
+ # Create a dummy session object with a dummy current_page.
+ dummy_session = type("DummySession", (), {})()
+ dummy_session.current_page = DummyPage()
+ # Create a dummy browser mock with a minimal config.
+ dummy_browser = Mock()
+ dummy_browser.config = Mock()
+ # Initialize the BrowserContext with the dummy browser and config.
+ context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig())
+ # Manually set the session to our dummy session.
+ context.session = dummy_session
+ # Call execute_javascript and verify it returns the expected result.
+ result = await context.execute_javascript("return 1+1")
+ assert result == "dummy_result"
+@pytest.mark.asyncio
+async def test_enhanced_css_selector_for_element():
+ """
+ Test the _enhanced_css_selector_for_element method to verify that
+ it returns the correct CSS selector string for a dummy DOMElementNode.
+ The test checks that:
+ - The provided xpath is correctly converted (handling indices),
+ - Class attributes are appended as CSS classes,
+ - Standard and dynamic attributes (including ones with special characters)
+ are correctly added to the selector.
+ """
+ # Create a dummy DOMElementNode instance with a complex set of attributes.
+ dummy_element = DOMElementNode(
+ tag_name="div",
+ is_visible=True,
+ parent=None,
+ xpath="/html/body/div[2]",
+ attributes={
+ "class": "foo bar",
+ "id": "my-id",
+ "placeholder": 'some "quoted" text',
+ "data-testid": "123"
+ },
+ children=[]
+ )
+ # Call the method with include_dynamic_attributes=True.
+ actual_selector = BrowserContext._enhanced_css_selector_for_element(dummy_element, include_dynamic_attributes=True)
+ # Expected conversion:
+ # 1. The xpath "/html/body/div[2]" converts to "html > body > div:nth-of-type(2)".
+ # 2. The class attribute "foo bar" appends ".foo.bar".
+ # 3. The "id" attribute is added as [id="my-id"].
+ # 4. The "placeholder" attribute contains quotes; it is added as
+ # [placeholder*="some \"quoted\" text"].
+ # 5. The dynamic attribute "data-testid" is added as [data-testid="123"].
+ expected_selector = 'html > body > div:nth-of-type(2).foo.bar[id="my-id"][placeholder*="some \\"quoted\\" text"][data-testid="123"]'
+ assert actual_selector == expected_selector, f"Expected {expected_selector}, but got {actual_selector}"
+@pytest.mark.asyncio
+async def test_get_scroll_info():
+ """
+ Test the get_scroll_info method by mocking the page's evaluate method.
+ This dummy page returns preset values for window.scrollY, window.innerHeight,
+ and document.documentElement.scrollHeight. The test then verifies that the
+ computed scroll information (pixels_above and pixels_below) match the expected values.
+ """
+ # Define a dummy page with an async evaluate method returning preset values.
+ class DummyPage:
+ async def evaluate(self, script):
+ if "window.scrollY" in script:
+ return 100 # scrollY
+ elif "window.innerHeight" in script:
+ return 500 # innerHeight
+ elif "document.documentElement.scrollHeight" in script:
+ return 1200 # total scrollable height
+ return None
+ # Create a dummy session with a dummy current_page.
+ dummy_session = type("DummySession", (), {})()
+ dummy_session.current_page = DummyPage()
+ # We also need a dummy context attribute but it won't be used in this test.
+ dummy_session.context = type("DummyContext", (), {})()
+ # Create a dummy browser mock.
+ dummy_browser = Mock()
+ dummy_browser.config = Mock()
+ # Initialize BrowserContext with the dummy browser and config.
+ context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig())
+ # Manually set the session to our dummy session.
+ context.session = dummy_session
+ # Call get_scroll_info on the dummy page.
+ pixels_above, pixels_below = await context.get_scroll_info(dummy_session.current_page)
+ # Expected calculations:
+ # pixels_above = scrollY = 100
+ # pixels_below = total_height - (scrollY + innerHeight) = 1200 - (100 + 500) = 600
+ assert pixels_above == 100, f"Expected 100 pixels above, got {pixels_above}"
+ assert pixels_below == 600, f"Expected 600 pixels below, got {pixels_below}"
+@pytest.mark.asyncio
+async def test_reset_context():
+ """
+ Test the reset_context method to ensure it correctly closes all existing tabs,
+ resets the cached state, and creates a new page.
+ """
+ # Dummy Page with close and wait_for_load_state methods.
+ class DummyPage:
+ def __init__(self, url="http://dummy.com"):
+ self.url = url
+ self.closed = False
+ async def close(self):
+ self.closed = True
+ async def wait_for_load_state(self):
+ pass
+ # Dummy Context that holds pages and can create a new page.
+ class DummyContext:
+ def __init__(self):
+ self.pages = []
+ async def new_page(self):
+ new_page = DummyPage(url="")
+ self.pages.append(new_page)
+ return new_page
+ # Create a dummy session with a context containing two pages.
+ dummy_session = type("DummySession", (), {})()
+ dummy_context = DummyContext()
+ page1 = DummyPage(url="http://page1.com")
+ page2 = DummyPage(url="http://page2.com")
+ dummy_context.pages.extend([page1, page2])
+ dummy_session.context = dummy_context
+ dummy_session.current_page = page1
+ dummy_session.cached_state = None
+ # Create a dummy browser mock.
+ dummy_browser = Mock()
+ dummy_browser.config = Mock()
+ # Initialize BrowserContext using our dummy_browser and config,
+ # and manually set its session to our dummy session.
+ context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig())
+ context.session = dummy_session
+ # Confirm session has 2 pages before reset.
+ assert len(dummy_session.context.pages) == 2
+ # Call reset_context which should close existing pages,
+ # reset the cached state, and create a new page as current_page.
+ await context.reset_context()
+ # Verify that initial pages were closed.
+ assert page1.closed is True
+ assert page2.closed is True
+ # Check that a new page is created and set as current_page.
+ assert dummy_session.current_page is not None
+ new_page = dummy_session.current_page
+ # New page URL is empty because DummyContext.new_page creates DummyPage(url="").
+ assert new_page.url == ""
+ # Verify that cached_state is reset to an initial BrowserState.
+ state = dummy_session.cached_state
+ assert isinstance(state, BrowserState)
+ assert state.url == ""
+ assert state.element_tree.tag_name == 'root'
+@pytest.mark.asyncio
+async def test_take_screenshot():
+ """
+ Test the take_screenshot method to verify that it returns a base64 encoded screenshot string.
+ A dummy page with a mocked screenshot method is used, returning a predefined byte string.
+ """
+ class DummyPage:
+ async def screenshot(self, full_page, animations):
+ # Verify that parameters are forwarded correctly.
+ assert full_page is True, "full_page parameter was not correctly passed"
+ assert animations == 'disabled', "animations parameter was not correctly passed"
+ # Return a test byte string.
+ return b'test'
+ # Create a dummy session with the DummyPage as the current_page.
+ dummy_session = type("DummySession", (), {})()
+ dummy_session.current_page = DummyPage()
+ dummy_session.context = None # Not used in this test
+ # Create a dummy browser mock.
+ dummy_browser = Mock()
+ dummy_browser.config = Mock()
+ # Initialize the BrowserContext with the dummy browser and config.
+ context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig())
+ # Manually set the session to our dummy session.
+ context.session = dummy_session
+ # Call take_screenshot and check that it returns the expected base64 encoded string.
+ result = await context.take_screenshot(full_page=True)
+ expected = base64.b64encode(b'test').decode('utf-8')
+ assert result == expected, f"Expected {expected}, but got {result}"
+@pytest.mark.asyncio
+async def test_refresh_page_behavior():
+ """
+ Test the refresh_page method of BrowserContext to verify that it correctly reloads the current page
+ and waits for the page's load state. This is done by creating a dummy page that flags when its
+ reload and wait_for_load_state methods are called.
+ """
+ class DummyPage:
+ def __init__(self):
+ self.reload_called = False
+ self.wait_for_load_state_called = False
+ async def reload(self):
+ self.reload_called = True
+ async def wait_for_load_state(self):
+ self.wait_for_load_state_called = True
+ # Create a dummy session with the dummy page as the current_page.
+ dummy_page = DummyPage()
+ dummy_session = type("DummySession", (), {})()
+ dummy_session.current_page = dummy_page
+ dummy_session.context = None # Not required for this test
+ # Create a dummy browser mock
+ dummy_browser = Mock()
+ dummy_browser.config = Mock()
+ # Initialize BrowserContext with the dummy browser and config,
+ # and manually set its session to our dummy session.
+ context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig())
+ context.session = dummy_session
+ # Call refresh_page and verify that reload and wait_for_load_state were called.
+ await context.refresh_page()
+ assert dummy_page.reload_called is True, "Expected the page to call reload()"
+ assert dummy_page.wait_for_load_state_called is True, "Expected the page to call wait_for_load_state()"
+@pytest.mark.asyncio
+async def test_remove_highlights_failure():
+ """
+ Test the remove_highlights method to ensure that if the page.evaluate call fails,
+ the exception is caught and does not propagate (i.e. the method handles errors gracefully).
+ """
+ # Dummy page that always raises an exception when evaluate is called.
+ class DummyPage:
+ async def evaluate(self, script):
+ raise Exception("dummy error")
+ # Create a dummy session with the DummyPage as current_page.
+ dummy_session = type("DummySession", (), {})()
+ dummy_session.current_page = DummyPage()
+ dummy_session.context = None # Not used in this test
+ # Create a dummy browser mock.
+ dummy_browser = Mock()
+ dummy_browser.config = Mock()
+ # Initialize BrowserContext with the dummy browser and configuration.
+ context = BrowserContext(browser=dummy_browser, config=BrowserContextConfig())
+ context.session = dummy_session
+ # Call remove_highlights and verify that no exception is raised.
+ try:
+ await context.remove_highlights()
+ except Exception as e:
+ pytest.fail(f"remove_highlights raised an exception: {e}")
\ No newline at end of file
diff --git a/tests/test_core_functionality.py b/tests/test_core_functionality.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6b1f271a726aea0dc9910a0b2f3944953e4cbdb
--- /dev/null
+++ b/tests/test_core_functionality.py
@@ -0,0 +1,202 @@
+import asyncio
+import os
+
+import pytest
+from langchain_openai import AzureChatOpenAI
+from pydantic import SecretStr
+
+from browser_use.agent.service import Agent
+from browser_use.agent.views import AgentHistoryList
+from browser_use.browser.browser import Browser, BrowserConfig
+
+
+@pytest.fixture(scope='function')
+def event_loop():
+ """Create an instance of the default event loop for each test case."""
+ loop = asyncio.get_event_loop_policy().new_event_loop()
+ yield loop
+ loop.close()
+
+
+@pytest.fixture(scope='function')
+async def browser(event_loop):
+ browser_instance = Browser(
+ config=BrowserConfig(
+ headless=True,
+ )
+ )
+ yield browser_instance
+ await browser_instance.close()
+
+
+@pytest.fixture
+async def context(browser):
+ async with await browser.new_context() as context:
+ yield context
+
+
+@pytest.fixture
+def llm():
+ """Initialize language model for testing"""
+ return AzureChatOpenAI(
+ model='gpt-4o',
+ api_version='2024-10-21',
+ azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''),
+ api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')),
+ )
+
+
+# pytest -s -k test_search_google
+@pytest.mark.asyncio
+async def test_search_google(llm, context):
+ """Test 'Search Google' action"""
+ agent = Agent(
+ task="Search Google for 'OpenAI'.",
+ llm=llm,
+ browser_context=context,
+ )
+ history: AgentHistoryList = await agent.run(max_steps=2)
+ action_names = history.action_names()
+ assert 'search_google' in action_names
+
+
+@pytest.mark.asyncio
+async def test_go_to_url(llm, context):
+ """Test 'Navigate to URL' action"""
+ agent = Agent(
+ task="Navigate to 'https://www.python.org'.",
+ llm=llm,
+ browser_context=context,
+ )
+ history = await agent.run(max_steps=2)
+ action_names = history.action_names()
+ assert 'go_to_url' in action_names
+
+
+@pytest.mark.asyncio
+async def test_go_back(llm, context):
+ """Test 'Go back' action"""
+ agent = Agent(
+ task="Go to 'https://www.example.com', then go back.",
+ llm=llm,
+ browser_context=context,
+ )
+ history = await agent.run(max_steps=3)
+ action_names = history.action_names()
+ assert 'go_to_url' in action_names
+ assert 'go_back' in action_names
+
+
+@pytest.mark.asyncio
+async def test_click_element(llm, context):
+ """Test 'Click element' action"""
+ agent = Agent(
+ task="Go to 'https://www.python.org' and click on the first link.",
+ llm=llm,
+ browser_context=context,
+ )
+ history = await agent.run(max_steps=4)
+ action_names = history.action_names()
+ assert 'go_to_url' in action_names or 'open_tab' in action_names
+ assert 'click_element' in action_names
+
+
+@pytest.mark.asyncio
+async def test_input_text(llm, context):
+ """Test 'Input text' action"""
+ agent = Agent(
+ task="Go to 'https://www.google.com' and input 'OpenAI' into the search box.",
+ llm=llm,
+ browser_context=context,
+ )
+ history = await agent.run(max_steps=4)
+ action_names = history.action_names()
+ assert 'go_to_url' in action_names
+ assert 'input_text' in action_names
+
+
+@pytest.mark.asyncio
+async def test_switch_tab(llm, context):
+ """Test 'Switch tab' action"""
+ agent = Agent(
+ task="Open new tabs with 'https://www.google.com' and 'https://www.wikipedia.org', then switch to the first tab.",
+ llm=llm,
+ browser_context=context,
+ )
+ history = await agent.run(max_steps=6)
+ action_names = history.action_names()
+ open_tab_count = action_names.count('open_tab')
+ assert open_tab_count >= 2
+ assert 'switch_tab' in action_names
+
+
+@pytest.mark.asyncio
+async def test_open_new_tab(llm, context):
+ """Test 'Open new tab' action"""
+ agent = Agent(
+ task="Open a new tab and go to 'https://www.example.com'.",
+ llm=llm,
+ browser_context=context,
+ )
+ history = await agent.run(max_steps=3)
+ action_names = history.action_names()
+ assert 'open_tab' in action_names
+
+
+@pytest.mark.asyncio
+async def test_extract_page_content(llm, context):
+ """Test 'Extract page content' action"""
+ agent = Agent(
+ task="Go to 'https://www.example.com' and extract the page content.",
+ llm=llm,
+ browser_context=context,
+ )
+ history = await agent.run(max_steps=3)
+ action_names = history.action_names()
+ assert 'go_to_url' in action_names
+ assert 'extract_content' in action_names
+
+
+# pytest -k test_done_action
+@pytest.mark.asyncio
+async def test_done_action(llm, context):
+ """Test 'Complete task' action"""
+ agent = Agent(
+ task="Navigate to 'https://www.example.com' and signal that the task is done.",
+ llm=llm,
+ browser_context=context,
+ )
+
+ history = await agent.run(max_steps=3)
+ action_names = history.action_names()
+ assert 'go_to_url' in action_names
+ assert 'done' in action_names
+
+
+# run with: pytest -k test_scroll_down
+@pytest.mark.asyncio
+async def test_scroll_down(llm, context):
+ """Test 'Scroll down' action and validate that the page actually scrolled"""
+ agent = Agent(
+ task="Go to 'https://en.wikipedia.org/wiki/Internet' and scroll down the page.",
+ llm=llm,
+ browser_context=context,
+ )
+ # Get the current page from the browser context
+ page = await context.get_current_page()
+
+ # Navigate to the page and get initial scroll position
+ await agent.run(max_steps=1)
+ initial_scroll_position = await page.evaluate('window.scrollY;')
+
+ # Perform the scroll down action
+ await agent.run(max_steps=2)
+ final_scroll_position = await page.evaluate('window.scrollY;')
+
+ # Validate that the scroll position has changed
+ assert final_scroll_position > initial_scroll_position, 'Page did not scroll down'
+
+ # Validate that the 'scroll_down' action was executed
+ history = agent.history
+ action_names = history.action_names()
+ assert 'scroll_down' in action_names
diff --git a/tests/test_dropdown.py b/tests/test_dropdown.py
new file mode 100644
index 0000000000000000000000000000000000000000..374d432279c4170e656594ccf4c6cbbc9e14ee76
--- /dev/null
+++ b/tests/test_dropdown.py
@@ -0,0 +1,37 @@
+"""
+Test dropdown interaction functionality.
+"""
+import pytest
+from browser_use.agent.service import Agent
+from browser_use.agent.views import AgentHistoryList
+
+@pytest.mark.asyncio
+async def test_dropdown(llm, browser_context):
+ """Test selecting an option from a dropdown menu."""
+ agent = Agent(
+ task=(
+ 'go to https://codepen.io/geheimschriftstift/pen/mPLvQz and first get all options for the dropdown and then select the 5th option'
+ ),
+ llm=llm,
+ browser_context=browser_context,
+ )
+
+ try:
+ history: AgentHistoryList = await agent.run(20)
+ result = history.final_result()
+
+ # Verify dropdown interaction
+ assert result is not None
+ assert 'Duck' in result, "Expected 5th option 'Duck' to be selected"
+
+ # Verify dropdown state
+ element = await browser_context.get_element_by_selector('select')
+ assert element is not None, "Dropdown element should exist"
+
+ value = await element.evaluate('el => el.value')
+ assert value == '5', "Dropdown should have 5th option selected"
+
+ except Exception as e:
+ pytest.fail(f"Dropdown test failed: {str(e)}")
+ finally:
+ await browser_context.close()
diff --git a/tests/test_dropdown_complex.py b/tests/test_dropdown_complex.py
new file mode 100644
index 0000000000000000000000000000000000000000..774e34203dbceb55cc6db97ecf544cf467c62d99
--- /dev/null
+++ b/tests/test_dropdown_complex.py
@@ -0,0 +1,41 @@
+"""
+Test complex dropdown interaction functionality.
+"""
+import pytest
+from browser_use.agent.service import Agent
+from browser_use.agent.views import AgentHistoryList
+
+@pytest.mark.asyncio
+async def test_dropdown_complex(llm, browser_context):
+ """Test selecting an option from a complex dropdown menu."""
+ agent = Agent(
+ task=(
+ 'go to https://codepen.io/shyam-king/pen/pvzpByJ and first get all options for the dropdown and then select the json option'
+ ),
+ llm=llm,
+ browser_context=browser_context,
+ )
+
+ try:
+ history: AgentHistoryList = await agent.run(20)
+ result = history.final_result()
+
+ # Verify dropdown interaction
+ assert result is not None
+ assert 'json' in result.lower(), "Expected 'json' option to be selected"
+
+ # Verify dropdown state
+ element = await browser_context.get_element_by_selector('.select-selected')
+ assert element is not None, "Custom dropdown element should exist"
+
+ text = await element.text_content()
+ assert 'json' in text.lower(), "Dropdown should display json option"
+
+ # Verify the selected option's effect
+ code_element = await browser_context.get_element_by_selector('pre code')
+ assert code_element is not None, "Code element should be visible when JSON is selected"
+
+ except Exception as e:
+ pytest.fail(f"Complex dropdown test failed: {str(e)}")
+ finally:
+ await browser_context.close()
diff --git a/tests/test_dropdown_error.py b/tests/test_dropdown_error.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f23bc9324f3c54d313cbeda373f8d62c8d231e5
--- /dev/null
+++ b/tests/test_dropdown_error.py
@@ -0,0 +1,42 @@
+"""
+Simple end-to-end trial run of the agent.
+
+@dev You need to add OPENAI_API_KEY to your environment variables.
+"""
+
+import os
+import sys
+
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent, AgentHistoryList
+
+llm = ChatOpenAI(model='gpt-4o')
+# browser = Browser(config=BrowserConfig(headless=False))
+
+agent = Agent(
+ task=(
+ 'go to https://codepen.io/shyam-king/pen/emOyjKm and select number "4" and return the output of "selected value"'
+ ),
+ llm=llm,
+ browser_context=BrowserContext(
+ browser=Browser(config=BrowserConfig(headless=False, disable_security=True)),
+ ),
+)
+
+
+async def test_dropdown():
+ history: AgentHistoryList = await agent.run(20)
+ # await controller.browser.close(force=True)
+
+ result = history.final_result()
+ assert result is not None
+ assert '4' in result
+ print(result)
+
+ # await browser.close()
diff --git a/tests/test_excluded_actions.py b/tests/test_excluded_actions.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4544283bf2f4e0be5e0b00922bd827a41eee2e2
--- /dev/null
+++ b/tests/test_excluded_actions.py
@@ -0,0 +1,98 @@
+import asyncio
+import os
+
+import pytest
+from langchain_openai import AzureChatOpenAI
+from pydantic import SecretStr
+
+from browser_use.agent.service import Agent
+from browser_use.agent.views import AgentHistoryList
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.controller.service import Controller
+
+# run with:
+# python -m pytest tests/test_excluded_actions.py -v -k "test_only_open_tab_allowed" --capture=no
+
+
@pytest.fixture(scope='session')
def event_loop():
    """Provide one session-scoped event loop, closed at fixture teardown."""
    policy = asyncio.get_event_loop_policy()
    fresh_loop = policy.new_event_loop()
    yield fresh_loop
    fresh_loop.close()
+
+
@pytest.fixture(scope='session')
async def browser(event_loop):
    """Session-scoped headless Browser; closed once the test session ends."""
    browser_instance = Browser(
        config=BrowserConfig(
            headless=True,
        )
    )
    yield browser_instance
    await browser_instance.close()
+
+
@pytest.fixture
async def context(browser):
    """Fresh BrowserContext per test; the async context manager closes it."""
    async with await browser.new_context() as context:
        yield context
+
+
@pytest.fixture
def llm():
    """Initialize language model for testing.

    Reads AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_KEY from the environment;
    empty strings when unset (requests will then fail at call time).
    """
    return AzureChatOpenAI(
        model='gpt-4o',
        api_version='2024-10-21',
        azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''),
        api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')),
    )
+
+
+# pytest tests/test_excluded_actions.py -v -k "test_only_open_tab_allowed" --capture=no
@pytest.mark.asyncio
async def test_only_open_tab_allowed(llm, context):
    """Test that only open_tab action is available while others are excluded"""

    # Exclude every default action except open_tab.
    default_actions_to_exclude = [
        'search_google',
        'go_to_url',
        'go_back',
        'click_element',
        'input_text',
        'switch_tab',
        'extract_content',
        'done',
        'scroll_down',
        'scroll_up',
        'send_keys',
        'scroll_to_text',
        'get_dropdown_options',
        'select_dropdown_option',
    ]

    controller = Controller(exclude_actions=default_actions_to_exclude)

    # This task would normally trigger navigation/search actions.
    agent = Agent(
        task="Go to google.com and search for 'python programming'",
        llm=llm,
        browser_context=context,
        controller=controller,
    )

    history: AgentHistoryList = await agent.run(max_steps=2)
    used_actions = history.action_names()

    # Every recorded action must be open_tab …
    unexpected = [name for name in used_actions if name != 'open_tab']
    assert not unexpected, f'Found unexpected actions: {unexpected}'

    # … and open_tab must have been used at least once.
    assert 'open_tab' in used_actions, 'open_tab action was not used'
diff --git a/tests/test_full_screen.py b/tests/test_full_screen.py
new file mode 100644
index 0000000000000000000000000000000000000000..011f696fa9f22b20d96c0c01aeb667c3034e6964
--- /dev/null
+++ b/tests/test_full_screen.py
@@ -0,0 +1,21 @@
+import asyncio
+
+from playwright.async_api import async_playwright
+
+
async def test_full_screen(start_fullscreen: bool, maximize: bool):
    """Open Chromium on google.com using the requested window mode.

    Args:
        start_fullscreen: launch Chromium with --start-fullscreen.
        maximize: launch Chromium with --start-maximized.

    Fix: both parameters were previously ignored and --start-maximized was
    always passed; the launch args are now derived from the arguments.
    """
    launch_args = []
    if start_fullscreen:
        launch_args.append('--start-fullscreen')
    if maximize:
        launch_args.append('--start-maximized')

    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=False,
            args=launch_args,
        )
        # no_viewport/viewport=None lets the page track the real window size.
        context = await browser.new_context(no_viewport=True, viewport=None)
        page = await context.new_page()
        await page.goto('https://google.com')

        # Keep the window open briefly for manual inspection.
        await asyncio.sleep(10)
        await browser.close()
+
+
if __name__ == '__main__':
    # Manual run: plain window (neither fullscreen nor maximized).
    asyncio.run(test_full_screen(False, False))
diff --git a/tests/test_gif_path.py b/tests/test_gif_path.py
new file mode 100644
index 0000000000000000000000000000000000000000..3839a1145b7ec2a392b0311ffee27250bed36d24
--- /dev/null
+++ b/tests/test_gif_path.py
@@ -0,0 +1,43 @@
+"""
+Simple try of the agent.
+
+@dev You need to add OPENAI_API_KEY to your environment variables.
+"""
+
+import os
+import sys
+
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent, AgentHistoryList
+
llm = ChatOpenAI(model='gpt-4o')

# Module-level agent that records its run to ./google.gif via generate_gif.
# NOTE(review): built at import time — runs during pytest collection setup.
agent = Agent(
    task=(
        'go to google.com and search for text "hi there"'
    ),
    llm=llm,
    browser_context=BrowserContext(
        browser=Browser(config=BrowserConfig(headless=False, disable_security=True)),
    ),
    generate_gif="./google.gif"
)
+
+
async def test_gif_path():
    """Run the agent and check that the recording GIF is written to disk."""
    gif_file = "./google.gif"
    # Start from a clean slate so the existence check below is meaningful.
    if os.path.exists(gif_file):
        os.unlink(gif_file)

    history: AgentHistoryList = await agent.run(20)

    assert history.final_result() is not None
    assert os.path.exists(gif_file), "google.gif was not created"
+
diff --git a/tests/test_mind2web.py b/tests/test_mind2web.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bbd3c907977f768923cf6d4310493d463cea9b6
--- /dev/null
+++ b/tests/test_mind2web.py
@@ -0,0 +1,137 @@
+"""
+Test browser automation using Mind2Web dataset tasks with pytest framework.
+"""
+
+import asyncio
+import json
+import os
+from typing import Any, Dict, List
+
+import pytest
+from langchain_openai import AzureChatOpenAI
+from pydantic import SecretStr
+
+from browser_use.agent.service import Agent
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.utils import logger
+
# Constants
MAX_STEPS = 50  # NOTE(review): defined but not referenced in this module — confirm
TEST_SUBSET_SIZE = 10  # number of Mind2Web cases loaded by the test_cases fixture
+
+
@pytest.fixture(scope='session')
def event_loop():
    """Session-scoped event loop for the async fixtures/tests in this module."""
    loop = asyncio.get_event_loop_policy().new_event_loop()
    yield loop
    loop.close()
+
+
@pytest.fixture(scope='session')
async def browser(event_loop):
    """Session-scoped headless Browser; closed after the whole session."""
    browser_instance = Browser(
        config=BrowserConfig(
            headless=True,
        )
    )
    yield browser_instance
    await browser_instance.close()
+
+
@pytest.fixture
async def context(browser):
    """Fresh BrowserContext per test, closed automatically on teardown."""
    async with await browser.new_context() as new_context:
        yield new_context
+
+
@pytest.fixture(scope='session')
def test_cases() -> List[Dict[str, Any]]:
    """Load test cases from Mind2Web dataset"""
    file_path = os.path.join(os.path.dirname(__file__), 'mind2web_data/processed.json')
    logger.info(f'Loading test cases from {file_path}')

    with open(file_path, 'r') as f:
        all_cases = json.load(f)

    # Only the first TEST_SUBSET_SIZE cases are exercised.
    chosen = all_cases[:TEST_SUBSET_SIZE]
    logger.info(f'Loaded {len(chosen)}/{len(all_cases)} test cases')
    return chosen
+
+
@pytest.fixture
def llm():
    """Initialize language model for testing.

    Credentials come from AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_KEY.
    """

    # return ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None)
    return AzureChatOpenAI(
        model='gpt-4o',
        api_version='2024-10-21',
        azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''),
        api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')),
    )
+
+
# run with: pytest -s -v tests/test_mind2web.py::test_random_samples
@pytest.mark.asyncio
async def test_random_samples(test_cases: List[Dict[str, Any]], llm, context):
    """Test a random sampling of tasks across different websites.

    Fix: removed the unused `validator` parameter — no fixture of that name
    is defined in this module, so pytest errored at collection with
    "fixture 'validator' not found".
    """
    import random

    logger.info('=== Testing Random Samples ===')

    # Take random samples
    samples = random.sample(test_cases, 1)

    for i, case in enumerate(samples, 1):
        task = f"Go to {case['website']}.com and {case['confirmed_task']}"
        logger.info(f'--- Random Sample {i}/{len(samples)} ---')
        logger.info(f'Task: {task}\n')

        agent = Agent(task, llm, browser_context=context)

        await agent.run()

        logger.info('Validating random sample task...')

        # TODO: Validate the task
+
+
def test_dataset_integrity(test_cases):
    """Validate that every Mind2Web case has the required fields and types.

    Fixes: the success log message contained a mojibake (broken emoji)
    character, and failures used `assert False`, which is stripped under
    `python -O`; they now use pytest.fail.
    """
    logger.info('\n=== Testing Dataset Integrity ===')

    required_fields = ['website', 'confirmed_task', 'action_reprs']
    missing_fields = []

    logger.info(f'Checking {len(test_cases)} test cases for required fields')

    for i, case in enumerate(test_cases, 1):
        logger.debug(f'Checking case {i}/{len(test_cases)}')

        # Missing keys are collected (not failed immediately) so one report
        # covers the whole subset.
        for field in required_fields:
            if field not in case:
                missing_fields.append(f'Case {i}: {field}')
                logger.warning(f"Missing field '{field}' in case {i}")

        # Type checks fail fast: later assertions depend on these shapes.
        if not isinstance(case.get('confirmed_task'), str):
            logger.error(f"Case {i}: 'confirmed_task' must be string")
            pytest.fail('Task must be string')

        if not isinstance(case.get('action_reprs'), list):
            logger.error(f"Case {i}: 'action_reprs' must be list")
            pytest.fail('Actions must be list')

        if len(case.get('action_reprs', [])) == 0:
            logger.error(f"Case {i}: 'action_reprs' must not be empty")
            pytest.fail('Must have at least one action')

    if missing_fields:
        logger.error('Dataset integrity check failed')
        pytest.fail(f'Missing fields: {missing_fields}')
    else:
        logger.info('✅ Dataset integrity check passed')
+
+
if __name__ == '__main__':
    # Allow running this module directly without the pytest CLI.
    pytest.main([__file__, '-v'])
diff --git a/tests/test_models.py b/tests/test_models.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d5f30b7ec7eea9365c854f50136836c1c0c5933
--- /dev/null
+++ b/tests/test_models.py
@@ -0,0 +1,159 @@
+import asyncio
+import os
+
+import pytest
+import requests
+from langchain_anthropic import ChatAnthropic
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain_ollama import ChatOllama
+from langchain_openai import AzureChatOpenAI, ChatOpenAI
+from pydantic import SecretStr
+
+from browser_use.agent.service import Agent
+from browser_use.agent.views import AgentHistoryList
+from browser_use.browser.browser import Browser, BrowserConfig
+
+
@pytest.fixture(scope='function')
def event_loop():
    """Create an instance of the default event loop for each test case."""
    # function scope: each parametrized model test gets an isolated loop.
    loop = asyncio.get_event_loop_policy().new_event_loop()
    yield loop
    loop.close()
+
+
@pytest.fixture(scope='function')
async def browser(event_loop):
    """Per-test headless Browser, closed on teardown."""
    browser_instance = Browser(
        config=BrowserConfig(
            headless=True,
        )
    )
    yield browser_instance
    await browser_instance.close()
+
+
@pytest.fixture
async def context(browser):
    """Fresh BrowserContext per test; closed by the async context manager."""
    async with await browser.new_context() as context:
        yield context
+
+
# API keys read from the environment; empty SecretStr when unset so the
# parametrized fixture below can be constructed at collection time.
api_key_gemini = SecretStr(os.getenv('GEMINI_API_KEY') or '')
api_key_deepseek = SecretStr(os.getenv('DEEPSEEK_API_KEY') or '')
api_key_anthropic = SecretStr(os.getenv('ANTHROPIC_API_KEY') or '')
+
+
# pytest -s -v tests/test_models.py
# Parametrized fixture: each id below corresponds positionally to one client
# in `params`; all clients are constructed at collection time.
@pytest.fixture(
    params=[
        ChatOpenAI(model='gpt-4o'),
        ChatOpenAI(model='gpt-4o-mini'),
        AzureChatOpenAI(
            model='gpt-4o',
            api_version='2024-10-21',
            azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''),
            api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')),
        ),
        # ChatOpenAI(
        #     base_url='https://api.deepseek.com/v1',
        #     model='deepseek-reasoner',
        #     api_key=api_key_deepseek,
        # ),
        # run: ollama start
        ChatOllama(
            model='qwen2.5:latest',
            num_ctx=128000,
        ),
        AzureChatOpenAI(
            model='gpt-4o-mini',
            api_version='2024-10-21',
            azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''),
            api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')),
        ),
        ChatAnthropic(
            model_name='claude-3-5-sonnet-20240620',
            timeout=100,
            temperature=0.0,
            stop=None,
            api_key=api_key_anthropic,
        ),
        ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=api_key_gemini),
        ChatGoogleGenerativeAI(model='gemini-1.5-pro', api_key=api_key_gemini),
        ChatGoogleGenerativeAI(model='gemini-1.5-flash-latest', api_key=api_key_gemini),
        ChatOpenAI(
            base_url='https://api.deepseek.com/v1',
            model='deepseek-chat',
            api_key=api_key_deepseek,
        ),
    ],
    ids=[
        'gpt-4o',
        'gpt-4o-mini',
        'azure-gpt-4o',
        #'deepseek-reasoner',
        'qwen2.5:latest',
        'azure-gpt-4o-mini',
        'claude-3-5-sonnet',
        'gemini-2.0-flash-exp',
        'gemini-1.5-pro',
        'gemini-1.5-flash-latest',
        'deepseek-chat',
    ],
)
async def llm(request):
    # `request.param` is the already-constructed chat client for this id.
    return request.param
+
+
@pytest.mark.asyncio
async def test_model_search(llm, context):
    """Test 'Search Google' action across every parametrized model.

    Fixes: requests.get now has a timeout (a bare call can hang forever);
    the ollama health check raises an explicit exception instead of a bare
    `raise` with no active exception (which raised RuntimeError); the
    mojibake pass/fail markers were replaced with valid characters; the
    duplicated hasattr vision check reuses the already-computed model name.
    """
    model_name = llm.model if hasattr(llm, 'model') else llm.model_name
    print(f'\nTesting model: {model_name}')

    # Vision is disabled for models that cannot accept image input.
    models_without_vision = ['deepseek-chat', 'deepseek-reasoner']
    use_vision = model_name not in models_without_vision

    # Local models require a running ollama server on the default port.
    local_models = ['qwen2.5:latest']
    if model_name in local_models:
        try:
            response = requests.get('http://127.0.0.1:11434/', timeout=5)
        except Exception:
            raise Exception('Ollama is not running - start with `ollama start`')
        if response.status_code != 200:
            raise Exception('Ollama is not running - start with `ollama start`')

    agent = Agent(
        task="Search Google for 'elon musk' then click on the first result and scroll down.",
        llm=llm,
        browser_context=context,
        max_failures=2,
        use_vision=use_vision,
    )
    history: AgentHistoryList = await agent.run(max_steps=2)
    done = history.is_done()
    successful = history.is_successful()
    action_names = history.action_names()
    print(f'Actions performed: {action_names}')
    errors = '\n'.join(e for e in history.errors() if e is not None)

    # Any of these actions shows the model produced a usable first step.
    passed = any(name in action_names for name in ('search_google', 'go_to_url', 'open_tab'))
    status = '✅ PASSED - ' if passed else '❌ FAILED - '
    print(f'Model {model_name}: {status} Done: {done} Successful: {successful}')

    assert passed, f'Model {model_name} not working\nActions performed: {action_names}\nErrors: {errors}'
diff --git a/tests/test_qwen.py b/tests/test_qwen.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a333430712d26e0c4fd67dcb827e6a9b197b2a6
--- /dev/null
+++ b/tests/test_qwen.py
@@ -0,0 +1,66 @@
+import asyncio
+
+import pytest
+from langchain_ollama import ChatOllama
+
+from browser_use.agent.service import Agent
+from browser_use.agent.views import AgentHistoryList
+from browser_use.browser.browser import Browser, BrowserConfig
+
+
@pytest.fixture
def llm():
    """Initialize language model for testing"""

    # return ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None)
    # NOTE: Make sure to run ollama server with `ollama start'
    # 128k context window so the full browser state fits in the prompt.
    return ChatOllama(
        model='qwen2.5:latest',
        num_ctx=128000,
    )
+
+
@pytest.fixture(scope='session')
def event_loop():
    """Create an instance of the default event loop for each test case."""
    # session scope: one loop shared by the session-scoped browser fixture.
    loop = asyncio.get_event_loop_policy().new_event_loop()
    yield loop
    loop.close()
+
+
@pytest.fixture(scope='session')
async def browser(event_loop):
    """Session-scoped headless Browser; closed after the test session."""
    browser_instance = Browser(
        config=BrowserConfig(
            headless=True,
        )
    )
    yield browser_instance
    await browser_instance.close()
+
+
@pytest.fixture
async def context(browser):
    """Fresh BrowserContext per test; closed by the async context manager."""
    async with await browser.new_context() as context:
        yield context
+
+
+# pytest tests/test_qwen.py -v -k "test_qwen_url" --capture=no
@pytest.mark.asyncio
async def test_qwen_url(llm, context):
    """Verify the local qwen model can navigate to amazon.com.

    Fix: the @pytest.mark.asyncio marker was commented out, so pytest could
    not execute this async test.
    """
    # NOTE(review): the `context` fixture is requested but not passed to the
    # Agent — confirm whether browser_context=context was intended.
    agent = Agent(
        task='go_to_url amazon.com',
        llm=llm,
    )

    history: AgentHistoryList = await agent.run(max_steps=3)

    # Map any navigation-style action onto a single 'navigate' token.
    action_sequence = []
    for action in history.model_actions():
        action_name = next(iter(action))
        if action_name in ['go_to_url', 'open_tab']:
            action_sequence.append('navigate')

    assert 'navigate' in action_sequence  # Navigated to Amazon
diff --git a/tests/test_react_dropdown.py b/tests/test_react_dropdown.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7dd057a53bc7daefb8809b1063a1e5491059e4a
--- /dev/null
+++ b/tests/test_react_dropdown.py
@@ -0,0 +1,45 @@
+"""
+Simple try of the agent.
+
+@dev You need to add OPENAI_API_KEY to your environment variables.
+"""
+
+import os
+import sys
+
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+import asyncio
+
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent, AgentHistoryList
+
# Shared model and agent, constructed at import time.
# NOTE(review): module-level construction has side effects during pytest
# collection — confirm intended.
llm = ChatOpenAI(model='gpt-4o')
# browser = Browser(config=BrowserConfig(headless=False))

agent = Agent(
    task=(
        'go to https://codepen.io/shyam-king/pen/ByBJoOv and select "Tiger" dropdown and read the text given in "Selected Animal" box (it can be empty as well)'
    ),
    llm=llm,
    browser_context=BrowserContext(
        browser=Browser(config=BrowserConfig(headless=False, disable_security=True)),
    ),
)
+
+
async def test_dropdown():
    """Run the module-level agent and print whatever it extracted."""
    history: AgentHistoryList = await agent.run(10)

    outcome = history.final_result()
    assert outcome is not None
    print('result: ', outcome)
+
+
if __name__ == '__main__':
    # Manual run outside pytest.
    asyncio.run(test_dropdown())
diff --git a/tests/test_save_conversation.py b/tests/test_save_conversation.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b2b25856d30dca94d63ad0eb345c63cbaacacae
--- /dev/null
+++ b/tests/test_save_conversation.py
@@ -0,0 +1,83 @@
+"""
+Simple try of the agent.
+
+@dev You need to add OPENAI_API_KEY to your environment variables.
+"""
+
+import os
+import shutil
+import sys
+
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent, AgentHistoryList
+
# Shared model for the three conversation-saving tests below.
llm = ChatOpenAI(model='gpt-4o')
+
+
async def test_save_conversation_contains_slash():
    """A save path containing a slash creates the directory and <prefix>_<step>.txt files."""
    # Start clean so the existence checks below are meaningful.
    if os.path.exists('./logs'):
        shutil.rmtree('./logs')

    agent = Agent(
        task=('go to google.com and search for text "hi there"'),
        llm=llm,
        browser_context=BrowserContext(
            browser=Browser(config=BrowserConfig(headless=False, disable_security=True)),
        ),
        save_conversation_path='logs/conversation',
    )
    history: AgentHistoryList = await agent.run(20)

    assert history.final_result() is not None

    assert os.path.exists('./logs'), 'logs directory was not created'
    assert os.path.exists('./logs/conversation_2.txt'), 'logs file was not created'
+
+
async def test_save_conversation_not_contains_slash():
    """A slash-less save path is treated as a directory with an empty file prefix."""
    # Start clean so the existence checks below are meaningful.
    if os.path.exists('./logs'):
        shutil.rmtree('./logs')

    agent = Agent(
        task=('go to google.com and search for text "hi there"'),
        llm=llm,
        browser_context=BrowserContext(
            browser=Browser(config=BrowserConfig(headless=False, disable_security=True)),
        ),
        save_conversation_path='logs',
    )
    history: AgentHistoryList = await agent.run(20)

    assert history.final_result() is not None

    assert os.path.exists('./logs'), 'logs directory was not created'
    assert os.path.exists('./logs/_2.txt'), 'logs file was not created'
+
+
async def test_save_conversation_deep_directory():
    """Nested directories in the save path are created on demand."""
    # Start clean so the existence checks below are meaningful.
    if os.path.exists('./logs'):
        shutil.rmtree('./logs')

    agent = Agent(
        task=('go to google.com and search for text "hi there"'),
        llm=llm,
        browser_context=BrowserContext(
            browser=Browser(config=BrowserConfig(headless=False, disable_security=True)),
        ),
        save_conversation_path='logs/deep/directory/conversation',
    )
    history: AgentHistoryList = await agent.run(20)

    assert history.final_result() is not None

    assert os.path.exists('./logs/deep/directory'), 'logs directory was not created'
    assert os.path.exists('./logs/deep/directory/conversation_2.txt'), 'logs file was not created'
diff --git a/tests/test_self_registered_actions.py b/tests/test_self_registered_actions.py
new file mode 100644
index 0000000000000000000000000000000000000000..2221f7866777589626926ec0235e0af7be922537
--- /dev/null
+++ b/tests/test_self_registered_actions.py
@@ -0,0 +1,198 @@
+import asyncio
+import os
+
+import pytest
+from langchain_openai import AzureChatOpenAI
+from pydantic import BaseModel, SecretStr
+
+from browser_use.agent.service import Agent
+from browser_use.agent.views import AgentHistoryList
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.controller.service import Controller
+
+
@pytest.fixture(scope='session')
def event_loop():
    """Session-scoped event loop shared by the async fixtures in this module."""
    loop = asyncio.get_event_loop_policy().new_event_loop()
    yield loop
    loop.close()
+
+
@pytest.fixture(scope='session')
async def browser(event_loop):
    """Session-scoped headless Browser; closed once the session ends."""
    browser_instance = Browser(
        config=BrowserConfig(
            headless=True,
        )
    )
    yield browser_instance
    await browser_instance.close()
+
+
@pytest.fixture
async def context(browser):
    """Fresh BrowserContext per test; closed by the async context manager."""
    async with await browser.new_context() as context:
        yield context
+
+
@pytest.fixture
async def controller():
    """Initialize the controller with self-registered actions.

    Registers three plain-argument actions, two actions taking Pydantic
    param models, and one action taking two Pydantic models, so the tests
    below can check each registration style end to end.
    """
    controller = Controller()

    # Define custom actions without Pydantic models
    @controller.action('Print a message')
    def print_message(message: str):
        print(f'Message: {message}')
        return f'Printed message: {message}'

    @controller.action('Add two numbers')
    def add_numbers(a: int, b: int):
        result = a + b
        return f'The sum is {result}'

    @controller.action('Concatenate strings')
    def concatenate_strings(str1: str, str2: str):
        result = str1 + str2
        return f'Concatenated string: {result}'

    # Define Pydantic models
    class SimpleModel(BaseModel):
        name: str
        age: int

    class Address(BaseModel):
        street: str
        city: str

    class NestedModel(BaseModel):
        user: SimpleModel
        address: Address

    # Add actions with Pydantic model arguments
    @controller.action('Process simple model', param_model=SimpleModel)
    def process_simple_model(model: SimpleModel):
        return f'Processed {model.name}, age {model.age}'

    @controller.action('Process nested model', param_model=NestedModel)
    def process_nested_model(model: NestedModel):
        user_info = f'{model.user.name}, age {model.user.age}'
        address_info = f'{model.address.street}, {model.address.city}'
        return f'Processed user {user_info} at address {address_info}'

    # Two separate Pydantic models as individual parameters.
    @controller.action('Process multiple models')
    def process_multiple_models(model1: SimpleModel, model2: Address):
        return f'Processed {model1.name} living at {model2.street}, {model2.city}'

    yield controller
+
+
@pytest.fixture
def llm():
    """Initialize language model for testing.

    Credentials come from AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_KEY.
    """

    # return ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=25, stop=None)
    return AzureChatOpenAI(
        model='gpt-4o',
        api_version='2024-10-21',
        azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''),
        api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')),
    )
+
+
+# @pytest.mark.skip(reason="Skipping test for now")
@pytest.mark.asyncio
async def test_self_registered_actions_no_pydantic(llm, controller):
    """Test self-registered actions with individual arguments"""
    agent = Agent(
        task="First, print the message 'Hello, World!'. Then, add 10 and 20. Next, concatenate 'foo' and 'bar'.",
        llm=llm,
        controller=controller,
    )
    history: AgentHistoryList = await agent.run(max_steps=10)

    # Each of the three custom actions must appear in the executed actions.
    executed = history.action_names()
    for expected in ('print_message', 'add_numbers', 'concatenate_strings'):
        assert expected in executed
+
+
+# @pytest.mark.skip(reason="Skipping test for now")
@pytest.mark.asyncio
async def test_mixed_arguments_actions(llm, controller):
    """Test actions with mixed argument types"""

    # Define another action during the test
    # Test for async actions
    @controller.action('Calculate the area of a rectangle')
    async def calculate_area(length: float, width: float):
        area = length * width
        return f'The area is {area}'

    agent = Agent(
        task='Calculate the area of a rectangle with length 5.5 and width 3.2.',
        llm=llm,
        controller=controller,
    )
    history = await agent.run(max_steps=5)

    # Check that the action was executed
    action_names = history.action_names()

    assert 'calculate_area' in action_names
    # check result
    # NOTE(review): relies on str(5.5 * 3.2) rendering exactly '17.6' — confirm.
    correct = 'The area is 17.6'
    for content in history.extracted_content():
        if correct in content:
            break
    else:
        # for/else: runs only when no extracted content matched.
        pytest.fail(f'{correct} not found in extracted content')
+
+
@pytest.mark.asyncio
async def test_pydantic_simple_model(llm, controller):
    """Test action with a simple Pydantic model argument"""
    agent = Agent(
        task="Process a simple model with name 'Alice' and age 30.",
        llm=llm,
        controller=controller,
    )
    history = await agent.run(max_steps=5)

    assert 'process_simple_model' in history.action_names()

    # The action's return value must appear in the extracted content.
    correct = 'Processed Alice, age 30'
    found = any(correct in content for content in history.extracted_content())
    if not found:
        pytest.fail(f'{correct} not found in extracted content')
+
+
@pytest.mark.asyncio
async def test_pydantic_nested_model(llm, controller):
    """Test action with a nested Pydantic model argument"""
    agent = Agent(
        task="Process a nested model with user name 'Bob', age 25, living at '123 Maple St', 'Springfield'.",
        llm=llm,
        controller=controller,
    )
    history = await agent.run(max_steps=5)

    # Check that the action was executed
    action_names = history.action_names()

    assert 'process_nested_model' in action_names
    correct = 'Processed user Bob, age 25 at address 123 Maple St, Springfield'
    for content in history.extracted_content():
        if correct in content:
            break
    else:
        # for/else: runs only when no extracted content matched.
        pytest.fail(f'{correct} not found in extracted content')
+
+
+# run this file with:
+# pytest tests/test_self_registered_actions.py --capture=no
diff --git a/tests/test_service.py b/tests/test_service.py
new file mode 100644
index 0000000000000000000000000000000000000000..340085bff73be14908d0aa09aea391162c5e4be0
--- /dev/null
+++ b/tests/test_service.py
@@ -0,0 +1,220 @@
+from unittest.mock import AsyncMock, MagicMock, Mock, patch
+
+import pytest
+from langchain_core.language_models.chat_models import BaseChatModel
+from pydantic import BaseModel
+
+from browser_use.agent.service import Agent
+from browser_use.agent.views import ActionResult
+from browser_use.browser.browser import Browser
+from browser_use.browser.context import BrowserContext
+from browser_use.browser.views import BrowserState
+from browser_use.controller.registry.service import Registry
+from browser_use.controller.registry.views import ActionModel
+from browser_use.controller.service import Controller
+
+# run with python -m pytest tests/test_service.py
+
+
+# run test with:
+# python -m pytest tests/test_service.py
class TestAgent:
    """Unit tests for Agent helpers, using fully mocked dependencies."""

    @pytest.fixture
    def mock_controller(self):
        # Controller whose registry exposes a single fake 'test_action'.
        controller = Mock(spec=Controller)
        registry = Mock(spec=Registry)
        registry.registry = MagicMock()
        registry.registry.actions = {'test_action': MagicMock(param_model=MagicMock())}  # type: ignore
        controller.registry = registry
        return controller

    @pytest.fixture
    def mock_llm(self):
        # Chat-model stub; never actually invoked in these tests.
        return Mock(spec=BaseChatModel)

    @pytest.fixture
    def mock_browser(self):
        return Mock(spec=Browser)

    @pytest.fixture
    def mock_browser_context(self):
        return Mock(spec=BrowserContext)

    def test_convert_initial_actions(self, mock_controller, mock_llm, mock_browser, mock_browser_context):  # type: ignore
        """
        Test that the _convert_initial_actions method correctly converts
        dictionary-based actions to ActionModel instances.

        This test ensures that:
        1. The method processes the initial actions correctly.
        2. The correct param_model is called with the right parameters.
        3. The ActionModel is created with the validated parameters.
        4. The method returns a list of ActionModel instances.
        """
        # Arrange
        agent = Agent(
            task='Test task', llm=mock_llm, controller=mock_controller, browser=mock_browser, browser_context=mock_browser_context
        )
        initial_actions = [{'test_action': {'param1': 'value1', 'param2': 'value2'}}]

        # Mock the ActionModel
        mock_action_model = MagicMock(spec=ActionModel)
        mock_action_model_instance = MagicMock()
        mock_action_model.return_value = mock_action_model_instance
        agent.ActionModel = mock_action_model  # type: ignore

        # Act
        result = agent._convert_initial_actions(initial_actions)

        # Assert
        assert len(result) == 1
        mock_controller.registry.registry.actions['test_action'].param_model.assert_called_once_with(  # type: ignore
            param1='value1', param2='value2'
        )
        mock_action_model.assert_called_once()
        assert isinstance(result[0], MagicMock)
        assert result[0] == mock_action_model_instance

        # Check that the ActionModel was called with the correct parameters
        call_args = mock_action_model.call_args[1]
        assert 'test_action' in call_args
        assert call_args['test_action'] == mock_controller.registry.registry.actions['test_action'].param_model.return_value  # type: ignore

    @pytest.mark.asyncio
    async def test_step_error_handling(self):
        """
        Test the error handling in the step method of the Agent class.
        This test simulates a failure in the get_next_action method and
        checks if the error is properly handled and recorded.
        """
        # Mock the LLM
        mock_llm = MagicMock(spec=BaseChatModel)

        # Mock the MessageManager
        with patch('browser_use.agent.service.MessageManager') as mock_message_manager:
            # Create an Agent instance with mocked dependencies
            agent = Agent(task='Test task', llm=mock_llm)

            # Mock the get_next_action method to raise an exception
            agent.get_next_action = AsyncMock(side_effect=ValueError('Test error'))

            # Mock the browser_context
            agent.browser_context = AsyncMock()
            agent.browser_context.get_state = AsyncMock(
                return_value=BrowserState(
                    url='https://example.com',
                    title='Example',
                    element_tree=MagicMock(),  # Mocked element tree
                    tabs=[],
                    selector_map={},
                    screenshot='',
                )
            )

            # Mock the controller
            agent.controller = AsyncMock()

            # Call the step method
            await agent.step()

            # Assert that the error was handled and recorded
            assert agent.consecutive_failures == 1
            assert len(agent._last_result) == 1
            assert isinstance(agent._last_result[0], ActionResult)
            assert 'Test error' in agent._last_result[0].error
            # NOTE(review): `== True` would be cleaner as `is True`.
            assert agent._last_result[0].include_in_memory == True
+
+
class TestRegistry:
    """Unit tests for the action Registry (exclusion list and dispatch)."""

    @pytest.fixture
    def registry_with_excludes(self):
        # Registry configured to refuse registration of 'excluded_action'.
        return Registry(exclude_actions=['excluded_action'])

    def test_action_decorator_with_excluded_action(self, registry_with_excludes):
        """
        Test that the action decorator does not register an action
        if it's in the exclude_actions list.
        """

        # Define a function to be decorated
        def excluded_action():
            pass

        # Apply the action decorator
        decorated_func = registry_with_excludes.action(description='This should be excluded')(excluded_action)

        # Assert that the decorated function is the same as the original
        assert decorated_func == excluded_action

        # Assert that the action was not added to the registry
        assert 'excluded_action' not in registry_with_excludes.registry.actions

        # Define another function that should be included
        def included_action():
            pass

        # Apply the action decorator to an included action
        registry_with_excludes.action(description='This should be included')(included_action)

        # Assert that the included action was added to the registry
        assert 'included_action' in registry_with_excludes.registry.actions

    @pytest.mark.asyncio
    async def test_execute_action_with_and_without_browser_context(self):
        """
        Test that the execute_action method correctly handles actions with and without a browser context.
        This test ensures that:
        1. An action requiring a browser context is executed correctly.
        2. An action not requiring a browser context is executed correctly.
        3. The browser context is passed to the action function when required.
        4. The action function receives the correct parameters.
        5. The method raises an error when a browser context is required but not provided.
        """
        registry = Registry()

        # Define a mock action model
        class TestActionModel(BaseModel):
            param1: str

        # Define mock action functions
        async def test_action_with_browser(param1: str, browser):
            return f'Action executed with {param1} and browser'

        async def test_action_without_browser(param1: str):
            return f'Action executed with {param1}'

        # Register the actions (bypassing the decorator so the entries can be
        # MagicMocks whose call args are inspectable below).
        registry.registry.actions['test_action_with_browser'] = MagicMock(
            function=AsyncMock(side_effect=test_action_with_browser),
            param_model=TestActionModel,
            description='Test action with browser',
        )

        registry.registry.actions['test_action_without_browser'] = MagicMock(
            function=AsyncMock(side_effect=test_action_without_browser),
            param_model=TestActionModel,
            description='Test action without browser',
        )

        # Mock BrowserContext
        mock_browser = MagicMock()

        # Execute the action with a browser context
        result_with_browser = await registry.execute_action(
            'test_action_with_browser', {'param1': 'test_value'}, browser=mock_browser
        )
        assert result_with_browser == 'Action executed with test_value and browser'

        # Execute the action without a browser context
        result_without_browser = await registry.execute_action('test_action_without_browser', {'param1': 'test_value'})
        assert result_without_browser == 'Action executed with test_value'

        # Test error when browser is required but not provided
        with pytest.raises(RuntimeError, match='Action test_action_with_browser requires browser but none provided'):
            await registry.execute_action('test_action_with_browser', {'param1': 'test_value'})

        # Verify that the action functions were called with correct parameters
        registry.registry.actions['test_action_with_browser'].function.assert_called_once_with(
            param1='test_value', browser=mock_browser
        )
        registry.registry.actions['test_action_without_browser'].function.assert_called_once_with(param1='test_value')
diff --git a/tests/test_stress.py b/tests/test_stress.py
new file mode 100644
index 0000000000000000000000000000000000000000..07a5e8fabaac2c9e149f49ab3b7e7077f94fa4ce
--- /dev/null
+++ b/tests/test_stress.py
@@ -0,0 +1,115 @@
+import asyncio
+import os
+import random
+import string
+import time
+
+import pytest
+from langchain_openai import AzureChatOpenAI
+from pydantic import SecretStr
+
+from browser_use.agent.service import Agent
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.controller.service import Controller
+
+
+@pytest.fixture(scope='session')
+def event_loop():
+	# Session-scoped loop so the session-scoped async `browser` fixture below
+	# can share one event loop across all tests in this module.
+	loop = asyncio.get_event_loop_policy().new_event_loop()
+	yield loop
+	loop.close()
+
+
+@pytest.fixture(scope='session')
+async def browser(event_loop):
+	"""Launch one headless Browser for the whole session; closed on teardown."""
+	browser_instance = Browser(
+		config=BrowserConfig(
+			headless=True,
+		)
+	)
+	yield browser_instance
+	await browser_instance.close()
+
+
+@pytest.fixture
+async def context(browser):
+	"""Open a fresh browser context per test; the async-with closes it after the test."""
+	async with await browser.new_context() as context:
+		yield context
+
+
+@pytest.fixture
+def llm():
+	"""Initialize the language model.
+
+	Reads AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_KEY from the environment;
+	both default to '' so construction itself never fails on missing config —
+	the failure would only surface on the first API call.
+	"""
+	model = AzureChatOpenAI(
+		api_version='2024-10-21',
+		model='gpt-4o',
+		azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''),
+		api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')),
+	)
+	return model
+
+
+def generate_random_text(length: int) -> str:
+ """Generate random text of specified length"""
+ return ''.join(random.choices(string.ascii_letters + string.digits + ' ', k=length))
+
+
+@pytest.fixture
+async def controller():
+	"""Initialize the controller"""
+	controller = Controller()
+	# ~10k chars of random text: every extraction of it inflates the agent's
+	# token usage quickly, which is what the stress tests below rely on.
+	large_text = generate_random_text(10000)
+
+	@controller.action('call this magical function to get very special text')
+	def get_very_special_text():
+		# Same large payload on every call (closure over large_text).
+		return large_text
+
+	yield controller
+
+
+@pytest.mark.asyncio
+async def test_token_limit_with_multiple_extractions(llm, controller, context):
+	"""Test handling of multiple smaller extractions accumulating tokens"""
+	agent = Agent(
+		task='Call the magical function to get very special text 5 times',
+		llm=llm,
+		controller=controller,
+		browser_context=context,
+		max_input_tokens=2000,  # low limit: forces the message manager to trim history
+		save_conversation_path='tmp/stress_test/test_token_limit_with_multiple_extractions.json',
+	)
+
+	history = await agent.run(max_steps=5)
+
+	# check that get_very_special_text was called 5 times
+	calls = [a for a in history.action_names() if a == 'get_very_special_text']
+	assert len(calls) == 5
+	# NOTE(review): the assertion requires MORE than 3 messages, but the original
+	# comment claimed the history 'should be max 3 messages' — confirm which is intended.
+	assert len(agent.message_manager.history.messages) > 3
+
+
+@pytest.mark.slow
+@pytest.mark.parametrize('max_tokens', [4000])  # also tried: 8000, 20000
+@pytest.mark.asyncio
+async def test_open_3_tabs_and_extract_content(llm, controller, context, max_tokens):
+	"""Stress test: Open 3 tabs with urls and extract content.
+
+	Requires network access (wikipedia.org) and a working Azure OpenAI config.
+	"""
+	agent = Agent(
+		task='Open 3 tabs with https://en.wikipedia.org/wiki/Internet and extract the content from each.',
+		llm=llm,
+		controller=controller,
+		browser_context=context,
+		max_input_tokens=max_tokens,
+		save_conversation_path='tmp/stress_test/test_open_3_tabs_and_extract_content.json',
+	)
+	start_time = time.time()
+	history = await agent.run(max_steps=7)
+	end_time = time.time()
+
+	total_time = end_time - start_time
+
+	# wall-clock time is informational only; no assertion on it
+	print(f'Total time: {total_time:.2f} seconds')
+	# Check for errors
+	errors = history.errors()
+	assert len(errors) == 0, 'Errors occurred during the test'
+	# check if 3 tabs were opened
+	assert len(context.current_state.tabs) >= 3, '3 tabs were not opened'
diff --git a/tests/test_vision.py b/tests/test_vision.py
new file mode 100644
index 0000000000000000000000000000000000000000..91c01b66720096f6eea2df995638015e5f62e190
--- /dev/null
+++ b/tests/test_vision.py
@@ -0,0 +1,52 @@
+"""
+Simple try of the agent.
+
+@dev You need to add OPENAI_API_KEY to your environment variables.
+"""
+
+import os
+import sys
+from pprint import pprint
+
+import pytest
+
+from browser_use.browser.browser import Browser, BrowserConfig
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from langchain_openai import ChatOpenAI
+
+from browser_use import Agent, AgentHistoryList, Controller
+
+llm = ChatOpenAI(model='gpt-4o')
+controller = Controller()
+
+# Use this test to ask the model questions about the page, e.g.:
+#  - which color do you see for bbox labels? list all with their label
+#  - what is the smallest bbox with a label?
+
+
+@controller.registry.action(description='explain what you see on the screen and ask user for input')
+async def explain_screen(text: str) -> str:
+ pprint(text)
+ answer = input('\nuser input next question: \n')
+ return answer
+
+
+@controller.registry.action(description='done')
+async def done(text: str) -> str:
+ # pprint(text)
+ return 'call explain_screen'
+
+
+agent = Agent(
+ task='call explain_screen all the time the user asks you questions e.g. about the page like bbox which you see are labels - your task is to expalin it and get the next question',
+ llm=llm,
+ controller=controller,
+ browser=Browser(config=BrowserConfig(disable_security=True, headless=False)),
+)
+
+
+@pytest.mark.skip(reason='this is for local testing only')
+async def test_vision():
+	"""Drive the interactive explain_screen loop for up to 20 agent steps (manual use only)."""
+	# NOTE(review): no @pytest.mark.asyncio here — harmless while skipped, but
+	# required if this test is ever un-skipped.
+	history: AgentHistoryList = await agent.run(20)