Spaces:

Boobs00
/

use

Configuration error

App Files Files Community

Boobs00 committed on Mar 13, 2025

Commit

db4810d

verified ·

1 Parent(s): e91a48e

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.env.example +8 -0
.gitattributes +4 -35
.github/ISSUE_TEMPLATE/bug_report.yml +84 -0
.github/ISSUE_TEMPLATE/config.yml +11 -0
.github/ISSUE_TEMPLATE/docs_issue.yml +55 -0
.github/ISSUE_TEMPLATE/feature_request.yml +43 -0
.github/workflows/publish.yml +38 -0
.gitignore +190 -0
.pre-commit-config.yaml +18 -0
.python-version +1 -0
LICENSE +21 -0
README.md +193 -10
SECURITY.md +20 -0
browser_use/README.md +51 -0
browser_use/__init__.py +27 -0
browser_use/agent/gif.py +325 -0
browser_use/agent/message_manager/service.py +306 -0
browser_use/agent/message_manager/tests.py +237 -0
browser_use/agent/message_manager/utils.py +127 -0
browser_use/agent/message_manager/views.py +129 -0
browser_use/agent/prompts.py +165 -0
browser_use/agent/service.py +964 -0
browser_use/agent/system_prompt.md +69 -0
browser_use/agent/tests.py +197 -0
browser_use/agent/views.py +393 -0
browser_use/browser/browser.py +253 -0
browser_use/browser/context.py +1353 -0
browser_use/browser/tests/screenshot_test.py +37 -0
browser_use/browser/tests/test_clicks.py +94 -0
browser_use/browser/views.py +53 -0
browser_use/controller/registry/service.py +199 -0
browser_use/controller/registry/views.py +70 -0
browser_use/controller/service.py +532 -0
browser_use/controller/views.py +65 -0
browser_use/dom/__init__.py +0 -0
browser_use/dom/buildDomTree.js +1055 -0
browser_use/dom/history_tree_processor/service.py +107 -0
browser_use/dom/history_tree_processor/view.py +70 -0
browser_use/dom/service.py +169 -0
browser_use/dom/tests/debug_page_structure.py +123 -0
browser_use/dom/tests/extraction_test.py +147 -0
browser_use/dom/tests/process_dom_test.py +40 -0
browser_use/dom/views.py +196 -0
browser_use/logging_config.py +132 -0
browser_use/telemetry/service.py +105 -0
browser_use/telemetry/views.py +63 -0
browser_use/utils.py +54 -0
codebeaver.yml +4 -0
conftest.py +10 -0
docs/README.md +17 -0

.env.example ADDED Viewed

	@@ -0,0 +1,8 @@

+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
+# Set to false to disable anonymized telemetry
+ANONYMIZED_TELEMETRY=true
+# LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info
+BROWSER_USE_LOGGING_LEVEL=info

.gitattributes CHANGED Viewed

@@ -1,35 +1,4 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+static/*.gif filter=lfs diff=lfs merge=lfs -text
+# static/*.mp4 filter=lfs diff=lfs merge=lfs -text
+docs/images/checks-passed.png filter=lfs diff=lfs merge=lfs -text
+docs/images/laminar.png filter=lfs diff=lfs merge=lfs -text

.github/ISSUE_TEMPLATE/bug_report.yml ADDED Viewed

	@@ -0,0 +1,84 @@

+name: 🐛 Bug Report
+description: Report a bug in browser-use
+labels: ["bug", "triage"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for taking the time to fill out this bug report! Please fill out the form below to help us reproduce and fix the issue.
+  - type: textarea
+    id: description
+    attributes:
+      label: Bug Description
+      description: A clear and concise description of what the bug is.
+      placeholder: When I try to... the library...
+    validations:
+      required: true
+  - type: textarea
+    id: reproduction
+    attributes:
+      label: Reproduction Steps
+      description: Steps to reproduce the behavior
+      placeholder: |
+        1. Install browser-use...
+        2. Run the following task...
+        3. See error...
+    validations:
+      required: true
+  - type: textarea
+    id: code
+    attributes:
+      label: Code Sample
+      description: Include a minimal code sample that reproduces the issue
+      render: python
+    validations:
+      required: true
+  - type: input
+    id: version
+    attributes:
+      label: Version
+      description: What version of browser-use are you using? (Run `uv pip show browser-use` to find out)
+      placeholder: "e.g., pip 0.1.26, or git main branch"
+    validations:
+      required: true
+  - type: dropdown
+    id: model
+    attributes:
+      label: LLM Model
+      description: Which LLM model(s) are you using?
+      multiple: true
+      options:
+        - GPT-4o
+        - GPT-4
+        - Claude 3.5 Sonnet
+        - Claude 3.5 Opus
+        - Claude 3.5 Haiku
+        - Gemini 1.5 Pro
+        - Gemini 1.5 Ultra
+        - Fireworks Mixtral
+        - DeepSeek Coder
+        - Local Model (Specify model in description)
+        - Other (specify in description)
+    validations:
+      required: true
+  - type: input
+    id: os
+    attributes:
+      label: Operating System
+      description: What operating system are you using?
+      placeholder: "e.g., macOS 13.1, Windows 11, Ubuntu 22.04"
+    validations:
+      required: true
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant Log Output
+      description: Please copy and paste any relevant log output. This will be automatically formatted into code.
+      render: shell

.github/ISSUE_TEMPLATE/config.yml ADDED Viewed

	@@ -0,0 +1,11 @@

+blank_issues_enabled: false  # Set to true if you want to allow blank issues
+contact_links:
+  - name: 🤔 Quickstart Guide
+    url: https://docs.browser-use.com/quickstart
+    about: Most common issues can be resolved by following our quickstart guide
+  - name: 🤔 Questions and Help
+    url: https://link.browser-use.com/discord
+    about: Please ask questions in our Discord community
+  - name: 📖 Documentation
+    url: https://docs.browser-use.com
+    about: Check our documentation for answers first

.github/ISSUE_TEMPLATE/docs_issue.yml ADDED Viewed

	@@ -0,0 +1,55 @@

+name: 📚 Documentation Issue
+description: Report an issue in the browser-use documentation
+labels: ["documentation"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for taking the time to improve our documentation! Please fill out the form below to help us understand the issue.
+  - type: dropdown
+    id: type
+    attributes:
+      label: Type of Documentation Issue
+      description: What type of documentation issue is this?
+      options:
+        - Missing documentation
+        - Incorrect documentation
+        - Unclear documentation
+        - Broken link
+        - Other (specify in description)
+    validations:
+      required: true
+  - type: input
+    id: page
+    attributes:
+      label: Documentation Page
+      description: Which page or section of the documentation is this about?
+      placeholder: "e.g., https://docs.browser-use.com/getting-started or Installation Guide"
+    validations:
+      required: true
+  - type: textarea
+    id: description
+    attributes:
+      label: Issue Description
+      description: Describe what's wrong or missing in the documentation
+      placeholder: The documentation should...
+    validations:
+      required: true
+  - type: textarea
+    id: suggestion
+    attributes:
+      label: Suggested Changes
+      description: If you have specific suggestions for how to improve the documentation, please share them
+      placeholder: |
+        The documentation could be improved by...
+        Example:
+        ```python
+        # Your suggested code example or text here
+        ```
+    validations:
+      required: true

.github/ISSUE_TEMPLATE/feature_request.yml ADDED Viewed

	@@ -0,0 +1,43 @@

+name: 💡 Feature Request
+description: Suggest a new feature for browser-use
+labels: ["enhancement"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Thanks for taking the time to suggest a new feature! Please fill out the form below to help us understand your suggestion.
+  - type: textarea
+    id: problem
+    attributes:
+      label: Problem Description
+      description: Is your feature request related to a problem? Please describe.
+      placeholder: I'm always frustrated when...
+    validations:
+      required: true
+  - type: textarea
+    id: solution
+    attributes:
+      label: Proposed Solution
+      description: Describe the solution you'd like to see
+      placeholder: It would be great if...
+    validations:
+      required: true
+  - type: textarea
+    id: alternatives
+    attributes:
+      label: Alternative Solutions
+      description: Describe any alternative solutions or features you've considered
+      placeholder: I've also thought about...
+  - type: textarea
+    id: context
+    attributes:
+      label: Additional Context
+      description: Add any other context or examples about the feature request here
+      placeholder: |
+        - Example use cases
+        - Screenshots or mockups
+        - Related issues or discussions

.github/workflows/publish.yml ADDED Viewed

	@@ -0,0 +1,38 @@

+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+name: Upload Python Package
+on:
+  release:
+    types: [published]
+permissions:
+  contents: read
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install build hatch
+      - name: Build package
+        run: python -m build
+      - name: Publish package
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          user: __token__
+          password: ${{ secrets.PYPI_API_TOKEN }}

.gitignore ADDED Viewed

	@@ -0,0 +1,190 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+test_env/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+temp
+tmp
+.DS_Store
+private_example.py
+private_example
+browser_cookies.json
+cookies.json
+AgentHistory.json
+cv_04_24.pdf
+AgentHistoryList.json
+*.gif
+gcp-login.json
+.vscode
+.ruff_cache
+.idea
+*.txt
+*.pdf
+*.csv
+*.json
+*.jsonl
+uv.lock

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,18 @@

+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.3.0
+    hooks:
+      - id: ruff
+        args: [
+            --line-length=130,
+            --select=E,F,I,
+            --fix,
+        ]
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-toml

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.11

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 Gregor Zunic
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,10 +1,193 @@
----
-title: Use
-emoji: 🌍
-colorFrom: indigo
-colorTo: indigo
-sdk: static
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+<picture>
+  <source media="(prefers-color-scheme: dark)" srcset="./static/browser-use-dark.png">
+  <source media="(prefers-color-scheme: light)" srcset="./static/browser-use.png">
+  <img alt="Shows a black Browser Use Logo in light color mode and a white one in dark color mode." src="./static/browser-use.png"  width="full">
+</picture>
+<h1 align="center">Enable AI to control your browser 🤖</h1>
+[![GitHub stars](https://img.shields.io/github/stars/gregpr07/browser-use?style=social)](https://github.com/gregpr07/browser-use/stargazers)
+[![Discord](https://img.shields.io/discord/1303749220842340412?color=7289DA&label=Discord&logo=discord&logoColor=white)](https://link.browser-use.com/discord)
+[![Cloud](https://img.shields.io/badge/Cloud-☁️-blue)](https://cloud.browser-use.com)
+[![Documentation](https://img.shields.io/badge/Documentation-📕-blue)](https://docs.browser-use.com)
+[![Twitter Follow](https://img.shields.io/twitter/follow/Gregor?style=social)](https://x.com/gregpr07)
+[![Twitter Follow](https://img.shields.io/twitter/follow/Magnus?style=social)](https://x.com/mamagnus00)
+[![Weave Badge](https://img.shields.io/endpoint?url=https%3A%2F%2Fapp.workweave.ai%2Fapi%2Frepository%2Fbadge%2Forg_T5Pvn3UBswTHIsN1dWS3voPg%2F881458615&labelColor=#EC6341)](https://app.workweave.ai/reports/repository/org_T5Pvn3UBswTHIsN1dWS3voPg/881458615)
+🌐 Browser-use is the easiest way to connect your AI agents with the browser.
+💡 See what others are building and share your projects in our [Discord](https://link.browser-use.com/discord)! Want Swag? Check out our [Merch store](https://browsermerch.com).
+🌤️ Skip the setup - try our <b>hosted version</b> for instant browser automation! <b>[Try the cloud ☁︎](https://cloud.browser-use.com)</b>.
+# Quick start
+With pip (Python>=3.11):
+```bash
+pip install browser-use
+```
+Install Playwright:
+```bash
+playwright install
+```
+Spin up your agent:
+```python
+from langchain_openai import ChatOpenAI
+from browser_use import Agent
+import asyncio
+from dotenv import load_dotenv
+load_dotenv()
+async def main():
+    agent = Agent(
+        task="Compare the price of gpt-4o and DeepSeek-V3",
+        llm=ChatOpenAI(model="gpt-4o"),
+    )
+    await agent.run()
+asyncio.run(main())
+```
+Add your API keys for the provider you want to use to your `.env` file.
+```bash
+OPENAI_API_KEY=
+```
+For other settings, models, and more, check out the [documentation 📕](https://docs.browser-use.com).
+### Test with UI
+You can test [browser-use with a UI repository](https://github.com/browser-use/web-ui)
+Or simply run the gradio example:
+```
+uv pip install gradio
+```
+```bash
+python examples/ui/gradio_demo.py
+```
+# Demos
+<br/><br/>
+[Task](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/shopping.py): Add grocery items to cart, and checkout.
+[![AI Did My Groceries](https://github.com/user-attachments/assets/d9359085-bde6-41d4-aa4e-6520d0221872)](https://www.youtube.com/watch?v=L2Ya9PYNns8)
+<br/><br/>
+Prompt: Add my latest LinkedIn follower to my leads in Salesforce.
+![LinkedIn to Salesforce](https://github.com/user-attachments/assets/1440affc-a552-442e-b702-d0d3b277b0ae)
+<br/><br/>
+[Prompt](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/find_and_apply_to_jobs.py): Read my CV & find ML jobs, save them to a file, and then start applying for them in new tabs, if you need help, ask me.
+https://github.com/user-attachments/assets/171fb4d6-0355-46f2-863e-edb04a828d04
+<br/><br/>
+[Prompt](https://github.com/browser-use/browser-use/blob/main/examples/browser/real_browser.py): Write a letter in Google Docs to my Papa, thanking him for everything, and save the document as a PDF.
+![Letter to Papa](https://github.com/user-attachments/assets/242ade3e-15bc-41c2-988f-cbc5415a66aa)
+<br/><br/>
+[Prompt](https://github.com/browser-use/browser-use/blob/main/examples/custom-functions/save_to_file_hugging_face.py): Look up models with a license of cc-by-sa-4.0 and sort by most likes on Hugging Face, save top 5 to file.
+https://github.com/user-attachments/assets/de73ee39-432c-4b97-b4e8-939fd7f323b3
+<br/><br/>
+## More examples
+For more examples see the [examples](examples) folder or join the [Discord](https://link.browser-use.com/discord) and show off your project.
+# Vision
+Tell your computer what to do, and it gets it done.
+## Roadmap
+### Agent
+- [ ] Improve agent memory (summarize, compress, RAG, etc.)
+- [ ] Enhance planning capabilities (load website specific context)
+- [ ] Reduce token consumption (system prompt, DOM state)
+### DOM Extraction
+- [ ] Improve extraction for datepickers, dropdowns, special elements
+- [ ] Improve state representation for UI elements
+### Rerunning tasks
+- [ ] LLM as fallback
+- [ ] Make it easy to define workflow templates where the LLM fills in the details
+- [ ] Return playwright script from the agent
+### Datasets
+- [ ] Create datasets for complex tasks
+- [ ] Benchmark various models against each other
+- [ ] Fine-tuning models for specific tasks
+### User Experience
+- [ ] Human-in-the-loop execution
+- [ ] Improve the generated GIF quality
+- [ ] Create various demos for tutorial execution, job application, QA testing, social media, etc.
+## Contributing
+We love contributions! Feel free to open issues for bugs or feature requests. To contribute to the docs, check out the `/docs` folder.
+## Local Setup
+To learn more about the library, check out the [local setup 📕](https://docs.browser-use.com/development/local-setup).
+## Cooperations
+We are forming a commission to define best practices for UI/UX design for browser agents.
+Together, we're exploring how software redesign improves the performance of AI agents and gives these companies a competitive advantage by designing their existing software to be at the forefront of the agent age.
+Email [Toby](mailto:tbiddle@loop11.com?subject=I%20want%20to%20join%20the%20UI/UX%20commission%20for%20AI%20agents&body=Hi%20Toby%2C%0A%0AI%20found%20you%20in%20the%20browser-use%20GitHub%20README.%0A%0A) to apply for a seat on the committee.
+## Swag
+Want to show off your Browser-use swag? Check out our [Merch store](https://browsermerch.com). Good contributors will receive swag for free 👀.
+## Citation
+If you use Browser Use in your research or project, please cite:
+```bibtex
+@software{browser_use2024,
+  author = {Müller, Magnus and Žunič, Gregor},
+  title = {Browser Use: Enable AI to control your browser},
+  year = {2024},
+  publisher = {GitHub},
+  url = {https://github.com/browser-use/browser-use}
+}
+```
+ <div align="center"> <img src="https://github.com/user-attachments/assets/402b2129-b6ac-44d3-a217-01aea3277dce" width="400"/>
+[![Twitter Follow](https://img.shields.io/twitter/follow/Gregor?style=social)](https://x.com/gregpr07)
+[![Twitter Follow](https://img.shields.io/twitter/follow/Magnus?style=social)](https://x.com/mamagnus00)
+ </div>
+<div align="center">
+Made with ❤️ in Zurich and San Francisco
+ </div>

SECURITY.md ADDED Viewed

	@@ -0,0 +1,20 @@

+## Reporting Security Issues
+If you believe you have found a security vulnerability in browser-use, please report it through coordinated disclosure.
+**Please do not report security vulnerabilities through the repository issues, discussions, or pull requests.**
+Instead, please open a new [GitHub security advisory](https://github.com/browser-use/browser-use/security/advisories/new).
+Please include as much of the information listed below as you can to help me better understand and resolve the issue:
+* The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting)
+* Full paths of source file(s) related to the manifestation of the issue
+* The location of the affected source code (tag/branch/commit or direct URL)
+* Any special configuration required to reproduce the issue
+* Step-by-step instructions to reproduce the issue
+* Proof-of-concept or exploit code (if possible)
+* Impact of the issue, including how an attacker might exploit the issue
+This information will help me triage your report more quickly.

browser_use/README.md ADDED Viewed

	@@ -0,0 +1,51 @@

+# Codebase Structure
+> The code structure is inspired by https://github.com/Netflix/dispatch.
+Another very good reference on how to structure a scalable codebase is [this repo](https://github.com/zhanymkanov/fastapi-best-practices).
+Just a brief document about how we should structure our backend codebase.
+## Code Structure
+```markdown
+src/
+/<service name>/
+models.py
+services.py
+prompts.py
+views.py
+utils.py
+routers.py
+    	/_<subservice name>/
+```
+### Service.py
+Always a single file, unless it becomes too long (more than ~500 lines); in that case, split it into \_subservices.
+### Views.py
+Always split the views into two parts
+```python
+# All
+...
+# Requests
+...
+# Responses
+...
+```
+If too long → split into multiple files
+### Prompts.py
+Single file; if too long → split into multiple files (one prompt per file or so)
+### Routers.py
+Never split into more than one file

browser_use/__init__.py ADDED Viewed

	@@ -0,0 +1,27 @@

+from browser_use.logging_config import setup_logging
+setup_logging()
+from browser_use.agent.prompts import SystemPrompt as SystemPrompt
+from browser_use.agent.service import Agent as Agent
+from browser_use.agent.views import ActionModel as ActionModel
+from browser_use.agent.views import ActionResult as ActionResult
+from browser_use.agent.views import AgentHistoryList as AgentHistoryList
+from browser_use.browser.browser import Browser as Browser
+from browser_use.browser.browser import BrowserConfig as BrowserConfig
+from browser_use.browser.context import BrowserContextConfig
+from browser_use.controller.service import Controller as Controller
+from browser_use.dom.service import DomService as DomService
+__all__ = [
+	'Agent',
+	'Browser',
+	'BrowserConfig',
+	'Controller',
+	'DomService',
+	'SystemPrompt',
+	'ActionResult',
+	'ActionModel',
+	'AgentHistoryList',
+	'BrowserContextConfig',
+]

browser_use/agent/gif.py ADDED Viewed

	@@ -0,0 +1,325 @@

+from __future__ import annotations
+import base64
+import io
+import logging
+import os
+import platform
+from typing import TYPE_CHECKING, Optional
+from browser_use.agent.views import (
+	AgentHistoryList,
+)
+if TYPE_CHECKING:
+	from PIL import Image, ImageFont
+logger = logging.getLogger(__name__)
+def create_history_gif(
+	task: str,
+	history: AgentHistoryList,
+	#
+	output_path: str = 'agent_history.gif',
+	duration: int = 3000,
+	show_goals: bool = True,
+	show_task: bool = True,
+	show_logo: bool = False,
+	font_size: int = 40,
+	title_font_size: int = 56,
+	goal_font_size: int = 44,
+	margin: int = 40,
+	line_spacing: float = 1.5,
+) -> None:
+	"""Create a GIF from the agent's history with overlaid task and goal text."""
+	if not history.history:
+		logger.warning('No history to create GIF from')
+		return
+	from PIL import Image, ImageFont
+	images = []
+	# if history is empty or first screenshot is None, we can't create a gif
+	if not history.history or not history.history[0].state.screenshot:
+		logger.warning('No history or first screenshot to create GIF from')
+		return
+	# Try to load nicer fonts
+	try:
+		# Try different font options in order of preference
+		font_options = ['Helvetica', 'Arial', 'DejaVuSans', 'Verdana']
+		font_loaded = False
+		for font_name in font_options:
+			try:
+				if platform.system() == 'Windows':
+					# Need to specify the abs font path on Windows
+					font_name = os.path.join(os.getenv('WIN_FONT_DIR', 'C:\\Windows\\Fonts'), font_name + '.ttf')
+				regular_font = ImageFont.truetype(font_name, font_size)
+				title_font = ImageFont.truetype(font_name, title_font_size)
+				goal_font = ImageFont.truetype(font_name, goal_font_size)
+				font_loaded = True
+				break
+			except OSError:
+				continue
+		if not font_loaded:
+			raise OSError('No preferred fonts found')
+	except OSError:
+		regular_font = ImageFont.load_default()
+		title_font = ImageFont.load_default()
+		goal_font = regular_font
+	# Load logo if requested
+	logo = None
+	if show_logo:
+		try:
+			logo = Image.open('./static/browser-use.png')
+			# Resize logo to be small (e.g., 40px height)
+			logo_height = 150
+			aspect_ratio = logo.width / logo.height
+			logo_width = int(logo_height * aspect_ratio)
+			logo = logo.resize((logo_width, logo_height), Image.Resampling.LANCZOS)
+		except Exception as e:
+			logger.warning(f'Could not load logo: {e}')
+	# Create task frame if requested
+	if show_task and task:
+		task_frame = _create_task_frame(
+			task,
+			history.history[0].state.screenshot,
+			title_font,  # type: ignore
+			regular_font,  # type: ignore
+			logo,
+			line_spacing,
+		)
+		images.append(task_frame)
+	# Process each history item
+	for i, item in enumerate(history.history, 1):
+		if not item.state.screenshot:
+			continue
+		# Convert base64 screenshot to PIL Image
+		img_data = base64.b64decode(item.state.screenshot)
+		image = Image.open(io.BytesIO(img_data))
+		if show_goals and item.model_output:
+			image = _add_overlay_to_image(
+				image=image,
+				step_number=i,
+				goal_text=item.model_output.current_state.next_goal,
+				regular_font=regular_font,  # type: ignore
+				title_font=title_font,  # type: ignore
+				margin=margin,
+				logo=logo,
+			)
+		images.append(image)
+	if images:
+		# Save the GIF
+		images[0].save(
+			output_path,
+			save_all=True,
+			append_images=images[1:],
+			duration=duration,
+			loop=0,
+			optimize=False,
+		)
+		logger.info(f'Created GIF at {output_path}')
+	else:
+		logger.warning('No images found in history to create GIF')
+def _create_task_frame(
+	task: str,
+	first_screenshot: str,
+	title_font: 'ImageFont.FreeTypeFont',
+	regular_font: 'ImageFont.FreeTypeFont',
+	logo: Optional[Image.Image] = None,
+	line_spacing: float = 1.5,
+) -> 'Image.Image':
+	"""Create initial frame showing the task."""
+	from PIL import Image, ImageDraw, ImageFont
+	img_data = base64.b64decode(first_screenshot)
+	template = Image.open(io.BytesIO(img_data))
+	image = Image.new('RGB', template.size, (0, 0, 0))
+	draw = ImageDraw.Draw(image)
+	# Calculate vertical center of image
+	center_y = image.height // 2
+	# Draw task text with increased font size
+	margin = 140  # Increased margin
+	max_width = image.width - (2 * margin)
+	larger_font = ImageFont.truetype(regular_font.path, regular_font.size + 16)  # Increase font size more
+	wrapped_text = _wrap_text(task, larger_font, max_width)
+	# Calculate line height with spacing
+	line_height = larger_font.size * line_spacing
+	# Split text into lines and draw with custom spacing
+	lines = wrapped_text.split('\n')
+	total_height = line_height * len(lines)
+	# Start position for first line
+	text_y = center_y - (total_height / 2) + 50  # Shifted down slightly
+	for line in lines:
+		# Get line width for centering
+		line_bbox = draw.textbbox((0, 0), line, font=larger_font)
+		text_x = (image.width - (line_bbox[2] - line_bbox[0])) // 2
+		draw.text(
+			(text_x, text_y),
+			line,
+			font=larger_font,
+			fill=(255, 255, 255),
+		)
+		text_y += line_height
+	# Add logo if provided (top right corner)
+	if logo:
+		logo_margin = 20
+		logo_x = image.width - logo.width - logo_margin
+		image.paste(logo, (logo_x, logo_margin), logo if logo.mode == 'RGBA' else None)
+	return image
+def _add_overlay_to_image(
+	image: 'Image.Image',
+	step_number: int,
+	goal_text: str,
+	regular_font: 'ImageFont.FreeTypeFont',
+	title_font: 'ImageFont.FreeTypeFont',
+	margin: int,
+	logo: Optional['Image.Image'] = None,
+	display_step: bool = True,
+	text_color: tuple[int, int, int, int] = (255, 255, 255, 255),
+	text_box_color: tuple[int, int, int, int] = (0, 0, 0, 255),
+) -> 'Image.Image':
+	"""Add step number and goal overlay to an image."""
+	from PIL import Image, ImageDraw
+	image = image.convert('RGBA')
+	txt_layer = Image.new('RGBA', image.size, (0, 0, 0, 0))
+	draw = ImageDraw.Draw(txt_layer)
+	if display_step:
+		# Add step number (bottom left)
+		step_text = str(step_number)
+		step_bbox = draw.textbbox((0, 0), step_text, font=title_font)
+		step_width = step_bbox[2] - step_bbox[0]
+		step_height = step_bbox[3] - step_bbox[1]
+		# Position step number in bottom left
+		x_step = margin + 10  # Slight additional offset from edge
+		y_step = image.height - margin - step_height - 10  # Slight offset from bottom
+		# Draw rounded rectangle background for step number
+		padding = 20  # Increased padding
+		step_bg_bbox = (
+			x_step - padding,
+			y_step - padding,
+			x_step + step_width + padding,
+			y_step + step_height + padding,
+		)
+		draw.rounded_rectangle(
+			step_bg_bbox,
+			radius=15,  # Add rounded corners
+			fill=text_box_color,
+		)
+		# Draw step number
+		draw.text(
+			(x_step, y_step),
+			step_text,
+			font=title_font,
+			fill=text_color,
+		)
+	# Draw goal text (centered, bottom)
+	max_width = image.width - (4 * margin)
+	wrapped_goal = _wrap_text(goal_text, title_font, max_width)
+	goal_bbox = draw.multiline_textbbox((0, 0), wrapped_goal, font=title_font)
+	goal_width = goal_bbox[2] - goal_bbox[0]
+	goal_height = goal_bbox[3] - goal_bbox[1]
+	# Center goal text horizontally, place above step number
+	x_goal = (image.width - goal_width) // 2
+	y_goal = y_step - goal_height - padding * 4  # More space between step and goal
+	# Draw rounded rectangle background for goal
+	padding_goal = 25  # Increased padding for goal
+	goal_bg_bbox = (
+		x_goal - padding_goal,  # Remove extra space for logo
+		y_goal - padding_goal,
+		x_goal + goal_width + padding_goal,
+		y_goal + goal_height + padding_goal,
+	)
+	draw.rounded_rectangle(
+		goal_bg_bbox,
+		radius=15,  # Add rounded corners
+		fill=text_box_color,
+	)
+	# Draw goal text
+	draw.multiline_text(
+		(x_goal, y_goal),
+		wrapped_goal,
+		font=title_font,
+		fill=text_color,
+		align='center',
+	)
+	# Add logo if provided (top right corner)
+	if logo:
+		logo_layer = Image.new('RGBA', image.size, (0, 0, 0, 0))
+		logo_margin = 20
+		logo_x = image.width - logo.width - logo_margin
+		logo_layer.paste(logo, (logo_x, logo_margin), logo if logo.mode == 'RGBA' else None)
+		txt_layer = Image.alpha_composite(logo_layer, txt_layer)
+	# Composite and convert
+	result = Image.alpha_composite(image, txt_layer)
+	return result.convert('RGB')
+def _wrap_text(text: str, font: 'ImageFont.FreeTypeFont', max_width: int) -> str:
+	"""
+	Wrap text to fit within a given width.
+	Args:
+	    text: Text to wrap
+	    font: Font to use for text
+	    max_width: Maximum width in pixels
+	Returns:
+	    Wrapped text with newlines
+	"""
+	words = text.split()
+	lines = []
+	current_line = []
+	for word in words:
+		current_line.append(word)
+		line = ' '.join(current_line)
+		bbox = font.getbbox(line)
+		if bbox[2] > max_width:
+			if len(current_line) == 1:
+				lines.append(current_line.pop())
+			else:
+				current_line.pop()
+				lines.append(' '.join(current_line))
+				current_line = [word]
+	if current_line:
+		lines.append(' '.join(current_line))
+	return '\n'.join(lines)

browser_use/agent/message_manager/service.py ADDED Viewed

	@@ -0,0 +1,306 @@

+from __future__ import annotations
+import logging
+from typing import Dict, List, Optional
+from langchain_core.messages import (
+	AIMessage,
+	BaseMessage,
+	HumanMessage,
+	SystemMessage,
+	ToolMessage,
+)
+from pydantic import BaseModel
+from browser_use.agent.message_manager.views import MessageMetadata
+from browser_use.agent.prompts import AgentMessagePrompt
+from browser_use.agent.views import ActionResult, AgentOutput, AgentStepInfo, MessageManagerState
+from browser_use.browser.views import BrowserState
+from browser_use.utils import time_execution_sync
+logger = logging.getLogger(__name__)
+class MessageManagerSettings(BaseModel):
+	# Settings read by MessageManager when building and trimming the history.
+	# Token budget that MessageManager.cut_messages compares the history total against.
+	max_input_tokens: int = 128000
+	# Divisor for the character-based token estimate in _count_text_tokens.
+	estimated_characters_per_token: int = 3
+	# Flat token cost charged for every image_url item in a message.
+	image_tokens: int = 800
+	# Forwarded to AgentMessagePrompt when the browser state message is built.
+	include_attributes: list[str] = []
+	# Optional extra context, added as a HumanMessage right after the system prompt.
+	message_context: Optional[str] = None
+	# Maps placeholder name -> secret value; values are replaced by
+	# <secret>name</secret> in every message before it is stored.
+	sensitive_data: Optional[Dict[str, str]] = None
+	# Optional file paths announced to the model in an initial HumanMessage.
+	available_file_paths: Optional[List[str]] = None
+class MessageManager:
+	def __init__(
+		self,
+		task: str,
+		system_message: SystemMessage,
+		settings: MessageManagerSettings = MessageManagerSettings(),
+		state: MessageManagerState = MessageManagerState(),
+	):
+		self.task = task
+		self.settings = settings
+		self.state = state
+		self.system_prompt = system_message
+		# Only initialize messages if state is empty
+		if len(self.state.history.messages) == 0:
+			self._init_messages()
+	def _init_messages(self) -> None:
+		"""Initialize the message history with system message, context, task, and other initial messages"""
+		self._add_message_with_tokens(self.system_prompt)
+		if self.settings.message_context:
+			context_message = HumanMessage(content='Context for the task' + self.settings.message_context)
+			self._add_message_with_tokens(context_message)
+		task_message = HumanMessage(
+			content=f'Your ultimate task is: """{self.task}""". If you achieved your ultimate task, stop everything and use the done action in the next step to complete the task. If not, continue as usual.'
+		)
+		self._add_message_with_tokens(task_message)
+		if self.settings.sensitive_data:
+			info = f'Here are placeholders for sensitve data: {list(self.settings.sensitive_data.keys())}'
+			info += 'To use them, write <secret>the placeholder name</secret>'
+			info_message = HumanMessage(content=info)
+			self._add_message_with_tokens(info_message)
+		placeholder_message = HumanMessage(content='Example output:')
+		self._add_message_with_tokens(placeholder_message)
+		tool_calls = [
+			{
+				'name': 'AgentOutput',
+				'args': {
+					'current_state': {
+						'evaluation_previous_goal': 'Success - I opend the first page',
+						'memory': 'Starting with the new task. I have completed 1/10 steps',
+						'next_goal': 'Click on company a',
+					},
+					'action': [{'click_element': {'index': 0}}],
+				},
+				'id': str(self.state.tool_id),
+				'type': 'tool_call',
+			}
+		]
+		example_tool_call = AIMessage(
+			content='',
+			tool_calls=tool_calls,
+		)
+		self._add_message_with_tokens(example_tool_call)
+		self.add_tool_message(content='Browser started')
+		placeholder_message = HumanMessage(content='[Your task history memory starts here]')
+		self._add_message_with_tokens(placeholder_message)
+		if self.settings.available_file_paths:
+			filepaths_msg = HumanMessage(content=f'Here are file paths you can use: {self.settings.available_file_paths}')
+			self._add_message_with_tokens(filepaths_msg)
+	def add_new_task(self, new_task: str) -> None:
+		content = f'Your new ultimate task is: """{new_task}""". Take the previous context into account and finish your new ultimate task. '
+		msg = HumanMessage(content=content)
+		self._add_message_with_tokens(msg)
+		self.task = new_task
+	@time_execution_sync('--add_state_message')
+	def add_state_message(
+		self,
+		state: BrowserState,
+		result: Optional[List[ActionResult]] = None,
+		step_info: Optional[AgentStepInfo] = None,
+		use_vision=True,
+	) -> None:
+		"""Add browser state as human message"""
+		# if keep in memory, add to directly to history and add state without result
+		if result:
+			for r in result:
+				if r.include_in_memory:
+					if r.extracted_content:
+						msg = HumanMessage(content='Action result: ' + str(r.extracted_content))
+						self._add_message_with_tokens(msg)
+					if r.error:
+						# if endswith \n, remove it
+						if r.error.endswith('\n'):
+							r.error = r.error[:-1]
+						# get only last line of error
+						last_line = r.error.split('\n')[-1]
+						msg = HumanMessage(content='Action error: ' + last_line)
+						self._add_message_with_tokens(msg)
+					result = None  # if result in history, we dont want to add it again
+		# otherwise add state message and result to next message (which will not stay in memory)
+		state_message = AgentMessagePrompt(
+			state,
+			result,
+			include_attributes=self.settings.include_attributes,
+			step_info=step_info,
+		).get_user_message(use_vision)
+		self._add_message_with_tokens(state_message)
+	def add_model_output(self, model_output: AgentOutput) -> None:
+		"""Add model output as AI message"""
+		tool_calls = [
+			{
+				'name': 'AgentOutput',
+				'args': model_output.model_dump(mode='json', exclude_unset=True),
+				'id': str(self.state.tool_id),
+				'type': 'tool_call',
+			}
+		]
+		msg = AIMessage(
+			content='',
+			tool_calls=tool_calls,
+		)
+		self._add_message_with_tokens(msg)
+		# empty tool response
+		self.add_tool_message(content='')
+	def add_plan(self, plan: Optional[str], position: int | None = None) -> None:
+		if plan:
+			msg = AIMessage(content=plan)
+			self._add_message_with_tokens(msg, position)
+	@time_execution_sync('--get_messages')
+	def get_messages(self) -> List[BaseMessage]:
+		"""Get current message list, potentially trimmed to max tokens"""
+		msg = [m.message for m in self.state.history.messages]
+		# debug which messages are in history with token count # log
+		total_input_tokens = 0
+		logger.debug(f'Messages in history: {len(self.state.history.messages)}:')
+		for m in self.state.history.messages:
+			total_input_tokens += m.metadata.tokens
+			logger.debug(f'{m.message.__class__.__name__} - Token count: {m.metadata.tokens}')
+		logger.debug(f'Total input tokens: {total_input_tokens}')
+		return msg
+	def _add_message_with_tokens(self, message: BaseMessage, position: int | None = None) -> None:
+		"""Add message with token count metadata
+		position: None for last, -1 for second last, etc.
+		"""
+		# filter out sensitive data from the message
+		if self.settings.sensitive_data:
+			message = self._filter_sensitive_data(message)
+		token_count = self._count_tokens(message)
+		metadata = MessageMetadata(tokens=token_count)
+		self.state.history.add_message(message, metadata, position)
+	@time_execution_sync('--filter_sensitive_data')
+	def _filter_sensitive_data(self, message: BaseMessage) -> BaseMessage:
+		"""Filter out sensitive data from the message"""
+		def replace_sensitive(value: str) -> str:
+			if not self.settings.sensitive_data:
+				return value
+			for key, val in self.settings.sensitive_data.items():
+				if not val:
+					continue
+				value = value.replace(val, f'<secret>{key}</secret>')
+			return value
+		if isinstance(message.content, str):
+			message.content = replace_sensitive(message.content)
+		elif isinstance(message.content, list):
+			for i, item in enumerate(message.content):
+				if isinstance(item, dict) and 'text' in item:
+					item['text'] = replace_sensitive(item['text'])
+					message.content[i] = item
+		return message
+	def _count_tokens(self, message: BaseMessage) -> int:
+		"""Count tokens in a message using the model's tokenizer"""
+		tokens = 0
+		if isinstance(message.content, list):
+			for item in message.content:
+				if 'image_url' in item:
+					tokens += self.settings.image_tokens
+				elif isinstance(item, dict) and 'text' in item:
+					tokens += self._count_text_tokens(item['text'])
+		else:
+			msg = message.content
+			if hasattr(message, 'tool_calls'):
+				msg += str(message.tool_calls)  # type: ignore
+			tokens += self._count_text_tokens(msg)
+		return tokens
+	def _count_text_tokens(self, text: str) -> int:
+		"""Count tokens in a text string"""
+		tokens = len(text) // self.settings.estimated_characters_per_token  # Rough estimate if no tokenizer available
+		return tokens
+	def cut_messages(self):
+		"""Get current message list, potentially trimmed to max tokens"""
+		diff = self.state.history.current_tokens - self.settings.max_input_tokens
+		if diff <= 0:
+			return None
+		msg = self.state.history.messages[-1]
+		# if list with image remove image
+		if isinstance(msg.message.content, list):
+			text = ''
+			for item in msg.message.content:
+				if 'image_url' in item:
+					msg.message.content.remove(item)
+					diff -= self.settings.image_tokens
+					msg.metadata.tokens -= self.settings.image_tokens
+					self.state.history.current_tokens -= self.settings.image_tokens
+					logger.debug(
+						f'Removed image with {self.settings.image_tokens} tokens - total tokens now: {self.state.history.current_tokens}/{self.settings.max_input_tokens}'
+					)
+				elif 'text' in item and isinstance(item, dict):
+					text += item['text']
+			msg.message.content = text
+			self.state.history.messages[-1] = msg
+		if diff <= 0:
+			return None
+		# if still over, remove text from state message proportionally to the number of tokens needed with buffer
+		# Calculate the proportion of content to remove
+		proportion_to_remove = diff / msg.metadata.tokens
+		if proportion_to_remove > 0.99:
+			raise ValueError(
+				f'Max token limit reached - history is too long - reduce the system prompt or task. '
+				f'proportion_to_remove: {proportion_to_remove}'
+			)
+		logger.debug(
+			f'Removing {proportion_to_remove * 100:.2f}% of the last message  {proportion_to_remove * msg.metadata.tokens:.2f} / {msg.metadata.tokens:.2f} tokens)'
+		)
+		content = msg.message.content
+		characters_to_remove = int(len(content) * proportion_to_remove)
+		content = content[:-characters_to_remove]
+		# remove tokens and old long message
+		self.state.history.remove_last_state_message()
+		# new message with updated content
+		msg = HumanMessage(content=content)
+		self._add_message_with_tokens(msg)
+		last_msg = self.state.history.messages[-1]
+		logger.debug(
+			f'Added message with {last_msg.metadata.tokens} tokens - total tokens now: {self.state.history.current_tokens}/{self.settings.max_input_tokens} - total messages: {len(self.state.history.messages)}'
+		)
+	def _remove_last_state_message(self) -> None:
+		"""Remove last state message from history"""
+		self.state.history.remove_last_state_message()
+	def add_tool_message(self, content: str) -> None:
+		"""Add tool message to history"""
+		msg = ToolMessage(content=content, tool_call_id=str(self.state.tool_id))
+		self.state.tool_id += 1
+		self._add_message_with_tokens(msg)

browser_use/agent/message_manager/tests.py ADDED Viewed

	@@ -0,0 +1,237 @@

+import pytest
+from langchain_anthropic import ChatAnthropic
+from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
+from langchain_openai import AzureChatOpenAI, ChatOpenAI
+from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings
+from browser_use.agent.views import ActionResult
+from browser_use.browser.views import BrowserState, TabInfo
+from browser_use.dom.views import DOMElementNode, DOMTextNode
+@pytest.fixture(
+	params=[
+		ChatOpenAI(model='gpt-4o-mini'),
+		AzureChatOpenAI(model='gpt-4o', api_version='2024-02-15-preview'),
+		ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=100, temperature=0.0, stop=None),
+	],
+	ids=['gpt-4o-mini', 'gpt-4o', 'claude-3-5-sonnet'],
+)
+def message_manager(request: pytest.FixtureRequest):
+	task = 'Test task'
+	action_descriptions = 'Test actions'
+	return MessageManager(
+		task=task,
+		system_message=SystemMessage(content=action_descriptions),
+		settings=MessageManagerSettings(
+			max_input_tokens=1000,
+			estimated_characters_per_token=3,
+			image_tokens=800,
+		),
+	)
+def test_initial_messages(message_manager: MessageManager):
+	"""Test that message manager initializes with system and task messages"""
+	messages = message_manager.get_messages()
+	assert len(messages) == 2
+	assert isinstance(messages[0], SystemMessage)
+	assert isinstance(messages[1], HumanMessage)
+	assert 'Test task' in messages[1].content
+def test_add_state_message(message_manager: MessageManager):
+	"""Test adding browser state message"""
+	state = BrowserState(
+		url='https://test.com',
+		title='Test Page',
+		element_tree=DOMElementNode(
+			tag_name='div',
+			attributes={},
+			children=[],
+			is_visible=True,
+			parent=None,
+			xpath='//div',
+		),
+		selector_map={},
+		tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
+	)
+	message_manager.add_state_message(state)
+	messages = message_manager.get_messages()
+	assert len(messages) == 3
+	assert isinstance(messages[2], HumanMessage)
+	assert 'https://test.com' in messages[2].content
+def test_add_state_with_memory_result(message_manager: MessageManager):
+	"""Test adding state with result that should be included in memory"""
+	state = BrowserState(
+		url='https://test.com',
+		title='Test Page',
+		element_tree=DOMElementNode(
+			tag_name='div',
+			attributes={},
+			children=[],
+			is_visible=True,
+			parent=None,
+			xpath='//div',
+		),
+		selector_map={},
+		tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
+	)
+	result = ActionResult(extracted_content='Important content', include_in_memory=True)
+	message_manager.add_state_message(state, [result])
+	messages = message_manager.get_messages()
+	# Should have system, task, extracted content, and state messages
+	assert len(messages) == 4
+	assert 'Important content' in messages[2].content
+	assert isinstance(messages[2], HumanMessage)
+	assert isinstance(messages[3], HumanMessage)
+	assert 'Important content' not in messages[3].content
+def test_add_state_with_non_memory_result(message_manager: MessageManager):
+	"""Test adding state with result that should not be included in memory"""
+	state = BrowserState(
+		url='https://test.com',
+		title='Test Page',
+		element_tree=DOMElementNode(
+			tag_name='div',
+			attributes={},
+			children=[],
+			is_visible=True,
+			parent=None,
+			xpath='//div',
+		),
+		selector_map={},
+		tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
+	)
+	result = ActionResult(extracted_content='Temporary content', include_in_memory=False)
+	message_manager.add_state_message(state, [result])
+	messages = message_manager.get_messages()
+	# Should have system, task, and combined state+result message
+	assert len(messages) == 3
+	assert 'Temporary content' in messages[2].content
+	assert isinstance(messages[2], HumanMessage)
+@pytest.mark.skip('not sure how to fix this')
+@pytest.mark.parametrize('max_tokens', [100000, 10000, 5000])
+def test_token_overflow_handling_with_real_flow(message_manager: MessageManager, max_tokens):
+	"""
+	Test handling of token overflow in a realistic message flow.
+	Currently skipped. Runs 200 simulated steps with growing page content and
+	checks after each one that the token bookkeeping in the history is consistent.
+	"""
+	# Set more realistic token limit
+	message_manager.settings.max_input_tokens = max_tokens
+	# Create a long sequence of interactions
+	for i in range(200):  # Simulate 200 steps of interaction
+		# Create state with varying content length
+		state = BrowserState(
+			url=f'https://test{i}.com',
+			title=f'Test Page {i}',
+			element_tree=DOMElementNode(
+				tag_name='div',
+				attributes={},
+				children=[
+					DOMTextNode(
+						text=f'Content {j} ' * (10 + i),  # Increasing content length
+						is_visible=True,
+						parent=None,
+					)
+					for j in range(5)  # Multiple DOM items
+				],
+				is_visible=True,
+				parent=None,
+				xpath='//div',
+			),
+			selector_map={j: f'//div[{j}]' for j in range(5)},
+			tabs=[TabInfo(page_id=1, url=f'https://test{i}.com', title=f'Test Page {i}')],
+		)
+		# Alternate between different types of results
+		result = None
+		if i % 2 == 0:  # Every other iteration
+			result = ActionResult(
+				extracted_content=f'Important content from step {i}' * 5,
+				include_in_memory=i % 4 == 0,  # Include in memory every 4th message
+			)
+		# Add state message
+		if result:
+			message_manager.add_state_message(state, [result])
+		else:
+			message_manager.add_state_message(state)
+		# NOTE(review): get_messages as shown in service.py does not trim or raise;
+		# this handler presumably targets an earlier version - confirm.
+		try:
+			messages = message_manager.get_messages()
+		except ValueError as e:
+			if 'Max token limit reached - history is too long' in str(e):
+				return  # If error occurs, end the test
+			else:
+				raise e
+		# Allow a slack of 100 tokens above the configured limit
+		assert message_manager.state.history.current_tokens <= message_manager.settings.max_input_tokens + 100
+		last_msg = messages[-1]
+		assert isinstance(last_msg, HumanMessage)
+		# Steps with an in-memory result: the message before the state message is the stored result
+		if i % 4 == 0:
+			assert isinstance(message_manager.state.history.messages[-2].message, HumanMessage)
+		# Steps with a non-memory result: the state message itself must carry the url
+		if i % 2 == 0 and not i % 4 == 0:
+			if isinstance(last_msg.content, list):
+				assert 'Current url: https://test' in last_msg.content[0]['text']
+			else:
+				assert 'Current url: https://test' in last_msg.content
+		# Add model output every time
+		from browser_use.agent.views import AgentBrain, AgentOutput
+		from browser_use.controller.registry.views import ActionModel
+		output = AgentOutput(
+			current_state=AgentBrain(
+				evaluation_previous_goal=f'Success in step {i}',
+				memory=f'Memory from step {i}',
+				next_goal=f'Goal for step {i + 1}',
+			),
+			action=[ActionModel()],
+		)
+		# The state message is dropped before the model output is recorded
+		message_manager._remove_last_state_message()
+		message_manager.add_model_output(output)
+		# Get messages and verify after each addition
+		messages = [m.message for m in message_manager.state.history.messages]
+		# Verify token limit is respected
+		# Verify essential messages are preserved
+		assert isinstance(messages[0], SystemMessage)  # System prompt always first
+		assert isinstance(messages[1], HumanMessage)  # Task always second
+		assert 'Test task' in messages[1].content
+		# Verify structure of latest messages
+		assert isinstance(messages[-1], AIMessage)  # Last message should be model output
+		assert f'step {i}' in messages[-1].content  # Should contain current step info
+		# Log token usage for debugging (both values are only used by the commented-out print)
+		token_usage = message_manager.state.history.current_tokens
+		token_limit = message_manager.settings.max_input_tokens
+		# print(f'Step {i}: Using {token_usage}/{token_limit} tokens')
+		# go through all messages and verify that the token count and total tokens is correct
+		total_tokens = 0
+		real_tokens = []
+		stored_tokens = []
+		for msg in message_manager.state.history.messages:
+			total_tokens += msg.metadata.tokens
+			stored_tokens.append(msg.metadata.tokens)
+			# Recount from the message itself to compare against the stored metadata
+			real_tokens.append(message_manager._count_tokens(msg.message))
+		assert total_tokens == sum(real_tokens)
+		assert stored_tokens == real_tokens
+		assert message_manager.state.history.current_tokens == total_tokens
+# pytest -s browser_use/agent/message_manager/tests.py

browser_use/agent/message_manager/utils.py ADDED Viewed

	@@ -0,0 +1,127 @@

+from __future__ import annotations
+import json
+import logging
+import os
+from typing import Any, Optional, Type
+from langchain_core.messages import (
+	AIMessage,
+	BaseMessage,
+	HumanMessage,
+	SystemMessage,
+	ToolMessage,
+)
+logger = logging.getLogger(__name__)
+def extract_json_from_model_output(content: str) -> dict:
+	"""Extract JSON from model output, handling both plain JSON and code-block-wrapped JSON."""
+	try:
+		# If content is wrapped in code blocks, extract just the JSON part
+		if '```' in content:
+			# Find the JSON content between code blocks
+			content = content.split('```')[1]
+			# Remove language identifier if present (e.g., 'json\n')
+			if '\n' in content:
+				content = content.split('\n', 1)[1]
+		# Parse the cleaned content
+		return json.loads(content)
+	except json.JSONDecodeError as e:
+		logger.warning(f'Failed to parse model output: {content} {str(e)}')
+		raise ValueError('Could not parse response.')
+def convert_input_messages(input_messages: list[BaseMessage], model_name: Optional[str]) -> list[BaseMessage]:
+	"""Convert input messages to a format that is compatible with the planner model"""
+	if model_name is None:
+		return input_messages
+	if model_name == 'deepseek-reasoner' or 'deepseek-r1' in model_name:
+		converted_input_messages = _convert_messages_for_non_function_calling_models(input_messages)
+		merged_input_messages = _merge_successive_messages(converted_input_messages, HumanMessage)
+		merged_input_messages = _merge_successive_messages(merged_input_messages, AIMessage)
+		return merged_input_messages
+	return input_messages
+def _convert_messages_for_non_function_calling_models(input_messages: list[BaseMessage]) -> list[BaseMessage]:
+	"""Convert messages for non-function-calling models"""
+	output_messages = []
+	for message in input_messages:
+		if isinstance(message, HumanMessage):
+			output_messages.append(message)
+		elif isinstance(message, SystemMessage):
+			output_messages.append(message)
+		elif isinstance(message, ToolMessage):
+			output_messages.append(HumanMessage(content=message.content))
+		elif isinstance(message, AIMessage):
+			# check if tool_calls is a valid JSON object
+			if message.tool_calls:
+				tool_calls = json.dumps(message.tool_calls)
+				output_messages.append(AIMessage(content=tool_calls))
+			else:
+				output_messages.append(message)
+		else:
+			raise ValueError(f'Unknown message type: {type(message)}')
+	return output_messages
+def _merge_successive_messages(messages: list[BaseMessage], class_to_merge: Type[BaseMessage]) -> list[BaseMessage]:
+	"""Some models like deepseek-reasoner dont allow multiple human messages in a row. This function merges them into one."""
+	merged_messages = []
+	streak = 0
+	for message in messages:
+		if isinstance(message, class_to_merge):
+			streak += 1
+			if streak > 1:
+				if isinstance(message.content, list):
+					merged_messages[-1].content += message.content[0]['text']  # type:ignore
+				else:
+					merged_messages[-1].content += message.content
+			else:
+				merged_messages.append(message)
+		else:
+			merged_messages.append(message)
+			streak = 0
+	return merged_messages
+def save_conversation(input_messages: list[BaseMessage], response: Any, target: str, encoding: Optional[str] = None) -> None:
+	"""Save conversation history to file."""
+	# create folders if not exists
+	os.makedirs(os.path.dirname(target), exist_ok=True)
+	with open(
+		target,
+		'w',
+		encoding=encoding,
+	) as f:
+		_write_messages_to_file(f, input_messages)
+		_write_response_to_file(f, response)
+def _write_messages_to_file(f: Any, messages: list[BaseMessage]) -> None:
+	"""Write messages to conversation file"""
+	for message in messages:
+		f.write(f' {message.__class__.__name__} \n')
+		if isinstance(message.content, list):
+			for item in message.content:
+				if isinstance(item, dict) and item.get('type') == 'text':
+					f.write(item['text'].strip() + '\n')
+		elif isinstance(message.content, str):
+			try:
+				content = json.loads(message.content)
+				f.write(json.dumps(content, indent=2) + '\n')
+			except json.JSONDecodeError:
+				f.write(message.content.strip() + '\n')
+		f.write('\n')
+def _write_response_to_file(f: Any, response: Any) -> None:
+	"""Write model response to conversation file"""
+	f.write(' RESPONSE\n')
+	f.write(json.dumps(json.loads(response.model_dump_json(exclude_unset=True)), indent=2))

browser_use/agent/message_manager/views.py ADDED Viewed

	@@ -0,0 +1,129 @@

+from __future__ import annotations
+from typing import TYPE_CHECKING, Any
+from langchain_core.load import dumpd, load
+from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage
+from pydantic import BaseModel, ConfigDict, Field, model_serializer, model_validator
+if TYPE_CHECKING:
+	from browser_use.agent.views import AgentOutput
+class MessageMetadata(BaseModel):
+	"""Metadata for a message"""
+	# Token count attributed to the message; MessageHistory adds it to
+	# current_tokens when the message is added and subtracts it on removal.
+	tokens: int = 0
+class ManagedMessage(BaseModel):
+	"""A message with its metadata"""
+	message: BaseMessage
+	metadata: MessageMetadata = Field(default_factory=MessageMetadata)
+	model_config = ConfigDict(arbitrary_types_allowed=True)
+	# https://github.com/pydantic/pydantic/discussions/7558
+	@model_serializer(mode='wrap')
+	def to_json(self, original_dump):
+		"""
+		Returns the serialized representation of the model.
+		The default dump is produced first, then the `message` entry is
+		replaced by the output of langchain's `dumpd` for the message.
+		"""
+		data = original_dump(self)
+		# NOTE: We override the message field to use langchain serialization.
+		data['message'] = dumpd(self.message)
+		return data
+	@model_validator(mode='before')
+	@classmethod
+	def validate(
+		cls,
+		value: Any,
+		*,
+		strict: bool | None = None,
+		from_attributes: bool | None = None,
+		context: Any | None = None,
+	) -> Any:
+		"""
+		Custom validator that passes the `message` entry through langchain's
+		`load` before field validation runs.
+		Only dict input containing a 'message' key is touched, and that dict is
+		modified in place. strict, from_attributes and context are not used here.
+		"""
+		if isinstance(value, dict) and 'message' in value:
+			# NOTE: We use langchain's load to turn serialized data back into a BaseMessage object.
+			# NOTE(review): this also runs when 'message' already is a BaseMessage - presumably load returns it unchanged; confirm.
+			value['message'] = load(value['message'])
+		return value
+class MessageHistory(BaseModel):
+	"""History of messages with metadata"""
+	messages: list[ManagedMessage] = Field(default_factory=list)
+	current_tokens: int = 0
+	model_config = ConfigDict(arbitrary_types_allowed=True)
+	def add_message(self, message: BaseMessage, metadata: MessageMetadata, position: int | None = None) -> None:
+		"""Add message with metadata to history"""
+		if position is None:
+			self.messages.append(ManagedMessage(message=message, metadata=metadata))
+		else:
+			self.messages.insert(position, ManagedMessage(message=message, metadata=metadata))
+		self.current_tokens += metadata.tokens
+	def add_model_output(self, output: 'AgentOutput') -> None:
+		"""Add model output as AI message"""
+		tool_calls = [
+			{
+				'name': 'AgentOutput',
+				'args': output.model_dump(mode='json', exclude_unset=True),
+				'id': '1',
+				'type': 'tool_call',
+			}
+		]
+		msg = AIMessage(
+			content='',
+			tool_calls=tool_calls,
+		)
+		self.add_message(msg, MessageMetadata(tokens=100))  # Estimate tokens for tool calls
+		# Empty tool response
+		tool_message = ToolMessage(content='', tool_call_id='1')
+		self.add_message(tool_message, MessageMetadata(tokens=10))  # Estimate tokens for empty response
+	def get_messages(self) -> list[BaseMessage]:
+		"""Get all messages"""
+		return [m.message for m in self.messages]
+	def get_total_tokens(self) -> int:
+		"""Get total tokens in history"""
+		return self.current_tokens
+	def remove_oldest_message(self) -> None:
+		"""Remove oldest non-system message"""
+		for i, msg in enumerate(self.messages):
+			if not isinstance(msg.message, SystemMessage):
+				self.current_tokens -= msg.metadata.tokens
+				self.messages.pop(i)
+				break
+	def remove_last_state_message(self) -> None:
+		"""Remove last state message from history"""
+		if len(self.messages) > 2 and isinstance(self.messages[-1].message, HumanMessage):
+			self.current_tokens -= self.messages[-1].metadata.tokens
+			self.messages.pop()
+class MessageManagerState(BaseModel):
+	"""Holds the state for MessageManager"""
+	# Messages with metadata and the running token total; starts empty.
+	history: MessageHistory = Field(default_factory=MessageHistory)
+	# Id used for the next tool call / tool message pair; starts at 1.
+	tool_id: int = 1
+	model_config = ConfigDict(arbitrary_types_allowed=True)

browser_use/agent/prompts.py ADDED Viewed

	@@ -0,0 +1,165 @@

+import datetime
+import importlib.resources
+from datetime import datetime
+from typing import TYPE_CHECKING, List, Optional
+from langchain_core.messages import HumanMessage, SystemMessage
+if TYPE_CHECKING:
+	from browser_use.agent.views import ActionResult, AgentStepInfo
+	from browser_use.browser.views import BrowserState
+class SystemPrompt:
+	def __init__(
+		self,
+		action_description: str,
+		max_actions_per_step: int = 10,
+		override_system_message: Optional[str] = None,
+		extend_system_message: Optional[str] = None,
+	):
+		self.default_action_description = action_description
+		self.max_actions_per_step = max_actions_per_step
+		prompt = ''
+		if override_system_message:
+			prompt = override_system_message
+		else:
+			self._load_prompt_template()
+			prompt = self.prompt_template.format(max_actions=self.max_actions_per_step)
+		if extend_system_message:
+			prompt += f'\n{extend_system_message}'
+		self.system_message = SystemMessage(content=prompt)
+	def _load_prompt_template(self) -> None:
+		"""Load the prompt template from the markdown file."""
+		try:
+			# This works both in development and when installed as a package
+			with importlib.resources.files('browser_use.agent').joinpath('system_prompt.md').open('r') as f:
+				self.prompt_template = f.read()
+		except Exception as e:
+			raise RuntimeError(f'Failed to load system prompt template: {e}')
+	def get_system_message(self) -> SystemMessage:
+		"""
+		Get the system prompt for the agent.
+		Returns:
+		    SystemMessage: Formatted system prompt
+		"""
+		return self.system_message
+# Functions:
+# {self.default_action_description}
+# Example:
+# {self.example_response()}
+# Your AVAILABLE ACTIONS:
+# {self.default_action_description}
+class AgentMessagePrompt:
+	def __init__(
+		self,
+		state: 'BrowserState',
+		result: Optional[List['ActionResult']] = None,
+		include_attributes: list[str] = [],
+		step_info: Optional['AgentStepInfo'] = None,
+	):
+		self.state = state
+		self.result = result
+		self.include_attributes = include_attributes
+		self.step_info = step_info
+	def get_user_message(self, use_vision: bool = True) -> HumanMessage:
+		elements_text = self.state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes)
+		has_content_above = (self.state.pixels_above or 0) > 0
+		has_content_below = (self.state.pixels_below or 0) > 0
+		if elements_text != '':
+			if has_content_above:
+				elements_text = (
+					f'... {self.state.pixels_above} pixels above - scroll or extract content to see more ...\n{elements_text}'
+				)
+			else:
+				elements_text = f'[Start of page]\n{elements_text}'
+			if has_content_below:
+				elements_text = (
+					f'{elements_text}\n... {self.state.pixels_below} pixels below - scroll or extract content to see more ...'
+				)
+			else:
+				elements_text = f'{elements_text}\n[End of page]'
+		else:
+			elements_text = 'empty page'
+		if self.step_info:
+			step_info_description = f'Current step: {self.step_info.step_number + 1}/{self.step_info.max_steps}'
+		else:
+			step_info_description = ''
+		time_str = datetime.now().strftime('%Y-%m-%d %H:%M')
+		step_info_description += f'Current date and time: {time_str}'
+		state_description = f"""
+[Task history memory ends]
+[Current state starts here]
+The following is one-time information - if you need to remember it write it to memory:
+Current url: {self.state.url}
+Available tabs:
+{self.state.tabs}
+Interactive elements from top layer of the current page inside the viewport:
+{elements_text}
+{step_info_description}
+"""
+		if self.result:
+			for i, result in enumerate(self.result):
+				if result.extracted_content:
+					state_description += f'\nAction result {i + 1}/{len(self.result)}: {result.extracted_content}'
+				if result.error:
+					# only use last line of error
+					error = result.error.split('\n')[-1]
+					state_description += f'\nAction error {i + 1}/{len(self.result)}: ...{error}'
+		if self.state.screenshot and use_vision == True:
+			# Format message for vision model
+			return HumanMessage(
+				content=[
+					{'type': 'text', 'text': state_description},
+					{
+						'type': 'image_url',
+						'image_url': {'url': f'data:image/png;base64,{self.state.screenshot}'},  # , 'detail': 'low'
+					},
+				]
+			)
+		return HumanMessage(content=state_description)
+class PlannerPrompt(SystemPrompt):
+	def get_system_message(self) -> SystemMessage:
+		return SystemMessage(
+			content="""You are a planning agent that helps break down tasks into smaller steps and reason about the current state.
+Your role is to:
+1. Analyze the current state and history
+2. Evaluate progress towards the ultimate goal
+3. Identify potential challenges or roadblocks
+4. Suggest the next high-level steps to take
+Inside your messages, there will be AI messages from different agents with different formats.
+Your output format should be always a JSON object with the following fields:
+{
+    "state_analysis": "Brief analysis of the current state and what has been done so far",
+    "progress_evaluation": "Evaluation of progress towards the ultimate goal (as percentage and description)",
+    "challenges": "List any potential challenges or roadblocks",
+    "next_steps": "List 2-3 concrete next steps to take",
+    "reasoning": "Explain your reasoning for the suggested next steps"
+}
+Ignore the other AI messages output structures.
+Keep your responses concise and focused on actionable insights."""
+		)

browser_use/agent/service.py ADDED Viewed

	@@ -0,0 +1,964 @@

+from __future__ import annotations
+import asyncio
+import json
+import logging
+import re
+import time
+from pathlib import Path
+from typing import Any, Awaitable, Callable, Dict, Generic, List, Optional, TypeVar
+from dotenv import load_dotenv
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.messages import (
+	BaseMessage,
+	HumanMessage,
+	SystemMessage,
+)
+# from lmnr.sdk.decorators import observe
+from pydantic import BaseModel, ValidationError
+from browser_use.agent.gif import create_history_gif
+from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings
+from browser_use.agent.message_manager.utils import convert_input_messages, extract_json_from_model_output, save_conversation
+from browser_use.agent.prompts import AgentMessagePrompt, PlannerPrompt, SystemPrompt
+from browser_use.agent.views import (
+	ActionResult,
+	AgentError,
+	AgentHistory,
+	AgentHistoryList,
+	AgentOutput,
+	AgentSettings,
+	AgentState,
+	AgentStepInfo,
+	StepMetadata,
+	ToolCallingMethod,
+)
+from browser_use.browser.browser import Browser
+from browser_use.browser.context import BrowserContext
+from browser_use.browser.views import BrowserState, BrowserStateHistory
+from browser_use.controller.registry.views import ActionModel
+from browser_use.controller.service import Controller
+from browser_use.dom.history_tree_processor.service import (
+	DOMHistoryElement,
+	HistoryTreeProcessor,
+)
+from browser_use.telemetry.service import ProductTelemetry
+from browser_use.telemetry.views import (
+	AgentEndTelemetryEvent,
+	AgentRunTelemetryEvent,
+	AgentStepTelemetryEvent,
+)
+from browser_use.utils import time_execution_async, time_execution_sync
+load_dotenv()
+logger = logging.getLogger(__name__)
+def log_response(response: AgentOutput) -> None:
+	"""Utility function to log the model's response."""
+	if 'Success' in response.current_state.evaluation_previous_goal:
+		emoji = '👍'
+	elif 'Failed' in response.current_state.evaluation_previous_goal:
+		emoji = '⚠'
+	else:
+		emoji = '🤷'
+	logger.info(f'{emoji} Eval: {response.current_state.evaluation_previous_goal}')
+	logger.info(f'🧠 Memory: {response.current_state.memory}')
+	logger.info(f'🎯 Next goal: {response.current_state.next_goal}')
+	for i, action in enumerate(response.action):
+		logger.info(f'🛠️  Action {i + 1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}')
+# Type parameter for the user-supplied context object; Agent is generic over it and
+# uses it to type both its `controller` and `context` arguments.
+Context = TypeVar('Context')
+class Agent(Generic[Context]):
+	@time_execution_sync('--init (agent)')
+	def __init__(
+		self,
+		task: str,
+		llm: BaseChatModel,
+		# Optional parameters
+		browser: Browser | None = None,
+		browser_context: BrowserContext | None = None,
+		controller: Controller[Context] = Controller(),
+		# Initial agent run parameters
+		sensitive_data: Optional[Dict[str, str]] = None,
+		initial_actions: Optional[List[Dict[str, Dict[str, Any]]]] = None,
+		# Cloud Callbacks
+		register_new_step_callback: Callable[['BrowserState', 'AgentOutput', int], Awaitable[None]] | None = None,
+		register_done_callback: Callable[['AgentHistoryList'], Awaitable[None]] | None = None,
+		register_external_agent_status_raise_error_callback: Callable[[], Awaitable[bool]] | None = None,
+		# Agent settings
+		use_vision: bool = True,
+		use_vision_for_planner: bool = False,
+		save_conversation_path: Optional[str] = None,
+		save_conversation_path_encoding: Optional[str] = 'utf-8',
+		max_failures: int = 3,
+		retry_delay: int = 10,
+		override_system_message: Optional[str] = None,
+		extend_system_message: Optional[str] = None,
+		max_input_tokens: int = 128000,
+		validate_output: bool = False,
+		message_context: Optional[str] = None,
+		generate_gif: bool | str = False,
+		available_file_paths: Optional[list[str]] = None,
+		include_attributes: list[str] = [
+			'title',
+			'type',
+			'name',
+			'role',
+			'aria-label',
+			'placeholder',
+			'value',
+			'alt',
+			'aria-expanded',
+			'data-date-format',
+		],
+		max_actions_per_step: int = 10,
+		tool_calling_method: Optional[ToolCallingMethod] = 'auto',
+		page_extraction_llm: Optional[BaseChatModel] = None,
+		planner_llm: Optional[BaseChatModel] = None,
+		planner_interval: int = 1,  # Run planner every N steps
+		# Inject state
+		injected_agent_state: Optional[AgentState] = None,
+		#
+		context: Context | None = None,
+	):
+		"""Wire up settings, state, action models, message manager, browser and callbacks for one agent.
+		NOTE(review): the `controller` and `include_attributes` defaults are created once, when the
+		function is defined, so every Agent that omits them shares the same default objects.
+		"""
+		# Page extraction falls back to the main LLM when no dedicated model is given.
+		if page_extraction_llm is None:
+			page_extraction_llm = llm
+		# Core components
+		self.task = task
+		self.llm = llm
+		self.controller = controller
+		self.sensitive_data = sensitive_data
+		self.settings = AgentSettings(
+			use_vision=use_vision,
+			use_vision_for_planner=use_vision_for_planner,
+			save_conversation_path=save_conversation_path,
+			save_conversation_path_encoding=save_conversation_path_encoding,
+			max_failures=max_failures,
+			retry_delay=retry_delay,
+			override_system_message=override_system_message,
+			extend_system_message=extend_system_message,
+			max_input_tokens=max_input_tokens,
+			validate_output=validate_output,
+			message_context=message_context,
+			generate_gif=generate_gif,
+			available_file_paths=available_file_paths,
+			include_attributes=include_attributes,
+			max_actions_per_step=max_actions_per_step,
+			tool_calling_method=tool_calling_method,
+			page_extraction_llm=page_extraction_llm,
+			planner_llm=planner_llm,
+			planner_interval=planner_interval,
+		)
+		# Initialize state
+		self.state = injected_agent_state or AgentState()
+		# Action setup
+		self._setup_action_models()
+		self._set_browser_use_version_and_source()
+		self.initial_actions = self._convert_initial_actions(initial_actions) if initial_actions else None
+		# Model setup
+		self._set_model_names()
+		# for models without tool calling, add available actions to context
+		self.available_actions = self.controller.registry.get_prompt_description()
+		# Order matters: _set_tool_calling_method reads the model names set above, and
+		# _set_message_context reads self.tool_calling_method and self.available_actions.
+		self.tool_calling_method = self._set_tool_calling_method()
+		self.settings.message_context = self._set_message_context()
+		# Initialize message manager with state
+		self._message_manager = MessageManager(
+			task=task,
+			system_message=SystemPrompt(
+				action_description=self.available_actions,
+				max_actions_per_step=self.settings.max_actions_per_step,
+				override_system_message=override_system_message,
+				extend_system_message=extend_system_message,
+			).get_system_message(),
+			settings=MessageManagerSettings(
+				max_input_tokens=self.settings.max_input_tokens,
+				include_attributes=self.settings.include_attributes,
+				message_context=self.settings.message_context,
+				sensitive_data=sensitive_data,
+				available_file_paths=self.settings.available_file_paths,
+			),
+			state=self.state.message_manager_state,
+		)
+		# Browser setup
+		# The injected_* flags are read in run() to decide whether this agent closes the browser/context.
+		self.injected_browser = browser is not None
+		self.injected_browser_context = browser_context is not None
+		# A new Browser is created only when neither a browser nor a browser context was supplied.
+		self.browser = browser if browser is not None else (None if browser_context else Browser())
+		if browser_context:
+			self.browser_context = browser_context
+		elif self.browser:
+			self.browser_context = BrowserContext(browser=self.browser, config=self.browser.config.new_context_config)
+		else:
+			# NOTE(review): appears reachable only if a Browser instance is falsy - confirm whether this branch is needed.
+			self.browser = Browser()
+			self.browser_context = BrowserContext(browser=self.browser)
+		# Callbacks
+		self.register_new_step_callback = register_new_step_callback
+		self.register_done_callback = register_done_callback
+		self.register_external_agent_status_raise_error_callback = register_external_agent_status_raise_error_callback
+		# Context
+		self.context = context
+		# Telemetry
+		self.telemetry = ProductTelemetry()
+		if self.settings.save_conversation_path:
+			logger.info(f'Saving conversation to {self.settings.save_conversation_path}')
+	def _set_message_context(self) -> str | None:
+		if self.tool_calling_method == 'raw':
+			if self.settings.message_context:
+				self.settings.message_context += f'\n\nAvailable actions: {self.available_actions}'
+			else:
+				self.settings.message_context = f'Available actions: {self.available_actions}'
+		return self.settings.message_context
+	def _set_browser_use_version_and_source(self) -> None:
+		"""Get the version and source of the browser-use package (git or pip in a nutshell)"""
+		try:
+			# First check for repository-specific files
+			repo_files = ['.git', 'README.md', 'docs', 'examples']
+			package_root = Path(__file__).parent.parent.parent
+			# If all of these files/dirs exist, it's likely from git
+			if all(Path(package_root / file).exists() for file in repo_files):
+				try:
+					import subprocess
+					version = subprocess.check_output(['git', 'describe', '--tags']).decode('utf-8').strip()
+				except Exception:
+					version = 'unknown'
+				source = 'git'
+			else:
+				# If no repo files found, try getting version from pip
+				import pkg_resources
+				version = pkg_resources.get_distribution('browser-use').version
+				source = 'pip'
+		except Exception:
+			version = 'unknown'
+			source = 'unknown'
+		logger.debug(f'Version: {version}, Source: {source}')
+		self.version = version
+		self.source = source
+	def _set_model_names(self) -> None:
+		self.chat_model_library = self.llm.__class__.__name__
+		self.model_name = 'Unknown'
+		if hasattr(self.llm, 'model_name'):
+			model = self.llm.model_name  # type: ignore
+			self.model_name = model if model is not None else 'Unknown'
+		elif hasattr(self.llm, 'model'):
+			model = self.llm.model  # type: ignore
+			self.model_name = model if model is not None else 'Unknown'
+		if self.settings.planner_llm:
+			if hasattr(self.settings.planner_llm, 'model_name'):
+				self.planner_model_name = self.settings.planner_llm.model_name  # type: ignore
+			elif hasattr(self.settings.planner_llm, 'model'):
+				self.planner_model_name = self.settings.planner_llm.model  # type: ignore
+			else:
+				self.planner_model_name = 'Unknown'
+		else:
+			self.planner_model_name = None
+	def _setup_action_models(self) -> None:
+		"""Setup dynamic action models from controller's registry"""
+		self.ActionModel = self.controller.registry.create_action_model()
+		# Create output model with the dynamic actions
+		self.AgentOutput = AgentOutput.type_with_custom_actions(self.ActionModel)
+		# used to force the done action when max_steps is reached
+		self.DoneActionModel = self.controller.registry.create_action_model(include_actions=['done'])
+		self.DoneAgentOutput = AgentOutput.type_with_custom_actions(self.DoneActionModel)
+	def _set_tool_calling_method(self) -> Optional[ToolCallingMethod]:
+		tool_calling_method = self.settings.tool_calling_method
+		if tool_calling_method == 'auto':
+			if 'deepseek-reasoner' in self.model_name or 'deepseek-r1' in self.model_name:
+				return 'raw'
+			elif self.chat_model_library == 'ChatGoogleGenerativeAI':
+				return None
+			elif self.chat_model_library == 'ChatOpenAI':
+				return 'function_calling'
+			elif self.chat_model_library == 'AzureChatOpenAI':
+				return 'function_calling'
+			else:
+				return None
+		else:
+			return tool_calling_method
+	def add_new_task(self, new_task: str) -> None:
+		self._message_manager.add_new_task(new_task)
+	async def _raise_if_stopped_or_paused(self) -> None:
+		"""Utility function that raises an InterruptedError if the agent is stopped or paused."""
+		if self.register_external_agent_status_raise_error_callback:
+			if await self.register_external_agent_status_raise_error_callback():
+				raise InterruptedError
+		if self.state.stopped or self.state.paused:
+			logger.debug('Agent paused after getting state')
+			raise InterruptedError
+	# @observe(name='agent.step', ignore_output=True, ignore_input=True)
+	@time_execution_async('--step (agent)')
+	async def step(self, step_info: Optional[AgentStepInfo] = None) -> None:
+		"""Execute one step of the task
+		Reads the browser state, asks the LLM for the next actions, executes them and
+		records telemetry plus a history item. Errors are converted into an error
+		ActionResult stored in ``self.state.last_result`` instead of being raised.
+		"""
+		logger.info(f'📍 Step {self.state.n_steps}')
+		state = None
+		model_output = None
+		result: list[ActionResult] = []
+		step_start_time = time.time()
+		tokens = 0
+		try:
+			state = await self.browser_context.get_state()
+			await self._raise_if_stopped_or_paused()
+			self._message_manager.add_state_message(state, self.state.last_result, step_info, self.settings.use_vision)
+			# Run planner at specified intervals if planner is configured
+			if self.settings.planner_llm and self.state.n_steps % self.settings.planner_interval == 0:
+				plan = await self._run_planner()
+				# add plan before last state message
+				self._message_manager.add_plan(plan, position=-1)
+			if step_info and step_info.is_last_step():
+				# Add last step warning if needed
+				msg = 'Now comes your last step. Use only the "done" action now. No other actions - so here your action sequence must have length 1.'
+				msg += '\nIf the task is not yet fully finished as requested by the user, set success in "done" to false! E.g. if not all steps are fully completed.'
+				msg += '\nIf the task is fully finished, set success in "done" to true.'
+				msg += '\nInclude everything you found out for the ultimate task in the done text.'
+				logger.info('Last step finishing up')
+				self._message_manager._add_message_with_tokens(HumanMessage(content=msg))
+				# From here on the output model only allows the 'done' action.
+				self.AgentOutput = self.DoneAgentOutput
+			input_messages = self._message_manager.get_messages()
+			# Token count of the history as sent to the model; stored in the step metadata below.
+			tokens = self._message_manager.state.history.current_tokens
+			try:
+				model_output = await self.get_next_action(input_messages)
+				self.state.n_steps += 1
+				if self.register_new_step_callback:
+					await self.register_new_step_callback(state, model_output, self.state.n_steps)
+				if self.settings.save_conversation_path:
+					target = self.settings.save_conversation_path + f'_{self.state.n_steps}.txt'
+					save_conversation(input_messages, model_output, target, self.settings.save_conversation_path_encoding)
+				self._message_manager._remove_last_state_message()  # we dont want the whole state in the chat history
+				await self._raise_if_stopped_or_paused()
+				self._message_manager.add_model_output(model_output)
+			except Exception as e:
+				# model call failed, remove last state message from history
+				self._message_manager._remove_last_state_message()
+				raise e
+			result: list[ActionResult] = await self.multi_act(model_output.action)
+			self.state.last_result = result
+			if len(result) > 0 and result[-1].is_done:
+				logger.info(f'📄 Result: {result[-1].extracted_content}')
+			# A step that ran to the end resets the consecutive failure counter.
+			self.state.consecutive_failures = 0
+		except InterruptedError:
+			logger.debug('Agent paused')
+			self.state.last_result = [
+				ActionResult(
+					error='The agent was paused - now continuing actions might need to be repeated', include_in_memory=True
+				)
+			]
+			return
+		except Exception as e:
+			result = await self._handle_step_error(e)
+			self.state.last_result = result
+		finally:
+			# Runs on every exit path, including the early return on interruption.
+			step_end_time = time.time()
+			actions = [a.model_dump(exclude_unset=True) for a in model_output.action] if model_output else []
+			self.telemetry.capture(
+				AgentStepTelemetryEvent(
+					agent_id=self.state.agent_id,
+					step=self.state.n_steps,
+					actions=actions,
+					consecutive_failures=self.state.consecutive_failures,
+					step_error=[r.error for r in result if r.error] if result else ['No result'],
+				)
+			)
+			# NOTE(review): a `return` inside `finally` discards any exception still propagating - confirm this is intended.
+			if not result:
+				return
+			# A history item is only recorded when the browser state was obtained.
+			if state:
+				metadata = StepMetadata(
+					step_number=self.state.n_steps,
+					step_start_time=step_start_time,
+					step_end_time=step_end_time,
+					input_tokens=tokens,
+				)
+				self._make_history_item(model_output, state, result, metadata)
+	@time_execution_async('--handle_step_error (agent)')
+	async def _handle_step_error(self, error: Exception) -> list[ActionResult]:
+		"""Handle all types of errors that can occur during a step"""
+		include_trace = logger.isEnabledFor(logging.DEBUG)
+		error_msg = AgentError.format_error(error, include_trace=include_trace)
+		prefix = f'❌ Result failed {self.state.consecutive_failures + 1}/{self.settings.max_failures} times:\n '
+		if isinstance(error, (ValidationError, ValueError)):
+			logger.error(f'{prefix}{error_msg}')
+			if 'Max token limit reached' in error_msg:
+				# cut tokens from history
+				self._message_manager.settings.max_input_tokens = self.settings.max_input_tokens - 500
+				logger.info(
+					f'Cutting tokens from history - new max input tokens: {self._message_manager.settings.max_input_tokens}'
+				)
+				self._message_manager.cut_messages()
+			elif 'Could not parse response' in error_msg:
+				# give model a hint how output should look like
+				error_msg += '\n\nReturn a valid JSON object with the required fields.'
+			self.state.consecutive_failures += 1
+		else:
+			from google.api_core.exceptions import ResourceExhausted
+			from openai import RateLimitError
+			if isinstance(error, RateLimitError) or isinstance(error, ResourceExhausted):
+				logger.warning(f'{prefix}{error_msg}')
+				await asyncio.sleep(self.settings.retry_delay)
+				self.state.consecutive_failures += 1
+			else:
+				logger.error(f'{prefix}{error_msg}')
+				self.state.consecutive_failures += 1
+		return [ActionResult(error=error_msg, include_in_memory=True)]
+	def _make_history_item(
+		self,
+		model_output: AgentOutput | None,
+		state: BrowserState,
+		result: list[ActionResult],
+		metadata: Optional[StepMetadata] = None,
+	) -> None:
+		"""Create and store history item"""
+		if model_output:
+			interacted_elements = AgentHistory.get_interacted_element(model_output, state.selector_map)
+		else:
+			interacted_elements = [None]
+		state_history = BrowserStateHistory(
+			url=state.url,
+			title=state.title,
+			tabs=state.tabs,
+			interacted_element=interacted_elements,
+			screenshot=state.screenshot,
+		)
+		history_item = AgentHistory(model_output=model_output, result=result, state=state_history, metadata=metadata)
+		self.state.history.history.append(history_item)
+	# Matches a complete <think>...</think> section; DOTALL lets it span several lines.
+	THINK_TAGS = re.compile(r'<think>.*?</think>', re.DOTALL)
+	# Matches everything up to and including a closing </think> that has no opening tag.
+	STRAY_CLOSE_TAG = re.compile(r'.*?</think>', re.DOTALL)
+	def _remove_think_tags(self, text: str) -> str:
+		# Step 1: Remove well-formed <think>...</think>
+		text = re.sub(self.THINK_TAGS, '', text)
+		# Step 2: If there's an unmatched closing tag </think>,
+		#         remove everything up to and including that.
+		text = re.sub(self.STRAY_CLOSE_TAG, '', text)
+		return text.strip()
+	def _convert_input_messages(self, input_messages: list[BaseMessage]) -> list[BaseMessage]:
+		"""Convert input messages to the correct format"""
+		if self.model_name == 'deepseek-reasoner' or 'deepseek-r1' in self.model_name:
+			return convert_input_messages(input_messages, self.model_name)
+		else:
+			return input_messages
+	@time_execution_async('--get_next_action (agent)')
+	async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
+		"""Get next action from LLM based on current state"""
+		input_messages = self._convert_input_messages(input_messages)
+		if self.tool_calling_method == 'raw':
+			output = self.llm.invoke(input_messages)
+			# TODO: currently invoke does not return reasoning_content, we should override invoke
+			output.content = self._remove_think_tags(str(output.content))
+			try:
+				parsed_json = extract_json_from_model_output(output.content)
+				parsed = self.AgentOutput(**parsed_json)
+			except (ValueError, ValidationError) as e:
+				logger.warning(f'Failed to parse model output: {output} {str(e)}')
+				raise ValueError('Could not parse response.')
+		elif self.tool_calling_method is None:
+			structured_llm = self.llm.with_structured_output(self.AgentOutput, include_raw=True)
+			response: dict[str, Any] = await structured_llm.ainvoke(input_messages)  # type: ignore
+			parsed: AgentOutput | None = response['parsed']
+		else:
+			structured_llm = self.llm.with_structured_output(self.AgentOutput, include_raw=True, method=self.tool_calling_method)
+			response: dict[str, Any] = await structured_llm.ainvoke(input_messages)  # type: ignore
+			parsed: AgentOutput | None = response['parsed']
+		if parsed is None:
+			raise ValueError('Could not parse response.')
+		# cut the number of actions to max_actions_per_step if needed
+		if len(parsed.action) > self.settings.max_actions_per_step:
+			parsed.action = parsed.action[: self.settings.max_actions_per_step]
+		log_response(parsed)
+		return parsed
+	def _log_agent_run(self) -> None:
+		"""Log the agent run"""
+		logger.info(f'🚀 Starting task: {self.task}')
+		logger.debug(f'Version: {self.version}, Source: {self.source}')
+		self.telemetry.capture(
+			AgentRunTelemetryEvent(
+				agent_id=self.state.agent_id,
+				use_vision=self.settings.use_vision,
+				task=self.task,
+				model_name=self.model_name,
+				chat_model_library=self.chat_model_library,
+				version=self.version,
+				source=self.source,
+			)
+		)
+	async def take_step(self) -> tuple[bool, bool]:
+		"""Take a step
+		Returns:
+			Tuple[bool, bool]: (is_done, is_valid)
+		"""
+		await self.step()
+		if self.state.history.is_done():
+			if self.settings.validate_output:
+				if not await self._validate_output():
+					return True, False
+			await self.log_completion()
+			if self.register_done_callback:
+				await self.register_done_callback(self.state.history)
+			return True, True
+		return False, False
+	# @observe(name='agent.run', ignore_output=True)
+	@time_execution_async('--run (agent)')
+	async def run(self, max_steps: int = 100) -> AgentHistoryList:
+		"""Execute the task with maximum number of steps
+		Args:
+		    max_steps: Upper bound on the number of step iterations.
+		Returns:
+		    The accumulated agent history (``self.state.history``).
+		"""
+		try:
+			self._log_agent_run()
+			# Execute initial actions if provided
+			if self.initial_actions:
+				result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
+				self.state.last_result = result
+			for step in range(max_steps):
+				# Check if we should stop due to too many failures
+				if self.state.consecutive_failures >= self.settings.max_failures:
+					logger.error(f'❌ Stopping due to {self.settings.max_failures} consecutive failures')
+					break
+				# Check control flags before each step
+				if self.state.stopped:
+					logger.info('Agent stopped')
+					break
+				while self.state.paused:
+					await asyncio.sleep(0.2)  # Small delay to prevent CPU spinning
+					if self.state.stopped:  # Allow stopping while paused
+						# Leaves only the pause loop; self.step() below is still called for this iteration.
+						break
+				step_info = AgentStepInfo(step_number=step, max_steps=max_steps)
+				await self.step(step_info)
+				if self.state.history.is_done():
+					# Validation is skipped on the final iteration, where no further step could follow.
+					if self.settings.validate_output and step < max_steps - 1:
+						if not await self._validate_output():
+							continue
+					await self.log_completion()
+					break
+			else:
+				# for/else: reached only when the loop ran through all max_steps without a break.
+				logger.info('❌ Failed to complete task in maximum steps')
+			return self.state.history
+		finally:
+			# Cleanup runs whether the loop finished, broke out, or raised.
+			self.telemetry.capture(
+				AgentEndTelemetryEvent(
+					agent_id=self.state.agent_id,
+					is_done=self.state.history.is_done(),
+					success=self.state.history.is_successful(),
+					steps=self.state.n_steps,
+					max_steps_reached=self.state.n_steps >= max_steps,
+					errors=self.state.history.errors(),
+					total_input_tokens=self.state.history.total_input_tokens(),
+					total_duration_seconds=self.state.history.total_duration_seconds(),
+				)
+			)
+			# Only close the browser context / browser that this agent created itself.
+			if not self.injected_browser_context:
+				await self.browser_context.close()
+			if not self.injected_browser and self.browser:
+				await self.browser.close()
+			if self.settings.generate_gif:
+				# generate_gif may be True (default file name) or a string giving the output path.
+				output_path: str = 'agent_history.gif'
+				if isinstance(self.settings.generate_gif, str):
+					output_path = self.settings.generate_gif
+				create_history_gif(task=self.task, history=self.state.history, output_path=output_path)
+	# @observe(name='controller.multi_act')
+	@time_execution_async('--multi-act (agent)')
+	async def multi_act(
+		self,
+		actions: list[ActionModel],
+		check_for_new_elements: bool = True,
+	) -> list[ActionResult]:
+		"""Execute multiple actions
+		Actions run in order until one reports done or an error, the list is exhausted,
+		or new elements are detected before an index-based action.
+		Args:
+		    actions: Actions to execute in sequence.
+		    check_for_new_elements: When true, stop before an index-based action if the page shows new elements.
+		Returns:
+		    The results collected so far, one per executed action (plus a notice when stopped early).
+		"""
+		results = []
+		# Snapshot of the element path hashes before any action runs; later states are compared to it.
+		cached_selector_map = await self.browser_context.get_selector_map()
+		cached_path_hashes = set(e.hash.branch_path_hash for e in cached_selector_map.values())
+		await self.browser_context.remove_highlights()
+		for i, action in enumerate(actions):
+			# Only actions that carry an element index are checked, and never the first action.
+			if action.get_index() is not None and i != 0:
+				new_state = await self.browser_context.get_state()
+				new_path_hashes = set(e.hash.branch_path_hash for e in new_state.selector_map.values())
+				if check_for_new_elements and not new_path_hashes.issubset(cached_path_hashes):
+					# next action requires index but there are new elements on the page
+					msg = f'Something new appeared after action {i} / {len(actions)}'
+					logger.info(msg)
+					results.append(ActionResult(extracted_content=msg, include_in_memory=True))
+					break
+			await self._raise_if_stopped_or_paused()
+			result = await self.controller.act(
+				action,
+				self.browser_context,
+				self.settings.page_extraction_llm,
+				self.sensitive_data,
+				self.settings.available_file_paths,
+				context=self.context,
+			)
+			results.append(result)
+			logger.debug(f'Executed action {i + 1} / {len(actions)}')
+			# Stop on done, on error, or after the last action (which also skips the trailing wait).
+			if results[-1].is_done or results[-1].error or i == len(actions) - 1:
+				break
+			await asyncio.sleep(self.browser_context.config.wait_between_actions)
+			# hash all elements. if it is a subset of cached_state its fine - else break (new elements on page)
+		return results
+	async def _validate_output(self) -> bool:
+		"""Validate the output of the last action is what the user wanted
+		Returns:
+		    True when the validator accepts the result or no browser session exists; False otherwise.
+		    On rejection the validator's reason is stored in ``self.state.last_result``.
+		"""
+		system_msg = (
+			f'You are a validator of an agent who interacts with a browser. '
+			f'Validate if the output of last action is what the user wanted and if the task is completed. '
+			f'If the task is unclear defined, you can let it pass. But if something is missing or the image does not show what was requested dont let it pass. '
+			f'Try to understand the page and help the model with suggestions like scroll, do x, ... to get the solution right. '
+			f'Task to validate: {self.task}. Return a JSON object with 2 keys: is_valid and reason. '
+			f'is_valid is a boolean that indicates if the output is correct. '
+			f'reason is a string that explains why it is valid or not.'
+			f' example: {{"is_valid": false, "reason": "The user wanted to search for "cat photos", but the agent searched for "dog photos" instead."}}'
+		)
+		if self.browser_context.session:
+			# The validator sees the same kind of state message the agent gets, built from a fresh state.
+			state = await self.browser_context.get_state()
+			content = AgentMessagePrompt(
+				state=state,
+				result=self.state.last_result,
+				include_attributes=self.settings.include_attributes,
+			)
+			msg = [SystemMessage(content=system_msg), content.get_user_message(self.settings.use_vision)]
+		else:
+			# if no browser session, we can't validate the output
+			return True
+		class ValidationResult(BaseModel):
+			"""
+			Validation results.
+			"""
+			is_valid: bool
+			reason: str
+		validator = self.llm.with_structured_output(ValidationResult, include_raw=True)
+		response: dict[str, Any] = await validator.ainvoke(msg)  # type: ignore
+		# NOTE(review): with include_raw=True `parsed` is presumably None when the model output
+		# cannot be parsed; there is no None check before the attribute access below - confirm.
+		parsed: ValidationResult = response['parsed']
+		is_valid = parsed.is_valid
+		if not is_valid:
+			logger.info(f'❌ Validator decision: {parsed.reason}')
+			# `msg` is reused here for the feedback text that becomes the next last_result.
+			msg = f'The output is not yet correct. {parsed.reason}.'
+			self.state.last_result = [ActionResult(extracted_content=msg, include_in_memory=True)]
+		else:
+			logger.info(f'✅ Validator decision: {parsed.reason}')
+		return is_valid
+	async def log_completion(self) -> None:
+		"""Log the completion of the task"""
+		logger.info('✅ Task completed')
+		if self.state.history.is_successful():
+			logger.info('✅ Successfully')
+		else:
+			logger.info('❌ Unfinished')
+		if self.register_done_callback:
+			await self.register_done_callback(self.state.history)
+	async def rerun_history(
+		self,
+		history: AgentHistoryList,
+		max_retries: int = 3,
+		skip_failures: bool = True,
+		delay_between_actions: float = 2.0,
+	) -> list[ActionResult]:
+		"""
+		Rerun a saved history of actions with error handling and retry logic.
+		Args:
+				history: The history to replay
+				max_retries: Maximum number of retries per action
+				skip_failures: Whether to skip failed actions or stop execution
+				delay_between_actions: Delay between actions in seconds
+		Returns:
+				List of action results
+		"""
+		# Execute initial actions if provided
+		if self.initial_actions:
+			result = await self.multi_act(self.initial_actions)
+			self.state.last_result = result
+		results = []
+		for i, history_item in enumerate(history.history):
+			goal = history_item.model_output.current_state.next_goal if history_item.model_output else ''
+			logger.info(f'Replaying step {i + 1}/{len(history.history)}: goal: {goal}')
+			if (
+				not history_item.model_output
+				or not history_item.model_output.action
+				or history_item.model_output.action == [None]
+			):
+				logger.warning(f'Step {i + 1}: No action to replay, skipping')
+				results.append(ActionResult(error='No action to replay'))
+				continue
+			retry_count = 0
+			while retry_count < max_retries:
+				try:
+					result = await self._execute_history_step(history_item, delay_between_actions)
+					results.extend(result)
+					break
+				except Exception as e:
+					retry_count += 1
+					if retry_count == max_retries:
+						error_msg = f'Step {i + 1} failed after {max_retries} attempts: {str(e)}'
+						logger.error(error_msg)
+						if not skip_failures:
+							results.append(ActionResult(error=error_msg))
+							raise RuntimeError(error_msg)
+					else:
+						logger.warning(f'Step {i + 1} failed (attempt {retry_count}/{max_retries}), retrying...')
+						await asyncio.sleep(delay_between_actions)
+		return results
+	async def _execute_history_step(self, history_item: AgentHistory, delay: float) -> list[ActionResult]:
+		"""Execute a single step from history with element validation"""
+		state = await self.browser_context.get_state()
+		if not state or not history_item.model_output:
+			raise ValueError('Invalid state or model output')
+		updated_actions = []
+		for i, action in enumerate(history_item.model_output.action):
+			updated_action = await self._update_action_indices(
+				history_item.state.interacted_element[i],
+				action,
+				state,
+			)
+			updated_actions.append(updated_action)
+			if updated_action is None:
+				raise ValueError(f'Could not find matching element {i} in current page')
+		result = await self.multi_act(updated_actions)
+		await asyncio.sleep(delay)
+		return result
+	async def _update_action_indices(
+		self,
+		historical_element: Optional[DOMHistoryElement],
+		action: ActionModel,  # Type this properly based on your action model
+		current_state: BrowserState,
+	) -> Optional[ActionModel]:
+		"""
+		Update action indices based on current page state.
+		Returns updated action or None if element cannot be found.
+		Args:
+				historical_element: Element recorded for this action in the history, or None
+				action: The recorded action; its index is updated in place when the element moved
+				current_state: Current browser state whose element_tree is searched
+		"""
+		# nothing to match against: replay the action unchanged
+		if not historical_element or not current_state.element_tree:
+			return action
+		current_element = HistoryTreeProcessor.find_history_element_in_tree(historical_element, current_state.element_tree)
+		# element missing from the tree, or present without a highlight index: signal failure to the caller
+		if not current_element or current_element.highlight_index is None:
+			return None
+		old_index = action.get_index()
+		if old_index != current_element.highlight_index:
+			action.set_index(current_element.highlight_index)
+			logger.info(f'Element moved in DOM, updated index from {old_index} to {current_element.highlight_index}')
+		return action
+	async def load_and_rerun(self, history_file: Optional[str | Path] = None, **kwargs) -> list[ActionResult]:
+		"""
+		Load history from file and rerun it.
+		Args:
+				history_file: Path to the history file
+				**kwargs: Additional arguments passed to rerun_history
+		"""
+		if not history_file:
+			history_file = 'AgentHistory.json'
+		history = AgentHistoryList.load_from_file(history_file, self.AgentOutput)
+		return await self.rerun_history(history, **kwargs)
+	def save_history(self, file_path: Optional[str | Path] = None) -> None:
+		"""Save the history to a file"""
+		if not file_path:
+			file_path = 'AgentHistory.json'
+		self.state.history.save_to_file(file_path)
+	def pause(self) -> None:
+		"""Pause the agent before the next step"""
+		logger.info('🔄 pausing Agent ')
+		self.state.paused = True
+	def resume(self) -> None:
+		"""Resume the agent"""
+		logger.info('▶️ Agent resuming')
+		self.state.paused = False
+	def stop(self) -> None:
+		"""Stop the agent"""
+		logger.info('⏹️ Agent stopping')
+		self.state.stopped = True
+	def _convert_initial_actions(self, actions: List[Dict[str, Dict[str, Any]]]) -> List[ActionModel]:
+		"""Convert dictionary-based actions to ActionModel instances"""
+		converted_actions = []
+		action_model = self.ActionModel
+		for action_dict in actions:
+			# Each action_dict should have a single key-value pair
+			action_name = next(iter(action_dict))
+			params = action_dict[action_name]
+			# Get the parameter model for this action from registry
+			action_info = self.controller.registry.registry.actions[action_name]
+			param_model = action_info.param_model
+			# Create validated parameters using the appropriate param model
+			validated_params = param_model(**params)
+			# Create ActionModel instance with the validated parameters
+			action_model = self.ActionModel(**{action_name: validated_params})
+			converted_actions.append(action_model)
+		return converted_actions
+	async def _run_planner(self) -> Optional[str]:
+		"""Run the planner to analyze state and suggest next steps.
+		Returns:
+				The planner's reply as a string, or None when no planner_llm is configured
+		"""
+		# Skip planning if no planner_llm is set
+		if not self.settings.planner_llm:
+			return None
+		# Create planner message history using full message history
+		planner_messages = [
+			PlannerPrompt(self.controller.registry.get_prompt_description()).get_system_message(),
+			*self._message_manager.get_messages()[1:],  # Use full message history except the first
+		]
+		# The agent uses vision but the planner must not: rebuild the last message from its text parts only
+		if not self.settings.use_vision_for_planner and self.settings.use_vision:
+			last_state_message: HumanMessage = planner_messages[-1]
+			# remove image from last state message
+			new_msg = ''
+			if isinstance(last_state_message.content, list):
+				for msg in last_state_message.content:
+					if msg['type'] == 'text':  # type: ignore
+						new_msg += msg['text']  # type: ignore
+					elif msg['type'] == 'image_url':  # type: ignore
+						continue  # type: ignore
+			else:
+				new_msg = last_state_message.content
+			planner_messages[-1] = HumanMessage(content=new_msg)
+		planner_messages = convert_input_messages(planner_messages, self.planner_model_name)
+		# Get planner output
+		response = await self.settings.planner_llm.ainvoke(planner_messages)
+		plan = str(response.content)
+		# if deepseek-reasoner, remove think tags
+		if self.planner_model_name and ('deepseek-r1' in self.planner_model_name or 'deepseek-reasoner' in self.planner_model_name):
+			plan = self._remove_think_tags(plan)
+		# JSON parsing is only for pretty-printing the log; the raw plan string is returned either way
+		try:
+			plan_json = json.loads(plan)
+			logger.info(f'Planning Analysis:\n{json.dumps(plan_json, indent=4)}')
+		except json.JSONDecodeError:
+			logger.info(f'Planning Analysis:\n{plan}')
+		except Exception as e:
+			logger.debug(f'Error parsing planning analysis: {e}')
+			logger.info(f'Plan: {plan}')
+		return plan
+	@property
+	def message_manager(self) -> MessageManager:
+		"""Read-only access to the agent's internal MessageManager"""
+		return self._message_manager

browser_use/agent/system_prompt.md ADDED Viewed

	@@ -0,0 +1,69 @@

+You are an AI agent designed to automate browser tasks. Your goal is to accomplish the ultimate task following the rules.
+# Input Format
+Task
+Previous steps
+Current URL
+Open Tabs
+Interactive Elements
+[index]<type>text</type>
+- index: Numeric identifier for interaction
+- type: HTML element type (button, input, etc.)
+- text: Element description
+Example:
+[33]<button>Submit Form</button>
+- Only elements with numeric indexes in [] are interactive
+- elements without [] provide only context
+# Response Rules
+1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
+{{"current_state": {{"evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Mention if something unexpected happened. Shortly state why/why not",
+"memory": "Description of what has been done and what you need to remember. Be very specific. Count here ALWAYS how many times you have done something and how many remain. E.g. 0 out of 10 websites analyzed. Continue with abc and xyz",
+"next_goal": "What needs to be done with the next immediate action"}},
+"action":[{{"one_action_name": {{// action-specific parameter}}}}, // ... more actions in sequence]}}
+2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence.
+Common action sequences:
+- Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}]
+- Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}]
+- Actions are executed in the given order
+- If the page changes after an action, the sequence is interrupted and you get the new state.
+- Only provide the action sequence until an action which changes the page state significantly.
+- Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page
+- only use multiple actions if it makes sense.
+3. ELEMENT INTERACTION:
+- Only use indexes of the interactive elements
+- Elements marked with "[]Non-interactive text" are non-interactive
+4. NAVIGATION & ERROR HANDLING:
+- If no suitable elements exist, use other functions to complete the task
+- If stuck, try alternative approaches - like going back to a previous page, new search, new tab etc.
+- Handle popups/cookies by accepting or closing them
+- Use scroll to find elements you are looking for
+- If you want to research something, open a new tab instead of using the current tab
+- If captcha pops up, try to solve it - else try a different approach
+- If the page is not fully loaded, use wait action
+5. TASK COMPLETION:
+- Use the done action as the last action as soon as the ultimate task is complete
+- Don't use "done" before you are done with everything the user asked you, unless you reach the last step of max_steps.
+- If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished, set success to true. If not everything the user asked for is completed, set success in done to false!
+- If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step.
+- Don't hallucinate actions
+- Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task.
+6. VISUAL CONTEXT:
+- When an image is provided, use it to understand the page layout
+- Bounding boxes with labels on their top right corner correspond to element indexes
+7. Form filling:
+- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
+8. Long tasks:
+- Keep track of the status and subresults in the memory.
+9. Extraction:
+- If your task is to find information - call extract_content on the specific pages to get and store the information.
+Your responses must be always JSON with the specified format.

browser_use/agent/tests.py ADDED Viewed

	@@ -0,0 +1,197 @@

+import pytest
+from browser_use.agent.views import (
+	ActionResult,
+	AgentBrain,
+	AgentHistory,
+	AgentHistoryList,
+	AgentOutput,
+)
+from browser_use.browser.views import BrowserState, BrowserStateHistory, TabInfo
+from browser_use.controller.registry.service import Registry
+from browser_use.controller.views import ClickElementAction, DoneAction, ExtractPageContentAction
+from browser_use.dom.views import DOMElementNode
+@pytest.fixture
+def sample_browser_state():
+	return BrowserState(
+		url='https://example.com',
+		title='Example Page',
+		tabs=[TabInfo(url='https://example.com', title='Example Page', page_id=1)],
+		screenshot='screenshot1.png',
+		element_tree=DOMElementNode(
+			tag_name='root',
+			is_visible=True,
+			parent=None,
+			xpath='',
+			attributes={},
+			children=[],
+		),
+		selector_map={},
+	)
+@pytest.fixture
+def action_registry():
+	registry = Registry()
+	# Register the actions we need for testing
+	@registry.action(description='Click an element', param_model=ClickElementAction)
+	def click_element(params: ClickElementAction, browser=None):
+		pass
+	@registry.action(
+		description='Extract page content',
+		param_model=ExtractPageContentAction,
+	)
+	def extract_page_content(params: ExtractPageContentAction, browser=None):
+		pass
+	@registry.action(description='Mark task as done', param_model=DoneAction)
+	def done(params: DoneAction):
+		pass
+	# Create the dynamic ActionModel with all registered actions
+	return registry.create_action_model()
+@pytest.fixture
+def sample_history(action_registry):
+	# Create actions with nested params structure
+	click_action = action_registry(click_element={'index': 1})
+	extract_action = action_registry(extract_page_content={'value': 'text'})
+	done_action = action_registry(done={'text': 'Task completed'})
+	histories = [
+		AgentHistory(
+			model_output=AgentOutput(
+				current_state=AgentBrain(
+					evaluation_previous_goal='None',
+					memory='Started task',
+					next_goal='Click button',
+				),
+				action=[click_action],
+			),
+			result=[ActionResult(is_done=False)],
+			state=BrowserStateHistory(
+				url='https://example.com',
+				title='Page 1',
+				tabs=[TabInfo(url='https://example.com', title='Page 1', page_id=1)],
+				screenshot='screenshot1.png',
+				interacted_element=[{'xpath': '//button[1]'}],
+			),
+		),
+		AgentHistory(
+			model_output=AgentOutput(
+				current_state=AgentBrain(
+					evaluation_previous_goal='Clicked button',
+					memory='Button clicked',
+					next_goal='Extract content',
+				),
+				action=[extract_action],
+			),
+			result=[
+				ActionResult(
+					is_done=False,
+					extracted_content='Extracted text',
+					error='Failed to extract completely',
+				)
+			],
+			state=BrowserStateHistory(
+				url='https://example.com/page2',
+				title='Page 2',
+				tabs=[TabInfo(url='https://example.com/page2', title='Page 2', page_id=2)],
+				screenshot='screenshot2.png',
+				interacted_element=[{'xpath': '//div[1]'}],
+			),
+		),
+		AgentHistory(
+			model_output=AgentOutput(
+				current_state=AgentBrain(
+					evaluation_previous_goal='Extracted content',
+					memory='Content extracted',
+					next_goal='Finish task',
+				),
+				action=[done_action],
+			),
+			result=[ActionResult(is_done=True, extracted_content='Task completed', error=None)],
+			state=BrowserStateHistory(
+				url='https://example.com/page2',
+				title='Page 2',
+				tabs=[TabInfo(url='https://example.com/page2', title='Page 2', page_id=2)],
+				screenshot='screenshot3.png',
+				interacted_element=[{'xpath': '//div[1]'}],
+			),
+		),
+	]
+	return AgentHistoryList(history=histories)
+def test_last_model_output(sample_history: AgentHistoryList):
+	last_output = sample_history.last_action()
+	print(last_output)
+	assert last_output == {'done': {'text': 'Task completed'}}
+def test_get_errors(sample_history: AgentHistoryList):
+	errors = sample_history.errors()
+	assert len(errors) == 1
+	assert errors[0] == 'Failed to extract completely'
+def test_final_result(sample_history: AgentHistoryList):
+	assert sample_history.final_result() == 'Task completed'
+def test_is_done(sample_history: AgentHistoryList):
+	assert sample_history.is_done() == True
+def test_urls(sample_history: AgentHistoryList):
+	urls = sample_history.urls()
+	assert 'https://example.com' in urls
+	assert 'https://example.com/page2' in urls
+def test_all_screenshots(sample_history: AgentHistoryList):
+	screenshots = sample_history.screenshots()
+	assert len(screenshots) == 3
+	assert screenshots == ['screenshot1.png', 'screenshot2.png', 'screenshot3.png']
+def test_all_model_outputs(sample_history: AgentHistoryList):
+	outputs = sample_history.model_actions()
+	print(f'DEBUG: {outputs[0]}')
+	assert len(outputs) == 3
+	# get first key value pair
+	assert dict([next(iter(outputs[0].items()))]) == {'click_element': {'index': 1}}
+	assert dict([next(iter(outputs[1].items()))]) == {'extract_page_content': {'value': 'text'}}
+	assert dict([next(iter(outputs[2].items()))]) == {'done': {'text': 'Task completed'}}
+def test_all_model_outputs_filtered(sample_history: AgentHistoryList):
+	filtered = sample_history.model_actions_filtered(include=['click_element'])
+	assert len(filtered) == 1
+	assert filtered[0]['click_element']['index'] == 1
+def test_empty_history():
+	empty_history = AgentHistoryList(history=[])
+	assert empty_history.last_action() is None
+	assert empty_history.final_result() is None
+	assert empty_history.is_done() == False
+	assert len(empty_history.urls()) == 0
+# Add a test to verify action creation
+def test_action_creation(action_registry):
+	click_action = action_registry(click_element={'index': 1})
+	assert click_action.model_dump(exclude_none=True) == {'click_element': {'index': 1}}
+# run this with:
+# pytest browser_use/agent/tests.py

browser_use/agent/views.py ADDED Viewed

	@@ -0,0 +1,393 @@

+from __future__ import annotations
+import json
+import traceback
+import uuid
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Literal, Optional, Type
+from langchain_core.language_models.chat_models import BaseChatModel
+from openai import RateLimitError
+from pydantic import BaseModel, ConfigDict, Field, ValidationError, create_model
+from browser_use.agent.message_manager.views import MessageManagerState
+from browser_use.browser.views import BrowserStateHistory
+from browser_use.controller.registry.views import ActionModel
+from browser_use.dom.history_tree_processor.service import (
+	DOMElementNode,
+	DOMHistoryElement,
+	HistoryTreeProcessor,
+)
+from browser_use.dom.views import SelectorMap
+ToolCallingMethod = Literal['function_calling', 'json_mode', 'raw', 'auto']
+class AgentSettings(BaseModel):
+	"""Options for the agent"""
+	use_vision: bool = True
+	use_vision_for_planner: bool = False
+	save_conversation_path: Optional[str] = None
+	save_conversation_path_encoding: Optional[str] = 'utf-8'
+	max_failures: int = 3
+	retry_delay: int = 10
+	max_input_tokens: int = 128000
+	validate_output: bool = False
+	message_context: Optional[str] = None
+	generate_gif: bool | str = False
+	available_file_paths: Optional[list[str]] = None
+	override_system_message: Optional[str] = None
+	extend_system_message: Optional[str] = None
+	include_attributes: list[str] = [
+		'title',
+		'type',
+		'name',
+		'role',
+		'tabindex',
+		'aria-label',
+		'placeholder',
+		'value',
+		'alt',
+		'aria-expanded',
+	]
+	max_actions_per_step: int = 10
+	tool_calling_method: Optional[ToolCallingMethod] = 'auto'
+	page_extraction_llm: Optional[BaseChatModel] = None
+	planner_llm: Optional[BaseChatModel] = None
+	planner_interval: int = 1  # Run planner every N steps
+class AgentState(BaseModel):
+	"""Holds all state information for an Agent"""
+	agent_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
+	n_steps: int = 1
+	consecutive_failures: int = 0
+	last_result: Optional[List['ActionResult']] = None
+	history: AgentHistoryList = Field(default_factory=lambda: AgentHistoryList(history=[]))
+	last_plan: Optional[str] = None
+	paused: bool = False
+	stopped: bool = False
+	message_manager_state: MessageManagerState = Field(default_factory=MessageManagerState)
+	# class Config:
+	# 	arbitrary_types_allowed = True
+@dataclass
+class AgentStepInfo:
+	step_number: int
+	max_steps: int
+	def is_last_step(self) -> bool:
+		"""Check if this is the last step"""
+		return self.step_number >= self.max_steps - 1
+class ActionResult(BaseModel):
+	"""Result of executing an action"""
+	is_done: Optional[bool] = False
+	success: Optional[bool] = None
+	extracted_content: Optional[str] = None
+	error: Optional[str] = None
+	include_in_memory: bool = False  # whether to include in past messages as context or not
+class StepMetadata(BaseModel):
+	"""Metadata for a single step including timing and token information"""
+	step_start_time: float
+	step_end_time: float
+	input_tokens: int  # Approximate tokens from message manager for this step
+	step_number: int
+	@property
+	def duration_seconds(self) -> float:
+		"""Calculate step duration in seconds"""
+		return self.step_end_time - self.step_start_time
+class AgentBrain(BaseModel):
+	"""Current state of the agent"""
+	evaluation_previous_goal: str
+	memory: str
+	next_goal: str
+class AgentOutput(BaseModel):
+	"""Output model for agent
+	@dev note: this model is extended with custom actions in AgentService. You can also use some fields that are not in this model as provided by the linter, as long as they are registered in the DynamicActions model.
+	"""
+	model_config = ConfigDict(arbitrary_types_allowed=True)
+	current_state: AgentBrain
+	action: list[ActionModel] = Field(
+		...,
+		description='List of actions to execute',
+		json_schema_extra={'min_items': 1},  # Ensure at least one action is provided
+	)
+	@staticmethod
+	def type_with_custom_actions(custom_actions: Type[ActionModel]) -> Type['AgentOutput']:
+		"""Extend actions with custom actions"""
+		model_ = create_model(
+			'AgentOutput',
+			__base__=AgentOutput,
+			action=(
+				list[custom_actions],
+				Field(..., description='List of actions to execute', json_schema_extra={'min_items': 1}),
+			),
+			__module__=AgentOutput.__module__,
+		)
+		model_.__doc__ = 'AgentOutput model with custom actions'
+		return model_
+class AgentHistory(BaseModel):
+	"""History item for agent actions"""
+	model_output: AgentOutput | None
+	result: list[ActionResult]
+	state: BrowserStateHistory
+	metadata: Optional[StepMetadata] = None
+	model_config = ConfigDict(arbitrary_types_allowed=True, protected_namespaces=())
+	@staticmethod
+	def get_interacted_element(model_output: AgentOutput, selector_map: SelectorMap) -> list[DOMHistoryElement | None]:
+		elements = []
+		for action in model_output.action:
+			index = action.get_index()
+			if index and index in selector_map:
+				el: DOMElementNode = selector_map[index]
+				elements.append(HistoryTreeProcessor.convert_dom_element_to_history_element(el))
+			else:
+				elements.append(None)
+		return elements
+	def model_dump(self, **kwargs) -> Dict[str, Any]:
+		"""Custom serialization handling circular references"""
+		# Handle action serialization
+		model_output_dump = None
+		if self.model_output:
+			action_dump = [action.model_dump(exclude_none=True) for action in self.model_output.action]
+			model_output_dump = {
+				'current_state': self.model_output.current_state.model_dump(),
+				'action': action_dump,  # This preserves the actual action data
+			}
+		return {
+			'model_output': model_output_dump,
+			'result': [r.model_dump(exclude_none=True) for r in self.result],
+			'state': self.state.to_dict(),
+			'metadata': self.metadata.model_dump() if self.metadata else None,
+		}
+class AgentHistoryList(BaseModel):
+	"""List of agent history items"""
+	history: list[AgentHistory]
+	def total_duration_seconds(self) -> float:
+		"""Get total duration of all steps in seconds"""
+		total = 0.0
+		for h in self.history:
+			if h.metadata:
+				total += h.metadata.duration_seconds
+		return total
+	def total_input_tokens(self) -> int:
+		"""
+		Get total tokens used across all steps.
+		Note: These are from the approximate token counting of the message manager.
+		For accurate token counting, use tools like LangChain Smith or OpenAI's token counters.
+		"""
+		total = 0
+		for h in self.history:
+			if h.metadata:
+				total += h.metadata.input_tokens
+		return total
+	def input_token_usage(self) -> list[int]:
+		"""Get token usage for each step"""
+		return [h.metadata.input_tokens for h in self.history if h.metadata]
+	def __str__(self) -> str:
+		"""Representation of the AgentHistoryList object"""
+		return f'AgentHistoryList(all_results={self.action_results()}, all_model_outputs={self.model_actions()})'
+	def __repr__(self) -> str:
+		"""Representation of the AgentHistoryList object"""
+		return self.__str__()
+	def save_to_file(self, filepath: str | Path) -> None:
+		"""Save history to JSON file with proper serialization"""
+		try:
+			Path(filepath).parent.mkdir(parents=True, exist_ok=True)
+			data = self.model_dump()
+			with open(filepath, 'w', encoding='utf-8') as f:
+				json.dump(data, f, indent=2)
+		except Exception as e:
+			raise e
+	def model_dump(self, **kwargs) -> Dict[str, Any]:
+		"""Custom serialization that properly uses AgentHistory's model_dump"""
+		return {
+			'history': [h.model_dump(**kwargs) for h in self.history],
+		}
+	@classmethod
+	def load_from_file(cls, filepath: str | Path, output_model: Type[AgentOutput]) -> 'AgentHistoryList':
+		"""Load history from JSON file"""
+		with open(filepath, 'r', encoding='utf-8') as f:
+			data = json.load(f)
+		# loop through history and validate output_model actions to enrich with custom actions
+		for h in data['history']:
+			if h['model_output']:
+				if isinstance(h['model_output'], dict):
+					h['model_output'] = output_model.model_validate(h['model_output'])
+				else:
+					h['model_output'] = None
+			if 'interacted_element' not in h['state']:
+				h['state']['interacted_element'] = None
+		history = cls.model_validate(data)
+		return history
+	def last_action(self) -> None | dict:
+		"""Last action in history"""
+		if self.history and self.history[-1].model_output:
+			return self.history[-1].model_output.action[-1].model_dump(exclude_none=True)
+		return None
+	def errors(self) -> list[str | None]:
+		"""Get all errors from history, with None for steps without errors"""
+		errors = []
+		for h in self.history:
+			step_errors = [r.error for r in h.result if r.error]
+			# each step can have only one error
+			errors.append(step_errors[0] if step_errors else None)
+		return errors
+	def final_result(self) -> None | str:
+		"""Final result from history"""
+		if self.history and self.history[-1].result[-1].extracted_content:
+			return self.history[-1].result[-1].extracted_content
+		return None
+	def is_done(self) -> bool:
+		"""Check if the agent is done"""
+		if self.history and len(self.history[-1].result) > 0:
+			last_result = self.history[-1].result[-1]
+			return last_result.is_done is True
+		return False
+	def is_successful(self) -> bool | None:
+		"""Check if the agent completed successfully - the agent decides in the last step if it was successful or not. None if not done yet."""
+		if self.history and len(self.history[-1].result) > 0:
+			last_result = self.history[-1].result[-1]
+			if last_result.is_done is True:
+				return last_result.success
+		return None
+	def has_errors(self) -> bool:
+		"""Check if the agent has any non-None errors"""
+		return any(error is not None for error in self.errors())
+	def urls(self) -> list[str | None]:
+		"""Get all unique URLs from history"""
+		return [h.state.url if h.state.url is not None else None for h in self.history]
+	def screenshots(self) -> list[str | None]:
+		"""Get all screenshots from history"""
+		return [h.state.screenshot if h.state.screenshot is not None else None for h in self.history]
+	def action_names(self) -> list[str]:
+		"""Get all action names from history"""
+		action_names = []
+		for action in self.model_actions():
+			actions = list(action.keys())
+			if actions:
+				action_names.append(actions[0])
+		return action_names
+	def model_thoughts(self) -> list[AgentBrain]:
+		"""Get all thoughts from history"""
+		return [h.model_output.current_state for h in self.history if h.model_output]
+	def model_outputs(self) -> list[AgentOutput]:
+		"""Get all model outputs from history"""
+		return [h.model_output for h in self.history if h.model_output]
+	# get all actions with params
+	def model_actions(self) -> list[dict]:
+		"""Get all actions from history"""
+		outputs = []
+		for h in self.history:
+			if h.model_output:
+				for action, interacted_element in zip(h.model_output.action, h.state.interacted_element):
+					output = action.model_dump(exclude_none=True)
+					output['interacted_element'] = interacted_element
+					outputs.append(output)
+		return outputs
+	def action_results(self) -> list[ActionResult]:
+		"""Get all results from history"""
+		results = []
+		for h in self.history:
+			results.extend([r for r in h.result if r])
+		return results
+	def extracted_content(self) -> list[str]:
+		"""Get all extracted content from history"""
+		content = []
+		for h in self.history:
+			content.extend([r.extracted_content for r in h.result if r.extracted_content])
+		return content
+	def model_actions_filtered(self, include: list[str] | None = None) -> list[dict]:
+		"""Get all model actions from history as JSON"""
+		if include is None:
+			include = []
+		outputs = self.model_actions()
+		result = []
+		for o in outputs:
+			for i in include:
+				if i == list(o.keys())[0]:
+					result.append(o)
+		return result
+	def number_of_steps(self) -> int:
+		"""Get the number of steps in the history"""
+		return len(self.history)
+class AgentError:
+	"""Container for agent error handling"""
+	VALIDATION_ERROR = 'Invalid model output format. Please follow the correct schema.'
+	RATE_LIMIT_ERROR = 'Rate limit reached. Waiting before retry.'
+	NO_VALID_ACTION = 'No valid action found'
+	@staticmethod
+	def format_error(error: Exception, include_trace: bool = False) -> str:
+		"""Format error message based on error type and optionally include trace"""
+		message = ''
+		if isinstance(error, ValidationError):
+			return f'{AgentError.VALIDATION_ERROR}\nDetails: {str(error)}'
+		if isinstance(error, RateLimitError):
+			return AgentError.RATE_LIMIT_ERROR
+		if include_trace:
+			return f'{str(error)}\nStacktrace:\n{traceback.format_exc()}'
+		return f'{str(error)}'

browser_use/browser/browser.py ADDED Viewed

	@@ -0,0 +1,253 @@

+"""
+Playwright browser on steroids.
+"""
+import asyncio
+import gc
+import logging
+from dataclasses import dataclass, field
+from playwright._impl._api_structures import ProxySettings
+from playwright.async_api import Browser as PlaywrightBrowser
+from playwright.async_api import (
+	Playwright,
+	async_playwright,
+)
+from browser_use.browser.context import BrowserContext, BrowserContextConfig
+from browser_use.utils import time_execution_async
+logger = logging.getLogger(__name__)
+@dataclass
+class BrowserConfig:
+	r"""
+	Configuration for the Browser.
+	Default values:
+		headless: False
+			Whether to run browser in headless mode
+		disable_security: True
+			Disable browser security features (Browser adds the matching Chromium flags when launching)
+		extra_chromium_args: []
+			Extra arguments to pass to the browser
+		wss_url: None
+			Connect to a browser instance via WebSocket
+		cdp_url: None
+			Connect to a browser instance via CDP
+		chrome_instance_path: None
+			Path to a Chrome instance to use to connect to your normal browser
+			e.g. '/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome'
+		proxy: None
+			Proxy settings forwarded to the Chromium launch call
+		new_context_config: BrowserContextConfig()
+			Default configuration object for browser contexts
+		_force_keep_browser_alive: False
+			When True, Browser.close() leaves the Playwright browser and driver running
+	"""
+	headless: bool = False
+	disable_security: bool = True
+	extra_chromium_args: list[str] = field(default_factory=list)
+	chrome_instance_path: str | None = None
+	wss_url: str | None = None
+	cdp_url: str | None = None
+	proxy: ProxySettings | None = field(default=None)
+	new_context_config: BrowserContextConfig = field(default_factory=BrowserContextConfig)
+	_force_keep_browser_alive: bool = False
+# @singleton: TODO - think about if singleton makes sense here
+# @dev Browser is not enforced as a singleton: you can create multiple instances if you need to, though sharing one is recommended.
+class Browser:
+	"""
+	Playwright browser on steroids.
+	This is persistant browser factory that can spawn multiple browser contexts.
+	It is recommended to use only one instance of Browser per your application (RAM usage will grow otherwise).
+	"""
+	def __init__(
+		self,
+		config: BrowserConfig = BrowserConfig(),
+	):
+		logger.debug('Initializing new browser')
+		self.config = config
+		self.playwright: Playwright | None = None
+		self.playwright_browser: PlaywrightBrowser | None = None
+		self.disable_security_args = []
+		if self.config.disable_security:
+			self.disable_security_args = [
+				'--disable-web-security',
+				'--disable-site-isolation-trials',
+				'--disable-features=IsolateOrigins,site-per-process',
+			]
+	async def new_context(self, config: BrowserContextConfig = BrowserContextConfig()) -> BrowserContext:
+		"""Create a browser context"""
+		return BrowserContext(config=config, browser=self)
+	async def get_playwright_browser(self) -> PlaywrightBrowser:
+		"""Get a browser context"""
+		if self.playwright_browser is None:
+			return await self._init()
+		return self.playwright_browser
+	@time_execution_async('--init (browser)')
+	async def _init(self):
+		"""Initialize the browser session"""
+		playwright = await async_playwright().start()
+		browser = await self._setup_browser(playwright)
+		self.playwright = playwright
+		self.playwright_browser = browser
+		return self.playwright_browser
+	async def _setup_cdp(self, playwright: Playwright) -> PlaywrightBrowser:
+		"""Sets up and returns a Playwright Browser instance with anti-detection measures."""
+		if not self.config.cdp_url:
+			raise ValueError('CDP URL is required')
+		logger.info(f'Connecting to remote browser via CDP {self.config.cdp_url}')
+		browser = await playwright.chromium.connect_over_cdp(self.config.cdp_url)
+		return browser
+	async def _setup_wss(self, playwright: Playwright) -> PlaywrightBrowser:
+		"""Sets up and returns a Playwright Browser instance with anti-detection measures."""
+		if not self.config.wss_url:
+			raise ValueError('WSS URL is required')
+		logger.info(f'Connecting to remote browser via WSS {self.config.wss_url}')
+		browser = await playwright.chromium.connect(self.config.wss_url)
+		return browser
+	async def _setup_browser_with_instance(self, playwright: Playwright) -> PlaywrightBrowser:
+		"""Sets up and returns a Playwright Browser instance with anti-detection measures."""
+		if not self.config.chrome_instance_path:
+			raise ValueError('Chrome instance path is required')
+		import subprocess
+		import requests
+		try:
+			# Check if browser is already running
+			response = requests.get('http://localhost:9222/json/version', timeout=2)
+			if response.status_code == 200:
+				logger.info('Reusing existing Chrome instance')
+				browser = await playwright.chromium.connect_over_cdp(
+					endpoint_url='http://localhost:9222',
+					timeout=20000,  # 20 second timeout for connection
+				)
+				return browser
+		except requests.ConnectionError:
+			logger.debug('No existing Chrome instance found, starting a new one')
+		# Start a new Chrome instance
+		subprocess.Popen(
+			[
+				self.config.chrome_instance_path,
+				'--remote-debugging-port=9222',
+			]
+			+ self.config.extra_chromium_args,
+			stdout=subprocess.DEVNULL,
+			stderr=subprocess.DEVNULL,
+		)
+		# Attempt to connect again after starting a new instance
+		for _ in range(10):
+			try:
+				response = requests.get('http://localhost:9222/json/version', timeout=2)
+				if response.status_code == 200:
+					break
+			except requests.ConnectionError:
+				pass
+			await asyncio.sleep(1)
+		# Attempt to connect again after starting a new instance
+		try:
+			browser = await playwright.chromium.connect_over_cdp(
+				endpoint_url='http://localhost:9222',
+				timeout=20000,  # 20 second timeout for connection
+			)
+			return browser
+		except Exception as e:
+			logger.error(f'Failed to start a new Chrome instance.: {str(e)}')
+			raise RuntimeError(
+				' To start chrome in Debug mode, you need to close all existing Chrome instances and try again otherwise we can not connect to the instance.'
+			)
+	async def _setup_standard_browser(self, playwright: Playwright) -> PlaywrightBrowser:
+		"""Sets up and returns a Playwright Browser instance with anti-detection measures."""
+		browser = await playwright.chromium.launch(
+			headless=self.config.headless,
+			args=[
+				'--no-sandbox',
+				'--disable-blink-features=AutomationControlled',
+				'--disable-infobars',
+				'--disable-background-timer-throttling',
+				'--disable-popup-blocking',
+				'--disable-backgrounding-occluded-windows',
+				'--disable-renderer-backgrounding',
+				'--disable-window-activation',
+				'--disable-focus-on-load',
+				'--no-first-run',
+				'--no-default-browser-check',
+				'--no-startup-window',
+				'--window-position=0,0',
+				# '--window-size=1280,1000',
+			]
+			+ self.disable_security_args
+			+ self.config.extra_chromium_args,
+			proxy=self.config.proxy,
+		)
+		# convert to Browser
+		return browser
+	async def _setup_browser(self, playwright: Playwright) -> PlaywrightBrowser:
+		"""Sets up and returns a Playwright Browser instance with anti-detection measures."""
+		try:
+			if self.config.cdp_url:
+				return await self._setup_cdp(playwright)
+			if self.config.wss_url:
+				return await self._setup_wss(playwright)
+			elif self.config.chrome_instance_path:
+				return await self._setup_browser_with_instance(playwright)
+			else:
+				return await self._setup_standard_browser(playwright)
+		except Exception as e:
+			logger.error(f'Failed to initialize Playwright browser: {str(e)}')
+			raise
+	async def close(self):
+		"""Close the browser instance"""
+		try:
+			if not self.config._force_keep_browser_alive:
+				if self.playwright_browser:
+					await self.playwright_browser.close()
+					del self.playwright_browser
+				if self.playwright:
+					await self.playwright.stop()
+					del self.playwright
+		except Exception as e:
+			logger.debug(f'Failed to close browser properly: {e}')
+		finally:
+			self.playwright_browser = None
+			self.playwright = None
+			gc.collect()
+	def __del__(self):
+		"""Async cleanup when object is destroyed"""
+		try:
+			if self.playwright_browser or self.playwright:
+				loop = asyncio.get_running_loop()
+				if loop.is_running():
+					loop.create_task(self.close())
+				else:
+					asyncio.run(self.close())
+		except Exception as e:
+			logger.debug(f'Failed to cleanup browser in destructor: {e}')

browser_use/browser/context.py ADDED Viewed

	@@ -0,0 +1,1353 @@

+"""
+Playwright browser on steroids.
+"""
+import asyncio
+import base64
+import gc
+import json
+import logging
+import os
+import re
+import time
+import uuid
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Optional, TypedDict
+from playwright._impl._errors import TimeoutError
+from playwright.async_api import Browser as PlaywrightBrowser
+from playwright.async_api import (
+	BrowserContext as PlaywrightBrowserContext,
+)
+from playwright.async_api import (
+	ElementHandle,
+	FrameLocator,
+	Page,
+)
+from browser_use.browser.views import (
+	BrowserError,
+	BrowserState,
+	TabInfo,
+	URLNotAllowedError,
+)
+from browser_use.dom.service import DomService
+from browser_use.dom.views import DOMElementNode, SelectorMap
+from browser_use.utils import time_execution_async, time_execution_sync
+if TYPE_CHECKING:
+	from browser_use.browser.browser import Browser
+logger = logging.getLogger(__name__)
+class BrowserContextWindowSize(TypedDict):
+	"""Width/height pair passed to Playwright as the context viewport and the video recording size."""
+	width: int
+	height: int
+@dataclass
+class BrowserContextConfig:
+	"""
+	Configuration for the BrowserContext.
+	Default values:
+		cookies_file: None
+			Path to cookies file for persistence
+		minimum_wait_page_load_time: 0.25
+			Minimum time to wait before getting page state for LLM input
+		wait_for_network_idle_page_load_time: 0.5
+			Time to wait for network requests to finish before getting page state.
+			Lower values may result in incomplete page loads.
+		maximum_wait_page_load_time: 5
+			Maximum time to wait for page load before proceeding anyway
+		wait_between_actions: 0.5
+			Time to wait between multiple per step actions
+		disable_security: True
+			Disable browser security features
+		browser_window_size: {'width': 1280, 'height': 1100}
+			Default browser window size
+		no_viewport: None
+			Disable viewport
+		save_recording_path: None
+			Path to save video recordings
+		save_downloads_path: None
+			Path to save downloads to
+		trace_path: None
+			Path to save trace files. It will auto name the file with the TRACE_PATH/{context_id}.zip
+		locale: None
+			Specify user locale, for example en-GB, de-DE, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. If not provided, defaults to the system default locale.
+		user_agent: a Chrome 85 on Windows 10 user agent string (see the field default below)
+			Custom user agent to use.
+		highlight_elements: True
+			Highlight elements in the DOM on the screen
+		viewport_expansion: 500
+			Viewport expansion in pixels. This amount will increase the number of elements which are included in the state what the LLM will see. If set to -1, all elements will be included (this leads to high token usage). If set to 0, only the elements which are visible in the viewport will be included.
+		allowed_domains: None
+			List of allowed domains that can be accessed. If None, all domains are allowed.
+			Example: ['example.com', 'api.example.com']
+		include_dynamic_attributes: True
+			Include dynamic attributes in the CSS selector. If you want to reuse the css_selectors, it might be better to set this to False.
+		_force_keep_context_alive: False
+			When True, BrowserContext.close() does not close the underlying Playwright context
+	"""
+	cookies_file: str | None = None
+	minimum_wait_page_load_time: float = 0.25
+	wait_for_network_idle_page_load_time: float = 0.5
+	maximum_wait_page_load_time: float = 5
+	wait_between_actions: float = 0.5
+	disable_security: bool = True
+	browser_window_size: BrowserContextWindowSize = field(default_factory=lambda: {'width': 1280, 'height': 1100})
+	no_viewport: Optional[bool] = None
+	save_recording_path: str | None = None
+	save_downloads_path: str | None = None
+	trace_path: str | None = None
+	locale: str | None = None
+	# NOTE(review): the literal below has two spaces before '(KHTML' - confirm whether that is intentional
+	user_agent: str = (
+		'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36  (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
+	)
+	highlight_elements: bool = True
+	viewport_expansion: int = 500
+	allowed_domains: list[str] | None = None
+	include_dynamic_attributes: bool = True
+	_force_keep_context_alive: bool = False
+@dataclass
+class BrowserSession:
+	"""Pairs the live Playwright context with the most recently computed browser state."""
+	context: PlaywrightBrowserContext  # Playwright context backing this session
+	cached_state: BrowserState | None  # None when the session is created; filled in by get_state()
+@dataclass
+class BrowserContextState:
+	"""
+	State of the browser context. An instance can be passed to BrowserContext via its ``state`` argument.
+	"""
+	target_id: str | None = None  # CDP target ID of the active page; only looked up when a cdp_url is configured
+class BrowserContext:
+	def __init__(
+		self,
+		browser: 'Browser',
+		config: BrowserContextConfig = BrowserContextConfig(),
+		state: Optional[BrowserContextState] = None,
+	):
+		self.context_id = str(uuid.uuid4())
+		logger.debug(f'Initializing new browser context with id: {self.context_id}')
+		self.config = config
+		self.browser = browser
+		self.state = state or BrowserContextState()
+		# Initialize these as None - they'll be set up when needed
+		self.session: BrowserSession | None = None
+	async def __aenter__(self):
+		"""Async context manager entry"""
+		await self._initialize_session()
+		return self
+	async def __aexit__(self, exc_type, exc_val, exc_tb):
+		"""Async context manager exit"""
+		await self.close()
+	@time_execution_async('--close')
+	async def close(self):
+		"""
+		Close the browser context.
+		Does nothing when no session exists. Otherwise detaches the stored 'page' listener, saves cookies,
+		stops tracing when a trace_path is configured, and closes the Playwright context unless
+		config._force_keep_context_alive is set. The session reference is dropped in every case.
+		"""
+		logger.debug('Closing browser context')
+		try:
+			if self.session is None:
+				return
+			# Detach the 'page' listener registered on the context, if one was stored
+			if self._page_event_handler and self.session.context:
+				try:
+					# This actually sends a CDP command to unsubscribe
+					self.session.context.remove_listener('page', self._page_event_handler)
+				except Exception as e:
+					logger.debug(f'Failed to remove CDP listener: {e}')
+				self._page_event_handler = None
+			# NOTE(review): an exception from save_cookies() skips the tracing stop and context close below - confirm that is acceptable
+			await self.save_cookies()
+			if self.config.trace_path:
+				try:
+					# The trace is written to <trace_path>/<context_id>.zip
+					await self.session.context.tracing.stop(path=os.path.join(self.config.trace_path, f'{self.context_id}.zip'))
+				except Exception as e:
+					logger.debug(f'Failed to stop tracing: {e}')
+			# This is crucial - it closes the CDP connection
+			if not self.config._force_keep_context_alive:
+				try:
+					await self.session.context.close()
+				except Exception as e:
+					logger.debug(f'Failed to close context: {e}')
+		finally:
+			# Dereference everything
+			self.session = None
+			self._page_event_handler = None
+	def __del__(self):
+		"""
+		Cleanup when object is destroyed.
+		Only acts when the context was never closed (a session still exists) and
+		config._force_keep_context_alive is not set. Failures are logged as warnings, never raised.
+		"""
+		if not self.config._force_keep_context_alive and self.session is not None:
+			logger.debug('BrowserContext was not properly closed before destruction')
+			try:
+				# Force cleanup by running the implementation object's close() to completion.
+				# asyncio.run() raises RuntimeError when an event loop is already running in this thread;
+				# that case ends up in the except branch below.
+				if hasattr(self.session.context, '_impl_obj'):
+					asyncio.run(self.session.context._impl_obj.close())
+				self.session = None
+				gc.collect()
+			except Exception as e:
+				logger.warning(f'Failed to force close browser context: {e}')
+	@time_execution_async('--initialize_session')
+	async def _initialize_session(self):
+		"""
+		Initialize the browser session.
+		Obtains the Playwright browser, creates (or reuses) a context, stores a new BrowserSession on
+		``self.session`` and selects the active page: the page matching the saved CDP target when
+		connected via cdp_url, otherwise the first existing page, otherwise a newly created one.
+		Returns:
+			The newly created BrowserSession.
+		"""
+		logger.debug('Initializing browser context')
+		playwright_browser = await self.browser.get_playwright_browser()
+		context = await self._create_context(playwright_browser)
+		# No 'page' listener is registered here; the handler slot is only reset
+		self._page_event_handler = None
+		# Get or create a page to use
+		pages = context.pages
+		self.session = BrowserSession(
+			context=context,
+			cached_state=None,
+		)
+		active_page = None
+		if self.browser.config.cdp_url:
+			# If we have a saved target ID, try to find and activate it
+			if self.state.target_id:
+				targets = await self._get_cdp_targets()
+				for target in targets:
+					if target['targetId'] == self.state.target_id:
+						# Find matching page by URL
+						for page in pages:
+							if page.url == target['url']:
+								active_page = page
+								break
+						# Stop at the matching target even if none of the pages has its URL
+						break
+		# If no target ID or couldn't find it, use existing page or create new
+		if not active_page:
+			if pages:
+				active_page = pages[0]
+				logger.debug('Using existing page')
+			else:
+				active_page = await context.new_page()
+				logger.debug('Created new page')
+			# Get target ID for the active page (first target with the same URL)
+			if self.browser.config.cdp_url:
+				targets = await self._get_cdp_targets()
+				for target in targets:
+					if target['url'] == active_page.url:
+						self.state.target_id = target['targetId']
+						break
+		# Bring page to front
+		await active_page.bring_to_front()
+		await active_page.wait_for_load_state('load')
+		return self.session
+	def _add_new_page_listener(self, context: PlaywrightBrowserContext):
+		async def on_page(page: Page):
+			if self.browser.config.cdp_url:
+				await page.reload()  # Reload the page to avoid timeout errors
+			await page.wait_for_load_state()
+			logger.debug(f'New page opened: {page.url}')
+			if self.session is not None:
+				self.state.target_id = None
+		self._page_event_handler = on_page
+		context.on('page', on_page)
+	async def get_session(self) -> BrowserSession:
+		"""Lazy initialization of the browser and related components"""
+		if self.session is None:
+			return await self._initialize_session()
+		return self.session
+	async def get_current_page(self) -> Page:
+		"""Get the current page"""
+		session = await self.get_session()
+		return await self._get_current_page(session)
+	async def _create_context(self, browser: PlaywrightBrowser):
+		"""
+		Creates a new browser context with anti-detection measures and loads cookies if available.
+		When connected via cdp_url or chrome_instance_path and the browser already has a context, the
+		first existing context is reused instead of creating one. Tracing is started when a trace_path
+		is configured, cookies are loaded from cookies_file when that file exists, and an init script
+		is added to the context.
+		Returns:
+			The Playwright browser context.
+		"""
+		if self.browser.config.cdp_url and len(browser.contexts) > 0:
+			context = browser.contexts[0]
+		elif self.browser.config.chrome_instance_path and len(browser.contexts) > 0:
+			# Connect to existing Chrome instance instead of creating new one
+			context = browser.contexts[0]
+		else:
+			# Original code for creating new context
+			# NOTE(review): no_viewport is hard-coded to False here, so config.no_viewport is not applied - confirm intent
+			context = await browser.new_context(
+				viewport=self.config.browser_window_size,
+				no_viewport=False,
+				user_agent=self.config.user_agent,
+				java_script_enabled=True,
+				bypass_csp=self.config.disable_security,
+				ignore_https_errors=self.config.disable_security,
+				record_video_dir=self.config.save_recording_path,
+				record_video_size=self.config.browser_window_size,
+				locale=self.config.locale,
+			)
+		if self.config.trace_path:
+			await context.tracing.start(screenshots=True, snapshots=True, sources=True)
+		# Load cookies if they exist
+		if self.config.cookies_file and os.path.exists(self.config.cookies_file):
+			with open(self.config.cookies_file, 'r') as f:
+				cookies = json.load(f)
+				logger.info(f'Loaded {len(cookies)} cookies from {self.config.cookies_file}')
+				await context.add_cookies(cookies)
+		# Expose anti-detection scripts: the script below overrides navigator.webdriver, languages and
+		# plugins, defines window.chrome.runtime, answers 'notifications' permission queries from
+		# Notification.permission, and forces every attachShadow() call to use mode "open"
+		await context.add_init_script(
+			"""
+            // Webdriver property
+            Object.defineProperty(navigator, 'webdriver', {
+                get: () => undefined
+            });
+            // Languages
+            Object.defineProperty(navigator, 'languages', {
+                get: () => ['en-US']
+            });
+            // Plugins
+            Object.defineProperty(navigator, 'plugins', {
+                get: () => [1, 2, 3, 4, 5]
+            });
+            // Chrome runtime
+            window.chrome = { runtime: {} };
+            // Permissions
+            const originalQuery = window.navigator.permissions.query;
+            window.navigator.permissions.query = (parameters) => (
+                parameters.name === 'notifications' ?
+                    Promise.resolve({ state: Notification.permission }) :
+                    originalQuery(parameters)
+            );
+            (function () {
+                const originalAttachShadow = Element.prototype.attachShadow;
+                Element.prototype.attachShadow = function attachShadow(options) {
+                    return originalAttachShadow.call(this, { ...options, mode: "open" });
+                };
+            })();
+            """
+		)
+		return context
+	async def _wait_for_stable_network(self):
+		"""
+		Wait until the current page has had no relevant network activity for
+		config.wait_for_network_idle_page_load_time seconds, or until
+		config.maximum_wait_page_load_time seconds have passed, whichever comes first.
+		Requests are tracked through temporary 'request'/'response' listeners on the page, which are
+		always removed again before returning. Polls every 0.1 seconds.
+		"""
+		page = await self.get_current_page()
+		# Requests that were started and have not produced a response yet
+		pending_requests = set()
+		# Event-loop time of the last tracked request start or response
+		last_activity = asyncio.get_event_loop().time()
+		# Define relevant resource types and content types
+		RELEVANT_RESOURCE_TYPES = {
+			'document',
+			'stylesheet',
+			'image',
+			'font',
+			'script',
+			'iframe',
+		}
+		RELEVANT_CONTENT_TYPES = {
+			'text/html',
+			'text/css',
+			'application/javascript',
+			'image/',
+			'font/',
+			'application/json',
+		}
+		# Additional patterns to filter out
+		# NOTE(review): these are matched as plain substrings of the lower-cased URL, so short entries
+		# such as 'ping' or 'alive' also match unrelated URLs (e.g. ones containing 'shopping')
+		IGNORED_URL_PATTERNS = {
+			# Analytics and tracking
+			'analytics',
+			'tracking',
+			'telemetry',
+			'beacon',
+			'metrics',
+			# Ad-related
+			'doubleclick',
+			'adsystem',
+			'adserver',
+			'advertising',
+			# Social media widgets
+			'facebook.com/plugins',
+			'platform.twitter',
+			'linkedin.com/embed',
+			# Live chat and support
+			'livechat',
+			'zendesk',
+			'intercom',
+			'crisp.chat',
+			'hotjar',
+			# Push notifications
+			'push-notifications',
+			'onesignal',
+			'pushwoosh',
+			# Background sync/heartbeat
+			'heartbeat',
+			'ping',
+			'alive',
+			# WebRTC and streaming
+			'webrtc',
+			'rtmp://',
+			'wss://',
+			# Common CDNs for dynamic content
+			'cloudfront.net',
+			'fastly.net',
+		}
+		async def on_request(request):
+			# Filter by resource type
+			if request.resource_type not in RELEVANT_RESOURCE_TYPES:
+				return
+			# Filter out streaming, websocket, and other real-time requests
+			# NOTE(review): none of these types is in RELEVANT_RESOURCE_TYPES, so the check above has
+			# already returned for them and this branch cannot trigger
+			if request.resource_type in {
+				'websocket',
+				'media',
+				'eventsource',
+				'manifest',
+				'other',
+			}:
+				return
+			# Filter out by URL patterns
+			url = request.url.lower()
+			if any(pattern in url for pattern in IGNORED_URL_PATTERNS):
+				return
+			# Filter out data URLs and blob URLs
+			if url.startswith(('data:', 'blob:')):
+				return
+			# Filter out requests with certain headers
+			headers = request.headers
+			if headers.get('purpose') == 'prefetch' or headers.get('sec-fetch-dest') in [
+				'video',
+				'audio',
+			]:
+				return
+			nonlocal last_activity
+			pending_requests.add(request)
+			last_activity = asyncio.get_event_loop().time()
+			# logger.debug(f'Request started: {request.url} ({request.resource_type})')
+		async def on_response(response):
+			request = response.request
+			# Only responses to requests tracked by on_request are considered
+			if request not in pending_requests:
+				return
+			# Filter by content type if available
+			content_type = response.headers.get('content-type', '').lower()
+			# Skip if content type indicates streaming or real-time data
+			# (the request is dropped from the pending set without counting as activity)
+			if any(
+				t in content_type
+				for t in [
+					'streaming',
+					'video',
+					'audio',
+					'webm',
+					'mp4',
+					'event-stream',
+					'websocket',
+					'protobuf',
+				]
+			):
+				pending_requests.remove(request)
+				return
+			# Only process relevant content types
+			if not any(ct in content_type for ct in RELEVANT_CONTENT_TYPES):
+				pending_requests.remove(request)
+				return
+			# Skip if response is too large (likely not essential for page load)
+			# NOTE(review): int() raises ValueError on a non-numeric content-length header - confirm whether that can occur
+			content_length = response.headers.get('content-length')
+			if content_length and int(content_length) > 5 * 1024 * 1024:  # 5MB
+				pending_requests.remove(request)
+				return
+			nonlocal last_activity
+			pending_requests.remove(request)
+			last_activity = asyncio.get_event_loop().time()
+			# logger.debug(f'Request resolved: {request.url} ({content_type})')
+		# Attach event listeners
+		page.on('request', on_request)
+		page.on('response', on_response)
+		try:
+			# Wait for idle time
+			start_time = asyncio.get_event_loop().time()
+			while True:
+				await asyncio.sleep(0.1)
+				now = asyncio.get_event_loop().time()
+				# Idle: nothing pending and no tracked activity for the configured idle window
+				if len(pending_requests) == 0 and (now - last_activity) >= self.config.wait_for_network_idle_page_load_time:
+					break
+				# Give up once the maximum wait is exceeded, reporting what is still pending
+				if now - start_time > self.config.maximum_wait_page_load_time:
+					logger.debug(
+						f'Network timeout after {self.config.maximum_wait_page_load_time}s with {len(pending_requests)} '
+						f'pending requests: {[r.url for r in pending_requests]}'
+					)
+					break
+		finally:
+			# Clean up event listeners
+			page.remove_listener('request', on_request)
+			page.remove_listener('response', on_response)
+		# Logged on both exits of the loop above, including the timeout case
+		logger.debug(f'Network stabilized for {self.config.wait_for_network_idle_page_load_time} seconds')
+	async def _wait_for_page_and_frames_load(self, timeout_overwrite: float | None = None):
+		"""
+		Ensures page is fully loaded before continuing.
+		Waits for either network to be idle or minimum WAIT_TIME, whichever is longer.
+		Also checks if the loaded URL is allowed.
+		"""
+		# Start timing
+		start_time = time.time()
+		# Wait for page load
+		try:
+			await self._wait_for_stable_network()
+			# Check if the loaded URL is allowed
+			page = await self.get_current_page()
+			await self._check_and_handle_navigation(page)
+		except URLNotAllowedError as e:
+			raise e
+		except Exception:
+			logger.warning('Page load failed, continuing...')
+			pass
+		# Calculate remaining time to meet minimum WAIT_TIME
+		elapsed = time.time() - start_time
+		remaining = max((timeout_overwrite or self.config.minimum_wait_page_load_time) - elapsed, 0)
+		logger.debug(f'--Page loaded in {elapsed:.2f} seconds, waiting for additional {remaining:.2f} seconds')
+		# Sleep remaining time if needed
+		if remaining > 0:
+			await asyncio.sleep(remaining)
+	def _is_url_allowed(self, url: str) -> bool:
+		"""Check if a URL is allowed based on the whitelist configuration."""
+		if not self.config.allowed_domains:
+			return True
+		try:
+			from urllib.parse import urlparse
+			parsed_url = urlparse(url)
+			domain = parsed_url.netloc.lower()
+			# Remove port number if present
+			if ':' in domain:
+				domain = domain.split(':')[0]
+			# Check if domain matches any allowed domain pattern
+			return any(
+				domain == allowed_domain.lower() or domain.endswith('.' + allowed_domain.lower())
+				for allowed_domain in self.config.allowed_domains
+			)
+		except Exception as e:
+			logger.error(f'Error checking URL allowlist: {str(e)}')
+			return False
+	async def _check_and_handle_navigation(self, page: Page) -> None:
+		"""Check if current page URL is allowed and handle if not."""
+		if not self._is_url_allowed(page.url):
+			logger.warning(f'Navigation to non-allowed URL detected: {page.url}')
+			try:
+				await self.go_back()
+			except Exception as e:
+				logger.error(f'Failed to go back after detecting non-allowed URL: {str(e)}')
+			raise URLNotAllowedError(f'Navigation to non-allowed URL: {page.url}')
+	async def navigate_to(self, url: str):
+		"""Navigate to a URL"""
+		if not self._is_url_allowed(url):
+			raise BrowserError(f'Navigation to non-allowed URL: {url}')
+		page = await self.get_current_page()
+		await page.goto(url)
+		await page.wait_for_load_state()
+	async def refresh_page(self):
+		"""Refresh the current page"""
+		page = await self.get_current_page()
+		await page.reload()
+		await page.wait_for_load_state()
+	async def go_back(self):
+		"""Navigate back in history"""
+		page = await self.get_current_page()
+		try:
+			# 10 ms timeout
+			await page.go_back(timeout=10, wait_until='domcontentloaded')
+			# await self._wait_for_page_and_frames_load(timeout_overwrite=1.0)
+		except Exception as e:
+			# Continue even if its not fully loaded, because we wait later for the page to load
+			logger.debug(f'During go_back: {e}')
+	async def go_forward(self):
+		"""Navigate forward in history"""
+		page = await self.get_current_page()
+		try:
+			await page.go_forward(timeout=10, wait_until='domcontentloaded')
+		except Exception as e:
+			# Continue even if its not fully loaded, because we wait later for the page to load
+			logger.debug(f'During go_forward: {e}')
+	async def close_current_tab(self):
+		"""Close the current tab"""
+		session = await self.get_session()
+		page = await self._get_current_page(session)
+		await page.close()
+		# Switch to the first available tab if any exist
+		if session.context.pages:
+			await self.switch_to_tab(0)
+		# otherwise the browser will be closed
+	async def get_page_html(self) -> str:
+		"""Get the current page HTML content"""
+		page = await self.get_current_page()
+		return await page.content()
+	async def execute_javascript(self, script: str):
+		"""Execute JavaScript code on the page"""
+		page = await self.get_current_page()
+		return await page.evaluate(script)
+	async def get_page_structure(self) -> str:
+		"""Get a debug view of the page structure including iframes"""
+		debug_script = """(() => {
+			function getPageStructure(element = document, depth = 0, maxDepth = 10) {
+				if (depth >= maxDepth) return '';
+				const indent = '  '.repeat(depth);
+				let structure = '';
+				// Skip certain elements that clutter the output
+				const skipTags = new Set(['script', 'style', 'link', 'meta', 'noscript']);
+				// Add current element info if it's not the document
+				if (element !== document) {
+					const tagName = element.tagName.toLowerCase();
+					// Skip uninteresting elements
+					if (skipTags.has(tagName)) return '';
+					const id = element.id ? `#${element.id}` : '';
+					const classes = element.className && typeof element.className === 'string' ?
+						`.${element.className.split(' ').filter(c => c).join('.')}` : '';
+					// Get additional useful attributes
+					const attrs = [];
+					if (element.getAttribute('role')) attrs.push(`role="${element.getAttribute('role')}"`);
+					if (element.getAttribute('aria-label')) attrs.push(`aria-label="${element.getAttribute('aria-label')}"`);
+					if (element.getAttribute('type')) attrs.push(`type="${element.getAttribute('type')}"`);
+					if (element.getAttribute('name')) attrs.push(`name="${element.getAttribute('name')}"`);
+					if (element.getAttribute('src')) {
+						const src = element.getAttribute('src');
+						attrs.push(`src="${src.substring(0, 50)}${src.length > 50 ? '...' : ''}"`);
+					}
+					// Add element info
+					structure += `${indent}${tagName}${id}${classes}${attrs.length ? ' [' + attrs.join(', ') + ']' : ''}\\n`;
+					// Handle iframes specially
+					if (tagName === 'iframe') {
+						try {
+							const iframeDoc = element.contentDocument || element.contentWindow?.document;
+							if (iframeDoc) {
+								structure += `${indent}  [IFRAME CONTENT]:\\n`;
+								structure += getPageStructure(iframeDoc, depth + 2, maxDepth);
+							} else {
+								structure += `${indent}  [IFRAME: No access - likely cross-origin]\\n`;
+							}
+						} catch (e) {
+							structure += `${indent}  [IFRAME: Access denied - ${e.message}]\\n`;
+						}
+					}
+				}
+				// Get all child elements
+				const children = element.children || element.childNodes;
+				for (const child of children) {
+					if (child.nodeType === 1) { // Element nodes only
+						structure += getPageStructure(child, depth + 1, maxDepth);
+					}
+				}
+				return structure;
+			}
+			return getPageStructure();
+		})()"""
+		page = await self.get_current_page()
+		structure = await page.evaluate(debug_script)
+		return structure
+	@time_execution_sync('--get_state')  # This decorator might need to be updated to handle async
+	async def get_state(self) -> BrowserState:
+		"""Get the current state of the browser"""
+		await self._wait_for_page_and_frames_load()
+		session = await self.get_session()
+		session.cached_state = await self._update_state()
+		# Save cookies if a file is specified
+		if self.config.cookies_file:
+			asyncio.create_task(self.save_cookies())
+		return session.cached_state
+	async def _update_state(self, focus_element: int = -1) -> BrowserState:
+		"""
+		Update and return state.
+		Makes sure a usable page is selected, removes existing highlights, collects the clickable
+		elements, takes a screenshot, reads the scroll info and stores the result on
+		``self.current_state``.
+		Args:
+			focus_element: Forwarded to DomService.get_clickable_elements.
+		Returns:
+			The new BrowserState, or the previously stored one when building the new state fails.
+		Raises:
+			BrowserError: the current page is inaccessible and the context has no pages left.
+			Exception: building the state failed and no earlier state is available.
+		"""
+		session = await self.get_session()
+		# Check if current page is still valid, if not switch to another available page
+		try:
+			page = await self.get_current_page()
+			# Test if page is still accessible
+			await page.evaluate('1')
+		except Exception as e:
+			logger.debug(f'Current page is no longer accessible: {str(e)}')
+			# Get all available pages
+			pages = session.context.pages
+			if pages:
+				# Forget the stored target so the page lookup below is not tied to the old one
+				self.state.target_id = None
+				page = await self._get_current_page(session)
+				logger.debug(f'Switched to page: {await page.title()}')
+			else:
+				raise BrowserError('Browser closed: no valid pages available')
+		try:
+			await self.remove_highlights()
+			dom_service = DomService(page)
+			content = await dom_service.get_clickable_elements(
+				focus_element=focus_element,
+				viewport_expansion=self.config.viewport_expansion,
+				highlight_elements=self.config.highlight_elements,
+			)
+			screenshot_b64 = await self.take_screenshot()
+			pixels_above, pixels_below = await self.get_scroll_info(page)
+			self.current_state = BrowserState(
+				element_tree=content.element_tree,
+				selector_map=content.selector_map,
+				url=page.url,
+				title=await page.title(),
+				tabs=await self.get_tabs_info(),
+				screenshot=screenshot_b64,
+				pixels_above=pixels_above,
+				pixels_below=pixels_below,
+			)
+			return self.current_state
+		except Exception as e:
+			logger.error(f'Failed to update state: {str(e)}')
+			# Return last known good state if available
+			# (current_state is only created above, hence the hasattr check)
+			if hasattr(self, 'current_state'):
+				return self.current_state
+			raise
+	# region - Browser Actions
+	@time_execution_async('--take_screenshot')
+	async def take_screenshot(self, full_page: bool = False) -> str:
+		"""
+		Returns a base64 encoded screenshot of the current page.
+		"""
+		page = await self.get_current_page()
+		await page.bring_to_front()
+		await page.wait_for_load_state()
+		screenshot = await page.screenshot(
+			full_page=full_page,
+			animations='disabled',
+		)
+		screenshot_b64 = base64.b64encode(screenshot).decode('utf-8')
+		# await self.remove_highlights()
+		return screenshot_b64
+	@time_execution_async('--remove_highlights')
+	async def remove_highlights(self):
+		"""
+		Removes all highlight overlays and labels created by the highlightElement function.
+		Handles cases where the page might be closed or inaccessible.
+		"""
+		try:
+			page = await self.get_current_page()
+			await page.evaluate(
+				"""
+                try {
+                    // Remove the highlight container and all its contents
+                    const container = document.getElementById('playwright-highlight-container');
+                    if (container) {
+                        container.remove();
+                    }
+                    // Remove highlight attributes from elements
+                    const highlightedElements = document.querySelectorAll('[browser-user-highlight-id^="playwright-highlight-"]');
+                    highlightedElements.forEach(el => {
+                        el.removeAttribute('browser-user-highlight-id');
+                    });
+                } catch (e) {
+                    console.error('Failed to remove highlights:', e);
+                }
+                """
+			)
+		except Exception as e:
+			logger.debug(f'Failed to remove highlights (this is usually ok): {str(e)}')
+			# Don't raise the error since this is not critical functionality
+			pass
+	# endregion
+	# region - User Actions
+	@classmethod
+	def _convert_simple_xpath_to_css_selector(cls, xpath: str) -> str:
+		"""Converts simple XPath expressions to CSS selectors."""
+		if not xpath:
+			return ''
+		# Remove leading slash if present
+		xpath = xpath.lstrip('/')
+		# Split into parts
+		parts = xpath.split('/')
+		css_parts = []
+		for part in parts:
+			if not part:
+				continue
+			# Handle index notation [n]
+			if '[' in part:
+				base_part = part[: part.find('[')]
+				index_part = part[part.find('[') :]
+				# Handle multiple indices
+				indices = [i.strip('[]') for i in index_part.split(']')[:-1]]
+				for idx in indices:
+					try:
+						# Handle numeric indices
+						if idx.isdigit():
+							index = int(idx) - 1
+							base_part += f':nth-of-type({index + 1})'
+						# Handle last() function
+						elif idx == 'last()':
+							base_part += ':last-of-type'
+						# Handle position() functions
+						elif 'position()' in idx:
+							if '>1' in idx:
+								base_part += ':nth-of-type(n+2)'
+					except ValueError:
+						continue
+				css_parts.append(base_part)
+			else:
+				css_parts.append(part)
+		base_selector = ' > '.join(css_parts)
+		return base_selector
+	@classmethod
+	@time_execution_sync('--enhanced_css_selector_for_element')
+	def _enhanced_css_selector_for_element(cls, element: DOMElementNode, include_dynamic_attributes: bool = True) -> str:
+		"""
+		Creates a CSS selector for a DOM element, handling various edge cases and special characters.
+		The selector starts from the element's XPath converted to CSS and is then
+		narrowed with class names and a whitelist of attributes.
+		Args:
+			element: The DOM element to create a selector for
+			include_dynamic_attributes: When true, class names and the data-* test attributes
+				(data-id, data-qa, data-cy, data-testid) are also added to the selector
+		Returns:
+			A CSS selector string; if building it raises, a fallback of the form
+			``tag[highlight_index='N']`` is returned instead
+		"""
+		try:
+			# Get base selector from XPath
+			css_selector = cls._convert_simple_xpath_to_css_selector(element.xpath)
+			# Handle class attributes
+			if 'class' in element.attributes and element.attributes['class'] and include_dynamic_attributes:
+				# Define a regex pattern for valid class names in CSS
+				valid_class_name_pattern = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_-]*$')
+				# Iterate through the class attribute values
+				classes = element.attributes['class'].split()
+				for class_name in classes:
+					# Skip empty class names
+					if not class_name.strip():
+						continue
+					# Check if the class name is valid
+					if valid_class_name_pattern.match(class_name):
+						# Append the valid class name to the CSS selector
+						css_selector += f'.{class_name}'
+					else:
+						# Skip invalid class names
+						continue
+			# Whitelist of attributes that may be added to the selector
+			SAFE_ATTRIBUTES = {
+				# Identifier
+				'id',
+				# Standard HTML attributes
+				'name',
+				'type',
+				'placeholder',
+				# Accessibility attributes
+				'aria-label',
+				'aria-labelledby',
+				'aria-describedby',
+				'role',
+				# Common form attributes
+				'for',
+				'autocomplete',
+				'required',
+				'readonly',
+				# Media attributes
+				'alt',
+				'title',
+				'src',
+				# Link attributes
+				'href',
+				'target',
+			}
+			if include_dynamic_attributes:
+				# data-* attributes commonly used as test hooks
+				dynamic_attributes = {
+					'data-id',
+					'data-qa',
+					'data-cy',
+					'data-testid',
+				}
+				SAFE_ATTRIBUTES.update(dynamic_attributes)
+			# Handle other attributes
+			for attribute, value in element.attributes.items():
+				# Classes were already handled above
+				if attribute == 'class':
+					continue
+				# Skip invalid attribute names
+				if not attribute.strip():
+					continue
+				if attribute not in SAFE_ATTRIBUTES:
+					continue
+				# Escape special characters in attribute names
+				safe_attribute = attribute.replace(':', r'\:')
+				# Handle different value cases
+				if value == '':
+					# Presence-only match for empty values
+					css_selector += f'[{safe_attribute}]'
+				elif any(char in value for char in '"\'<>`\n\r\t'):
+					# Use contains for values with special characters
+					# Regex-substitute *any* whitespace with a single space, then strip.
+					collapsed_value = re.sub(r'\s+', ' ', value).strip()
+					# Escape embedded double-quotes.
+					safe_value = collapsed_value.replace('"', '\\"')
+					css_selector += f'[{safe_attribute}*="{safe_value}"]'
+				else:
+					# Exact match for plain values
+					css_selector += f'[{safe_attribute}="{value}"]'
+			return css_selector
+		except Exception:
+			# Fallback to a more basic selector if something goes wrong
+			tag_name = element.tag_name or '*'
+			return f"{tag_name}[highlight_index='{element.highlight_index}']"
+	@time_execution_async('--get_locate_element')
+	async def get_locate_element(self, element: DOMElementNode) -> Optional[ElementHandle]:
+		"""Resolve a DOM node to an element handle on the current page.
+		Every ``iframe`` ancestor of ``element`` is entered from the outermost
+		inwards through ``frame_locator``; the element itself is then looked up by
+		its generated CSS selector.
+		Returns:
+			The element handle, or None when nothing matches or the lookup raises
+			(the error is logged).
+		"""
+		current_frame = await self.get_current_page()
+		# Start with the target element and collect all parents
+		parents: list[DOMElementNode] = []
+		current = element
+		while current.parent is not None:
+			parent = current.parent
+			parents.append(parent)
+			current = parent
+		# Reverse the parents list to process from top to bottom
+		parents.reverse()
+		# Process all iframe parents in sequence
+		iframes = [item for item in parents if item.tag_name == 'iframe']
+		for parent in iframes:
+			css_selector = self._enhanced_css_selector_for_element(
+				parent,
+				include_dynamic_attributes=self.config.include_dynamic_attributes,
+			)
+			# After the first iframe, current_frame is a FrameLocator rather than the page
+			current_frame = current_frame.frame_locator(css_selector)
+		css_selector = self._enhanced_css_selector_for_element(
+			element, include_dynamic_attributes=self.config.include_dynamic_attributes
+		)
+		try:
+			if isinstance(current_frame, FrameLocator):
+				# Inside an iframe: the handle is returned without scrolling it into view
+				element_handle = await current_frame.locator(css_selector).element_handle()
+				return element_handle
+			else:
+				# Try to scroll into view if hidden
+				element_handle = await current_frame.query_selector(css_selector)
+				if element_handle:
+					await element_handle.scroll_into_view_if_needed()
+					return element_handle
+				return None
+		except Exception as e:
+			logger.error(f'Failed to locate element: {str(e)}')
+			return None
+	@time_execution_async('--input_text_element_node')
+	async def _input_text_element_node(self, element_node: DOMElementNode, text: str):
+		"""
+		Input text into an element with proper error handling and state management.
+		Handles different types of input fields and ensures proper element state before input.
+		"""
+		try:
+			# Highlight before typing
+			# if element_node.highlight_index is not None:
+			# 	await self._update_state(focus_element=element_node.highlight_index)
+			element_handle = await self.get_locate_element(element_node)
+			if element_handle is None:
+				raise BrowserError(f'Element: {repr(element_node)} not found')
+			# Ensure element is ready for input
+			try:
+				await element_handle.wait_for_element_state('stable', timeout=1000)
+				await element_handle.scroll_into_view_if_needed(timeout=1000)
+			except Exception:
+				pass
+			# Get element properties to determine input method
+			tag_handle = await element_handle.get_property("tagName")
+			tag_name = (await tag_handle.json_value()).lower()
+			is_contenteditable = await element_handle.get_property('isContentEditable')
+			readonly_handle = await element_handle.get_property("readOnly")
+			disabled_handle = await element_handle.get_property("disabled")
+			readonly = await readonly_handle.json_value() if readonly_handle else False
+			disabled = await disabled_handle.json_value() if disabled_handle else False
+			if (await is_contenteditable.json_value() or tag_name == 'input') and not (readonly or disabled):
+				await element_handle.evaluate('el => el.textContent = ""')
+				await element_handle.type(text, delay=5)
+			else:
+				await element_handle.fill(text)
+		except Exception as e:
+			logger.debug(f'Failed to input text into element: {repr(element_node)}. Error: {str(e)}')
+			raise BrowserError(f'Failed to input text into index {element_node.highlight_index}')
+	@time_execution_async('--click_element_node')
+	async def _click_element_node(self, element_node: DOMElementNode) -> Optional[str]:
+		"""
+		Click the element described by ``element_node``.
+		The element is located through ``get_locate_element``. A regular click is
+		tried first; if it fails, a JavaScript ``el.click()`` is used instead.
+		Returns:
+			The path of the saved file when the click triggered a download and
+			``save_downloads_path`` is configured, otherwise None
+		Raises:
+			URLNotAllowedError: re-raised unchanged from the navigation check
+			Exception: if the element is not found or both click attempts fail
+		"""
+		page = await self.get_current_page()
+		try:
+			# Highlight before clicking
+			# if element_node.highlight_index is not None:
+			# 	await self._update_state(focus_element=element_node.highlight_index)
+			element_handle = await self.get_locate_element(element_node)
+			if element_handle is None:
+				raise Exception(f'Element: {repr(element_node)} not found')
+			async def perform_click(click_func):
+				"""Performs the actual click, handling both download
+				and navigation scenarios."""
+				if self.config.save_downloads_path:
+					try:
+						# Try short-timeout expect_download to detect whether a file download has been triggered
+						async with page.expect_download(timeout=5000) as download_info:
+							await click_func()
+						download = await download_info.value
+						# Determine file path
+						suggested_filename = download.suggested_filename
+						unique_filename = await self._get_unique_filename(self.config.save_downloads_path, suggested_filename)
+						download_path = os.path.join(self.config.save_downloads_path, unique_filename)
+						await download.save_as(download_path)
+						logger.debug(f'Download triggered. Saved file to: {download_path}')
+						return download_path
+					except TimeoutError:
+						# If no download is triggered, treat as normal click
+						# NOTE(review): which TimeoutError this is depends on the file's imports, not visible here
+						logger.debug('No download triggered within timeout. Checking navigation...')
+						await page.wait_for_load_state()
+						await self._check_and_handle_navigation(page)
+				else:
+					# Standard click logic if no download is expected
+					await click_func()
+					await page.wait_for_load_state()
+					await self._check_and_handle_navigation(page)
+			try:
+				return await perform_click(lambda: element_handle.click(timeout=1500))
+			except URLNotAllowedError as e:
+				raise e
+			except Exception:
+				# Fall back to a JavaScript click when the regular click fails
+				try:
+					return await perform_click(lambda: page.evaluate('(el) => el.click()', element_handle))
+				except URLNotAllowedError as e:
+					raise e
+				except Exception as e:
+					raise Exception(f'Failed to click element: {str(e)}')
+		except URLNotAllowedError as e:
+			raise e
+		except Exception as e:
+			raise Exception(f'Failed to click element: {repr(element_node)}. Error: {str(e)}')
+	@time_execution_async('--get_tabs_info')
+	async def get_tabs_info(self) -> list[TabInfo]:
+		"""Get information about all tabs"""
+		session = await self.get_session()
+		tabs_info = []
+		for page_id, page in enumerate(session.context.pages):
+			tab_info = TabInfo(page_id=page_id, url=page.url, title=await page.title())
+			tabs_info.append(tab_info)
+		return tabs_info
+	@time_execution_async('--switch_to_tab')
+	async def switch_to_tab(self, page_id: int) -> None:
+		"""Switch to a specific tab by its page_id"""
+		session = await self.get_session()
+		pages = session.context.pages
+		if page_id >= len(pages):
+			raise BrowserError(f'No tab found with page_id: {page_id}')
+		page = pages[page_id]
+		# Check if the tab's URL is allowed before switching
+		if not self._is_url_allowed(page.url):
+			raise BrowserError(f'Cannot switch to tab with non-allowed URL: {page.url}')
+		# Update target ID if using CDP
+		if self.browser.config.cdp_url:
+			targets = await self._get_cdp_targets()
+			for target in targets:
+				if target['url'] == page.url:
+					self.state.target_id = target['targetId']
+					break
+		await page.bring_to_front()
+		await page.wait_for_load_state()
+	@time_execution_async('--create_new_tab')
+	async def create_new_tab(self, url: str | None = None) -> None:
+		"""Create a new tab and optionally navigate to a URL"""
+		if url and not self._is_url_allowed(url):
+			raise BrowserError(f'Cannot create new tab with non-allowed URL: {url}')
+		session = await self.get_session()
+		new_page = await session.context.new_page()
+		await new_page.wait_for_load_state()
+		if url:
+			await new_page.goto(url)
+			await self._wait_for_page_and_frames_load(timeout_overwrite=1)
+		# Get target ID for new page if using CDP
+		if self.browser.config.cdp_url:
+			targets = await self._get_cdp_targets()
+			for target in targets:
+				if target['url'] == new_page.url:
+					self.state.target_id = target['targetId']
+					break
+	# endregion
+	# region - Helper methods for easier access to the DOM
+	async def _get_current_page(self, session: BrowserSession) -> Page:
+		pages = session.context.pages
+		# Try to find page by target ID if using CDP
+		if self.browser.config.cdp_url and self.state.target_id:
+			targets = await self._get_cdp_targets()
+			for target in targets:
+				if target['targetId'] == self.state.target_id:
+					for page in pages:
+						if page.url == target['url']:
+							return page
+		# Fallback to last page
+		return pages[-1] if pages else await session.context.new_page()
+	async def get_selector_map(self) -> SelectorMap:
+		session = await self.get_session()
+		if session.cached_state is None:
+			return {}
+		return session.cached_state.selector_map
+	async def get_element_by_index(self, index: int) -> ElementHandle | None:
+		selector_map = await self.get_selector_map()
+		element_handle = await self.get_locate_element(selector_map[index])
+		return element_handle
+	async def get_dom_element_by_index(self, index: int) -> DOMElementNode:
+		selector_map = await self.get_selector_map()
+		return selector_map[index]
+	async def save_cookies(self):
+		"""Save current cookies to file"""
+		if self.session and self.session.context and self.config.cookies_file:
+			try:
+				cookies = await self.session.context.cookies()
+				logger.debug(f'Saving {len(cookies)} cookies to {self.config.cookies_file}')
+				# Check if the path is a directory and create it if necessary
+				dirname = os.path.dirname(self.config.cookies_file)
+				if dirname:
+					os.makedirs(dirname, exist_ok=True)
+				with open(self.config.cookies_file, 'w') as f:
+					json.dump(cookies, f)
+			except Exception as e:
+				logger.warning(f'Failed to save cookies: {str(e)}')
+	async def is_file_uploader(self, element_node: DOMElementNode, max_depth: int = 3, current_depth: int = 0) -> bool:
+		"""Check if element or its children are file uploaders"""
+		if current_depth > max_depth:
+			return False
+		# Check current element
+		is_uploader = False
+		if not isinstance(element_node, DOMElementNode):
+			return False
+		# Check for file input attributes
+		if element_node.tag_name == 'input':
+			is_uploader = element_node.attributes.get('type') == 'file' or element_node.attributes.get('accept') is not None
+		if is_uploader:
+			return True
+		# Recursively check children
+		if element_node.children and current_depth < max_depth:
+			for child in element_node.children:
+				if isinstance(child, DOMElementNode):
+					if await self.is_file_uploader(child, max_depth, current_depth + 1):
+						return True
+		return False
+	async def get_scroll_info(self, page: Page) -> tuple[int, int]:
+		"""Get scroll position information for the current page."""
+		scroll_y = await page.evaluate('window.scrollY')
+		viewport_height = await page.evaluate('window.innerHeight')
+		total_height = await page.evaluate('document.documentElement.scrollHeight')
+		pixels_above = scroll_y
+		pixels_below = total_height - (scroll_y + viewport_height)
+		return pixels_above, pixels_below
+	async def reset_context(self):
+		"""Reset the browser session
+		Call this when you don't want to kill the context but just kill the state
+		"""
+		# close all tabs and clear cached state
+		session = await self.get_session()
+		pages = session.context.pages
+		for page in pages:
+			await page.close()
+		session.cached_state = None
+		self.state.target_id = None
+	async def _get_unique_filename(self, directory, filename):
+		"""Generate a unique filename by appending (1), (2), etc., if a file already exists."""
+		base, ext = os.path.splitext(filename)
+		counter = 1
+		new_filename = filename
+		while os.path.exists(os.path.join(directory, new_filename)):
+			new_filename = f'{base} ({counter}){ext}'
+			counter += 1
+		return new_filename
+	async def _get_cdp_targets(self) -> list[dict]:
+		"""Get all CDP targets directly using CDP protocol"""
+		if not self.browser.config.cdp_url or not self.session:
+			return []
+		try:
+			pages = self.session.context.pages
+			if not pages:
+				return []
+			cdp_session = await pages[0].context.new_cdp_session(pages[0])
+			result = await cdp_session.send('Target.getTargets')
+			await cdp_session.detach()
+			return result.get('targetInfos', [])
+		except Exception as e:
+			logger.debug(f'Failed to get CDP targets: {e}')
+			return []

browser_use/browser/tests/screenshot_test.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import base64
+import pytest
+from browser_use.browser.browser import Browser, BrowserConfig
+@pytest.fixture
+async def browser():
+	browser_service = Browser(config=BrowserConfig(headless=True))
+	yield browser_service
+	await browser_service.close()
+# @pytest.mark.skip(reason='takes too long')
+def test_take_full_page_screenshot(browser):
+	# Go to a test page
+	browser.go_to_url('https://example.com')
+	# Take full page screenshot
+	screenshot_b64 = browser.take_screenshot(full_page=True)
+	# Verify screenshot is not empty and is valid base64
+	assert screenshot_b64 is not None
+	assert isinstance(screenshot_b64, str)
+	assert len(screenshot_b64) > 0
+	# Test we can decode the base64 string
+	try:
+		base64.b64decode(screenshot_b64)
+	except Exception as e:
+		pytest.fail(f'Failed to decode base64 screenshot: {str(e)}')
+if __name__ == '__main__':
+	test_take_full_page_screenshot(Browser(config=BrowserConfig(headless=False)))

browser_use/browser/tests/test_clicks.py ADDED Viewed

	@@ -0,0 +1,94 @@

+import asyncio
+import json
+import pytest
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.dom.views import DOMBaseNode, DOMElementNode, DOMTextNode
+from browser_use.utils import time_execution_sync
+class ElementTreeSerializer:
+	@staticmethod
+	def dom_element_node_to_json(element_tree: DOMElementNode) -> dict:
+		def node_to_dict(node: DOMBaseNode) -> dict:
+			if isinstance(node, DOMTextNode):
+				return {'type': 'text', 'text': node.text}
+			elif isinstance(node, DOMElementNode):
+				return {
+					'type': 'element',
+					'tag_name': node.tag_name,
+					'attributes': node.attributes,
+					'highlight_index': node.highlight_index,
+					'children': [node_to_dict(child) for child in node.children],
+				}
+			return {}
+		return node_to_dict(element_tree)
+# run with: pytest browser_use/browser/tests/test_clicks.py
+@pytest.mark.asyncio
+async def test_highlight_elements():
+	"""Interactive debugging loop: dump the page state, report duplicate XPaths and click the chosen element.
+	The loop never ends on its own; each round reads an element index from
+	stdin, and any exception is printed before the next round starts.
+	"""
+	browser = Browser(config=BrowserConfig(headless=False, disable_security=True))
+	async with await browser.new_context() as context:
+		page = await context.get_current_page()
+		# Alternative pages to try:
+		# await page.goto('https://immobilienscout24.de')
+		# await page.goto('https://help.sap.com/docs/sap-ai-core/sap-ai-core-service-guide/service-plans')
+		# await page.goto('https://google.com/search?q=elon+musk')
+		# await page.goto('https://kayak.com')
+		# await page.goto('https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe')
+		# await page.goto('https://dictionary.cambridge.org')
+		# await page.goto('https://github.com')
+		await page.goto('https://huggingface.co/')
+		await asyncio.sleep(1)
+		while True:
+			try:
+				# await asyncio.sleep(10)
+				state = await context.get_state()
+				# NOTE(review): ./tmp is not created here - presumably it must already exist; confirm
+				with open('./tmp/page.json', 'w') as f:
+					json.dump(
+						ElementTreeSerializer.dom_element_node_to_json(state.element_tree),
+						f,
+						indent=1,
+					)
+				# await time_execution_sync('highlight_selector_map_elements')(
+				# 	browser.highlight_selector_map_elements
+				# )(state.selector_map)
+				# Find and print duplicate XPaths
+				xpath_counts = {}
+				# An empty selector map restarts the loop immediately, without waiting
+				if not state.selector_map:
+					continue
+				for selector in state.selector_map.values():
+					xpath = selector.xpath
+					if xpath in xpath_counts:
+						xpath_counts[xpath] += 1
+					else:
+						xpath_counts[xpath] = 1
+				print('\nDuplicate XPaths found:')
+				for xpath, count in xpath_counts.items():
+					if count > 1:
+						print(f'XPath: {xpath}')
+						print(f'Count: {count}\n')
+				print(list(state.selector_map.keys()), 'Selector map keys')
+				print(state.element_tree.clickable_elements_to_string())
+				action = input('Select next action: ')
+				# The sync timing wrapper returns the coroutine, which is awaited here
+				await time_execution_sync('remove_highlight_elements')(context.remove_highlights)()
+				node_element = state.selector_map[int(action)]
+				# check if index of selector map are the same as index of items in dom_items
+				await context._click_element_node(node_element)
+			except Exception as e:
+				print(e)

browser_use/browser/views.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from dataclasses import dataclass, field
+from typing import Any, Optional
+from pydantic import BaseModel
+from browser_use.dom.history_tree_processor.service import DOMHistoryElement
+from browser_use.dom.views import DOMState
+# Pydantic
+class TabInfo(BaseModel):
+	"""Represents information about a browser tab"""
+	# Identifier of the tab; presumably its position among the context's open pages - confirm against the producer
+	page_id: int
+	# URL of the tab's page
+	url: str
+	# Title of the tab's page
+	title: str
+@dataclass
+class BrowserState(DOMState):
+	"""Snapshot of the browser: the DOMState fields plus page, tab, screenshot and scroll information."""
+	# URL and title of the page the state was built from
+	url: str
+	title: str
+	# One entry per open tab
+	tabs: list[TabInfo]
+	# Screenshot of the page; presumably base64 encoded - confirm against the producer
+	screenshot: Optional[str] = None
+	# Scrollable pixels above and below the viewport
+	pixels_above: int = 0
+	pixels_below: int = 0
+	# NOTE(review): nothing visible here fills this list - presumably error messages collected elsewhere; confirm
+	browser_errors: list[str] = field(default_factory=list)
+@dataclass
+class BrowserStateHistory:
+	url: str
+	title: str
+	tabs: list[TabInfo]
+	interacted_element: list[DOMHistoryElement | None] | list[None]
+	screenshot: Optional[str] = None
+	def to_dict(self) -> dict[str, Any]:
+		data = {}
+		data['tabs'] = [tab.model_dump() for tab in self.tabs]
+		data['screenshot'] = self.screenshot
+		data['interacted_element'] = [el.to_dict() if el else None for el in self.interacted_element]
+		data['url'] = self.url
+		data['title'] = self.title
+		return data
+class BrowserError(Exception):
+	"""Base class for all browser errors; catching it also catches URLNotAllowedError."""
+class URLNotAllowedError(BrowserError):
+	"""Error raised when a URL is not allowed; a BrowserError subclass, so generic browser-error handlers catch it too."""

browser_use/controller/registry/service.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import asyncio
+from inspect import iscoroutinefunction, signature
+from typing import Any, Callable, Dict, Generic, Optional, Type, TypeVar
+from langchain_core.language_models.chat_models import BaseChatModel
+from pydantic import BaseModel, Field, create_model
+from browser_use.browser.context import BrowserContext
+from browser_use.controller.registry.views import (
+	ActionModel,
+	ActionRegistry,
+	RegisteredAction,
+)
+from browser_use.telemetry.service import ProductTelemetry
+from browser_use.telemetry.views import (
+	ControllerRegisteredFunctionsTelemetryEvent,
+	RegisteredFunction,
+)
+from browser_use.utils import time_execution_async, time_execution_sync
+Context = TypeVar('Context')
+class Registry(Generic[Context]):
+	"""Service for registering and managing actions"""
+	def __init__(self, exclude_actions: list[str] | None = None):
+		self.registry = ActionRegistry()
+		self.telemetry = ProductTelemetry()
+		self.exclude_actions = exclude_actions if exclude_actions is not None else []
+	@time_execution_sync('--create_param_model')
+	def _create_param_model(self, function: Callable) -> Type[BaseModel]:
+		"""Creates a Pydantic model from function signature"""
+		sig = signature(function)
+		params = {
+			name: (param.annotation, ... if param.default == param.empty else param.default)
+			for name, param in sig.parameters.items()
+			if name != 'browser' and name != 'page_extraction_llm' and name != 'available_file_paths'
+		}
+		# TODO: make the types here work
+		return create_model(
+			f'{function.__name__}_parameters',
+			__base__=ActionModel,
+			**params,  # type: ignore
+		)
+	def action(
+		self,
+		description: str,
+		param_model: Optional[Type[BaseModel]] = None,
+	):
+		"""Decorator for registering actions"""
+		def decorator(func: Callable):
+			# Skip registration if action is in exclude_actions
+			if func.__name__ in self.exclude_actions:
+				return func
+			# Create param model from function if not provided
+			actual_param_model = param_model or self._create_param_model(func)
+			# Wrap sync functions to make them async
+			if not iscoroutinefunction(func):
+				async def async_wrapper(*args, **kwargs):
+					return await asyncio.to_thread(func, *args, **kwargs)
+				# Copy the signature and other metadata from the original function
+				async_wrapper.__signature__ = signature(func)
+				async_wrapper.__name__ = func.__name__
+				async_wrapper.__annotations__ = func.__annotations__
+				wrapped_func = async_wrapper
+			else:
+				wrapped_func = func
+			action = RegisteredAction(
+				name=func.__name__,
+				description=description,
+				function=wrapped_func,
+				param_model=actual_param_model,
+			)
+			self.registry.actions[func.__name__] = action
+			return func
+		return decorator
+	@time_execution_async('--execute_action')
+	async def execute_action(
+		self,
+		action_name: str,
+		params: dict,
+		browser: Optional[BrowserContext] = None,
+		page_extraction_llm: Optional[BaseChatModel] = None,
+		sensitive_data: Optional[Dict[str, str]] = None,
+		available_file_paths: Optional[list[str]] = None,
+		#
+		context: Context | None = None,
+	) -> Any:
+		"""Execute a registered action"""
+		if action_name not in self.registry.actions:
+			raise ValueError(f'Action {action_name} not found')
+		action = self.registry.actions[action_name]
+		try:
+			# Create the validated Pydantic model
+			validated_params = action.param_model(**params)
+			# Check if the first parameter is a Pydantic model
+			sig = signature(action.function)
+			parameters = list(sig.parameters.values())
+			is_pydantic = parameters and issubclass(parameters[0].annotation, BaseModel)
+			parameter_names = [param.name for param in parameters]
+			if sensitive_data:
+				validated_params = self._replace_sensitive_data(validated_params, sensitive_data)
+			# Check if the action requires browser
+			if 'browser' in parameter_names and not browser:
+				raise ValueError(f'Action {action_name} requires browser but none provided.')
+			if 'page_extraction_llm' in parameter_names and not page_extraction_llm:
+				raise ValueError(f'Action {action_name} requires page_extraction_llm but none provided.')
+			if 'available_file_paths' in parameter_names and not available_file_paths:
+				raise ValueError(f'Action {action_name} requires available_file_paths but none provided.')
+			if 'context' in parameter_names and not context:
+				raise ValueError(f'Action {action_name} requires context but none provided.')
+			# Prepare arguments based on parameter type
+			extra_args = {}
+			if 'context' in parameter_names:
+				extra_args['context'] = context
+			if 'browser' in parameter_names:
+				extra_args['browser'] = browser
+			if 'page_extraction_llm' in parameter_names:
+				extra_args['page_extraction_llm'] = page_extraction_llm
+			if 'available_file_paths' in parameter_names:
+				extra_args['available_file_paths'] = available_file_paths
+			if action_name == 'input_text' and sensitive_data:
+				extra_args['has_sensitive_data'] = True
+			if is_pydantic:
+				return await action.function(validated_params, **extra_args)
+			return await action.function(**validated_params.model_dump(), **extra_args)
+		except Exception as e:
+			raise RuntimeError(f'Error executing action {action_name}: {str(e)}') from e
+	def _replace_sensitive_data(self, params: BaseModel, sensitive_data: Dict[str, str]) -> BaseModel:
+		"""Replaces the sensitive data in the params"""
+		# if there are any str with <secret>placeholder</secret> in the params, replace them with the actual value from sensitive_data
+		import re
+		secret_pattern = re.compile(r'<secret>(.*?)</secret>')
+		def replace_secrets(value):
+			if isinstance(value, str):
+				matches = secret_pattern.findall(value)
+				for placeholder in matches:
+					if placeholder in sensitive_data:
+						value = value.replace(f'<secret>{placeholder}</secret>', sensitive_data[placeholder])
+				return value
+			elif isinstance(value, dict):
+				return {k: replace_secrets(v) for k, v in value.items()}
+			elif isinstance(value, list):
+				return [replace_secrets(v) for v in value]
+			return value
+		for key, value in params.model_dump().items():
+			params.__dict__[key] = replace_secrets(value)
+		return params
+	@time_execution_sync('--create_action_model')
+	def create_action_model(self, include_actions: Optional[list[str]] = None) -> Type[ActionModel]:
+		"""Creates a Pydantic model from registered actions"""
+		fields = {
+			name: (
+				Optional[action.param_model],
+				Field(default=None, description=action.description),
+			)
+			for name, action in self.registry.actions.items()
+			if include_actions is None or name in include_actions
+		}
+		self.telemetry.capture(
+			ControllerRegisteredFunctionsTelemetryEvent(
+				registered_functions=[
+					RegisteredFunction(name=name, params=action.param_model.model_json_schema())
+					for name, action in self.registry.actions.items()
+					if include_actions is None or name in include_actions
+				]
+			)
+		)
+		return create_model('ActionModel', __base__=ActionModel, **fields)  # type:ignore
+	def get_prompt_description(self) -> str:
+		"""Get a description of all actions for the prompt"""
+		return self.registry.get_prompt_description()

browser_use/controller/registry/views.py ADDED Viewed

	@@ -0,0 +1,70 @@

+from typing import Callable, Dict, Type
+from pydantic import BaseModel, ConfigDict
+class RegisteredAction(BaseModel):
+	"""Model for a registered action"""
+	name: str
+	description: str
+	function: Callable
+	param_model: Type[BaseModel]
+	model_config = ConfigDict(arbitrary_types_allowed=True)
+	def prompt_description(self) -> str:
+		"""Get a description of the action for the prompt"""
+		skip_keys = ['title']
+		s = f'{self.description}: \n'
+		s += '{' + str(self.name) + ': '
+		s += str(
+			{
+				k: {sub_k: sub_v for sub_k, sub_v in v.items() if sub_k not in skip_keys}
+				for k, v in self.param_model.schema()['properties'].items()
+			}
+		)
+		s += '}'
+		return s
+class ActionModel(BaseModel):
+	"""Base model for dynamically created action models"""
+	# this will have all the registered actions, e.g.
+	# click_element = param_model = ClickElementParams
+	# done = param_model = None
+	#
+	model_config = ConfigDict(arbitrary_types_allowed=True)
+	def get_index(self) -> int | None:
+		"""Get the index of the action"""
+		# {'clicked_element': {'index':5}}
+		params = self.model_dump(exclude_unset=True).values()
+		if not params:
+			return None
+		for param in params:
+			if param is not None and 'index' in param:
+				return param['index']
+		return None
+	def set_index(self, index: int):
+		"""Overwrite the index of the action"""
+		# Get the action name and params
+		action_data = self.model_dump(exclude_unset=True)
+		action_name = next(iter(action_data.keys()))
+		action_params = getattr(self, action_name)
+		# Update the index directly on the model
+		if hasattr(action_params, 'index'):
+			action_params.index = index
+class ActionRegistry(BaseModel):
+	"""Model representing the action registry"""
+	actions: Dict[str, RegisteredAction] = {}
+	def get_prompt_description(self) -> str:
+		"""Get a description of all actions for the prompt"""
+		return '\n'.join([action.prompt_description() for action in self.actions.values()])

browser_use/controller/service.py ADDED Viewed

	@@ -0,0 +1,532 @@

+import asyncio
+import json
+import enum
+import logging
+from typing import Dict, Generic, Optional, Type, TypeVar
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.prompts import PromptTemplate
+# from lmnr.sdk.laminar import Laminar
+from pydantic import BaseModel
+from browser_use.agent.views import ActionModel, ActionResult
+from browser_use.browser.context import BrowserContext
+from browser_use.controller.registry.service import Registry
+from browser_use.controller.views import (
+	ClickElementAction,
+	DoneAction,
+	GoToUrlAction,
+	InputTextAction,
+	NoParamsAction,
+	OpenTabAction,
+	ScrollAction,
+	SearchGoogleAction,
+	SendKeysAction,
+	SwitchTabAction,
+)
+from browser_use.utils import time_execution_sync
+# Module-level logger named after this module
+logger = logging.getLogger(__name__)
+# Type variable for the caller-supplied context object that Controller.act forwards to actions
+Context = TypeVar('Context')
+class Controller(Generic[Context]):
+	def __init__(
+		self,
+		exclude_actions: list[str] = [],
+		output_model: Optional[Type[BaseModel]] = None,
+	):
+		self.registry = Registry[Context](exclude_actions)
+		"""Register all default browser actions"""
+		if output_model is not None:
+			# Create a new model that extends the output model with success parameter
+			class ExtendedOutputModel(BaseModel):  # type: ignore
+				success: bool = True
+				data: output_model
+			@self.registry.action(
+				'Complete task - with return text and if the task is finished (success=True) or not yet  completly finished (success=False), because last step is reached',
+				param_model=ExtendedOutputModel,
+			)
+			async def done(params: ExtendedOutputModel):
+				# Exclude success from the output JSON since it's an internal parameter
+				output_dict = params.data.model_dump()
+				# Enums are not serializable, convert to string
+				for key, value in output_dict.items():
+					if isinstance(value, enum.Enum):
+						output_dict[key] = value.value
+				return ActionResult(is_done=True, success=params.success, extracted_content=json.dumps(output_dict))
+		else:
+			@self.registry.action(
+				'Complete task - with return text and if the task is finished (success=True) or not yet  completly finished (success=False), because last step is reached',
+				param_model=DoneAction,
+			)
+			async def done(params: DoneAction):
+				return ActionResult(is_done=True, success=params.success, extracted_content=params.text)
+		# Basic Navigation Actions
+		@self.registry.action(
+			'Search the query in Google in the current tab, the query should be a search query like humans search in Google, concrete and not vague or super long. More the single most important items. ',
+			param_model=SearchGoogleAction,
+		)
+		async def search_google(params: SearchGoogleAction, browser: BrowserContext):
+			page = await browser.get_current_page()
+			await page.goto(f'https://www.google.com/search?q={params.query}&udm=14')
+			await page.wait_for_load_state()
+			msg = f'🔍  Searched for "{params.query}" in Google'
+			logger.info(msg)
+			return ActionResult(extracted_content=msg, include_in_memory=True)
+		@self.registry.action('Navigate to URL in the current tab', param_model=GoToUrlAction)
+		async def go_to_url(params: GoToUrlAction, browser: BrowserContext):
+			page = await browser.get_current_page()
+			await page.goto(params.url)
+			await page.wait_for_load_state()
+			msg = f'🔗  Navigated to {params.url}'
+			logger.info(msg)
+			return ActionResult(extracted_content=msg, include_in_memory=True)
+		@self.registry.action('Go back', param_model=NoParamsAction)
+		async def go_back(_: NoParamsAction, browser: BrowserContext):
+			await browser.go_back()
+			msg = '🔙  Navigated back'
+			logger.info(msg)
+			return ActionResult(extracted_content=msg, include_in_memory=True)
+		# wait for x seconds
+		@self.registry.action('Wait for x seconds default 3')
+		async def wait(seconds: int = 3):
+			msg = f'🕒  Waiting for {seconds} seconds'
+			logger.info(msg)
+			await asyncio.sleep(seconds)
+			return ActionResult(extracted_content=msg, include_in_memory=True)
+		# Element Interaction Actions
+		@self.registry.action('Click element', param_model=ClickElementAction)
+		async def click_element(params: ClickElementAction, browser: BrowserContext):
+			session = await browser.get_session()
+			if params.index not in await browser.get_selector_map():
+				raise Exception(f'Element with index {params.index} does not exist - retry or use alternative actions')
+			element_node = await browser.get_dom_element_by_index(params.index)
+			initial_pages = len(session.context.pages)
+			# if element has file uploader then dont click
+			if await browser.is_file_uploader(element_node):
+				msg = f'Index {params.index} - has an element which opens file upload dialog. To upload files please use a specific function to upload files '
+				logger.info(msg)
+				return ActionResult(extracted_content=msg, include_in_memory=True)
+			msg = None
+			try:
+				download_path = await browser._click_element_node(element_node)
+				if download_path:
+					msg = f'💾  Downloaded file to {download_path}'
+				else:
+					msg = f'🖱️  Clicked button with index {params.index}: {element_node.get_all_text_till_next_clickable_element(max_depth=2)}'
+				logger.info(msg)
+				logger.debug(f'Element xpath: {element_node.xpath}')
+				if len(session.context.pages) > initial_pages:
+					new_tab_msg = 'New tab opened - switching to it'
+					msg += f' - {new_tab_msg}'
+					logger.info(new_tab_msg)
+					await browser.switch_to_tab(-1)
+				return ActionResult(extracted_content=msg, include_in_memory=True)
+			except Exception as e:
+				logger.warning(f'Element not clickable with index {params.index} - most likely the page changed')
+				return ActionResult(error=str(e))
+		@self.registry.action(
+			'Input text into a input interactive element',
+			param_model=InputTextAction,
+		)
+		async def input_text(params: InputTextAction, browser: BrowserContext, has_sensitive_data: bool = False):
+			if params.index not in await browser.get_selector_map():
+				raise Exception(f'Element index {params.index} does not exist - retry or use alternative actions')
+			element_node = await browser.get_dom_element_by_index(params.index)
+			await browser._input_text_element_node(element_node, params.text)
+			if not has_sensitive_data:
+				msg = f'⌨️  Input {params.text} into index {params.index}'
+			else:
+				msg = f'⌨️  Input sensitive data into index {params.index}'
+			logger.info(msg)
+			logger.debug(f'Element xpath: {element_node.xpath}')
+			return ActionResult(extracted_content=msg, include_in_memory=True)
+		# Tab Management Actions
+		@self.registry.action('Switch tab', param_model=SwitchTabAction)
+		async def switch_tab(params: SwitchTabAction, browser: BrowserContext):
+			await browser.switch_to_tab(params.page_id)
+			# Wait for tab to be ready
+			page = await browser.get_current_page()
+			await page.wait_for_load_state()
+			msg = f'🔄  Switched to tab {params.page_id}'
+			logger.info(msg)
+			return ActionResult(extracted_content=msg, include_in_memory=True)
+		@self.registry.action('Open url in new tab', param_model=OpenTabAction)
+		async def open_tab(params: OpenTabAction, browser: BrowserContext):
+			await browser.create_new_tab(params.url)
+			msg = f'🔗  Opened new tab with {params.url}'
+			logger.info(msg)
+			return ActionResult(extracted_content=msg, include_in_memory=True)
+		# Content Actions
+		@self.registry.action(
+			'Extract page content to retrieve specific information from the page, e.g. all company names, a specifc description, all information about, links with companies in structured format or simply links',
+		)
+		async def extract_content(goal: str, browser: BrowserContext, page_extraction_llm: BaseChatModel):
+			page = await browser.get_current_page()
+			import markdownify
+			content = markdownify.markdownify(await page.content())
+			prompt = 'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. Extraction goal: {goal}, Page: {page}'
+			template = PromptTemplate(input_variables=['goal', 'page'], template=prompt)
+			try:
+				output = page_extraction_llm.invoke(template.format(goal=goal, page=content))
+				msg = f'📄  Extracted from page\n: {output.content}\n'
+				logger.info(msg)
+				return ActionResult(extracted_content=msg, include_in_memory=True)
+			except Exception as e:
+				logger.debug(f'Error extracting content: {e}')
+				msg = f'📄  Extracted from page\n: {content}\n'
+				logger.info(msg)
+				return ActionResult(extracted_content=msg)
+		@self.registry.action(
+			'Scroll down the page by pixel amount - if no amount is specified, scroll down one page',
+			param_model=ScrollAction,
+		)
+		async def scroll_down(params: ScrollAction, browser: BrowserContext):
+			page = await browser.get_current_page()
+			if params.amount is not None:
+				await page.evaluate(f'window.scrollBy(0, {params.amount});')
+			else:
+				await page.evaluate('window.scrollBy(0, window.innerHeight);')
+			amount = f'{params.amount} pixels' if params.amount is not None else 'one page'
+			msg = f'🔍  Scrolled down the page by {amount}'
+			logger.info(msg)
+			return ActionResult(
+				extracted_content=msg,
+				include_in_memory=True,
+			)
+		# scroll up
+		@self.registry.action(
+			'Scroll up the page by pixel amount - if no amount is specified, scroll up one page',
+			param_model=ScrollAction,
+		)
+		async def scroll_up(params: ScrollAction, browser: BrowserContext):
+			page = await browser.get_current_page()
+			if params.amount is not None:
+				await page.evaluate(f'window.scrollBy(0, -{params.amount});')
+			else:
+				await page.evaluate('window.scrollBy(0, -window.innerHeight);')
+			amount = f'{params.amount} pixels' if params.amount is not None else 'one page'
+			msg = f'🔍  Scrolled up the page by {amount}'
+			logger.info(msg)
+			return ActionResult(
+				extracted_content=msg,
+				include_in_memory=True,
+			)
+		# send keys
+		@self.registry.action(
+			'Send strings of special keys like Escape,Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press. ',
+			param_model=SendKeysAction,
+		)
+		async def send_keys(params: SendKeysAction, browser: BrowserContext):
+			page = await browser.get_current_page()
+			try:
+				await page.keyboard.press(params.keys)
+			except Exception as e:
+				if 'Unknown key' in str(e):
+					# loop over the keys and try to send each one
+					for key in params.keys:
+						try:
+							await page.keyboard.press(key)
+						except Exception as e:
+							logger.debug(f'Error sending key {key}: {str(e)}')
+							raise e
+				else:
+					raise e
+			msg = f'⌨️  Sent keys: {params.keys}'
+			logger.info(msg)
+			return ActionResult(extracted_content=msg, include_in_memory=True)
+		@self.registry.action(
+			description='If you dont find something which you want to interact with, scroll to it',
+		)
+		async def scroll_to_text(text: str, browser: BrowserContext):  # type: ignore
+			page = await browser.get_current_page()
+			try:
+				# Try different locator strategies
+				locators = [
+					page.get_by_text(text, exact=False),
+					page.locator(f'text={text}'),
+					page.locator(f"//*[contains(text(), '{text}')]"),
+				]
+				for locator in locators:
+					try:
+						# First check if element exists and is visible
+						if await locator.count() > 0 and await locator.first.is_visible():
+							await locator.first.scroll_into_view_if_needed()
+							await asyncio.sleep(0.5)  # Wait for scroll to complete
+							msg = f'🔍  Scrolled to text: {text}'
+							logger.info(msg)
+							return ActionResult(extracted_content=msg, include_in_memory=True)
+					except Exception as e:
+						logger.debug(f'Locator attempt failed: {str(e)}')
+						continue
+				msg = f"Text '{text}' not found or not visible on page"
+				logger.info(msg)
+				return ActionResult(extracted_content=msg, include_in_memory=True)
+			except Exception as e:
+				msg = f"Failed to scroll to text '{text}': {str(e)}"
+				logger.error(msg)
+				return ActionResult(error=msg, include_in_memory=True)
+		@self.registry.action(
+			description='Get all options from a native dropdown',
+		)
+		async def get_dropdown_options(index: int, browser: BrowserContext) -> ActionResult:
+			"""Get all options from a native dropdown"""
+			page = await browser.get_current_page()
+			selector_map = await browser.get_selector_map()
+			dom_element = selector_map[index]
+			try:
+				# Frame-aware approach since we know it works
+				all_options = []
+				frame_index = 0
+				for frame in page.frames:
+					try:
+						options = await frame.evaluate(
+							"""
+							(xpath) => {
+								const select = document.evaluate(xpath, document, null,
+									XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
+								if (!select) return null;
+								return {
+									options: Array.from(select.options).map(opt => ({
+										text: opt.text, //do not trim, because we are doing exact match in select_dropdown_option
+										value: opt.value,
+										index: opt.index
+									})),
+									id: select.id,
+									name: select.name
+								};
+							}
+						""",
+							dom_element.xpath,
+						)
+						if options:
+							logger.debug(f'Found dropdown in frame {frame_index}')
+							logger.debug(f'Dropdown ID: {options["id"]}, Name: {options["name"]}')
+							formatted_options = []
+							for opt in options['options']:
+								# encoding ensures AI uses the exact string in select_dropdown_option
+								encoded_text = json.dumps(opt['text'])
+								formatted_options.append(f'{opt["index"]}: text={encoded_text}')
+							all_options.extend(formatted_options)
+					except Exception as frame_e:
+						logger.debug(f'Frame {frame_index} evaluation failed: {str(frame_e)}')
+					frame_index += 1
+				if all_options:
+					msg = '\n'.join(all_options)
+					msg += '\nUse the exact text string in select_dropdown_option'
+					logger.info(msg)
+					return ActionResult(extracted_content=msg, include_in_memory=True)
+				else:
+					msg = 'No options found in any frame for dropdown'
+					logger.info(msg)
+					return ActionResult(extracted_content=msg, include_in_memory=True)
+			except Exception as e:
+				logger.error(f'Failed to get dropdown options: {str(e)}')
+				msg = f'Error getting options: {str(e)}'
+				logger.info(msg)
+				return ActionResult(extracted_content=msg, include_in_memory=True)
+		@self.registry.action(
+			description='Select dropdown option for interactive element index by the text of the option you want to select',
+		)
+		async def select_dropdown_option(
+			index: int,
+			text: str,
+			browser: BrowserContext,
+		) -> ActionResult:
+			"""Select dropdown option by the text of the option you want to select"""
+			page = await browser.get_current_page()
+			selector_map = await browser.get_selector_map()
+			dom_element = selector_map[index]
+			# Validate that we're working with a select element
+			if dom_element.tag_name != 'select':
+				logger.error(f'Element is not a select! Tag: {dom_element.tag_name}, Attributes: {dom_element.attributes}')
+				msg = f'Cannot select option: Element with index {index} is a {dom_element.tag_name}, not a select'
+				return ActionResult(extracted_content=msg, include_in_memory=True)
+			logger.debug(f"Attempting to select '{text}' using xpath: {dom_element.xpath}")
+			logger.debug(f'Element attributes: {dom_element.attributes}')
+			logger.debug(f'Element tag: {dom_element.tag_name}')
+			xpath = '//' + dom_element.xpath
+			try:
+				frame_index = 0
+				for frame in page.frames:
+					try:
+						logger.debug(f'Trying frame {frame_index} URL: {frame.url}')
+						# First verify we can find the dropdown in this frame
+						find_dropdown_js = """
+							(xpath) => {
+								try {
+									const select = document.evaluate(xpath, document, null,
+										XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
+									if (!select) return null;
+									if (select.tagName.toLowerCase() !== 'select') {
+										return {
+											error: `Found element but it's a ${select.tagName}, not a SELECT`,
+											found: false
+										};
+									}
+									return {
+										id: select.id,
+										name: select.name,
+										found: true,
+										tagName: select.tagName,
+										optionCount: select.options.length,
+										currentValue: select.value,
+										availableOptions: Array.from(select.options).map(o => o.text.trim())
+									};
+								} catch (e) {
+									return {error: e.toString(), found: false};
+								}
+							}
+						"""
+						dropdown_info = await frame.evaluate(find_dropdown_js, dom_element.xpath)
+						if dropdown_info:
+							if not dropdown_info.get('found'):
+								logger.error(f'Frame {frame_index} error: {dropdown_info.get("error")}')
+								continue
+							logger.debug(f'Found dropdown in frame {frame_index}: {dropdown_info}')
+							# "label" because we are selecting by text
+							# nth(0) to disable error thrown by strict mode
+							# timeout=1000 because we are already waiting for all network events, therefore ideally we don't need to wait a lot here (default 30s)
+							selected_option_values = (
+								await frame.locator('//' + dom_element.xpath).nth(0).select_option(label=text, timeout=1000)
+							)
+							msg = f'selected option {text} with value {selected_option_values}'
+							logger.info(msg + f' in frame {frame_index}')
+							return ActionResult(extracted_content=msg, include_in_memory=True)
+					except Exception as frame_e:
+						logger.error(f'Frame {frame_index} attempt failed: {str(frame_e)}')
+						logger.error(f'Frame type: {type(frame)}')
+						logger.error(f'Frame URL: {frame.url}')
+					frame_index += 1
+				msg = f"Could not select option '{text}' in any frame"
+				logger.info(msg)
+				return ActionResult(extracted_content=msg, include_in_memory=True)
+			except Exception as e:
+				msg = f'Selection failed: {str(e)}'
+				logger.error(msg)
+				return ActionResult(error=msg, include_in_memory=True)
+	# Register ---------------------------------------------------------------
+	def action(self, description: str, **kwargs):
+		"""Decorator for registering custom actions
+		@param description: Describe the LLM what the function does (better description == better function calling)
+		"""
+		return self.registry.action(description, **kwargs)
+	# Act --------------------------------------------------------------------
+	@time_execution_sync('--act')
+	async def act(
+		self,
+		action: ActionModel,
+		browser_context: BrowserContext,
+		#
+		page_extraction_llm: Optional[BaseChatModel] = None,
+		sensitive_data: Optional[Dict[str, str]] = None,
+		available_file_paths: Optional[list[str]] = None,
+		#
+		context: Context | None = None,
+	) -> ActionResult:
+		"""Execute an action"""
+		try:
+			for action_name, params in action.model_dump(exclude_unset=True).items():
+				if params is not None:
+					# with Laminar.start_as_current_span(
+					# 	name=action_name,
+					# 	input={
+					# 		'action': action_name,
+					# 		'params': params,
+					# 	},
+					# 	span_type='TOOL',
+					# ):
+					result = await self.registry.execute_action(
+						action_name,
+						params,
+						browser=browser_context,
+						page_extraction_llm=page_extraction_llm,
+						sensitive_data=sensitive_data,
+						available_file_paths=available_file_paths,
+						context=context,
+					)
+					# Laminar.set_span_output(result)
+					if isinstance(result, str):
+						return ActionResult(extracted_content=result)
+					elif isinstance(result, ActionResult):
+						return result
+					elif result is None:
+						return ActionResult()
+					else:
+						raise ValueError(f'Invalid action result type: {type(result)} of {result}')
+			return ActionResult()
+		except Exception as e:
+			raise e

browser_use/controller/views.py ADDED Viewed

	@@ -0,0 +1,65 @@

+from typing import Optional
+from pydantic import BaseModel, ConfigDict, model_validator
+# Action Input Models
+class SearchGoogleAction(BaseModel):
+	# Parameters of the `search_google` controller action
+	query: str  # search text placed into the Google search URL
+class GoToUrlAction(BaseModel):
+	# Parameters of the `go_to_url` controller action
+	url: str  # address passed to page.goto in the current tab
+class ClickElementAction(BaseModel):
+	# Parameters of the `click_element` controller action
+	index: int  # key of the target element in the browser's selector map
+	xpath: Optional[str] = None  # optional; not read by the click_element handler shown in controller/service.py
+class InputTextAction(BaseModel):
+	# Parameters of the `input_text` controller action
+	index: int  # key of the target element in the browser's selector map
+	text: str  # text typed into the element
+	xpath: Optional[str] = None  # optional; not read by the input_text handler shown in controller/service.py
+class DoneAction(BaseModel):
+	# Parameters of the default `done` controller action
+	text: str  # returned as the result's extracted_content
+	success: bool  # forwarded to the result's success flag
+class SwitchTabAction(BaseModel):
+	# Parameters of the `switch_tab` controller action
+	page_id: int  # value passed to browser.switch_to_tab
+class OpenTabAction(BaseModel):
+	# Parameters of the `open_tab` controller action
+	url: str  # address passed to browser.create_new_tab
+class ScrollAction(BaseModel):
+	# Parameters shared by the `scroll_down` and `scroll_up` controller actions
+	amount: Optional[int] = None  # The number of pixels to scroll. If None, scroll down/up one page
+class SendKeysAction(BaseModel):
+	# Parameters of the `send_keys` controller action
+	keys: str  # key name or shortcut passed to page.keyboard.press (e.g. `Enter`, `Control+o`)
+class ExtractPageContentAction(BaseModel):
+	# NOTE(review): not referenced by the controller actions visible in this section - confirm it is still used
+	value: str
+class NoParamsAction(BaseModel):
+	"""
+	Accepts absolutely anything in the incoming data
+	and discards it, so the final parsed model is empty.
+	"""
+	@model_validator(mode='before')
+	def ignore_all_inputs(cls, values):
+		# No matter what the user sends, discard it and return empty.
+		return {}
+	class Config:
+		# If you want to silently allow unknown fields at top-level,
+		# set extra = 'allow' as well:
+		extra = 'allow'

browser_use/dom/__init__.py ADDED Viewed

File without changes

browser_use/dom/buildDomTree.js ADDED Viewed

	@@ -0,0 +1,1055 @@

+(
+  args = {
+    doHighlightElements: true,
+    focusHighlightIndex: -1,
+    viewportExpansion: 0,
+    debugMode: false,
+  }
+) => {
+  const { doHighlightElements, focusHighlightIndex, viewportExpansion, debugMode } = args;
+  let highlightIndex = 0; // Reset highlight index
+  // Add timing stack to handle recursion
+  // Each array holds performance.now() start timestamps: pushTiming appends one, popTiming removes the latest.
+  const TIMING_STACK = {
+    nodeProcessing: [],
+    treeTraversal: [],
+    highlighting: [],
+    // NOTE(review): `current` is not an array and is not used by pushTiming/popTiming callers seen here
+    current: null
+  };
+  function pushTiming(type) {
+    TIMING_STACK[type] = TIMING_STACK[type] || [];
+    TIMING_STACK[type].push(performance.now());
+  }
+  function popTiming(type) {
+    const start = TIMING_STACK[type].pop();
+    const duration = performance.now() - start;
+    return duration;
+  }
+  // Only initialize performance tracking if in debug mode
+  // PERF_METRICS is null outside debug mode, so every user must null-check it.
+  const PERF_METRICS = debugMode ? {
+    buildDomTreeCalls: 0,
+    timings: {
+      buildDomTree: 0,
+      highlightElement: 0,
+      isInteractiveElement: 0,
+      isElementVisible: 0,
+      isTopElement: 0,
+      isInExpandedViewport: 0,
+      isTextNodeVisible: 0,
+      getEffectiveScroll: 0,
+    },
+    // Hit/miss counters are incremented by getCachedBoundingRect and getCachedComputedStyle
+    cacheMetrics: {
+      boundingRectCacheHits: 0,
+      boundingRectCacheMisses: 0,
+      computedStyleCacheHits: 0,
+      computedStyleCacheMisses: 0,
+      getBoundingClientRectTime: 0,
+      getComputedStyleTime: 0,
+      boundingRectHitRate: 0,
+      computedStyleHitRate: 0,
+      overallHitRate: 0,
+    },
+    nodeMetrics: {
+      totalNodes: 0,
+      processedNodes: 0,
+      skippedNodes: 0,
+    },
+    buildDomTreeBreakdown: {
+      totalTime: 0,
+      totalSelfTime: 0,
+      buildDomTreeCalls: 0,
+      // Accumulated time per DOM operation, keyed by operation name (see measureDomOperation)
+      domOperations: {
+        getBoundingClientRect: 0,
+        getComputedStyle: 0,
+      },
+      // Number of calls per DOM operation, same keys as domOperations
+      domOperationCounts: {
+        getBoundingClientRect: 0,
+        getComputedStyle: 0,
+      }
+    }
+  } : null;
+  // Simple timing helper that only runs in debug mode
+  function measureTime(fn) {
+    if (!debugMode) return fn;
+    return function (...args) {
+      const start = performance.now();
+      const result = fn.apply(this, args);
+      const duration = performance.now() - start;
+      return result;
+    };
+  }
+  // Helper to measure DOM operations
+  function measureDomOperation(operation, name) {
+    if (!debugMode) return operation();
+    const start = performance.now();
+    const result = operation();
+    const duration = performance.now() - start;
+    if (PERF_METRICS && name in PERF_METRICS.buildDomTreeBreakdown.domOperations) {
+      PERF_METRICS.buildDomTreeBreakdown.domOperations[name] += duration;
+      PERF_METRICS.buildDomTreeBreakdown.domOperationCounts[name]++;
+    }
+    return result;
+  }
+  // Add caching mechanisms at the top level
+  // WeakMaps keyed by element: entries do not keep elements alive.
+  const DOM_CACHE = {
+    boundingRects: new WeakMap(), // element -> result of getBoundingClientRect()
+    computedStyles: new WeakMap(), // element -> result of window.getComputedStyle()
+    // Drop all cached values by replacing both maps
+    clearCache: () => {
+      DOM_CACHE.boundingRects = new WeakMap();
+      DOM_CACHE.computedStyles = new WeakMap();
+    }
+  };
+  // Cache helper functions
+  function getCachedBoundingRect(element) {
+    if (!element) return null;
+    if (DOM_CACHE.boundingRects.has(element)) {
+      if (debugMode && PERF_METRICS) {
+        PERF_METRICS.cacheMetrics.boundingRectCacheHits++;
+      }
+      return DOM_CACHE.boundingRects.get(element);
+    }
+    if (debugMode && PERF_METRICS) {
+      PERF_METRICS.cacheMetrics.boundingRectCacheMisses++;
+    }
+    let rect;
+    if (debugMode) {
+      const start = performance.now();
+      rect = element.getBoundingClientRect();
+      const duration = performance.now() - start;
+      if (PERF_METRICS) {
+        PERF_METRICS.buildDomTreeBreakdown.domOperations.getBoundingClientRect += duration;
+        PERF_METRICS.buildDomTreeBreakdown.domOperationCounts.getBoundingClientRect++;
+      }
+    } else {
+      rect = element.getBoundingClientRect();
+    }
+    if (rect) {
+      DOM_CACHE.boundingRects.set(element, rect);
+    }
+    return rect;
+  }
+  function getCachedComputedStyle(element) {
+    if (!element) return null;
+    if (DOM_CACHE.computedStyles.has(element)) {
+      if (debugMode && PERF_METRICS) {
+        PERF_METRICS.cacheMetrics.computedStyleCacheHits++;
+      }
+      return DOM_CACHE.computedStyles.get(element);
+    }
+    if (debugMode && PERF_METRICS) {
+      PERF_METRICS.cacheMetrics.computedStyleCacheMisses++;
+    }
+    let style;
+    if (debugMode) {
+      const start = performance.now();
+      style = window.getComputedStyle(element);
+      const duration = performance.now() - start;
+      if (PERF_METRICS) {
+        PERF_METRICS.buildDomTreeBreakdown.domOperations.getComputedStyle += duration;
+        PERF_METRICS.buildDomTreeBreakdown.domOperationCounts.getComputedStyle++;
+      }
+    } else {
+      style = window.getComputedStyle(element);
+    }
+    if (style) {
+      DOM_CACHE.computedStyles.set(element, style);
+    }
+    return style;
+  }
+  /**
+   * Hash map of DOM nodes indexed by their highlight index.
+   *
+   * @type {Object<string, any>}
+   */
+  const DOM_HASH_MAP = {};
+  // Mutable counter object; presumably used to hand out node ids later in this function - TODO confirm
+  const ID = { current: 0 };
+  // DOM id of the overlay container <div> that highlightElement creates and reuses
+  const HIGHLIGHT_CONTAINER_ID = "playwright-highlight-container";
+  /**
+   * Highlights an element in the DOM and returns the index of the next element.
+   */
+  function highlightElement(element, index, parentIframe = null) {
+    if (!element) return index;
+    try {
+      // Create or get highlight container
+      let container = document.getElementById(HIGHLIGHT_CONTAINER_ID);
+      if (!container) {
+        container = document.createElement("div");
+        container.id = HIGHLIGHT_CONTAINER_ID;
+        container.style.position = "fixed";
+        container.style.pointerEvents = "none";
+        container.style.top = "0";
+        container.style.left = "0";
+        container.style.width = "100%";
+        container.style.height = "100%";
+        container.style.zIndex = "2147483647";
+        document.body.appendChild(container);
+      }
+      // Get element position
+      const rect = measureDomOperation(
+        () => element.getBoundingClientRect(),
+        'getBoundingClientRect'
+      );
+      if (!rect) return index;
+      // Generate a color based on the index
+      const colors = [
+        "#FF0000",
+        "#00FF00",
+        "#0000FF",
+        "#FFA500",
+        "#800080",
+        "#008080",
+        "#FF69B4",
+        "#4B0082",
+        "#FF4500",
+        "#2E8B57",
+        "#DC143C",
+        "#4682B4",
+      ];
+      const colorIndex = index % colors.length;
+      const baseColor = colors[colorIndex];
+      const backgroundColor = baseColor + "1A"; // 10% opacity version of the color
+      // Create highlight overlay
+      const overlay = document.createElement("div");
+      overlay.style.position = "fixed";
+      overlay.style.border = `2px solid ${baseColor}`;
+      overlay.style.backgroundColor = backgroundColor;
+      overlay.style.pointerEvents = "none";
+      overlay.style.boxSizing = "border-box";
+      // Get element position
+      let iframeOffset = { x: 0, y: 0 };
+      // If element is in an iframe, calculate iframe offset
+      if (parentIframe) {
+        const iframeRect = parentIframe.getBoundingClientRect();
+        iframeOffset.x = iframeRect.left;
+        iframeOffset.y = iframeRect.top;
+      }
+      // Calculate position
+      const top = rect.top + iframeOffset.y;
+      const left = rect.left + iframeOffset.x;
+      overlay.style.top = `${top}px`;
+      overlay.style.left = `${left}px`;
+      overlay.style.width = `${rect.width}px`;
+      overlay.style.height = `${rect.height}px`;
+      // Create and position label
+      const label = document.createElement("div");
+      label.className = "playwright-highlight-label";
+      label.style.position = "fixed";
+      label.style.background = baseColor;
+      label.style.color = "white";
+      label.style.padding = "1px 4px";
+      label.style.borderRadius = "4px";
+      label.style.fontSize = `${Math.min(12, Math.max(8, rect.height / 2))}px`;
+      label.textContent = index;
+      const labelWidth = 20;
+      const labelHeight = 16;
+      let labelTop = top + 2;
+      let labelLeft = left + rect.width - labelWidth - 2;
+      if (rect.width < labelWidth + 4 || rect.height < labelHeight + 4) {
+        labelTop = top - labelHeight - 2;
+        labelLeft = left + rect.width - labelWidth;
+      }
+      label.style.top = `${labelTop}px`;
+      label.style.left = `${labelLeft}px`;
+      // Add to container
+      container.appendChild(overlay);
+      container.appendChild(label);
+      // Update positions on scroll
+      const updatePositions = () => {
+        const newRect = element.getBoundingClientRect();
+        let newIframeOffset = { x: 0, y: 0 };
+        if (parentIframe) {
+          const iframeRect = parentIframe.getBoundingClientRect();
+          newIframeOffset.x = iframeRect.left;
+          newIframeOffset.y = iframeRect.top;
+        }
+        const newTop = newRect.top + newIframeOffset.y;
+        const newLeft = newRect.left + newIframeOffset.x;
+        overlay.style.top = `${newTop}px`;
+        overlay.style.left = `${newLeft}px`;
+        overlay.style.width = `${newRect.width}px`;
+        overlay.style.height = `${newRect.height}px`;
+        let newLabelTop = newTop + 2;
+        let newLabelLeft = newLeft + newRect.width - labelWidth - 2;
+        if (newRect.width < labelWidth + 4 || newRect.height < labelHeight + 4) {
+          newLabelTop = newTop - labelHeight - 2;
+          newLabelLeft = newLeft + newRect.width - labelWidth;
+        }
+        label.style.top = `${newLabelTop}px`;
+        label.style.left = `${newLabelLeft}px`;
+      };
+      window.addEventListener('scroll', updatePositions);
+      window.addEventListener('resize', updatePositions);
+      return index + 1;
+    } finally {
+      popTiming('highlighting');
+    }
+  }
+  /**
+   * Returns an XPath tree string for an element.
+   */
+  function getXPathTree(element, stopAtBoundary = true) {
+    const segments = [];
+    let currentElement = element;
+    while (currentElement && currentElement.nodeType === Node.ELEMENT_NODE) {
+      // Stop if we hit a shadow root or iframe
+      if (
+        stopAtBoundary &&
+        (currentElement.parentNode instanceof ShadowRoot ||
+          currentElement.parentNode instanceof HTMLIFrameElement)
+      ) {
+        break;
+      }
+      let index = 0;
+      let sibling = currentElement.previousSibling;
+      while (sibling) {
+        if (
+          sibling.nodeType === Node.ELEMENT_NODE &&
+          sibling.nodeName === currentElement.nodeName
+        ) {
+          index++;
+        }
+        sibling = sibling.previousSibling;
+      }
+      const tagName = currentElement.nodeName.toLowerCase();
+      const xpathIndex = index > 0 ? `[${index + 1}]` : "";
+      segments.unshift(`${tagName}${xpathIndex}`);
+      currentElement = currentElement.parentNode;
+    }
+    return segments.join("/");
+  }
+  /**
+   * Checks if a text node is visible.
+   */
+  function isTextNodeVisible(textNode) {
+    try {
+      const range = document.createRange();
+      range.selectNodeContents(textNode);
+      const rect = range.getBoundingClientRect();
+      // Simple size check
+      if (rect.width === 0 || rect.height === 0) {
+        return false;
+      }
+      // Simple viewport check without scroll calculations
+      const isInViewport = !(
+        rect.bottom < -viewportExpansion ||
+        rect.top > window.innerHeight + viewportExpansion ||
+        rect.right < -viewportExpansion ||
+        rect.left > window.innerWidth + viewportExpansion
+      );
+      // Check parent visibility
+      const parentElement = textNode.parentElement;
+      if (!parentElement) return false;
+      try {
+        return isInViewport && parentElement.checkVisibility({
+          checkOpacity: true,
+          checkVisibilityCSS: true,
+        });
+      } catch (e) {
+        // Fallback if checkVisibility is not supported
+        const style = window.getComputedStyle(parentElement);
+        return isInViewport &&
+          style.display !== 'none' &&
+          style.visibility !== 'hidden' &&
+          style.opacity !== '0';
+      }
+    } catch (e) {
+      console.warn('Error checking text node visibility:', e);
+      return false;
+    }
+  }
+  // Helper function to check if element is accepted
+  function isElementAccepted(element) {
+    if (!element || !element.tagName) return false;
+    // Always accept body and common container elements
+    const alwaysAccept = new Set([
+      "body", "div", "main", "article", "section", "nav", "header", "footer"
+    ]);
+    const tagName = element.tagName.toLowerCase();
+    if (alwaysAccept.has(tagName)) return true;
+    const leafElementDenyList = new Set([
+      "svg",
+      "script",
+      "style",
+      "link",
+      "meta",
+      "noscript",
+      "template",
+    ]);
+    return !leafElementDenyList.has(tagName);
+  }
+  /**
+   * Checks if an element is visible.
+   */
+  function isElementVisible(element) {
+    const style = getCachedComputedStyle(element);
+    return (
+      element.offsetWidth > 0 &&
+      element.offsetHeight > 0 &&
+      style.visibility !== "hidden" &&
+      style.display !== "none"
+    );
+  }
+  /**
+   * Checks if an element is interactive.
+   */
+  function isInteractiveElement(element) {
+    if (!element || element.nodeType !== Node.ELEMENT_NODE) {
+      return false;
+    }
+    // Special handling for cookie banner elements
+    const isCookieBannerElement =
+      (typeof element.closest === 'function') && (
+        element.closest('[id*="onetrust"]') ||
+        element.closest('[class*="onetrust"]') ||
+        element.closest('[data-nosnippet="true"]') ||
+        element.closest('[aria-label*="cookie"]')
+      );
+    if (isCookieBannerElement) {
+      // Check if it's a button or interactive element within the banner
+      if (
+        element.tagName.toLowerCase() === 'button' ||
+        element.getAttribute('role') === 'button' ||
+        element.onclick ||
+        element.getAttribute('onclick') ||
+        (element.classList && (
+          element.classList.contains('ot-sdk-button') ||
+          element.classList.contains('accept-button') ||
+          element.classList.contains('reject-button')
+        )) ||
+        element.getAttribute('aria-label')?.toLowerCase().includes('accept') ||
+        element.getAttribute('aria-label')?.toLowerCase().includes('reject')
+      ) {
+        return true;
+      }
+    }
+    // Base interactive elements and roles
+    const interactiveElements = new Set([
+      "a", "button", "details", "embed", "input", "menu", "menuitem",
+      "object", "select", "textarea", "canvas", "summary", "dialog",
+      "banner"
+    ]);
+    const interactiveRoles = new Set(['button-icon', 'dialog', 'button-text-icon-only', 'treeitem', 'alert', 'grid', 'progressbar', 'radio', 'checkbox', 'menuitem', 'option', 'switch', 'dropdown', 'scrollbar', 'combobox', 'a-button-text', 'button', 'region', 'textbox', 'tabpanel', 'tab', 'click', 'button-text', 'spinbutton', 'a-button-inner', 'link', 'menu', 'slider', 'listbox', 'a-dropdown-button', 'button-icon-only', 'searchbox', 'menuitemradio', 'tooltip', 'tree', 'menuitemcheckbox']);
+    const tagName = element.tagName.toLowerCase();
+    const role = element.getAttribute("role");
+    const ariaRole = element.getAttribute("aria-role");
+    const tabIndex = element.getAttribute("tabindex");
+    // Add check for specific class
+    const hasAddressInputClass = element.classList && (
+      element.classList.contains("address-input__container__input") ||
+      element.classList.contains("nav-btn") ||
+      element.classList.contains("pull-left")
+    );
+    // Added enhancement to capture dropdown interactive elements
+    if (element.classList && (
+      element.classList.contains('dropdown-toggle') ||
+      element.getAttribute('data-toggle') === 'dropdown' ||
+      element.getAttribute('aria-haspopup') === 'true'
+    )) {
+      return true;
+    }
+    // Basic role/attribute checks
+    const hasInteractiveRole =
+      hasAddressInputClass ||
+      interactiveElements.has(tagName) ||
+      interactiveRoles.has(role) ||
+      interactiveRoles.has(ariaRole) ||
+      (tabIndex !== null &&
+        tabIndex !== "-1" &&
+        element.parentElement?.tagName.toLowerCase() !== "body") ||
+      element.getAttribute("data-action") === "a-dropdown-select" ||
+      element.getAttribute("data-action") === "a-dropdown-button";
+    if (hasInteractiveRole) return true;
+    // Additional checks for cookie banners and consent UI
+    const isCookieBanner =
+      element.id?.toLowerCase().includes('cookie') ||
+      element.id?.toLowerCase().includes('consent') ||
+      element.id?.toLowerCase().includes('notice') ||
+      (element.classList && (
+        element.classList.contains('otCenterRounded') ||
+        element.classList.contains('ot-sdk-container')
+      )) ||
+      element.getAttribute('data-nosnippet') === 'true' ||
+      element.getAttribute('aria-label')?.toLowerCase().includes('cookie') ||
+      element.getAttribute('aria-label')?.toLowerCase().includes('consent') ||
+      (element.tagName.toLowerCase() === 'div' && (
+        element.id?.includes('onetrust') ||
+        (element.classList && (
+          element.classList.contains('onetrust') ||
+          element.classList.contains('cookie') ||
+          element.classList.contains('consent')
+        ))
+      ));
+    if (isCookieBanner) return true;
+    // Additional check for buttons in cookie banners
+    const isInCookieBanner = typeof element.closest === 'function' && element.closest(
+      '[id*="cookie"],[id*="consent"],[class*="cookie"],[class*="consent"],[id*="onetrust"]'
+    );
+    if (isInCookieBanner && (
+      element.tagName.toLowerCase() === 'button' ||
+      element.getAttribute('role') === 'button' ||
+      (element.classList && element.classList.contains('button')) ||
+      element.onclick ||
+      element.getAttribute('onclick')
+    )) {
+      return true;
+    }
+    // Get computed style
+    const style = window.getComputedStyle(element);
+    // Check for event listeners
+    const hasClickHandler =
+      element.onclick !== null ||
+      element.getAttribute("onclick") !== null ||
+      element.hasAttribute("ng-click") ||
+      element.hasAttribute("@click") ||
+      element.hasAttribute("v-on:click");
+    // Helper function to safely get event listeners
+    function getEventListeners(el) {
+      try {
+        return window.getEventListeners?.(el) || {};
+      } catch (e) {
+        const listeners = {};
+        const eventTypes = [
+          "click",
+          "mousedown",
+          "mouseup",
+          "touchstart",
+          "touchend",
+          "keydown",
+          "keyup",
+          "focus",
+          "blur",
+        ];
+        for (const type of eventTypes) {
+          const handler = el[`on${type}`];
+          if (handler) {
+            listeners[type] = [{ listener: handler, useCapture: false }];
+          }
+        }
+        return listeners;
+      }
+    }
+    // Check for click-related events
+    const listeners = getEventListeners(element);
+    const hasClickListeners =
+      listeners &&
+      (listeners.click?.length > 0 ||
+        listeners.mousedown?.length > 0 ||
+        listeners.mouseup?.length > 0 ||
+        listeners.touchstart?.length > 0 ||
+        listeners.touchend?.length > 0);
+    // Check for ARIA properties
+    const hasAriaProps =
+      element.hasAttribute("aria-expanded") ||
+      element.hasAttribute("aria-pressed") ||
+      element.hasAttribute("aria-selected") ||
+      element.hasAttribute("aria-checked");
+    const isContentEditable = element.getAttribute("contenteditable") === "true" ||
+      element.isContentEditable ||
+      element.id === "tinymce" ||
+      element.classList.contains("mce-content-body") ||
+      (element.tagName.toLowerCase() === "body" && element.getAttribute("data-id")?.startsWith("mce_"));
+    // Check if element is draggable
+    const isDraggable =
+      element.draggable || element.getAttribute("draggable") === "true";
+    return (
+      hasAriaProps ||
+      hasClickHandler ||
+      hasClickListeners ||
+      isDraggable ||
+      isContentEditable
+    );
+  }
+  /**
+   * Checks if an element is the topmost element at its position.
+   */
+  function isTopElement(element) {
+    const rect = getCachedBoundingRect(element);
+    // If element is not in viewport, consider it top
+    const isInViewport = (
+      rect.left < window.innerWidth &&
+      rect.right > 0 &&
+      rect.top < window.innerHeight &&
+      rect.bottom > 0
+    );
+    if (!isInViewport) {
+      return true;
+    }
+    // Find the correct document context and root element
+    let doc = element.ownerDocument;
+    // If we're in an iframe, elements are considered top by default
+    if (doc !== window.document) {
+      return true;
+    }
+    // For shadow DOM, we need to check within its own root context
+    const shadowRoot = element.getRootNode();
+    if (shadowRoot instanceof ShadowRoot) {
+      const centerX = rect.left + rect.width / 2;
+      const centerY = rect.top + rect.height / 2;
+      try {
+        const topEl = measureDomOperation(
+          () => shadowRoot.elementFromPoint(centerX, centerY),
+          'elementFromPoint'
+        );
+        if (!topEl) return false;
+        let current = topEl;
+        while (current && current !== shadowRoot) {
+          if (current === element) return true;
+          current = current.parentElement;
+        }
+        return false;
+      } catch (e) {
+        return true;
+      }
+    }
+    // For elements in viewport, check if they're topmost
+    const centerX = rect.left + rect.width / 2;
+    const centerY = rect.top + rect.height / 2;
+    try {
+      const topEl = document.elementFromPoint(centerX, centerY);
+      if (!topEl) return false;
+      let current = topEl;
+      while (current && current !== document.documentElement) {
+        if (current === element) return true;
+        current = current.parentElement;
+      }
+      return false;
+    } catch (e) {
+      return true;
+    }
+  }
+  /**
+   * Checks if an element is within the expanded viewport.
+   */
+  function isInExpandedViewport(element, viewportExpansion) {
+    if (viewportExpansion === -1) {
+      return true;
+    }
+    const rect = getCachedBoundingRect(element);
+    // Simple viewport check without scroll calculations
+    return !(
+      rect.bottom < -viewportExpansion ||
+      rect.top > window.innerHeight + viewportExpansion ||
+      rect.right < -viewportExpansion ||
+      rect.left > window.innerWidth + viewportExpansion
+    );
+  }
+  // Add this new helper function
+  function getEffectiveScroll(element) {
+    let currentEl = element;
+    let scrollX = 0;
+    let scrollY = 0;
+    return measureDomOperation(() => {
+      while (currentEl && currentEl !== document.documentElement) {
+        if (currentEl.scrollLeft || currentEl.scrollTop) {
+          scrollX += currentEl.scrollLeft;
+          scrollY += currentEl.scrollTop;
+        }
+        currentEl = currentEl.parentElement;
+      }
+      scrollX += window.scrollX;
+      scrollY += window.scrollY;
+      return { scrollX, scrollY };
+    }, 'scrollOperations');
+  }
+  // Add these helper functions at the top level
+  function isInteractiveCandidate(element) {
+    if (!element || element.nodeType !== Node.ELEMENT_NODE) return false;
+    const tagName = element.tagName.toLowerCase();
+    // Fast-path for common interactive elements
+    const interactiveElements = new Set([
+      "a", "button", "input", "select", "textarea", "details", "summary"
+    ]);
+    if (interactiveElements.has(tagName)) return true;
+    // Quick attribute checks without getting full lists
+    const hasQuickInteractiveAttr = element.hasAttribute("onclick") ||
+      element.hasAttribute("role") ||
+      element.hasAttribute("tabindex") ||
+      element.hasAttribute("aria-") ||
+      element.hasAttribute("data-action");
+    return hasQuickInteractiveAttr;
+  }
+  function quickVisibilityCheck(element) {
+    // Fast initial check before expensive getComputedStyle
+    return element.offsetWidth > 0 &&
+      element.offsetHeight > 0 &&
+      !element.hasAttribute("hidden") &&
+      element.style.display !== "none" &&
+      element.style.visibility !== "hidden";
+  }
+  /**
+   * Creates a node data object for a given node and its descendants.
+   *
+   * Each kept node is stored in DOM_HASH_MAP under a freshly allocated string
+   * id (taken from ID.current) and that id is returned; children are recorded
+   * in their parent's `children` array as ids. Because a node's id is allocated
+   * only after its children have been processed, children always receive
+   * smaller ids than their parent.
+   *
+   * @param {Node} node - DOM node to process.
+   * @param {HTMLIFrameElement|null} [parentIframe=null] - The iframe element
+   *   whose document contains `node`, if any; forwarded to highlightElement.
+   * @returns {string|null} The id of the stored node data, or null when the
+   *   node is skipped.
+   */
+  function buildDomTree(node, parentIframe = null) {
+    if (debugMode) PERF_METRICS.nodeMetrics.totalNodes++;
+    // Skip missing nodes and the highlight overlay container itself.
+    if (!node || node.id === HIGHLIGHT_CONTAINER_ID) {
+      if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++;
+      return null;
+    }
+    // Special handling for root node (body): no visibility, viewport or
+    // interactivity checks are applied and no attributes are collected.
+    if (node === document.body) {
+      const nodeData = {
+        tagName: 'body',
+        attributes: {},
+        xpath: '/body',
+        children: [],
+      };
+      // Process children of body
+      for (const child of node.childNodes) {
+        const domElement = buildDomTree(child, parentIframe);
+        if (domElement) nodeData.children.push(domElement);
+      }
+      const id = `${ID.current++}`;
+      DOM_HASH_MAP[id] = nodeData;
+      if (debugMode) PERF_METRICS.nodeMetrics.processedNodes++;
+      return id;
+    }
+    // Early bailout for non-element nodes except text
+    if (node.nodeType !== Node.ELEMENT_NODE && node.nodeType !== Node.TEXT_NODE) {
+      if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++;
+      return null;
+    }
+    // Process text nodes
+    if (node.nodeType === Node.TEXT_NODE) {
+      const textContent = node.textContent.trim();
+      // Whitespace-only text is dropped.
+      if (!textContent) {
+        if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++;
+        return null;
+      }
+      // Drop text without a parent element and text that is script source.
+      const parentElement = node.parentElement;
+      if (!parentElement || parentElement.tagName.toLowerCase() === 'script') {
+        if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++;
+        return null;
+      }
+      // Invisible text is still stored; visibility is only recorded as a flag.
+      const id = `${ID.current++}`;
+      DOM_HASH_MAP[id] = {
+        type: "TEXT_NODE",
+        text: textContent,
+        isVisible: isTextNodeVisible(node),
+      };
+      if (debugMode) PERF_METRICS.nodeMetrics.processedNodes++;
+      return id;
+    }
+    // Quick checks for element nodes
+    if (node.nodeType === Node.ELEMENT_NODE && !isElementAccepted(node)) {
+      if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++;
+      return null;
+    }
+    // Early viewport check - only filter out elements clearly outside viewport
+    // (viewportExpansion === -1 disables this filter entirely)
+    if (viewportExpansion !== -1) {
+      const rect = getCachedBoundingRect(node);
+      const style = getCachedComputedStyle(node);
+      // Skip viewport check for fixed/sticky elements as they may appear anywhere
+      const isFixedOrSticky = style && (style.position === 'fixed' || style.position === 'sticky');
+      // Check if element has actual dimensions
+      const hasSize = node.offsetWidth > 0 || node.offsetHeight > 0;
+      // An element (and its whole subtree) is dropped when no rect is
+      // available, or when it is not fixed/sticky, has no size, and its rect
+      // lies entirely outside the expanded viewport.
+      if (!rect || (!isFixedOrSticky && !hasSize && (
+        rect.bottom < -viewportExpansion ||
+        rect.top > window.innerHeight + viewportExpansion ||
+        rect.right < -viewportExpansion ||
+        rect.left > window.innerWidth + viewportExpansion
+      ))) {
+        if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++;
+        return null;
+      }
+    }
+    // Process element node
+    const nodeData = {
+      tagName: node.tagName.toLowerCase(),
+      attributes: {},
+      xpath: getXPathTree(node, true),
+      children: [],
+    };
+    // Get attributes for interactive elements or potential text containers
+    // (all other elements keep an empty attributes object)
+    if (isInteractiveCandidate(node) || node.tagName.toLowerCase() === 'iframe' || node.tagName.toLowerCase() === 'body') {
+      const attributeNames = node.getAttributeNames?.() || [];
+      for (const name of attributeNames) {
+        nodeData.attributes[name] = node.getAttribute(name);
+      }
+    }
+    // Check interactivity. The checks are nested so each later one only runs
+    // (and its flag is only set) when the previous one passed:
+    // visible -> topmost -> interactive.
+    if (node.nodeType === Node.ELEMENT_NODE) {
+      nodeData.isVisible = isElementVisible(node);
+      if (nodeData.isVisible) {
+        nodeData.isTopElement = isTopElement(node);
+        if (nodeData.isTopElement) {
+          nodeData.isInteractive = isInteractiveElement(node);
+          if (nodeData.isInteractive) {
+            nodeData.isInViewport = true;
+            // Every interactive element consumes a highlight index, whether
+            // or not an overlay is drawn for it.
+            nodeData.highlightIndex = highlightIndex++;
+            if (doHighlightElements) {
+              // A non-negative focusHighlightIndex restricts drawing to that
+              // single element; otherwise every interactive element is drawn.
+              if (focusHighlightIndex >= 0) {
+                if (focusHighlightIndex === nodeData.highlightIndex) {
+                  highlightElement(node, nodeData.highlightIndex, parentIframe);
+                }
+              } else {
+                highlightElement(node, nodeData.highlightIndex, parentIframe);
+              }
+            }
+          }
+        }
+      }
+    }
+    // Process children, with special handling for iframes and rich text editors
+    if (node.tagName) {
+      const tagName = node.tagName.toLowerCase();
+      // Handle iframes: descend into the iframe's own document and pass this
+      // iframe element down as parentIframe.
+      if (tagName === "iframe") {
+        try {
+          const iframeDoc = node.contentDocument || node.contentWindow?.document;
+          if (iframeDoc) {
+            for (const child of iframeDoc.childNodes) {
+              const domElement = buildDomTree(child, node);
+              if (domElement) nodeData.children.push(domElement);
+            }
+          }
+        } catch (e) {
+          // Accessing the document can throw (presumably for cross-origin
+          // frames); the iframe is then kept without children.
+          console.warn("Unable to access iframe:", e);
+        }
+      }
+      // Handle rich text editors and contenteditable elements
+      else if (
+        node.isContentEditable ||
+        node.getAttribute("contenteditable") === "true" ||
+        node.id === "tinymce" ||
+        node.classList.contains("mce-content-body") ||
+        (tagName === "body" && node.getAttribute("data-id")?.startsWith("mce_"))
+      ) {
+        // Process all child nodes to capture formatted text
+        for (const child of node.childNodes) {
+          const domElement = buildDomTree(child, parentIframe);
+          if (domElement) nodeData.children.push(domElement);
+        }
+      }
+      // Handle shadow DOM: only the shadow root's children are traversed here,
+      // not the host's regular child nodes.
+      else if (node.shadowRoot) {
+        nodeData.shadowRoot = true;
+        for (const child of node.shadowRoot.childNodes) {
+          const domElement = buildDomTree(child, parentIframe);
+          if (domElement) nodeData.children.push(domElement);
+        }
+      }
+      // Handle regular elements
+      else {
+        for (const child of node.childNodes) {
+          const domElement = buildDomTree(child, parentIframe);
+          if (domElement) nodeData.children.push(domElement);
+        }
+      }
+    }
+    // Skip empty anchor tags (no kept children and no href attribute)
+    if (nodeData.tagName === 'a' && nodeData.children.length === 0 && !nodeData.attributes.href) {
+      if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++;
+      return null;
+    }
+    const id = `${ID.current++}`;
+    DOM_HASH_MAP[id] = nodeData;
+    if (debugMode) PERF_METRICS.nodeMetrics.processedNodes++;
+    return id;
+  }
+  // After all functions are defined, wrap them with performance measurement
+  // Remove buildDomTree from here as we measure it separately
+  highlightElement = measureTime(highlightElement);
+  isInteractiveElement = measureTime(isInteractiveElement);
+  isElementVisible = measureTime(isElementVisible);
+  isTopElement = measureTime(isTopElement);
+  isInExpandedViewport = measureTime(isInExpandedViewport);
+  isTextNodeVisible = measureTime(isTextNodeVisible);
+  getEffectiveScroll = measureTime(getEffectiveScroll);
+  const rootId = buildDomTree(document.body);
+  // Clear the cache now that the tree has been built
+  DOM_CACHE.clearCache();
+  // Only process metrics in debug mode
+  if (debugMode && PERF_METRICS) {
+    // Convert timings to seconds and add useful derived metrics
+    Object.keys(PERF_METRICS.timings).forEach(key => {
+      PERF_METRICS.timings[key] = PERF_METRICS.timings[key] / 1000;
+    });
+    Object.keys(PERF_METRICS.buildDomTreeBreakdown).forEach(key => {
+      if (typeof PERF_METRICS.buildDomTreeBreakdown[key] === 'number') {
+        PERF_METRICS.buildDomTreeBreakdown[key] = PERF_METRICS.buildDomTreeBreakdown[key] / 1000;
+      }
+    });
+    // Add some useful derived metrics
+    if (PERF_METRICS.buildDomTreeBreakdown.buildDomTreeCalls > 0) {
+      PERF_METRICS.buildDomTreeBreakdown.averageTimePerNode =
+        PERF_METRICS.buildDomTreeBreakdown.totalTime / PERF_METRICS.buildDomTreeBreakdown.buildDomTreeCalls;
+    }
+    PERF_METRICS.buildDomTreeBreakdown.timeInChildCalls =
+      PERF_METRICS.buildDomTreeBreakdown.totalTime - PERF_METRICS.buildDomTreeBreakdown.totalSelfTime;
+    // Add average time per operation to the metrics
+    Object.keys(PERF_METRICS.buildDomTreeBreakdown.domOperations).forEach(op => {
+      const time = PERF_METRICS.buildDomTreeBreakdown.domOperations[op];
+      const count = PERF_METRICS.buildDomTreeBreakdown.domOperationCounts[op];
+      if (count > 0) {
+        PERF_METRICS.buildDomTreeBreakdown.domOperations[`${op}Average`] = time / count;
+      }
+    });
+    // Calculate cache hit rates
+    const boundingRectTotal = PERF_METRICS.cacheMetrics.boundingRectCacheHits + PERF_METRICS.cacheMetrics.boundingRectCacheMisses;
+    const computedStyleTotal = PERF_METRICS.cacheMetrics.computedStyleCacheHits + PERF_METRICS.cacheMetrics.computedStyleCacheMisses;
+    if (boundingRectTotal > 0) {
+      PERF_METRICS.cacheMetrics.boundingRectHitRate = PERF_METRICS.cacheMetrics.boundingRectCacheHits / boundingRectTotal;
+    }
+    if (computedStyleTotal > 0) {
+      PERF_METRICS.cacheMetrics.computedStyleHitRate = PERF_METRICS.cacheMetrics.computedStyleCacheHits / computedStyleTotal;
+    }
+    if ((boundingRectTotal + computedStyleTotal) > 0) {
+      PERF_METRICS.cacheMetrics.overallHitRate =
+        (PERF_METRICS.cacheMetrics.boundingRectCacheHits + PERF_METRICS.cacheMetrics.computedStyleCacheHits) /
+        (boundingRectTotal + computedStyleTotal);
+    }
+  }
+  return debugMode ?
+    { rootId, map: DOM_HASH_MAP, perfMetrics: PERF_METRICS } :
+    { rootId, map: DOM_HASH_MAP };
+};

browser_use/dom/history_tree_processor/service.py ADDED Viewed

	@@ -0,0 +1,107 @@

+import hashlib
+from typing import Optional
+from browser_use.dom.history_tree_processor.view import DOMHistoryElement, HashedDomElement
+from browser_use.dom.views import DOMElementNode
+class HistoryTreeProcessor:
+	""" "
+	Operations on the DOM elements
+	@dev be careful - text nodes can change even if elements stay the same
+	"""
+	@staticmethod
+	def convert_dom_element_to_history_element(dom_element: DOMElementNode) -> DOMHistoryElement:
+		from browser_use.browser.context import BrowserContext
+		parent_branch_path = HistoryTreeProcessor._get_parent_branch_path(dom_element)
+		css_selector = BrowserContext._enhanced_css_selector_for_element(dom_element)
+		return DOMHistoryElement(
+			dom_element.tag_name,
+			dom_element.xpath,
+			dom_element.highlight_index,
+			parent_branch_path,
+			dom_element.attributes,
+			dom_element.shadow_root,
+			css_selector=css_selector,
+			page_coordinates=dom_element.page_coordinates,
+			viewport_coordinates=dom_element.viewport_coordinates,
+			viewport_info=dom_element.viewport_info,
+		)
+	@staticmethod
+	def find_history_element_in_tree(dom_history_element: DOMHistoryElement, tree: DOMElementNode) -> Optional[DOMElementNode]:
+		hashed_dom_history_element = HistoryTreeProcessor._hash_dom_history_element(dom_history_element)
+		def process_node(node: DOMElementNode):
+			if node.highlight_index is not None:
+				hashed_node = HistoryTreeProcessor._hash_dom_element(node)
+				if hashed_node == hashed_dom_history_element:
+					return node
+			for child in node.children:
+				if isinstance(child, DOMElementNode):
+					result = process_node(child)
+					if result is not None:
+						return result
+			return None
+		return process_node(tree)
+	@staticmethod
+	def compare_history_element_and_dom_element(dom_history_element: DOMHistoryElement, dom_element: DOMElementNode) -> bool:
+		hashed_dom_history_element = HistoryTreeProcessor._hash_dom_history_element(dom_history_element)
+		hashed_dom_element = HistoryTreeProcessor._hash_dom_element(dom_element)
+		return hashed_dom_history_element == hashed_dom_element
+	@staticmethod
+	def _hash_dom_history_element(dom_history_element: DOMHistoryElement) -> HashedDomElement:
+		branch_path_hash = HistoryTreeProcessor._parent_branch_path_hash(dom_history_element.entire_parent_branch_path)
+		attributes_hash = HistoryTreeProcessor._attributes_hash(dom_history_element.attributes)
+		xpath_hash = HistoryTreeProcessor._xpath_hash(dom_history_element.xpath)
+		return HashedDomElement(branch_path_hash, attributes_hash, xpath_hash)
+	@staticmethod
+	def _hash_dom_element(dom_element: DOMElementNode) -> HashedDomElement:
+		parent_branch_path = HistoryTreeProcessor._get_parent_branch_path(dom_element)
+		branch_path_hash = HistoryTreeProcessor._parent_branch_path_hash(parent_branch_path)
+		attributes_hash = HistoryTreeProcessor._attributes_hash(dom_element.attributes)
+		xpath_hash = HistoryTreeProcessor._xpath_hash(dom_element.xpath)
+		# text_hash = DomTreeProcessor._text_hash(dom_element)
+		return HashedDomElement(branch_path_hash, attributes_hash, xpath_hash)
+	@staticmethod
+	def _get_parent_branch_path(dom_element: DOMElementNode) -> list[str]:
+		parents: list[DOMElementNode] = []
+		current_element: DOMElementNode = dom_element
+		while current_element.parent is not None:
+			parents.append(current_element)
+			current_element = current_element.parent
+		parents.reverse()
+		return [parent.tag_name for parent in parents]
+	@staticmethod
+	def _parent_branch_path_hash(parent_branch_path: list[str]) -> str:
+		parent_branch_path_string = '/'.join(parent_branch_path)
+		return hashlib.sha256(parent_branch_path_string.encode()).hexdigest()
+	@staticmethod
+	def _attributes_hash(attributes: dict[str, str]) -> str:
+		attributes_string = ''.join(f'{key}={value}' for key, value in attributes.items())
+		return hashlib.sha256(attributes_string.encode()).hexdigest()
+	@staticmethod
+	def _xpath_hash(xpath: str) -> str:
+		return hashlib.sha256(xpath.encode()).hexdigest()
+	@staticmethod
+	def _text_hash(dom_element: DOMElementNode) -> str:
+		""" """
+		text_string = dom_element.get_all_text_till_next_clickable_element()
+		return hashlib.sha256(text_string.encode()).hexdigest()

browser_use/dom/history_tree_processor/view.py ADDED Viewed

	@@ -0,0 +1,70 @@

+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Optional
+from pydantic import BaseModel
+@dataclass
+class HashedDomElement:
+	"""
+	Hash of the dom element to be used as a unique identifier.
+
+	Two instances compare equal when all three hashes match (dataclass equality).
+	"""
+	# Hash of the element's branch path (list of tag names).
+	branch_path_hash: str
+	# Hash of the element's attributes.
+	attributes_hash: str
+	# Hash of the element's xpath.
+	xpath_hash: str
+	# text_hash: str
+class Coordinates(BaseModel):
+	"""A single 2D point with integer coordinates (presumably pixels - confirm)."""
+	x: int
+	y: int
+class CoordinateSet(BaseModel):
+	"""The four corners, centre point and size of a rectangle."""
+	top_left: Coordinates
+	top_right: Coordinates
+	bottom_left: Coordinates
+	bottom_right: Coordinates
+	center: Coordinates
+	width: int
+	height: int
+class ViewportInfo(BaseModel):
+	"""Scroll offsets and size of a viewport (presumably in pixels - confirm)."""
+	scroll_x: int
+	scroll_y: int
+	width: int
+	height: int
+@dataclass
+class DOMHistoryElement:
+	"""Snapshot of a DOM element (tag, xpath, attributes, optional geometry) that can be turned into a plain dict."""
+	tag_name: str
+	xpath: str
+	highlight_index: Optional[int]
+	entire_parent_branch_path: list[str]
+	attributes: dict[str, str]
+	shadow_root: bool = False
+	css_selector: Optional[str] = None
+	page_coordinates: Optional[CoordinateSet] = None
+	viewport_coordinates: Optional[CoordinateSet] = None
+	viewport_info: Optional[ViewportInfo] = None
+	def to_dict(self) -> dict:
+		"""Return the element as a dict; nested models are dumped with `model_dump()`, unset ones become None."""
+		def dump_or_none(model):
+			# Optional pydantic members are serialised only when they are set.
+			if model:
+				return model.model_dump()
+			return None
+		return {
+			'tag_name': self.tag_name,
+			'xpath': self.xpath,
+			'highlight_index': self.highlight_index,
+			'entire_parent_branch_path': self.entire_parent_branch_path,
+			'attributes': self.attributes,
+			'shadow_root': self.shadow_root,
+			'css_selector': self.css_selector,
+			'page_coordinates': dump_or_none(self.page_coordinates),
+			'viewport_coordinates': dump_or_none(self.viewport_coordinates),
+			'viewport_info': dump_or_none(self.viewport_info),
+		}

browser_use/dom/service.py ADDED Viewed

	@@ -0,0 +1,169 @@

+import gc
+import json
+import logging
+from dataclasses import dataclass
+from importlib import resources
+from typing import TYPE_CHECKING, Optional
+if TYPE_CHECKING:
+	from playwright.async_api import Page
+from browser_use.dom.views import (
+	DOMBaseNode,
+	DOMElementNode,
+	DOMState,
+	DOMTextNode,
+	SelectorMap,
+)
+from browser_use.utils import time_execution_async
+logger = logging.getLogger(__name__)
+@dataclass
+class ViewportInfo:
+	"""Width and height of a viewport as reported in a node's `viewport` entry (see `DomService._parse_node`)."""
+	width: int
+	height: int
+class DomService:
+	"""Builds a tree of DOM nodes for a page by evaluating buildDomTree.js inside it."""
+	def __init__(self, page: 'Page'):
+		self.page = page
+		self.xpath_cache = {}
+		# The extraction script is read from the browser_use.dom package resources.
+		self.js_code = resources.read_text('browser_use.dom', 'buildDomTree.js')
+	# region - Clickable elements
+	@time_execution_async('--get_clickable_elements')
+	async def get_clickable_elements(
+		self,
+		highlight_elements: bool = True,
+		focus_element: int = -1,
+		viewport_expansion: int = 0,
+	) -> DOMState:
+		"""Build the DOM tree and return it with the highlight-index -> element selector map."""
+		element_tree, selector_map = await self._build_dom_tree(highlight_elements, focus_element, viewport_expansion)
+		return DOMState(element_tree=element_tree, selector_map=selector_map)
+	@time_execution_async('--build_dom_tree')
+	async def _build_dom_tree(
+		self,
+		highlight_elements: bool,
+		focus_element: int,
+		viewport_expansion: int,
+	) -> tuple[DOMElementNode, SelectorMap]:
+		"""Evaluate the extraction script in the page and convert its result into DOM nodes.
+		Raises:
+			ValueError: if the page cannot evaluate a trivial JavaScript expression.
+			Exception: any error raised while evaluating the script is logged and re-raised.
+		"""
+		# Sanity check: make sure the page can run JavaScript at all.
+		if await self.page.evaluate('1+1') != 2:
+			raise ValueError('The page cannot evaluate javascript code properly')
+		# NOTE: We execute JS code in the browser to extract important DOM information.
+		#       The returned hash map contains information about the DOM tree and the
+		#       relationship between the DOM elements.
+		debug_mode = logger.getEffectiveLevel() == logging.DEBUG
+		args = {
+			'doHighlightElements': highlight_elements,
+			'focusHighlightIndex': focus_element,
+			'viewportExpansion': viewport_expansion,
+			'debugMode': debug_mode,
+		}
+		try:
+			eval_page = await self.page.evaluate(self.js_code, args)
+		except Exception as e:
+			logger.error('Error evaluating JavaScript: %s', e)
+			raise
+		# Only log performance metrics in debug mode
+		if debug_mode and 'perfMetrics' in eval_page:
+			logger.debug('DOM Tree Building Performance Metrics:\n%s', json.dumps(eval_page['perfMetrics'], indent=2))
+		return await self._construct_dom_tree(eval_page)
+	@time_execution_async('--construct_dom_tree')
+	async def _construct_dom_tree(
+		self,
+		eval_page: dict,
+	) -> tuple[DOMElementNode, SelectorMap]:
+		"""Turn the script result (`map` of id -> node data, plus `rootId`) into linked nodes.
+		Returns:
+			The root element node and a dict mapping highlight index to element node.
+		Raises:
+			ValueError: if the root id is missing from the parsed nodes or is not an element node.
+		"""
+		js_node_map = eval_page['map']
+		js_root_id = eval_page['rootId']
+		selector_map = {}
+		node_map = {}
+		for node_id, node_data in js_node_map.items():
+			node, children_ids = self._parse_node(node_data)
+			if node is None:
+				continue
+			node_map[node_id] = node
+			# Text nodes have neither a highlight index nor children to attach.
+			if not isinstance(node, DOMElementNode):
+				continue
+			if node.highlight_index is not None:
+				selector_map[node.highlight_index] = node
+			# NOTE: We know that we are building the tree bottom up
+			#       and all children are already processed.
+			for child_id in children_ids:
+				child_node = node_map.get(child_id)
+				if child_node is None:
+					continue
+				child_node.parent = node
+				node.children.append(child_node)
+		# Use .get(): a missing or unparseable root must surface as the ValueError below,
+		# not as a bare KeyError from the lookup.
+		root_node = node_map.get(str(js_root_id))
+		# Drop the large intermediate structures before returning.
+		del node_map
+		del js_node_map
+		del js_root_id
+		gc.collect()
+		if not isinstance(root_node, DOMElementNode):
+			raise ValueError('Failed to parse HTML to dictionary')
+		return root_node, selector_map
+	def _parse_node(
+		self,
+		node_data: dict,
+	) -> tuple[Optional[DOMBaseNode], list[int]]:
+		"""Convert one entry of the script's node map into a DOM node.
+		Returns:
+			`(None, [])` for empty data, `(DOMTextNode, [])` for text nodes, otherwise
+			`(DOMElementNode, children_ids)` with the child ids still unresolved.
+		"""
+		if not node_data:
+			return None, []
+		# Process text nodes immediately
+		if node_data.get('type') == 'TEXT_NODE':
+			text_node = DOMTextNode(
+				text=node_data['text'],
+				is_visible=node_data['isVisible'],
+				parent=None,
+			)
+			return text_node, []
+		# Process coordinates if they exist for element nodes
+		viewport_info = None
+		if 'viewport' in node_data:
+			viewport_info = ViewportInfo(
+				width=node_data['viewport']['width'],
+				height=node_data['viewport']['height'],
+			)
+		element_node = DOMElementNode(
+			tag_name=node_data['tagName'],
+			xpath=node_data['xpath'],
+			attributes=node_data.get('attributes', {}),
+			children=[],
+			is_visible=node_data.get('isVisible', False),
+			is_interactive=node_data.get('isInteractive', False),
+			is_top_element=node_data.get('isTopElement', False),
+			is_in_viewport=node_data.get('isInViewport', False),
+			highlight_index=node_data.get('highlightIndex'),
+			shadow_root=node_data.get('shadowRoot', False),
+			parent=None,
+			viewport_info=viewport_info,
+		)
+		# NOTE(review): the ids are looked up as keys of the node map by the caller; the
+		# `list[int]` annotation may not match their real type - confirm against buildDomTree.js.
+		children_ids = node_data.get('children', [])
+		return element_node, children_ids

browser_use/dom/tests/debug_page_structure.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import asyncio
+import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext
+async def analyze_page_structure(url: str):
+	"""Analyze and print the structure of a webpage with enhanced debugging"""
+	# Interactive helper: opens a visible browser, prints diagnostics and waits for Enter.
+	browser = Browser(
+		config=BrowserConfig(
+			headless=False,  # Set to True if you don't need to see the browser
+		)
+	)
+	context = BrowserContext(browser=browser)
+	try:
+		async with context as ctx:
+			# Navigate to the URL
+			page = await ctx.get_current_page()
+			await page.goto(url)
+			await page.wait_for_load_state('networkidle')
+			# Get viewport dimensions
+			viewport_info = await page.evaluate("""() => {
+				return {
+					viewport: {
+						width: window.innerWidth,
+						height: window.innerHeight,
+						scrollX: window.scrollX,
+						scrollY: window.scrollY
+					}
+				}
+			}""")
+			print('\nViewport Information:')
+			print(f'Width: {viewport_info["viewport"]["width"]}')
+			print(f'Height: {viewport_info["viewport"]["height"]}')
+			print(f'ScrollX: {viewport_info["viewport"]["scrollX"]}')
+			print(f'ScrollY: {viewport_info["viewport"]["scrollY"]}')
+			# Enhanced debug information for cookie consent and fixed position elements
+			debug_info = await page.evaluate("""() => {
+				function getElementInfo(element) {
+					const rect = element.getBoundingClientRect();
+					const style = window.getComputedStyle(element);
+					return {
+						tag: element.tagName.toLowerCase(),
+						id: element.id,
+						className: element.className,
+						position: style.position,
+						rect: {
+							top: rect.top,
+							right: rect.right,
+							bottom: rect.bottom,
+							left: rect.left,
+							width: rect.width,
+							height: rect.height
+						},
+						isFixed: style.position === 'fixed',
+						isSticky: style.position === 'sticky',
+						zIndex: style.zIndex,
+						visibility: style.visibility,
+						display: style.display,
+						opacity: style.opacity
+					};
+				}
+				// Find cookie-related elements
+				const cookieElements = Array.from(document.querySelectorAll('[id*="cookie"], [id*="consent"], [class*="cookie"], [class*="consent"]'));
+				const fixedElements = Array.from(document.querySelectorAll('*')).filter(el => {
+					const style = window.getComputedStyle(el);
+					return style.position === 'fixed' || style.position === 'sticky';
+				});
+				return {
+					cookieElements: cookieElements.map(getElementInfo),
+					fixedElements: fixedElements.map(getElementInfo)
+				};
+			}""")
+			# Elements whose id or class contains "cookie" or "consent"
+			print('\nCookie-related Elements:')
+			for elem in debug_info['cookieElements']:
+				print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}')
+				print(f'Position: {elem["position"]}')
+				print(f'Rect: {elem["rect"]}')
+				print(f'Z-Index: {elem["zIndex"]}')
+				print(f'Visibility: {elem["visibility"]}')
+				print(f'Display: {elem["display"]}')
+				print(f'Opacity: {elem["opacity"]}')
+			# Elements whose computed position is 'fixed' or 'sticky'
+			print('\nFixed/Sticky Position Elements:')
+			for elem in debug_info['fixedElements']:
+				print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}')
+				print(f'Position: {elem["position"]}')
+				print(f'Rect: {elem["rect"]}')
+				print(f'Z-Index: {elem["zIndex"]}')
+			print(f'\nPage Structure for {url}:\n')
+			structure = await ctx.get_page_structure()
+			print(structure)
+			# Keep the browser open until the user confirms
+			input('Press Enter to close the browser...')
+	finally:
+		# Always close the browser, even when navigation or evaluation failed
+		await browser.close()
+if __name__ == '__main__':
+	# You can modify this URL list to analyze different pages.
+	# Each URL is analyzed in its own event loop and browser session.
+	urls = [
+		'https://www.mlb.com/yankees/stats/',
+		'https://immobilienscout24.de',
+		# Listed once: the original list repeated this URL, analyzing the same page twice.
+		'https://www.zeiss.com/career/en/job-search.html?page=1',
+		'https://reddit.com',
+	]
+	for url in urls:
+		asyncio.run(analyze_page_structure(url))

browser_use/dom/tests/extraction_test.py ADDED Viewed

	@@ -0,0 +1,147 @@

+import asyncio
+import time
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContext, BrowserContextConfig
+from browser_use.dom.service import DomService
+from browser_use.utils import time_execution_sync
+async def test_process_html_file():
+	"""Interactively compare clickable-element and token counts across viewport expansions.
+	Manual script: for each website the DOM is extracted once per expansion value, the
+	counts are printed, and the script waits for Enter between steps.
+	"""
+	config = BrowserContextConfig(
+		cookies_file='cookies3.json',
+		disable_security=True,
+		wait_for_network_idle_page_load_time=2,
+	)
+	browser = Browser(
+		config=BrowserConfig(
+			# chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
+		)
+	)
+	context = BrowserContext(browser=browser, config=config)  # noqa: F821
+	websites = [
+		'https://kayak.com/flights',
+		'https://immobilienscout24.de',
+		'https://google.com',
+		'https://amazon.com',
+		'https://github.com',
+	]
+	expansions = [0, 100, 200, 300, 400, 500, 600, 1000, -1, -200]
+	async with context as context:
+		page = await context.get_current_page()
+		dom_service = DomService(page)
+		async def test_viewport(expansion: int, description: str):
+			"""Extract the DOM at one expansion and return (element count, token count)."""
+			print(f'\n{description}:')
+			dom_state = await time_execution_sync(f'get_clickable_elements ({description})')(
+				dom_service.get_clickable_elements
+			)(highlight_elements=True, viewport_expansion=expansion)
+			elements = dom_state.element_tree
+			selector_map = dom_state.selector_map
+			element_count = len(selector_map)
+			# NOTE(review): `count_string_tokens` is not imported in the visible part of this
+			# file; confirm where it is meant to come from, otherwise this raises NameError.
+			token_count = count_string_tokens(elements.clickable_elements_to_string(), model='gpt-4o')
+			print(f'Number of elements: {element_count}')
+			print(f'Token count: {token_count}')
+			return element_count, token_count
+		for website in websites:
+			print(f'\n{"=" * 50}\nTesting {website}\n{"=" * 50}')
+			await page.goto(website)
+			# Additional wait for dynamic content. asyncio.sleep keeps the event loop (and
+			# therefore the browser connection) responsive; time.sleep would block it.
+			await asyncio.sleep(2)
+			results = []
+			for i, expansion in enumerate(expansions):
+				description = (
+					f'{i + 1}. Expansion {expansion}px' if expansion >= 0 else f'{i + 1}. All elements ({expansion} expansion)'
+				)
+				count, tokens = await test_viewport(expansion, description)
+				results.append((count, tokens))
+				input('Press Enter to continue...')
+				await page.evaluate('document.getElementById("playwright-highlight-container")?.remove()')
+			# Print comparison summary, relative to the first (0px) run
+			print('\nComparison Summary:')
+			initial_count = results[0][0]
+			for expansion, (count, tokens) in zip(expansions, results):
+				# Show the real negative value instead of a hard-coded "-1".
+				description = f'Expansion {expansion}px' if expansion >= 0 else f'All elements ({expansion})'
+				print(f'{description}: {count} elements (+{count - initial_count}), {tokens} tokens')
+			input('\nPress Enter to continue to next website...')
+			# Clear highlights before next website
+			await page.evaluate('document.getElementById("playwright-highlight-container")?.remove()')
+async def test_focus_vs_all_elements():
+	"""Interactively highlight clickable elements on a list of websites.
+	Manual script: for each website the extraction is repeated every time Enter is
+	pressed; entering 'q' moves on to the next website.
+	"""
+	config = BrowserContextConfig(
+		cookies_file='cookies3.json',
+		disable_security=True,
+		wait_for_network_idle_page_load_time=2,
+	)
+	browser = Browser(
+		config=BrowserConfig(
+			# chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
+		)
+	)
+	context = BrowserContext(browser=browser, config=config)  # noqa: F821
+	websites = [
+		'https://immobilienscout24.de',
+		'https://www.zeiss.com/career/en/job-search.html?page=1',
+		'https://www.mlb.com/yankees/stats/',
+		'https://www.amazon.com/s?k=laptop&s=review-rank&crid=1RZCEJ289EUSI&qid=1740202453&sprefix=laptop%2Caps%2C166&ref=sr_st_review-rank&ds=v1%3A4EnYKXVQA7DIE41qCvRZoNB4qN92Jlztd3BPsTFXmxU',
+		'https://codepen.io/geheimschriftstift/pen/mPLvQz',
+		'https://reddit.com',
+		'https://www.google.com/search?q=google+hi&oq=google+hi&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIGCAEQRRhA0gEIMjI2NmowajSoAgCwAgE&sourceid=chrome&ie=UTF-8',
+		'https://kayak.com/flights',
+		'https://google.com',
+		'https://amazon.com',
+		'https://github.com',
+	]
+	async with context as context:
+		page = await context.get_current_page()
+		dom_service = DomService(page)
+		for website in websites:
+			await page.goto(website)
+			# Wait for dynamic content without blocking the event loop.
+			await asyncio.sleep(2)
+			while True:
+				try:
+					print(f'\n{"=" * 50}\nTesting {website}\n{"=" * 50}')
+					# First get all elements
+					print('\nGetting all elements:')
+					all_elements_state = await time_execution_sync('get_all_elements')(dom_service.get_clickable_elements)(
+						highlight_elements=True, viewport_expansion=100
+					)
+					selector_map = all_elements_state.selector_map
+					total_elements = len(selector_map)
+					print(f'Total number of elements: {total_elements}')
+					answer = input('Press Enter to clear highlights and continue...')
+					if answer == 'q':
+						break
+					await page.evaluate('document.getElementById("playwright-highlight-container")?.remove()')
+				except Exception as e:
+					# Report and move on to the next website: retrying immediately would
+					# spin forever when the failure is persistent (e.g. the page is gone).
+					print(f'Error: {e}')
+					break
+if __name__ == '__main__':
+	# Run both interactive scripts one after the other, each in its own event loop.
+	asyncio.run(test_focus_vs_all_elements())
+	asyncio.run(test_process_html_file())

browser_use/dom/tests/process_dom_test.py ADDED Viewed

	@@ -0,0 +1,40 @@

+import json
+import os
+import time
+from browser_use.browser.browser import Browser, BrowserConfig
+async def test_process_dom():
+	"""Run buildDomTree.js on a live page, print the evaluation time and dump the result to ./tmp/dom.json.
+	Manual script: opens a visible browser and waits for Enter before finishing.
+	"""
+	# Local import: this module does not import asyncio at file level.
+	import asyncio
+	browser = Browser(config=BrowserConfig(headless=False))
+	try:
+		async with await browser.new_context() as context:
+			page = await context.get_current_page()
+			await page.goto('https://kayak.com/flights')
+			# await page.goto('https://google.com/flights')
+			# await page.goto('https://immobilienscout24.de')
+			# await page.goto('https://seleniumbase.io/w3schools/iframes')
+			# Give the page time to settle without blocking the event loop.
+			await asyncio.sleep(3)
+			with open('browser_use/dom/buildDomTree.js', 'r', encoding='utf-8') as f:
+				js_code = f.read()
+			start = time.time()
+			dom_tree = await page.evaluate(js_code)
+			end = time.time()
+			# print(dom_tree)
+			print(f'Time: {end - start:.2f}s')
+			os.makedirs('./tmp', exist_ok=True)
+			with open('./tmp/dom.json', 'w', encoding='utf-8') as f:
+				json.dump(dom_tree, f, indent=1)
+			# both of these work for immobilienscout24.de
+			# await page.click('.sc-dcJsrY.ezjNCe')
+			# await page.click(
+			# 	'div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div > div > button:nth-of-type(2)'
+			# )
+			input('Press Enter to continue...')
+	finally:
+		# Release the browser even when navigation or evaluation fails.
+		await browser.close()

browser_use/dom/views.py ADDED Viewed

	@@ -0,0 +1,196 @@

+from dataclasses import dataclass
+from functools import cached_property
+from typing import TYPE_CHECKING, Dict, List, Optional
+from browser_use.dom.history_tree_processor.view import CoordinateSet, HashedDomElement, ViewportInfo
+from browser_use.utils import time_execution_sync
+# Avoid circular import issues
+if TYPE_CHECKING:
+	from .views import DOMElementNode
+@dataclass(frozen=False)
+class DOMBaseNode:
+	"""Fields shared by text and element nodes: visibility flag and a link to the parent element."""
+	is_visible: bool
+	# Use None as default and set parent later to avoid circular reference issues
+	parent: Optional['DOMElementNode']
+@dataclass(frozen=False)
+class DOMTextNode(DOMBaseNode):
+	"""Text node of the DOM tree; the helper methods inspect its chain of parent elements."""
+	text: str
+	type: str = 'TEXT_NODE'
+	def has_parent_with_highlight_index(self) -> bool:
+		"""Return True if any ancestor element carries a highlight index."""
+		ancestor = self.parent
+		# Climb until an ancestor with a highlight index is found (it is handled separately) or the chain ends.
+		while ancestor is not None and ancestor.highlight_index is None:
+			ancestor = ancestor.parent
+		return ancestor is not None
+	def is_parent_in_viewport(self) -> bool:
+		"""Return the parent's `is_in_viewport` flag, or False when there is no parent."""
+		parent = self.parent
+		return False if parent is None else parent.is_in_viewport
+	def is_parent_top_element(self) -> bool:
+		"""Return the parent's `is_top_element` flag, or False when there is no parent."""
+		parent = self.parent
+		return False if parent is None else parent.is_top_element
+@dataclass(frozen=False)
+class DOMElementNode(DOMBaseNode):
+	"""
+	xpath: the xpath of the element from the last root node (shadow root or iframe OR document if no shadow root or iframe).
+	To properly reference the element we need to recursively switch the root node until we find the element (work your way up the tree with `.parent`)
+	"""
+	tag_name: str
+	xpath: str
+	attributes: Dict[str, str]
+	children: List[DOMBaseNode]
+	is_interactive: bool = False
+	is_top_element: bool = False
+	is_in_viewport: bool = False
+	shadow_root: bool = False
+	highlight_index: Optional[int] = None
+	viewport_coordinates: Optional[CoordinateSet] = None
+	page_coordinates: Optional[CoordinateSet] = None
+	viewport_info: Optional[ViewportInfo] = None
+	def __repr__(self) -> str:
+		"""Render the opening tag with its attributes, followed by bracketed state flags."""
+		attributes_str = ''.join(f' {key}="{value}"' for key, value in self.attributes.items())
+		tag_str = f'<{self.tag_name}{attributes_str}>'
+		# Add extra info
+		extras = []
+		if self.is_interactive:
+			extras.append('interactive')
+		if self.is_top_element:
+			extras.append('top')
+		if self.shadow_root:
+			extras.append('shadow-root')
+		if self.highlight_index is not None:
+			extras.append(f'highlight:{self.highlight_index}')
+		if self.is_in_viewport:
+			extras.append('in-viewport')
+		if extras:
+			tag_str += f' [{", ".join(extras)}]'
+		return tag_str
+	@cached_property
+	def hash(self) -> HashedDomElement:
+		"""Hash of this element, computed once by `HistoryTreeProcessor._hash_dom_element`."""
+		# Imported here rather than at module level; the service module annotates with
+		# DOMElementNode, so a top-level import would presumably be circular.
+		from browser_use.dom.history_tree_processor.service import (
+			HistoryTreeProcessor,
+		)
+		return HistoryTreeProcessor._hash_dom_element(self)
+	def get_all_text_till_next_clickable_element(self, max_depth: int = -1) -> str:
+		"""Collect the text below this element, skipping subtrees rooted at other highlighted elements.
+		Args:
+			max_depth: maximum depth to descend relative to this element; -1 means unlimited.
+		Returns:
+			The collected text parts joined with newlines and stripped.
+		"""
+		text_parts = []
+		def collect_text(node: DOMBaseNode, current_depth: int) -> None:
+			if max_depth != -1 and current_depth > max_depth:
+				return
+			# Skip this branch if we hit a highlighted element (except for the current node).
+			# Identity check on purpose: `!=` would run the dataclass-generated deep
+			# comparison over parent/children links for every visited element.
+			if isinstance(node, DOMElementNode) and node is not self and node.highlight_index is not None:
+				return
+			if isinstance(node, DOMTextNode):
+				text_parts.append(node.text)
+			elif isinstance(node, DOMElementNode):
+				for child in node.children:
+					collect_text(child, current_depth + 1)
+		collect_text(self, 0)
+		return '\n'.join(text_parts).strip()
+	@time_execution_sync('--clickable_elements_to_string')
+	def clickable_elements_to_string(self, include_attributes: list[str] | None = None) -> str:
+		"""Render highlighted elements and visible free text as one line each.
+		Args:
+			include_attributes: attribute names whose values are added to an element's line;
+				None or empty means no attribute values are rendered.
+		Returns:
+			Lines of the form `[index]<tag attrs>text/>` for highlighted elements and the
+			plain text of visible text nodes that have no highlighted ancestor.
+		"""
+		formatted_text = []
+		def process_node(node: DOMBaseNode) -> None:
+			if isinstance(node, DOMElementNode):
+				# Add element with highlight_index
+				if node.highlight_index is not None:
+					attributes_str = ''
+					text = node.get_all_text_till_next_clickable_element()
+					if include_attributes:
+						# De-duplicate while keeping the attribute order, so the rendered
+						# string is stable between runs (a set would shuffle it).
+						attributes = list(
+							dict.fromkeys(
+								str(value)
+								for key, value in node.attributes.items()
+								if key in include_attributes and value != node.tag_name
+							)
+						)
+						# Do not repeat a value that is identical to the element text
+						if text in attributes:
+							attributes.remove(text)
+						attributes_str = ';'.join(attributes)
+					line = f'[{node.highlight_index}]<{node.tag_name} '
+					if attributes_str:
+						line += f'{attributes_str}'
+					if text:
+						if attributes_str:
+							line += f'>{text}'
+						else:
+							line += f'{text}'
+					line += '/>'
+					formatted_text.append(line)
+				# Process children regardless
+				for child in node.children:
+					process_node(child)
+			elif isinstance(node, DOMTextNode):
+				# Add text only if it doesn't have a highlighted parent
+				if not node.has_parent_with_highlight_index() and node.is_visible:  # and node.is_parent_top_element()
+					formatted_text.append(f'{node.text}')
+		process_node(self)
+		return '\n'.join(formatted_text)
+	def get_file_upload_element(self, check_siblings: bool = True) -> Optional['DOMElementNode']:
+		"""Find an `<input type="file">` at this element, among its descendants, or in sibling subtrees.
+		Args:
+			check_siblings: also search the subtrees of this element's siblings; the
+				recursive calls pass False so siblings are only considered once.
+		Returns:
+			The first matching element, or None when there is none.
+		"""
+		# Check if current element is a file input
+		if self.tag_name == 'input' and self.attributes.get('type') == 'file':
+			return self
+		# Check children
+		for child in self.children:
+			if isinstance(child, DOMElementNode):
+				result = child.get_file_upload_element(check_siblings=False)
+				if result:
+					return result
+		# Check siblings only for the initial call
+		if check_siblings and self.parent:
+			for sibling in self.parent.children:
+				if sibling is not self and isinstance(sibling, DOMElementNode):
+					result = sibling.get_file_upload_element(check_siblings=False)
+					if result:
+						return result
+		return None
+# Maps an element's highlight index to the element node carrying it.
+SelectorMap = dict[int, DOMElementNode]
+@dataclass
+class DOMState:
+	"""Root of the element tree together with the selector map built for it."""
+	element_tree: DOMElementNode
+	selector_map: SelectorMap

browser_use/logging_config.py ADDED Viewed

	@@ -0,0 +1,132 @@

+import logging
+import os
+import sys
+from dotenv import load_dotenv
+load_dotenv()
+def addLoggingLevel(levelName, levelNum, methodName=None):
+	"""
+	Register a new logging level on the `logging` module and on the currently
+	configured logger class.
+	`levelName` becomes a module attribute of `logging` holding `levelNum`.
+	`methodName` (default: `levelName.lower()`) becomes a convenience method on
+	both `logging` itself and the class returned by `logging.getLoggerClass()`.
+	Raises `AttributeError` when the level name or the method name is already
+	taken, so existing attributes are never overwritten.
+	Example
+	-------
+	>>> addLoggingLevel('TRACE', logging.DEBUG - 5)
+	>>> logging.getLogger(__name__).setLevel('TRACE')
+	>>> logging.getLogger(__name__).trace('that worked')
+	>>> logging.trace('so did this')
+	>>> logging.TRACE
+	5
+	"""
+	methodName = methodName or levelName.lower()
+	logger_class = logging.getLoggerClass()
+	# Refuse to overwrite anything; the checks run in the same order as the messages below.
+	name_checks = (
+		(logging, levelName, 'logging module'),
+		(logging, methodName, 'logging module'),
+		(logger_class, methodName, 'logger class'),
+	)
+	for owner, attribute, location in name_checks:
+		if hasattr(owner, attribute):
+			raise AttributeError(f'{attribute} already defined in {location}')
+	# This method was inspired by the answers to Stack Overflow post
+	# http://stackoverflow.com/q/2183233/2988730, especially
+	# http://stackoverflow.com/a/13638084/2988730
+	def logForLevel(self, message, *args, **kwargs):
+		# Bound as a logger method: emit only when the level is enabled.
+		if not self.isEnabledFor(levelNum):
+			return
+		self._log(levelNum, message, args, **kwargs)
+	def logToRoot(message, *args, **kwargs):
+		# Module-level convenience function; delegates to `logging.log`.
+		logging.log(levelNum, message, *args, **kwargs)
+	logging.addLevelName(levelNum, levelName)
+	setattr(logging, levelName, levelNum)
+	setattr(logger_class, methodName, logForLevel)
+	setattr(logging, methodName, logToRoot)
+def setup_logging():
+	"""Configure console logging for the root logger and the `browser_use` logger.
+	The level is read from the `BROWSER_USE_LOGGING_LEVEL` environment variable
+	('result', 'debug', anything else means info). Nothing is configured when the
+	root logger already has handlers. A fixed list of third-party loggers is set
+	to ERROR and detached from the root logger.
+	"""
+	# Try to add RESULT level, but ignore if it already exists
+	try:
+		addLoggingLevel('RESULT', 35)  # This allows ERROR, FATAL and CRITICAL
+	except AttributeError:
+		pass  # Level already exists, which is fine
+	log_type = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower()
+	root = logging.getLogger()
+	# Check if handlers are already set up
+	if root.hasHandlers():
+		return
+	# Clear existing handlers
+	root.handlers = []
+	class BrowserUseFormatter(logging.Formatter):
+		"""Shortens `browser_use.*` logger names to their second-to-last dotted component."""
+		def format(self, record):
+			if isinstance(record.name, str) and record.name.startswith('browser_use.'):
+				record.name = record.name.split('.')[-2]
+			return super().format(record)
+	# Setup single handler for all loggers
+	console = logging.StreamHandler(sys.stdout)
+	# additional setLevel here to filter logs
+	if log_type == 'result':
+		console.setLevel('RESULT')
+		console.setFormatter(BrowserUseFormatter('%(message)s'))
+	else:
+		console.setFormatter(BrowserUseFormatter('%(levelname)-8s [%(name)s] %(message)s'))
+	# Configure root logger only
+	root.addHandler(console)
+	# switch cases for log_type
+	if log_type == 'result':
+		root.setLevel('RESULT')  # string usage to avoid syntax error
+	elif log_type == 'debug':
+		root.setLevel(logging.DEBUG)
+	else:
+		root.setLevel(logging.INFO)
+	# Configure browser_use logger
+	browser_use_logger = logging.getLogger('browser_use')
+	browser_use_logger.propagate = False  # Don't propagate to root logger
+	browser_use_logger.addHandler(console)
+	browser_use_logger.setLevel(root.level)  # Set same level as root logger
+	browser_use_logger.info('BrowserUse logging setup complete with level %s', log_type)
+	# Silence third-party loggers
+	for logger_name in [
+		'WDM',
+		'httpx',
+		'selenium',
+		'playwright',
+		'urllib3',
+		'asyncio',
+		'langchain',
+		'openai',
+		'httpcore',
+		'charset_normalizer',
+		'anthropic._base_client',
+		'PIL.PngImagePlugin',
+		'trafilatura.htmlprocessing',
+		'trafilatura',
+	]:
+		third_party = logging.getLogger(logger_name)
+		third_party.setLevel(logging.ERROR)
+		third_party.propagate = False

browser_use/telemetry/service.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import logging
+import os
+import uuid
+from pathlib import Path
+from dotenv import load_dotenv
+from posthog import Posthog
+from browser_use.telemetry.views import BaseTelemetryEvent
+from browser_use.utils import singleton
+load_dotenv()
+logger = logging.getLogger(__name__)
+# Extra properties merged into every event sent by ProductTelemetry._direct_capture.
+POSTHOG_EVENT_SETTINGS = {
+	'process_person_profile': True,
+}
+@singleton
+class ProductTelemetry:
+	"""
+	Service for capturing anonymized telemetry data.
+	If the environment variable `ANONYMIZED_TELEMETRY=False`, anonymized telemetry will be disabled.
+	"""
+	USER_ID_PATH = str(Path.home() / '.cache' / 'browser_use' / 'telemetry_user_id')
+	PROJECT_API_KEY = 'phc_F8JMNjW1i2KbGUTaW1unnDdLSPCoyc52SGRU0JecaUh'
+	HOST = 'https://eu.i.posthog.com'
+	UNKNOWN_USER_ID = 'UNKNOWN'
+	_curr_user_id = None
+	def __init__(self) -> None:
+		"""Create the PostHog client unless telemetry is disabled through the environment."""
+		telemetry_disabled = os.getenv('ANONYMIZED_TELEMETRY', 'true').lower() == 'false'
+		self.debug_logging = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower() == 'debug'
+		if telemetry_disabled:
+			self._posthog_client = None
+		else:
+			# Use the module logger: the module-level `logging.info` logs on the root logger
+			# and implicitly calls `basicConfig()` when the root logger has no handlers.
+			logger.info(
+				'Anonymized telemetry enabled. See https://docs.browser-use.com/development/telemetry for more information.'
+			)
+			self._posthog_client = Posthog(
+				project_api_key=self.PROJECT_API_KEY,
+				host=self.HOST,
+				disable_geoip=False,
+			)
+			# Silence posthog's logging
+			if not self.debug_logging:
+				posthog_logger = logging.getLogger('posthog')
+				posthog_logger.disabled = True
+		if self._posthog_client is None:
+			logger.debug('Telemetry disabled')
+	def capture(self, event: BaseTelemetryEvent) -> None:
+		"""Send `event` to PostHog; a no-op when telemetry is disabled."""
+		if self._posthog_client is None:
+			return
+		if self.debug_logging:
+			logger.debug(f'Telemetry event: {event.name} {event.properties}')
+		self._direct_capture(event)
+	def _direct_capture(self, event: BaseTelemetryEvent) -> None:
+		"""
+		Should not be thread blocking because posthog magically handles it
+		"""
+		if self._posthog_client is None:
+			return
+		try:
+			self._posthog_client.capture(
+				self.user_id,
+				event.name,
+				{**event.properties, **POSTHOG_EVENT_SETTINGS},
+			)
+		except Exception as e:
+			# Telemetry is best-effort: log the failure instead of propagating it.
+			logger.error(f'Failed to send telemetry event {event.name}: {e}')
+	@property
+	def user_id(self) -> str:
+		"""Return the telemetry user id, creating and persisting a new UUID on first use.
+		Falls back to `UNKNOWN_USER_ID` when the id file cannot be read or written.
+		"""
+		if self._curr_user_id:
+			return self._curr_user_id
+		# File access may fail due to permissions or other reasons. We don't want to
+		# crash so we catch all exceptions.
+		try:
+			if not os.path.exists(self.USER_ID_PATH):
+				os.makedirs(os.path.dirname(self.USER_ID_PATH), exist_ok=True)
+				with open(self.USER_ID_PATH, 'w') as f:
+					new_user_id = str(uuid.uuid4())
+					f.write(new_user_id)
+				self._curr_user_id = new_user_id
+			else:
+				with open(self.USER_ID_PATH, 'r') as f:
+					self._curr_user_id = f.read()
+		except Exception:
+			# Use the class constant; the original assigned the literal string
+			# 'UNKNOWN_USER_ID', i.e. the constant's name instead of its value.
+			self._curr_user_id = self.UNKNOWN_USER_ID
+		return self._curr_user_id

browser_use/telemetry/views.py ADDED Viewed

	@@ -0,0 +1,63 @@

+from abc import ABC, abstractmethod
+from dataclasses import asdict, dataclass
+from typing import Any, Dict, Sequence
+@dataclass
+class BaseTelemetryEvent(ABC):
+	"""Base class for telemetry events; concrete events supply `name` and dataclass fields."""
+	@property
+	@abstractmethod
+	def name(self) -> str:
+		"""Event name; must be provided by every concrete event."""
+		pass
+	@property
+	def properties(self) -> Dict[str, Any]:
+		"""Return all dataclass fields as a dict, without the `name` entry."""
+		payload = asdict(self)
+		payload.pop('name', None)
+		return payload
+@dataclass
+class RegisteredFunction:
+	"""Name and parameter description of one registered function, as reported in telemetry."""
+	name: str
+	params: dict[str, Any]
+@dataclass
+class ControllerRegisteredFunctionsTelemetryEvent(BaseTelemetryEvent):
+	"""Telemetry event named 'controller_registered_functions' carrying the registered functions."""
+	registered_functions: list[RegisteredFunction]
+	name: str = 'controller_registered_functions'
+@dataclass
+class AgentStepTelemetryEvent(BaseTelemetryEvent):
+	"""Telemetry event named 'agent_step' describing a single agent step."""
+	agent_id: str
+	step: int
+	step_error: list[str]
+	consecutive_failures: int
+	actions: list[dict]
+	name: str = 'agent_step'
+@dataclass
+class AgentRunTelemetryEvent(BaseTelemetryEvent):
+	"""Telemetry event named 'agent_run' describing an agent run (task, model, library, version, source)."""
+	agent_id: str
+	use_vision: bool
+	task: str
+	model_name: str
+	chat_model_library: str
+	version: str
+	source: str
+	name: str = 'agent_run'
+@dataclass
+class AgentEndTelemetryEvent(BaseTelemetryEvent):
+	"""Telemetry event named 'agent_end' summarising a finished agent run."""
+	agent_id: str
+	steps: int
+	max_steps_reached: bool
+	is_done: bool
+	# None is allowed; presumably it means "success not determined" - TODO confirm with the producer
+	success: bool | None
+	total_input_tokens: int
+	total_duration_seconds: float
+	errors: Sequence[str | None]
+	name: str = 'agent_end'

browser_use/utils.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import logging
+import time
+from functools import wraps
+from typing import Any, Callable, Coroutine, ParamSpec, TypeVar
+logger = logging.getLogger(__name__)
+# Define generic type variables for return type and parameters
+R = TypeVar('R')
+P = ParamSpec('P')
+def time_execution_sync(additional_text: str = '') -> Callable[[Callable[P, R]], Callable[P, R]]:
+	"""Return a decorator that logs, at DEBUG level, how long each call of the wrapped function took.
+	`additional_text` is put in front of the logged message.
+	"""
+	def decorator(func: Callable[P, R]) -> Callable[P, R]:
+		@wraps(func)
+		def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
+			started_at = time.time()
+			outcome = func(*args, **kwargs)
+			# Only the call itself is measured; the log call happens afterwards.
+			elapsed = time.time() - started_at
+			logger.debug(f'{additional_text} Execution time: {elapsed:.2f} seconds')
+			return outcome
+		return wrapper
+	return decorator
+def time_execution_async(
+	additional_text: str = '',
+) -> Callable[[Callable[P, Coroutine[Any, Any, R]]], Callable[P, Coroutine[Any, Any, R]]]:
+	"""Return a decorator that logs, at DEBUG level, how long awaiting the wrapped coroutine function took.
+	`additional_text` is put in front of the logged message.
+	"""
+	def decorator(func: Callable[P, Coroutine[Any, Any, R]]) -> Callable[P, Coroutine[Any, Any, R]]:
+		@wraps(func)
+		async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
+			started_at = time.time()
+			outcome = await func(*args, **kwargs)
+			# The measured span covers the whole await, including time spent suspended.
+			elapsed = time.time() - started_at
+			logger.debug(f'{additional_text} Execution time: {elapsed:.2f} seconds')
+			return outcome
+		return wrapper
+	return decorator
+def singleton(cls):
+	"""Class decorator: build `cls` on the first call and hand back that same object on every later call.
+	Arguments passed to later calls are ignored once an instance exists.
+	"""
+	cached_instance = None
+	def wrapper(*args, **kwargs):
+		nonlocal cached_instance
+		if cached_instance is None:
+			cached_instance = cls(*args, **kwargs)
+		return cached_instance
+	return wrapper

codebeaver.yml ADDED Viewed

	@@ -0,0 +1,4 @@

+environment:
+- OPENAI_API_KEY=empty
+- AZURE_OPENAI_API_KEY=empty
+from: pytest

conftest.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import os
+import sys
+from browser_use.logging_config import setup_logging
+# Get the absolute path to the project root
+project_root = os.path.dirname(os.path.abspath(__file__))
+# Put the project root first on the import path.
+# NOTE(review): `browser_use` is already imported above, before this insert runs - confirm the order is intended.
+sys.path.insert(0, project_root)
+# Configure logging once for the whole test session
+setup_logging()

docs/README.md ADDED Viewed

	@@ -0,0 +1,17 @@

+# Docs
+The official documentation for Browser Use. The docs are published to [Browser Use Docs](https://docs.browser-use.com).
+### Development
+Install the [Mintlify CLI](https://www.npmjs.com/package/mintlify) to preview the documentation changes locally. To install, use the following command
+```
+npm i -g mintlify
+```
+Run the following command at the root of your documentation (where mint.json is)
+```
+mintlify dev
+```