Boobs00 committed on
Commit
db4810d
·
verified ·
1 Parent(s): e91a48e

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .env.example +8 -0
  2. .gitattributes +4 -35
  3. .github/ISSUE_TEMPLATE/bug_report.yml +84 -0
  4. .github/ISSUE_TEMPLATE/config.yml +11 -0
  5. .github/ISSUE_TEMPLATE/docs_issue.yml +55 -0
  6. .github/ISSUE_TEMPLATE/feature_request.yml +43 -0
  7. .github/workflows/publish.yml +38 -0
  8. .gitignore +190 -0
  9. .pre-commit-config.yaml +18 -0
  10. .python-version +1 -0
  11. LICENSE +21 -0
  12. README.md +193 -10
  13. SECURITY.md +20 -0
  14. browser_use/README.md +51 -0
  15. browser_use/__init__.py +27 -0
  16. browser_use/agent/gif.py +325 -0
  17. browser_use/agent/message_manager/service.py +306 -0
  18. browser_use/agent/message_manager/tests.py +237 -0
  19. browser_use/agent/message_manager/utils.py +127 -0
  20. browser_use/agent/message_manager/views.py +129 -0
  21. browser_use/agent/prompts.py +165 -0
  22. browser_use/agent/service.py +964 -0
  23. browser_use/agent/system_prompt.md +69 -0
  24. browser_use/agent/tests.py +197 -0
  25. browser_use/agent/views.py +393 -0
  26. browser_use/browser/browser.py +253 -0
  27. browser_use/browser/context.py +1353 -0
  28. browser_use/browser/tests/screenshot_test.py +37 -0
  29. browser_use/browser/tests/test_clicks.py +94 -0
  30. browser_use/browser/views.py +53 -0
  31. browser_use/controller/registry/service.py +199 -0
  32. browser_use/controller/registry/views.py +70 -0
  33. browser_use/controller/service.py +532 -0
  34. browser_use/controller/views.py +65 -0
  35. browser_use/dom/__init__.py +0 -0
  36. browser_use/dom/buildDomTree.js +1055 -0
  37. browser_use/dom/history_tree_processor/service.py +107 -0
  38. browser_use/dom/history_tree_processor/view.py +70 -0
  39. browser_use/dom/service.py +169 -0
  40. browser_use/dom/tests/debug_page_structure.py +123 -0
  41. browser_use/dom/tests/extraction_test.py +147 -0
  42. browser_use/dom/tests/process_dom_test.py +40 -0
  43. browser_use/dom/views.py +196 -0
  44. browser_use/logging_config.py +132 -0
  45. browser_use/telemetry/service.py +105 -0
  46. browser_use/telemetry/views.py +63 -0
  47. browser_use/utils.py +54 -0
  48. codebeaver.yml +4 -0
  49. conftest.py +10 -0
  50. docs/README.md +17 -0
.env.example ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ OPENAI_API_KEY=
2
+ ANTHROPIC_API_KEY=
3
+
4
+ # Set to false to disable anonymized telemetry
5
+ ANONYMIZED_TELEMETRY=true
6
+
7
+ # LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info
8
+ BROWSER_USE_LOGGING_LEVEL=info
.gitattributes CHANGED
@@ -1,35 +1,4 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ static/*.gif filter=lfs diff=lfs merge=lfs -text
2
+ # static/*.mp4 filter=lfs diff=lfs merge=lfs -text
3
+ docs/images/checks-passed.png filter=lfs diff=lfs merge=lfs -text
4
+ docs/images/laminar.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.github/ISSUE_TEMPLATE/bug_report.yml ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: 🐛 Bug Report
2
+ description: Report a bug in browser-use
3
+ labels: ["bug", "triage"]
4
+ body:
5
+ - type: markdown
6
+ attributes:
7
+ value: |
8
+ Thanks for taking the time to fill out this bug report! Please fill out the form below to help us reproduce and fix the issue.
9
+
10
+ - type: textarea
11
+ id: description
12
+ attributes:
13
+ label: Bug Description
14
+ description: A clear and concise description of what the bug is.
15
+ placeholder: When I try to... the library...
16
+ validations:
17
+ required: true
18
+
19
+ - type: textarea
20
+ id: reproduction
21
+ attributes:
22
+ label: Reproduction Steps
23
+ description: Steps to reproduce the behavior
24
+ placeholder: |
25
+ 1. Install browser-use...
26
+ 2. Run the following task...
27
+ 3. See error...
28
+ validations:
29
+ required: true
30
+
31
+ - type: textarea
32
+ id: code
33
+ attributes:
34
+ label: Code Sample
35
+ description: Include a minimal code sample that reproduces the issue
36
+ render: python
37
+ validations:
38
+ required: true
39
+
40
+ - type: input
41
+ id: version
42
+ attributes:
43
+ label: Version
44
+ description: What version of browser-use are you using? (Run `uv pip show browser-use` to find out)
45
+ placeholder: "e.g., pip 0.1.26, or git main branch"
46
+ validations:
47
+ required: true
48
+
49
+ - type: dropdown
50
+ id: model
51
+ attributes:
52
+ label: LLM Model
53
+ description: Which LLM model(s) are you using?
54
+ multiple: true
55
+ options:
56
+ - GPT-4o
57
+ - GPT-4
58
+ - Claude 3.5 Sonnet
59
+ - Claude 3.5 Opus
60
+ - Claude 3.5 Haiku
61
+ - Gemini 1.5 Pro
62
+ - Gemini 1.5 Ultra
63
+ - Fireworks Mixtral
64
+ - DeepSeek Coder
65
+ - Local Model (Specify model in description)
66
+ - Other (specify in description)
67
+ validations:
68
+ required: true
69
+
70
+ - type: input
71
+ id: os
72
+ attributes:
73
+ label: Operating System
74
+ description: What operating system are you using?
75
+ placeholder: "e.g., macOS 13.1, Windows 11, Ubuntu 22.04"
76
+ validations:
77
+ required: true
78
+
79
+ - type: textarea
80
+ id: logs
81
+ attributes:
82
+ label: Relevant Log Output
83
+ description: Please copy and paste any relevant log output. This will be automatically formatted into code.
84
+ render: shell
.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ blank_issues_enabled: false # Set to true if you want to allow blank issues
2
+ contact_links:
3
+ - name: 🤔 Quickstart Guide
4
+ url: https://docs.browser-use.com/quickstart
5
+ about: Most common issues can be resolved by following our quickstart guide
6
+ - name: 🤔 Questions and Help
7
+ url: https://link.browser-use.com/discord
8
+ about: Please ask questions in our Discord community
9
+ - name: 📖 Documentation
10
+ url: https://docs.browser-use.com
11
+ about: Check our documentation for answers first
.github/ISSUE_TEMPLATE/docs_issue.yml ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: 📚 Documentation Issue
2
+ description: Report an issue in the browser-use documentation
3
+ labels: ["documentation"]
4
+ body:
5
+ - type: markdown
6
+ attributes:
7
+ value: |
8
+ Thanks for taking the time to improve our documentation! Please fill out the form below to help us understand the issue.
9
+
10
+ - type: dropdown
11
+ id: type
12
+ attributes:
13
+ label: Type of Documentation Issue
14
+ description: What type of documentation issue is this?
15
+ options:
16
+ - Missing documentation
17
+ - Incorrect documentation
18
+ - Unclear documentation
19
+ - Broken link
20
+ - Other (specify in description)
21
+ validations:
22
+ required: true
23
+
24
+ - type: input
25
+ id: page
26
+ attributes:
27
+ label: Documentation Page
28
+ description: Which page or section of the documentation is this about?
29
+ placeholder: "e.g., https://docs.browser-use.com/getting-started or Installation Guide"
30
+ validations:
31
+ required: true
32
+
33
+ - type: textarea
34
+ id: description
35
+ attributes:
36
+ label: Issue Description
37
+ description: Describe what's wrong or missing in the documentation
38
+ placeholder: The documentation should...
39
+ validations:
40
+ required: true
41
+
42
+ - type: textarea
43
+ id: suggestion
44
+ attributes:
45
+ label: Suggested Changes
46
+ description: If you have specific suggestions for how to improve the documentation, please share them
47
+ placeholder: |
48
+ The documentation could be improved by...
49
+
50
+ Example:
51
+ ```python
52
+ # Your suggested code example or text here
53
+ ```
54
+ validations:
55
+ required: true
.github/ISSUE_TEMPLATE/feature_request.yml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: 💡 Feature Request
2
+ description: Suggest a new feature for browser-use
3
+ labels: ["enhancement"]
4
+ body:
5
+ - type: markdown
6
+ attributes:
7
+ value: |
8
+ Thanks for taking the time to suggest a new feature! Please fill out the form below to help us understand your suggestion.
9
+
10
+ - type: textarea
11
+ id: problem
12
+ attributes:
13
+ label: Problem Description
14
+ description: Is your feature request related to a problem? Please describe.
15
+ placeholder: I'm always frustrated when...
16
+ validations:
17
+ required: true
18
+
19
+ - type: textarea
20
+ id: solution
21
+ attributes:
22
+ label: Proposed Solution
23
+ description: Describe the solution you'd like to see
24
+ placeholder: It would be great if...
25
+ validations:
26
+ required: true
27
+
28
+ - type: textarea
29
+ id: alternatives
30
+ attributes:
31
+ label: Alternative Solutions
32
+ description: Describe any alternative solutions or features you've considered
33
+ placeholder: I've also thought about...
34
+
35
+ - type: textarea
36
+ id: context
37
+ attributes:
38
+ label: Additional Context
39
+ description: Add any other context or examples about the feature request here
40
+ placeholder: |
41
+ - Example use cases
42
+ - Screenshots or mockups
43
+ - Related issues or discussions
.github/workflows/publish.yml ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This workflow will upload a Python Package using Twine when a release is created
2
+ # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3
+
4
+ # This workflow uses actions that are not certified by GitHub.
5
+ # They are provided by a third-party and are governed by
6
+ # separate terms of service, privacy policy, and support
7
+ # documentation.
8
+
9
+ name: Upload Python Package
10
+
11
+ on:
12
+ release:
13
+ types: [published]
14
+
15
+ permissions:
16
+ contents: read
17
+
18
+ jobs:
19
+ deploy:
20
+ runs-on: ubuntu-latest
21
+
22
+ steps:
23
+ - uses: actions/checkout@v4
24
+ - name: Set up Python
25
+ uses: actions/setup-python@v5
26
+ with:
27
+ python-version: "3.x"
28
+ - name: Install dependencies
29
+ run: |
30
+ python -m pip install --upgrade pip
31
+ pip install build hatch
32
+ - name: Build package
33
+ run: python -m build
34
+ - name: Publish package
35
+ uses: pypa/gh-action-pypi-publish@release/v1
36
+ with:
37
+ user: __token__
38
+ password: ${{ secrets.PYPI_API_TOKEN }}
.gitignore ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+ test_env/
133
+
134
+
135
+ # Spyder project settings
136
+ .spyderproject
137
+ .spyproject
138
+
139
+ # Rope project settings
140
+ .ropeproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # pytype static type analyzer
154
+ .pytype/
155
+
156
+ # Cython debug symbols
157
+ cython_debug/
158
+
159
+ # PyCharm
160
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
163
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164
+ #.idea/
165
+ temp
166
+ tmp
167
+
168
+
169
+ .DS_Store
170
+
171
+ private_example.py
172
+ private_example
173
+
174
+ browser_cookies.json
175
+ cookies.json
176
+ AgentHistory.json
177
+ cv_04_24.pdf
178
+ AgentHistoryList.json
179
+ *.gif
180
+ gcp-login.json
181
+ .vscode
182
+ .ruff_cache
183
+ .idea
184
+ *.txt
185
+ *.pdf
186
+ *.csv
187
+ *.json
188
+ *.jsonl
189
+
190
+ uv.lock
.pre-commit-config.yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ rev: v0.3.0
4
+ hooks:
5
+ - id: ruff
6
+ args: [
7
+ --line-length=130,
8
+ --select=E,F,I,
9
+ --fix,
10
+ ]
11
+
12
+ - repo: https://github.com/pre-commit/pre-commit-hooks
13
+ rev: v4.5.0
14
+ hooks:
15
+ - id: trailing-whitespace
16
+ - id: end-of-file-fixer
17
+ - id: check-yaml
18
+ - id: check-toml
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.11
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Gregor Zunic
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,10 +1,193 @@
1
- ---
2
- title: Use
3
- emoji: 🌍
4
- colorFrom: indigo
5
- colorTo: indigo
6
- sdk: static
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <picture>
2
+ <source media="(prefers-color-scheme: dark)" srcset="./static/browser-use-dark.png">
3
+ <source media="(prefers-color-scheme: light)" srcset="./static/browser-use.png">
4
+ <img alt="Shows a black Browser Use Logo in light color mode and a white one in dark color mode." src="./static/browser-use.png" width="full">
5
+ </picture>
6
+
7
+ <h1 align="center">Enable AI to control your browser 🤖</h1>
8
+
9
+ [![GitHub stars](https://img.shields.io/github/stars/gregpr07/browser-use?style=social)](https://github.com/gregpr07/browser-use/stargazers)
10
+ [![Discord](https://img.shields.io/discord/1303749220842340412?color=7289DA&label=Discord&logo=discord&logoColor=white)](https://link.browser-use.com/discord)
11
+ [![Cloud](https://img.shields.io/badge/Cloud-☁️-blue)](https://cloud.browser-use.com)
12
+ [![Documentation](https://img.shields.io/badge/Documentation-📕-blue)](https://docs.browser-use.com)
13
+ [![Twitter Follow](https://img.shields.io/twitter/follow/Gregor?style=social)](https://x.com/gregpr07)
14
+ [![Twitter Follow](https://img.shields.io/twitter/follow/Magnus?style=social)](https://x.com/mamagnus00)
15
+ [![Weave Badge](https://img.shields.io/endpoint?url=https%3A%2F%2Fapp.workweave.ai%2Fapi%2Frepository%2Fbadge%2Forg_T5Pvn3UBswTHIsN1dWS3voPg%2F881458615&labelColor=#EC6341)](https://app.workweave.ai/reports/repository/org_T5Pvn3UBswTHIsN1dWS3voPg/881458615)
16
+
17
+ 🌐 Browser-use is the easiest way to connect your AI agents with the browser.
18
+
19
+ 💡 See what others are building and share your projects in our [Discord](https://link.browser-use.com/discord)! Want Swag? Check out our [Merch store](https://browsermerch.com).
20
+
21
+ 🌤️ Skip the setup - try our <b>hosted version</b> for instant browser automation! <b>[Try the cloud ☁︎](https://cloud.browser-use.com)</b>.
22
+
23
+ # Quick start
24
+
25
+ With pip (Python>=3.11):
26
+
27
+ ```bash
28
+ pip install browser-use
29
+ ```
30
+
31
+ Install Playwright:
32
+
33
+ ```bash
34
+ playwright install
35
+ ```
36
+
37
+ Spin up your agent:
38
+
39
+ ```python
40
+ from langchain_openai import ChatOpenAI
41
+ from browser_use import Agent
42
+ import asyncio
43
+ from dotenv import load_dotenv
44
+ load_dotenv()
45
+
46
+ async def main():
47
+ agent = Agent(
48
+ task="Compare the price of gpt-4o and DeepSeek-V3",
49
+ llm=ChatOpenAI(model="gpt-4o"),
50
+ )
51
+ await agent.run()
52
+
53
+ asyncio.run(main())
54
+ ```
55
+
56
+ Add your API keys for the provider you want to use to your `.env` file.
57
+
58
+ ```bash
59
+ OPENAI_API_KEY=
60
+ ```
61
+
62
+ For other settings, models, and more, check out the [documentation 📕](https://docs.browser-use.com).
63
+
64
+ ### Test with UI
65
+
66
+ You can test [browser-use with a UI repository](https://github.com/browser-use/web-ui)
67
+
68
+ Or simply run the gradio example:
69
+
70
+ ```
71
+ uv pip install gradio
72
+ ```
73
+
74
+ ```bash
75
+ python examples/ui/gradio_demo.py
76
+ ```
77
+
78
+ # Demos
79
+
80
+ <br/><br/>
81
+
82
+ [Task](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/shopping.py): Add grocery items to cart, and checkout.
83
+
84
+ [![AI Did My Groceries](https://github.com/user-attachments/assets/d9359085-bde6-41d4-aa4e-6520d0221872)](https://www.youtube.com/watch?v=L2Ya9PYNns8)
85
+
86
+ <br/><br/>
87
+
88
+ Prompt: Add my latest LinkedIn follower to my leads in Salesforce.
89
+
90
+ ![LinkedIn to Salesforce](https://github.com/user-attachments/assets/1440affc-a552-442e-b702-d0d3b277b0ae)
91
+
92
+ <br/><br/>
93
+
94
+ [Prompt](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/find_and_apply_to_jobs.py): Read my CV & find ML jobs, save them to a file, and then start applying for them in new tabs, if you need help, ask me.
95
+
96
+ https://github.com/user-attachments/assets/171fb4d6-0355-46f2-863e-edb04a828d04
97
+
98
+ <br/><br/>
99
+
100
+ [Prompt](https://github.com/browser-use/browser-use/blob/main/examples/browser/real_browser.py): Write a letter in Google Docs to my Papa, thanking him for everything, and save the document as a PDF.
101
+
102
+ ![Letter to Papa](https://github.com/user-attachments/assets/242ade3e-15bc-41c2-988f-cbc5415a66aa)
103
+
104
+ <br/><br/>
105
+
106
+ [Prompt](https://github.com/browser-use/browser-use/blob/main/examples/custom-functions/save_to_file_hugging_face.py): Look up models with a license of cc-by-sa-4.0 and sort by most likes on Hugging Face, save top 5 to file.
107
+
108
+ https://github.com/user-attachments/assets/de73ee39-432c-4b97-b4e8-939fd7f323b3
109
+
110
+ <br/><br/>
111
+
112
+ ## More examples
113
+
114
+ For more examples see the [examples](examples) folder or join the [Discord](https://link.browser-use.com/discord) and show off your project.
115
+
116
+ # Vision
117
+
118
+ Tell your computer what to do, and it gets it done.
119
+
120
+ ## Roadmap
121
+
122
+ ### Agent
123
+
124
+ - [ ] Improve agent memory (summarize, compress, RAG, etc.)
125
+ - [ ] Enhance planning capabilities (load website specific context)
126
+ - [ ] Reduce token consumption (system prompt, DOM state)
127
+
128
+ ### DOM Extraction
129
+
130
+ - [ ] Improve extraction for datepickers, dropdowns, special elements
131
+ - [ ] Improve state representation for UI elements
132
+
133
+ ### Rerunning tasks
134
+
135
+ - [ ] LLM as fallback
136
+ - [ ] Make it easy to define workflow templates where the LLM fills in the details
137
+ - [ ] Return playwright script from the agent
138
+
139
+ ### Datasets
140
+
141
+ - [ ] Create datasets for complex tasks
142
+ - [ ] Benchmark various models against each other
143
+ - [ ] Fine-tuning models for specific tasks
144
+
145
+ ### User Experience
146
+
147
+ - [ ] Human-in-the-loop execution
148
+ - [ ] Improve the generated GIF quality
149
+ - [ ] Create various demos for tutorial execution, job application, QA testing, social media, etc.
150
+
151
+ ## Contributing
152
+
153
+ We love contributions! Feel free to open issues for bugs or feature requests. To contribute to the docs, check out the `/docs` folder.
154
+
155
+ ## Local Setup
156
+
157
+ To learn more about the library, check out the [local setup 📕](https://docs.browser-use.com/development/local-setup).
158
+
159
+ ## Cooperations
160
+
161
+ We are forming a commission to define best practices for UI/UX design for browser agents.
162
+ Together, we're exploring how software redesign improves the performance of AI agents and gives these companies a competitive advantage by designing their existing software to be at the forefront of the agent age.
163
+
164
+ Email [Toby](mailto:tbiddle@loop11.com?subject=I%20want%20to%20join%20the%20UI/UX%20commission%20for%20AI%20agents&body=Hi%20Toby%2C%0A%0AI%20found%20you%20in%20the%20browser-use%20GitHub%20README.%0A%0A) to apply for a seat on the committee.
165
+
166
+ ## Swag
167
+
168
+ Want to show off your Browser-use swag? Check out our [Merch store](https://browsermerch.com). Good contributors will receive swag for free 👀.
169
+
170
+ ## Citation
171
+
172
+ If you use Browser Use in your research or project, please cite:
173
+
174
+ ```bibtex
175
+ @software{browser_use2024,
176
+ author = {Müller, Magnus and Žunič, Gregor},
177
+ title = {Browser Use: Enable AI to control your browser},
178
+ year = {2024},
179
+ publisher = {GitHub},
180
+ url = {https://github.com/browser-use/browser-use}
181
+ }
182
+ ```
183
+
184
+ <div align="center"> <img src="https://github.com/user-attachments/assets/402b2129-b6ac-44d3-a217-01aea3277dce" width="400"/>
185
+
186
+ [![Twitter Follow](https://img.shields.io/twitter/follow/Gregor?style=social)](https://x.com/gregpr07)
187
+ [![Twitter Follow](https://img.shields.io/twitter/follow/Magnus?style=social)](https://x.com/mamagnus00)
188
+
189
+ </div>
190
+
191
+ <div align="center">
192
+ Made with ❤️ in Zurich and San Francisco
193
+ </div>
SECURITY.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Reporting Security Issues
2
+
3
+ If you believe you have found a security vulnerability in browser-use, please report it through coordinated disclosure.
4
+
5
+ **Please do not report security vulnerabilities through the repository issues, discussions, or pull requests.**
6
+
7
+ Instead, please open a new [GitHub security advisory](https://github.com/browser-use/browser-use/security/advisories/new).
8
+
9
+ Please include as much of the information listed below as you can to help me better understand and resolve the issue:
10
+
11
+ * The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting)
12
+ * Full paths of source file(s) related to the manifestation of the issue
13
+ * The location of the affected source code (tag/branch/commit or direct URL)
14
+ * Any special configuration required to reproduce the issue
15
+ * Step-by-step instructions to reproduce the issue
16
+ * Proof-of-concept or exploit code (if possible)
17
+ * Impact of the issue, including how an attacker might exploit the issue
18
+
19
+ This information will help me triage your report more quickly.
20
+
browser_use/README.md ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Codebase Structure
2
+
3
+ > The code structure is inspired by https://github.com/Netflix/dispatch.
4
+
5
+ Another very good reference on how to structure a scalable codebase is [this repo](https://github.com/zhanymkanov/fastapi-best-practices).
6
+
7
+ Just a brief document about how we should structure our backend codebase.
8
+
9
+ ## Code Structure
10
+
11
+ ```markdown
12
+ src/
13
+ /<service name>/
14
+ models.py
15
+ services.py
16
+ prompts.py
17
+ views.py
18
+ utils.py
19
+ routers.py
20
+
21
+ /_<subservice name>/
22
+ ```
23
+
24
+ ### Service.py
25
+
26
+ Always a single file; if it becomes too long (more than ~500 lines), split it into \_subservices.
27
+
28
+ ### Views.py
29
+
30
+ Always split the views into two parts
31
+
32
+ ```python
33
+ # All
34
+ ...
35
+
36
+ # Requests
37
+ ...
38
+
39
+ # Responses
40
+ ...
41
+ ```
42
+
43
+ If too long → split into multiple files
44
+
45
+ ### Prompts.py
46
+
47
+ Single file; if too long → split into multiple files (one prompt per file or so)
48
+
49
+ ### Routers.py
50
+
51
+ Never split into more than one file
browser_use/__init__.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from browser_use.logging_config import setup_logging
2
+
3
+ setup_logging()
4
+
5
+ from browser_use.agent.prompts import SystemPrompt as SystemPrompt
6
+ from browser_use.agent.service import Agent as Agent
7
+ from browser_use.agent.views import ActionModel as ActionModel
8
+ from browser_use.agent.views import ActionResult as ActionResult
9
+ from browser_use.agent.views import AgentHistoryList as AgentHistoryList
10
+ from browser_use.browser.browser import Browser as Browser
11
+ from browser_use.browser.browser import BrowserConfig as BrowserConfig
12
+ from browser_use.browser.context import BrowserContextConfig
13
+ from browser_use.controller.service import Controller as Controller
14
+ from browser_use.dom.service import DomService as DomService
15
+
16
+ __all__ = [
17
+ 'Agent',
18
+ 'Browser',
19
+ 'BrowserConfig',
20
+ 'Controller',
21
+ 'DomService',
22
+ 'SystemPrompt',
23
+ 'ActionResult',
24
+ 'ActionModel',
25
+ 'AgentHistoryList',
26
+ 'BrowserContextConfig',
27
+ ]
browser_use/agent/gif.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import io
5
+ import logging
6
+ import os
7
+ import platform
8
+ from typing import TYPE_CHECKING, Optional
9
+
10
+ from browser_use.agent.views import (
11
+ AgentHistoryList,
12
+ )
13
+
14
+ if TYPE_CHECKING:
15
+ from PIL import Image, ImageFont
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def create_history_gif(
    task: str,
    history: AgentHistoryList,
    #
    output_path: str = 'agent_history.gif',
    duration: int = 3000,
    show_goals: bool = True,
    show_task: bool = True,
    show_logo: bool = False,
    font_size: int = 40,
    title_font_size: int = 56,
    goal_font_size: int = 44,
    margin: int = 40,
    line_spacing: float = 1.5,
) -> None:
    """Create a GIF from the agent's history with overlaid task and goal text.

    Args:
        task: Task description rendered on the optional first frame.
        history: Agent history; each item's ``state.screenshot`` is a
            base64-encoded image that becomes one frame.
        output_path: Where the GIF is written.
        duration: Forwarded to Pillow's ``duration`` option when saving.
        show_goals: Overlay the step number and next goal on each frame that
            has a model output.
        show_task: Prepend a frame showing ``task`` (only when ``task`` is
            non-empty).
        show_logo: Try to load ``./static/browser-use.png`` and paste it on
            every frame; a failure is logged and ignored.
        font_size: Size of the regular font.
        title_font_size: Size of the title font (also used for the overlays).
        goal_font_size: Accepted for backward compatibility; the goal text is
            drawn with the title font, so this value is currently not used.
        margin: Margin forwarded to the overlay helper.
        line_spacing: Line spacing forwarded to the task-frame helper.

    Returns:
        None. Nothing is written when the history is empty or its first item
        has no screenshot.
    """
    if not history.history:
        logger.warning('No history to create GIF from')
        return

    # The first screenshot defines the size of the task frame, so it is required.
    if not history.history[0].state.screenshot:
        logger.warning('No history or first screenshot to create GIF from')
        return

    from PIL import Image, ImageFont

    # Try the preferred fonts in order; fall back to Pillow's default font when
    # none of them can be loaded (the ``else`` runs only if no ``break`` happened).
    regular_font = title_font = None
    for font_name in ('Helvetica', 'Arial', 'DejaVuSans', 'Verdana'):
        if platform.system() == 'Windows':
            # Need to specify the abs font path on Windows
            font_name = os.path.join(os.getenv('WIN_FONT_DIR', 'C:\\Windows\\Fonts'), font_name + '.ttf')
        try:
            regular_font = ImageFont.truetype(font_name, font_size)
            title_font = ImageFont.truetype(font_name, title_font_size)
            break
        except OSError:
            continue
    else:
        regular_font = ImageFont.load_default()
        title_font = ImageFont.load_default()

    # Load logo if requested
    logo = None
    if show_logo:
        try:
            # The context manager closes the source file; ``resize`` returns an
            # independent in-memory copy that outlives it.
            with Image.open('./static/browser-use.png') as logo_source:
                logo_height = 150
                aspect_ratio = logo_source.width / logo_source.height
                logo_width = int(logo_height * aspect_ratio)
                logo = logo_source.resize((logo_width, logo_height), Image.Resampling.LANCZOS)
        except Exception as e:
            # Best effort: a missing or unreadable logo must not prevent the GIF.
            logger.warning(f'Could not load logo: {e}')

    images = []

    # Create task frame if requested
    if show_task and task:
        task_frame = _create_task_frame(
            task,
            history.history[0].state.screenshot,
            title_font,  # type: ignore
            regular_font,  # type: ignore
            logo,
            line_spacing,
        )
        images.append(task_frame)

    # Process each history item; step numbers start at 1
    for i, item in enumerate(history.history, 1):
        if not item.state.screenshot:
            continue

        # Convert base64 screenshot to PIL Image
        img_data = base64.b64decode(item.state.screenshot)
        image = Image.open(io.BytesIO(img_data))

        if show_goals and item.model_output:
            image = _add_overlay_to_image(
                image=image,
                step_number=i,
                goal_text=item.model_output.current_state.next_goal,
                regular_font=regular_font,  # type: ignore
                title_font=title_font,  # type: ignore
                margin=margin,
                logo=logo,
            )

        images.append(image)

    if images:
        # Save the GIF: first frame plus all remaining frames, looping forever
        images[0].save(
            output_path,
            save_all=True,
            append_images=images[1:],
            duration=duration,
            loop=0,
            optimize=False,
        )
        logger.info(f'Created GIF at {output_path}')
    else:
        logger.warning('No images found in history to create GIF')
137
+
138
+
139
def _create_task_frame(
    task: str,
    first_screenshot: str,
    title_font: 'ImageFont.FreeTypeFont',
    regular_font: 'ImageFont.FreeTypeFont',
    logo: Optional['Image.Image'] = None,
    line_spacing: float = 1.5,
) -> 'Image.Image':
    """Create initial frame showing the task.

    Args:
        task: Text to render, wrapped and centred on a black background.
        first_screenshot: Base64-encoded image; only its size is used.
        title_font: Not used by this function (kept for the existing call
            signature).
        regular_font: Base font; the task is drawn 16 points larger when the
            font can be re-sized.
        logo: Optional image pasted into the top right corner.
        line_spacing: Multiplier applied to the font size to get line height.

    Returns:
        A new RGB image with the same size as the screenshot.
    """
    from PIL import Image, ImageDraw

    img_data = base64.b64decode(first_screenshot)
    template = Image.open(io.BytesIO(img_data))
    image = Image.new('RGB', template.size, (0, 0, 0))
    draw = ImageDraw.Draw(image)

    # Calculate vertical center of image
    center_y = image.height // 2

    # Draw task text with increased font size
    margin = 140  # Increased margin
    max_width = image.width - (2 * margin)
    try:
        # ``font_variant`` re-sizes the font without relying on a file path,
        # which the caller's ``load_default()`` fallback font may not have.
        larger_font = regular_font.font_variant(size=regular_font.size + 16)
    except (AttributeError, OSError):
        # Font cannot be re-sized (e.g. a bitmap default font): use it as is.
        larger_font = regular_font
    wrapped_text = _wrap_text(task, larger_font, max_width)

    # Calculate line height with spacing; fonts without a ``size`` attribute
    # are measured from a sample string instead.
    font_height = getattr(larger_font, 'size', None)
    if font_height is None:
        sample_bbox = larger_font.getbbox('Ag')
        font_height = sample_bbox[3] - sample_bbox[1]
    line_height = font_height * line_spacing

    # Split text into lines and draw with custom spacing
    lines = wrapped_text.split('\n')
    total_height = line_height * len(lines)

    # Start position for first line
    text_y = center_y - (total_height / 2) + 50  # Shifted down slightly

    for line in lines:
        # Get line width for centering
        line_bbox = draw.textbbox((0, 0), line, font=larger_font)
        text_x = (image.width - (line_bbox[2] - line_bbox[0])) // 2

        draw.text(
            (text_x, text_y),
            line,
            font=larger_font,
            fill=(255, 255, 255),
        )
        text_y += line_height

    # Add logo if provided (top right corner); RGBA logos act as their own mask
    if logo:
        logo_margin = 20
        logo_x = image.width - logo.width - logo_margin
        image.paste(logo, (logo_x, logo_margin), logo if logo.mode == 'RGBA' else None)

    return image
194
+
195
+
196
def _add_overlay_to_image(
    image: 'Image.Image',
    step_number: int,
    goal_text: str,
    regular_font: 'ImageFont.FreeTypeFont',
    title_font: 'ImageFont.FreeTypeFont',
    margin: int,
    logo: Optional['Image.Image'] = None,
    display_step: bool = True,
    text_color: tuple[int, int, int, int] = (255, 255, 255, 255),
    text_box_color: tuple[int, int, int, int] = (0, 0, 0, 255),
) -> 'Image.Image':
    """Add step number and goal overlay to an image.

    Args:
        image: Frame to annotate; it is converted to RGBA for compositing.
        step_number: Number shown in the bottom-left badge.
        goal_text: Goal text, wrapped and centred above the badge position.
        regular_font: Not used by this function (kept for the existing call
            signature).
        title_font: Font used for both the step number and the goal text.
        margin: Distance of the badge from the image edges; also limits the
            goal text width.
        logo: Optional image composited into the top right corner.
        display_step: When False the step badge is not drawn; the goal text
            keeps the same position it has when the badge is shown.
        text_color: RGBA colour of the text.
        text_box_color: RGBA colour of the rounded background boxes.

    Returns:
        The annotated frame converted back to RGB.
    """
    from PIL import Image, ImageDraw

    image = image.convert('RGBA')
    txt_layer = Image.new('RGBA', image.size, (0, 0, 0, 0))
    draw = ImageDraw.Draw(txt_layer)

    # The badge geometry is always computed because the goal text is positioned
    # relative to it; measuring text does not draw anything.
    padding = 20  # Increased padding
    step_text = str(step_number)
    step_bbox = draw.textbbox((0, 0), step_text, font=title_font)
    step_width = step_bbox[2] - step_bbox[0]
    step_height = step_bbox[3] - step_bbox[1]

    # Position step number in bottom left
    x_step = margin + 10  # Slight additional offset from edge
    y_step = image.height - margin - step_height - 10  # Slight offset from bottom

    if display_step:
        # Draw rounded rectangle background for step number
        step_bg_bbox = (
            x_step - padding,
            y_step - padding,
            x_step + step_width + padding,
            y_step + step_height + padding,
        )
        draw.rounded_rectangle(
            step_bg_bbox,
            radius=15,  # Add rounded corners
            fill=text_box_color,
        )

        # Draw step number
        draw.text(
            (x_step, y_step),
            step_text,
            font=title_font,
            fill=text_color,
        )

    # Draw goal text (centered, bottom)
    max_width = image.width - (4 * margin)
    wrapped_goal = _wrap_text(goal_text, title_font, max_width)
    goal_bbox = draw.multiline_textbbox((0, 0), wrapped_goal, font=title_font)
    goal_width = goal_bbox[2] - goal_bbox[0]
    goal_height = goal_bbox[3] - goal_bbox[1]

    # Center goal text horizontally, place above step number
    x_goal = (image.width - goal_width) // 2
    y_goal = y_step - goal_height - padding * 4  # More space between step and goal

    # Draw rounded rectangle background for goal
    padding_goal = 25  # Increased padding for goal
    goal_bg_bbox = (
        x_goal - padding_goal,
        y_goal - padding_goal,
        x_goal + goal_width + padding_goal,
        y_goal + goal_height + padding_goal,
    )
    draw.rounded_rectangle(
        goal_bg_bbox,
        radius=15,  # Add rounded corners
        fill=text_box_color,
    )

    # Draw goal text
    draw.multiline_text(
        (x_goal, y_goal),
        wrapped_goal,
        font=title_font,
        fill=text_color,
        align='center',
    )

    # Add logo if provided (top right corner); the text layer is composited on
    # top of the logo layer
    if logo:
        logo_layer = Image.new('RGBA', image.size, (0, 0, 0, 0))
        logo_margin = 20
        logo_x = image.width - logo.width - logo_margin
        logo_layer.paste(logo, (logo_x, logo_margin), logo if logo.mode == 'RGBA' else None)
        txt_layer = Image.alpha_composite(logo_layer, txt_layer)

    # Composite and convert
    result = Image.alpha_composite(image, txt_layer)
    return result.convert('RGB')
292
+
293
+
294
+ def _wrap_text(text: str, font: 'ImageFont.FreeTypeFont', max_width: int) -> str:
295
+ """
296
+ Wrap text to fit within a given width.
297
+
298
+ Args:
299
+ text: Text to wrap
300
+ font: Font to use for text
301
+ max_width: Maximum width in pixels
302
+
303
+ Returns:
304
+ Wrapped text with newlines
305
+ """
306
+ words = text.split()
307
+ lines = []
308
+ current_line = []
309
+
310
+ for word in words:
311
+ current_line.append(word)
312
+ line = ' '.join(current_line)
313
+ bbox = font.getbbox(line)
314
+ if bbox[2] > max_width:
315
+ if len(current_line) == 1:
316
+ lines.append(current_line.pop())
317
+ else:
318
+ current_line.pop()
319
+ lines.append(' '.join(current_line))
320
+ current_line = [word]
321
+
322
+ if current_line:
323
+ lines.append(' '.join(current_line))
324
+
325
+ return '\n'.join(lines)
browser_use/agent/message_manager/service.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from typing import Dict, List, Optional
5
+
6
+ from langchain_core.messages import (
7
+ AIMessage,
8
+ BaseMessage,
9
+ HumanMessage,
10
+ SystemMessage,
11
+ ToolMessage,
12
+ )
13
+ from pydantic import BaseModel
14
+
15
+ from browser_use.agent.message_manager.views import MessageMetadata
16
+ from browser_use.agent.prompts import AgentMessagePrompt
17
+ from browser_use.agent.views import ActionResult, AgentOutput, AgentStepInfo, MessageManagerState
18
+ from browser_use.browser.views import BrowserState
19
+ from browser_use.utils import time_execution_sync
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class MessageManagerSettings(BaseModel):
    """Limits and optional prompt additions read by ``MessageManager``."""

    # Token budget that ``MessageManager.cut_messages`` trims the history to.
    max_input_tokens: int = 128000
    # Divisor used to estimate token counts from text length.
    estimated_characters_per_token: int = 3
    # Flat token cost charged for every image part of a message.
    image_tokens: int = 800
    # Forwarded to ``AgentMessagePrompt`` when a state message is built.
    include_attributes: list[str] = []
    # Optional extra context added as a message before the task message.
    message_context: Optional[str] = None
    # Maps placeholder name -> secret value; values found in message text are
    # replaced with ``<secret>name</secret>`` before the message is stored.
    sensitive_data: Optional[Dict[str, str]] = None
    # File paths announced to the model in an initial message.
    available_file_paths: Optional[List[str]] = None
32
+
33
+
34
class MessageManager:
    """Builds and maintains the message history sent to the model.

    Every message is stored together with an estimated token count so the
    history can be trimmed to ``settings.max_input_tokens`` by ``cut_messages``.
    """

    def __init__(
        self,
        task: str,
        system_message: SystemMessage,
        settings: Optional[MessageManagerSettings] = None,
        state: Optional[MessageManagerState] = None,
    ):
        """Store the configuration and seed the history when it is empty.

        Args:
            task: The task description given to the agent.
            system_message: System prompt placed first in the history.
            settings: Limits and prompt additions; a fresh default instance is
                created when omitted.
            state: Existing history to continue from; a fresh empty state is
                created when omitted.
        """
        self.task = task
        # Defaults are created per instance: default argument values are
        # evaluated once, so a default instance would be shared (and its
        # history mutated) by every manager that relies on it.
        self.settings = settings if settings is not None else MessageManagerSettings()
        self.state = state if state is not None else MessageManagerState()
        self.system_prompt = system_message

        # Only initialize messages if state is empty
        if len(self.state.history.messages) == 0:
            self._init_messages()

    def _init_messages(self) -> None:
        """Initialize the message history with system message, context, task, and other initial messages"""
        self._add_message_with_tokens(self.system_prompt)

        if self.settings.message_context:
            context_message = HumanMessage(content='Context for the task' + self.settings.message_context)
            self._add_message_with_tokens(context_message)

        task_message = HumanMessage(
            content=f'Your ultimate task is: """{self.task}""". If you achieved your ultimate task, stop everything and use the done action in the next step to complete the task. If not, continue as usual.'
        )
        self._add_message_with_tokens(task_message)

        if self.settings.sensitive_data:
            info = f'Here are placeholders for sensitve data: {list(self.settings.sensitive_data.keys())}'
            info += 'To use them, write <secret>the placeholder name</secret>'
            info_message = HumanMessage(content=info)
            self._add_message_with_tokens(info_message)

        placeholder_message = HumanMessage(content='Example output:')
        self._add_message_with_tokens(placeholder_message)

        # A made-up tool call showing the model the expected output structure.
        tool_calls = [
            {
                'name': 'AgentOutput',
                'args': {
                    'current_state': {
                        'evaluation_previous_goal': 'Success - I opend the first page',
                        'memory': 'Starting with the new task. I have completed 1/10 steps',
                        'next_goal': 'Click on company a',
                    },
                    'action': [{'click_element': {'index': 0}}],
                },
                'id': str(self.state.tool_id),
                'type': 'tool_call',
            }
        ]

        example_tool_call = AIMessage(
            content='',
            tool_calls=tool_calls,
        )
        self._add_message_with_tokens(example_tool_call)
        # Tool response matching the example call (also advances the tool id).
        self.add_tool_message(content='Browser started')

        placeholder_message = HumanMessage(content='[Your task history memory starts here]')
        self._add_message_with_tokens(placeholder_message)

        if self.settings.available_file_paths:
            filepaths_msg = HumanMessage(content=f'Here are file paths you can use: {self.settings.available_file_paths}')
            self._add_message_with_tokens(filepaths_msg)

    def add_new_task(self, new_task: str) -> None:
        """Append a message announcing a new task and remember it as the current task."""
        content = f'Your new ultimate task is: """{new_task}""". Take the previous context into account and finish your new ultimate task. '
        msg = HumanMessage(content=content)
        self._add_message_with_tokens(msg)
        self.task = new_task

    @time_execution_sync('--add_state_message')
    def add_state_message(
        self,
        state: BrowserState,
        result: Optional[List[ActionResult]] = None,
        step_info: Optional[AgentStepInfo] = None,
        use_vision=True,
    ) -> None:
        """Add browser state as human message

        Results flagged ``include_in_memory`` are first added as their own
        messages; in that case no result is embedded in the state message.
        """

        # if keep in memory, add to directly to history and add state without result
        if result:
            for r in result:
                if r.include_in_memory:
                    if r.extracted_content:
                        msg = HumanMessage(content='Action result: ' + str(r.extracted_content))
                        self._add_message_with_tokens(msg)
                    if r.error:
                        # if endswith \n, remove it
                        if r.error.endswith('\n'):
                            r.error = r.error[:-1]
                        # get only last line of error
                        last_line = r.error.split('\n')[-1]
                        msg = HumanMessage(content='Action error: ' + last_line)
                        self._add_message_with_tokens(msg)
                    # Rebinding does not affect the running loop, which keeps
                    # iterating over the original list.
                    result = None  # if result in history, we dont want to add it again

        # otherwise add state message and result to next message (which will not stay in memory)
        state_message = AgentMessagePrompt(
            state,
            result,
            include_attributes=self.settings.include_attributes,
            step_info=step_info,
        ).get_user_message(use_vision)
        self._add_message_with_tokens(state_message)

    def add_model_output(self, model_output: AgentOutput) -> None:
        """Add model output as AI message"""
        tool_calls = [
            {
                'name': 'AgentOutput',
                'args': model_output.model_dump(mode='json', exclude_unset=True),
                'id': str(self.state.tool_id),
                'type': 'tool_call',
            }
        ]

        msg = AIMessage(
            content='',
            tool_calls=tool_calls,
        )

        self._add_message_with_tokens(msg)
        # empty tool response
        self.add_tool_message(content='')

    def add_plan(self, plan: Optional[str], position: int | None = None) -> None:
        """Add a non-empty plan as an AI message at ``position`` (None appends)."""
        if plan:
            msg = AIMessage(content=plan)
            self._add_message_with_tokens(msg, position)

    @time_execution_sync('--get_messages')
    def get_messages(self) -> List[BaseMessage]:
        """Return the current messages and log their per-message token counts.

        No trimming happens here; call ``cut_messages`` for that.
        """

        msg = [m.message for m in self.state.history.messages]
        # debug which messages are in history with token count # log
        total_input_tokens = 0
        logger.debug(f'Messages in history: {len(self.state.history.messages)}:')
        for m in self.state.history.messages:
            total_input_tokens += m.metadata.tokens
            logger.debug(f'{m.message.__class__.__name__} - Token count: {m.metadata.tokens}')
        logger.debug(f'Total input tokens: {total_input_tokens}')

        return msg

    def _add_message_with_tokens(self, message: BaseMessage, position: int | None = None) -> None:
        """Add message with token count metadata
        position: None for last, -1 for second last, etc.
        """

        # filter out sensitive data from the message
        if self.settings.sensitive_data:
            message = self._filter_sensitive_data(message)

        token_count = self._count_tokens(message)
        metadata = MessageMetadata(tokens=token_count)
        self.state.history.add_message(message, metadata, position)

    @time_execution_sync('--filter_sensitive_data')
    def _filter_sensitive_data(self, message: BaseMessage) -> BaseMessage:
        """Replace secret values in the message text with ``<secret>name</secret>``.

        The message is modified in place and also returned.
        """

        def replace_sensitive(value: str) -> str:
            if not self.settings.sensitive_data:
                return value
            for key, val in self.settings.sensitive_data.items():
                # An empty secret would match everywhere, so it is skipped.
                if not val:
                    continue
                value = value.replace(val, f'<secret>{key}</secret>')
            return value

        if isinstance(message.content, str):
            message.content = replace_sensitive(message.content)
        elif isinstance(message.content, list):
            for i, item in enumerate(message.content):
                if isinstance(item, dict) and 'text' in item:
                    item['text'] = replace_sensitive(item['text'])
                    message.content[i] = item
        return message

    def _count_tokens(self, message: BaseMessage) -> int:
        """Estimate the token count of a message.

        Image parts cost ``settings.image_tokens`` each; text (plus the string
        form of any tool calls) is estimated from its length.
        """
        tokens = 0
        if isinstance(message.content, list):
            for item in message.content:
                if 'image_url' in item:
                    tokens += self.settings.image_tokens
                elif isinstance(item, dict) and 'text' in item:
                    tokens += self._count_text_tokens(item['text'])
        else:
            msg = message.content
            if hasattr(message, 'tool_calls'):
                msg += str(message.tool_calls)  # type: ignore
            tokens += self._count_text_tokens(msg)
        return tokens

    def _count_text_tokens(self, text: str) -> int:
        """Count tokens in a text string"""
        tokens = len(text) // self.settings.estimated_characters_per_token  # Rough estimate if no tokenizer available
        return tokens

    def cut_messages(self) -> None:
        """Trim the last message so the history fits ``settings.max_input_tokens``.

        Images are dropped from the last message first; if that is not enough,
        the tail of its text is cut proportionally to the tokens still over
        budget.

        Raises:
            ValueError: If (almost) the entire last message would have to be
                removed to fit the budget.
        """
        diff = self.state.history.current_tokens - self.settings.max_input_tokens
        if diff <= 0:
            return None

        msg = self.state.history.messages[-1]

        # if list with image remove image
        if isinstance(msg.message.content, list):
            text = ''
            # Iterate over a snapshot and only collect the text: removing items
            # from the list being iterated would skip the following item. The
            # content is replaced by the collected text below anyway.
            for item in list(msg.message.content):
                if 'image_url' in item:
                    diff -= self.settings.image_tokens
                    msg.metadata.tokens -= self.settings.image_tokens
                    self.state.history.current_tokens -= self.settings.image_tokens
                    logger.debug(
                        f'Removed image with {self.settings.image_tokens} tokens - total tokens now: {self.state.history.current_tokens}/{self.settings.max_input_tokens}'
                    )
                elif isinstance(item, dict) and 'text' in item:
                    text += item['text']
            msg.message.content = text
            self.state.history.messages[-1] = msg

        if diff <= 0:
            return None

        # if still over, remove text from state message proportionally to the number of tokens needed with buffer
        # Calculate the proportion of content to remove; a message without
        # tokens cannot be shortened, which is reported like any other overflow.
        proportion_to_remove = diff / msg.metadata.tokens if msg.metadata.tokens > 0 else float('inf')
        if proportion_to_remove > 0.99:
            raise ValueError(
                f'Max token limit reached - history is too long - reduce the system prompt or task. '
                f'proportion_to_remove: {proportion_to_remove}'
            )
        logger.debug(
            f'Removing {proportion_to_remove * 100:.2f}% of the last message {proportion_to_remove * msg.metadata.tokens:.2f} / {msg.metadata.tokens:.2f} tokens)'
        )

        content = msg.message.content
        characters_to_remove = int(len(content) * proportion_to_remove)
        # ``content[:-0]`` would be the empty string, so only slice when there
        # is something to remove.
        if characters_to_remove > 0:
            content = content[:-characters_to_remove]

        # remove tokens and old long message
        self.state.history.remove_last_state_message()

        # new message with updated content
        msg = HumanMessage(content=content)
        self._add_message_with_tokens(msg)

        last_msg = self.state.history.messages[-1]

        logger.debug(
            f'Added message with {last_msg.metadata.tokens} tokens - total tokens now: {self.state.history.current_tokens}/{self.settings.max_input_tokens} - total messages: {len(self.state.history.messages)}'
        )

    def _remove_last_state_message(self) -> None:
        """Remove last state message from history"""
        self.state.history.remove_last_state_message()

    def add_tool_message(self, content: str) -> None:
        """Add tool message to history

        The message answers the tool call with the current ``state.tool_id``;
        the id is then incremented for the next call.
        """
        msg = ToolMessage(content=content, tool_call_id=str(self.state.tool_id))
        self.state.tool_id += 1
        self._add_message_with_tokens(msg)
browser_use/agent/message_manager/tests.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+ from langchain_anthropic import ChatAnthropic
3
+ from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
4
+ from langchain_openai import AzureChatOpenAI, ChatOpenAI
5
+
6
+ from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings
7
+ from browser_use.agent.views import ActionResult
8
+ from browser_use.browser.views import BrowserState, TabInfo
9
+ from browser_use.dom.views import DOMElementNode, DOMTextNode
10
+
11
+
12
@pytest.fixture(
    params=[
        ChatOpenAI(model='gpt-4o-mini'),
        AzureChatOpenAI(model='gpt-4o', api_version='2024-02-15-preview'),
        ChatAnthropic(model_name='claude-3-5-sonnet-20240620', timeout=100, temperature=0.0, stop=None),
    ],
    ids=['gpt-4o-mini', 'gpt-4o', 'claude-3-5-sonnet'],
)
def message_manager(request: pytest.FixtureRequest):
    """Provide a MessageManager with a small token budget, once per parametrized model.

    The model parameter only drives the parametrization; it is not passed to
    the manager.
    """
    small_budget = MessageManagerSettings(
        max_input_tokens=1000,
        estimated_characters_per_token=3,
        image_tokens=800,
    )
    system_message = SystemMessage(content='Test actions')
    return MessageManager(task='Test task', system_message=system_message, settings=small_budget)
32
+
33
+
34
def test_initial_messages(message_manager: MessageManager):
    """Test that message manager initializes with system and task messages"""
    messages = message_manager.get_messages()
    # The manager also seeds an example exchange and a memory placeholder, so
    # only the leading and trailing messages are pinned, not the total count.
    assert len(messages) >= 2
    assert isinstance(messages[0], SystemMessage)
    assert isinstance(messages[1], HumanMessage)
    assert 'Test task' in messages[1].content
    assert isinstance(messages[-1], HumanMessage)
    assert messages[-1].content == '[Your task history memory starts here]'
41
+
42
+
43
def test_add_state_message(message_manager: MessageManager):
    """Test adding browser state message"""
    state = BrowserState(
        url='https://test.com',
        title='Test Page',
        element_tree=DOMElementNode(
            tag_name='div',
            attributes={},
            children=[],
            is_visible=True,
            parent=None,
            xpath='//div',
        ),
        selector_map={},
        tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
    )
    # Compare against the seeded history instead of hard-coding its length.
    baseline = len(message_manager.get_messages())
    message_manager.add_state_message(state)

    messages = message_manager.get_messages()
    assert len(messages) == baseline + 1
    assert isinstance(messages[-1], HumanMessage)
    assert 'https://test.com' in messages[-1].content
65
+
66
+
67
def test_add_state_with_memory_result(message_manager: MessageManager):
    """Test adding state with result that should be included in memory"""
    state = BrowserState(
        url='https://test.com',
        title='Test Page',
        element_tree=DOMElementNode(
            tag_name='div',
            attributes={},
            children=[],
            is_visible=True,
            parent=None,
            xpath='//div',
        ),
        selector_map={},
        tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
    )
    result = ActionResult(extracted_content='Important content', include_in_memory=True)

    # Compare against the seeded history instead of hard-coding its length.
    baseline = len(message_manager.get_messages())
    message_manager.add_state_message(state, [result])
    messages = message_manager.get_messages()

    # The extracted content gets its own message, followed by the state message
    assert len(messages) == baseline + 2
    assert 'Important content' in messages[-2].content
    assert isinstance(messages[-2], HumanMessage)
    assert isinstance(messages[-1], HumanMessage)
    assert 'Important content' not in messages[-1].content
94
+
95
+
96
def test_add_state_with_non_memory_result(message_manager: MessageManager):
    """Test adding state with result that should not be included in memory"""
    state = BrowserState(
        url='https://test.com',
        title='Test Page',
        element_tree=DOMElementNode(
            tag_name='div',
            attributes={},
            children=[],
            is_visible=True,
            parent=None,
            xpath='//div',
        ),
        selector_map={},
        tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
    )
    result = ActionResult(extracted_content='Temporary content', include_in_memory=False)

    # Compare against the seeded history instead of hard-coding its length.
    baseline = len(message_manager.get_messages())
    message_manager.add_state_message(state, [result])
    messages = message_manager.get_messages()

    # Only one message is added: the combined state+result message
    assert len(messages) == baseline + 1
    assert 'Temporary content' in messages[-1].content
    assert isinstance(messages[-1], HumanMessage)
121
+
122
+
123
@pytest.mark.skip('not sure how to fix this')
@pytest.mark.parametrize('max_tokens', [100000, 10000, 5000])
def test_token_overflow_handling_with_real_flow(message_manager: MessageManager, max_tokens):
    """Test handling of token overflow in a realistic message flow

    Currently skipped. Each iteration adds a growing state message (with or
    without an action result), checks the history, replaces the state message
    with a model output and re-checks the stored token bookkeeping.
    """
    # Set more realistic token limit
    message_manager.settings.max_input_tokens = max_tokens

    # Create a long sequence of interactions
    for i in range(200):  # Simulate 200 steps of interaction
        # Create state with varying content length
        state = BrowserState(
            url=f'https://test{i}.com',
            title=f'Test Page {i}',
            element_tree=DOMElementNode(
                tag_name='div',
                attributes={},
                children=[
                    DOMTextNode(
                        text=f'Content {j} ' * (10 + i),  # Increasing content length
                        is_visible=True,
                        parent=None,
                    )
                    for j in range(5)  # Multiple DOM items
                ],
                is_visible=True,
                parent=None,
                xpath='//div',
            ),
            selector_map={j: f'//div[{j}]' for j in range(5)},
            tabs=[TabInfo(page_id=1, url=f'https://test{i}.com', title=f'Test Page {i}')],
        )

        # Alternate between different types of results
        result = None
        if i % 2 == 0:  # Every other iteration
            result = ActionResult(
                extracted_content=f'Important content from step {i}' * 5,
                include_in_memory=i % 4 == 0,  # Include in memory every 4th message
            )

        # Add state message
        if result:
            message_manager.add_state_message(state, [result])
        else:
            message_manager.add_state_message(state)

        try:
            messages = message_manager.get_messages()
        except ValueError as e:
            if 'Max token limit reached - history is too long' in str(e):
                return  # If error occurs, end the test
            else:
                raise e

        # Allow a small slack of 100 tokens above the configured limit
        assert message_manager.state.history.current_tokens <= message_manager.settings.max_input_tokens + 100

        last_msg = messages[-1]
        assert isinstance(last_msg, HumanMessage)

        # In-memory results are stored as their own message before the state message
        if i % 4 == 0:
            assert isinstance(message_manager.state.history.messages[-2].message, HumanMessage)
        # Non-memory results are part of the state message itself
        if i % 2 == 0 and not i % 4 == 0:
            if isinstance(last_msg.content, list):
                assert 'Current url: https://test' in last_msg.content[0]['text']
            else:
                assert 'Current url: https://test' in last_msg.content

        # Add model output every time
        from browser_use.agent.views import AgentBrain, AgentOutput
        from browser_use.controller.registry.views import ActionModel

        output = AgentOutput(
            current_state=AgentBrain(
                evaluation_previous_goal=f'Success in step {i}',
                memory=f'Memory from step {i}',
                next_goal=f'Goal for step {i + 1}',
            ),
            action=[ActionModel()],
        )
        # The state message is dropped before the model output is recorded
        message_manager._remove_last_state_message()
        message_manager.add_model_output(output)

        # Get messages and verify after each addition
        messages = [m.message for m in message_manager.state.history.messages]

        # Verify token limit is respected

        # Verify essential messages are preserved
        assert isinstance(messages[0], SystemMessage)  # System prompt always first
        assert isinstance(messages[1], HumanMessage)  # Task always second
        assert 'Test task' in messages[1].content

        # Verify structure of latest messages
        assert isinstance(messages[-1], AIMessage)  # Last message should be model output
        assert f'step {i}' in messages[-1].content  # Should contain current step info

        # Log token usage for debugging
        token_usage = message_manager.state.history.current_tokens
        token_limit = message_manager.settings.max_input_tokens
        # print(f'Step {i}: Using {token_usage}/{token_limit} tokens')

        # go through all messages and verify that the token count and total tokens is correct
        total_tokens = 0
        real_tokens = []
        stored_tokens = []
        for msg in message_manager.state.history.messages:
            total_tokens += msg.metadata.tokens
            stored_tokens.append(msg.metadata.tokens)
            real_tokens.append(message_manager._count_tokens(msg.message))
        assert total_tokens == sum(real_tokens)
        assert stored_tokens == real_tokens
        assert message_manager.state.history.current_tokens == total_tokens
235
+
236
+
237
+ # pytest -s browser_use/agent/message_manager/tests.py
browser_use/agent/message_manager/utils.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import logging
5
+ import os
6
+ from typing import Any, Optional, Type
7
+
8
+ from langchain_core.messages import (
9
+ AIMessage,
10
+ BaseMessage,
11
+ HumanMessage,
12
+ SystemMessage,
13
+ ToolMessage,
14
+ )
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
def extract_json_from_model_output(content: str) -> dict:
    """Extract JSON from model output, handling both plain JSON and code-block-wrapped JSON.

    Args:
        content: Raw model output. Either a bare JSON document or text that
            contains a triple-backtick fenced block holding the JSON.

    Returns:
        The value produced by ``json.loads`` for the extracted text.

    Raises:
        ValueError: If the extracted text is not valid JSON. The original
            ``json.JSONDecodeError`` is attached as ``__cause__``.
    """
    try:
        # If content is wrapped in code blocks, extract just the JSON part
        if '```' in content:
            # Text between the first pair of fences
            content = content.split('```')[1]
            first_line, newline, remainder = content.partition('\n')
            # Drop the first line only when it is a language identifier
            # (e.g. 'json') or empty. If the JSON itself starts right after
            # the fence, removing that line would truncate the payload.
            if newline and not first_line.lstrip().startswith(('{', '[')):
                content = remainder
        # Parse the cleaned content
        return json.loads(content)
    except json.JSONDecodeError as e:
        logger.warning(f'Failed to parse model output: {content} {str(e)}')
        raise ValueError('Could not parse response.') from e
34
+
35
+
36
def convert_input_messages(input_messages: list[BaseMessage], model_name: Optional[str]) -> list[BaseMessage]:
    """Adapt a message list for planner models that need a special layout.

    Only the DeepSeek reasoning models (name equal to 'deepseek-reasoner' or
    containing 'deepseek-r1') are rewritten: messages are first converted for
    non-function-calling models, then runs of successive human messages and
    runs of successive AI messages are merged. For any other model name,
    including None, the input list is returned unchanged.
    """
    is_deepseek_reasoner = model_name is not None and (
        model_name == 'deepseek-reasoner' or 'deepseek-r1' in model_name
    )
    if not is_deepseek_reasoner:
        return input_messages

    flattened = _convert_messages_for_non_function_calling_models(input_messages)
    humans_merged = _merge_successive_messages(flattened, HumanMessage)
    return _merge_successive_messages(humans_merged, AIMessage)
46
+
47
+
48
def _convert_messages_for_non_function_calling_models(input_messages: list[BaseMessage]) -> list[BaseMessage]:
    """Rewrite a conversation so it contains no tool-call specific messages.

    Human and system messages pass through untouched. A tool message becomes a
    human message with the same content. An AI message that carries tool calls
    is replaced by an AI message whose content is the JSON dump of those tool
    calls; an AI message without tool calls passes through.

    Raises:
        ValueError: If a message is none of the four handled types.
    """
    output_messages = []
    for message in input_messages:
        if isinstance(message, (HumanMessage, SystemMessage)):
            converted = message
        elif isinstance(message, ToolMessage):
            converted = HumanMessage(content=message.content)
        elif isinstance(message, AIMessage):
            # Serialise the tool calls into plain text content when present
            converted = AIMessage(content=json.dumps(message.tool_calls)) if message.tool_calls else message
        else:
            raise ValueError(f'Unknown message type: {type(message)}')
        output_messages.append(converted)
    return output_messages
68
+
69
+
70
def _merge_successive_messages(messages: list[BaseMessage], class_to_merge: Type[BaseMessage]) -> list[BaseMessage]:
    """Merge runs of consecutive messages of one type into a single message.

    Some models like deepseek-reasoner dont allow multiple human messages in a
    row. Every run of successive ``class_to_merge`` instances is collapsed into
    one message whose content is the concatenation of the run.

    The objects in ``messages`` are never modified: the first message of a run
    is shallow-copied before anything is appended to it, and list contents are
    rebuilt rather than extended in place.

    Args:
        messages: Conversation to process, in order.
        class_to_merge: Message class whose consecutive occurrences are merged.

    Returns:
        A new list. Messages that were not merged are the original objects.
    """
    import copy  # local import: only this helper needs it

    merged_messages: list[BaseMessage] = []
    streak = 0
    for message in messages:
        if not isinstance(message, class_to_merge):
            merged_messages.append(message)
            streak = 0
            continue

        streak += 1
        if streak == 1:
            merged_messages.append(message)
            continue

        if streak == 2:
            # Copy-on-first-merge so the caller's message keeps its content
            merged_messages[-1] = copy.copy(merged_messages[-1])
        target = merged_messages[-1]

        # For multi-part content only the text of the first part is merged
        if isinstance(message.content, list):
            extra_text = message.content[0]['text']  # type:ignore
        else:
            extra_text = message.content

        if isinstance(target.content, list):
            # `list += str` would add one element per character; append a text part instead
            target.content = target.content + [{'type': 'text', 'text': extra_text}]
        else:
            target.content = target.content + extra_text
    return merged_messages
88
+
89
+
90
def save_conversation(input_messages: list[BaseMessage], response: Any, target: str, encoding: Optional[str] = None) -> None:
    """Save conversation history to file.

    Args:
        input_messages: Messages that were sent to the model.
        response: Model response; written via ``_write_response_to_file``.
        target: Path of the file to (over)write.
        encoding: Text encoding passed to ``open``; None uses the platform default.
    """
    # Create the parent folder if needed. A bare filename has an empty dirname,
    # and os.makedirs('') raises FileNotFoundError, so skip it in that case.
    parent_dir = os.path.dirname(target)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(
        target,
        'w',
        encoding=encoding,
    ) as f:
        _write_messages_to_file(f, input_messages)
        _write_response_to_file(f, response)
103
+
104
+
105
def _write_messages_to_file(f: Any, messages: list[BaseMessage]) -> None:
    """Dump each message to ``f`` as a class-name header followed by its text.

    List content: every dict part of type 'text' is written stripped, one per
    line; other parts are skipped. String content: pretty-printed when it
    parses as JSON, otherwise written stripped. A blank line ends each message.
    """
    for message in messages:
        f.write(f' {message.__class__.__name__} \n')

        body = message.content
        if isinstance(body, list):
            for part in body:
                if not isinstance(part, dict) or part.get('type') != 'text':
                    continue
                f.write(part['text'].strip() + '\n')
        elif isinstance(body, str):
            try:
                rendered = json.dumps(json.loads(body), indent=2)
            except json.JSONDecodeError:
                rendered = body.strip()
            f.write(rendered + '\n')

        f.write('\n')
122
+
123
+
124
def _write_response_to_file(f: Any, response: Any) -> None:
    """Append a RESPONSE header and the model response as indented JSON.

    The response is serialised with ``model_dump_json(exclude_unset=True)``
    and re-encoded with a two-space indent.
    """
    f.write(' RESPONSE\n')
    payload = json.loads(response.model_dump_json(exclude_unset=True))
    f.write(json.dumps(payload, indent=2))
browser_use/agent/message_manager/views.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from langchain_core.load import dumpd, load
6
+ from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage
7
+ from pydantic import BaseModel, ConfigDict, Field, model_serializer, model_validator
8
+
9
+ if TYPE_CHECKING:
10
+ from browser_use.agent.views import AgentOutput
11
+
12
+
13
class MessageMetadata(BaseModel):
    """Metadata for a message"""

    # Token count attributed to the message; 0 when not provided.
    tokens: int = 0
17
+
18
+
19
class ManagedMessage(BaseModel):
    """A message with its metadata"""

    # The wrapped langchain message.
    message: BaseMessage
    # Per-message bookkeeping; a fresh MessageMetadata is created when omitted.
    metadata: MessageMetadata = Field(default_factory=MessageMetadata)

    model_config = ConfigDict(arbitrary_types_allowed=True)

    # https://github.com/pydantic/pydantic/discussions/7558
    @model_serializer(mode='wrap')
    def to_json(self, original_dump):
        """
        Returns the serialized representation of the model.

        The default pydantic dump is produced first, then the `message`
        entry is replaced with the output of langchain's `dumpd`.
        """
        data = original_dump(self)

        # NOTE: We override the message field to use langchain serialization.
        data['message'] = dumpd(self.message)

        return data

    @model_validator(mode='before')
    @classmethod
    def validate(
        cls,
        value: Any,
        *,
        strict: bool | None = None,
        from_attributes: bool | None = None,
        context: Any | None = None,
    ) -> Any:
        """
        Custom before-validator that passes the `message` entry through
        langchain's `load` when the input is a dict containing that key.

        The incoming dict is updated in place. The keyword-only parameters
        are accepted but not used here.
        """
        if isinstance(value, dict) and 'message' in value:
            # NOTE: We use langchain's load to turn the serialized message back into a message object.
            value['message'] = load(value['message'])
        return value
61
+
62
+
63
class MessageHistory(BaseModel):
    """Ordered log of managed messages together with a running token total."""

    messages: list[ManagedMessage] = Field(default_factory=list)
    current_tokens: int = 0

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def add_message(self, message: BaseMessage, metadata: MessageMetadata, position: int | None = None) -> None:
        """Store a message (appended, or inserted at `position`) and count its tokens."""
        managed = ManagedMessage(message=message, metadata=metadata)
        if position is not None:
            self.messages.insert(position, managed)
        else:
            self.messages.append(managed)
        self.current_tokens += metadata.tokens

    def add_model_output(self, output: 'AgentOutput') -> None:
        """Record the model output as an AI tool call followed by an empty tool reply."""
        call = {
            'name': 'AgentOutput',
            'args': output.model_dump(mode='json', exclude_unset=True),
            'id': '1',
            'type': 'tool_call',
        }
        ai_message = AIMessage(content='', tool_calls=[call])
        # Token numbers below are fixed estimates, not measured counts
        self.add_message(ai_message, MessageMetadata(tokens=100))

        # Empty tool response answering the call above
        empty_reply = ToolMessage(content='', tool_call_id='1')
        self.add_message(empty_reply, MessageMetadata(tokens=10))

    def get_messages(self) -> list[BaseMessage]:
        """Return the bare messages, without metadata, in order."""
        return [managed.message for managed in self.messages]

    def get_total_tokens(self) -> int:
        """Return the running token total."""
        return self.current_tokens

    def remove_oldest_message(self) -> None:
        """Drop the first message that is not a system message, if any."""
        for index, managed in enumerate(self.messages):
            if isinstance(managed.message, SystemMessage):
                continue
            self.current_tokens -= managed.metadata.tokens
            del self.messages[index]
            break

    def remove_last_state_message(self) -> None:
        """Drop the final message when it is a human message and more than two messages exist."""
        if len(self.messages) <= 2:
            return
        last = self.messages[-1]
        if not isinstance(last.message, HumanMessage):
            return
        self.current_tokens -= last.metadata.tokens
        self.messages.pop()
121
+
122
+
123
class MessageManagerState(BaseModel):
    """Holds the state for MessageManager"""

    # Message log with its token total; starts as an empty MessageHistory.
    history: MessageHistory = Field(default_factory=MessageHistory)
    # Integer id, starting at 1. Not read anywhere in this module.
    # NOTE(review): presumably the next tool-call id used by MessageManager - confirm.
    tool_id: int = 1

    model_config = ConfigDict(arbitrary_types_allowed=True)
browser_use/agent/prompts.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datetime
2
+ import importlib.resources
3
+ from datetime import datetime
4
+ from typing import TYPE_CHECKING, List, Optional
5
+
6
+ from langchain_core.messages import HumanMessage, SystemMessage
7
+
8
+ if TYPE_CHECKING:
9
+ from browser_use.agent.views import ActionResult, AgentStepInfo
10
+ from browser_use.browser.views import BrowserState
11
+
12
+
13
class SystemPrompt:
    """Builds the system message handed to the agent's language model.

    The prompt is either the caller's override text or the packaged
    ``system_prompt.md`` template formatted with ``max_actions``; an optional
    extension is appended on a new line in both cases.
    """

    def __init__(
        self,
        action_description: str,
        max_actions_per_step: int = 10,
        override_system_message: Optional[str] = None,
        extend_system_message: Optional[str] = None,
    ):
        """
        Args:
            action_description: Description of the available actions; stored
                on the instance but not inserted into the prompt here.
            max_actions_per_step: Value substituted for ``{max_actions}`` in
                the template.
            override_system_message: When truthy, used instead of the template.
            extend_system_message: When truthy, appended after a newline.

        Raises:
            RuntimeError: If the packaged template cannot be loaded.
        """
        self.default_action_description = action_description
        self.max_actions_per_step = max_actions_per_step
        if override_system_message:
            prompt = override_system_message
        else:
            self._load_prompt_template()
            prompt = self.prompt_template.format(max_actions=self.max_actions_per_step)

        if extend_system_message:
            prompt += f'\n{extend_system_message}'

        self.system_message = SystemMessage(content=prompt)

    def _load_prompt_template(self) -> None:
        """Load the prompt template from the markdown file.

        Raises:
            RuntimeError: On any failure; the underlying exception is chained.
        """
        try:
            # This works both in development and when installed as a package
            template_file = importlib.resources.files('browser_use.agent').joinpath('system_prompt.md')
            # Explicit UTF-8 so decoding does not depend on the platform locale
            with template_file.open('r', encoding='utf-8') as f:
                self.prompt_template = f.read()
        except Exception as e:
            raise RuntimeError(f'Failed to load system prompt template: {e}') from e

    def get_system_message(self) -> SystemMessage:
        """
        Get the system prompt for the agent.

        Returns:
            SystemMessage: Formatted system prompt
        """
        return self.system_message
52
+
53
+
54
+ # Functions:
55
+ # {self.default_action_description}
56
+
57
+ # Example:
58
+ # {self.example_response()}
59
+ # Your AVAILABLE ACTIONS:
60
+ # {self.default_action_description}
61
+
62
+
63
class AgentMessagePrompt:
    """Renders the current browser state (and previous action results) as a human message."""

    def __init__(
        self,
        state: 'BrowserState',
        result: Optional[List['ActionResult']] = None,
        include_attributes: Optional[List[str]] = None,
        step_info: Optional['AgentStepInfo'] = None,
    ):
        """
        Args:
            state: Browser state to describe.
            result: Results of the previous actions, if any.
            include_attributes: Attribute names forwarded to
                ``clickable_elements_to_string``; None means an empty list.
            step_info: Step counter information; omitted from the text when None.
        """
        self.state = state
        self.result = result
        # A fresh list per instance instead of a shared mutable default
        self.include_attributes = include_attributes if include_attributes is not None else []
        self.step_info = step_info

    def get_user_message(self, use_vision: bool = True) -> HumanMessage:
        """Build the state message.

        Args:
            use_vision: When true and the state has a screenshot, the message
                content is a text part plus a base64 PNG ``image_url`` part.

        Returns:
            A HumanMessage with either plain-text or text-plus-image content.
        """
        elements_text = self.state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes)

        has_content_above = (self.state.pixels_above or 0) > 0
        has_content_below = (self.state.pixels_below or 0) > 0

        if elements_text != '':
            if has_content_above:
                elements_text = (
                    f'... {self.state.pixels_above} pixels above - scroll or extract content to see more ...\n{elements_text}'
                )
            else:
                elements_text = f'[Start of page]\n{elements_text}'
            if has_content_below:
                elements_text = (
                    f'{elements_text}\n... {self.state.pixels_below} pixels below - scroll or extract content to see more ...'
                )
            else:
                elements_text = f'{elements_text}\n[End of page]'
        else:
            elements_text = 'empty page'

        if self.step_info:
            # Trailing newline keeps the step counter and the timestamp on
            # separate lines instead of running them together
            step_info_description = f'Current step: {self.step_info.step_number + 1}/{self.step_info.max_steps}\n'
        else:
            step_info_description = ''
        time_str = datetime.now().strftime('%Y-%m-%d %H:%M')
        step_info_description += f'Current date and time: {time_str}'

        state_description = f"""
[Task history memory ends]
[Current state starts here]
The following is one-time information - if you need to remember it write it to memory:
Current url: {self.state.url}
Available tabs:
{self.state.tabs}
Interactive elements from top layer of the current page inside the viewport:
{elements_text}
{step_info_description}
"""

        if self.result:
            for i, result in enumerate(self.result):
                if result.extracted_content:
                    state_description += f'\nAction result {i + 1}/{len(self.result)}: {result.extracted_content}'
                if result.error:
                    # only use last line of error
                    error = result.error.split('\n')[-1]
                    state_description += f'\nAction error {i + 1}/{len(self.result)}: ...{error}'

        if self.state.screenshot and use_vision:
            # Format message for vision model
            return HumanMessage(
                content=[
                    {'type': 'text', 'text': state_description},
                    {
                        'type': 'image_url',
                        'image_url': {'url': f'data:image/png;base64,{self.state.screenshot}'},  # , 'detail': 'low'
                    },
                ]
            )

        return HumanMessage(content=state_description)
139
+
140
+
141
class PlannerPrompt(SystemPrompt):
    """System prompt variant for the planner model."""

    def get_system_message(self) -> SystemMessage:
        """Return a fixed planner instruction message.

        The text is a constant; the prompt built by the base-class constructor
        is not used here. It asks for a JSON object with the fields
        state_analysis, progress_evaluation, challenges, next_steps and reasoning.
        """
        return SystemMessage(
            content="""You are a planning agent that helps break down tasks into smaller steps and reason about the current state.
Your role is to:
1. Analyze the current state and history
2. Evaluate progress towards the ultimate goal
3. Identify potential challenges or roadblocks
4. Suggest the next high-level steps to take

Inside your messages, there will be AI messages from different agents with different formats.

Your output format should be always a JSON object with the following fields:
{
"state_analysis": "Brief analysis of the current state and what has been done so far",
"progress_evaluation": "Evaluation of progress towards the ultimate goal (as percentage and description)",
"challenges": "List any potential challenges or roadblocks",
"next_steps": "List 2-3 concrete next steps to take",
"reasoning": "Explain your reasoning for the suggested next steps"
}

Ignore the other AI messages output structures.

Keep your responses concise and focused on actionable insights."""
        )
browser_use/agent/service.py ADDED
@@ -0,0 +1,964 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ import logging
6
+ import re
7
+ import time
8
+ from pathlib import Path
9
+ from typing import Any, Awaitable, Callable, Dict, Generic, List, Optional, TypeVar
10
+
11
+ from dotenv import load_dotenv
12
+ from langchain_core.language_models.chat_models import BaseChatModel
13
+ from langchain_core.messages import (
14
+ BaseMessage,
15
+ HumanMessage,
16
+ SystemMessage,
17
+ )
18
+
19
+ # from lmnr.sdk.decorators import observe
20
+ from pydantic import BaseModel, ValidationError
21
+
22
+ from browser_use.agent.gif import create_history_gif
23
+ from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings
24
+ from browser_use.agent.message_manager.utils import convert_input_messages, extract_json_from_model_output, save_conversation
25
+ from browser_use.agent.prompts import AgentMessagePrompt, PlannerPrompt, SystemPrompt
26
+ from browser_use.agent.views import (
27
+ ActionResult,
28
+ AgentError,
29
+ AgentHistory,
30
+ AgentHistoryList,
31
+ AgentOutput,
32
+ AgentSettings,
33
+ AgentState,
34
+ AgentStepInfo,
35
+ StepMetadata,
36
+ ToolCallingMethod,
37
+ )
38
+ from browser_use.browser.browser import Browser
39
+ from browser_use.browser.context import BrowserContext
40
+ from browser_use.browser.views import BrowserState, BrowserStateHistory
41
+ from browser_use.controller.registry.views import ActionModel
42
+ from browser_use.controller.service import Controller
43
+ from browser_use.dom.history_tree_processor.service import (
44
+ DOMHistoryElement,
45
+ HistoryTreeProcessor,
46
+ )
47
+ from browser_use.telemetry.service import ProductTelemetry
48
+ from browser_use.telemetry.views import (
49
+ AgentEndTelemetryEvent,
50
+ AgentRunTelemetryEvent,
51
+ AgentStepTelemetryEvent,
52
+ )
53
+ from browser_use.utils import time_execution_async, time_execution_sync
54
+
55
+ load_dotenv()
56
+ logger = logging.getLogger(__name__)
57
+
58
+
59
def log_response(response: AgentOutput) -> None:
    """Log the model's evaluation, memory, next goal and each chosen action at INFO level."""
    brain = response.current_state
    verdict = brain.evaluation_previous_goal

    # Pick an emoji from the wording of the self-evaluation
    if 'Success' in verdict:
        emoji = '👍'
    elif 'Failed' in verdict:
        emoji = '⚠'
    else:
        emoji = '🤷'

    logger.info(f'{emoji} Eval: {verdict}')
    logger.info(f'🧠 Memory: {brain.memory}')
    logger.info(f'🎯 Next goal: {brain.next_goal}')
    total = len(response.action)
    for position, action in enumerate(response.action, start=1):
        logger.info(f'🛠️ Action {position}/{total}: {action.model_dump_json(exclude_unset=True)}')
74
+
75
+
76
+ Context = TypeVar('Context')
77
+
78
+
79
+ class Agent(Generic[Context]):
80
@time_execution_sync('--init (agent)')
def __init__(
    self,
    task: str,
    llm: BaseChatModel,
    # Optional parameters
    browser: Browser | None = None,
    browser_context: BrowserContext | None = None,
    # NOTE(review): a default argument is evaluated once at definition time, so
    # agents created without an explicit controller share this one instance.
    controller: Controller[Context] = Controller(),
    # Initial agent run parameters
    sensitive_data: Optional[Dict[str, str]] = None,
    initial_actions: Optional[List[Dict[str, Dict[str, Any]]]] = None,
    # Cloud Callbacks
    register_new_step_callback: Callable[['BrowserState', 'AgentOutput', int], Awaitable[None]] | None = None,
    register_done_callback: Callable[['AgentHistoryList'], Awaitable[None]] | None = None,
    register_external_agent_status_raise_error_callback: Callable[[], Awaitable[bool]] | None = None,
    # Agent settings
    use_vision: bool = True,
    use_vision_for_planner: bool = False,
    save_conversation_path: Optional[str] = None,
    save_conversation_path_encoding: Optional[str] = 'utf-8',
    max_failures: int = 3,
    retry_delay: int = 10,
    override_system_message: Optional[str] = None,
    extend_system_message: Optional[str] = None,
    max_input_tokens: int = 128000,
    validate_output: bool = False,
    message_context: Optional[str] = None,
    generate_gif: bool | str = False,
    available_file_paths: Optional[list[str]] = None,
    include_attributes: list[str] = [
        'title',
        'type',
        'name',
        'role',
        'aria-label',
        'placeholder',
        'value',
        'alt',
        'aria-expanded',
        'data-date-format',
    ],
    max_actions_per_step: int = 10,
    tool_calling_method: Optional[ToolCallingMethod] = 'auto',
    page_extraction_llm: Optional[BaseChatModel] = None,
    planner_llm: Optional[BaseChatModel] = None,
    planner_interval: int = 1,  # Run planner every N steps
    # Inject state
    injected_agent_state: Optional[AgentState] = None,
    #
    context: Context | None = None,
):
    """Wire up settings, state, action models, message manager, browser and callbacks.

    Most keyword arguments are copied unchanged into ``AgentSettings``. The
    statement order below matters: action models and model names must exist
    before the tool calling method and message context are derived, and both
    are needed to build the message manager.
    """
    # The main llm doubles as the page extraction llm when none is given
    if page_extraction_llm is None:
        page_extraction_llm = llm

    # Core components
    self.task = task
    self.llm = llm
    self.controller = controller
    self.sensitive_data = sensitive_data

    self.settings = AgentSettings(
        use_vision=use_vision,
        use_vision_for_planner=use_vision_for_planner,
        save_conversation_path=save_conversation_path,
        save_conversation_path_encoding=save_conversation_path_encoding,
        max_failures=max_failures,
        retry_delay=retry_delay,
        override_system_message=override_system_message,
        extend_system_message=extend_system_message,
        max_input_tokens=max_input_tokens,
        validate_output=validate_output,
        message_context=message_context,
        generate_gif=generate_gif,
        available_file_paths=available_file_paths,
        include_attributes=include_attributes,
        max_actions_per_step=max_actions_per_step,
        tool_calling_method=tool_calling_method,
        page_extraction_llm=page_extraction_llm,
        planner_llm=planner_llm,
        planner_interval=planner_interval,
    )

    # Initialize state
    self.state = injected_agent_state or AgentState()

    # Action setup
    self._setup_action_models()
    self._set_browser_use_version_and_source()
    self.initial_actions = self._convert_initial_actions(initial_actions) if initial_actions else None

    # Model setup
    self._set_model_names()

    # for models without tool calling, add available actions to context
    self.available_actions = self.controller.registry.get_prompt_description()

    # _set_tool_calling_method reads model_name / chat_model_library set just above;
    # _set_message_context reads self.tool_calling_method and self.available_actions.
    self.tool_calling_method = self._set_tool_calling_method()
    self.settings.message_context = self._set_message_context()

    # Initialize message manager with state
    self._message_manager = MessageManager(
        task=task,
        system_message=SystemPrompt(
            action_description=self.available_actions,
            max_actions_per_step=self.settings.max_actions_per_step,
            override_system_message=override_system_message,
            extend_system_message=extend_system_message,
        ).get_system_message(),
        settings=MessageManagerSettings(
            max_input_tokens=self.settings.max_input_tokens,
            include_attributes=self.settings.include_attributes,
            message_context=self.settings.message_context,
            sensitive_data=sensitive_data,
            available_file_paths=self.settings.available_file_paths,
        ),
        state=self.state.message_manager_state,
    )

    # Browser setup
    # Remember whether browser / context were supplied by the caller
    self.injected_browser = browser is not None
    self.injected_browser_context = browser_context is not None
    # A new Browser is created only when neither a browser nor a context was passed
    self.browser = browser if browser is not None else (None if browser_context else Browser())
    if browser_context:
        self.browser_context = browser_context
    elif self.browser:
        self.browser_context = BrowserContext(browser=self.browser, config=self.browser.config.new_context_config)
    else:
        # NOTE(review): reached only if self.browser is falsy although no context
        # was given; looks like a defensive fallback - confirm it is reachable.
        self.browser = Browser()
        self.browser_context = BrowserContext(browser=self.browser)

    # Callbacks
    self.register_new_step_callback = register_new_step_callback
    self.register_done_callback = register_done_callback
    self.register_external_agent_status_raise_error_callback = register_external_agent_status_raise_error_callback

    # Context
    self.context = context

    # Telemetry
    self.telemetry = ProductTelemetry()

    if self.settings.save_conversation_path:
        logger.info(f'Saving conversation to {self.settings.save_conversation_path}')
224
+
225
def _set_message_context(self) -> str | None:
    """Return the message context, adding the action list for 'raw' tool calling.

    For the 'raw' method the available actions are appended to an existing
    context (after a blank line) or become the context when there is none; the
    updated value is written back to the settings. Other methods leave the
    context untouched.
    """
    settings = self.settings
    if self.tool_calling_method != 'raw':
        return settings.message_context

    actions_note = f'Available actions: {self.available_actions}'
    existing = settings.message_context
    settings.message_context = f'{existing}\n\n{actions_note}' if existing else actions_note
    return settings.message_context
232
+
233
def _set_browser_use_version_and_source(self) -> None:
    """Get the version and source of the browser-use package (git or pip in a nutshell)

    Sets ``self.version`` and ``self.source``. The source is 'git' when the
    directory three levels above this file contains all typical repository
    entries, 'pip' when the installed distribution metadata is used, and
    'unknown' when detection fails. Detection is best effort and never raises.
    """
    try:
        # First check for repository-specific files
        repo_files = ['.git', 'README.md', 'docs', 'examples']
        package_root = Path(__file__).parent.parent.parent

        # If all of these files/dirs exist, it's likely from git
        if all((package_root / name).exists() for name in repo_files):
            try:
                import subprocess

                # Run inside the checkout: without cwd, git would describe
                # whatever repository the process happens to be started from.
                version = (
                    subprocess.check_output(['git', 'describe', '--tags'], cwd=package_root).decode('utf-8').strip()
                )
            except Exception:
                version = 'unknown'
            source = 'git'
        else:
            # If no repo files found, read the version from the installed
            # distribution metadata (stdlib replacement for pkg_resources)
            from importlib.metadata import version as distribution_version

            version = distribution_version('browser-use')
            source = 'pip'
    except Exception:
        version = 'unknown'
        source = 'unknown'

    logger.debug(f'Version: {version}, Source: {source}')
    self.version = version
    self.source = source
262
+
263
def _set_model_names(self) -> None:
    """Record the llm class name plus the main and planner model names.

    The main model name comes from the llm's ``model_name`` attribute, else its
    ``model`` attribute; a missing attribute or a None value yields 'Unknown'.
    The planner name follows the same attribute order without the None check,
    and is None when no planner llm is configured.
    """
    llm = self.llm
    self.chat_model_library = llm.__class__.__name__
    self.model_name = 'Unknown'
    for attr in ('model_name', 'model'):
        if hasattr(llm, attr):
            value = getattr(llm, attr)
            self.model_name = 'Unknown' if value is None else value
            break  # only the first attribute that exists is consulted

    planner = self.settings.planner_llm
    if not planner:
        self.planner_model_name = None
    elif hasattr(planner, 'model_name'):
        self.planner_model_name = planner.model_name  # type: ignore
    elif hasattr(planner, 'model'):
        self.planner_model_name = planner.model  # type: ignore
    else:
        self.planner_model_name = 'Unknown'
282
+
283
def _setup_action_models(self) -> None:
    """Build the dynamic action and output models from the controller's registry."""
    registry = self.controller.registry

    # Full action set and the output model that wraps it
    self.ActionModel = registry.create_action_model()
    self.AgentOutput = AgentOutput.type_with_custom_actions(self.ActionModel)

    # Restricted to 'done': used to force the done action when max_steps is reached
    self.DoneActionModel = registry.create_action_model(include_actions=['done'])
    self.DoneAgentOutput = AgentOutput.type_with_custom_actions(self.DoneActionModel)
292
+
293
def _set_tool_calling_method(self) -> Optional[ToolCallingMethod]:
    """Resolve the tool calling method, expanding 'auto' from the model in use.

    Any configured value other than 'auto' is returned as is. For 'auto':
    DeepSeek reasoner / r1 model names give 'raw', the ChatOpenAI and
    AzureChatOpenAI libraries give 'function_calling', everything else
    (including ChatGoogleGenerativeAI) gives None.
    """
    configured = self.settings.tool_calling_method
    if configured != 'auto':
        return configured

    if any(marker in self.model_name for marker in ('deepseek-reasoner', 'deepseek-r1')):
        return 'raw'
    if self.chat_model_library in ('ChatOpenAI', 'AzureChatOpenAI'):
        return 'function_calling'
    return None
308
+
309
def add_new_task(self, new_task: str) -> None:
    """Forward a new task description to the message manager's `add_new_task`."""
    self._message_manager.add_new_task(new_task)
311
+
312
async def _raise_if_stopped_or_paused(self) -> None:
    """Raise InterruptedError when the agent should not continue.

    The external status callback, when registered, is awaited first and a
    truthy result raises. Otherwise the agent's own stopped / paused flags are
    checked.
    """
    external_check = self.register_external_agent_status_raise_error_callback
    if external_check and await external_check():
        raise InterruptedError

    agent_state = self.state
    if agent_state.stopped or agent_state.paused:
        logger.debug('Agent paused after getting state')
        raise InterruptedError
322
+
323
+ # @observe(name='agent.step', ignore_output=True, ignore_input=True)
324
@time_execution_async('--step (agent)')
async def step(self, step_info: Optional[AgentStepInfo] = None) -> None:
    """Execute one step of the task.

    Reads the browser state, adds it to the message history, optionally runs
    the planner, asks the LLM for the next actions, executes them through
    ``multi_act`` and stores the results in ``self.state.last_result``.
    Telemetry is always captured; a history item is recorded only when the
    step produced a non-empty result and a browser state was obtained.

    Args:
        step_info: Optional step counter information. When it reports the
            last step, the model is restricted to ``self.DoneAgentOutput``.
    """
    logger.info(f'📍 Step {self.state.n_steps}')
    state = None
    model_output = None
    result: list[ActionResult] = []
    step_start_time = time.time()
    tokens = 0

    try:
        state = await self.browser_context.get_state()

        await self._raise_if_stopped_or_paused()

        self._message_manager.add_state_message(state, self.state.last_result, step_info, self.settings.use_vision)

        # Run planner at specified intervals if planner is configured
        if self.settings.planner_llm and self.state.n_steps % self.settings.planner_interval == 0:
            plan = await self._run_planner()
            # add plan before last state message
            self._message_manager.add_plan(plan, position=-1)

        if step_info and step_info.is_last_step():
            # Add last step warning if needed
            msg = 'Now comes your last step. Use only the "done" action now. No other actions - so here your action sequence must have length 1.'
            msg += '\nIf the task is not yet fully finished as requested by the user, set success in "done" to false! E.g. if not all steps are fully completed.'
            msg += '\nIf the task is fully finished, set success in "done" to true.'
            msg += '\nInclude everything you found out for the ultimate task in the done text.'
            logger.info('Last step finishing up')
            self._message_manager._add_message_with_tokens(HumanMessage(content=msg))
            # From here on the output schema is swapped to the done-only variant.
            self.AgentOutput = self.DoneAgentOutput

        input_messages = self._message_manager.get_messages()
        tokens = self._message_manager.state.history.current_tokens

        try:
            model_output = await self.get_next_action(input_messages)

            self.state.n_steps += 1

            if self.register_new_step_callback:
                await self.register_new_step_callback(state, model_output, self.state.n_steps)

            if self.settings.save_conversation_path:
                target = self.settings.save_conversation_path + f'_{self.state.n_steps}.txt'
                save_conversation(input_messages, model_output, target, self.settings.save_conversation_path_encoding)

            self._message_manager._remove_last_state_message()  # the full state should not stay in the chat history

            await self._raise_if_stopped_or_paused()

            self._message_manager.add_model_output(model_output)
        except Exception as e:
            # model call failed, remove last state message from history
            # NOTE(review): this handler also runs when the interrupt check above raises
            # after the state message was already removed, so the removal is attempted twice.
            self._message_manager._remove_last_state_message()
            raise e

        result: list[ActionResult] = await self.multi_act(model_output.action)

        self.state.last_result = result

        if len(result) > 0 and result[-1].is_done:
            logger.info(f'📄 Result: {result[-1].extracted_content}')

        self.state.consecutive_failures = 0

    except InterruptedError:
        logger.debug('Agent paused')
        self.state.last_result = [
            ActionResult(
                error='The agent was paused - now continuing actions might need to be repeated', include_in_memory=True
            )
        ]
        return
    except Exception as e:
        result = await self._handle_step_error(e)
        self.state.last_result = result

    finally:
        step_end_time = time.time()
        actions = [a.model_dump(exclude_unset=True) for a in model_output.action] if model_output else []
        self.telemetry.capture(
            AgentStepTelemetryEvent(
                agent_id=self.state.agent_id,
                step=self.state.n_steps,
                actions=actions,
                consecutive_failures=self.state.consecutive_failures,
                step_error=[r.error for r in result if r.error] if result else ['No result'],
            )
        )
        # NOTE(review): a `return` inside `finally` discards any exception that is still
        # propagating (including ones not caught above) - confirm this is intended.
        if not result:
            return

        if state:
            metadata = StepMetadata(
                step_number=self.state.n_steps,
                step_start_time=step_start_time,
                step_end_time=step_end_time,
                input_tokens=tokens,
            )
            self._make_history_item(model_output, state, result, metadata)
426
+
427
@time_execution_async('--handle_step_error (agent)')
async def _handle_step_error(self, error: Exception) -> list[ActionResult]:
    """Handle all types of errors that can occur during a step.

    The error is formatted (with a traceback only when DEBUG logging is
    enabled), logged, and counted as one consecutive failure.

    * ``ValidationError`` / ``ValueError``: a token-limit message triggers a
      history cut; a parse failure gets a hint appended for the model.
    * Provider rate-limit errors: logged as a warning, then the coroutine
      sleeps for ``settings.retry_delay`` seconds.
    * Anything else: logged as an error.

    Args:
        error: The exception raised while executing the step.

    Returns:
        A single-element list with an ``ActionResult`` carrying the error
        message, flagged to be kept in memory.
    """
    include_trace = logger.isEnabledFor(logging.DEBUG)
    error_msg = AgentError.format_error(error, include_trace=include_trace)
    prefix = f'❌ Result failed {self.state.consecutive_failures + 1}/{self.settings.max_failures} times:\n '

    if isinstance(error, (ValidationError, ValueError)):
        logger.error(f'{prefix}{error_msg}')
        if 'Max token limit reached' in error_msg:
            # cut tokens from history
            self._message_manager.settings.max_input_tokens = self.settings.max_input_tokens - 500
            logger.info(
                f'Cutting tokens from history - new max input tokens: {self._message_manager.settings.max_input_tokens}'
            )
            self._message_manager.cut_messages()
        elif 'Could not parse response' in error_msg:
            # give model a hint how output should look like
            error_msg += '\n\nReturn a valid JSON object with the required fields.'
    else:
        # The provider SDKs are imported lazily and individually guarded: a missing
        # optional package must not raise ImportError from inside the error handler
        # and thereby hide the error that is actually being handled.
        rate_limit_errors: list[type[BaseException]] = []
        try:
            from google.api_core.exceptions import ResourceExhausted

            rate_limit_errors.append(ResourceExhausted)
        except ImportError:
            pass  # google-api-core not installed: its rate-limit error cannot occur
        try:
            from openai import RateLimitError

            rate_limit_errors.append(RateLimitError)
        except ImportError:
            pass  # openai not installed: its rate-limit error cannot occur

        # isinstance() with an empty tuple is simply False.
        if isinstance(error, tuple(rate_limit_errors)):
            logger.warning(f'{prefix}{error_msg}')
            await asyncio.sleep(self.settings.retry_delay)
        else:
            logger.error(f'{prefix}{error_msg}')

    # Every branch counts as exactly one more consecutive failure.
    self.state.consecutive_failures += 1

    return [ActionResult(error=error_msg, include_in_memory=True)]
461
+
462
def _make_history_item(
    self,
    model_output: AgentOutput | None,
    state: BrowserState,
    result: list[ActionResult],
    metadata: Optional[StepMetadata] = None,
) -> None:
    """Append one ``AgentHistory`` entry for the finished step.

    Args:
        model_output: The model's output for the step, or ``None`` if the
            model call did not produce one.
        state: Browser state captured at the start of the step.
        result: Action results produced by the step.
        metadata: Optional timing and token metadata.
    """
    # Without a model output there are no actions, so a single placeholder is stored.
    interacted_elements = (
        AgentHistory.get_interacted_element(model_output, state.selector_map) if model_output else [None]
    )

    snapshot = BrowserStateHistory(
        url=state.url,
        title=state.title,
        tabs=state.tabs,
        interacted_element=interacted_elements,
        screenshot=state.screenshot,
    )

    self.state.history.history.append(
        AgentHistory(model_output=model_output, result=result, state=snapshot, metadata=metadata)
    )
487
+
488
# Matches one complete <think>...</think> span; non-greedy, and DOTALL lets it cross newlines.
THINK_TAGS = re.compile(r'<think>.*?</think>', re.DOTALL)
# Matches any text up to and including a closing </think> tag (used for unmatched closers).
STRAY_CLOSE_TAG = re.compile(r'.*?</think>', re.DOTALL)
490
+
491
def _remove_think_tags(self, text: str) -> str:
    """Return ``text`` with ``<think>`` reasoning markup removed and whitespace trimmed.

    Well-formed ``<think>...</think>`` spans are dropped first. If a closing
    ``</think>`` is still present afterwards, everything up to and including
    it is dropped as well.
    """
    without_blocks = self.THINK_TAGS.sub('', text)
    without_stray_close = self.STRAY_CLOSE_TAG.sub('', without_blocks)
    return without_stray_close.strip()
498
+
499
def _convert_input_messages(self, input_messages: list[BaseMessage]) -> list[BaseMessage]:
    """Return the messages in the format the configured model expects.

    Only DeepSeek reasoning models (``deepseek-reasoner`` or any name
    containing ``deepseek-r1``) are passed through ``convert_input_messages``;
    for every other model the list is returned unchanged.
    """
    needs_conversion = self.model_name == 'deepseek-reasoner' or 'deepseek-r1' in self.model_name
    if not needs_conversion:
        return input_messages
    return convert_input_messages(input_messages, self.model_name)
505
+
506
@time_execution_async('--get_next_action (agent)')
async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
    """Get next action from LLM based on current state.

    With ``tool_calling_method == 'raw'`` the model's text is stripped of
    ``<think>`` markup and parsed as JSON into ``self.AgentOutput``. In every
    other case the model is wrapped with ``with_structured_output``; the
    ``method`` argument is passed only when a tool calling method is set.

    Args:
        input_messages: The conversation to send to the model.

    Returns:
        The parsed output, with its action list truncated to
        ``settings.max_actions_per_step`` entries.

    Raises:
        ValueError: If the response could not be parsed.
    """
    input_messages = self._convert_input_messages(input_messages)

    if self.tool_calling_method == 'raw':
        # Use the async API like the other branches; a synchronous invoke() here
        # would block the event loop for the whole model call.
        output = await self.llm.ainvoke(input_messages)
        # TODO: currently invoke does not return reasoning_content, we should override invoke
        output.content = self._remove_think_tags(str(output.content))
        try:
            parsed_json = extract_json_from_model_output(output.content)
            parsed = self.AgentOutput(**parsed_json)
        except (ValueError, ValidationError) as e:
            logger.warning(f'Failed to parse model output: {output} {str(e)}')
            # Keep the original parse failure attached as the cause.
            raise ValueError('Could not parse response.') from e
    else:
        structured_kwargs: dict[str, Any] = {'include_raw': True}
        if self.tool_calling_method is not None:
            structured_kwargs['method'] = self.tool_calling_method
        structured_llm = self.llm.with_structured_output(self.AgentOutput, **structured_kwargs)
        response: dict[str, Any] = await structured_llm.ainvoke(input_messages)  # type: ignore
        parsed: AgentOutput | None = response['parsed']

    if parsed is None:
        raise ValueError('Could not parse response.')

    # cut the number of actions to max_actions_per_step if needed
    if len(parsed.action) > self.settings.max_actions_per_step:
        parsed.action = parsed.action[: self.settings.max_actions_per_step]

    log_response(parsed)

    return parsed
541
+
542
def _log_agent_run(self) -> None:
    """Log the start of a run and emit the run telemetry event."""
    logger.info(f'🚀 Starting task: {self.task}')
    logger.debug(f'Version: {self.version}, Source: {self.source}')

    run_event = AgentRunTelemetryEvent(
        agent_id=self.state.agent_id,
        use_vision=self.settings.use_vision,
        task=self.task,
        model_name=self.model_name,
        chat_model_library=self.chat_model_library,
        version=self.version,
        source=self.source,
    )
    self.telemetry.capture(run_event)
558
+
559
async def take_step(self) -> tuple[bool, bool]:
    """Take a step.

    Runs a single ``step()`` and reports whether the task has finished. When
    the history is done and ``settings.validate_output`` is enabled, the
    output is validated first.

    Returns:
        Tuple[bool, bool]: ``(is_done, is_valid)``. ``(False, False)`` while
        the task is still running, ``(True, False)`` when it finished but
        validation rejected the output, ``(True, True)`` otherwise.
    """
    await self.step()

    if not self.state.history.is_done():
        return False, False

    if self.settings.validate_output and not await self._validate_output():
        return True, False

    # log_completion() already awaits register_done_callback; calling the callback
    # here as well would notify the caller twice for a single completion.
    await self.log_completion()

    return True, True
579
+
580
+ # @observe(name='agent.run', ignore_output=True)
581
@time_execution_async('--run (agent)')
async def run(self, max_steps: int = 100) -> AgentHistoryList:
    """Execute the task with maximum number of steps.

    Runs any initial actions, then calls ``step()`` up to ``max_steps`` times,
    stopping early on too many consecutive failures, on a stop request, or
    when the history reports the task as done. The end-of-run telemetry,
    browser cleanup and optional GIF generation happen in ``finally``.

    Args:
        max_steps: Upper bound on the number of loop iterations.

    Returns:
        The accumulated agent history.
    """
    try:
        self._log_agent_run()

        # Execute initial actions if provided
        if self.initial_actions:
            result = await self.multi_act(self.initial_actions, check_for_new_elements=False)
            self.state.last_result = result

        for step in range(max_steps):
            # Check if we should stop due to too many failures
            if self.state.consecutive_failures >= self.settings.max_failures:
                logger.error(f'❌ Stopping due to {self.settings.max_failures} consecutive failures')
                break

            # Check control flags before each step
            if self.state.stopped:
                logger.info('Agent stopped')
                break

            while self.state.paused:
                await asyncio.sleep(0.2)  # Small delay to prevent CPU spinning
                if self.state.stopped:  # Allow stopping while paused
                    # This only leaves the pause loop; the stop itself is handled
                    # by the interrupt check inside step() and the flag check above.
                    break

            step_info = AgentStepInfo(step_number=step, max_steps=max_steps)
            await self.step(step_info)

            if self.state.history.is_done():
                # Validation is skipped on the final allowed step, where no retry is possible.
                if self.settings.validate_output and step < max_steps - 1:
                    if not await self._validate_output():
                        continue

                await self.log_completion()
                break
        else:
            # for/else: reached only when the loop ran out of steps without a break.
            logger.info('❌ Failed to complete task in maximum steps')

        return self.state.history
    finally:
        self.telemetry.capture(
            AgentEndTelemetryEvent(
                agent_id=self.state.agent_id,
                is_done=self.state.history.is_done(),
                success=self.state.history.is_successful(),
                steps=self.state.n_steps,
                max_steps_reached=self.state.n_steps >= max_steps,
                errors=self.state.history.errors(),
                total_input_tokens=self.state.history.total_input_tokens(),
                total_duration_seconds=self.state.history.total_duration_seconds(),
            )
        )

        # Only close browser resources that were not injected by the caller.
        if not self.injected_browser_context:
            await self.browser_context.close()

        if not self.injected_browser and self.browser:
            await self.browser.close()

        if self.settings.generate_gif:
            output_path: str = 'agent_history.gif'
            # generate_gif may be a string, in which case it is used as the output path.
            if isinstance(self.settings.generate_gif, str):
                output_path = self.settings.generate_gif

            create_history_gif(task=self.task, history=self.state.history, output_path=output_path)
648
+
649
+ # @observe(name='controller.multi_act')
650
@time_execution_async('--multi-act (agent)')
async def multi_act(
    self,
    actions: list[ActionModel],
    check_for_new_elements: bool = True,
) -> list[ActionResult]:
    """Execute a sequence of actions, stopping early when the page changes.

    Before every index-based action after the first, the page state is
    re-read; if ``check_for_new_elements`` is set and elements appear whose
    branch-path hash was not present at the start, the sequence is cut short
    with an informational result. Execution also stops after a ``done``
    result, after an error, and after the last action.

    Args:
        actions: Actions to execute in order.
        check_for_new_elements: Whether newly appeared elements interrupt
            the sequence.

    Returns:
        The results collected up to the point where the sequence ended.
    """
    results = []
    total = len(actions)

    cached_selector_map = await self.browser_context.get_selector_map()
    cached_path_hashes = {element.hash.branch_path_hash for element in cached_selector_map.values()}

    await self.browser_context.remove_highlights()

    for i, action in enumerate(actions):
        if action.get_index() is not None and i != 0:
            new_state = await self.browser_context.get_state()
            new_path_hashes = {element.hash.branch_path_hash for element in new_state.selector_map.values()}
            # new elements are fine as long as they are a subset of what was cached
            if check_for_new_elements and not new_path_hashes.issubset(cached_path_hashes):
                # next action requires index but there are new elements on the page
                msg = f'Something new appeared after action {i} / {total}'
                logger.info(msg)
                results.append(ActionResult(extracted_content=msg, include_in_memory=True))
                break

        await self._raise_if_stopped_or_paused()

        outcome = await self.controller.act(
            action,
            self.browser_context,
            self.settings.page_extraction_llm,
            self.sensitive_data,
            self.settings.available_file_paths,
            context=self.context,
        )
        results.append(outcome)

        logger.debug(f'Executed action {i + 1} / {total}')
        latest = results[-1]
        if latest.is_done or latest.error or i == total - 1:
            break

        await asyncio.sleep(self.browser_context.config.wait_between_actions)

    return results
696
+
697
async def _validate_output(self) -> bool:
    """Validate the output of the last action is what the user wanted.

    The current browser state and the last result are sent to the LLM, which
    answers with an ``is_valid`` flag and a ``reason``. On rejection the
    reason is stored in ``self.state.last_result`` so the agent sees it on
    its next step.

    Returns:
        ``True`` when the validator accepts the output, when there is no
        browser session to validate against, or when the validator's answer
        could not be parsed; ``False`` when the validator rejects the output.
    """
    if not self.browser_context.session:
        # if no browser session, we can't validate the output
        return True

    system_msg = (
        f'You are a validator of an agent who interacts with a browser. '
        f'Validate if the output of last action is what the user wanted and if the task is completed. '
        f'If the task is unclear defined, you can let it pass. But if something is missing or the image does not show what was requested dont let it pass. '
        f'Try to understand the page and help the model with suggestions like scroll, do x, ... to get the solution right. '
        f'Task to validate: {self.task}. Return a JSON object with 2 keys: is_valid and reason. '
        f'is_valid is a boolean that indicates if the output is correct. '
        f'reason is a string that explains why it is valid or not.'
        f' example: {{"is_valid": false, "reason": "The user wanted to search for "cat photos", but the agent searched for "dog photos" instead."}}'
    )

    state = await self.browser_context.get_state()
    content = AgentMessagePrompt(
        state=state,
        result=self.state.last_result,
        include_attributes=self.settings.include_attributes,
    )
    validator_messages = [SystemMessage(content=system_msg), content.get_user_message(self.settings.use_vision)]

    class ValidationResult(BaseModel):
        """
        Validation results.
        """

        is_valid: bool
        reason: str

    validator = self.llm.with_structured_output(ValidationResult, include_raw=True)
    response: dict[str, Any] = await validator.ainvoke(validator_messages)  # type: ignore
    parsed: Optional[ValidationResult] = response['parsed']
    if parsed is None:
        # With include_raw=True a parsing failure is reported as parsed=None instead of
        # an exception. Treat it like the "cannot validate" case above rather than
        # crashing on parsed.is_valid.
        logger.warning('Validator response could not be parsed - skipping validation')
        return True

    is_valid = parsed.is_valid
    if not is_valid:
        logger.info(f'❌ Validator decision: {parsed.reason}')
        feedback = f'The output is not yet correct. {parsed.reason}.'
        self.state.last_result = [ActionResult(extracted_content=feedback, include_in_memory=True)]
    else:
        logger.info(f'✅ Validator decision: {parsed.reason}')
    return is_valid
741
+
742
async def log_completion(self) -> None:
    """Log how the task ended and notify the done callback, if one is registered."""
    logger.info('✅ Task completed')
    outcome = '✅ Successfully' if self.state.history.is_successful() else '❌ Unfinished'
    logger.info(outcome)

    if not self.register_done_callback:
        return
    await self.register_done_callback(self.state.history)
752
+
753
async def rerun_history(
    self,
    history: AgentHistoryList,
    max_retries: int = 3,
    skip_failures: bool = True,
    delay_between_actions: float = 2.0,
) -> list[ActionResult]:
    """
    Replay a saved history, retrying each step on failure.

    Args:
            history: The history to replay
            max_retries: Maximum number of attempts per step
            skip_failures: When True a step that keeps failing is skipped;
                    when False a RuntimeError is raised instead
            delay_between_actions: Delay in seconds after each step and between retries

    Returns:
            List of action results
    """
    # Execute initial actions if provided
    if self.initial_actions:
        result = await self.multi_act(self.initial_actions)
        self.state.last_result = result

    results = []
    total_steps = len(history.history)

    for i, history_item in enumerate(history.history):
        step_no = i + 1
        goal = history_item.model_output.current_state.next_goal if history_item.model_output else ''
        logger.info(f'Replaying step {step_no}/{total_steps}: goal: {goal}')

        nothing_to_replay = (
            not history_item.model_output
            or not history_item.model_output.action
            or history_item.model_output.action == [None]
        )
        if nothing_to_replay:
            logger.warning(f'Step {step_no}: No action to replay, skipping')
            results.append(ActionResult(error='No action to replay'))
            continue

        for attempt in range(1, max_retries + 1):
            try:
                result = await self._execute_history_step(history_item, delay_between_actions)
                results.extend(result)
                break
            except Exception as e:
                if attempt != max_retries:
                    logger.warning(f'Step {step_no} failed (attempt {attempt}/{max_retries}), retrying...')
                    await asyncio.sleep(delay_between_actions)
                    continue

                # Last attempt used up: report, then either abort or move on to the next step.
                error_msg = f'Step {step_no} failed after {max_retries} attempts: {str(e)}'
                logger.error(error_msg)
                if not skip_failures:
                    results.append(ActionResult(error=error_msg))
                    raise RuntimeError(error_msg)

    return results
812
+
813
async def _execute_history_step(self, history_item: AgentHistory, delay: float) -> list[ActionResult]:
    """Replay one recorded step against the current page.

    Each recorded action is re-targeted to the current DOM through
    ``_update_action_indices`` before the whole batch is executed.

    Args:
        history_item: The recorded step to replay.
        delay: Seconds to sleep after the actions have run.

    Returns:
        The results of executing the re-targeted actions.

    Raises:
        ValueError: If no state or model output is available, or a recorded
            element cannot be found on the current page.
    """
    state = await self.browser_context.get_state()
    if not (state and history_item.model_output):
        raise ValueError('Invalid state or model output')

    recorded_elements = history_item.state.interacted_element
    updated_actions = []
    for i, action in enumerate(history_item.model_output.action):
        updated_action = await self._update_action_indices(recorded_elements[i], action, state)
        if updated_action is None:
            raise ValueError(f'Could not find matching element {i} in current page')
        updated_actions.append(updated_action)

    result = await self.multi_act(updated_actions)

    await asyncio.sleep(delay)
    return result
834
+
835
async def _update_action_indices(
    self,
    historical_element: Optional[DOMHistoryElement],
    action: ActionModel,  # Type this properly based on your action model
    current_state: BrowserState,
) -> Optional[ActionModel]:
    """
    Point a recorded action at the matching element of the current page.

    The action is returned untouched when there is no recorded element or no
    current element tree. Otherwise the recorded element is looked up in the
    current tree and, if its highlight index differs, the action's index is
    updated in place.

    Returns the (possibly updated) action, or None if the element cannot be found.
    """
    if not (historical_element and current_state.element_tree):
        return action

    current_element = HistoryTreeProcessor.find_history_element_in_tree(historical_element, current_state.element_tree)
    if not current_element or current_element.highlight_index is None:
        return None

    new_index = current_element.highlight_index
    old_index = action.get_index()
    if old_index != new_index:
        action.set_index(new_index)
        logger.info(f'Element moved in DOM, updated index from {old_index} to {new_index}')

    return action
859
+
860
async def load_and_rerun(self, history_file: Optional[str | Path] = None, **kwargs) -> list[ActionResult]:
    """
    Load a saved history from disk and replay it.

    Args:
            history_file: Path to the history file; 'AgentHistory.json' when omitted
            **kwargs: Additional arguments passed to rerun_history
    """
    source = history_file or 'AgentHistory.json'
    loaded = AgentHistoryList.load_from_file(source, self.AgentOutput)
    return await self.rerun_history(loaded, **kwargs)
872
+
873
def save_history(self, file_path: Optional[str | Path] = None) -> None:
    """Write the agent history to ``file_path`` ('AgentHistory.json' when omitted)."""
    self.state.history.save_to_file(file_path or 'AgentHistory.json')
878
+
879
def pause(self) -> None:
    """Set the ``paused`` flag on the agent state."""
    logger.info('🔄 pausing Agent ')
    self.state.paused = True
883
+
884
def resume(self) -> None:
    """Clear the ``paused`` flag on the agent state."""
    logger.info('▶️ Agent resuming')
    self.state.paused = False
888
+
889
def stop(self) -> None:
    """Set the ``stopped`` flag on the agent state."""
    logger.info('⏹️ Agent stopping')
    self.state.stopped = True
893
+
894
def _convert_initial_actions(self, actions: List[Dict[str, Dict[str, Any]]]) -> List[ActionModel]:
    """Convert dictionary-based actions to ActionModel instances.

    Each input dict is expected to hold a single ``{action_name: params}``
    pair. The params are validated with the parameter model registered for
    that action before being wrapped in ``self.ActionModel``.
    """
    registered_actions = self.controller.registry.registry.actions
    converted_actions = []

    for action_dict in actions:
        # Only the first key of each dict is used.
        action_name = next(iter(action_dict))
        raw_params = action_dict[action_name]

        # Validate the raw params with the model registered for this action.
        param_model = registered_actions[action_name].param_model
        validated_params = param_model(**raw_params)

        converted_actions.append(self.ActionModel(**{action_name: validated_params}))

    return converted_actions
915
+
916
async def _run_planner(self) -> Optional[str]:
    """Ask the planner LLM for a plan based on the current message history.

    Returns:
        The planner's answer as a string, or ``None`` when no planner LLM is
        configured.
    """
    # Skip planning if no planner_llm is set
    if not self.settings.planner_llm:
        return None

    # Planner system prompt followed by the full history except its first message
    history_tail = self._message_manager.get_messages()[1:]
    planner_messages = [
        PlannerPrompt(self.controller.registry.get_prompt_description()).get_system_message(),
        *history_tail,
    ]

    if not self.settings.use_vision_for_planner and self.settings.use_vision:
        last_state_message: HumanMessage = planner_messages[-1]
        content = last_state_message.content
        # remove image from last state message: keep only the text parts
        if isinstance(content, list):
            new_msg = ''.join(part['text'] for part in content if part['type'] == 'text')  # type: ignore
        else:
            new_msg = content

        planner_messages[-1] = HumanMessage(content=new_msg)

    planner_messages = convert_input_messages(planner_messages, self.planner_model_name)

    # Get planner output
    response = await self.settings.planner_llm.ainvoke(planner_messages)
    plan = str(response.content)

    # if deepseek-reasoner, remove think tags
    planner_name = self.planner_model_name
    if planner_name and ('deepseek-r1' in planner_name or 'deepseek-reasoner' in planner_name):
        plan = self._remove_think_tags(plan)

    # Pretty-print the plan when it is valid JSON; otherwise log it as-is.
    try:
        plan_json = json.loads(plan)
        logger.info(f'Planning Analysis:\n{json.dumps(plan_json, indent=4)}')
    except json.JSONDecodeError:
        logger.info(f'Planning Analysis:\n{plan}')
    except Exception as e:
        logger.debug(f'Error parsing planning analysis: {e}')
        logger.info(f'Plan: {plan}')

    return plan
961
+
962
@property
def message_manager(self) -> MessageManager:
    """The message manager instance held by this agent."""
    return self._message_manager
browser_use/agent/system_prompt.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are an AI agent designed to automate browser tasks. Your goal is to accomplish the ultimate task following the rules.
2
+
3
+ # Input Format
4
+ Task
5
+ Previous steps
6
+ Current URL
7
+ Open Tabs
8
+ Interactive Elements
9
+ [index]<type>text</type>
10
+ - index: Numeric identifier for interaction
11
+ - type: HTML element type (button, input, etc.)
12
+ - text: Element description
13
+ Example:
14
+ [33]<button>Submit Form</button>
15
+
16
+ - Only elements with numeric indexes in [] are interactive
17
+ - elements without [] provide only context
18
+
19
+ # Response Rules
20
+ 1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format:
21
+ {{"current_state": {{"evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Mention if something unexpected happened. Shortly state why/why not",
22
+ "memory": "Description of what has been done and what you need to remember. Be very specific. Count here ALWAYS how many times you have done something and how many remain. E.g. 0 out of 10 websites analyzed. Continue with abc and xyz",
23
+ "next_goal": "What needs to be done with the next immediate action"}},
24
+ "action":[{{"one_action_name": {{// action-specific parameter}}}}, // ... more actions in sequence]}}
25
+
26
+ 2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence.
27
+ Common action sequences:
28
+ - Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}]
29
+ - Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}]
30
+ - Actions are executed in the given order
31
+ - If the page changes after an action, the sequence is interrupted and you get the new state.
32
+ - Only provide the action sequence until an action which changes the page state significantly.
33
+ - Try to be efficient, e.g. fill forms at once, or chain actions where nothing changes on the page
34
+ - only use multiple actions if it makes sense.
35
+
36
+ 3. ELEMENT INTERACTION:
37
+ - Only use indexes of the interactive elements
38
+ - Elements marked with "[]Non-interactive text" are non-interactive
39
+
40
+ 4. NAVIGATION & ERROR HANDLING:
41
+ - If no suitable elements exist, use other functions to complete the task
42
+ - If stuck, try alternative approaches - like going back to a previous page, new search, new tab etc.
43
+ - Handle popups/cookies by accepting or closing them
44
+ - Use scroll to find elements you are looking for
45
+ - If you want to research something, open a new tab instead of using the current tab
46
+ - If captcha pops up, try to solve it - else try a different approach
47
+ - If the page is not fully loaded, use wait action
48
+
49
+ 5. TASK COMPLETION:
50
+ - Use the done action as the last action as soon as the ultimate task is complete
51
+ - Don't use "done" before you are done with everything the user asked you, unless you reach the last step of max_steps.
52
+ - If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished, set success to true. If not everything the user asked for is completed, set success in done to false!
53
+ - If you have to do something repeatedly for example the task says for "each", or "for all", or "x times", count always inside "memory" how many times you have done it and how many remain. Don't stop until you have completed like the task asked you. Only call done after the last step.
54
+ - Don't hallucinate actions
55
+ - Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task.
56
+
57
+ 6. VISUAL CONTEXT:
58
+ - When an image is provided, use it to understand the page layout
59
+ - Bounding boxes with labels on their top right corner correspond to element indexes
60
+
61
+ 7. Form filling:
62
+ - If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field.
63
+
64
+ 8. Long tasks:
65
+ - Keep track of the status and subresults in the memory.
66
+
67
+ 9. Extraction:
68
+ - If your task is to find information - call extract_content on the specific pages to get and store the information.
69
+ Your responses must be always JSON with the specified format.
browser_use/agent/tests.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pytest
2
+
3
+ from browser_use.agent.views import (
4
+ ActionResult,
5
+ AgentBrain,
6
+ AgentHistory,
7
+ AgentHistoryList,
8
+ AgentOutput,
9
+ )
10
+ from browser_use.browser.views import BrowserState, BrowserStateHistory, TabInfo
11
+ from browser_use.controller.registry.service import Registry
12
+ from browser_use.controller.views import ClickElementAction, DoneAction, ExtractPageContentAction
13
+ from browser_use.dom.views import DOMElementNode
14
+
15
+
16
@pytest.fixture
def sample_browser_state():
    """A minimal single-tab browser state with an empty root element."""
    page_url = 'https://example.com'
    page_title = 'Example Page'
    root = DOMElementNode(
        tag_name='root',
        is_visible=True,
        parent=None,
        xpath='',
        attributes={},
        children=[],
    )
    return BrowserState(
        url=page_url,
        title=page_title,
        tabs=[TabInfo(url=page_url, title=page_title, page_id=1)],
        screenshot='screenshot1.png',
        element_tree=root,
        selector_map={},
    )
33
+
34
+
35
@pytest.fixture
def action_registry():
    """Dynamic ActionModel built from a registry holding three no-op actions."""
    registry = Registry()

    # Register the actions we need for testing
    @registry.action(description='Click an element', param_model=ClickElementAction)
    def click_element(params: ClickElementAction, browser=None):
        return None

    @registry.action(
        description='Extract page content',
        param_model=ExtractPageContentAction,
    )
    def extract_page_content(params: ExtractPageContentAction, browser=None):
        return None

    @registry.action(description='Mark task as done', param_model=DoneAction)
    def done(params: DoneAction):
        return None

    # Create the dynamic ActionModel with all registered actions
    return registry.create_action_model()
57
+
58
+
59
@pytest.fixture
def sample_history(action_registry):
    """Three-step history: a click, an extraction that reports an error, then done."""

    def page_state(url, title, page_id, screenshot, xpath):
        # One single-tab state snapshot with one interacted element.
        return BrowserStateHistory(
            url=url,
            title=title,
            tabs=[TabInfo(url=url, title=title, page_id=page_id)],
            screenshot=screenshot,
            interacted_element=[{'xpath': xpath}],
        )

    def output(evaluation, memory, goal, action):
        return AgentOutput(
            current_state=AgentBrain(
                evaluation_previous_goal=evaluation,
                memory=memory,
                next_goal=goal,
            ),
            action=[action],
        )

    # Create actions with nested params structure
    click_action = action_registry(click_element={'index': 1})
    extract_action = action_registry(extract_page_content={'value': 'text'})
    done_action = action_registry(done={'text': 'Task completed'})

    click_step = AgentHistory(
        model_output=output('None', 'Started task', 'Click button', click_action),
        result=[ActionResult(is_done=False)],
        state=page_state('https://example.com', 'Page 1', 1, 'screenshot1.png', '//button[1]'),
    )
    extract_step = AgentHistory(
        model_output=output('Clicked button', 'Button clicked', 'Extract content', extract_action),
        result=[
            ActionResult(
                is_done=False,
                extracted_content='Extracted text',
                error='Failed to extract completely',
            )
        ],
        state=page_state('https://example.com/page2', 'Page 2', 2, 'screenshot2.png', '//div[1]'),
    )
    done_step = AgentHistory(
        model_output=output('Extracted content', 'Content extracted', 'Finish task', done_action),
        result=[ActionResult(is_done=True, extracted_content='Task completed', error=None)],
        state=page_state('https://example.com/page2', 'Page 2', 2, 'screenshot3.png', '//div[1]'),
    )

    return AgentHistoryList(history=[click_step, extract_step, done_step])
131
+
132
+
133
def test_last_model_output(sample_history: AgentHistoryList):
    """The last action of the sample history is the ``done`` action."""
    expected = {'done': {'text': 'Task completed'}}
    last_output = sample_history.last_action()
    print(last_output)
    assert last_output == expected
137
+
138
+
139
def test_get_errors(sample_history: AgentHistoryList):
    """Exactly one error is reported, the one from the extraction step."""
    reported = sample_history.errors()
    assert len(reported) == 1
    first_error = reported[0]
    assert first_error == 'Failed to extract completely'
143
+
144
+
145
def test_final_result(sample_history: AgentHistoryList):
    """The final result is the extracted content of the last step."""
    outcome = sample_history.final_result()
    assert outcome == 'Task completed'
147
+
148
+
149
def test_is_done(sample_history: AgentHistoryList):
    """A history whose last result is flagged done reports itself as done."""
    # Identity check instead of `== True`: also fails if a non-bool truthy value is returned.
    assert sample_history.is_done() is True
151
+
152
+
153
def test_urls(sample_history: AgentHistoryList):
    """Both visited URLs show up in the history's URL list."""
    visited = sample_history.urls()
    for expected_url in ('https://example.com', 'https://example.com/page2'):
        assert expected_url in visited
157
+
158
+
159
def test_all_screenshots(sample_history: AgentHistoryList):
    """Screenshots are returned in step order, one per step."""
    expected = ['screenshot1.png', 'screenshot2.png', 'screenshot3.png']
    captured = sample_history.screenshots()
    assert len(captured) == 3
    assert captured == expected
163
+
164
+
165
def test_all_model_outputs(sample_history: AgentHistoryList):
    """Every step contributes one action dict whose first entry is the action itself."""

    def first_entry(mapping):
        # Only the first key/value pair of each action dict is compared.
        return dict([next(iter(mapping.items()))])

    outputs = sample_history.model_actions()
    print(f'DEBUG: {outputs[0]}')
    assert len(outputs) == 3
    # get first key value pair
    assert first_entry(outputs[0]) == {'click_element': {'index': 1}}
    assert first_entry(outputs[1]) == {'extract_page_content': {'value': 'text'}}
    assert first_entry(outputs[2]) == {'done': {'text': 'Task completed'}}
173
+
174
+
175
def test_all_model_outputs_filtered(sample_history: AgentHistoryList):
    """Filtering by action name keeps only the matching action."""
    only_clicks = sample_history.model_actions_filtered(include=['click_element'])
    assert len(only_clicks) == 1
    click = only_clicks[0]
    assert click['click_element']['index'] == 1
179
+
180
+
181
def test_empty_history():
    """An empty history has no last action, no result, is not done and has no URLs."""
    empty_history = AgentHistoryList(history=[])
    assert empty_history.last_action() is None
    assert empty_history.final_result() is None
    # Identity check instead of `== False`: a falsy non-bool such as None would not pass.
    assert empty_history.is_done() is False
    assert not empty_history.urls()
187
+
188
+
189
+ # Add a test to verify action creation
190
def test_action_creation(action_registry):
    """An action built from the dynamic model dumps back to its input dict."""
    expected = {'click_element': {'index': 1}}
    click_action = action_registry(click_element={'index': 1})
    dumped = click_action.model_dump(exclude_none=True)
    assert dumped == expected
194
+
195
+
196
+ # run this with:
197
+ # pytest browser_use/agent/tests.py
browser_use/agent/views.py ADDED
@@ -0,0 +1,393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import traceback
5
+ import uuid
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any, Dict, List, Literal, Optional, Type
9
+
10
+ from langchain_core.language_models.chat_models import BaseChatModel
11
+ from openai import RateLimitError
12
+ from pydantic import BaseModel, ConfigDict, Field, ValidationError, create_model
13
+
14
+ from browser_use.agent.message_manager.views import MessageManagerState
15
+ from browser_use.browser.views import BrowserStateHistory
16
+ from browser_use.controller.registry.views import ActionModel
17
+ from browser_use.dom.history_tree_processor.service import (
18
+ DOMElementNode,
19
+ DOMHistoryElement,
20
+ HistoryTreeProcessor,
21
+ )
22
+ from browser_use.dom.views import SelectorMap
23
+
24
+ ToolCallingMethod = Literal['function_calling', 'json_mode', 'raw', 'auto']
25
+
26
+
27
class AgentSettings(BaseModel):
    """Options for the agent"""

    # NOTE(review): this model only declares the options and their defaults; how each
    # one is interpreted is decided by the code that reads it (not visible here).

    # Vision toggles for the main model and for the planner model
    use_vision: bool = True
    use_vision_for_planner: bool = False
    # Conversation log target and the text encoding used for it
    save_conversation_path: Optional[str] = None
    save_conversation_path_encoding: Optional[str] = 'utf-8'
    # Failure handling: presumably max consecutive failures and seconds between retries - TODO confirm units
    max_failures: int = 3
    retry_delay: int = 10
    max_input_tokens: int = 128000
    validate_output: bool = False
    message_context: Optional[str] = None
    # Either a flag or a string; presumably the string form is an output path - verify against the consumer
    generate_gif: bool | str = False
    available_file_paths: Optional[list[str]] = None
    override_system_message: Optional[str] = None
    extend_system_message: Optional[str] = None
    # Element attribute names (HTML/ARIA) of interest to the agent
    include_attributes: list[str] = [
        'title',
        'type',
        'name',
        'role',
        'tabindex',
        'aria-label',
        'placeholder',
        'value',
        'alt',
        'aria-expanded',
    ]
    max_actions_per_step: int = 10

    # One of the ToolCallingMethod literals, or None
    tool_calling_method: Optional[ToolCallingMethod] = 'auto'
    # Optional dedicated chat models for page extraction and for planning
    page_extraction_llm: Optional[BaseChatModel] = None
    planner_llm: Optional[BaseChatModel] = None
    planner_interval: int = 1  # Run planner every N steps
61
+
62
+
63
class AgentState(BaseModel):
    """Holds all state information for an Agent"""

    # Fresh random UUID string for every new state object
    agent_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    # Step counter; note that it starts at 1, not 0
    n_steps: int = 1
    consecutive_failures: int = 0
    last_result: Optional[List['ActionResult']] = None
    # AgentHistoryList is defined further down in this module; the lambda defers the
    # lookup until a state object is actually created, and each state gets its own empty list
    history: AgentHistoryList = Field(default_factory=lambda: AgentHistoryList(history=[]))
    last_plan: Optional[str] = None
    # Control flags; presumably polled by the agent loop - verify against the caller
    paused: bool = False
    stopped: bool = False

    message_manager_state: MessageManagerState = Field(default_factory=MessageManagerState)

    # class Config:
    # arbitrary_types_allowed = True
79
+
80
+
81
@dataclass
class AgentStepInfo:
    """Position of the current step relative to the configured step budget."""

    step_number: int
    max_steps: int

    def is_last_step(self) -> bool:
        """Return True once step_number has reached max_steps - 1 (or gone past it)."""
        final_step = self.max_steps - 1
        return final_step <= self.step_number
89
+
90
+
91
class ActionResult(BaseModel):
    """Result of executing an action"""

    # True marks the result that finishes the run (AgentHistoryList.is_done checks `is True`)
    is_done: Optional[bool] = False
    # Only reported by AgentHistoryList.is_successful when is_done is True; None means "not stated"
    success: Optional[bool] = None
    extracted_content: Optional[str] = None
    # Error text for this action; None (or empty) when nothing went wrong
    error: Optional[str] = None
    include_in_memory: bool = False  # whether to include in past messages as context or not
99
+
100
+
101
class StepMetadata(BaseModel):
    """Metadata for a single step including timing and token information"""

    # Start/end timestamps; their difference is reported as seconds by duration_seconds
    step_start_time: float
    step_end_time: float
    input_tokens: int  # Approximate tokens from message manager for this step
    step_number: int

    @property
    def duration_seconds(self) -> float:
        """Elapsed time of the step: end timestamp minus start timestamp."""
        started, finished = self.step_start_time, self.step_end_time
        return finished - started
113
+
114
+
115
class AgentBrain(BaseModel):
    """Current state of the agent"""

    # All three fields are required free-text strings; this model is the
    # `current_state` part of AgentOutput.
    evaluation_previous_goal: str
    memory: str
    next_goal: str
121
+
122
+
123
class AgentOutput(BaseModel):
    """Output model for agent

    @dev note: this model is extended with custom actions in AgentService. You can also use some fields that are not in this model as provided by the linter, as long as they are registered in the DynamicActions model.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    current_state: AgentBrain
    action: list[ActionModel] = Field(
        ...,
        description='List of actions to execute',
        json_schema_extra={'min_items': 1},  # Ensure at least one action is provided
    )

    @staticmethod
    def type_with_custom_actions(custom_actions: Type[ActionModel]) -> Type['AgentOutput']:
        """Build an AgentOutput subclass whose `action` items are typed as `custom_actions`.

        The generated model keeps the name 'AgentOutput' and this module as its
        `__module__`; only the `action` field definition and the docstring differ.
        """
        # Same field declaration as on the base class, but typed with the custom action model
        action_field = (
            list[custom_actions],
            Field(..., description='List of actions to execute', json_schema_extra={'min_items': 1}),
        )
        extended_model = create_model(
            'AgentOutput',
            __base__=AgentOutput,
            __module__=AgentOutput.__module__,
            action=action_field,
        )
        extended_model.__doc__ = 'AgentOutput model with custom actions'
        return extended_model
152
+
153
+
154
class AgentHistory(BaseModel):
    """History item for agent actions"""

    # One recorded step: the model output (None if there was none), the results of
    # the executed actions, the browser state snapshot and optional step metadata.
    model_output: AgentOutput | None
    result: list[ActionResult]
    state: BrowserStateHistory
    metadata: Optional[StepMetadata] = None

    # protected_namespaces=() because the field names start with `model_`
    model_config = ConfigDict(arbitrary_types_allowed=True, protected_namespaces=())

    @staticmethod
    def get_interacted_element(model_output: AgentOutput, selector_map: SelectorMap) -> list[DOMHistoryElement | None]:
        """Map every action of `model_output` to the DOM element it targeted.

        Returns a list aligned with `model_output.action`: the history form of the
        element found under the action's index in `selector_map`, or None when the
        action has no index or the index is not in the map.
        """
        elements = []
        for action in model_output.action:
            index = action.get_index()
            # Compare against None explicitly: a truthiness test would treat index 0
            # as "no index" even if 0 is a key of the selector map.
            if index is not None and index in selector_map:
                el: DOMElementNode = selector_map[index]
                elements.append(HistoryTreeProcessor.convert_dom_element_to_history_element(el))
            else:
                elements.append(None)
        return elements

    def model_dump(self, **kwargs) -> Dict[str, Any]:
        """Custom serialization handling circular references

        Returns a plain dict with the keys 'model_output', 'result', 'state' and
        'metadata'. `kwargs` is accepted for signature compatibility but is not
        forwarded to the nested dumps.
        """

        # Handle action serialization
        model_output_dump = None
        if self.model_output:
            action_dump = [action.model_dump(exclude_none=True) for action in self.model_output.action]
            model_output_dump = {
                'current_state': self.model_output.current_state.model_dump(),
                'action': action_dump,  # This preserves the actual action data
            }

        return {
            'model_output': model_output_dump,
            'result': [r.model_dump(exclude_none=True) for r in self.result],
            'state': self.state.to_dict(),
            'metadata': self.metadata.model_dump() if self.metadata else None,
        }
194
+
195
+
196
class AgentHistoryList(BaseModel):
    """List of agent history items"""

    history: list[AgentHistory]

    def total_duration_seconds(self) -> float:
        """Get total duration of all steps in seconds (steps without metadata are skipped)"""
        return sum((h.metadata.duration_seconds for h in self.history if h.metadata), 0.0)

    def total_input_tokens(self) -> int:
        """
        Get total tokens used across all steps.
        Note: These are from the approximate token counting of the message manager.
        For accurate token counting, use tools like LangChain Smith or OpenAI's token counters.
        """
        return sum(h.metadata.input_tokens for h in self.history if h.metadata)

    def input_token_usage(self) -> list[int]:
        """Get token usage for each step that has metadata"""
        return [h.metadata.input_tokens for h in self.history if h.metadata]

    def __str__(self) -> str:
        """Representation of the AgentHistoryList object"""
        return f'AgentHistoryList(all_results={self.action_results()}, all_model_outputs={self.model_actions()})'

    def __repr__(self) -> str:
        """Representation of the AgentHistoryList object"""
        return self.__str__()

    def save_to_file(self, filepath: str | Path) -> None:
        """Save history to JSON file with proper serialization

        Missing parent directories are created. Any error (OS error, value that is
        not JSON serializable) propagates to the caller unchanged.
        """
        Path(filepath).parent.mkdir(parents=True, exist_ok=True)
        data = self.model_dump()
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

    def model_dump(self, **kwargs) -> Dict[str, Any]:
        """Custom serialization that properly uses AgentHistory's model_dump"""
        return {
            'history': [h.model_dump(**kwargs) for h in self.history],
        }

    @classmethod
    def load_from_file(cls, filepath: str | Path, output_model: Type[AgentOutput]) -> 'AgentHistoryList':
        """Load history from JSON file

        `output_model` is the AgentOutput type used to validate every stored model
        output, so that custom actions are parsed into their proper models.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # loop through history and validate output_model actions to enrich with custom actions
        for h in data['history']:
            if h['model_output']:
                if isinstance(h['model_output'], dict):
                    h['model_output'] = output_model.model_validate(h['model_output'])
                else:
                    h['model_output'] = None
            # Files without the key are normalised to an explicit None
            if 'interacted_element' not in h['state']:
                h['state']['interacted_element'] = None
        history = cls.model_validate(data)
        return history

    def last_action(self) -> None | dict:
        """Last action in history, or None if there is no step, no model output or no action"""
        if not self.history:
            return None
        last_output = self.history[-1].model_output
        if last_output and last_output.action:
            return last_output.action[-1].model_dump(exclude_none=True)
        return None

    def errors(self) -> list[str | None]:
        """Get all errors from history, with None for steps without errors"""
        errors = []
        for h in self.history:
            step_errors = [r.error for r in h.result if r.error]

            # each step can have only one error
            errors.append(step_errors[0] if step_errors else None)
        return errors

    def final_result(self) -> None | str:
        """Final result from history: extracted content of the last result of the last step

        Returns None when there is no step, the last step has no results, or the
        last result carries no extracted content.
        """
        # Guard the result list as well: indexing [-1] on an empty list would raise
        if self.history and self.history[-1].result:
            content = self.history[-1].result[-1].extracted_content
            if content:
                return content
        return None

    def is_done(self) -> bool:
        """Check if the agent is done"""
        if self.history and len(self.history[-1].result) > 0:
            last_result = self.history[-1].result[-1]
            return last_result.is_done is True
        return False

    def is_successful(self) -> bool | None:
        """Check if the agent completed successfully - the agent decides in the last step if it was successful or not. None if not done yet."""
        if self.history and len(self.history[-1].result) > 0:
            last_result = self.history[-1].result[-1]
            if last_result.is_done is True:
                return last_result.success
        return None

    def has_errors(self) -> bool:
        """Check if the agent has any non-None errors"""
        return any(error is not None for error in self.errors())

    def urls(self) -> list[str | None]:
        """Get the URL of every step, in order (one entry per step, duplicates included)"""
        return [h.state.url for h in self.history]

    def screenshots(self) -> list[str | None]:
        """Get the screenshot of every step, in order (None where a step has none)"""
        return [h.state.screenshot for h in self.history]

    def action_names(self) -> list[str]:
        """Get all action names from history"""
        action_names = []
        for action in self.model_actions():
            # The first key of each dumped action is its name
            first_key = next(iter(action), None)
            if first_key is not None:
                action_names.append(first_key)
        return action_names

    def model_thoughts(self) -> list[AgentBrain]:
        """Get all thoughts from history"""
        return [h.model_output.current_state for h in self.history if h.model_output]

    def model_outputs(self) -> list[AgentOutput]:
        """Get all model outputs from history"""
        return [h.model_output for h in self.history if h.model_output]

    # get all actions with params
    def model_actions(self) -> list[dict]:
        """Get all actions from history

        Every entry is the dumped action plus an 'interacted_element' key holding
        the element recorded for it in the step's browser state.
        """
        outputs = []

        for h in self.history:
            if not h.model_output:
                continue
            actions = h.model_output.action
            interacted_elements = h.state.interacted_element
            # load_from_file() stores None when the saved state had no such key;
            # zip() over None would raise, so pair every action with None instead.
            if interacted_elements is None:
                interacted_elements = [None] * len(actions)
            for action, interacted_element in zip(actions, interacted_elements):
                output = action.model_dump(exclude_none=True)
                output['interacted_element'] = interacted_element
                outputs.append(output)
        return outputs

    def action_results(self) -> list[ActionResult]:
        """Get all results from history"""
        results = []
        for h in self.history:
            results.extend([r for r in h.result if r])
        return results

    def extracted_content(self) -> list[str]:
        """Get all extracted content from history"""
        content = []
        for h in self.history:
            content.extend([r.extracted_content for r in h.result if r.extracted_content])
        return content

    def model_actions_filtered(self, include: list[str] | None = None) -> list[dict]:
        """Get the model actions whose name is listed in `include`

        With `include` empty or None nothing matches and an empty list is returned.
        Each matching action is returned once, even if its name is repeated in `include`.
        """
        if not include:
            return []
        # The first key of a dumped action is its name
        return [o for o in self.model_actions() if next(iter(o), None) in include]

    def number_of_steps(self) -> int:
        """Get the number of steps in the history"""
        return len(self.history)
374
+
375
+
376
class AgentError:
    """Container for agent error handling"""

    VALIDATION_ERROR = 'Invalid model output format. Please follow the correct schema.'
    RATE_LIMIT_ERROR = 'Rate limit reached. Waiting before retry.'
    NO_VALID_ACTION = 'No valid action found'

    @staticmethod
    def format_error(error: Exception, include_trace: bool = False) -> str:
        """Format error message based on error type and optionally include trace

        - ValidationError: the generic validation message followed by the error details.
        - RateLimitError: the generic rate-limit message only.
        - anything else: `str(error)`, plus a stack trace when `include_trace` is set.

        The trace comes from `traceback.format_exc()`, i.e. from the exception that is
        currently being handled, so `include_trace` is only meaningful when this is
        called from inside an `except` block.
        """
        if isinstance(error, ValidationError):
            return f'{AgentError.VALIDATION_ERROR}\nDetails: {str(error)}'
        if isinstance(error, RateLimitError):
            return AgentError.RATE_LIMIT_ERROR
        if include_trace:
            return f'{str(error)}\nStacktrace:\n{traceback.format_exc()}'
        return str(error)
browser_use/browser/browser.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Playwright browser on steroids.
3
+ """
4
+
5
+ import asyncio
6
+ import gc
7
+ import logging
8
+ from dataclasses import dataclass, field
9
+
10
+ from playwright._impl._api_structures import ProxySettings
11
+ from playwright.async_api import Browser as PlaywrightBrowser
12
+ from playwright.async_api import (
13
+ Playwright,
14
+ async_playwright,
15
+ )
16
+
17
+ from browser_use.browser.context import BrowserContext, BrowserContextConfig
18
+ from browser_use.utils import time_execution_async
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@dataclass
class BrowserConfig:
    r"""
    Configuration for the Browser.

    Default values:
        headless: False
            Whether to run browser in headless mode

        disable_security: True
            Disable browser security features

        extra_chromium_args: []
            Extra arguments to pass to the browser

        wss_url: None
            Connect to a browser instance via WebSocket

        cdp_url: None
            Connect to a browser instance via CDP

        chrome_instance_path: None
            Path to a Chrome instance to use to connect to your normal browser
            e.g. '/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome'

        proxy: None
            Proxy settings passed to the browser launch

        new_context_config: BrowserContextConfig()
            A fresh default context configuration for every BrowserConfig
    """

    headless: bool = False
    disable_security: bool = True
    extra_chromium_args: list[str] = field(default_factory=list)
    chrome_instance_path: str | None = None
    wss_url: str | None = None
    cdp_url: str | None = None

    proxy: ProxySettings | None = field(default=None)
    new_context_config: BrowserContextConfig = field(default_factory=BrowserContextConfig)

    # When True, Browser.close() drops its references without closing the browser or stopping Playwright
    _force_keep_browser_alive: bool = False
60
+
61
+
62
+ # @singleton: TODO - think about id singleton makes sense here
63
+ # @dev By default this is a singleton, but you can create multiple instances if you need to.
64
class Browser:
    """
    Playwright browser on steroids.

    This is a persistent browser factory that can spawn multiple browser contexts.
    It is recommended to use only one instance of Browser per your application (RAM usage will grow otherwise).
    """

    def __init__(
        self,
        config: BrowserConfig | None = None,
    ):
        """Store the configuration; the Playwright browser itself is started lazily.

        `config` defaults to a fresh `BrowserConfig()` per instance. A default built in
        the signature would be a single object shared by every Browser created without
        an explicit config.
        """
        logger.debug('Initializing new browser')
        self.config = config if config is not None else BrowserConfig()
        self.playwright: Playwright | None = None
        self.playwright_browser: PlaywrightBrowser | None = None

        self.disable_security_args = []
        if self.config.disable_security:
            self.disable_security_args = [
                '--disable-web-security',
                '--disable-site-isolation-trials',
                '--disable-features=IsolateOrigins,site-per-process',
            ]

    async def new_context(self, config: BrowserContextConfig | None = None) -> BrowserContext:
        """Create a browser context

        `config` defaults to a fresh `BrowserContextConfig()` per call, so contexts
        created without an explicit config do not share one configuration object.
        """
        context_config = config if config is not None else BrowserContextConfig()
        return BrowserContext(config=context_config, browser=self)

    async def get_playwright_browser(self) -> PlaywrightBrowser:
        """Return the underlying Playwright browser, starting it on first use"""
        if self.playwright_browser is None:
            return await self._init()

        return self.playwright_browser

    @time_execution_async('--init (browser)')
    async def _init(self):
        """Initialize the browser session"""
        playwright = await async_playwright().start()
        browser = await self._setup_browser(playwright)

        # Only publish both handles once the browser was set up successfully
        self.playwright = playwright
        self.playwright_browser = browser

        return self.playwright_browser

    async def _setup_cdp(self, playwright: Playwright) -> PlaywrightBrowser:
        """Connect to the remote browser at `config.cdp_url` via CDP.

        Raises ValueError when no CDP URL is configured.
        """
        if not self.config.cdp_url:
            raise ValueError('CDP URL is required')
        logger.info(f'Connecting to remote browser via CDP {self.config.cdp_url}')
        browser = await playwright.chromium.connect_over_cdp(self.config.cdp_url)
        return browser

    async def _setup_wss(self, playwright: Playwright) -> PlaywrightBrowser:
        """Connect to the remote browser at `config.wss_url` via WebSocket.

        Raises ValueError when no WSS URL is configured.
        """
        if not self.config.wss_url:
            raise ValueError('WSS URL is required')
        logger.info(f'Connecting to remote browser via WSS {self.config.wss_url}')
        browser = await playwright.chromium.connect(self.config.wss_url)
        return browser

    async def _setup_browser_with_instance(self, playwright: Playwright) -> PlaywrightBrowser:
        """Attach to a Chrome instance listening on debugging port 9222, starting one if needed.

        An already running instance is reused. Otherwise the executable at
        `config.chrome_instance_path` is launched with remote debugging enabled and
        polled (up to 10 times, one second apart) before connecting over CDP.

        Raises ValueError when no Chrome path is configured and RuntimeError when the
        connection to the freshly started instance fails.
        """
        if not self.config.chrome_instance_path:
            raise ValueError('Chrome instance path is required')
        import subprocess

        import requests

        debug_endpoint = 'http://localhost:9222'
        version_url = 'http://localhost:9222/json/version'

        try:
            # Check if browser is already running
            response = requests.get(version_url, timeout=2)
            if response.status_code == 200:
                logger.info('Reusing existing Chrome instance')
                browser = await playwright.chromium.connect_over_cdp(
                    endpoint_url=debug_endpoint,
                    timeout=20000,  # 20 second timeout for connection
                )
                return browser
        except requests.ConnectionError:
            logger.debug('No existing Chrome instance found, starting a new one')

        # Start a new Chrome instance
        subprocess.Popen(
            [
                self.config.chrome_instance_path,
                '--remote-debugging-port=9222',
            ]
            + self.config.extra_chromium_args,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        # Wait until the new instance answers on the debugging port
        for _ in range(10):
            try:
                response = requests.get(version_url, timeout=2)
                if response.status_code == 200:
                    break
            except requests.ConnectionError:
                pass
            await asyncio.sleep(1)

        # Attempt to connect again after starting a new instance
        try:
            browser = await playwright.chromium.connect_over_cdp(
                endpoint_url=debug_endpoint,
                timeout=20000,  # 20 second timeout for connection
            )
            return browser
        except Exception as e:
            logger.error(f'Failed to start a new Chrome instance.: {str(e)}')
            # Chain the original error so the real cause stays visible in the traceback
            raise RuntimeError(
                ' To start chrome in Debug mode, you need to close all existing Chrome instances and try again otherwise we can not connect to the instance.'
            ) from e

    async def _setup_standard_browser(self, playwright: Playwright) -> PlaywrightBrowser:
        """Sets up and returns a Playwright Browser instance with anti-detection measures."""
        browser = await playwright.chromium.launch(
            headless=self.config.headless,
            args=[
                '--no-sandbox',
                '--disable-blink-features=AutomationControlled',
                '--disable-infobars',
                '--disable-background-timer-throttling',
                '--disable-popup-blocking',
                '--disable-backgrounding-occluded-windows',
                '--disable-renderer-backgrounding',
                '--disable-window-activation',
                '--disable-focus-on-load',
                '--no-first-run',
                '--no-default-browser-check',
                '--no-startup-window',
                '--window-position=0,0',
                # '--window-size=1280,1000',
            ]
            + self.disable_security_args
            + self.config.extra_chromium_args,
            proxy=self.config.proxy,
        )
        # convert to Browser
        return browser

    async def _setup_browser(self, playwright: Playwright) -> PlaywrightBrowser:
        """Pick the connection strategy from the config and return the resulting browser.

        Precedence: `cdp_url`, then `wss_url`, then `chrome_instance_path`, and finally
        a locally launched Chromium. Failures are logged and re-raised.
        """
        try:
            if self.config.cdp_url:
                return await self._setup_cdp(playwright)
            if self.config.wss_url:
                return await self._setup_wss(playwright)
            elif self.config.chrome_instance_path:
                return await self._setup_browser_with_instance(playwright)
            else:
                return await self._setup_standard_browser(playwright)
        except Exception as e:
            logger.error(f'Failed to initialize Playwright browser: {str(e)}')
            raise

    async def close(self):
        """Close the browser instance

        Best effort: failures are logged at debug level and never raised. The
        references are dropped in every case, also when
        `config._force_keep_browser_alive` prevents the actual shutdown.
        """
        try:
            if not self.config._force_keep_browser_alive:
                if self.playwright_browser:
                    await self.playwright_browser.close()
                if self.playwright:
                    await self.playwright.stop()

        except Exception as e:
            logger.debug(f'Failed to close browser properly: {e}')
        finally:
            self.playwright_browser = None
            self.playwright = None

            gc.collect()

    def __del__(self):
        """Async cleanup when object is destroyed

        Cleanup can only be scheduled when an event loop is running in the current
        thread. `asyncio.get_running_loop()` raises RuntimeError otherwise; that is
        caught below and logged, and no cleanup happens in that case.
        """
        try:
            if self.playwright_browser or self.playwright:
                # get_running_loop() never returns a stopped loop, so no is_running() check is needed
                loop = asyncio.get_running_loop()
                loop.create_task(self.close())
        except Exception as e:
            logger.debug(f'Failed to cleanup browser in destructor: {e}')
browser_use/browser/context.py ADDED
@@ -0,0 +1,1353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Playwright browser on steroids.
3
+ """
4
+
5
+ import asyncio
6
+ import base64
7
+ import gc
8
+ import json
9
+ import logging
10
+ import os
11
+ import re
12
+ import time
13
+ import uuid
14
+ from dataclasses import dataclass, field
15
+ from typing import TYPE_CHECKING, Optional, TypedDict
16
+
17
+ from playwright._impl._errors import TimeoutError
18
+ from playwright.async_api import Browser as PlaywrightBrowser
19
+ from playwright.async_api import (
20
+ BrowserContext as PlaywrightBrowserContext,
21
+ )
22
+ from playwright.async_api import (
23
+ ElementHandle,
24
+ FrameLocator,
25
+ Page,
26
+ )
27
+
28
+ from browser_use.browser.views import (
29
+ BrowserError,
30
+ BrowserState,
31
+ TabInfo,
32
+ URLNotAllowedError,
33
+ )
34
+ from browser_use.dom.service import DomService
35
+ from browser_use.dom.views import DOMElementNode, SelectorMap
36
+ from browser_use.utils import time_execution_async, time_execution_sync
37
+
38
+ if TYPE_CHECKING:
39
+ from browser_use.browser.browser import Browser
40
+
41
+ logger = logging.getLogger(__name__)
42
+
43
+
44
class BrowserContextWindowSize(TypedDict):
    # Dict shape of a window size; the documented default is 1280 x 1100
    width: int
    height: int
47
+
48
+
49
@dataclass
class BrowserContextConfig:
    """
    Configuration for the BrowserContext.

    Default values:
        cookies_file: None
            Path to cookies file for persistence

        disable_security: True
            Disable browser security features

        minimum_wait_page_load_time: 0.25
            Minimum time to wait before getting page state for LLM input

        wait_for_network_idle_page_load_time: 0.5
            Time to wait for network requests to finish before getting page state.
            Lower values may result in incomplete page loads.

        maximum_wait_page_load_time: 5.0
            Maximum time to wait for page load before proceeding anyway

        wait_between_actions: 0.5
            Time to wait between multiple per step actions

        browser_window_size: {
                'width': 1280,
                'height': 1100,
            }
            Default browser window size

        no_viewport: None
            Disable viewport

        save_recording_path: None
            Path to save video recordings

        save_downloads_path: None
            Path to save downloads to

        trace_path: None
            Path to save trace files. It will auto name the file with the TRACE_PATH/{context_id}.zip

        locale: None
            Specify user locale, for example en-GB, de-DE, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. If not provided, defaults to the system default locale.

        user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
            custom user agent to use.

        highlight_elements: True
            Highlight elements in the DOM on the screen

        viewport_expansion: 500
            Viewport expansion in pixels. This amount will increase the number of elements which are included in the state what the LLM will see. If set to -1, all elements will be included (this leads to high token usage). If set to 0, only the elements which are visible in the viewport will be included.

        allowed_domains: None
            List of allowed domains that can be accessed. If None, all domains are allowed.
            Example: ['example.com', 'api.example.com']

        include_dynamic_attributes: True
            Include dynamic attributes in the CSS selector. If you want to reuse the css_selectors, it might be better to set this to False.
    """

    cookies_file: str | None = None
    minimum_wait_page_load_time: float = 0.25
    wait_for_network_idle_page_load_time: float = 0.5
    maximum_wait_page_load_time: float = 5
    wait_between_actions: float = 0.5

    disable_security: bool = True

    # default_factory gives every config object its own size dict
    browser_window_size: BrowserContextWindowSize = field(default_factory=lambda: {'width': 1280, 'height': 1100})
    no_viewport: Optional[bool] = None

    save_recording_path: str | None = None
    save_downloads_path: str | None = None
    trace_path: str | None = None
    locale: str | None = None
    user_agent: str = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
    )

    highlight_elements: bool = True
    viewport_expansion: int = 500
    allowed_domains: list[str] | None = None
    include_dynamic_attributes: bool = True

    # When True, BrowserContext.close() skips closing the underlying Playwright context
    _force_keep_context_alive: bool = False
137
+
138
+
139
@dataclass
class BrowserSession:
    # The live Playwright browser context of this session
    context: PlaywrightBrowserContext
    # Most recent browser state, or None; presumably a cache of the last computed state - verify against the writer
    cached_state: BrowserState | None
143
+
144
+
145
@dataclass
class BrowserContextState:
    """
    State of the browser context
    """

    # None until a target has been recorded
    target_id: str | None = None  # CDP target ID
152
+
153
+
154
+ class BrowserContext:
155
+ def __init__(
156
+ self,
157
+ browser: 'Browser',
158
+ config: BrowserContextConfig = BrowserContextConfig(),
159
+ state: Optional[BrowserContextState] = None,
160
+ ):
161
+ self.context_id = str(uuid.uuid4())
162
+ logger.debug(f'Initializing new browser context with id: {self.context_id}')
163
+
164
+ self.config = config
165
+ self.browser = browser
166
+
167
+ self.state = state or BrowserContextState()
168
+
169
+ # Initialize these as None - they'll be set up when needed
170
+ self.session: BrowserSession | None = None
171
+
172
+ async def __aenter__(self):
173
+ """Async context manager entry"""
174
+ await self._initialize_session()
175
+ return self
176
+
177
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
178
+ """Async context manager exit"""
179
+ await self.close()
180
+
181
@time_execution_async('--close')
async def close(self):
    """
    Close this browser context and drop the session reference.

    Does nothing beyond clearing references when no session exists. Otherwise
    the stored 'page' listener is detached, cookies are saved, tracing is
    stopped (when ``config.trace_path`` is set) and the Playwright context is
    closed unless ``config._force_keep_context_alive`` is set.

    Failures while detaching the listener, stopping the trace or closing the
    context are logged at debug level and do not propagate. An exception from
    ``save_cookies`` still propagates, but only after the trace/close steps
    have run.
    """
    logger.debug('Closing browser context')

    try:
        if self.session is None:
            return

        # Detach the 'page' listener that was registered on the context, if any
        if self._page_event_handler and self.session.context:
            try:
                self.session.context.remove_listener('page', self._page_event_handler)
            except Exception as e:
                logger.debug(f'Failed to remove CDP listener: {e}')
            self._page_event_handler = None

        try:
            await self.save_cookies()
        finally:
            # Run the shutdown steps even if saving cookies failed: the outer
            # ``finally`` drops the session reference, so skipping them would
            # leave the context open with nothing left to close it.
            if self.config.trace_path:
                try:
                    await self.session.context.tracing.stop(
                        path=os.path.join(self.config.trace_path, f'{self.context_id}.zip')
                    )
                except Exception as e:
                    logger.debug(f'Failed to stop tracing: {e}')

            if not self.config._force_keep_context_alive:
                try:
                    await self.session.context.close()
                except Exception as e:
                    logger.debug(f'Failed to close context: {e}')

    finally:
        # Dereference everything
        self.session = None
        self._page_event_handler = None
218
+
219
+ def __del__(self):
220
+ """Cleanup when object is destroyed"""
221
+ if not self.config._force_keep_context_alive and self.session is not None:
222
+ logger.debug('BrowserContext was not properly closed before destruction')
223
+ try:
224
+ # Use sync Playwright method for force cleanup
225
+ if hasattr(self.session.context, '_impl_obj'):
226
+ asyncio.run(self.session.context._impl_obj.close())
227
+
228
+ self.session = None
229
+ gc.collect()
230
+ except Exception as e:
231
+ logger.warning(f'Failed to force close browser context: {e}')
232
+
233
@time_execution_async('--initialize_session')
async def _initialize_session(self):
    """
    Create the Playwright context, pick the active page and store the session.

    With a CDP connection and a saved target id, the page whose URL matches
    that target is preferred. Otherwise the first existing page is used, or a
    new one is opened, and (with CDP) the id of the target with the same URL
    is saved. The chosen page is brought to front and awaited until 'load'.

    Returns:
        The newly created BrowserSession (also stored on ``self.session``).
    """
    logger.debug('Initializing browser context')

    playwright_browser = await self.browser.get_playwright_browser()
    context = await self._create_context(playwright_browser)
    self._page_event_handler = None

    # Pages already open in the context
    open_pages = context.pages

    self.session = BrowserSession(context=context, cached_state=None)

    active_page = None
    if self.browser.config.cdp_url and self.state.target_id:
        # Try to resume on the page that belongs to the saved target
        targets = await self._get_cdp_targets()
        saved_target = next((t for t in targets if t['targetId'] == self.state.target_id), None)
        if saved_target is not None:
            # Targets are matched to pages by URL
            active_page = next((p for p in open_pages if p.url == saved_target['url']), None)

    # No saved target, or it could not be matched: use an existing page or create one
    if not active_page:
        if open_pages:
            active_page = open_pages[0]
            logger.debug('Using existing page')
        else:
            active_page = await context.new_page()
            logger.debug('Created new page')

        # Remember the target id of the page that was picked
        if self.browser.config.cdp_url:
            targets = await self._get_cdp_targets()
            picked_target = next((t for t in targets if t['url'] == active_page.url), None)
            if picked_target is not None:
                self.state.target_id = picked_target['targetId']

    await active_page.bring_to_front()
    await active_page.wait_for_load_state('load')

    return self.session
286
+
287
+ def _add_new_page_listener(self, context: PlaywrightBrowserContext):
288
+ async def on_page(page: Page):
289
+ if self.browser.config.cdp_url:
290
+ await page.reload() # Reload the page to avoid timeout errors
291
+ await page.wait_for_load_state()
292
+ logger.debug(f'New page opened: {page.url}')
293
+ if self.session is not None:
294
+ self.state.target_id = None
295
+
296
+ self._page_event_handler = on_page
297
+ context.on('page', on_page)
298
+
299
async def get_session(self) -> BrowserSession:
    """Return the current session, initializing it on first use."""
    existing = self.session
    if existing is not None:
        return existing
    return await self._initialize_session()
304
+
305
async def get_current_page(self) -> Page:
    """Return the current page of the (lazily created) session."""
    return await self._get_current_page(await self.get_session())
309
+
310
+ async def _create_context(self, browser: PlaywrightBrowser):
311
+ """Creates a new browser context with anti-detection measures and loads cookies if available."""
312
+ if self.browser.config.cdp_url and len(browser.contexts) > 0:
313
+ context = browser.contexts[0]
314
+ elif self.browser.config.chrome_instance_path and len(browser.contexts) > 0:
315
+ # Connect to existing Chrome instance instead of creating new one
316
+ context = browser.contexts[0]
317
+ else:
318
+ # Original code for creating new context
319
+ context = await browser.new_context(
320
+ viewport=self.config.browser_window_size,
321
+ no_viewport=False,
322
+ user_agent=self.config.user_agent,
323
+ java_script_enabled=True,
324
+ bypass_csp=self.config.disable_security,
325
+ ignore_https_errors=self.config.disable_security,
326
+ record_video_dir=self.config.save_recording_path,
327
+ record_video_size=self.config.browser_window_size,
328
+ locale=self.config.locale,
329
+ )
330
+
331
+ if self.config.trace_path:
332
+ await context.tracing.start(screenshots=True, snapshots=True, sources=True)
333
+
334
+ # Load cookies if they exist
335
+ if self.config.cookies_file and os.path.exists(self.config.cookies_file):
336
+ with open(self.config.cookies_file, 'r') as f:
337
+ cookies = json.load(f)
338
+ logger.info(f'Loaded {len(cookies)} cookies from {self.config.cookies_file}')
339
+ await context.add_cookies(cookies)
340
+
341
+ # Expose anti-detection scripts
342
+ await context.add_init_script(
343
+ """
344
+ // Webdriver property
345
+ Object.defineProperty(navigator, 'webdriver', {
346
+ get: () => undefined
347
+ });
348
+
349
+ // Languages
350
+ Object.defineProperty(navigator, 'languages', {
351
+ get: () => ['en-US']
352
+ });
353
+
354
+ // Plugins
355
+ Object.defineProperty(navigator, 'plugins', {
356
+ get: () => [1, 2, 3, 4, 5]
357
+ });
358
+
359
+ // Chrome runtime
360
+ window.chrome = { runtime: {} };
361
+
362
+ // Permissions
363
+ const originalQuery = window.navigator.permissions.query;
364
+ window.navigator.permissions.query = (parameters) => (
365
+ parameters.name === 'notifications' ?
366
+ Promise.resolve({ state: Notification.permission }) :
367
+ originalQuery(parameters)
368
+ );
369
+ (function () {
370
+ const originalAttachShadow = Element.prototype.attachShadow;
371
+ Element.prototype.attachShadow = function attachShadow(options) {
372
+ return originalAttachShadow.call(this, { ...options, mode: "open" });
373
+ };
374
+ })();
375
+ """
376
+ )
377
+
378
+ return context
379
+
380
+ async def _wait_for_stable_network(self):
381
+ page = await self.get_current_page()
382
+
383
+ pending_requests = set()
384
+ last_activity = asyncio.get_event_loop().time()
385
+
386
+ # Define relevant resource types and content types
387
+ RELEVANT_RESOURCE_TYPES = {
388
+ 'document',
389
+ 'stylesheet',
390
+ 'image',
391
+ 'font',
392
+ 'script',
393
+ 'iframe',
394
+ }
395
+
396
+ RELEVANT_CONTENT_TYPES = {
397
+ 'text/html',
398
+ 'text/css',
399
+ 'application/javascript',
400
+ 'image/',
401
+ 'font/',
402
+ 'application/json',
403
+ }
404
+
405
+ # Additional patterns to filter out
406
+ IGNORED_URL_PATTERNS = {
407
+ # Analytics and tracking
408
+ 'analytics',
409
+ 'tracking',
410
+ 'telemetry',
411
+ 'beacon',
412
+ 'metrics',
413
+ # Ad-related
414
+ 'doubleclick',
415
+ 'adsystem',
416
+ 'adserver',
417
+ 'advertising',
418
+ # Social media widgets
419
+ 'facebook.com/plugins',
420
+ 'platform.twitter',
421
+ 'linkedin.com/embed',
422
+ # Live chat and support
423
+ 'livechat',
424
+ 'zendesk',
425
+ 'intercom',
426
+ 'crisp.chat',
427
+ 'hotjar',
428
+ # Push notifications
429
+ 'push-notifications',
430
+ 'onesignal',
431
+ 'pushwoosh',
432
+ # Background sync/heartbeat
433
+ 'heartbeat',
434
+ 'ping',
435
+ 'alive',
436
+ # WebRTC and streaming
437
+ 'webrtc',
438
+ 'rtmp://',
439
+ 'wss://',
440
+ # Common CDNs for dynamic content
441
+ 'cloudfront.net',
442
+ 'fastly.net',
443
+ }
444
+
445
+ async def on_request(request):
446
+ # Filter by resource type
447
+ if request.resource_type not in RELEVANT_RESOURCE_TYPES:
448
+ return
449
+
450
+ # Filter out streaming, websocket, and other real-time requests
451
+ if request.resource_type in {
452
+ 'websocket',
453
+ 'media',
454
+ 'eventsource',
455
+ 'manifest',
456
+ 'other',
457
+ }:
458
+ return
459
+
460
+ # Filter out by URL patterns
461
+ url = request.url.lower()
462
+ if any(pattern in url for pattern in IGNORED_URL_PATTERNS):
463
+ return
464
+
465
+ # Filter out data URLs and blob URLs
466
+ if url.startswith(('data:', 'blob:')):
467
+ return
468
+
469
+ # Filter out requests with certain headers
470
+ headers = request.headers
471
+ if headers.get('purpose') == 'prefetch' or headers.get('sec-fetch-dest') in [
472
+ 'video',
473
+ 'audio',
474
+ ]:
475
+ return
476
+
477
+ nonlocal last_activity
478
+ pending_requests.add(request)
479
+ last_activity = asyncio.get_event_loop().time()
480
+ # logger.debug(f'Request started: {request.url} ({request.resource_type})')
481
+
482
+ async def on_response(response):
483
+ request = response.request
484
+ if request not in pending_requests:
485
+ return
486
+
487
+ # Filter by content type if available
488
+ content_type = response.headers.get('content-type', '').lower()
489
+
490
+ # Skip if content type indicates streaming or real-time data
491
+ if any(
492
+ t in content_type
493
+ for t in [
494
+ 'streaming',
495
+ 'video',
496
+ 'audio',
497
+ 'webm',
498
+ 'mp4',
499
+ 'event-stream',
500
+ 'websocket',
501
+ 'protobuf',
502
+ ]
503
+ ):
504
+ pending_requests.remove(request)
505
+ return
506
+
507
+ # Only process relevant content types
508
+ if not any(ct in content_type for ct in RELEVANT_CONTENT_TYPES):
509
+ pending_requests.remove(request)
510
+ return
511
+
512
+ # Skip if response is too large (likely not essential for page load)
513
+ content_length = response.headers.get('content-length')
514
+ if content_length and int(content_length) > 5 * 1024 * 1024: # 5MB
515
+ pending_requests.remove(request)
516
+ return
517
+
518
+ nonlocal last_activity
519
+ pending_requests.remove(request)
520
+ last_activity = asyncio.get_event_loop().time()
521
+ # logger.debug(f'Request resolved: {request.url} ({content_type})')
522
+
523
+ # Attach event listeners
524
+ page.on('request', on_request)
525
+ page.on('response', on_response)
526
+
527
+ try:
528
+ # Wait for idle time
529
+ start_time = asyncio.get_event_loop().time()
530
+ while True:
531
+ await asyncio.sleep(0.1)
532
+ now = asyncio.get_event_loop().time()
533
+ if len(pending_requests) == 0 and (now - last_activity) >= self.config.wait_for_network_idle_page_load_time:
534
+ break
535
+ if now - start_time > self.config.maximum_wait_page_load_time:
536
+ logger.debug(
537
+ f'Network timeout after {self.config.maximum_wait_page_load_time}s with {len(pending_requests)} '
538
+ f'pending requests: {[r.url for r in pending_requests]}'
539
+ )
540
+ break
541
+
542
+ finally:
543
+ # Clean up event listeners
544
+ page.remove_listener('request', on_request)
545
+ page.remove_listener('response', on_response)
546
+
547
+ logger.debug(f'Network stabilized for {self.config.wait_for_network_idle_page_load_time} seconds')
548
+
549
+ async def _wait_for_page_and_frames_load(self, timeout_overwrite: float | None = None):
550
+ """
551
+ Ensures page is fully loaded before continuing.
552
+ Waits for either network to be idle or minimum WAIT_TIME, whichever is longer.
553
+ Also checks if the loaded URL is allowed.
554
+ """
555
+ # Start timing
556
+ start_time = time.time()
557
+
558
+ # Wait for page load
559
+ try:
560
+ await self._wait_for_stable_network()
561
+
562
+ # Check if the loaded URL is allowed
563
+ page = await self.get_current_page()
564
+ await self._check_and_handle_navigation(page)
565
+ except URLNotAllowedError as e:
566
+ raise e
567
+ except Exception:
568
+ logger.warning('Page load failed, continuing...')
569
+ pass
570
+
571
+ # Calculate remaining time to meet minimum WAIT_TIME
572
+ elapsed = time.time() - start_time
573
+ remaining = max((timeout_overwrite or self.config.minimum_wait_page_load_time) - elapsed, 0)
574
+
575
+ logger.debug(f'--Page loaded in {elapsed:.2f} seconds, waiting for additional {remaining:.2f} seconds')
576
+
577
+ # Sleep remaining time if needed
578
+ if remaining > 0:
579
+ await asyncio.sleep(remaining)
580
+
581
+ def _is_url_allowed(self, url: str) -> bool:
582
+ """Check if a URL is allowed based on the whitelist configuration."""
583
+ if not self.config.allowed_domains:
584
+ return True
585
+
586
+ try:
587
+ from urllib.parse import urlparse
588
+
589
+ parsed_url = urlparse(url)
590
+ domain = parsed_url.netloc.lower()
591
+
592
+ # Remove port number if present
593
+ if ':' in domain:
594
+ domain = domain.split(':')[0]
595
+
596
+ # Check if domain matches any allowed domain pattern
597
+ return any(
598
+ domain == allowed_domain.lower() or domain.endswith('.' + allowed_domain.lower())
599
+ for allowed_domain in self.config.allowed_domains
600
+ )
601
+ except Exception as e:
602
+ logger.error(f'Error checking URL allowlist: {str(e)}')
603
+ return False
604
+
605
+ async def _check_and_handle_navigation(self, page: Page) -> None:
606
+ """Check if current page URL is allowed and handle if not."""
607
+ if not self._is_url_allowed(page.url):
608
+ logger.warning(f'Navigation to non-allowed URL detected: {page.url}')
609
+ try:
610
+ await self.go_back()
611
+ except Exception as e:
612
+ logger.error(f'Failed to go back after detecting non-allowed URL: {str(e)}')
613
+ raise URLNotAllowedError(f'Navigation to non-allowed URL: {page.url}')
614
+
615
async def navigate_to(self, url: str):
    """Open ``url`` in the current page; raises BrowserError if the URL is not allowed."""
    if not self._is_url_allowed(url):
        raise BrowserError(f'Navigation to non-allowed URL: {url}')

    current_page = await self.get_current_page()
    await current_page.goto(url)
    await current_page.wait_for_load_state()
623
+
624
async def refresh_page(self):
    """Reload the current page and wait for it to finish loading."""
    current_page = await self.get_current_page()
    await current_page.reload()
    await current_page.wait_for_load_state()
629
+
630
async def go_back(self):
    """
    Go one step back in the current page's history.

    Errors (including the timeout) are logged at debug level and not raised.
    """
    current_page = await self.get_current_page()
    try:
        # 10 ms timeout: do not block here, we wait later for the page to load
        await current_page.go_back(timeout=10, wait_until='domcontentloaded')
    except Exception as error:
        logger.debug(f'During go_back: {error}')
640
+
641
async def go_forward(self):
    """
    Go one step forward in the current page's history.

    Errors (including the timeout) are logged at debug level and not raised.
    """
    current_page = await self.get_current_page()
    try:
        # 10 ms timeout: do not block here, we wait later for the page to load
        await current_page.go_forward(timeout=10, wait_until='domcontentloaded')
    except Exception as error:
        logger.debug(f'During go_forward: {error}')
649
+
650
async def close_current_tab(self):
    """Close the current tab and, if any tabs remain, switch to the first one."""
    session = await self.get_session()
    current_tab = await self._get_current_page(session)
    await current_tab.close()

    # With no tabs left there is nothing to switch to (the browser will be closed)
    remaining_tabs = session.context.pages
    if remaining_tabs:
        await self.switch_to_tab(0)
661
+
662
async def get_page_html(self) -> str:
    """Return the HTML content of the current page."""
    current_page = await self.get_current_page()
    html = await current_page.content()
    return html
666
+
667
async def execute_javascript(self, script: str):
    """Evaluate ``script`` on the current page and return the evaluation result."""
    current_page = await self.get_current_page()
    result = await current_page.evaluate(script)
    return result
671
+
672
async def get_page_structure(self) -> str:
    """
    Return a textual debug outline of the current page's element tree.

    The outline is produced by a script evaluated in the page. It lists each
    element (tag, id, classes and a few attributes) one per line, skips
    script/style/link/meta/noscript elements, descends at most 10 levels and
    descends into iframes whose document is accessible.
    """
    outline_script = """(() => {
        function getPageStructure(element = document, depth = 0, maxDepth = 10) {
            if (depth >= maxDepth) return '';

            const indent = ' '.repeat(depth);
            let structure = '';

            // Skip certain elements that clutter the output
            const skipTags = new Set(['script', 'style', 'link', 'meta', 'noscript']);

            // Add current element info if it's not the document
            if (element !== document) {
                const tagName = element.tagName.toLowerCase();

                // Skip uninteresting elements
                if (skipTags.has(tagName)) return '';

                const id = element.id ? `#${element.id}` : '';
                const classes = element.className && typeof element.className === 'string' ?
                    `.${element.className.split(' ').filter(c => c).join('.')}` : '';

                // Get additional useful attributes
                const attrs = [];
                if (element.getAttribute('role')) attrs.push(`role="${element.getAttribute('role')}"`);
                if (element.getAttribute('aria-label')) attrs.push(`aria-label="${element.getAttribute('aria-label')}"`);
                if (element.getAttribute('type')) attrs.push(`type="${element.getAttribute('type')}"`);
                if (element.getAttribute('name')) attrs.push(`name="${element.getAttribute('name')}"`);
                if (element.getAttribute('src')) {
                    const src = element.getAttribute('src');
                    attrs.push(`src="${src.substring(0, 50)}${src.length > 50 ? '...' : ''}"`);
                }

                // Add element info
                structure += `${indent}${tagName}${id}${classes}${attrs.length ? ' [' + attrs.join(', ') + ']' : ''}\\n`;

                // Handle iframes specially
                if (tagName === 'iframe') {
                    try {
                        const iframeDoc = element.contentDocument || element.contentWindow?.document;
                        if (iframeDoc) {
                            structure += `${indent} [IFRAME CONTENT]:\\n`;
                            structure += getPageStructure(iframeDoc, depth + 2, maxDepth);
                        } else {
                            structure += `${indent} [IFRAME: No access - likely cross-origin]\\n`;
                        }
                    } catch (e) {
                        structure += `${indent} [IFRAME: Access denied - ${e.message}]\\n`;
                    }
                }
            }

            // Get all child elements
            const children = element.children || element.childNodes;
            for (const child of children) {
                if (child.nodeType === 1) { // Element nodes only
                    structure += getPageStructure(child, depth + 1, maxDepth);
                }
            }

            return structure;
        }

        return getPageStructure();
    })()"""

    current_page = await self.get_current_page()
    return await current_page.evaluate(outline_script)
742
+
743
@time_execution_async('--get_state')
async def get_state(self) -> BrowserState:
    """
    Wait for the page to settle, rebuild the browser state and cache it on the session.

    When ``config.cookies_file`` is set, saving the cookies is started as a
    background task and not awaited here.

    Returns:
        The freshly computed state (also stored as ``session.cached_state``).
    """
    # Timed with the async decorator: the sync variant would wrap the call
    # that merely creates the coroutine, not its execution.
    await self._wait_for_page_and_frames_load()
    session = await self.get_session()
    session.cached_state = await self._update_state()

    # Save cookies if a file is specified
    if self.config.cookies_file:
        # Keep a reference to the task: the event loop only holds weak
        # references, so an unreferenced task may be garbage collected
        # before it has finished.
        self._save_cookies_task = asyncio.create_task(self.save_cookies())

    return session.cached_state
755
+
756
async def _update_state(self, focus_element: int = -1) -> BrowserState:
    """
    Build a new BrowserState for the current page and remember it on ``self``.

    If the current page can no longer evaluate scripts, the saved target id
    is cleared and the page is looked up again. If building the state fails,
    the last successfully built state is returned when one exists; otherwise
    the error is re-raised.

    Args:
        focus_element: Passed through to ``DomService.get_clickable_elements``.

    Raises:
        BrowserError: If the current page is inaccessible and the context has no pages.
    """
    session = await self.get_session()

    # Make sure we hold a page that is still usable
    try:
        page = await self.get_current_page()
        # Cheap probe: fails if the page is closed or otherwise inaccessible
        await page.evaluate('1')
    except Exception as e:
        logger.debug(f'Current page is no longer accessible: {str(e)}')
        if not session.context.pages:
            raise BrowserError('Browser closed: no valid pages available')
        self.state.target_id = None
        page = await self._get_current_page(session)
        logger.debug(f'Switched to page: {await page.title()}')

    try:
        await self.remove_highlights()
        dom_state = await DomService(page).get_clickable_elements(
            focus_element=focus_element,
            viewport_expansion=self.config.viewport_expansion,
            highlight_elements=self.config.highlight_elements,
        )

        screenshot_b64 = await self.take_screenshot()
        pixels_above, pixels_below = await self.get_scroll_info(page)

        self.current_state = BrowserState(
            element_tree=dom_state.element_tree,
            selector_map=dom_state.selector_map,
            url=page.url,
            title=await page.title(),
            tabs=await self.get_tabs_info(),
            screenshot=screenshot_b64,
            pixels_above=pixels_above,
            pixels_below=pixels_below,
        )
    except Exception as e:
        logger.error(f'Failed to update state: {str(e)}')
        # Fall back to the last known good state if there is one
        if hasattr(self, 'current_state'):
            return self.current_state
        raise
    else:
        return self.current_state
806
+
807
+ # region - Browser Actions
808
@time_execution_async('--take_screenshot')
async def take_screenshot(self, full_page: bool = False) -> str:
    """
    Capture the current page and return the image as a base64 string.

    Args:
        full_page: Forwarded to Playwright's ``page.screenshot``.
    """
    current_page = await self.get_current_page()

    # The page is brought to the front and must have finished loading first
    await current_page.bring_to_front()
    await current_page.wait_for_load_state()

    raw_image = await current_page.screenshot(full_page=full_page, animations='disabled')
    return base64.b64encode(raw_image).decode('utf-8')
828
+
829
@time_execution_async('--remove_highlights')
async def remove_highlights(self):
    """
    Remove the highlight container and highlight attributes from the current page.

    Best effort: any failure (for example a closed or inaccessible page) is
    logged at debug level and never raised.
    """
    cleanup_script = """
        try {
            // Remove the highlight container and all its contents
            const container = document.getElementById('playwright-highlight-container');
            if (container) {
                container.remove();
            }

            // Remove highlight attributes from elements
            const highlightedElements = document.querySelectorAll('[browser-user-highlight-id^="playwright-highlight-"]');
            highlightedElements.forEach(el => {
                el.removeAttribute('browser-user-highlight-id');
            });
        } catch (e) {
            console.error('Failed to remove highlights:', e);
        }
        """
    try:
        current_page = await self.get_current_page()
        await current_page.evaluate(cleanup_script)
    except Exception as e:
        # Not critical functionality, so the error is not propagated
        logger.debug(f'Failed to remove highlights (this is usually ok): {str(e)}')
860
+
861
+ # endregion
862
+
863
+ # region - User Actions
864
+
865
+ @classmethod
866
+ def _convert_simple_xpath_to_css_selector(cls, xpath: str) -> str:
867
+ """Converts simple XPath expressions to CSS selectors."""
868
+ if not xpath:
869
+ return ''
870
+
871
+ # Remove leading slash if present
872
+ xpath = xpath.lstrip('/')
873
+
874
+ # Split into parts
875
+ parts = xpath.split('/')
876
+ css_parts = []
877
+
878
+ for part in parts:
879
+ if not part:
880
+ continue
881
+
882
+ # Handle index notation [n]
883
+ if '[' in part:
884
+ base_part = part[: part.find('[')]
885
+ index_part = part[part.find('[') :]
886
+
887
+ # Handle multiple indices
888
+ indices = [i.strip('[]') for i in index_part.split(']')[:-1]]
889
+
890
+ for idx in indices:
891
+ try:
892
+ # Handle numeric indices
893
+ if idx.isdigit():
894
+ index = int(idx) - 1
895
+ base_part += f':nth-of-type({index + 1})'
896
+ # Handle last() function
897
+ elif idx == 'last()':
898
+ base_part += ':last-of-type'
899
+ # Handle position() functions
900
+ elif 'position()' in idx:
901
+ if '>1' in idx:
902
+ base_part += ':nth-of-type(n+2)'
903
+ except ValueError:
904
+ continue
905
+
906
+ css_parts.append(base_part)
907
+ else:
908
+ css_parts.append(part)
909
+
910
+ base_selector = ' > '.join(css_parts)
911
+ return base_selector
912
+
913
@classmethod
@time_execution_sync('--enhanced_css_selector_for_element')
def _enhanced_css_selector_for_element(cls, element: DOMElementNode, include_dynamic_attributes: bool = True) -> str:
    """
    Build a CSS selector for a DOM element from its XPath, classes and attributes.

    Args:
        element: The DOM element to create a selector for.
        include_dynamic_attributes: When True, class names and the
            ``data-id`` / ``data-qa`` / ``data-cy`` / ``data-testid``
            attributes are added to the selector as well.

    Returns:
        The CSS selector string. If anything fails while building it, a basic
        ``tag[highlight_index='...']`` selector is returned instead.
    """
    try:
        # Start from the structural selector derived from the XPath
        pieces = [cls._convert_simple_xpath_to_css_selector(element.xpath)]

        # Class names, restricted to those that are valid CSS identifiers
        if 'class' in element.attributes and element.attributes['class'] and include_dynamic_attributes:
            valid_class_name = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_-]*$')
            pieces.extend(
                f'.{class_name}'
                for class_name in element.attributes['class'].split()
                if valid_class_name.match(class_name)
            )

        # Attributes that may be used in the selector
        selectable_attributes = {
            'id',
            # Standard HTML attributes
            'name',
            'type',
            'placeholder',
            # Accessibility attributes
            'aria-label',
            'aria-labelledby',
            'aria-describedby',
            'role',
            # Common form attributes
            'for',
            'autocomplete',
            'required',
            'readonly',
            # Media attributes
            'alt',
            'title',
            'src',
            # Link attributes
            'href',
            'target',
        }
        if include_dynamic_attributes:
            selectable_attributes |= {
                'data-id',
                'data-qa',
                'data-cy',
                'data-testid',
            }

        for attribute, value in element.attributes.items():
            # Classes were handled above; blank or non-selectable names are skipped
            if attribute == 'class' or not attribute.strip() or attribute not in selectable_attributes:
                continue

            # Escape special characters in attribute names
            safe_attribute = attribute.replace(':', r'\:')

            if value == '':
                # Presence-only match for empty values
                pieces.append(f'[{safe_attribute}]')
            elif any(char in value for char in '"\'<>`\n\r\t'):
                # Values with special characters use a "contains" match on the
                # whitespace-collapsed value, with embedded double quotes escaped
                collapsed_value = re.sub(r'\s+', ' ', value).strip()
                safe_value = collapsed_value.replace('"', '\\"')
                pieces.append(f'[{safe_attribute}*="{safe_value}"]')
            else:
                pieces.append(f'[{safe_attribute}="{value}"]')

        return ''.join(pieces)

    except Exception:
        # Fall back to a more basic selector if something goes wrong
        tag_name = element.tag_name or '*'
        return f"{tag_name}[highlight_index='{element.highlight_index}']"
1019
+
1020
@time_execution_async('--get_locate_element')
async def get_locate_element(self, element: DOMElementNode) -> Optional[ElementHandle]:
    """
    Resolve a DOM node to a Playwright element handle on the current page.

    Iframe ancestors are entered from the outermost inwards before the
    element's own selector is queried. Outside of iframes the element is
    scrolled into view when found.

    Returns:
        The element handle, or None when the element is not found or the lookup fails.
    """
    current_frame = await self.get_current_page()

    # Collect every ancestor of the element, then order them outermost first
    ancestors: list[DOMElementNode] = []
    node = element
    while node.parent is not None:
        node = node.parent
        ancestors.append(node)
    ancestors.reverse()

    # Step into each iframe ancestor in turn
    iframe_ancestors = [ancestor for ancestor in ancestors if ancestor.tag_name == 'iframe']
    for iframe in iframe_ancestors:
        iframe_selector = self._enhanced_css_selector_for_element(
            iframe,
            include_dynamic_attributes=self.config.include_dynamic_attributes,
        )
        current_frame = current_frame.frame_locator(iframe_selector)

    target_selector = self._enhanced_css_selector_for_element(
        element, include_dynamic_attributes=self.config.include_dynamic_attributes
    )

    try:
        if isinstance(current_frame, FrameLocator):
            return await current_frame.locator(target_selector).element_handle()

        handle = await current_frame.query_selector(target_selector)
        if not handle:
            return None
        # Try to scroll into view if hidden
        await handle.scroll_into_view_if_needed()
        return handle
    except Exception as e:
        logger.error(f'Failed to locate element: {str(e)}')
        return None
1062
+
1063
@time_execution_async('--input_text_element_node')
async def _input_text_element_node(self, element_node: DOMElementNode, text: str):
    """
    Type ``text`` into the element described by ``element_node``.

    Editable elements (contenteditable or ``<input>``, and neither read-only
    nor disabled) are cleared and then typed into key by key; every other
    element is filled with Playwright's ``fill``.

    Args:
        element_node: DOM node of the target element.
        text: Text to enter.

    Raises:
        BrowserError: If the element cannot be found or the input fails. The
            original exception is attached as the cause.
    """
    try:
        element_handle = await self.get_locate_element(element_node)

        if element_handle is None:
            raise BrowserError(f'Element: {repr(element_node)} not found')

        # Ensure element is ready for input (best effort: a timeout here is
        # not fatal, the input below is still attempted)
        try:
            await element_handle.wait_for_element_state('stable', timeout=1000)
            await element_handle.scroll_into_view_if_needed(timeout=1000)
        except Exception:
            pass

        # Get element properties to determine input method
        tag_handle = await element_handle.get_property("tagName")
        tag_name = (await tag_handle.json_value()).lower()
        is_contenteditable = await element_handle.get_property('isContentEditable')
        readonly_handle = await element_handle.get_property("readOnly")
        disabled_handle = await element_handle.get_property("disabled")

        readonly = await readonly_handle.json_value() if readonly_handle else False
        disabled = await disabled_handle.json_value() if disabled_handle else False

        if (await is_contenteditable.json_value() or tag_name == 'input') and not (readonly or disabled):
            # Clear both properties: an <input> keeps its text in ``value``, so
            # resetting only ``textContent`` would append to the existing value.
            await element_handle.evaluate('el => { el.textContent = ""; el.value = ""; }')
            await element_handle.type(text, delay=5)
        else:
            await element_handle.fill(text)

    except Exception as e:
        logger.debug(f'Failed to input text into element: {repr(element_node)}. Error: {str(e)}')
        raise BrowserError(f'Failed to input text into index {element_node.highlight_index}') from e
1105
+
1106
@time_execution_async('--click_element_node')
async def _click_element_node(self, element_node: DOMElementNode) -> Optional[str]:
    """
    Click the page element described by ``element_node``.

    A regular element click is attempted first; if it fails, a JavaScript
    ``el.click()`` is used as a fallback. When ``self.config.save_downloads_path``
    is set, a download triggered by the click is saved there.

    Args:
        element_node: DOM node describing the element to click.

    Returns:
        The path of the saved download if the click triggered one, otherwise None.

    Raises:
        URLNotAllowedError: Re-raised unchanged (presumably raised by
            ``_check_and_handle_navigation`` - confirm against that helper).
        Exception: Any other failure, wrapped with a description of the element
            and chained to the original exception.
    """
    page = await self.get_current_page()

    try:
        element_handle = await self.get_locate_element(element_node)

        if element_handle is None:
            raise Exception(f'Element: {repr(element_node)} not found')

        async def perform_click(click_func):
            """Performs the actual click, handling both download
            and navigation scenarios."""
            if self.config.save_downloads_path:
                try:
                    # Try short-timeout expect_download to detect whether a file download has been triggered
                    async with page.expect_download(timeout=5000) as download_info:
                        await click_func()
                    download = await download_info.value
                    # Determine file path
                    suggested_filename = download.suggested_filename
                    unique_filename = await self._get_unique_filename(self.config.save_downloads_path, suggested_filename)
                    download_path = os.path.join(self.config.save_downloads_path, unique_filename)
                    await download.save_as(download_path)
                    logger.debug(f'Download triggered. Saved file to: {download_path}')
                    return download_path
                except TimeoutError:
                    # If no download is triggered, treat as normal click
                    logger.debug('No download triggered within timeout. Checking navigation...')
                    await page.wait_for_load_state()
                    await self._check_and_handle_navigation(page)
            else:
                # Standard click logic if no download is expected
                await click_func()
                await page.wait_for_load_state()
                await self._check_and_handle_navigation(page)

        try:
            return await perform_click(lambda: element_handle.click(timeout=1500))
        except URLNotAllowedError:
            # Bare raise keeps the original traceback intact.
            raise
        except Exception:
            # The regular click failed: fall back to a click issued from page JavaScript.
            try:
                return await perform_click(lambda: page.evaluate('(el) => el.click()', element_handle))
            except URLNotAllowedError:
                raise
            except Exception as e:
                raise Exception(f'Failed to click element: {str(e)}') from e

    except URLNotAllowedError:
        raise
    except Exception as e:
        raise Exception(f'Failed to click element: {repr(element_node)}. Error: {str(e)}') from e
1166
+
1167
@time_execution_async('--get_tabs_info')
async def get_tabs_info(self) -> list[TabInfo]:
    """Return one ``TabInfo`` (position, URL, title) per page open in the session context."""
    session = await self.get_session()
    # The position of a page in `context.pages` doubles as its page_id.
    return [
        TabInfo(page_id=position, url=tab.url, title=await tab.title())
        for position, tab in enumerate(session.context.pages)
    ]
1178
+
1179
@time_execution_async('--switch_to_tab')
async def switch_to_tab(self, page_id: int) -> None:
    """
    Switch to a specific tab by its page_id.

    Args:
        page_id: Index into the session's list of pages. Negative values index
            from the end, as with any Python list (``-1`` is the last tab).

    Raises:
        BrowserError: If ``page_id`` is out of range in either direction, or
            if the tab's URL is not allowed.
    """
    session = await self.get_session()
    pages = session.context.pages

    # Check both bounds: a too-small negative index would otherwise surface
    # as a raw IndexError instead of a BrowserError.
    if not -len(pages) <= page_id < len(pages):
        raise BrowserError(f'No tab found with page_id: {page_id}')

    page = pages[page_id]

    # Check if the tab's URL is allowed before switching
    if not self._is_url_allowed(page.url):
        raise BrowserError(f'Cannot switch to tab with non-allowed URL: {page.url}')

    # Update target ID if using CDP
    if self.browser.config.cdp_url:
        targets = await self._get_cdp_targets()
        for target in targets:
            if target['url'] == page.url:
                self.state.target_id = target['targetId']
                break

    await page.bring_to_front()
    await page.wait_for_load_state()
1204
+
1205
@time_execution_async('--create_new_tab')
async def create_new_tab(self, url: str | None = None) -> None:
    """
    Open a new tab in the session context, optionally loading ``url`` in it.

    Raises:
        BrowserError: If ``url`` is given and is not an allowed URL.
    """
    if url and not self._is_url_allowed(url):
        raise BrowserError(f'Cannot create new tab with non-allowed URL: {url}')

    session = await self.get_session()
    tab = await session.context.new_page()
    await tab.wait_for_load_state()

    if url:
        await tab.goto(url)
        await self._wait_for_page_and_frames_load(timeout_overwrite=1)

    if not self.browser.config.cdp_url:
        return

    # With CDP, remember the target whose URL matches the new tab.
    cdp_targets = await self._get_cdp_targets()
    matching = next((entry for entry in cdp_targets if entry['url'] == tab.url), None)
    if matching is not None:
        self.state.target_id = matching['targetId']
1226
+
1227
+ # endregion
1228
+
1229
+ # region - Helper methods for easier access to the DOM
1230
async def _get_current_page(self, session: BrowserSession) -> Page:
    """
    Pick the page this context should currently operate on.

    With a CDP connection and a stored target id, the page whose URL matches
    that target is preferred. Otherwise the last page of the context is used,
    and a new page is opened when the context has none.
    """
    open_pages = session.context.pages

    if self.browser.config.cdp_url and self.state.target_id:
        for target in await self._get_cdp_targets():
            if target['targetId'] != self.state.target_id:
                continue
            for candidate in open_pages:
                if candidate.url == target['url']:
                    return candidate

    # No CDP match: fall back to the last page, creating one if necessary.
    if open_pages:
        return open_pages[-1]
    return await session.context.new_page()
1244
+
1245
async def get_selector_map(self) -> SelectorMap:
    """Return the selector map of the session's cached state, or an empty map if nothing is cached."""
    session = await self.get_session()
    return session.cached_state.selector_map if session.cached_state is not None else {}
1250
+
1251
async def get_element_by_index(self, index: int) -> ElementHandle | None:
    """Locate the live element for the selector-map entry at ``index`` (raises KeyError if absent)."""
    node = (await self.get_selector_map())[index]
    return await self.get_locate_element(node)
1255
+
1256
async def get_dom_element_by_index(self, index: int) -> DOMElementNode:
    """Return the selector-map entry stored under ``index`` (raises KeyError if absent)."""
    return (await self.get_selector_map())[index]
1259
+
1260
async def save_cookies(self):
    """
    Write the context's current cookies as JSON to ``self.config.cookies_file``.

    Does nothing when there is no session/context or no cookies file is
    configured. Failures are logged as warnings rather than raised.
    """
    if not (self.session and self.session.context and self.config.cookies_file):
        return

    try:
        cookies = await self.session.context.cookies()
        logger.debug(f'Saving {len(cookies)} cookies to {self.config.cookies_file}')

        # Create the parent directory first when the path has one.
        parent_dir = os.path.dirname(self.config.cookies_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(self.config.cookies_file, 'w') as handle:
            json.dump(cookies, handle)
    except Exception as e:
        logger.warning(f'Failed to save cookies: {str(e)}')
1276
+
1277
async def is_file_uploader(self, element_node: DOMElementNode, max_depth: int = 3, current_depth: int = 0) -> bool:
    """
    Report whether ``element_node`` or a descendant is a file-upload input.

    An ``<input>`` counts as an uploader when its ``type`` attribute is
    ``'file'`` or it has an ``accept`` attribute. Descendants are searched
    recursively down to ``max_depth`` levels below the starting node.
    """
    if current_depth > max_depth:
        return False

    if not isinstance(element_node, DOMElementNode):
        return False

    if element_node.tag_name == 'input':
        attrs = element_node.attributes
        if attrs.get('type') == 'file' or attrs.get('accept') is not None:
            return True

    # Descend only while the depth budget allows another level.
    if element_node.children and current_depth < max_depth:
        for child in element_node.children:
            if not isinstance(child, DOMElementNode):
                continue
            if await self.is_file_uploader(child, max_depth, current_depth + 1):
                return True

    return False
1303
+
1304
async def get_scroll_info(self, page: Page) -> tuple[int, int]:
    """
    Return ``(pixels_above, pixels_below)`` for the given page.

    ``pixels_above`` is ``window.scrollY``; ``pixels_below`` is what remains of
    the document's scroll height below the current viewport.
    """
    offset_top = await page.evaluate('window.scrollY')
    visible_height = await page.evaluate('window.innerHeight')
    document_height = await page.evaluate('document.documentElement.scrollHeight')
    return offset_top, document_height - (offset_top + visible_height)
1312
+
1313
async def reset_context(self):
    """
    Reset the browser session state without destroying the context.

    Closes every open page, drops the cached state and forgets the stored
    CDP target id.
    """
    session = await self.get_session()

    for open_page in session.context.pages:
        await open_page.close()

    session.cached_state = None
    self.state.target_id = None
1326
+
1327
+ async def _get_unique_filename(self, directory, filename):
1328
+ """Generate a unique filename by appending (1), (2), etc., if a file already exists."""
1329
+ base, ext = os.path.splitext(filename)
1330
+ counter = 1
1331
+ new_filename = filename
1332
+ while os.path.exists(os.path.join(directory, new_filename)):
1333
+ new_filename = f'{base} ({counter}){ext}'
1334
+ counter += 1
1335
+ return new_filename
1336
+
1337
+ async def _get_cdp_targets(self) -> list[dict]:
1338
+ """Get all CDP targets directly using CDP protocol"""
1339
+ if not self.browser.config.cdp_url or not self.session:
1340
+ return []
1341
+
1342
+ try:
1343
+ pages = self.session.context.pages
1344
+ if not pages:
1345
+ return []
1346
+
1347
+ cdp_session = await pages[0].context.new_cdp_session(pages[0])
1348
+ result = await cdp_session.send('Target.getTargets')
1349
+ await cdp_session.detach()
1350
+ return result.get('targetInfos', [])
1351
+ except Exception as e:
1352
+ logger.debug(f'Failed to get CDP targets: {e}')
1353
+ return []
browser_use/browser/tests/screenshot_test.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+
3
+ import pytest
4
+
5
+ from browser_use.browser.browser import Browser, BrowserConfig
6
+
7
+
8
@pytest.fixture
async def browser():
    """Provide a headless ``Browser`` to the test and close it afterwards."""
    instance = Browser(config=BrowserConfig(headless=True))
    yield instance

    # Teardown: runs once the test using the fixture has finished.
    await instance.close()
14
+
15
+
16
+ # @pytest.mark.skip(reason='takes too long')
17
def test_take_full_page_screenshot(browser):
    """Check that a full-page screenshot comes back as a non-empty, valid base64 string."""
    # Go to a test page
    browser.go_to_url('https://example.com')

    # Take full page screenshot
    screenshot_b64 = browser.take_screenshot(full_page=True)

    # Verify screenshot is not empty and is a string
    assert screenshot_b64 is not None
    assert isinstance(screenshot_b64, str)
    assert len(screenshot_b64) > 0

    # Test we can decode the base64 string. validate=True is required for this
    # to be a real check: by default b64decode silently discards characters
    # outside the base64 alphabet instead of raising.
    try:
        base64.b64decode(screenshot_b64, validate=True)
    except Exception as e:
        pytest.fail(f'Failed to decode base64 screenshot: {str(e)}')


if __name__ == '__main__':
    test_take_full_page_screenshot(Browser(config=BrowserConfig(headless=False)))
browser_use/browser/tests/test_clicks.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+
4
+ import pytest
5
+
6
+ from browser_use.browser.browser import Browser, BrowserConfig
7
+ from browser_use.dom.views import DOMBaseNode, DOMElementNode, DOMTextNode
8
+ from browser_use.utils import time_execution_sync
9
+
10
+
11
class ElementTreeSerializer:
    """Helpers for turning a DOM element tree into plain, JSON-serialisable dicts."""

    @staticmethod
    def dom_element_node_to_json(element_tree: DOMElementNode) -> dict:
        """Convert ``element_tree`` and all of its descendants to nested dicts."""

        def serialize(node: DOMBaseNode) -> dict:
            if isinstance(node, DOMTextNode):
                return {'type': 'text', 'text': node.text}
            if not isinstance(node, DOMElementNode):
                # Unknown node kinds are represented by an empty dict.
                return {}
            return {
                'type': 'element',
                'tag_name': node.tag_name,
                'attributes': node.attributes,
                'highlight_index': node.highlight_index,
                'children': [serialize(child) for child in node.children],
            }

        return serialize(element_tree)
28
+
29
+
30
+ # run with: pytest browser_use/browser/tests/test_clicks.py
31
@pytest.mark.asyncio
async def test_highlight_elements():
    """
    Interactive manual check of element highlighting and clicking.

    Opens a visible browser, then loops forever: dumps the element tree to
    ``./tmp/page.json``, reports duplicate XPaths in the selector map, asks on
    stdin for the index of the element to click, and clicks it. Errors inside
    an iteration are printed and the loop continues.
    """
    browser = Browser(config=BrowserConfig(headless=False, disable_security=True))

    async with await browser.new_context() as context:
        page = await context.get_current_page()
        await page.goto('https://huggingface.co/')

        await asyncio.sleep(1)

        while True:
            try:
                state = await context.get_state()

                with open('./tmp/page.json', 'w') as f:
                    json.dump(
                        ElementTreeSerializer.dom_element_node_to_json(state.element_tree),
                        f,
                        indent=1,
                    )

                # Count how often each XPath occurs in the selector map
                xpath_counts = {}
                if not state.selector_map:
                    continue
                for selector in state.selector_map.values():
                    xpath = selector.xpath
                    xpath_counts[xpath] = xpath_counts.get(xpath, 0) + 1

                print('\nDuplicate XPaths found:')
                for xpath, count in xpath_counts.items():
                    if count <= 1:
                        continue
                    print(f'XPath: {xpath}')
                    print(f'Count: {count}\n')

                print(list(state.selector_map.keys()), 'Selector map keys')
                print(state.element_tree.clickable_elements_to_string())
                action = input('Select next action: ')

                timed_remove = time_execution_sync('remove_highlight_elements')(context.remove_highlights)
                await timed_remove()

                node_element = state.selector_map[int(action)]

                await context._click_element_node(node_element)

            except Exception as e:
                print(e)
browser_use/browser/views.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+ from typing import Any, Optional
3
+
4
+ from pydantic import BaseModel
5
+
6
+ from browser_use.dom.history_tree_processor.service import DOMHistoryElement
7
+ from browser_use.dom.views import DOMState
8
+
9
+
10
+ # Pydantic
11
class TabInfo(BaseModel):
    """Pydantic model describing a single browser tab."""

    # Identifier of the tab
    page_id: int
    # URL of the tab
    url: str
    # Title of the tab
    title: str
17
+
18
+
19
@dataclass
class BrowserState(DOMState):
    """``DOMState`` extended with page, tab, screenshot and scroll information."""

    url: str
    title: str
    tabs: list[TabInfo]
    # Optional screenshot, stored as a string
    screenshot: Optional[str] = None
    # Scrollable pixels above / below the current viewport
    pixels_above: int = 0
    pixels_below: int = 0
    # Error messages; every instance gets its own fresh list
    browser_errors: list[str] = field(default_factory=list)
28
+
29
+
30
@dataclass
class BrowserStateHistory:
    """Record of a browser state together with the elements that were interacted with."""

    url: str
    title: str
    tabs: list[TabInfo]
    interacted_element: list[DOMHistoryElement | None] | list[None]
    screenshot: Optional[str] = None

    def to_dict(self) -> dict[str, Any]:
        """Return this record as a plain dict; tabs and elements are converted to dicts too."""
        return {
            'tabs': [tab.model_dump() for tab in self.tabs],
            'screenshot': self.screenshot,
            'interacted_element': [el.to_dict() if el else None for el in self.interacted_element],
            'url': self.url,
            'title': self.title,
        }
46
+
47
+
48
class BrowserError(Exception):
    """Root of the browser exception hierarchy; catch this to handle any browser error."""
50
+
51
+
52
class URLNotAllowedError(BrowserError):
    """A ``BrowserError`` signalling that a URL is not allowed."""
browser_use/controller/registry/service.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from inspect import iscoroutinefunction, signature
3
+ from typing import Any, Callable, Dict, Generic, Optional, Type, TypeVar
4
+
5
+ from langchain_core.language_models.chat_models import BaseChatModel
6
+ from pydantic import BaseModel, Field, create_model
7
+
8
+ from browser_use.browser.context import BrowserContext
9
+ from browser_use.controller.registry.views import (
10
+ ActionModel,
11
+ ActionRegistry,
12
+ RegisteredAction,
13
+ )
14
+ from browser_use.telemetry.service import ProductTelemetry
15
+ from browser_use.telemetry.views import (
16
+ ControllerRegisteredFunctionsTelemetryEvent,
17
+ RegisteredFunction,
18
+ )
19
+ from browser_use.utils import time_execution_async, time_execution_sync
20
+
21
+ Context = TypeVar('Context')
22
+
23
+
24
class Registry(Generic[Context]):
    """Service for registering and managing actions"""

    def __init__(self, exclude_actions: list[str] | None = None):
        """
        Args:
            exclude_actions: Names of functions that the ``action`` decorator
                should leave unregistered. ``None`` means no exclusions.
        """
        self.registry = ActionRegistry()
        self.telemetry = ProductTelemetry()
        self.exclude_actions = exclude_actions if exclude_actions is not None else []

    @time_execution_sync('--create_param_model')
    def _create_param_model(self, function: Callable) -> Type[BaseModel]:
        """
        Creates a Pydantic model from a function signature.

        The injected parameters ``browser``, ``page_extraction_llm`` and
        ``available_file_paths`` are left out of the model. Parameters without
        a default become required fields.
        """
        sig = signature(function)
        params = {
            # `Parameter.empty` is a sentinel: compare by identity so a default
            # with a custom `__eq__` cannot confuse the check.
            name: (param.annotation, ... if param.default is param.empty else param.default)
            for name, param in sig.parameters.items()
            if name != 'browser' and name != 'page_extraction_llm' and name != 'available_file_paths'
        }
        # TODO: make the types here work
        return create_model(
            f'{function.__name__}_parameters',
            __base__=ActionModel,
            **params,  # type: ignore
        )

    def action(
        self,
        description: str,
        param_model: Optional[Type[BaseModel]] = None,
    ):
        """
        Decorator for registering actions.

        Args:
            description: Text stored with the action as its description.
            param_model: Pydantic model of the action parameters; derived from
                the function signature when omitted.

        The decorated function itself is returned unchanged. Synchronous
        functions are registered through an async wrapper that runs them in a
        worker thread.
        """

        def decorator(func: Callable):
            # Skip registration if action is in exclude_actions
            if func.__name__ in self.exclude_actions:
                return func

            # Create param model from function if not provided
            actual_param_model = param_model or self._create_param_model(func)

            # Wrap sync functions to make them async
            if not iscoroutinefunction(func):

                async def async_wrapper(*args, **kwargs):
                    return await asyncio.to_thread(func, *args, **kwargs)

                # Copy the signature and other metadata from the original function
                async_wrapper.__signature__ = signature(func)
                async_wrapper.__name__ = func.__name__
                async_wrapper.__annotations__ = func.__annotations__
                wrapped_func = async_wrapper
            else:
                wrapped_func = func

            action = RegisteredAction(
                name=func.__name__,
                description=description,
                function=wrapped_func,
                param_model=actual_param_model,
            )
            self.registry.actions[func.__name__] = action
            return func

        return decorator

    @time_execution_async('--execute_action')
    async def execute_action(
        self,
        action_name: str,
        params: dict,
        browser: Optional[BrowserContext] = None,
        page_extraction_llm: Optional[BaseChatModel] = None,
        sensitive_data: Optional[Dict[str, str]] = None,
        available_file_paths: Optional[list[str]] = None,
        #
        context: Context | None = None,
    ) -> Any:
        """
        Execute a registered action.

        ``params`` is validated against the action's parameter model. The
        optional arguments are forwarded only to actions whose signature
        declares a parameter of the same name.

        Raises:
            ValueError: If ``action_name`` is not registered.
            RuntimeError: If validation or execution fails for any reason,
                including a missing required ``browser``, ``page_extraction_llm``,
                ``available_file_paths`` or ``context``. The original exception
                is chained.
        """
        if action_name not in self.registry.actions:
            raise ValueError(f'Action {action_name} not found')

        action = self.registry.actions[action_name]
        try:
            # Create the validated Pydantic model
            validated_params = action.param_model(**params)

            # Check if the first parameter is a Pydantic model
            sig = signature(action.function)
            parameters = list(sig.parameters.values())
            try:
                is_pydantic = bool(parameters) and issubclass(parameters[0].annotation, BaseModel)
            except TypeError:
                # The annotation is not a class (e.g. Optional[str] or list[str]),
                # so it cannot be a Pydantic model.
                is_pydantic = False
            parameter_names = [param.name for param in parameters]

            if sensitive_data:
                validated_params = self._replace_sensitive_data(validated_params, sensitive_data)

            # Check if the action requires browser
            if 'browser' in parameter_names and not browser:
                raise ValueError(f'Action {action_name} requires browser but none provided.')
            if 'page_extraction_llm' in parameter_names and not page_extraction_llm:
                raise ValueError(f'Action {action_name} requires page_extraction_llm but none provided.')
            if 'available_file_paths' in parameter_names and not available_file_paths:
                raise ValueError(f'Action {action_name} requires available_file_paths but none provided.')

            if 'context' in parameter_names and not context:
                raise ValueError(f'Action {action_name} requires context but none provided.')

            # Prepare arguments based on parameter type
            extra_args = {}
            if 'context' in parameter_names:
                extra_args['context'] = context
            if 'browser' in parameter_names:
                extra_args['browser'] = browser
            if 'page_extraction_llm' in parameter_names:
                extra_args['page_extraction_llm'] = page_extraction_llm
            if 'available_file_paths' in parameter_names:
                extra_args['available_file_paths'] = available_file_paths
            if action_name == 'input_text' and sensitive_data:
                extra_args['has_sensitive_data'] = True
            if is_pydantic:
                return await action.function(validated_params, **extra_args)
            return await action.function(**validated_params.model_dump(), **extra_args)

        except Exception as e:
            raise RuntimeError(f'Error executing action {action_name}: {str(e)}') from e

    def _replace_sensitive_data(self, params: BaseModel, sensitive_data: Dict[str, str]) -> BaseModel:
        """
        Replaces the sensitive data in the params.

        Every ``<secret>placeholder</secret>`` occurrence inside string values
        (including strings nested in dicts and lists) is replaced with
        ``sensitive_data[placeholder]``. Placeholders without an entry in
        ``sensitive_data`` are left untouched. ``params`` is modified in place
        and returned.
        """
        import re

        secret_pattern = re.compile(r'<secret>(.*?)</secret>')

        def replace_secrets(value):
            if isinstance(value, str):
                matches = secret_pattern.findall(value)
                for placeholder in matches:
                    if placeholder in sensitive_data:
                        value = value.replace(f'<secret>{placeholder}</secret>', sensitive_data[placeholder])
                return value
            elif isinstance(value, dict):
                return {k: replace_secrets(v) for k, v in value.items()}
            elif isinstance(value, list):
                return [replace_secrets(v) for v in value]
            return value

        for key, value in params.model_dump().items():
            params.__dict__[key] = replace_secrets(value)
        return params

    @time_execution_sync('--create_action_model')
    def create_action_model(self, include_actions: Optional[list[str]] = None) -> Type[ActionModel]:
        """
        Creates a Pydantic model from registered actions.

        Each selected action becomes an optional field typed with that action's
        parameter model. A telemetry event listing the selected actions is
        captured as a side effect.

        Args:
            include_actions: Names of the actions to include; ``None`` includes all.
        """
        fields = {
            name: (
                Optional[action.param_model],
                Field(default=None, description=action.description),
            )
            for name, action in self.registry.actions.items()
            if include_actions is None or name in include_actions
        }

        self.telemetry.capture(
            ControllerRegisteredFunctionsTelemetryEvent(
                registered_functions=[
                    RegisteredFunction(name=name, params=action.param_model.model_json_schema())
                    for name, action in self.registry.actions.items()
                    if include_actions is None or name in include_actions
                ]
            )
        )

        return create_model('ActionModel', __base__=ActionModel, **fields)  # type:ignore

    def get_prompt_description(self) -> str:
        """Get a description of all actions for the prompt"""
        return self.registry.get_prompt_description()
browser_use/controller/registry/views.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, Dict, Type
2
+
3
+ from pydantic import BaseModel, ConfigDict
4
+
5
+
6
class RegisteredAction(BaseModel):
    """Model for a registered action"""

    # Name of the action
    name: str
    # Description text, included in the prompt description
    description: str
    # Callable stored for the action
    function: Callable
    # Pydantic model describing the action parameters
    param_model: Type[BaseModel]

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def prompt_description(self) -> str:
        """
        Get a description of the action for the prompt.

        The result is the description followed by ``{name: {...}}``, where the
        inner mapping holds the JSON-schema properties of the parameter model
        with the ``title`` entries removed.
        """
        skip_keys = ['title']
        # model_json_schema() is the pydantic v2 API; the v1-style .schema() is deprecated.
        properties = self.param_model.model_json_schema()['properties']
        s = f'{self.description}: \n'
        s += '{' + str(self.name) + ': '
        s += str(
            {
                k: {sub_k: sub_v for sub_k, sub_v in v.items() if sub_k not in skip_keys}
                for k, v in properties.items()
            }
        )
        s += '}'
        return s
29
+
30
+
31
class ActionModel(BaseModel):
    """Base class of the dynamically created action models.

    Subclasses carry one field per registered action; a field's value is the
    parameter model of that action.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def get_index(self) -> int | None:
        """Return the ``index`` entry of the first set action that has one, else None."""
        # Dumped shape example: {'clicked_element': {'index': 5}}
        for action_params in self.model_dump(exclude_unset=True).values():
            if action_params is not None and 'index' in action_params:
                return action_params['index']
        return None

    def set_index(self, index: int):
        """Overwrite ``index`` on the parameters of the first set action, if it has that attribute."""
        first_action = next(iter(self.model_dump(exclude_unset=True)))
        target_params = getattr(self, first_action)

        # Update the live parameter object, not the dumped copy.
        if hasattr(target_params, 'index'):
            target_params.index = index
61
+
62
+
63
class ActionRegistry(BaseModel):
    """Container holding the registered actions, keyed by name."""

    actions: Dict[str, RegisteredAction] = {}

    def get_prompt_description(self) -> str:
        """Return the prompt descriptions of all actions, one per line."""
        descriptions = (registered.prompt_description() for registered in self.actions.values())
        return '\n'.join(descriptions)
browser_use/controller/service.py ADDED
@@ -0,0 +1,532 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import enum
4
+ import logging
5
+ from typing import Dict, Generic, Optional, Type, TypeVar
6
+
7
+ from langchain_core.language_models.chat_models import BaseChatModel
8
+ from langchain_core.prompts import PromptTemplate
9
+
10
+ # from lmnr.sdk.laminar import Laminar
11
+ from pydantic import BaseModel
12
+
13
+ from browser_use.agent.views import ActionModel, ActionResult
14
+ from browser_use.browser.context import BrowserContext
15
+ from browser_use.controller.registry.service import Registry
16
+ from browser_use.controller.views import (
17
+ ClickElementAction,
18
+ DoneAction,
19
+ GoToUrlAction,
20
+ InputTextAction,
21
+ NoParamsAction,
22
+ OpenTabAction,
23
+ ScrollAction,
24
+ SearchGoogleAction,
25
+ SendKeysAction,
26
+ SwitchTabAction,
27
+ )
28
+ from browser_use.utils import time_execution_sync
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
+ Context = TypeVar('Context')
34
+
35
+
36
+ class Controller(Generic[Context]):
37
+ def __init__(
38
+ self,
39
+ exclude_actions: list[str] = [],
40
+ output_model: Optional[Type[BaseModel]] = None,
41
+ ):
42
+ self.registry = Registry[Context](exclude_actions)
43
+
44
+ """Register all default browser actions"""
45
+
46
+ if output_model is not None:
47
+ # Create a new model that extends the output model with success parameter
48
+ class ExtendedOutputModel(BaseModel): # type: ignore
49
+ success: bool = True
50
+ data: output_model
51
+
52
+ @self.registry.action(
53
+ 'Complete task - with return text and if the task is finished (success=True) or not yet completly finished (success=False), because last step is reached',
54
+ param_model=ExtendedOutputModel,
55
+ )
56
+ async def done(params: ExtendedOutputModel):
57
+ # Exclude success from the output JSON since it's an internal parameter
58
+ output_dict = params.data.model_dump()
59
+
60
+ # Enums are not serializable, convert to string
61
+ for key, value in output_dict.items():
62
+ if isinstance(value, enum.Enum):
63
+ output_dict[key] = value.value
64
+
65
+ return ActionResult(is_done=True, success=params.success, extracted_content=json.dumps(output_dict))
66
+ else:
67
+
68
+ @self.registry.action(
69
+ 'Complete task - with return text and if the task is finished (success=True) or not yet completly finished (success=False), because last step is reached',
70
+ param_model=DoneAction,
71
+ )
72
+ async def done(params: DoneAction):
73
+ return ActionResult(is_done=True, success=params.success, extracted_content=params.text)
74
+
75
+ # Basic Navigation Actions
76
+ @self.registry.action(
77
+ 'Search the query in Google in the current tab, the query should be a search query like humans search in Google, concrete and not vague or super long. More the single most important items. ',
78
+ param_model=SearchGoogleAction,
79
+ )
80
+ async def search_google(params: SearchGoogleAction, browser: BrowserContext):
81
+ page = await browser.get_current_page()
82
+ await page.goto(f'https://www.google.com/search?q={params.query}&udm=14')
83
+ await page.wait_for_load_state()
84
+ msg = f'🔍 Searched for "{params.query}" in Google'
85
+ logger.info(msg)
86
+ return ActionResult(extracted_content=msg, include_in_memory=True)
87
+
88
+ @self.registry.action('Navigate to URL in the current tab', param_model=GoToUrlAction)
89
+ async def go_to_url(params: GoToUrlAction, browser: BrowserContext):
90
+ page = await browser.get_current_page()
91
+ await page.goto(params.url)
92
+ await page.wait_for_load_state()
93
+ msg = f'🔗 Navigated to {params.url}'
94
+ logger.info(msg)
95
+ return ActionResult(extracted_content=msg, include_in_memory=True)
96
+
97
+ @self.registry.action('Go back', param_model=NoParamsAction)
98
+ async def go_back(_: NoParamsAction, browser: BrowserContext):
99
+ await browser.go_back()
100
+ msg = '🔙 Navigated back'
101
+ logger.info(msg)
102
+ return ActionResult(extracted_content=msg, include_in_memory=True)
103
+
104
+ # wait for x seconds
105
@self.registry.action('Wait for x seconds default 3')
async def wait(seconds: int = 3):
    """Log the pause, then sleep for ``seconds`` (3 when not given)."""
    summary = f'🕒 Waiting for {seconds} seconds'
    logger.info(summary)
    # The message is logged before sleeping, so the pause is visible immediately.
    await asyncio.sleep(seconds)
    return ActionResult(extracted_content=summary, include_in_memory=True)
111
+
112
+ # Element Interaction Actions
113
@self.registry.action('Click element', param_model=ClickElementAction)
async def click_element(params: ClickElementAction, browser: BrowserContext):
    """Click the element registered under ``params.index``.

    Raises ``Exception`` when the index is not in the selector map. Elements
    that ``browser.is_file_uploader`` flags are not clicked. If the click
    increases the number of pages in the session context, the last tab is
    selected. Any error raised while clicking is returned as
    ``ActionResult(error=...)`` instead of propagating.
    """
    target_index = params.index
    session = await browser.get_session()

    known_elements = await browser.get_selector_map()
    if target_index not in known_elements:
        raise Exception(f'Element with index {target_index} does not exist - retry or use alternative actions')

    element_node = await browser.get_dom_element_by_index(target_index)
    # Remember the tab count so a click that opens a new tab can be detected.
    pages_before = len(session.context.pages)

    # if element has file uploader then dont click
    if await browser.is_file_uploader(element_node):
        msg = f'Index {target_index} - has an element which opens file upload dialog. To upload files please use a specific function to upload files '
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)

    try:
        download_path = await browser._click_element_node(element_node)
        msg = (
            f'💾 Downloaded file to {download_path}'
            if download_path
            else f'🖱️ Clicked button with index {target_index}: {element_node.get_all_text_till_next_clickable_element(max_depth=2)}'
        )

        logger.info(msg)
        logger.debug(f'Element xpath: {element_node.xpath}')
        if len(session.context.pages) > pages_before:
            new_tab_msg = 'New tab opened - switching to it'
            msg = f'{msg} - {new_tab_msg}'
            logger.info(new_tab_msg)
            await browser.switch_to_tab(-1)
        return ActionResult(extracted_content=msg, include_in_memory=True)
    except Exception as e:
        logger.warning(f'Element not clickable with index {target_index} - most likely the page changed')
        return ActionResult(error=str(e))
149
+
150
@self.registry.action(
    'Input text into a input interactive element',
    param_model=InputTextAction,
)
async def input_text(params: InputTextAction, browser: BrowserContext, has_sensitive_data: bool = False):
    """Type ``params.text`` into the element registered under ``params.index``.

    Raises ``Exception`` when the index is not in the selector map. When
    ``has_sensitive_data`` is true the typed text is left out of the
    returned/logged message.
    """
    known_elements = await browser.get_selector_map()
    if params.index not in known_elements:
        raise Exception(f'Element index {params.index} does not exist - retry or use alternative actions')

    element_node = await browser.get_dom_element_by_index(params.index)
    await browser._input_text_element_node(element_node, params.text)

    msg = (
        f'⌨️ Input sensitive data into index {params.index}'
        if has_sensitive_data
        else f'⌨️ Input {params.text} into index {params.index}'
    )
    logger.info(msg)
    logger.debug(f'Element xpath: {element_node.xpath}')
    return ActionResult(extracted_content=msg, include_in_memory=True)
167
+
168
+ # Tab Management Actions
169
@self.registry.action('Switch tab', param_model=SwitchTabAction)
async def switch_tab(params: SwitchTabAction, browser: BrowserContext):
    """Switch to the tab ``params.page_id`` and wait for its load state."""
    tab_id = params.page_id
    await browser.switch_to_tab(tab_id)

    # Wait for tab to be ready
    active_page = await browser.get_current_page()
    await active_page.wait_for_load_state()

    summary = f'🔄 Switched to tab {tab_id}'
    logger.info(summary)
    return ActionResult(extracted_content=summary, include_in_memory=True)
178
+
179
@self.registry.action('Open url in new tab', param_model=OpenTabAction)
async def open_tab(params: OpenTabAction, browser: BrowserContext):
    """Create a new tab for ``params.url`` through the browser context."""
    target_url = params.url
    await browser.create_new_tab(target_url)

    summary = f'🔗 Opened new tab with {target_url}'
    logger.info(summary)
    return ActionResult(extracted_content=summary, include_in_memory=True)
185
+
186
+ # Content Actions
187
@self.registry.action(
    'Extract page content to retrieve specific information from the page, e.g. all company names, a specifc description, all information about, links with companies in structured format or simply links',
)
async def extract_content(goal: str, browser: BrowserContext, page_extraction_llm: BaseChatModel):
    """Extract goal-relevant information from the current page with an LLM.

    The page HTML is converted to markdown and sent to
    ``page_extraction_llm`` together with ``goal``.

    Args:
        goal: What should be extracted from the page.
        browser: Browser context providing the current page.
        page_extraction_llm: Chat model used for the extraction.

    Returns:
        An ``ActionResult`` with the model's answer. If the model call
        raises, the raw markdown of the page is returned instead (and is
        not flagged ``include_in_memory``).
    """
    import markdownify

    page = await browser.get_current_page()
    content = markdownify.markdownify(await page.content())

    prompt = 'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. Extraction goal: {goal}, Page: {page}'
    template = PromptTemplate(input_variables=['goal', 'page'], template=prompt)
    try:
        # Use the async entry point: the blocking ``invoke`` would stall the
        # event loop (and every other coroutine) for the whole LLM round trip.
        output = await page_extraction_llm.ainvoke(template.format(goal=goal, page=content))
        msg = f'📄 Extracted from page\n: {output.content}\n'
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)
    except Exception as e:
        # Best-effort fallback: hand back the unprocessed page markdown.
        logger.debug(f'Error extracting content: {e}')
        msg = f'📄 Extracted from page\n: {content}\n'
        logger.info(msg)
        return ActionResult(extracted_content=msg)
208
+
209
@self.registry.action(
    'Scroll down the page by pixel amount - if no amount is specified, scroll down one page',
    param_model=ScrollAction,
)
async def scroll_down(params: ScrollAction, browser: BrowserContext):
    """Scroll the current page down by ``params.amount`` pixels.

    When no amount is given, the page is scrolled by ``window.innerHeight``.
    """
    page = await browser.get_current_page()
    pixels = params.amount

    if pixels is None:
        script, amount = 'window.scrollBy(0, window.innerHeight);', 'one page'
    else:
        script, amount = f'window.scrollBy(0, {pixels});', f'{pixels} pixels'
    await page.evaluate(script)

    msg = f'🔍 Scrolled down the page by {amount}'
    logger.info(msg)
    return ActionResult(extracted_content=msg, include_in_memory=True)
227
+
228
+ # scroll up
229
@self.registry.action(
    'Scroll up the page by pixel amount - if no amount is specified, scroll up one page',
    param_model=ScrollAction,
)
async def scroll_up(params: ScrollAction, browser: BrowserContext):
    """Scroll the current page up by ``params.amount`` pixels.

    When no amount is given, the page is scrolled up by
    ``window.innerHeight``.

    Args:
        params: Optional pixel amount.
        browser: Browser context providing the current page.

    Returns:
        An ``ActionResult`` describing the scroll that was requested.
    """
    page = await browser.get_current_page()
    if params.amount is not None:
        # Negate in Python rather than prefixing '-' in the script text: a
        # negative amount would otherwise render as `--N`, which is a
        # JavaScript syntax error.
        await page.evaluate(f'window.scrollBy(0, {-params.amount});')
    else:
        await page.evaluate('window.scrollBy(0, -window.innerHeight);')

    amount = f'{params.amount} pixels' if params.amount is not None else 'one page'
    msg = f'🔍 Scrolled up the page by {amount}'
    logger.info(msg)
    return ActionResult(
        extracted_content=msg,
        include_in_memory=True,
    )
247
+
248
+ # send keys
249
@self.registry.action(
    'Send strings of special keys like Escape,Backspace, Insert, PageDown, Delete, Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press. ',
    param_model=SendKeysAction,
)
async def send_keys(params: SendKeysAction, browser: BrowserContext):
    """Press ``params.keys`` on the current page's keyboard.

    The whole string is first pressed as one key/shortcut. If that fails
    with an 'Unknown key' error, each character of the string is pressed
    individually; the first failing character re-raises its error. Any
    other error propagates unchanged.
    """
    page = await browser.get_current_page()

    try:
        await page.keyboard.press(params.keys)
    except Exception as press_error:
        if 'Unknown key' not in str(press_error):
            raise press_error
        # loop over the keys and try to send each one
        for key in params.keys:
            try:
                await page.keyboard.press(key)
            except Exception as key_error:
                logger.debug(f'Error sending key {key}: {str(key_error)}')
                raise key_error

    summary = f'⌨️ Sent keys: {params.keys}'
    logger.info(summary)
    return ActionResult(extracted_content=summary, include_in_memory=True)
272
+
273
@self.registry.action(
    description='If you dont find something which you want to interact with, scroll to it',
)
async def scroll_to_text(text: str, browser: BrowserContext):  # type: ignore
    """Scroll the first visible element containing ``text`` into view.

    Three locator strategies are tried in order; a strategy that raises is
    logged at debug level and skipped. If none matches, a "not found"
    message is returned as regular content rather than as an error.
    """
    page = await browser.get_current_page()
    try:
        # Try different locator strategies
        candidates = (
            page.get_by_text(text, exact=False),
            page.locator(f'text={text}'),
            page.locator(f"//*[contains(text(), '{text}')]"),
        )

        for candidate in candidates:
            try:
                # First check if element exists and is visible
                if not (await candidate.count() > 0 and await candidate.first.is_visible()):
                    continue
                await candidate.first.scroll_into_view_if_needed()
                await asyncio.sleep(0.5)  # Wait for scroll to complete
                msg = f'🔍 Scrolled to text: {text}'
                logger.info(msg)
                return ActionResult(extracted_content=msg, include_in_memory=True)
            except Exception as locator_error:
                logger.debug(f'Locator attempt failed: {str(locator_error)}')
                continue

        msg = f"Text '{text}' not found or not visible on page"
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)

    except Exception as e:
        msg = f"Failed to scroll to text '{text}': {str(e)}"
        logger.error(msg)
        return ActionResult(error=msg, include_in_memory=True)
307
+
308
@self.registry.action(
    description='Get all options from a native dropdown',
)
async def get_dropdown_options(index: int, browser: BrowserContext) -> ActionResult:
    """List the options of the ``<select>`` registered under ``index``.

    The element's xpath is evaluated in every frame of the current page;
    options from each frame where it resolves are collected. Option texts
    are JSON-encoded so they can be copied verbatim into
    ``select_dropdown_option``. Failures are reported in the returned
    content rather than raised (an unknown ``index`` still raises from the
    selector-map lookup).
    """
    page = await browser.get_current_page()
    selector_map = await browser.get_selector_map()
    dom_element = selector_map[index]

    # Script run inside each frame: resolve the xpath and describe the select.
    read_options_js = """
        (xpath) => {
            const select = document.evaluate(xpath, document, null,
                XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
            if (!select) return null;

            return {
                options: Array.from(select.options).map(opt => ({
                    text: opt.text, //do not trim, because we are doing exact match in select_dropdown_option
                    value: opt.value,
                    index: opt.index
                })),
                id: select.id,
                name: select.name
            };
        }
    """

    try:
        # Frame-aware approach since we know it works
        all_options = []

        for frame_index, frame in enumerate(page.frames):
            try:
                options = await frame.evaluate(read_options_js, dom_element.xpath)

                if options:
                    logger.debug(f'Found dropdown in frame {frame_index}')
                    logger.debug(f'Dropdown ID: {options["id"]}, Name: {options["name"]}')

                    # encoding ensures AI uses the exact string in select_dropdown_option
                    all_options.extend(
                        f'{opt["index"]}: text={json.dumps(opt["text"])}' for opt in options['options']
                    )

            except Exception as frame_e:
                logger.debug(f'Frame {frame_index} evaluation failed: {str(frame_e)}')

        if all_options:
            msg = '\n'.join(all_options)
            msg += '\nUse the exact text string in select_dropdown_option'
        else:
            msg = 'No options found in any frame for dropdown'
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)

    except Exception as e:
        logger.error(f'Failed to get dropdown options: {str(e)}')
        msg = f'Error getting options: {str(e)}'
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)
377
+
378
@self.registry.action(
    description='Select dropdown option for interactive element index by the text of the option you want to select',
)
async def select_dropdown_option(
    index: int,
    text: str,
    browser: BrowserContext,
) -> ActionResult:
    """Select dropdown option by the text of the option you want to select

    Args:
        index: Selector-map index of the ``<select>`` element.
        text: Exact label of the option to select.
        browser: Browser context providing the current page.

    Returns:
        An ``ActionResult`` describing the selection, or explaining why no
        option could be selected. An unknown ``index`` raises from the
        selector-map lookup.
    """
    page = await browser.get_current_page()
    selector_map = await browser.get_selector_map()
    dom_element = selector_map[index]

    # Validate that we're working with a select element
    if dom_element.tag_name != 'select':
        logger.error(f'Element is not a select! Tag: {dom_element.tag_name}, Attributes: {dom_element.attributes}')
        msg = f'Cannot select option: Element with index {index} is a {dom_element.tag_name}, not a select'
        return ActionResult(extracted_content=msg, include_in_memory=True)

    logger.debug(f"Attempting to select '{text}' using xpath: {dom_element.xpath}")
    logger.debug(f'Element attributes: {dom_element.attributes}')
    logger.debug(f'Element tag: {dom_element.tag_name}')

    # Locator form of the element xpath, used for the actual selection below.
    xpath = '//' + dom_element.xpath

    # Script run inside each frame to verify the dropdown exists there.
    find_dropdown_js = """
        (xpath) => {
            try {
                const select = document.evaluate(xpath, document, null,
                    XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
                if (!select) return null;
                if (select.tagName.toLowerCase() !== 'select') {
                    return {
                        error: `Found element but it's a ${select.tagName}, not a SELECT`,
                        found: false
                    };
                }
                return {
                    id: select.id,
                    name: select.name,
                    found: true,
                    tagName: select.tagName,
                    optionCount: select.options.length,
                    currentValue: select.value,
                    availableOptions: Array.from(select.options).map(o => o.text.trim())
                };
            } catch (e) {
                return {error: e.toString(), found: false};
            }
        }
    """

    try:
        # enumerate keeps the frame number correct even when an iteration
        # bails out early via `continue` (a manual counter was skipped there).
        for frame_index, frame in enumerate(page.frames):
            try:
                logger.debug(f'Trying frame {frame_index} URL: {frame.url}')

                # First verify we can find the dropdown in this frame
                dropdown_info = await frame.evaluate(find_dropdown_js, dom_element.xpath)

                if dropdown_info:
                    if not dropdown_info.get('found'):
                        logger.error(f'Frame {frame_index} error: {dropdown_info.get("error")}')
                        continue

                    logger.debug(f'Found dropdown in frame {frame_index}: {dropdown_info}')

                    # "label" because we are selecting by text
                    # nth(0) to disable error thrown by strict mode
                    # timeout=1000 because we are already waiting for all network events, therefore ideally we don't need to wait a lot here (default 30s)
                    selected_option_values = (
                        await frame.locator(xpath).nth(0).select_option(label=text, timeout=1000)
                    )

                    msg = f'selected option {text} with value {selected_option_values}'
                    logger.info(msg + f' in frame {frame_index}')

                    return ActionResult(extracted_content=msg, include_in_memory=True)

            except Exception as frame_e:
                logger.error(f'Frame {frame_index} attempt failed: {str(frame_e)}')
                logger.error(f'Frame type: {type(frame)}')
                logger.error(f'Frame URL: {frame.url}')

        msg = f"Could not select option '{text}' in any frame"
        logger.info(msg)
        return ActionResult(extracted_content=msg, include_in_memory=True)

    except Exception as e:
        msg = f'Selection failed: {str(e)}'
        logger.error(msg)
        return ActionResult(error=msg, include_in_memory=True)
473
+
474
+ # Register ---------------------------------------------------------------
475
+
476
def action(self, description: str, **kwargs):
    """Decorator for registering custom actions

    Thin pass-through to ``self.registry.action``.

    @param description: Describe the LLM what the function does (better description == better function calling)
    """
    register = self.registry.action
    return register(description, **kwargs)
482
+
483
+ # Act --------------------------------------------------------------------
484
+
485
@time_execution_sync('--act')
async def act(
    self,
    action: ActionModel,
    browser_context: BrowserContext,
    #
    page_extraction_llm: Optional[BaseChatModel] = None,
    sensitive_data: Optional[Dict[str, str]] = None,
    available_file_paths: Optional[list[str]] = None,
    #
    context: Context | None = None,
) -> ActionResult:
    """Execute an action

    Dispatches the first explicitly-set, non-``None`` entry of ``action``
    to ``self.registry.execute_action`` and normalises what it returns.

    Args:
        action: Model whose set fields map action names to parameters.
        browser_context: Passed to the action as ``browser``.
        page_extraction_llm: Forwarded to the action unchanged.
        sensitive_data: Forwarded to the action unchanged.
        available_file_paths: Forwarded to the action unchanged.
        context: Forwarded to the action unchanged.

    Returns:
        The action's ``ActionResult``; a ``str`` result is wrapped as
        ``extracted_content``; ``None`` (or no action set) yields an empty
        ``ActionResult``.

    Raises:
        ValueError: If the action returns any other type.
        Exception: Anything raised by the action itself propagates.
    """
    # NOTE(review): the decorator is named *_sync but wraps a coroutine
    # function - confirm it times the awaited execution as intended.
    for action_name, params in action.model_dump(exclude_unset=True).items():
        if params is None:
            continue

        result = await self.registry.execute_action(
            action_name,
            params,
            browser=browser_context,
            page_extraction_llm=page_extraction_llm,
            sensitive_data=sensitive_data,
            available_file_paths=available_file_paths,
            context=context,
        )

        if isinstance(result, str):
            return ActionResult(extracted_content=result)
        if isinstance(result, ActionResult):
            return result
        if result is None:
            return ActionResult()
        raise ValueError(f'Invalid action result type: {type(result)} of {result}')

    return ActionResult()
browser_use/controller/views.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ from pydantic import BaseModel, model_validator
4
+
5
+
6
+ # Action Input Models
7
class SearchGoogleAction(BaseModel):
    """Input model for a Google search: the query text."""

    query: str
9
+
10
+
11
class GoToUrlAction(BaseModel):
    """Input model for navigating to a URL."""

    url: str
13
+
14
+
15
class ClickElementAction(BaseModel):
    """Input model for clicking an element identified by its index."""

    index: int
    # Optional xpath of the element; defaults to None.
    xpath: Optional[str] = None
18
+
19
+
20
class InputTextAction(BaseModel):
    """Input model for typing text into an element identified by its index."""

    index: int
    text: str
    # Optional xpath of the element; defaults to None.
    xpath: Optional[str] = None
24
+
25
+
26
class DoneAction(BaseModel):
    """Input model for finishing a task: final text plus a success flag."""

    text: str
    success: bool
29
+
30
+
31
class SwitchTabAction(BaseModel):
    """Input model for switching tabs: the id of the page to activate."""

    page_id: int
33
+
34
+
35
class OpenTabAction(BaseModel):
    """Input model for opening a URL in a new tab."""

    url: str
37
+
38
+
39
class ScrollAction(BaseModel):
    """Input model shared by the scroll-up and scroll-down actions."""

    # The number of pixels to scroll. If None, scroll down/up one page
    amount: Optional[int] = None
41
+
42
+
43
class SendKeysAction(BaseModel):
    """Input model for sending keys: the key or shortcut string."""

    keys: str
45
+
46
+
47
class ExtractPageContentAction(BaseModel):
    """Input model carrying a single string ``value``.

    NOTE(review): which action consumes this model is not visible from
    this module - confirm it is still in use.
    """

    value: str
49
+
50
+
51
class NoParamsAction(BaseModel):
    """
    Accepts absolutely anything in the incoming data
    and discards it, so the final parsed model is empty.
    """

    # Pydantic v2 configuration (this module already relies on the v2-only
    # `model_validator`). Replaces the nested `class Config`, which v2 still
    # reads but reports as deprecated. Unknown top-level fields stay allowed.
    model_config = {'extra': 'allow'}

    @model_validator(mode='before')
    @classmethod
    def ignore_all_inputs(cls, values):
        """Drop whatever was sent so validation always sees an empty payload."""
        # No matter what the user sends, discard it and return empty.
        return {}
browser_use/dom/__init__.py ADDED
File without changes
browser_use/dom/buildDomTree.js ADDED
@@ -0,0 +1,1055 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ (
2
+ args = {
3
+ doHighlightElements: true,
4
+ focusHighlightIndex: -1,
5
+ viewportExpansion: 0,
6
+ debugMode: false,
7
+ }
8
+ ) => {
9
+ const { doHighlightElements, focusHighlightIndex, viewportExpansion, debugMode } = args;
10
+ let highlightIndex = 0; // Reset highlight index
11
+
12
+ // Add timing stack to handle recursion
13
+ const TIMING_STACK = {
14
+ nodeProcessing: [],
15
+ treeTraversal: [],
16
+ highlighting: [],
17
+ current: null
18
+ };
19
+
20
// Record the current timestamp on the timing stack named `type`,
// creating that stack first when it does not exist yet.
function pushTiming(type) {
  if (!TIMING_STACK[type]) {
    TIMING_STACK[type] = [];
  }
  const stack = TIMING_STACK[type];
  stack.push(performance.now());
}
24
+
25
// Pop the most recent timestamp from the timing stack named `type` and
// return the milliseconds elapsed since it was pushed.
function popTiming(type) {
  const startedAt = TIMING_STACK[type].pop();
  return performance.now() - startedAt;
}
30
+
31
+ // Only initialize performance tracking if in debug mode
32
+ const PERF_METRICS = debugMode ? {
33
+ buildDomTreeCalls: 0,
34
+ timings: {
35
+ buildDomTree: 0,
36
+ highlightElement: 0,
37
+ isInteractiveElement: 0,
38
+ isElementVisible: 0,
39
+ isTopElement: 0,
40
+ isInExpandedViewport: 0,
41
+ isTextNodeVisible: 0,
42
+ getEffectiveScroll: 0,
43
+ },
44
+ cacheMetrics: {
45
+ boundingRectCacheHits: 0,
46
+ boundingRectCacheMisses: 0,
47
+ computedStyleCacheHits: 0,
48
+ computedStyleCacheMisses: 0,
49
+ getBoundingClientRectTime: 0,
50
+ getComputedStyleTime: 0,
51
+ boundingRectHitRate: 0,
52
+ computedStyleHitRate: 0,
53
+ overallHitRate: 0,
54
+ },
55
+ nodeMetrics: {
56
+ totalNodes: 0,
57
+ processedNodes: 0,
58
+ skippedNodes: 0,
59
+ },
60
+ buildDomTreeBreakdown: {
61
+ totalTime: 0,
62
+ totalSelfTime: 0,
63
+ buildDomTreeCalls: 0,
64
+ domOperations: {
65
+ getBoundingClientRect: 0,
66
+ getComputedStyle: 0,
67
+ },
68
+ domOperationCounts: {
69
+ getBoundingClientRect: 0,
70
+ getComputedStyle: 0,
71
+ }
72
+ }
73
+ } : null;
74
+
75
+ // Simple timing helper that only runs in debug mode
76
// Wrap `fn` for timing. Outside debug mode `fn` is returned untouched.
// In debug mode the wrapper times the call, but the measured duration is
// currently not stored anywhere; the call's result is passed through.
function measureTime(fn) {
  if (!debugMode) {
    return fn;
  }
  return function (...callArgs) {
    const startedAt = performance.now();
    const outcome = fn.apply(this, callArgs);
    const elapsed = performance.now() - startedAt;
    return outcome;
  };
}
85
+
86
+ // Helper to measure DOM operations
87
// Run `operation` and return its result. In debug mode, when `name` is a
// tracked DOM operation, its elapsed time and call count are accumulated
// in PERF_METRICS.buildDomTreeBreakdown.
function measureDomOperation(operation, name) {
  if (!debugMode) {
    return operation();
  }

  const startedAt = performance.now();
  const outcome = operation();
  const elapsed = performance.now() - startedAt;

  const breakdown = PERF_METRICS ? PERF_METRICS.buildDomTreeBreakdown : null;
  if (breakdown && name in breakdown.domOperations) {
    breakdown.domOperations[name] += elapsed;
    breakdown.domOperationCounts[name]++;
  }

  return outcome;
}
101
+
102
+ // Add caching mechanisms at the top level
103
+ const DOM_CACHE = {
104
+ boundingRects: new WeakMap(),
105
+ computedStyles: new WeakMap(),
106
+ clearCache: () => {
107
+ DOM_CACHE.boundingRects = new WeakMap();
108
+ DOM_CACHE.computedStyles = new WeakMap();
109
+ }
110
+ };
111
+
112
+ // Cache helper functions
113
// Return the element's bounding rect, reading it from DOM_CACHE when it
// was computed before and storing it there otherwise. Returns null for a
// missing element. In debug mode, cache hits/misses and the time spent in
// getBoundingClientRect are recorded in PERF_METRICS.
function getCachedBoundingRect(element) {
  if (!element) return null;

  const trackMetrics = debugMode && PERF_METRICS;

  if (DOM_CACHE.boundingRects.has(element)) {
    if (trackMetrics) {
      PERF_METRICS.cacheMetrics.boundingRectCacheHits++;
    }
    return DOM_CACHE.boundingRects.get(element);
  }

  if (trackMetrics) {
    PERF_METRICS.cacheMetrics.boundingRectCacheMisses++;
  }

  let rect;
  if (!debugMode) {
    rect = element.getBoundingClientRect();
  } else {
    const startedAt = performance.now();
    rect = element.getBoundingClientRect();
    const elapsed = performance.now() - startedAt;
    if (PERF_METRICS) {
      const breakdown = PERF_METRICS.buildDomTreeBreakdown;
      breakdown.domOperations.getBoundingClientRect += elapsed;
      breakdown.domOperationCounts.getBoundingClientRect++;
    }
  }

  if (rect) {
    DOM_CACHE.boundingRects.set(element, rect);
  }
  return rect;
}
145
+
146
// Return the element's computed style, reading it from DOM_CACHE when it
// was computed before and storing it there otherwise. Returns null for a
// missing element. In debug mode, cache hits/misses and the time spent in
// getComputedStyle are recorded in PERF_METRICS.
function getCachedComputedStyle(element) {
  if (!element) return null;

  const trackMetrics = debugMode && PERF_METRICS;

  if (DOM_CACHE.computedStyles.has(element)) {
    if (trackMetrics) {
      PERF_METRICS.cacheMetrics.computedStyleCacheHits++;
    }
    return DOM_CACHE.computedStyles.get(element);
  }

  if (trackMetrics) {
    PERF_METRICS.cacheMetrics.computedStyleCacheMisses++;
  }

  let style;
  if (!debugMode) {
    style = window.getComputedStyle(element);
  } else {
    const startedAt = performance.now();
    style = window.getComputedStyle(element);
    const elapsed = performance.now() - startedAt;
    if (PERF_METRICS) {
      const breakdown = PERF_METRICS.buildDomTreeBreakdown;
      breakdown.domOperations.getComputedStyle += elapsed;
      breakdown.domOperationCounts.getComputedStyle++;
    }
  }

  if (style) {
    DOM_CACHE.computedStyles.set(element, style);
  }
  return style;
}
178
+
179
+ /**
180
+ * Hash map of DOM nodes indexed by their highlight index.
181
+ *
182
+ * @type {Object<string, any>}
183
+ */
184
+ const DOM_HASH_MAP = {};
185
+
186
+ const ID = { current: 0 };
187
+
188
+ const HIGHLIGHT_CONTAINER_ID = "playwright-highlight-container";
189
+
190
+ /**
191
+ * Highlights an element in the DOM and returns the index of the next element.
192
+ */
193
// Draw a coloured overlay box plus a numbered label over `element` inside
// a shared fixed-position container, and keep both aligned on scroll and
// resize. `parentIframe`, when given, offsets the overlay by the iframe's
// own position. Returns `index + 1`, or `index` unchanged when there is
// no element or no rect.
function highlightElement(element, index, parentIframe = null) {
  if (!element) return index;

  try {
    // Create or get highlight container
    let container = document.getElementById(HIGHLIGHT_CONTAINER_ID);
    if (!container) {
      container = document.createElement("div");
      container.id = HIGHLIGHT_CONTAINER_ID;
      Object.assign(container.style, {
        position: "fixed",
        pointerEvents: "none",
        top: "0",
        left: "0",
        width: "100%",
        height: "100%",
        zIndex: "2147483647",
      });
      document.body.appendChild(container);
    }

    // Get element position
    const rect = measureDomOperation(
      () => element.getBoundingClientRect(),
      'getBoundingClientRect'
    );
    if (!rect) return index;

    // Generate a color based on the index
    const palette = [
      "#FF0000",
      "#00FF00",
      "#0000FF",
      "#FFA500",
      "#800080",
      "#008080",
      "#FF69B4",
      "#4B0082",
      "#FF4500",
      "#2E8B57",
      "#DC143C",
      "#4682B4",
    ];
    const baseColor = palette[index % palette.length];

    // Create highlight overlay
    const overlay = document.createElement("div");
    Object.assign(overlay.style, {
      position: "fixed",
      border: `2px solid ${baseColor}`,
      backgroundColor: baseColor + "1A", // 10% opacity version of the color
      pointerEvents: "none",
      boxSizing: "border-box",
    });

    // Create label
    const label = document.createElement("div");
    label.className = "playwright-highlight-label";
    Object.assign(label.style, {
      position: "fixed",
      background: baseColor,
      color: "white",
      padding: "1px 4px",
      borderRadius: "4px",
      fontSize: `${Math.min(12, Math.max(8, rect.height / 2))}px`,
    });
    label.textContent = index;

    const labelWidth = 20;
    const labelHeight = 16;

    // Position overlay and label for the given element rect; shared by
    // the initial placement and the scroll/resize handler.
    const place = (box) => {
      // If element is in an iframe, calculate iframe offset
      let offsetX = 0;
      let offsetY = 0;
      if (parentIframe) {
        const iframeRect = parentIframe.getBoundingClientRect();
        offsetX = iframeRect.left;
        offsetY = iframeRect.top;
      }

      const top = box.top + offsetY;
      const left = box.left + offsetX;

      overlay.style.top = `${top}px`;
      overlay.style.left = `${left}px`;
      overlay.style.width = `${box.width}px`;
      overlay.style.height = `${box.height}px`;

      // Elements too small to hold the label get it placed above them.
      const tooSmall = box.width < labelWidth + 4 || box.height < labelHeight + 4;
      const labelTop = tooSmall ? top - labelHeight - 2 : top + 2;
      const labelLeft = tooSmall
        ? left + box.width - labelWidth
        : left + box.width - labelWidth - 2;

      label.style.top = `${labelTop}px`;
      label.style.left = `${labelLeft}px`;
    };

    place(rect);

    // Add to container
    container.appendChild(overlay);
    container.appendChild(label);

    // Update positions on scroll
    const updatePositions = () => {
      place(element.getBoundingClientRect());
    };

    window.addEventListener('scroll', updatePositions);
    window.addEventListener('resize', updatePositions);

    return index + 1;
  } finally {
    popTiming('highlighting');
  }
}
334
+
335
+ /**
336
+ * Returns an XPath tree string for an element.
337
+ */
338
// Build a "/"-joined path of lower-cased tag names from `element` up
// through its element ancestors. A segment gets a 1-based `[n]` suffix
// only when same-named element siblings precede it. With `stopAtBoundary`
// the walk ends at a node whose parent is a ShadowRoot or an iframe
// element (that node is not included).
function getXPathTree(element, stopAtBoundary = true) {
  const segments = [];

  for (
    let node = element;
    node && node.nodeType === Node.ELEMENT_NODE;
    node = node.parentNode
  ) {
    // Stop if we hit a shadow root or iframe
    const parent = node.parentNode;
    const atBoundary =
      parent instanceof ShadowRoot || parent instanceof HTMLIFrameElement;
    if (stopAtBoundary && atBoundary) {
      break;
    }

    // Count preceding element siblings that share this node's name.
    let precedingSameName = 0;
    for (let sib = node.previousSibling; sib; sib = sib.previousSibling) {
      if (sib.nodeType === Node.ELEMENT_NODE && sib.nodeName === node.nodeName) {
        precedingSameName++;
      }
    }

    const tagName = node.nodeName.toLowerCase();
    const position = precedingSameName > 0 ? `[${precedingSameName + 1}]` : "";
    segments.unshift(`${tagName}${position}`);
  }

  return segments.join("/");
}
373
+
374
+ /**
375
+ * Checks if a text node is visible.
376
+ */
377
// Decide whether a text node is visible: it must have a non-empty rect,
// lie inside the viewport widened by `viewportExpansion`, and have a
// parent element that is visible. Any error is logged as a warning and
// treated as "not visible".
function isTextNodeVisible(textNode) {
  try {
    const range = document.createRange();
    range.selectNodeContents(textNode);
    const box = range.getBoundingClientRect();

    // Simple size check
    if (box.width === 0 || box.height === 0) {
      return false;
    }

    // Check parent visibility
    const parentElement = textNode.parentElement;
    if (!parentElement) return false;

    // Simple viewport check without scroll calculations
    const outsideViewport =
      box.bottom < -viewportExpansion ||
      box.top > window.innerHeight + viewportExpansion ||
      box.right < -viewportExpansion ||
      box.left > window.innerWidth + viewportExpansion;
    if (outsideViewport) return false;

    try {
      return parentElement.checkVisibility({
        checkOpacity: true,
        checkVisibilityCSS: true,
      });
    } catch (e) {
      // Fallback if checkVisibility is not supported
      const style = window.getComputedStyle(parentElement);
      return (
        style.display !== 'none' &&
        style.visibility !== 'hidden' &&
        style.opacity !== '0'
      );
    }
  } catch (e) {
    console.warn('Error checking text node visibility:', e);
    return false;
  }
}
418
+
419
/**
 * Decides whether an element should be included in the DOM tree.
 *
 * Everything with a tag name is accepted except a small deny list of
 * non-content tags. Common containers (body, div, main, article, section,
 * nav, header, footer) are therefore always accepted, since none of them
 * appears on the deny list.
 *
 * @param {Element} element - Element to check.
 * @returns {boolean} false for missing elements, elements without a tag
 *   name, and deny-listed tags; true otherwise.
 */
function isElementAccepted(element) {
  const rawTag = element?.tagName;
  if (!rawTag) return false;

  switch (rawTag.toLowerCase()) {
    case "svg":
    case "script":
    case "style":
    case "link":
    case "meta":
    case "noscript":
    case "template":
      return false;
    default:
      return true;
  }
}
443
+
444
/**
 * Checks if an element is visible.
 *
 * Visible means: a non-zero layout box (offsetWidth and offsetHeight) and a
 * computed style that is neither `visibility: hidden` nor `display: none`.
 *
 * @param {HTMLElement} element - Element to test.
 * @returns {boolean} true when the element is considered visible.
 */
function isElementVisible(element) {
  const computed = getCachedComputedStyle(element);

  const hasLayoutBox = element.offsetWidth > 0 && element.offsetHeight > 0;
  if (!hasLayoutBox) {
    return false;
  }

  return computed.visibility !== "hidden" && computed.display !== "none";
}
456
+
457
/**
 * Checks if an element is interactive.
 *
 * The element is treated as interactive when any of these heuristics match:
 *  - it is a button-like element inside a cookie/consent banner,
 *  - it is a dropdown toggle,
 *  - it has a known interactive tag, role, class or `data-action`,
 *  - it has a tabindex other than "-1" and is not a direct child of <body>,
 *  - it looks like a cookie/consent container itself,
 *  - it has a click handler (inline attribute, framework attribute or
 *    on* property),
 *  - it carries a stateful ARIA attribute, is content-editable, or is
 *    draggable.
 *
 * @param {Element} element - Node to classify.
 * @returns {boolean} true when the element should be treated as interactive.
 */
function isInteractiveElement(element) {
  if (!element || element.nodeType !== Node.ELEMENT_NODE) {
    return false;
  }

  // Special handling for cookie banner elements
  const isCookieBannerElement =
    (typeof element.closest === 'function') && (
      element.closest('[id*="onetrust"]') ||
      element.closest('[class*="onetrust"]') ||
      element.closest('[data-nosnippet="true"]') ||
      element.closest('[aria-label*="cookie"]')
    );

  if (isCookieBannerElement) {
    // Check if it's a button or interactive element within the banner
    if (
      element.tagName.toLowerCase() === 'button' ||
      element.getAttribute('role') === 'button' ||
      element.onclick ||
      element.getAttribute('onclick') ||
      (element.classList && (
        element.classList.contains('ot-sdk-button') ||
        element.classList.contains('accept-button') ||
        element.classList.contains('reject-button')
      )) ||
      element.getAttribute('aria-label')?.toLowerCase().includes('accept') ||
      element.getAttribute('aria-label')?.toLowerCase().includes('reject')
    ) {
      return true;
    }
  }

  // Base interactive elements and roles
  const interactiveElements = new Set([
    "a", "button", "details", "embed", "input", "menu", "menuitem",
    "object", "select", "textarea", "canvas", "summary", "dialog",
    "banner"
  ]);

  const interactiveRoles = new Set(['button-icon', 'dialog', 'button-text-icon-only', 'treeitem', 'alert', 'grid', 'progressbar', 'radio', 'checkbox', 'menuitem', 'option', 'switch', 'dropdown', 'scrollbar', 'combobox', 'a-button-text', 'button', 'region', 'textbox', 'tabpanel', 'tab', 'click', 'button-text', 'spinbutton', 'a-button-inner', 'link', 'menu', 'slider', 'listbox', 'a-dropdown-button', 'button-icon-only', 'searchbox', 'menuitemradio', 'tooltip', 'tree', 'menuitemcheckbox']);

  const tagName = element.tagName.toLowerCase();
  const role = element.getAttribute("role");
  const ariaRole = element.getAttribute("aria-role");
  const tabIndex = element.getAttribute("tabindex");

  // Add check for specific class
  const hasAddressInputClass = element.classList && (
    element.classList.contains("address-input__container__input") ||
    element.classList.contains("nav-btn") ||
    element.classList.contains("pull-left")
  );

  // Added enhancement to capture dropdown interactive elements
  if (element.classList && (
    element.classList.contains('dropdown-toggle') ||
    element.getAttribute('data-toggle') === 'dropdown' ||
    element.getAttribute('aria-haspopup') === 'true'
  )) {
    return true;
  }

  // Basic role/attribute checks
  const hasInteractiveRole =
    hasAddressInputClass ||
    interactiveElements.has(tagName) ||
    interactiveRoles.has(role) ||
    interactiveRoles.has(ariaRole) ||
    (tabIndex !== null &&
      tabIndex !== "-1" &&
      element.parentElement?.tagName.toLowerCase() !== "body") ||
    element.getAttribute("data-action") === "a-dropdown-select" ||
    element.getAttribute("data-action") === "a-dropdown-button";

  if (hasInteractiveRole) return true;

  // Additional checks for cookie banners and consent UI
  const isCookieBanner =
    element.id?.toLowerCase().includes('cookie') ||
    element.id?.toLowerCase().includes('consent') ||
    element.id?.toLowerCase().includes('notice') ||
    (element.classList && (
      element.classList.contains('otCenterRounded') ||
      element.classList.contains('ot-sdk-container')
    )) ||
    element.getAttribute('data-nosnippet') === 'true' ||
    element.getAttribute('aria-label')?.toLowerCase().includes('cookie') ||
    element.getAttribute('aria-label')?.toLowerCase().includes('consent') ||
    (element.tagName.toLowerCase() === 'div' && (
      element.id?.includes('onetrust') ||
      (element.classList && (
        element.classList.contains('onetrust') ||
        element.classList.contains('cookie') ||
        element.classList.contains('consent')
      ))
    ));

  if (isCookieBanner) return true;

  // Additional check for buttons in cookie banners
  const isInCookieBanner = typeof element.closest === 'function' && element.closest(
    '[id*="cookie"],[id*="consent"],[class*="cookie"],[class*="consent"],[id*="onetrust"]'
  );

  if (isInCookieBanner && (
    element.tagName.toLowerCase() === 'button' ||
    element.getAttribute('role') === 'button' ||
    (element.classList && element.classList.contains('button')) ||
    element.onclick ||
    element.getAttribute('onclick')
  )) {
    return true;
  }

  // Check for event listeners
  const hasClickHandler =
    element.onclick !== null ||
    element.getAttribute("onclick") !== null ||
    element.hasAttribute("ng-click") ||
    element.hasAttribute("@click") ||
    element.hasAttribute("v-on:click");

  // Helper function to safely get event listeners.
  // window.getEventListeners is a DevTools console utility and is normally
  // absent in page scripts, so its absence (not only an exception) must
  // lead to the on* property inspection below.
  function getEventListeners(el) {
    if (typeof window.getEventListeners === "function") {
      try {
        return window.getEventListeners(el) || {};
      } catch (e) {
        // Fall through to the on* property inspection.
      }
    }

    const listeners = {};
    const eventTypes = [
      "click",
      "mousedown",
      "mouseup",
      "touchstart",
      "touchend",
      "keydown",
      "keyup",
      "focus",
      "blur",
    ];

    for (const type of eventTypes) {
      const handler = el[`on${type}`];
      if (handler) {
        listeners[type] = [{ listener: handler, useCapture: false }];
      }
    }
    return listeners;
  }

  // Check for click-related events
  const listeners = getEventListeners(element);
  const hasClickListeners =
    listeners &&
    (listeners.click?.length > 0 ||
      listeners.mousedown?.length > 0 ||
      listeners.mouseup?.length > 0 ||
      listeners.touchstart?.length > 0 ||
      listeners.touchend?.length > 0);

  // Check for ARIA properties
  const hasAriaProps =
    element.hasAttribute("aria-expanded") ||
    element.hasAttribute("aria-pressed") ||
    element.hasAttribute("aria-selected") ||
    element.hasAttribute("aria-checked");

  // classList is guarded here like everywhere else in this function.
  const isContentEditable = element.getAttribute("contenteditable") === "true" ||
    element.isContentEditable ||
    element.id === "tinymce" ||
    (element.classList && element.classList.contains("mce-content-body")) ||
    (element.tagName.toLowerCase() === "body" && element.getAttribute("data-id")?.startsWith("mce_"));

  // Check if element is draggable
  const isDraggable =
    element.draggable || element.getAttribute("draggable") === "true";

  return (
    hasAriaProps ||
    hasClickHandler ||
    hasClickListeners ||
    isDraggable ||
    isContentEditable
  );
}
649
+
650
/**
 * Checks if an element is the topmost element at its position.
 *
 * Elements outside the viewport and elements owned by another document
 * (iframes) are reported as top without hit-testing. Otherwise the centre of
 * the element's bounding rect is hit-tested, through the element's shadow
 * root when it lives in one and through `document` otherwise, and the
 * element is top when it is the hit element or one of its ancestors.
 *
 * @param {Element} element - Element to test.
 * @returns {boolean} true when the element is considered topmost; also true
 *   when hit-testing throws.
 */
function isTopElement(element) {
  const box = getCachedBoundingRect(element);

  // If element is not in viewport, consider it top
  const overlapsViewport =
    box.left < window.innerWidth &&
    box.right > 0 &&
    box.top < window.innerHeight &&
    box.bottom > 0;
  if (!overlapsViewport) {
    return true;
  }

  // If we're in an iframe, elements are considered top by default
  if (element.ownerDocument !== window.document) {
    return true;
  }

  // For shadow DOM, hit-test within its own root context
  const root = element.getRootNode();
  const insideShadow = root instanceof ShadowRoot;

  const midX = box.left + box.width / 2;
  const midY = box.top + box.height / 2;

  try {
    const hit = insideShadow
      ? measureDomOperation(
          () => root.elementFromPoint(midX, midY),
          'elementFromPoint'
        )
      : document.elementFromPoint(midX, midY);
    if (!hit) return false;

    // Climb from the hit element; finding `element` on the way means it is on top.
    const boundary = insideShadow ? root : document.documentElement;
    for (let node = hit; node && node !== boundary; node = node.parentElement) {
      if (node === element) return true;
    }
    return false;
  } catch (e) {
    return true;
  }
}
718
+
719
/**
 * Checks if an element is within the expanded viewport.
 *
 * @param {Element} element - Element to test.
 * @param {number} viewportExpansion - Margin added on every side of the
 *   window's inner box; -1 disables the check (everything is "inside").
 * @returns {boolean} true unless the element's bounding rect lies entirely
 *   beyond one edge of the expanded viewport.
 */
function isInExpandedViewport(element, viewportExpansion) {
  if (viewportExpansion === -1) {
    return true;
  }

  const margin = viewportExpansion;
  const box = getCachedBoundingRect(element);

  // Simple viewport check without scroll calculations
  const isAbove = box.bottom < -margin;
  const isBelow = box.top > window.innerHeight + margin;
  const isLeftOf = box.right < -margin;
  const isRightOf = box.left > window.innerWidth + margin;

  return !(isAbove || isBelow || isLeftOf || isRightOf);
}
737
+
738
/**
 * Sums the scroll offsets of an element's scrolled ancestors plus the window.
 *
 * Starting at `element`, every node up to (not including)
 * `document.documentElement` that has a non-zero scrollLeft or scrollTop
 * contributes both of its offsets; `window.scrollX` / `window.scrollY` are
 * added last. The walk runs inside measureDomOperation under the
 * 'scrollOperations' label.
 *
 * @param {Element} element - Element whose scroll context is measured.
 * @returns {{scrollX: number, scrollY: number}} Accumulated offsets.
 */
function getEffectiveScroll(element) {
  let node = element;
  let totalX = 0;
  let totalY = 0;

  const accumulate = () => {
    for (; node && node !== document.documentElement; node = node.parentElement) {
      if (node.scrollLeft || node.scrollTop) {
        totalX += node.scrollLeft;
        totalY += node.scrollTop;
      }
    }

    totalX += window.scrollX;
    totalY += window.scrollY;

    return { scrollX: totalX, scrollY: totalY };
  };

  return measureDomOperation(accumulate, 'scrollOperations');
}
759
+
760
/**
 * Cheap pre-filter: could this element possibly be interactive?
 *
 * Returns true for common interactive tags and for elements carrying
 * `onclick`, `role`, `tabindex`, `data-action` or any `aria-*` attribute.
 * buildDomTree uses the result to decide whether to copy the element's
 * attributes into the node data.
 *
 * @param {Element} element - Element to check.
 * @returns {boolean} true when the element is worth a closer look.
 */
function isInteractiveCandidate(element) {
  if (!element || element.nodeType !== Node.ELEMENT_NODE) return false;

  const tagName = element.tagName.toLowerCase();

  // Fast-path for common interactive elements
  const interactiveElements = new Set([
    "a", "button", "input", "select", "textarea", "details", "summary"
  ]);

  if (interactiveElements.has(tagName)) return true;

  // Quick attribute checks without getting full lists
  if (
    element.hasAttribute("onclick") ||
    element.hasAttribute("role") ||
    element.hasAttribute("tabindex") ||
    element.hasAttribute("data-action")
  ) {
    return true;
  }

  // hasAttribute() matches exact names only, so hasAttribute("aria-") can
  // never detect aria-label, aria-expanded, etc. Scan the names for the
  // prefix instead.
  const attributeNames = element.getAttributeNames?.() || [];
  return attributeNames.some((name) => name.startsWith("aria-"));
}
782
+
783
/**
 * Fast visibility pre-check that avoids getComputedStyle.
 *
 * Only looks at the layout box, the `hidden` attribute and the element's
 * inline `style` values, so styles applied via stylesheets are not seen.
 *
 * @param {HTMLElement} element - Element to test.
 * @returns {boolean} false when the element is obviously hidden.
 */
function quickVisibilityCheck(element) {
  if (!(element.offsetWidth > 0)) return false;
  if (!(element.offsetHeight > 0)) return false;
  if (element.hasAttribute("hidden")) return false;

  const inlineStyle = element.style;
  if (inlineStyle.display === "none") return false;
  return inlineStyle.visibility !== "hidden";
}
791
+
792
/**
 * Creates a node data object for a given node and its descendants.
 *
 * Children are processed before their parent, and every kept node is stored
 * in DOM_HASH_MAP under a freshly allocated string id (`ID.current++`).
 *
 * @param {Node} node - DOM node to process.
 * @param {HTMLIFrameElement|null} [parentIframe=null] - Iframe that contains
 *   `node`, forwarded to highlightElement.
 * @returns {string|null} Id of the stored node data, or null when the node is
 *   skipped.
 */
function buildDomTree(node, parentIframe = null) {
  // Shared exit path for rejected nodes.
  const skipNode = () => {
    if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++;
    return null;
  };

  // Shared exit path for kept nodes: allocate an id and store the data.
  const storeNode = (data) => {
    const key = `${ID.current++}`;
    DOM_HASH_MAP[key] = data;
    if (debugMode) PERF_METRICS.nodeMetrics.processedNodes++;
    return key;
  };

  // Recurse into `childNodes` and collect the ids of the kept children.
  const collectChildren = (childNodes, iframeContext, target) => {
    for (const childNode of childNodes) {
      const childId = buildDomTree(childNode, iframeContext);
      if (childId) target.children.push(childId);
    }
  };

  if (debugMode) PERF_METRICS.nodeMetrics.totalNodes++;

  if (!node || node.id === HIGHLIGHT_CONTAINER_ID) {
    return skipNode();
  }

  // Special handling for root node (body)
  if (node === document.body) {
    const bodyData = {
      tagName: 'body',
      attributes: {},
      xpath: '/body',
      children: [],
    };
    collectChildren(node.childNodes, parentIframe, bodyData);
    return storeNode(bodyData);
  }

  const isElement = node.nodeType === Node.ELEMENT_NODE;
  const isText = node.nodeType === Node.TEXT_NODE;

  // Early bailout for non-element nodes except text
  if (!isElement && !isText) {
    return skipNode();
  }

  // Process text nodes
  if (isText) {
    const trimmedText = node.textContent.trim();
    if (!trimmedText) {
      return skipNode();
    }

    // Text without a parent element, or inside <script>, is never kept.
    const textParent = node.parentElement;
    if (!textParent || textParent.tagName.toLowerCase() === 'script') {
      return skipNode();
    }

    return storeNode({
      type: "TEXT_NODE",
      text: trimmedText,
      isVisible: isTextNodeVisible(node),
    });
  }

  // Quick checks for element nodes
  if (!isElementAccepted(node)) {
    return skipNode();
  }

  // Early viewport check - only filter out elements clearly outside viewport
  if (viewportExpansion !== -1) {
    const box = getCachedBoundingRect(node);
    const computed = getCachedComputedStyle(node);

    // Skip viewport check for fixed/sticky elements as they may appear anywhere
    const pinned = computed && (computed.position === 'fixed' || computed.position === 'sticky');

    // Check if element has actual dimensions
    const sized = node.offsetWidth > 0 || node.offsetHeight > 0;

    if (!box || (!pinned && !sized && (
      box.bottom < -viewportExpansion ||
      box.top > window.innerHeight + viewportExpansion ||
      box.right < -viewportExpansion ||
      box.left > window.innerWidth + viewportExpansion
    ))) {
      return skipNode();
    }
  }

  // Process element node
  const lowerTag = node.tagName.toLowerCase();
  const nodeData = {
    tagName: lowerTag,
    attributes: {},
    xpath: getXPathTree(node, true),
    children: [],
  };

  // Get attributes for interactive elements or potential text containers
  if (isInteractiveCandidate(node) || lowerTag === 'iframe' || lowerTag === 'body') {
    for (const attrName of node.getAttributeNames?.() || []) {
      nodeData.attributes[attrName] = node.getAttribute(attrName);
    }
  }

  // Check interactivity: each stage runs only if the previous one passed.
  nodeData.isVisible = isElementVisible(node);
  if (nodeData.isVisible) {
    nodeData.isTopElement = isTopElement(node);
  }
  if (nodeData.isTopElement) {
    nodeData.isInteractive = isInteractiveElement(node);
  }
  if (nodeData.isInteractive) {
    nodeData.isInViewport = true;
    nodeData.highlightIndex = highlightIndex++;

    // With a focus index (>= 0) only that one element is highlighted.
    const wantsHighlight =
      doHighlightElements &&
      (!(focusHighlightIndex >= 0) || focusHighlightIndex === nodeData.highlightIndex);
    if (wantsHighlight) {
      highlightElement(node, nodeData.highlightIndex, parentIframe);
    }
  }

  // Process children, with special handling for iframes and rich text editors
  if (node.tagName) {
    if (lowerTag === "iframe") {
      // Handle iframes
      try {
        const frameDocument = node.contentDocument || node.contentWindow?.document;
        if (frameDocument) {
          collectChildren(frameDocument.childNodes, node, nodeData);
        }
      } catch (e) {
        console.warn("Unable to access iframe:", e);
      }
    } else {
      // Rich text editors and contenteditable elements take precedence over
      // shadow DOM: all their child nodes are processed to capture formatted text.
      const isRichTextHost =
        node.isContentEditable ||
        node.getAttribute("contenteditable") === "true" ||
        node.id === "tinymce" ||
        node.classList.contains("mce-content-body") ||
        (lowerTag === "body" && node.getAttribute("data-id")?.startsWith("mce_"));

      if (!isRichTextHost && node.shadowRoot) {
        // Handle shadow DOM
        nodeData.shadowRoot = true;
        collectChildren(node.shadowRoot.childNodes, parentIframe, nodeData);
      } else {
        // Rich text hosts and regular elements
        collectChildren(node.childNodes, parentIframe, nodeData);
      }
    }
  }

  // Skip empty anchor tags
  if (nodeData.tagName === 'a' && nodeData.children.length === 0 && !nodeData.attributes.href) {
    return skipNode();
  }

  return storeNode(nodeData);
}
986
+
987
+ // After all functions are defined, wrap them with performance measurement
988
+ // Remove buildDomTree from here as we measure it separately
989
+ highlightElement = measureTime(highlightElement);
990
+ isInteractiveElement = measureTime(isInteractiveElement);
991
+ isElementVisible = measureTime(isElementVisible);
992
+ isTopElement = measureTime(isTopElement);
993
+ isInExpandedViewport = measureTime(isInExpandedViewport);
994
+ isTextNodeVisible = measureTime(isTextNodeVisible);
995
+ getEffectiveScroll = measureTime(getEffectiveScroll);
996
+
997
+ const rootId = buildDomTree(document.body);
998
+
999
+ // Clear the cache before starting
1000
+ DOM_CACHE.clearCache();
1001
+
1002
+ // Only process metrics in debug mode
1003
+ if (debugMode && PERF_METRICS) {
1004
+ // Convert timings to seconds and add useful derived metrics
1005
+ Object.keys(PERF_METRICS.timings).forEach(key => {
1006
+ PERF_METRICS.timings[key] = PERF_METRICS.timings[key] / 1000;
1007
+ });
1008
+
1009
+ Object.keys(PERF_METRICS.buildDomTreeBreakdown).forEach(key => {
1010
+ if (typeof PERF_METRICS.buildDomTreeBreakdown[key] === 'number') {
1011
+ PERF_METRICS.buildDomTreeBreakdown[key] = PERF_METRICS.buildDomTreeBreakdown[key] / 1000;
1012
+ }
1013
+ });
1014
+
1015
+ // Add some useful derived metrics
1016
+ if (PERF_METRICS.buildDomTreeBreakdown.buildDomTreeCalls > 0) {
1017
+ PERF_METRICS.buildDomTreeBreakdown.averageTimePerNode =
1018
+ PERF_METRICS.buildDomTreeBreakdown.totalTime / PERF_METRICS.buildDomTreeBreakdown.buildDomTreeCalls;
1019
+ }
1020
+
1021
+ PERF_METRICS.buildDomTreeBreakdown.timeInChildCalls =
1022
+ PERF_METRICS.buildDomTreeBreakdown.totalTime - PERF_METRICS.buildDomTreeBreakdown.totalSelfTime;
1023
+
1024
+ // Add average time per operation to the metrics
1025
+ Object.keys(PERF_METRICS.buildDomTreeBreakdown.domOperations).forEach(op => {
1026
+ const time = PERF_METRICS.buildDomTreeBreakdown.domOperations[op];
1027
+ const count = PERF_METRICS.buildDomTreeBreakdown.domOperationCounts[op];
1028
+ if (count > 0) {
1029
+ PERF_METRICS.buildDomTreeBreakdown.domOperations[`${op}Average`] = time / count;
1030
+ }
1031
+ });
1032
+
1033
+ // Calculate cache hit rates
1034
+ const boundingRectTotal = PERF_METRICS.cacheMetrics.boundingRectCacheHits + PERF_METRICS.cacheMetrics.boundingRectCacheMisses;
1035
+ const computedStyleTotal = PERF_METRICS.cacheMetrics.computedStyleCacheHits + PERF_METRICS.cacheMetrics.computedStyleCacheMisses;
1036
+
1037
+ if (boundingRectTotal > 0) {
1038
+ PERF_METRICS.cacheMetrics.boundingRectHitRate = PERF_METRICS.cacheMetrics.boundingRectCacheHits / boundingRectTotal;
1039
+ }
1040
+
1041
+ if (computedStyleTotal > 0) {
1042
+ PERF_METRICS.cacheMetrics.computedStyleHitRate = PERF_METRICS.cacheMetrics.computedStyleCacheHits / computedStyleTotal;
1043
+ }
1044
+
1045
+ if ((boundingRectTotal + computedStyleTotal) > 0) {
1046
+ PERF_METRICS.cacheMetrics.overallHitRate =
1047
+ (PERF_METRICS.cacheMetrics.boundingRectCacheHits + PERF_METRICS.cacheMetrics.computedStyleCacheHits) /
1048
+ (boundingRectTotal + computedStyleTotal);
1049
+ }
1050
+ }
1051
+
1052
+ return debugMode ?
1053
+ { rootId, map: DOM_HASH_MAP, perfMetrics: PERF_METRICS } :
1054
+ { rootId, map: DOM_HASH_MAP };
1055
+ };
browser_use/dom/history_tree_processor/service.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ from typing import Optional
3
+
4
+ from browser_use.dom.history_tree_processor.view import DOMHistoryElement, HashedDomElement
5
+ from browser_use.dom.views import DOMElementNode
6
+
7
+
8
class HistoryTreeProcessor:
    """
    Operations on the DOM elements

    Converts live DOM elements into history records and finds a recorded
    element again in a DOM tree by comparing SHA-256 hashes of its parent
    branch path, its attributes and its xpath.

    @dev be careful - text nodes can change even if elements stay the same
    """

    @staticmethod
    def convert_dom_element_to_history_element(dom_element: DOMElementNode) -> DOMHistoryElement:
        """Snapshot ``dom_element`` into a ``DOMHistoryElement`` record."""
        # NOTE(review): imported locally, presumably to avoid an import cycle - confirm
        from browser_use.browser.context import BrowserContext

        branch_path = HistoryTreeProcessor._get_parent_branch_path(dom_element)
        selector = BrowserContext._enhanced_css_selector_for_element(dom_element)
        return DOMHistoryElement(
            tag_name=dom_element.tag_name,
            xpath=dom_element.xpath,
            highlight_index=dom_element.highlight_index,
            entire_parent_branch_path=branch_path,
            attributes=dom_element.attributes,
            shadow_root=dom_element.shadow_root,
            css_selector=selector,
            page_coordinates=dom_element.page_coordinates,
            viewport_coordinates=dom_element.viewport_coordinates,
            viewport_info=dom_element.viewport_info,
        )

    @staticmethod
    def find_history_element_in_tree(dom_history_element: DOMHistoryElement, tree: DOMElementNode) -> Optional[DOMElementNode]:
        """
        Return the first highlighted element in ``tree`` whose hash equals the
        hash of ``dom_history_element``, or None when nothing matches.

        Nodes are visited depth-first, parents before children, children in order.
        """
        wanted = HistoryTreeProcessor._hash_dom_history_element(dom_history_element)

        pending = [tree]
        while pending:
            candidate = pending.pop()
            # Only elements that carry a highlight index are compared.
            if candidate.highlight_index is not None:
                if HistoryTreeProcessor._hash_dom_element(candidate) == wanted:
                    return candidate
            # Push in reverse so the first child is popped (visited) first.
            pending.extend(child for child in reversed(candidate.children) if isinstance(child, DOMElementNode))
        return None

    @staticmethod
    def compare_history_element_and_dom_element(dom_history_element: DOMHistoryElement, dom_element: DOMElementNode) -> bool:
        """Return True when both elements produce the same ``HashedDomElement``."""
        return HistoryTreeProcessor._hash_dom_history_element(
            dom_history_element
        ) == HistoryTreeProcessor._hash_dom_element(dom_element)

    @staticmethod
    def _hash_dom_history_element(dom_history_element: DOMHistoryElement) -> HashedDomElement:
        """Hash a history record using its stored branch path, attributes and xpath."""
        return HashedDomElement(
            branch_path_hash=HistoryTreeProcessor._parent_branch_path_hash(dom_history_element.entire_parent_branch_path),
            attributes_hash=HistoryTreeProcessor._attributes_hash(dom_history_element.attributes),
            xpath_hash=HistoryTreeProcessor._xpath_hash(dom_history_element.xpath),
        )

    @staticmethod
    def _hash_dom_element(dom_element: DOMElementNode) -> HashedDomElement:
        """Hash a live element; the branch path is derived from its parent chain."""
        branch_path = HistoryTreeProcessor._get_parent_branch_path(dom_element)
        # The text hash is intentionally not part of the identity (see _text_hash).
        return HashedDomElement(
            branch_path_hash=HistoryTreeProcessor._parent_branch_path_hash(branch_path),
            attributes_hash=HistoryTreeProcessor._attributes_hash(dom_element.attributes),
            xpath_hash=HistoryTreeProcessor._xpath_hash(dom_element.xpath),
        )

    @staticmethod
    def _get_parent_branch_path(dom_element: DOMElementNode) -> list[str]:
        """
        Return tag names from the top of the branch down to ``dom_element``.

        The element itself is included; the node that has no parent is not.
        """
        tag_names: list[str] = []
        node: DOMElementNode = dom_element
        while node.parent is not None:
            tag_names.append(node.tag_name)
            node = node.parent

        # Collected leaf-first; callers expect root-first order.
        tag_names.reverse()
        return tag_names

    @staticmethod
    def _sha256_hex(text: str) -> str:
        """Return the SHA-256 hex digest of ``text`` (default str encoding)."""
        return hashlib.sha256(text.encode()).hexdigest()

    @staticmethod
    def _parent_branch_path_hash(parent_branch_path: list[str]) -> str:
        """Hash the branch path joined with '/'."""
        return HistoryTreeProcessor._sha256_hex('/'.join(parent_branch_path))

    @staticmethod
    def _attributes_hash(attributes: dict[str, str]) -> str:
        """Hash the attributes as concatenated ``key=value`` pairs in dict order."""
        pairs = [f'{key}={value}' for key, value in attributes.items()]
        return HistoryTreeProcessor._sha256_hex(''.join(pairs))

    @staticmethod
    def _xpath_hash(xpath: str) -> str:
        """Hash the xpath string."""
        return HistoryTreeProcessor._sha256_hex(xpath)

    @staticmethod
    def _text_hash(dom_element: DOMElementNode) -> str:
        """Hash the element's text up to the next clickable element."""
        return HistoryTreeProcessor._sha256_hex(dom_element.get_all_text_till_next_clickable_element())
browser_use/dom/history_tree_processor/view.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import TYPE_CHECKING, Optional
3
+
4
+ from pydantic import BaseModel
5
+
6
+
7
@dataclass
class HashedDomElement:
    """
    Hash of the dom element to be used as a unique identifier

    Two elements are treated as the same element when all three hashes match
    (dataclass equality).
    """

    # Hash of the element's parent branch path.
    branch_path_hash: str
    # Hash of the element's attributes.
    attributes_hash: str
    # Hash of the element's xpath.
    xpath_hash: str
    # text_hash: str
17
+
18
+
19
class Coordinates(BaseModel):
    """A single 2D point with integer components."""

    # Horizontal component.
    x: int
    # Vertical component.
    y: int
22
+
23
+
24
class CoordinateSet(BaseModel):
    """The four corners, the centre and the size of a rectangle."""

    top_left: Coordinates
    top_right: Coordinates
    bottom_left: Coordinates
    bottom_right: Coordinates
    center: Coordinates
    # Rectangle size; presumably in the same units as the corner points - TODO confirm.
    width: int
    height: int
32
+
33
+
34
class ViewportInfo(BaseModel):
    """Scroll offsets and size of a viewport."""

    # Scroll offsets.
    scroll_x: int
    scroll_y: int
    # Viewport size.
    width: int
    height: int
39
+
40
+
41
@dataclass
class DOMHistoryElement:
    """Serializable record of a DOM element as it was seen at some point in time."""

    tag_name: str
    xpath: str
    highlight_index: Optional[int]
    # Tag names along the element's branch of the tree.
    entire_parent_branch_path: list[str]
    attributes: dict[str, str]
    shadow_root: bool = False
    css_selector: Optional[str] = None
    page_coordinates: Optional[CoordinateSet] = None
    viewport_coordinates: Optional[CoordinateSet] = None
    viewport_info: Optional[ViewportInfo] = None

    def to_dict(self) -> dict:
        """
        Return the record as a plain dict.

        The three model-valued fields are converted with ``model_dump()``;
        falsy (unset) ones become None.
        """

        def dump_or_none(model):
            return model.model_dump() if model else None

        return {
            'tag_name': self.tag_name,
            'xpath': self.xpath,
            'highlight_index': self.highlight_index,
            'entire_parent_branch_path': self.entire_parent_branch_path,
            'attributes': self.attributes,
            'shadow_root': self.shadow_root,
            'css_selector': self.css_selector,
            'page_coordinates': dump_or_none(self.page_coordinates),
            'viewport_coordinates': dump_or_none(self.viewport_coordinates),
            'viewport_info': dump_or_none(self.viewport_info),
        }
browser_use/dom/service.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gc
2
+ import json
3
+ import logging
4
+ from dataclasses import dataclass
5
+ from importlib import resources
6
+ from typing import TYPE_CHECKING, Optional
7
+
8
+ if TYPE_CHECKING:
9
+ from playwright.async_api import Page
10
+
11
+ from browser_use.dom.views import (
12
+ DOMBaseNode,
13
+ DOMElementNode,
14
+ DOMState,
15
+ DOMTextNode,
16
+ SelectorMap,
17
+ )
18
+ from browser_use.utils import time_execution_async
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@dataclass
class ViewportInfo:
    """Viewport size attached to a parsed DOM element node."""

    # Copied from the node's 'viewport' entry ('width' / 'height') when present.
    width: int
    height: int
27
+
28
+
29
class DomService:
    """
    Extracts the DOM of a Playwright page as a tree of ``DOMElementNode`` /
    ``DOMTextNode`` objects by running ``buildDomTree.js`` inside the page.
    """

    def __init__(self, page: 'Page'):
        self.page = page
        self.xpath_cache = {}

        # The script is loaded once per service instance and re-evaluated on every call.
        self.js_code = resources.read_text('browser_use.dom', 'buildDomTree.js')

    # region - Clickable elements
    @time_execution_async('--get_clickable_elements')
    async def get_clickable_elements(
        self,
        highlight_elements: bool = True,
        focus_element: int = -1,
        viewport_expansion: int = 0,
    ) -> DOMState:
        """
        Build the DOM tree of the current page and return it with its selector map.

        Args:
            highlight_elements: Forwarded to the script as ``doHighlightElements``.
            focus_element: Forwarded as ``focusHighlightIndex``.
            viewport_expansion: Forwarded as ``viewportExpansion``.
        """
        element_tree, selector_map = await self._build_dom_tree(highlight_elements, focus_element, viewport_expansion)
        return DOMState(element_tree=element_tree, selector_map=selector_map)

    @time_execution_async('--build_dom_tree')
    async def _build_dom_tree(
        self,
        highlight_elements: bool,
        focus_element: int,
        viewport_expansion: int,
    ) -> tuple[DOMElementNode, SelectorMap]:
        """
        Evaluate ``buildDomTree.js`` in the page and parse its result.

        Raises:
            ValueError: If the page cannot evaluate a trivial expression, or the
                result has no usable root element.
        """
        if await self.page.evaluate('1+1') != 2:
            raise ValueError('The page cannot evaluate javascript code properly')

        # NOTE: We execute JS code in the browser to extract important DOM information.
        #       The returned hash map contains information about the DOM tree and the
        #       relationship between the DOM elements.
        debug_mode = logger.getEffectiveLevel() == logging.DEBUG
        args = {
            'doHighlightElements': highlight_elements,
            'focusHighlightIndex': focus_element,
            'viewportExpansion': viewport_expansion,
            'debugMode': debug_mode,
        }

        try:
            eval_page = await self.page.evaluate(self.js_code, args)
        except Exception as e:
            logger.error('Error evaluating JavaScript: %s', e)
            raise

        # Only log performance metrics in debug mode
        if debug_mode and 'perfMetrics' in eval_page:
            logger.debug('DOM Tree Building Performance Metrics:\n%s', json.dumps(eval_page['perfMetrics'], indent=2))

        return await self._construct_dom_tree(eval_page)

    @time_execution_async('--construct_dom_tree')
    async def _construct_dom_tree(
        self,
        eval_page: dict,
    ) -> tuple[DOMElementNode, SelectorMap]:
        """
        Turn the script result (``{'map': ..., 'rootId': ...}``) into linked nodes.

        Returns:
            The root element node and a map from highlight index to element node.

        Raises:
            ValueError: If the root id is missing from the map or is not an element.
        """
        js_node_map = eval_page['map']
        js_root_id = eval_page['rootId']

        selector_map = {}
        node_map = {}

        for node_id, node_data in js_node_map.items():
            node, children_ids = self._parse_node(node_data)
            if node is None:
                continue

            node_map[node_id] = node

            if isinstance(node, DOMElementNode) and node.highlight_index is not None:
                selector_map[node.highlight_index] = node

            # NOTE: We know that we are building the tree bottom up
            #       and all children are already processed.
            if isinstance(node, DOMElementNode):
                for child_id in children_ids:
                    if child_id not in node_map:
                        continue

                    child_node = node_map[child_id]

                    child_node.parent = node
                    node.children.append(child_node)

        # .get(): a missing root must surface as the ValueError below,
        # not as a bare KeyError from the lookup itself.
        html_to_dict = node_map.get(str(js_root_id))

        # Drop the large intermediate maps before returning.
        del node_map
        del js_node_map
        del js_root_id

        gc.collect()

        if html_to_dict is None or not isinstance(html_to_dict, DOMElementNode):
            raise ValueError('Failed to parse HTML to dictionary')

        return html_to_dict, selector_map

    def _parse_node(
        self,
        node_data: dict,
    ) -> tuple[Optional[DOMBaseNode], list[str]]:
        """
        Convert one entry of the script's node map into a DOM node.

        Returns:
            ``(node, children_ids)``. ``node`` is None for empty entries; text
            nodes never have children. Child ids are keys of the script's node
            map (strings).
        """
        if not node_data:
            return None, []

        # Process text nodes immediately
        if node_data.get('type') == 'TEXT_NODE':
            text_node = DOMTextNode(
                text=node_data['text'],
                is_visible=node_data['isVisible'],
                parent=None,
            )
            return text_node, []

        # Process coordinates if they exist for element nodes

        viewport_info = None

        if 'viewport' in node_data:
            viewport_info = ViewportInfo(
                width=node_data['viewport']['width'],
                height=node_data['viewport']['height'],
            )

        # Flags the script did not set default to False / None.
        element_node = DOMElementNode(
            tag_name=node_data['tagName'],
            xpath=node_data['xpath'],
            attributes=node_data.get('attributes', {}),
            children=[],
            is_visible=node_data.get('isVisible', False),
            is_interactive=node_data.get('isInteractive', False),
            is_top_element=node_data.get('isTopElement', False),
            is_in_viewport=node_data.get('isInViewport', False),
            highlight_index=node_data.get('highlightIndex'),
            shadow_root=node_data.get('shadowRoot', False),
            parent=None,
            viewport_info=viewport_info,
        )

        children_ids = node_data.get('children', [])

        return element_node, children_ids
browser_use/dom/tests/debug_page_structure.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import os
3
+ import sys
4
+
5
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
6
+
7
+ from browser_use.browser.browser import Browser, BrowserConfig
8
+ from browser_use.browser.context import BrowserContext
9
+
10
+
11
async def analyze_page_structure(url: str):
    """Open `url` in a visible browser and print layout debugging information.

    Prints, in order: the viewport size and scroll offsets, every element
    whose id or class contains "cookie" or "consent", every element with
    `position: fixed` or `position: sticky`, and the value returned by
    `ctx.get_page_structure()`. Blocks on stdin (Enter) before the browser
    is closed. The browser is closed even if an error occurs.
    """
    browser = Browser(
        config=BrowserConfig(
            headless=False,  # Set to True if you don't need to see the browser
        )
    )

    context = BrowserContext(browser=browser)

    try:
        async with context as ctx:
            # Navigate to the URL
            page = await ctx.get_current_page()
            await page.goto(url)
            await page.wait_for_load_state('networkidle')

            # Get viewport dimensions
            viewport_info = await page.evaluate("""() => {
                return {
                    viewport: {
                        width: window.innerWidth,
                        height: window.innerHeight,
                        scrollX: window.scrollX,
                        scrollY: window.scrollY
                    }
                }
            }""")

            print('\nViewport Information:')
            print(f'Width: {viewport_info["viewport"]["width"]}')
            print(f'Height: {viewport_info["viewport"]["height"]}')
            print(f'ScrollX: {viewport_info["viewport"]["scrollX"]}')
            print(f'ScrollY: {viewport_info["viewport"]["scrollY"]}')

            # Enhanced debug information for cookie consent and fixed position elements.
            # The script runs in the page and returns two lists of plain dicts
            # (one per matched element) built by getElementInfo.
            debug_info = await page.evaluate("""() => {
                function getElementInfo(element) {
                    const rect = element.getBoundingClientRect();
                    const style = window.getComputedStyle(element);
                    return {
                        tag: element.tagName.toLowerCase(),
                        id: element.id,
                        className: element.className,
                        position: style.position,
                        rect: {
                            top: rect.top,
                            right: rect.right,
                            bottom: rect.bottom,
                            left: rect.left,
                            width: rect.width,
                            height: rect.height
                        },
                        isFixed: style.position === 'fixed',
                        isSticky: style.position === 'sticky',
                        zIndex: style.zIndex,
                        visibility: style.visibility,
                        display: style.display,
                        opacity: style.opacity
                    };
                }

                // Find cookie-related elements
                const cookieElements = Array.from(document.querySelectorAll('[id*="cookie"], [id*="consent"], [class*="cookie"], [class*="consent"]'));
                const fixedElements = Array.from(document.querySelectorAll('*')).filter(el => {
                    const style = window.getComputedStyle(el);
                    return style.position === 'fixed' || style.position === 'sticky';
                });

                return {
                    cookieElements: cookieElements.map(getElementInfo),
                    fixedElements: fixedElements.map(getElementInfo)
                };
            }""")

            print('\nCookie-related Elements:')
            for elem in debug_info['cookieElements']:
                print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}')
                print(f'Position: {elem["position"]}')
                print(f'Rect: {elem["rect"]}')
                print(f'Z-Index: {elem["zIndex"]}')
                print(f'Visibility: {elem["visibility"]}')
                print(f'Display: {elem["display"]}')
                print(f'Opacity: {elem["opacity"]}')

            print('\nFixed/Sticky Position Elements:')
            for elem in debug_info['fixedElements']:
                print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}')
                print(f'Position: {elem["position"]}')
                print(f'Rect: {elem["rect"]}')
                print(f'Z-Index: {elem["zIndex"]}')

            print(f'\nPage Structure for {url}:\n')
            # NOTE(review): get_page_structure is not defined in this file;
            # confirm it exists on the browser context object.
            structure = await ctx.get_page_structure()
            print(structure)

            # Keep the window open until the user has inspected the page
            input('Press Enter to close the browser...')
    finally:
        await browser.close()
110
+
111
+
112
if __name__ == '__main__':
    # You can modify this list to analyze different pages.
    # Each URL is analyzed in its own event loop and browser instance;
    # the list holds each page once so no page is analyzed twice.
    urls = [
        'https://www.mlb.com/yankees/stats/',
        'https://immobilienscout24.de',
        'https://www.zeiss.com/career/en/job-search.html?page=1',
        'https://reddit.com',
    ]
    for url in urls:
        asyncio.run(analyze_page_structure(url))
browser_use/dom/tests/extraction_test.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import time
3
+
4
+ from browser_use.browser.browser import Browser, BrowserConfig
5
+ from browser_use.browser.context import BrowserContext, BrowserContextConfig
6
+ from browser_use.dom.service import DomService
7
+ from browser_use.utils import time_execution_sync
8
+
9
+
10
async def test_process_html_file():
    """Interactively compare clickable-element extraction across viewport expansions.

    For each website, runs `get_clickable_elements` with a series of
    `viewport_expansion` values, prints the element and token counts for each,
    and then prints a summary relative to the first expansion. The run is
    manual: it waits for Enter on stdin between steps.
    """
    # Local import: the module only imports the sync timing decorator, but the
    # wrapped call is awaited, so the async variant is needed to time the
    # actual work and not just the creation of the coroutine.
    from browser_use.utils import time_execution_async

    config = BrowserContextConfig(
        cookies_file='cookies3.json',
        disable_security=True,
        wait_for_network_idle_page_load_time=2,
    )

    browser = Browser(
        config=BrowserConfig(
            # chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        )
    )
    context = BrowserContext(browser=browser, config=config)  # noqa: F821

    websites = [
        'https://kayak.com/flights',
        'https://immobilienscout24.de',
        'https://google.com',
        'https://amazon.com',
        'https://github.com',
    ]

    expansions = [0, 100, 200, 300, 400, 500, 600, 1000, -1, -200]

    try:
        async with context as context:
            page = await context.get_current_page()
            dom_service = DomService(page)

            async def test_viewport(expansion: int, description: str):
                """Extract elements for one expansion and return (element_count, token_count)."""
                print(f'\n{description}:')
                dom_state = await time_execution_async(f'get_clickable_elements ({description})')(
                    dom_service.get_clickable_elements
                )(highlight_elements=True, viewport_expansion=expansion)

                elements = dom_state.element_tree
                selector_map = dom_state.selector_map
                element_count = len(selector_map)
                # NOTE(review): count_string_tokens is neither defined nor imported
                # in this module, so this line raises NameError as written -
                # confirm where the helper is supposed to come from.
                token_count = count_string_tokens(elements.clickable_elements_to_string(), model='gpt-4o')

                print(f'Number of elements: {element_count}')
                print(f'Token count: {token_count}')
                return element_count, token_count

            for website in websites:
                print(f'\n{"=" * 50}\nTesting {website}\n{"=" * 50}')
                await page.goto(website)
                # Additional wait for dynamic content; asyncio.sleep keeps the
                # event loop (and therefore the browser connection) responsive
                await asyncio.sleep(2)

                results = []

                for i, expansion in enumerate(expansions):
                    description = (
                        f'{i + 1}. Expansion {expansion}px' if expansion >= 0 else f'{i + 1}. All elements ({expansion} expansion)'
                    )
                    count, tokens = await test_viewport(expansion, description)
                    results.append((count, tokens))
                    input('Press Enter to continue...')
                    await page.evaluate('document.getElementById("playwright-highlight-container")?.remove()')

                # Print comparison summary (deltas are relative to the first expansion)
                print('\nComparison Summary:')
                initial_count = results[0][0]
                for expansion, (count, tokens) in zip(expansions, results):
                    description = f'Expansion {expansion}px' if expansion >= 0 else 'All elements (-1)'
                    print(f'{description}: {count} elements (+{count - initial_count}), {tokens} tokens')

                input('\nPress Enter to continue to next website...')

                # Clear highlights before next website
                await page.evaluate('document.getElementById("playwright-highlight-container")?.remove()')
    finally:
        # Release the browser process even if a step above fails
        await browser.close()
80
+
81
+
82
async def test_focus_vs_all_elements():
    """Interactively highlight clickable elements on a list of websites.

    For each website the elements are extracted and highlighted repeatedly;
    pressing Enter clears the highlights and repeats the extraction on the
    same page, entering `q` moves on to the next website. If an extraction
    fails, the error is printed and the run moves on to the next website.
    """
    # Local import: the module only imports the sync timing decorator, but the
    # wrapped call is awaited, so the async variant is needed to time the
    # actual work and not just the creation of the coroutine.
    from browser_use.utils import time_execution_async

    config = BrowserContextConfig(
        cookies_file='cookies3.json',
        disable_security=True,
        wait_for_network_idle_page_load_time=2,
    )

    browser = Browser(
        config=BrowserConfig(
            # chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        )
    )
    context = BrowserContext(browser=browser, config=config)  # noqa: F821

    websites = [
        'https://immobilienscout24.de',
        'https://www.zeiss.com/career/en/job-search.html?page=1',
        'https://www.mlb.com/yankees/stats/',
        'https://www.amazon.com/s?k=laptop&s=review-rank&crid=1RZCEJ289EUSI&qid=1740202453&sprefix=laptop%2Caps%2C166&ref=sr_st_review-rank&ds=v1%3A4EnYKXVQA7DIE41qCvRZoNB4qN92Jlztd3BPsTFXmxU',
        'https://codepen.io/geheimschriftstift/pen/mPLvQz',
        'https://reddit.com',
        'https://www.google.com/search?q=google+hi&oq=google+hi&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIGCAEQRRhA0gEIMjI2NmowajSoAgCwAgE&sourceid=chrome&ie=UTF-8',
        'https://kayak.com/flights',
        'https://google.com',
        'https://amazon.com',
        'https://github.com',
    ]

    try:
        async with context as context:
            page = await context.get_current_page()
            dom_service = DomService(page)

            for website in websites:
                await page.goto(website)
                # Give dynamic content time to load without blocking the event loop
                await asyncio.sleep(2)

                while True:
                    try:
                        print(f'\n{"=" * 50}\nTesting {website}\n{"=" * 50}')

                        # First get all elements
                        print('\nGetting all elements:')
                        all_elements_state = await time_execution_async('get_all_elements')(
                            dom_service.get_clickable_elements
                        )(highlight_elements=True, viewport_expansion=100)

                        selector_map = all_elements_state.selector_map
                        total_elements = len(selector_map)
                        print(f'Total number of elements: {total_elements}')

                        answer = input('Press Enter to clear highlights and continue...')
                        if answer == 'q':
                            break

                        await page.evaluate('document.getElementById("playwright-highlight-container")?.remove()')

                    except Exception as e:
                        # Report and move on: retrying the same page in a tight
                        # loop would spin forever on a persistent failure
                        print(f'Error: {e}')
                        break
    finally:
        # Release the browser process even if a step above fails
        await browser.close()
143
+
144
+
145
if __name__ == '__main__':
    # Manual entry point: runs both interactive checks one after the other,
    # each in its own event loop.
    asyncio.run(test_focus_vs_all_elements())
    asyncio.run(test_process_html_file())
browser_use/dom/tests/process_dom_test.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import time
4
+
5
+ from browser_use.browser.browser import Browser, BrowserConfig
6
+
7
+
8
async def test_process_dom():
    """Evaluate buildDomTree.js on a live page, time it, and dump the result.

    Opens a visible browser, loads the page, runs the DOM-building script via
    `page.evaluate`, prints the evaluation time, and writes the returned tree
    to `./tmp/dom.json`. Waits for Enter on stdin before finishing.

    The script path is relative, so this must be run from the repository root.
    """
    # Local import: this module does not import asyncio at the top level
    import asyncio

    browser = Browser(config=BrowserConfig(headless=False))

    try:
        async with await browser.new_context() as context:
            page = await context.get_current_page()
            await page.goto('https://kayak.com/flights')
            # await page.goto('https://google.com/flights')
            # await page.goto('https://immobilienscout24.de')
            # await page.goto('https://seleniumbase.io/w3schools/iframes')

            # Let the page settle without blocking the event loop
            await asyncio.sleep(3)

            with open('browser_use/dom/buildDomTree.js', 'r') as f:
                js_code = f.read()

            start = time.time()
            dom_tree = await page.evaluate(js_code)
            end = time.time()

            # print(dom_tree)
            print(f'Time: {end - start:.2f}s')

            os.makedirs('./tmp', exist_ok=True)
            with open('./tmp/dom.json', 'w') as f:
                json.dump(dom_tree, f, indent=1)

            # both of these work for immobilienscout24.de
            # await page.click('.sc-dcJsrY.ezjNCe')
            # await page.click(
            # 	'div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div > div > button:nth-of-type(2)'
            # )

            input('Press Enter to continue...')
    finally:
        # Release the browser process even if a step above fails
        await browser.close()
browser_use/dom/views.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from functools import cached_property
3
+ from typing import TYPE_CHECKING, Dict, List, Optional
4
+
5
+ from browser_use.dom.history_tree_processor.view import CoordinateSet, HashedDomElement, ViewportInfo
6
+ from browser_use.utils import time_execution_sync
7
+
8
+ # Avoid circular import issues
9
+ if TYPE_CHECKING:
10
+ from .views import DOMElementNode
11
+
12
+
13
@dataclass(frozen=False)
class DOMBaseNode:
    """Common base for DOM tree nodes (text nodes and element nodes)."""

    # Visibility flag for the node
    is_visible: bool
    # Use None as default and set parent later to avoid circular reference issues
    parent: Optional['DOMElementNode']
18
+
19
+
20
@dataclass(frozen=False)
class DOMTextNode(DOMBaseNode):
    """A text node in the DOM tree; its context comes from its ancestor elements."""

    text: str
    type: str = 'TEXT_NODE'

    def has_parent_with_highlight_index(self) -> bool:
        """Return True if any ancestor element carries a highlight index."""
        ancestor = self.parent
        # Walk up until an ancestor with a highlight index is found or the root is passed
        while ancestor is not None and ancestor.highlight_index is None:
            ancestor = ancestor.parent
        return ancestor is not None

    def is_parent_in_viewport(self) -> bool:
        """Return the parent's `is_in_viewport` flag, or False without a parent."""
        return False if self.parent is None else self.parent.is_in_viewport

    def is_parent_top_element(self) -> bool:
        """Return the parent's `is_top_element` flag, or False without a parent."""
        return False if self.parent is None else self.parent.is_top_element
44
+
45
+
46
@dataclass(frozen=False)
class DOMElementNode(DOMBaseNode):
    """
    An element node in the DOM tree.

    xpath: the xpath of the element from the last root node (shadow root or iframe OR document if no shadow root or iframe).
    To properly reference the element we need to recursively switch the root node until we find the element (work your way up the tree with `.parent`)
    """

    tag_name: str
    xpath: str
    attributes: Dict[str, str]
    children: List[DOMBaseNode]
    is_interactive: bool = False
    is_top_element: bool = False
    is_in_viewport: bool = False
    shadow_root: bool = False
    highlight_index: Optional[int] = None
    viewport_coordinates: Optional[CoordinateSet] = None
    page_coordinates: Optional[CoordinateSet] = None
    viewport_info: Optional[ViewportInfo] = None

    def __repr__(self) -> str:
        """Return an opening-tag style summary: attributes plus bracketed state flags."""
        tag_str = f'<{self.tag_name}'

        # Add attributes
        for key, value in self.attributes.items():
            tag_str += f' {key}="{value}"'
        tag_str += '>'

        # Add extra info
        extras = []
        if self.is_interactive:
            extras.append('interactive')
        if self.is_top_element:
            extras.append('top')
        if self.shadow_root:
            extras.append('shadow-root')
        if self.highlight_index is not None:
            extras.append(f'highlight:{self.highlight_index}')
        if self.is_in_viewport:
            extras.append('in-viewport')

        if extras:
            tag_str += f' [{", ".join(extras)}]'

        return tag_str

    @cached_property
    def hash(self) -> HashedDomElement:
        """Hash of this element as computed by HistoryTreeProcessor (cached after first use)."""
        # Imported here rather than at module level (the file notes circular import issues)
        from browser_use.dom.history_tree_processor.service import (
            HistoryTreeProcessor,
        )

        return HistoryTreeProcessor._hash_dom_element(self)

    def get_all_text_till_next_clickable_element(self, max_depth: int = -1) -> str:
        """Collect the text below this element, stopping at highlighted descendants.

        Descends through the subtree and gathers the text of every text node,
        but does not enter any descendant element that has its own highlight
        index. `max_depth` limits how deep the walk goes (-1 means unlimited).
        The pieces are joined with newlines and the result is stripped.
        """
        text_parts = []

        def collect_text(node: DOMBaseNode, current_depth: int) -> None:
            if max_depth != -1 and current_depth > max_depth:
                return

            # Skip this branch if we hit a highlighted element (except for the current node).
            # Identity is what is meant here: `!=` on a dataclass compares all fields,
            # which walks into parent/children and could match a distinct but equal node.
            if isinstance(node, DOMElementNode) and node is not self and node.highlight_index is not None:
                return

            if isinstance(node, DOMTextNode):
                text_parts.append(node.text)
            elif isinstance(node, DOMElementNode):
                for child in node.children:
                    collect_text(child, current_depth + 1)

        collect_text(self, 0)
        return '\n'.join(text_parts).strip()

    @time_execution_sync('--clickable_elements_to_string')
    def clickable_elements_to_string(self, include_attributes: list[str] | None = None) -> str:
        """Render the subtree as one line per highlighted element or free text node.

        Highlighted elements are written as `[index]<tag attrs>text/>`, where
        `attrs` are the values of the attributes named in `include_attributes`
        (de-duplicated, in attribute order, joined with `;`). Visible text
        nodes without a highlighted ancestor are written as plain lines.
        """
        formatted_text = []

        def process_node(node: DOMBaseNode) -> None:
            if isinstance(node, DOMElementNode):
                # Add element with highlight_index
                if node.highlight_index is not None:
                    attributes_str = ''
                    text = node.get_all_text_till_next_clickable_element()
                    if include_attributes:
                        # dict.fromkeys drops duplicate values but keeps first-seen
                        # order, so the output is stable from run to run
                        attributes = list(
                            dict.fromkeys(
                                str(value)
                                for key, value in node.attributes.items()
                                if key in include_attributes and value != node.tag_name
                            )
                        )
                        # An attribute that merely repeats the text adds nothing
                        if text in attributes:
                            attributes.remove(text)
                        attributes_str = ';'.join(attributes)
                    line = f'[{node.highlight_index}]<{node.tag_name} '
                    if attributes_str:
                        line += f'{attributes_str}'
                    if text:
                        if attributes_str:
                            line += f'>{text}'
                        else:
                            line += f'{text}'
                    line += '/>'
                    formatted_text.append(line)

                # Process children regardless
                for child in node.children:
                    process_node(child)

            elif isinstance(node, DOMTextNode):
                # Add text only if it doesn't have a highlighted parent
                if not node.has_parent_with_highlight_index() and node.is_visible:  # and node.is_parent_top_element()
                    formatted_text.append(f'{node.text}')

        process_node(self)
        return '\n'.join(formatted_text)

    def get_file_upload_element(self, check_siblings: bool = True) -> Optional['DOMElementNode']:
        """Find an `<input type="file">` at, below, or next to this element.

        Checks this element, then its descendants (depth first), and - only
        when `check_siblings` is true - the subtrees of its siblings.
        Returns the first match or None.
        """
        # Check if current element is a file input
        if self.tag_name == 'input' and self.attributes.get('type') == 'file':
            return self

        # Check children
        for child in self.children:
            if isinstance(child, DOMElementNode):
                result = child.get_file_upload_element(check_siblings=False)
                if result:
                    return result

        # Check siblings only for the initial call
        if check_siblings and self.parent:
            for sibling in self.parent.children:
                if sibling is not self and isinstance(sibling, DOMElementNode):
                    result = sibling.get_file_upload_element(check_siblings=False)
                    if result:
                        return result

        return None
188
+
189
+
190
+ SelectorMap = dict[int, DOMElementNode]
191
+
192
+
193
@dataclass
class DOMState:
    """Pairs a DOM tree with its selector map."""

    # Root element of the tree
    element_tree: DOMElementNode
    # Mapping from int keys to element nodes (see the SelectorMap alias)
    selector_map: SelectorMap
browser_use/logging_config.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import sys
4
+
5
+ from dotenv import load_dotenv
6
+
7
+ load_dotenv()
8
+
9
+
10
def addLoggingLevel(levelName, levelNum, methodName=None):
    """
    Register a custom logging level on the `logging` module and logger class.

    After the call, `levelName` is an attribute of the `logging` module whose
    value is `levelNum`, and `methodName` is available as a convenience method
    both on `logging` itself and on the class returned by
    `logging.getLoggerClass()` (usually just `logging.Logger`). When
    `methodName` is omitted, `levelName.lower()` is used.

    To avoid clobbering existing attributes, an `AttributeError` is raised if
    the level name is already an attribute of the `logging` module or if the
    method name is already present on the module or the logger class.

    Example
    -------
    >>> addLoggingLevel('TRACE', logging.DEBUG - 5)
    >>> logging.getLogger(__name__).setLevel('TRACE')
    >>> logging.getLogger(__name__).trace('that worked')
    >>> logging.trace('so did this')
    >>> logging.TRACE
    5

    """
    method = methodName or levelName.lower()
    logger_class = logging.getLoggerClass()

    # Refuse to overwrite anything that already exists (checked in this order)
    collisions = (
        (logging, levelName, 'logging module'),
        (logging, method, 'logging module'),
        (logger_class, method, 'logger class'),
    )
    for owner, attribute, where in collisions:
        if hasattr(owner, attribute):
            raise AttributeError(f'{attribute} already defined in {where}')

    # This method was inspired by the answers to Stack Overflow post
    # http://stackoverflow.com/q/2183233/2988730, especially
    # http://stackoverflow.com/a/13638084/2988730
    def logForLevel(self, message, *args, **kwargs):
        if not self.isEnabledFor(levelNum):
            return
        self._log(levelNum, message, args, **kwargs)

    def logToRoot(message, *args, **kwargs):
        logging.log(levelNum, message, *args, **kwargs)

    logging.addLevelName(levelNum, levelName)
    setattr(logging, levelName, levelNum)
    setattr(logger_class, method, logForLevel)
    setattr(logging, method, logToRoot)
59
+
60
+
61
def setup_logging():
    """Configure console logging for browser_use and quiet noisy third-party loggers.

    Registers the custom RESULT level (35), then - unless the root logger
    already has handlers, in which case nothing else is changed - attaches a
    stdout handler to both the root logger and the `browser_use` logger.
    The level is taken from the `BROWSER_USE_LOGGING_LEVEL` environment
    variable: `result`, `debug`, or anything else for INFO (the default).
    """
    # Try to add RESULT level, but ignore if it already exists
    try:
        addLoggingLevel('RESULT', 35)  # This allows ERROR, FATAL and CRITICAL
    except AttributeError:
        pass  # Level already exists, which is fine

    log_type = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower()

    # Check if handlers are already set up; if so, leave the configuration alone
    root = logging.getLogger()
    if root.hasHandlers():
        return

    class BrowserUseFormatter(logging.Formatter):
        """Shortens `browser_use.*` logger names to their second-to-last component."""

        def format(self, record):
            if isinstance(record.name, str) and record.name.startswith('browser_use.'):
                record.name = record.name.split('.')[-2]
            return super().format(record)

    # Setup single handler for all loggers
    console = logging.StreamHandler(sys.stdout)

    # Additional setLevel here to filter logs
    if log_type == 'result':
        console.setLevel('RESULT')
        console.setFormatter(BrowserUseFormatter('%(message)s'))
    else:
        console.setFormatter(BrowserUseFormatter('%(levelname)-8s [%(name)s] %(message)s'))

    # Configure root logger only
    root.addHandler(console)

    if log_type == 'result':
        root.setLevel('RESULT')  # string usage: logging.RESULT only exists at runtime
    elif log_type == 'debug':
        root.setLevel(logging.DEBUG)
    else:
        root.setLevel(logging.INFO)

    # Configure browser_use logger
    browser_use_logger = logging.getLogger('browser_use')
    browser_use_logger.propagate = False  # Don't propagate to root logger (avoids duplicate lines)
    browser_use_logger.addHandler(console)
    browser_use_logger.setLevel(root.level)  # Set same level as root logger

    browser_use_logger.info('BrowserUse logging setup complete with level %s', log_type)

    # Silence third-party loggers
    for logger_name in (
        'WDM',
        'httpx',
        'selenium',
        'playwright',
        'urllib3',
        'asyncio',
        'langchain',
        'openai',
        'httpcore',
        'charset_normalizer',
        'anthropic._base_client',
        'PIL.PngImagePlugin',
        'trafilatura.htmlprocessing',
        'trafilatura',
    ):
        third_party = logging.getLogger(logger_name)
        third_party.setLevel(logging.ERROR)
        third_party.propagate = False
browser_use/telemetry/service.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ import uuid
4
+ from pathlib import Path
5
+
6
+ from dotenv import load_dotenv
7
+ from posthog import Posthog
8
+
9
+ from browser_use.telemetry.views import BaseTelemetryEvent
10
+ from browser_use.utils import singleton
11
+
12
+ load_dotenv()
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ POSTHOG_EVENT_SETTINGS = {
19
+ 'process_person_profile': True,
20
+ }
21
+
22
+
23
@singleton
class ProductTelemetry:
    """
    Service for capturing anonymized telemetry data.

    If the environment variable `ANONYMIZED_TELEMETRY=False`, anonymized telemetry will be disabled.
    """

    USER_ID_PATH = str(Path.home() / '.cache' / 'browser_use' / 'telemetry_user_id')
    PROJECT_API_KEY = 'phc_F8JMNjW1i2KbGUTaW1unnDdLSPCoyc52SGRU0JecaUh'
    HOST = 'https://eu.i.posthog.com'
    # Fallback id used when the persisted id cannot be read or created
    UNKNOWN_USER_ID = 'UNKNOWN'

    _curr_user_id = None

    def __init__(self) -> None:
        """Create the PostHog client unless telemetry is disabled via the environment."""
        telemetry_disabled = os.getenv('ANONYMIZED_TELEMETRY', 'true').lower() == 'false'
        self.debug_logging = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower() == 'debug'

        if telemetry_disabled:
            self._posthog_client = None
        else:
            # Use the module logger so the message follows the package's logging setup
            logger.info(
                'Anonymized telemetry enabled. See https://docs.browser-use.com/development/telemetry for more information.'
            )
            self._posthog_client = Posthog(
                project_api_key=self.PROJECT_API_KEY,
                host=self.HOST,
                disable_geoip=False,
            )

            # Silence posthog's logging
            if not self.debug_logging:
                posthog_logger = logging.getLogger('posthog')
                posthog_logger.disabled = True

        if self._posthog_client is None:
            logger.debug('Telemetry disabled')

    def capture(self, event: BaseTelemetryEvent) -> None:
        """Send `event` if telemetry is enabled; log it first when debug logging is on."""
        if self._posthog_client is None:
            return

        if self.debug_logging:
            logger.debug(f'Telemetry event: {event.name} {event.properties}')
        self._direct_capture(event)

    def _direct_capture(self, event: BaseTelemetryEvent) -> None:
        """
        Should not be thread blocking because posthog magically handles it
        """
        if self._posthog_client is None:
            return

        # Telemetry is best effort: a failed send is logged, never raised
        try:
            self._posthog_client.capture(
                self.user_id,
                event.name,
                {**event.properties, **POSTHOG_EVENT_SETTINGS},
            )
        except Exception as e:
            logger.error(f'Failed to send telemetry event {event.name}: {e}')

    @property
    def user_id(self) -> str:
        """Return the persisted anonymous user id, creating it on first use.

        The id is stored at `USER_ID_PATH`. If the file cannot be read or
        written, or is empty, `UNKNOWN_USER_ID` is returned instead.
        """
        if self._curr_user_id:
            return self._curr_user_id

        # File access may fail due to permissions or other reasons. We don't want to
        # crash so we catch all exceptions.
        try:
            if not os.path.exists(self.USER_ID_PATH):
                os.makedirs(os.path.dirname(self.USER_ID_PATH), exist_ok=True)
                with open(self.USER_ID_PATH, 'w') as f:
                    new_user_id = str(uuid.uuid4())
                    f.write(new_user_id)
                self._curr_user_id = new_user_id
            else:
                with open(self.USER_ID_PATH, 'r') as f:
                    # An empty or whitespace-only file must not become an empty id
                    self._curr_user_id = f.read().strip() or self.UNKNOWN_USER_ID
        except Exception:
            self._curr_user_id = self.UNKNOWN_USER_ID
        return self._curr_user_id
browser_use/telemetry/views.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import asdict, dataclass
3
+ from typing import Any, Dict, Sequence
4
+
5
+
6
@dataclass
class BaseTelemetryEvent(ABC):
    """Abstract base for telemetry events: a `name` plus a dict of properties."""

    @property
    @abstractmethod
    def name(self) -> str:
        """Event name; concrete events must provide it."""
        pass

    @property
    def properties(self) -> Dict[str, Any]:
        """All dataclass fields of the event as a dict, without the `name` entry."""
        payload = asdict(self)
        payload.pop('name', None)
        return payload
16
+
17
+
18
@dataclass
class RegisteredFunction:
    """Name and parameter description of one registered function."""

    name: str
    # presumably a parameter schema for the function - TODO confirm against the producer
    params: dict[str, Any]
22
+
23
+
24
@dataclass
class ControllerRegisteredFunctionsTelemetryEvent(BaseTelemetryEvent):
    """Telemetry event named 'controller_registered_functions'."""

    registered_functions: list[RegisteredFunction]
    # Concrete value for the abstract `name` property of the base class
    name: str = 'controller_registered_functions'
28
+
29
+
30
@dataclass
class AgentStepTelemetryEvent(BaseTelemetryEvent):
    """Telemetry event named 'agent_step'."""

    agent_id: str
    step: int
    step_error: list[str]
    consecutive_failures: int
    actions: list[dict]
    # Concrete value for the abstract `name` property of the base class
    name: str = 'agent_step'
38
+
39
+
40
@dataclass
class AgentRunTelemetryEvent(BaseTelemetryEvent):
    """Telemetry event named 'agent_run'."""

    agent_id: str
    use_vision: bool
    task: str
    model_name: str
    chat_model_library: str
    version: str
    source: str
    # Concrete value for the abstract `name` property of the base class
    name: str = 'agent_run'
50
+
51
+
52
@dataclass
class AgentEndTelemetryEvent(BaseTelemetryEvent):
    """Telemetry event named 'agent_end'."""

    agent_id: str
    steps: int
    max_steps_reached: bool
    is_done: bool
    # May be None as well as True/False
    success: bool | None
    total_input_tokens: int
    total_duration_seconds: float

    # Entries may be None
    errors: Sequence[str | None]
    # Concrete value for the abstract `name` property of the base class
    name: str = 'agent_end'
browser_use/utils.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import time
3
+ from functools import wraps
4
+ from typing import Any, Callable, Coroutine, ParamSpec, TypeVar
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+
9
+ # Define generic type variables for return type and parameters
10
+ R = TypeVar('R')
11
+ P = ParamSpec('P')
12
+
13
+
14
def time_execution_sync(additional_text: str = '') -> Callable[[Callable[P, R]], Callable[P, R]]:
    """Return a decorator that logs how long a synchronous function takes.

    The duration is logged at DEBUG level, prefixed with `additional_text`.
    The wrapped function's return value and exceptions pass through unchanged;
    nothing is logged when the function raises.
    """

    def decorator(func: Callable[P, R]) -> Callable[P, R]:
        @wraps(func)
        def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
            # perf_counter is monotonic, so the measurement is immune to
            # system clock adjustments (unlike time.time)
            start_time = time.perf_counter()
            result = func(*args, **kwargs)
            execution_time = time.perf_counter() - start_time
            logger.debug(f'{additional_text} Execution time: {execution_time:.2f} seconds')
            return result

        return wrapper

    return decorator
27
+
28
+
29
def time_execution_async(
    additional_text: str = '',
) -> Callable[[Callable[P, Coroutine[Any, Any, R]]], Callable[P, Coroutine[Any, Any, R]]]:
    """Return a decorator that logs how long an awaited coroutine function takes.

    The duration (including time spent suspended while awaiting) is logged at
    DEBUG level, prefixed with `additional_text`. The wrapped coroutine's
    result and exceptions pass through unchanged; nothing is logged when it
    raises.
    """

    def decorator(func: Callable[P, Coroutine[Any, Any, R]]) -> Callable[P, Coroutine[Any, Any, R]]:
        @wraps(func)
        async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
            # perf_counter is monotonic, so the measurement is immune to
            # system clock adjustments (unlike time.time)
            start_time = time.perf_counter()
            result = await func(*args, **kwargs)
            execution_time = time.perf_counter() - start_time
            logger.debug(f'{additional_text} Execution time: {execution_time:.2f} seconds')
            return result

        return wrapper

    return decorator
44
+
45
+
46
def singleton(cls):
    """Class decorator: build `cls` once and hand back that same object afterwards.

    The first call constructs the instance with the arguments given; every
    later call ignores its arguments and returns the stored instance. Note
    that the decorated name refers to the factory function, not the class.
    """
    cached = None

    def wrapper(*args, **kwargs):
        nonlocal cached
        if cached is None:
            cached = cls(*args, **kwargs)
        return cached

    return wrapper
codebeaver.yml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ environment:
2
+ - OPENAI_API_KEY=empty
3
+ - AZURE_OPENAI_API_KEY=empty
4
+ from: pytest
conftest.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ from browser_use.logging_config import setup_logging
5
+
6
# Get the absolute path to the project root (the directory containing this file)
project_root = os.path.dirname(os.path.abspath(__file__))
# Put it first on sys.path so it takes precedence during module lookup
sys.path.insert(0, project_root)

# Configure logging once, when this module is imported
setup_logging()
docs/README.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Docs
2
+
3
+ The official documentation for Browser Use. The docs are published to [Browser Use Docs](https://docs.browser-use.com).
4
+
5
+ ### Development
6
+
7
+ Install the [Mintlify CLI](https://www.npmjs.com/package/mintlify) to preview the documentation changes locally. To install, use the following command:
8
+
9
+ ```
10
+ npm i -g mintlify
11
+ ```
12
+
13
+ Run the following command at the root of your documentation (where mint.json is):
14
+
15
+ ```
16
+ mintlify dev
17
+ ```