diff --git a/.env.example b/.env.example new file mode 100644 index 0000000000000000000000000000000000000000..85438cdadb0226b91e66d9f12cb3122b1656bf05 --- /dev/null +++ b/.env.example @@ -0,0 +1,8 @@ +OPENAI_API_KEY= +ANTHROPIC_API_KEY= + +# Set to false to disable anonymized telemetry +ANONYMIZED_TELEMETRY=true + +# LogLevel: Set to debug to enable verbose logging, set to result to get results only. Available: result | debug | info +BROWSER_USE_LOGGING_LEVEL=info diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..da99c1975fbd3321669c447d6823a4d5a1ae8554 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,35 +1,4 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs 
-text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text +static/*.gif filter=lfs diff=lfs merge=lfs -text +# static/*.mp4 filter=lfs diff=lfs merge=lfs -text +docs/images/checks-passed.png filter=lfs diff=lfs merge=lfs -text +docs/images/laminar.png filter=lfs diff=lfs merge=lfs -text diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000000000000000000000000000000000000..08a567b04989b9988f40d382d09dcd586dde1770 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,84 @@ +name: πŸ› Bug Report +description: Report a bug in browser-use +labels: ["bug", "triage"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! Please fill out the form below to help us reproduce and fix the issue. + + - type: textarea + id: description + attributes: + label: Bug Description + description: A clear and concise description of what the bug is. + placeholder: When I try to... the library... + validations: + required: true + + - type: textarea + id: reproduction + attributes: + label: Reproduction Steps + description: Steps to reproduce the behavior + placeholder: | + 1. Install browser-use... + 2. Run the following task... + 3. See error... + validations: + required: true + + - type: textarea + id: code + attributes: + label: Code Sample + description: Include a minimal code sample that reproduces the issue + render: python + validations: + required: true + + - type: input + id: version + attributes: + label: Version + description: What version of browser-use are you using? 
(Run `uv pip show browser-use` to find out) + placeholder: "e.g., pip 0.1.26, or git main branch" + validations: + required: true + + - type: dropdown + id: model + attributes: + label: LLM Model + description: Which LLM model(s) are you using? + multiple: true + options: + - GPT-4o + - GPT-4 + - Claude 3.5 Sonnet + - Claude 3.5 Opus + - Claude 3.5 Haiku + - Gemini 1.5 Pro + - Gemini 1.5 Ultra + - Fireworks Mixtral + - DeepSeek Coder + - Local Model (Specify model in description) + - Other (specify in description) + validations: + required: true + + - type: input + id: os + attributes: + label: Operating System + description: What operating system are you using? + placeholder: "e.g., macOS 13.1, Windows 11, Ubuntu 22.04" + validations: + required: true + + - type: textarea + id: logs + attributes: + label: Relevant Log Output + description: Please copy and paste any relevant log output. This will be automatically formatted into code. + render: shell \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..a8607c3ab0b7014e22eb1b2b82d403056e454584 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,11 @@ +blank_issues_enabled: false # Set to true if you want to allow blank issues +contact_links: + - name: πŸ€” Quickstart Guide + url: https://docs.browser-use.com/quickstart + about: Most common issues can be resolved by following our quickstart guide + - name: πŸ€” Questions and Help + url: https://link.browser-use.com/discord + about: Please ask questions in our Discord community + - name: πŸ“– Documentation + url: https://docs.browser-use.com + about: Check our documentation for answers first \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/docs_issue.yml b/.github/ISSUE_TEMPLATE/docs_issue.yml new file mode 100644 index 0000000000000000000000000000000000000000..b0504a44979a6272827fa9d32f95f5c2342d6c98 --- /dev/null 
+++ b/.github/ISSUE_TEMPLATE/docs_issue.yml @@ -0,0 +1,55 @@ +name: πŸ“š Documentation Issue +description: Report an issue in the browser-use documentation +labels: ["documentation"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to improve our documentation! Please fill out the form below to help us understand the issue. + + - type: dropdown + id: type + attributes: + label: Type of Documentation Issue + description: What type of documentation issue is this? + options: + - Missing documentation + - Incorrect documentation + - Unclear documentation + - Broken link + - Other (specify in description) + validations: + required: true + + - type: input + id: page + attributes: + label: Documentation Page + description: Which page or section of the documentation is this about? + placeholder: "e.g., https://docs.browser-use.com/getting-started or Installation Guide" + validations: + required: true + + - type: textarea + id: description + attributes: + label: Issue Description + description: Describe what's wrong or missing in the documentation + placeholder: The documentation should... + validations: + required: true + + - type: textarea + id: suggestion + attributes: + label: Suggested Changes + description: If you have specific suggestions for how to improve the documentation, please share them + placeholder: | + The documentation could be improved by... 
+ + Example: + ```python + # Your suggested code example or text here + ``` + validations: + required: true \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000000000000000000000000000000000000..4b5d90f93342c1dba3f24e571a886c7eff16b4f0 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,43 @@ +name: πŸ’‘ Feature Request +description: Suggest a new feature for browser-use +labels: ["enhancement"] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to suggest a new feature! Please fill out the form below to help us understand your suggestion. + + - type: textarea + id: problem + attributes: + label: Problem Description + description: Is your feature request related to a problem? Please describe. + placeholder: I'm always frustrated when... + validations: + required: true + + - type: textarea + id: solution + attributes: + label: Proposed Solution + description: Describe the solution you'd like to see + placeholder: It would be great if... + validations: + required: true + + - type: textarea + id: alternatives + attributes: + label: Alternative Solutions + description: Describe any alternative solutions or features you've considered + placeholder: I've also thought about... 
+ + - type: textarea + id: context + attributes: + label: Additional Context + description: Add any other context or examples about the feature request here + placeholder: | + - Example use cases + - Screenshots or mockups + - Related issues or discussions \ No newline at end of file diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000000000000000000000000000000000000..8ee4acc7e98ab9174d40ee9661d45ab3ce136fd6 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,38 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + deploy: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.x" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build hatch + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0affd458965be1c34d27f1958db821b03d1b0a72 --- /dev/null +++ b/.gitignore @@ -0,0 +1,190 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ 
+share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. 
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +test_env/ + + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ +temp +tmp + + +.DS_Store + +private_example.py +private_example + +browser_cookies.json +cookies.json +AgentHistory.json +cv_04_24.pdf +AgentHistoryList.json +*.gif +gcp-login.json +.vscode +.ruff_cache +.idea +*.txt +*.pdf +*.csv +*.json +*.jsonl + +uv.lock \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..34d34cda0f22789b7543dfec0b3d2321a7258ea6 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,18 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.3.0 + hooks: + - id: ruff + args: [ + --line-length=130, + --select=E,F,I, + --fix, + ] + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-toml diff --git a/.python-version b/.python-version new file mode 100644 index 0000000000000000000000000000000000000000..2c0733315e415bfb5e5b353f9996ecd964d395b2 --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.11 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..1ea3836ce58a4cd32c90c0b4f4e736d840d23780 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 Gregor Zunic + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index a3c5a1c84b0539855f3343d3e00484aa09cd6a0c..3a6f60fe1b96bcaf6b12561d14335fcbbc052eb1 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,193 @@ ---- -title: Use -emoji: 🌍 -colorFrom: indigo -colorTo: indigo -sdk: static -pinned: false ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference + + + + Shows a black Browser Use Logo in light color mode and a white one in dark color mode. + + +

Enable AI to control your browser 🤖

+ +[![GitHub stars](https://img.shields.io/github/stars/gregpr07/browser-use?style=social)](https://github.com/gregpr07/browser-use/stargazers) +[![Discord](https://img.shields.io/discord/1303749220842340412?color=7289DA&label=Discord&logo=discord&logoColor=white)](https://link.browser-use.com/discord) +[![Cloud](https://img.shields.io/badge/Cloud-☁️-blue)](https://cloud.browser-use.com) +[![Documentation](https://img.shields.io/badge/Documentation-📕-blue)](https://docs.browser-use.com) +[![Twitter Follow](https://img.shields.io/twitter/follow/Gregor?style=social)](https://x.com/gregpr07) +[![Twitter Follow](https://img.shields.io/twitter/follow/Magnus?style=social)](https://x.com/mamagnus00) +[![Weave Badge](https://img.shields.io/endpoint?url=https%3A%2F%2Fapp.workweave.ai%2Fapi%2Frepository%2Fbadge%2Forg_T5Pvn3UBswTHIsN1dWS3voPg%2F881458615&labelColor=#EC6341)](https://app.workweave.ai/reports/repository/org_T5Pvn3UBswTHIsN1dWS3voPg/881458615) + +🌐 Browser-use is the easiest way to connect your AI agents with the browser. + +💡 See what others are building and share your projects in our [Discord](https://link.browser-use.com/discord)! Want Swag? Check out our [Merch store](https://browsermerch.com). + +🌤️ Skip the setup - try our hosted version for instant browser automation! [Try the cloud ☁︎](https://cloud.browser-use.com). + +# Quick start + +With pip (Python>=3.11): + +```bash +pip install browser-use +``` + +Install Playwright: + +```bash +playwright install +``` + +Spin up your agent: + +```python +from langchain_openai import ChatOpenAI +from browser_use import Agent +import asyncio +from dotenv import load_dotenv +load_dotenv() + +async def main(): + agent = Agent( + task="Compare the price of gpt-4o and DeepSeek-V3", + llm=ChatOpenAI(model="gpt-4o"), + ) + await agent.run() + +asyncio.run(main()) +``` + +Add your API keys for the provider you want to use to your `.env` file. 
+ +```bash +OPENAI_API_KEY= +``` + +For other settings, models, and more, check out the [documentation 📕](https://docs.browser-use.com). + +### Test with UI + +You can test [browser-use with a UI repository](https://github.com/browser-use/web-ui) + +Or simply run the gradio example: + +``` +uv pip install gradio +``` + +```bash +python examples/ui/gradio_demo.py +``` + +# Demos + +

+ +[Task](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/shopping.py): Add grocery items to cart, and checkout. + +[![AI Did My Groceries](https://github.com/user-attachments/assets/d9359085-bde6-41d4-aa4e-6520d0221872)](https://www.youtube.com/watch?v=L2Ya9PYNns8) + +

+ +Prompt: Add my latest LinkedIn follower to my leads in Salesforce. + +![LinkedIn to Salesforce](https://github.com/user-attachments/assets/1440affc-a552-442e-b702-d0d3b277b0ae) + +

+ +[Prompt](https://github.com/browser-use/browser-use/blob/main/examples/use-cases/find_and_apply_to_jobs.py): Read my CV & find ML jobs, save them to a file, and then start applying for them in new tabs, if you need help, ask me. + +https://github.com/user-attachments/assets/171fb4d6-0355-46f2-863e-edb04a828d04 + +

+ +[Prompt](https://github.com/browser-use/browser-use/blob/main/examples/browser/real_browser.py): Write a letter in Google Docs to my Papa, thanking him for everything, and save the document as a PDF. + +![Letter to Papa](https://github.com/user-attachments/assets/242ade3e-15bc-41c2-988f-cbc5415a66aa) + +

+ +[Prompt](https://github.com/browser-use/browser-use/blob/main/examples/custom-functions/save_to_file_hugging_face.py): Look up models with a license of cc-by-sa-4.0 and sort by most likes on Hugging face, save top 5 to file. + +https://github.com/user-attachments/assets/de73ee39-432c-4b97-b4e8-939fd7f323b3 + +

+ +## More examples + +For more examples see the [examples](examples) folder or join the [Discord](https://link.browser-use.com/discord) and show off your project. + +# Vision + +Tell your computer what to do, and it gets it done. + +## Roadmap + +### Agent + +- [ ] Improve agent memory (summarize, compress, RAG, etc.) +- [ ] Enhance planning capabilities (load website specific context) +- [ ] Reduce token consumption (system prompt, DOM state) + +### DOM Extraction + +- [ ] Improve extraction for datepickers, dropdowns, special elements +- [ ] Improve state representation for UI elements + +### Rerunning tasks + +- [ ] LLM as fallback +- [ ] Make it easy to define workflow templates where LLM fills in the details +- [ ] Return playwright script from the agent + +### Datasets + +- [ ] Create datasets for complex tasks +- [ ] Benchmark various models against each other +- [ ] Fine-tuning models for specific tasks + +### User Experience + +- [ ] Human-in-the-loop execution +- [ ] Improve the generated GIF quality +- [ ] Create various demos for tutorial execution, job application, QA testing, social media, etc. + +## Contributing + +We love contributions! Feel free to open issues for bugs or feature requests. To contribute to the docs, check out the `/docs` folder. + +## Local Setup + +To learn more about the library, check out the [local setup 📕](https://docs.browser-use.com/development/local-setup). + +## Cooperations + +We are forming a commission to define best practices for UI/UX design for browser agents. +Together, we're exploring how software redesign improves the performance of AI agents and gives these companies a competitive advantage by designing their existing software to be at the forefront of the agent age. + +Email [Toby](mailto:tbiddle@loop11.com?subject=I%20want%20to%20join%20the%20UI/UX%20commission%20for%20AI%20agents&body=Hi%20Toby%2C%0A%0AI%20found%20you%20in%20the%20browser-use%20GitHub%20README.%0A%0A) to apply for a seat on the committee. 
+ +## Swag + +Want to show off your Browser-use swag? Check out our [Merch store](https://browsermerch.com). Good contributors will receive swag for free 👀. + +## Citation + +If you use Browser Use in your research or project, please cite: + +```bibtex +@software{browser_use2024, + author = {Müller, Magnus and Žunič, Gregor}, + title = {Browser Use: Enable AI to control your browser}, + year = {2024}, + publisher = {GitHub}, + url = {https://github.com/browser-use/browser-use} +} +``` + +
+ +[![Twitter Follow](https://img.shields.io/twitter/follow/Gregor?style=social)](https://x.com/gregpr07) +[![Twitter Follow](https://img.shields.io/twitter/follow/Magnus?style=social)](https://x.com/mamagnus00) + +
+ +
+Made with ❤️ in Zurich and San Francisco
diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000000000000000000000000000000000..e0969a3a05f2fbbb22ad7ff009d25f8da9fb6589 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,20 @@ +## Reporting Security Issues + +If you believe you have found a security vulnerability in browser-use, please report it through coordinated disclosure. + +**Please do not report security vulnerabilities through the repository issues, discussions, or pull requests.** + +Instead, please open a new [Github security advisory](https://github.com/browser-use/browser-use/security/advisories/new). + +Please include as much of the information listed below as you can to help me better understand and resolve the issue: + +* The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting) +* Full paths of source file(s) related to the manifestation of the issue +* The location of the affected source code (tag/branch/commit or direct URL) +* Any special configuration required to reproduce the issue +* Step-by-step instructions to reproduce the issue +* Proof-of-concept or exploit code (if possible) +* Impact of the issue, including how an attacker might exploit the issue + +This information will help me triage your report more quickly. + diff --git a/browser_use/README.md b/browser_use/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ed850d74033b54ae377e8021f3849a1cc273beb4 --- /dev/null +++ b/browser_use/README.md @@ -0,0 +1,51 @@ +# Codebase Structure + +> The code structure inspired by https://github.com/Netflix/dispatch. + +Very good structure on how to make a scalable codebase is also in [this repo](https://github.com/zhanymkanov/fastapi-best-practices). + +Just a brief document about how we should structure our backend codebase. 
+ +## Code Structure + +```markdown +src/ +// +models.py +services.py +prompts.py +views.py +utils.py +routers.py + + /_/ +``` + +### Service.py + +Always a single file, except if it becomes too long - more than ~500 lines, split it into \_subservices + +### Views.py + +Always split the views into two parts + +```python +# All +... + +# Requests +... + +# Responses +... +``` + +If too long β†’ split into multiple files + +### Prompts.py + +Single file; if too long β†’ split into multiple files (one prompt per file or so) + +### Routers.py + +Never split into more than one file diff --git a/browser_use/__init__.py b/browser_use/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cb9b1a4e30f886fda7d2aff7dade1bd7d543ecb3 --- /dev/null +++ b/browser_use/__init__.py @@ -0,0 +1,27 @@ +from browser_use.logging_config import setup_logging + +setup_logging() + +from browser_use.agent.prompts import SystemPrompt as SystemPrompt +from browser_use.agent.service import Agent as Agent +from browser_use.agent.views import ActionModel as ActionModel +from browser_use.agent.views import ActionResult as ActionResult +from browser_use.agent.views import AgentHistoryList as AgentHistoryList +from browser_use.browser.browser import Browser as Browser +from browser_use.browser.browser import BrowserConfig as BrowserConfig +from browser_use.browser.context import BrowserContextConfig +from browser_use.controller.service import Controller as Controller +from browser_use.dom.service import DomService as DomService + +__all__ = [ + 'Agent', + 'Browser', + 'BrowserConfig', + 'Controller', + 'DomService', + 'SystemPrompt', + 'ActionResult', + 'ActionModel', + 'AgentHistoryList', + 'BrowserContextConfig', +] diff --git a/browser_use/agent/gif.py b/browser_use/agent/gif.py new file mode 100644 index 0000000000000000000000000000000000000000..1cb7cbc9ce15444c2b6b69c0313580f77700177b --- /dev/null +++ b/browser_use/agent/gif.py @@ -0,0 +1,325 @@ +from __future__ import 
annotations + +import base64 +import io +import logging +import os +import platform +from typing import TYPE_CHECKING, Optional + +from browser_use.agent.views import ( + AgentHistoryList, +) + +if TYPE_CHECKING: + from PIL import Image, ImageFont + +logger = logging.getLogger(__name__) + + +def create_history_gif( + task: str, + history: AgentHistoryList, + # + output_path: str = 'agent_history.gif', + duration: int = 3000, + show_goals: bool = True, + show_task: bool = True, + show_logo: bool = False, + font_size: int = 40, + title_font_size: int = 56, + goal_font_size: int = 44, + margin: int = 40, + line_spacing: float = 1.5, +) -> None: + """Create a GIF from the agent's history with overlaid task and goal text.""" + if not history.history: + logger.warning('No history to create GIF from') + return + + from PIL import Image, ImageFont + + images = [] + + # if history is empty or first screenshot is None, we can't create a gif + if not history.history or not history.history[0].state.screenshot: + logger.warning('No history or first screenshot to create GIF from') + return + + # Try to load nicer fonts + try: + # Try different font options in order of preference + font_options = ['Helvetica', 'Arial', 'DejaVuSans', 'Verdana'] + font_loaded = False + + for font_name in font_options: + try: + if platform.system() == 'Windows': + # Need to specify the abs font path on Windows + font_name = os.path.join(os.getenv('WIN_FONT_DIR', 'C:\\Windows\\Fonts'), font_name + '.ttf') + regular_font = ImageFont.truetype(font_name, font_size) + title_font = ImageFont.truetype(font_name, title_font_size) + goal_font = ImageFont.truetype(font_name, goal_font_size) + font_loaded = True + break + except OSError: + continue + + if not font_loaded: + raise OSError('No preferred fonts found') + + except OSError: + regular_font = ImageFont.load_default() + title_font = ImageFont.load_default() + + goal_font = regular_font + + # Load logo if requested + logo = None + if show_logo: + try: + 
logo = Image.open('./static/browser-use.png') + # Resize logo to be small (e.g., 40px height) + logo_height = 150 + aspect_ratio = logo.width / logo.height + logo_width = int(logo_height * aspect_ratio) + logo = logo.resize((logo_width, logo_height), Image.Resampling.LANCZOS) + except Exception as e: + logger.warning(f'Could not load logo: {e}') + + # Create task frame if requested + if show_task and task: + task_frame = _create_task_frame( + task, + history.history[0].state.screenshot, + title_font, # type: ignore + regular_font, # type: ignore + logo, + line_spacing, + ) + images.append(task_frame) + + # Process each history item + for i, item in enumerate(history.history, 1): + if not item.state.screenshot: + continue + + # Convert base64 screenshot to PIL Image + img_data = base64.b64decode(item.state.screenshot) + image = Image.open(io.BytesIO(img_data)) + + if show_goals and item.model_output: + image = _add_overlay_to_image( + image=image, + step_number=i, + goal_text=item.model_output.current_state.next_goal, + regular_font=regular_font, # type: ignore + title_font=title_font, # type: ignore + margin=margin, + logo=logo, + ) + + images.append(image) + + if images: + # Save the GIF + images[0].save( + output_path, + save_all=True, + append_images=images[1:], + duration=duration, + loop=0, + optimize=False, + ) + logger.info(f'Created GIF at {output_path}') + else: + logger.warning('No images found in history to create GIF') + + +def _create_task_frame( + task: str, + first_screenshot: str, + title_font: 'ImageFont.FreeTypeFont', + regular_font: 'ImageFont.FreeTypeFont', + logo: Optional[Image.Image] = None, + line_spacing: float = 1.5, +) -> 'Image.Image': + """Create initial frame showing the task.""" + from PIL import Image, ImageDraw, ImageFont + + img_data = base64.b64decode(first_screenshot) + template = Image.open(io.BytesIO(img_data)) + image = Image.new('RGB', template.size, (0, 0, 0)) + draw = ImageDraw.Draw(image) + + # Calculate vertical center of 
def _add_overlay_to_image(
    image: 'Image.Image',
    step_number: int,
    goal_text: str,
    regular_font: 'ImageFont.FreeTypeFont',
    title_font: 'ImageFont.FreeTypeFont',
    margin: int,
    logo: Optional['Image.Image'] = None,
    display_step: bool = True,
    text_color: tuple[int, int, int, int] = (255, 255, 255, 255),
    text_box_color: tuple[int, int, int, int] = (0, 0, 0, 255),
) -> 'Image.Image':
    """Draw the step number, the goal text and an optional logo on top of a frame.

    Args:
        image: Frame to annotate; it is converted to RGBA, the input object is not drawn on.
        step_number: Number rendered in the bottom-left box.
        goal_text: Text rendered (wrapped) in a centered box above the step number.
        regular_font: Accepted for signature compatibility; not used by this function.
        title_font: Font used for both the step number and the goal text.
        margin: Distance in pixels kept from the image edges.
        logo: Optional image pasted in the top-right corner.
        display_step: When False the step-number box is not drawn. The goal box keeps
            the same position it has when the step number is shown.
        text_color: RGBA fill for the text.
        text_box_color: RGBA fill for the rounded background boxes.

    Returns:
        A new RGB image with the overlay composited onto it.
    """
    from PIL import Image, ImageDraw

    image = image.convert('RGBA')
    txt_layer = Image.new('RGBA', image.size, (0, 0, 0, 0))
    draw = ImageDraw.Draw(txt_layer)

    # The step-number geometry is computed unconditionally: the goal box below is
    # positioned relative to it, so it must exist even when display_step is False.
    # (Previously these names were only bound inside `if display_step:`.)
    step_text = str(step_number)
    step_bbox = draw.textbbox((0, 0), step_text, font=title_font)
    step_width = step_bbox[2] - step_bbox[0]
    step_height = step_bbox[3] - step_bbox[1]

    # Bottom-left anchor with a slight additional offset from the edges
    x_step = margin + 10
    y_step = image.height - margin - step_height - 10
    padding = 20

    if display_step:
        # Rounded background behind the step number
        step_bg_bbox = (
            x_step - padding,
            y_step - padding,
            x_step + step_width + padding,
            y_step + step_height + padding,
        )
        draw.rounded_rectangle(
            step_bg_bbox,
            radius=15,
            fill=text_box_color,
        )
        draw.text(
            (x_step, y_step),
            step_text,
            font=title_font,
            fill=text_color,
        )

    # Goal text: wrapped, horizontally centered, placed above the step number
    max_width = image.width - (4 * margin)
    wrapped_goal = _wrap_text(goal_text, title_font, max_width)
    goal_bbox = draw.multiline_textbbox((0, 0), wrapped_goal, font=title_font)
    goal_width = goal_bbox[2] - goal_bbox[0]
    goal_height = goal_bbox[3] - goal_bbox[1]

    x_goal = (image.width - goal_width) // 2
    y_goal = y_step - goal_height - padding * 4  # extra space between step and goal

    # Rounded background behind the goal text
    padding_goal = 25
    goal_bg_bbox = (
        x_goal - padding_goal,
        y_goal - padding_goal,
        x_goal + goal_width + padding_goal,
        y_goal + goal_height + padding_goal,
    )
    draw.rounded_rectangle(
        goal_bg_bbox,
        radius=15,
        fill=text_box_color,
    )
    draw.multiline_text(
        (x_goal, y_goal),
        wrapped_goal,
        font=title_font,
        fill=text_color,
        align='center',
    )

    # Logo goes on its own layer (top-right corner); the text layer is composited over it
    if logo:
        logo_layer = Image.new('RGBA', image.size, (0, 0, 0, 0))
        logo_margin = 20
        logo_x = image.width - logo.width - logo_margin
        logo_layer.paste(logo, (logo_x, logo_margin), logo if logo.mode == 'RGBA' else None)
        txt_layer = Image.alpha_composite(logo_layer, txt_layer)

    # Composite overlay onto the frame and drop the alpha channel
    result = Image.alpha_composite(image, txt_layer)
    return result.convert('RGB')
class MessageManager:
    """Builds and maintains the message history that is sent to the model.

    Every message is stored through `self.state.history` together with an
    estimated token count (see `_count_tokens`).
    """

    def __init__(
        self,
        task: str,
        system_message: SystemMessage,
        settings: Optional[MessageManagerSettings] = None,
        state: Optional[MessageManagerState] = None,
    ):
        """
        Args:
            task: The task text placed into the initial task message.
            system_message: First message of a freshly initialized history.
            settings: Manager settings; a new `MessageManagerSettings()` is created when omitted.
            state: Existing state to continue from; a new `MessageManagerState()` is created when omitted.
        """
        self.task = task
        # Defaults are created per instance. Using `MessageManagerState()` as the
        # parameter default would evaluate it once and share one history between
        # every manager constructed without an explicit state.
        self.settings = settings if settings is not None else MessageManagerSettings()
        self.state = state if state is not None else MessageManagerState()
        self.system_prompt = system_message

        # Only initialize messages if state is empty
        if len(self.state.history.messages) == 0:
            self._init_messages()

    def _init_messages(self) -> None:
        """Initialize the message history with system message, context, task, and other initial messages"""
        self._add_message_with_tokens(self.system_prompt)

        if self.settings.message_context:
            context_message = HumanMessage(content='Context for the task' + self.settings.message_context)
            self._add_message_with_tokens(context_message)

        task_message = HumanMessage(
            content=f'Your ultimate task is: """{self.task}""". If you achieved your ultimate task, stop everything and use the done action in the next step to complete the task. If not, continue as usual.'
        )
        self._add_message_with_tokens(task_message)

        if self.settings.sensitive_data:
            info = f'Here are placeholders for sensitve data: {list(self.settings.sensitive_data.keys())}'
            info += 'To use them, write the placeholder name'
            info_message = HumanMessage(content=info)
            self._add_message_with_tokens(info_message)

        placeholder_message = HumanMessage(content='Example output:')
        self._add_message_with_tokens(placeholder_message)

        # One example tool call followed by its tool reply
        tool_calls = [
            {
                'name': 'AgentOutput',
                'args': {
                    'current_state': {
                        'evaluation_previous_goal': 'Success - I opend the first page',
                        'memory': 'Starting with the new task. I have completed 1/10 steps',
                        'next_goal': 'Click on company a',
                    },
                    'action': [{'click_element': {'index': 0}}],
                },
                'id': str(self.state.tool_id),
                'type': 'tool_call',
            }
        ]

        example_tool_call = AIMessage(
            content='',
            tool_calls=tool_calls,
        )
        self._add_message_with_tokens(example_tool_call)
        self.add_tool_message(content='Browser started')

        placeholder_message = HumanMessage(content='[Your task history memory starts here]')
        self._add_message_with_tokens(placeholder_message)

        if self.settings.available_file_paths:
            filepaths_msg = HumanMessage(content=f'Here are file paths you can use: {self.settings.available_file_paths}')
            self._add_message_with_tokens(filepaths_msg)

    def add_new_task(self, new_task: str) -> None:
        """Append a message announcing `new_task` and remember it as the current task."""
        content = f'Your new ultimate task is: """{new_task}""". Take the previous context into account and finish your new ultimate task. '
        msg = HumanMessage(content=content)
        self._add_message_with_tokens(msg)
        self.task = new_task

    @time_execution_sync('--add_state_message')
    def add_state_message(
        self,
        state: BrowserState,
        result: Optional[List[ActionResult]] = None,
        step_info: Optional[AgentStepInfo] = None,
        use_vision=True,
    ) -> None:
        """Add browser state as human message"""

        # Results flagged include_in_memory are added to the history as their own
        # messages; once that happens `result` is cleared so the state message
        # below is built without any results.
        if result:
            for r in result:
                if r.include_in_memory:
                    if r.extracted_content:
                        msg = HumanMessage(content='Action result: ' + str(r.extracted_content))
                        self._add_message_with_tokens(msg)
                    if r.error:
                        # if endswith \n, remove it
                        if r.error.endswith('\n'):
                            r.error = r.error[:-1]
                        # get only last line of error
                        last_line = r.error.split('\n')[-1]
                        msg = HumanMessage(content='Action error: ' + last_line)
                        self._add_message_with_tokens(msg)
                    result = None  # if result in history, we dont want to add it again

        # otherwise add state message and result to next message (which will not stay in memory)
        state_message = AgentMessagePrompt(
            state,
            result,
            include_attributes=self.settings.include_attributes,
            step_info=step_info,
        ).get_user_message(use_vision)
        self._add_message_with_tokens(state_message)

    def add_model_output(self, model_output: AgentOutput) -> None:
        """Add model output as AI message"""
        tool_calls = [
            {
                'name': 'AgentOutput',
                'args': model_output.model_dump(mode='json', exclude_unset=True),
                'id': str(self.state.tool_id),
                'type': 'tool_call',
            }
        ]

        msg = AIMessage(
            content='',
            tool_calls=tool_calls,
        )

        self._add_message_with_tokens(msg)
        # empty tool response
        self.add_tool_message(content='')

    def add_plan(self, plan: Optional[str], position: int | None = None) -> None:
        """Add a non-empty plan as an AI message, optionally at `position`."""
        if plan:
            msg = AIMessage(content=plan)
            self._add_message_with_tokens(msg, position)

    @time_execution_sync('--get_messages')
    def get_messages(self) -> List[BaseMessage]:
        """Return the messages currently in the history and debug-log their token counts."""

        msg = [m.message for m in self.state.history.messages]
        # debug which messages are in history with token count # log
        total_input_tokens = 0
        logger.debug(f'Messages in history: {len(self.state.history.messages)}:')
        for m in self.state.history.messages:
            total_input_tokens += m.metadata.tokens
            logger.debug(f'{m.message.__class__.__name__} - Token count: {m.metadata.tokens}')
        logger.debug(f'Total input tokens: {total_input_tokens}')

        return msg

    def _add_message_with_tokens(self, message: BaseMessage, position: int | None = None) -> None:
        """Add message with token count metadata
        position: None for last, -1 for second last, etc.
        """

        # filter out sensitive data from the message
        if self.settings.sensitive_data:
            message = self._filter_sensitive_data(message)

        token_count = self._count_tokens(message)
        metadata = MessageMetadata(tokens=token_count)
        self.state.history.add_message(message, metadata, position)

    @time_execution_sync('--filter_sensitive_data')
    def _filter_sensitive_data(self, message: BaseMessage) -> BaseMessage:
        """Replace every configured sensitive value in the message text with its key.

        The message is modified in place and returned.
        """

        def replace_sensitive(value: str) -> str:
            if not self.settings.sensitive_data:
                return value
            for key, val in self.settings.sensitive_data.items():
                if not val:
                    continue
                value = value.replace(val, f'{key}')
            return value

        if isinstance(message.content, str):
            message.content = replace_sensitive(message.content)
        elif isinstance(message.content, list):
            for i, item in enumerate(message.content):
                if isinstance(item, dict) and 'text' in item:
                    item['text'] = replace_sensitive(item['text'])
                    message.content[i] = item
        return message

    def _count_tokens(self, message: BaseMessage) -> int:
        """Estimate the token count of a message.

        Image parts cost `settings.image_tokens` each; text is estimated by
        `_count_text_tokens`. For plain-string content the string form of
        `tool_calls` (when the attribute exists) is counted as well.
        """
        tokens = 0
        if isinstance(message.content, list):
            for item in message.content:
                # Only dict parts are inspected; a membership test on a plain
                # string part would be a substring search, not a key lookup.
                if not isinstance(item, dict):
                    continue
                if 'image_url' in item:
                    tokens += self.settings.image_tokens
                elif 'text' in item:
                    tokens += self._count_text_tokens(item['text'])
        else:
            msg = message.content
            if hasattr(message, 'tool_calls'):
                msg += str(message.tool_calls)  # type: ignore
            tokens += self._count_text_tokens(msg)
        return tokens

    def _count_text_tokens(self, text: str) -> int:
        """Count tokens in a text string"""
        tokens = len(text) // self.settings.estimated_characters_per_token  # Rough estimate if no tokenizer available
        return tokens

    def cut_messages(self):
        """Shrink the last message until the history fits into `settings.max_input_tokens`.

        Images are dropped from the last message first; if that is not enough, the
        tail of its text is cut proportionally to the number of excess tokens.

        Raises:
            ValueError: if more than 99% of the last message would have to be removed.
        """
        diff = self.state.history.current_tokens - self.settings.max_input_tokens
        if diff <= 0:
            return None

        msg = self.state.history.messages[-1]

        # if list with image remove image
        if isinstance(msg.message.content, list):
            text = ''
            # The list is only read here (the content is replaced by `text` below).
            # Removing parts while iterating would skip the part after each image.
            for item in msg.message.content:
                if not isinstance(item, dict):
                    continue
                if 'image_url' in item:
                    diff -= self.settings.image_tokens
                    msg.metadata.tokens -= self.settings.image_tokens
                    self.state.history.current_tokens -= self.settings.image_tokens
                    logger.debug(
                        f'Removed image with {self.settings.image_tokens} tokens - total tokens now: {self.state.history.current_tokens}/{self.settings.max_input_tokens}'
                    )
                elif 'text' in item:
                    text += item['text']
            msg.message.content = text
            self.state.history.messages[-1] = msg

        if diff <= 0:
            return None

        # if still over, remove text from state message proportionally to the number of tokens needed with buffer
        # A last message without tokens cannot absorb the excess; treat it like the
        # "too much to remove" case instead of dividing by zero.
        last_tokens = msg.metadata.tokens
        proportion_to_remove = diff / last_tokens if last_tokens > 0 else float('inf')
        if proportion_to_remove > 0.99:
            raise ValueError(
                f'Max token limit reached - history is too long - reduce the system prompt or task. '
                f'proportion_to_remove: {proportion_to_remove}'
            )
        logger.debug(
            f'Removing {proportion_to_remove * 100:.2f}% of the last message {proportion_to_remove * msg.metadata.tokens:.2f} / {msg.metadata.tokens:.2f} tokens)'
        )

        content = msg.message.content
        characters_to_remove = int(len(content) * proportion_to_remove)
        # Slice from the front: `content[:-0]` would be the empty string.
        content = content[: len(content) - characters_to_remove]

        # remove tokens and old long message
        self.state.history.remove_last_state_message()

        # new message with updated content
        msg = HumanMessage(content=content)
        self._add_message_with_tokens(msg)

        last_msg = self.state.history.messages[-1]

        logger.debug(
            f'Added message with {last_msg.metadata.tokens} tokens - total tokens now: {self.state.history.current_tokens}/{self.settings.max_input_tokens} - total messages: {len(self.state.history.messages)}'
        )

    def _remove_last_state_message(self) -> None:
        """Remove last state message from history"""
        self.state.history.remove_last_state_message()

    def add_tool_message(self, content: str) -> None:
        """Add a tool message answering the current tool id, then advance the tool id."""
        msg = ToolMessage(content=content, tool_call_id=str(self.state.tool_id))
        self.state.tool_id += 1
        self._add_message_with_tokens(msg)
def test_initial_messages(message_manager: MessageManager):
    """Test that a fresh message manager is seeded with the expected initial messages"""
    messages = message_manager.get_messages()

    # Without message_context, sensitive_data or available_file_paths the manager
    # seeds: system prompt, task, 'Example output:' marker, example tool call,
    # tool reply and the memory-start marker.
    assert len(messages) == 6
    assert isinstance(messages[0], SystemMessage)
    assert isinstance(messages[1], HumanMessage)
    assert 'Test task' in messages[1].content
    assert isinstance(messages[3], AIMessage)
    assert messages[-1].content == '[Your task history memory starts here]'
def test_add_state_with_non_memory_result(message_manager: MessageManager):
    """Test adding state with result that should not be included in memory"""
    # Measure relative to whatever the manager seeded, instead of hard-coding
    # the number of initial messages.
    baseline = len(message_manager.get_messages())

    state = BrowserState(
        url='https://test.com',
        title='Test Page',
        element_tree=DOMElementNode(
            tag_name='div',
            attributes={},
            children=[],
            is_visible=True,
            parent=None,
            xpath='//div',
        ),
        selector_map={},
        tabs=[TabInfo(page_id=1, url='https://test.com', title='Test Page')],
    )
    result = ActionResult(extracted_content='Temporary content', include_in_memory=False)

    message_manager.add_state_message(state, [result])
    messages = message_manager.get_messages()

    # Exactly one message is added: the combined state+result message
    assert len(messages) == baseline + 1
    assert isinstance(messages[-1], HumanMessage)
    assert 'Temporary content' in messages[-1].content
element_tree=DOMElementNode( + tag_name='div', + attributes={}, + children=[ + DOMTextNode( + text=f'Content {j} ' * (10 + i), # Increasing content length + is_visible=True, + parent=None, + ) + for j in range(5) # Multiple DOM items + ], + is_visible=True, + parent=None, + xpath='//div', + ), + selector_map={j: f'//div[{j}]' for j in range(5)}, + tabs=[TabInfo(page_id=1, url=f'https://test{i}.com', title=f'Test Page {i}')], + ) + + # Alternate between different types of results + result = None + if i % 2 == 0: # Every other iteration + result = ActionResult( + extracted_content=f'Important content from step {i}' * 5, + include_in_memory=i % 4 == 0, # Include in memory every 4th message + ) + + # Add state message + if result: + message_manager.add_state_message(state, [result]) + else: + message_manager.add_state_message(state) + + try: + messages = message_manager.get_messages() + except ValueError as e: + if 'Max token limit reached - history is too long' in str(e): + return # If error occurs, end the test + else: + raise e + + assert message_manager.state.history.current_tokens <= message_manager.settings.max_input_tokens + 100 + + last_msg = messages[-1] + assert isinstance(last_msg, HumanMessage) + + if i % 4 == 0: + assert isinstance(message_manager.state.history.messages[-2].message, HumanMessage) + if i % 2 == 0 and not i % 4 == 0: + if isinstance(last_msg.content, list): + assert 'Current url: https://test' in last_msg.content[0]['text'] + else: + assert 'Current url: https://test' in last_msg.content + + # Add model output every time + from browser_use.agent.views import AgentBrain, AgentOutput + from browser_use.controller.registry.views import ActionModel + + output = AgentOutput( + current_state=AgentBrain( + evaluation_previous_goal=f'Success in step {i}', + memory=f'Memory from step {i}', + next_goal=f'Goal for step {i + 1}', + ), + action=[ActionModel()], + ) + message_manager._remove_last_state_message() + message_manager.add_model_output(output) 
def extract_json_from_model_output(content: str) -> dict:
    """Extract JSON from model output, handling both plain JSON and code-block-wrapped JSON.

    Args:
        content: Raw model output; either JSON text or text containing a
            ``` fenced block whose body is JSON.

    Returns:
        The parsed JSON value.

    Raises:
        ValueError: if the (unwrapped) content is not valid JSON. The original
            `json.JSONDecodeError` is attached as the cause.
    """
    try:
        # If content is wrapped in code blocks, extract just the JSON part
        if '```' in content:
            # Text between the first pair of fences
            content = content.split('```')[1]
            # Drop the first line when it is a language identifier (e.g. 'json') or
            # empty, but keep it when the JSON itself starts right after the fence.
            first_line, newline, remainder = content.partition('\n')
            if newline and not first_line.lstrip().startswith(('{', '[')):
                content = remainder
        # Parse the cleaned content
        return json.loads(content)
    except json.JSONDecodeError as e:
        logger.warning(f'Failed to parse model output: {content} {str(e)}')
        raise ValueError('Could not parse response.') from e
output_messages.append(message) + else: + raise ValueError(f'Unknown message type: {type(message)}') + return output_messages + + +def _merge_successive_messages(messages: list[BaseMessage], class_to_merge: Type[BaseMessage]) -> list[BaseMessage]: + """Some models like deepseek-reasoner dont allow multiple human messages in a row. This function merges them into one.""" + merged_messages = [] + streak = 0 + for message in messages: + if isinstance(message, class_to_merge): + streak += 1 + if streak > 1: + if isinstance(message.content, list): + merged_messages[-1].content += message.content[0]['text'] # type:ignore + else: + merged_messages[-1].content += message.content + else: + merged_messages.append(message) + else: + merged_messages.append(message) + streak = 0 + return merged_messages + + +def save_conversation(input_messages: list[BaseMessage], response: Any, target: str, encoding: Optional[str] = None) -> None: + """Save conversation history to file.""" + + # create folders if not exists + os.makedirs(os.path.dirname(target), exist_ok=True) + + with open( + target, + 'w', + encoding=encoding, + ) as f: + _write_messages_to_file(f, input_messages) + _write_response_to_file(f, response) + + +def _write_messages_to_file(f: Any, messages: list[BaseMessage]) -> None: + """Write messages to conversation file""" + for message in messages: + f.write(f' {message.__class__.__name__} \n') + + if isinstance(message.content, list): + for item in message.content: + if isinstance(item, dict) and item.get('type') == 'text': + f.write(item['text'].strip() + '\n') + elif isinstance(message.content, str): + try: + content = json.loads(message.content) + f.write(json.dumps(content, indent=2) + '\n') + except json.JSONDecodeError: + f.write(message.content.strip() + '\n') + + f.write('\n') + + +def _write_response_to_file(f: Any, response: Any) -> None: + """Write model response to conversation file""" + f.write(' RESPONSE\n') + 
f.write(json.dumps(json.loads(response.model_dump_json(exclude_unset=True)), indent=2)) diff --git a/browser_use/agent/message_manager/views.py b/browser_use/agent/message_manager/views.py new file mode 100644 index 0000000000000000000000000000000000000000..ad8c9c67056c0842bf20d7afa49f7b1ff2665e93 --- /dev/null +++ b/browser_use/agent/message_manager/views.py @@ -0,0 +1,129 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from langchain_core.load import dumpd, load +from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage +from pydantic import BaseModel, ConfigDict, Field, model_serializer, model_validator + +if TYPE_CHECKING: + from browser_use.agent.views import AgentOutput + + +class MessageMetadata(BaseModel): + """Metadata for a message""" + + tokens: int = 0 + + +class ManagedMessage(BaseModel): + """A message with its metadata""" + + message: BaseMessage + metadata: MessageMetadata = Field(default_factory=MessageMetadata) + + model_config = ConfigDict(arbitrary_types_allowed=True) + + # https://github.com/pydantic/pydantic/discussions/7558 + @model_serializer(mode='wrap') + def to_json(self, original_dump): + """ + Returns the JSON representation of the model. + + It uses langchain's `dumps` function to serialize the `message` + property before encoding the overall dict with json.dumps. + """ + data = original_dump(self) + + # NOTE: We override the message field to use langchain JSON serialization. + data['message'] = dumpd(self.message) + + return data + + @model_validator(mode='before') + @classmethod + def validate( + cls, + value: Any, + *, + strict: bool | None = None, + from_attributes: bool | None = None, + context: Any | None = None, + ) -> Any: + """ + Custom validator that uses langchain's `loads` function + to parse the message if it is provided as a JSON string. 
class MessageHistory(BaseModel):
    """Ordered list of managed messages together with a running token total."""

    messages: list[ManagedMessage] = Field(default_factory=list)
    current_tokens: int = 0

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def add_message(self, message: BaseMessage, metadata: MessageMetadata, position: int | None = None) -> None:
        """Store `message` with its `metadata`, at the end or at `position`, and add its tokens to the total."""
        entry = ManagedMessage(message=message, metadata=metadata)
        if position is not None:
            self.messages.insert(position, entry)
        else:
            self.messages.append(entry)
        self.current_tokens += metadata.tokens

    def add_model_output(self, output: 'AgentOutput') -> None:
        """Record a model output as an AI tool call followed by an empty tool reply."""
        call_id = '1'
        ai_message = AIMessage(
            content='',
            tool_calls=[
                {
                    'name': 'AgentOutput',
                    'args': output.model_dump(mode='json', exclude_unset=True),
                    'id': call_id,
                    'type': 'tool_call',
                }
            ],
        )
        # Token figures are fixed estimates: 100 for the tool call, 10 for the empty reply
        self.add_message(ai_message, MessageMetadata(tokens=100))
        self.add_message(ToolMessage(content='', tool_call_id=call_id), MessageMetadata(tokens=10))

    def get_messages(self) -> list[BaseMessage]:
        """Return the bare messages in history order."""
        plain: list[BaseMessage] = []
        for managed in self.messages:
            plain.append(managed.message)
        return plain

    def get_total_tokens(self) -> int:
        """Return the running token total."""
        return self.current_tokens

    def remove_oldest_message(self) -> None:
        """Drop the first message that is not a system message, if there is one."""
        index = next(
            (i for i, managed in enumerate(self.messages) if not isinstance(managed.message, SystemMessage)),
            None,
        )
        if index is None:
            return
        self.current_tokens -= self.messages[index].metadata.tokens
        del self.messages[index]

    def remove_last_state_message(self) -> None:
        """Drop the last message when it is a human message and more than two messages exist."""
        if len(self.messages) <= 2:
            return
        last = self.messages[-1]
        if not isinstance(last.message, HumanMessage):
            return
        self.current_tokens -= last.metadata.tokens
        self.messages.pop()
class AgentMessagePrompt:
    """Builds the per-step user message describing the current browser state."""

    def __init__(
        self,
        state: 'BrowserState',
        result: Optional[List['ActionResult']] = None,
        include_attributes: Optional[list[str]] = None,
        step_info: Optional['AgentStepInfo'] = None,
    ):
        """
        Args:
            state: Browser state to describe (url, tabs, element tree, screenshot, scroll offsets).
            result: Results of the previous actions; their content and errors are appended to the message.
            include_attributes: Attribute names forwarded to ``clickable_elements_to_string``.
                ``None`` means an empty list.
            step_info: Step counter to show; omitted from the message when not given.
        """
        self.state = state
        self.result = result
        # Build a fresh list per instance: a shared ``[]`` default would leak
        # mutations from one prompt into every later one.
        self.include_attributes = include_attributes if include_attributes is not None else []
        self.step_info = step_info

    def get_user_message(self, use_vision: bool = True) -> HumanMessage:
        """Render the state description as a HumanMessage.

        Args:
            use_vision: When true and the state has a screenshot, the message content
                is a list holding the text plus the screenshot as a base64 PNG data URL.

        Returns:
            HumanMessage: Text-only content, or text plus image content as described above.
        """
        elements_text = self.state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes)

        has_content_above = (self.state.pixels_above or 0) > 0
        has_content_below = (self.state.pixels_below or 0) > 0

        if elements_text != '':
            if has_content_above:
                elements_text = (
                    f'... {self.state.pixels_above} pixels above - scroll or extract content to see more ...\n{elements_text}'
                )
            else:
                elements_text = f'[Start of page]\n{elements_text}'
            if has_content_below:
                elements_text = (
                    f'{elements_text}\n... {self.state.pixels_below} pixels below - scroll or extract content to see more ...'
                )
            else:
                elements_text = f'{elements_text}\n[End of page]'
        else:
            elements_text = 'empty page'

        # Step counter and timestamp go on separate lines; previously they were
        # concatenated with no separator ("Current step: 1/5Current date and time: ...").
        info_lines = []
        if self.step_info:
            info_lines.append(f'Current step: {self.step_info.step_number + 1}/{self.step_info.max_steps}')
        time_str = datetime.now().strftime('%Y-%m-%d %H:%M')
        info_lines.append(f'Current date and time: {time_str}')
        step_info_description = '\n'.join(info_lines)

        state_description = f"""
[Task history memory ends]
[Current state starts here]
The following is one-time information - if you need to remember it write it to memory:
Current url: {self.state.url}
Available tabs:
{self.state.tabs}
Interactive elements from top layer of the current page inside the viewport:
{elements_text}
{step_info_description}
"""

        if self.result:
            total = len(self.result)
            for i, result in enumerate(self.result):
                if result.extracted_content:
                    state_description += f'\nAction result {i + 1}/{total}: {result.extracted_content}'
                if result.error:
                    # only use last line of error
                    error = result.error.split('\n')[-1]
                    state_description += f'\nAction error {i + 1}/{total}: ...{error}'

        if self.state.screenshot and use_vision:
            # Format message for vision model
            return HumanMessage(
                content=[
                    {'type': 'text', 'text': state_description},
                    {
                        'type': 'image_url',
                        'image_url': {'url': f'data:image/png;base64,{self.state.screenshot}'},  # , 'detail': 'low'
                    },
                ]
            )

        return HumanMessage(content=state_description)
Suggest the next high-level steps to take + +Inside your messages, there will be AI messages from different agents with different formats. + +Your output format should be always a JSON object with the following fields: +{ + "state_analysis": "Brief analysis of the current state and what has been done so far", + "progress_evaluation": "Evaluation of progress towards the ultimate goal (as percentage and description)", + "challenges": "List any potential challenges or roadblocks", + "next_steps": "List 2-3 concrete next steps to take", + "reasoning": "Explain your reasoning for the suggested next steps" +} + +Ignore the other AI messages output structures. + +Keep your responses concise and focused on actionable insights.""" + ) diff --git a/browser_use/agent/service.py b/browser_use/agent/service.py new file mode 100644 index 0000000000000000000000000000000000000000..50947781fb088babad2dcb594739ed398aad4de5 --- /dev/null +++ b/browser_use/agent/service.py @@ -0,0 +1,964 @@ +from __future__ import annotations + +import asyncio +import json +import logging +import re +import time +from pathlib import Path +from typing import Any, Awaitable, Callable, Dict, Generic, List, Optional, TypeVar + +from dotenv import load_dotenv +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.messages import ( + BaseMessage, + HumanMessage, + SystemMessage, +) + +# from lmnr.sdk.decorators import observe +from pydantic import BaseModel, ValidationError + +from browser_use.agent.gif import create_history_gif +from browser_use.agent.message_manager.service import MessageManager, MessageManagerSettings +from browser_use.agent.message_manager.utils import convert_input_messages, extract_json_from_model_output, save_conversation +from browser_use.agent.prompts import AgentMessagePrompt, PlannerPrompt, SystemPrompt +from browser_use.agent.views import ( + ActionResult, + AgentError, + AgentHistory, + AgentHistoryList, + AgentOutput, + AgentSettings, + 
AgentState, + AgentStepInfo, + StepMetadata, + ToolCallingMethod, +) +from browser_use.browser.browser import Browser +from browser_use.browser.context import BrowserContext +from browser_use.browser.views import BrowserState, BrowserStateHistory +from browser_use.controller.registry.views import ActionModel +from browser_use.controller.service import Controller +from browser_use.dom.history_tree_processor.service import ( + DOMHistoryElement, + HistoryTreeProcessor, +) +from browser_use.telemetry.service import ProductTelemetry +from browser_use.telemetry.views import ( + AgentEndTelemetryEvent, + AgentRunTelemetryEvent, + AgentStepTelemetryEvent, +) +from browser_use.utils import time_execution_async, time_execution_sync + +load_dotenv() +logger = logging.getLogger(__name__) + + +def log_response(response: AgentOutput) -> None: + """Utility function to log the model's response.""" + + if 'Success' in response.current_state.evaluation_previous_goal: + emoji = 'πŸ‘' + elif 'Failed' in response.current_state.evaluation_previous_goal: + emoji = '⚠' + else: + emoji = '🀷' + + logger.info(f'{emoji} Eval: {response.current_state.evaluation_previous_goal}') + logger.info(f'🧠 Memory: {response.current_state.memory}') + logger.info(f'🎯 Next goal: {response.current_state.next_goal}') + for i, action in enumerate(response.action): + logger.info(f'πŸ› οΈ Action {i + 1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}') + + +Context = TypeVar('Context') + + +class Agent(Generic[Context]): + @time_execution_sync('--init (agent)') + def __init__( + self, + task: str, + llm: BaseChatModel, + # Optional parameters + browser: Browser | None = None, + browser_context: BrowserContext | None = None, + controller: Controller[Context] = Controller(), + # Initial agent run parameters + sensitive_data: Optional[Dict[str, str]] = None, + initial_actions: Optional[List[Dict[str, Dict[str, Any]]]] = None, + # Cloud Callbacks + register_new_step_callback: 
Callable[['BrowserState', 'AgentOutput', int], Awaitable[None]] | None = None, + register_done_callback: Callable[['AgentHistoryList'], Awaitable[None]] | None = None, + register_external_agent_status_raise_error_callback: Callable[[], Awaitable[bool]] | None = None, + # Agent settings + use_vision: bool = True, + use_vision_for_planner: bool = False, + save_conversation_path: Optional[str] = None, + save_conversation_path_encoding: Optional[str] = 'utf-8', + max_failures: int = 3, + retry_delay: int = 10, + override_system_message: Optional[str] = None, + extend_system_message: Optional[str] = None, + max_input_tokens: int = 128000, + validate_output: bool = False, + message_context: Optional[str] = None, + generate_gif: bool | str = False, + available_file_paths: Optional[list[str]] = None, + include_attributes: list[str] = [ + 'title', + 'type', + 'name', + 'role', + 'aria-label', + 'placeholder', + 'value', + 'alt', + 'aria-expanded', + 'data-date-format', + ], + max_actions_per_step: int = 10, + tool_calling_method: Optional[ToolCallingMethod] = 'auto', + page_extraction_llm: Optional[BaseChatModel] = None, + planner_llm: Optional[BaseChatModel] = None, + planner_interval: int = 1, # Run planner every N steps + # Inject state + injected_agent_state: Optional[AgentState] = None, + # + context: Context | None = None, + ): + if page_extraction_llm is None: + page_extraction_llm = llm + + # Core components + self.task = task + self.llm = llm + self.controller = controller + self.sensitive_data = sensitive_data + + self.settings = AgentSettings( + use_vision=use_vision, + use_vision_for_planner=use_vision_for_planner, + save_conversation_path=save_conversation_path, + save_conversation_path_encoding=save_conversation_path_encoding, + max_failures=max_failures, + retry_delay=retry_delay, + override_system_message=override_system_message, + extend_system_message=extend_system_message, + max_input_tokens=max_input_tokens, + validate_output=validate_output, + 
message_context=message_context, + generate_gif=generate_gif, + available_file_paths=available_file_paths, + include_attributes=include_attributes, + max_actions_per_step=max_actions_per_step, + tool_calling_method=tool_calling_method, + page_extraction_llm=page_extraction_llm, + planner_llm=planner_llm, + planner_interval=planner_interval, + ) + + # Initialize state + self.state = injected_agent_state or AgentState() + + # Action setup + self._setup_action_models() + self._set_browser_use_version_and_source() + self.initial_actions = self._convert_initial_actions(initial_actions) if initial_actions else None + + # Model setup + self._set_model_names() + + # for models without tool calling, add available actions to context + self.available_actions = self.controller.registry.get_prompt_description() + + self.tool_calling_method = self._set_tool_calling_method() + self.settings.message_context = self._set_message_context() + + # Initialize message manager with state + self._message_manager = MessageManager( + task=task, + system_message=SystemPrompt( + action_description=self.available_actions, + max_actions_per_step=self.settings.max_actions_per_step, + override_system_message=override_system_message, + extend_system_message=extend_system_message, + ).get_system_message(), + settings=MessageManagerSettings( + max_input_tokens=self.settings.max_input_tokens, + include_attributes=self.settings.include_attributes, + message_context=self.settings.message_context, + sensitive_data=sensitive_data, + available_file_paths=self.settings.available_file_paths, + ), + state=self.state.message_manager_state, + ) + + # Browser setup + self.injected_browser = browser is not None + self.injected_browser_context = browser_context is not None + self.browser = browser if browser is not None else (None if browser_context else Browser()) + if browser_context: + self.browser_context = browser_context + elif self.browser: + self.browser_context = BrowserContext(browser=self.browser, 
config=self.browser.config.new_context_config) + else: + self.browser = Browser() + self.browser_context = BrowserContext(browser=self.browser) + + # Callbacks + self.register_new_step_callback = register_new_step_callback + self.register_done_callback = register_done_callback + self.register_external_agent_status_raise_error_callback = register_external_agent_status_raise_error_callback + + # Context + self.context = context + + # Telemetry + self.telemetry = ProductTelemetry() + + if self.settings.save_conversation_path: + logger.info(f'Saving conversation to {self.settings.save_conversation_path}') + + def _set_message_context(self) -> str | None: + if self.tool_calling_method == 'raw': + if self.settings.message_context: + self.settings.message_context += f'\n\nAvailable actions: {self.available_actions}' + else: + self.settings.message_context = f'Available actions: {self.available_actions}' + return self.settings.message_context + + def _set_browser_use_version_and_source(self) -> None: + """Get the version and source of the browser-use package (git or pip in a nutshell)""" + try: + # First check for repository-specific files + repo_files = ['.git', 'README.md', 'docs', 'examples'] + package_root = Path(__file__).parent.parent.parent + + # If all of these files/dirs exist, it's likely from git + if all(Path(package_root / file).exists() for file in repo_files): + try: + import subprocess + + version = subprocess.check_output(['git', 'describe', '--tags']).decode('utf-8').strip() + except Exception: + version = 'unknown' + source = 'git' + else: + # If no repo files found, try getting version from pip + import pkg_resources + + version = pkg_resources.get_distribution('browser-use').version + source = 'pip' + except Exception: + version = 'unknown' + source = 'unknown' + + logger.debug(f'Version: {version}, Source: {source}') + self.version = version + self.source = source + + def _set_model_names(self) -> None: + self.chat_model_library = 
self.llm.__class__.__name__ + self.model_name = 'Unknown' + if hasattr(self.llm, 'model_name'): + model = self.llm.model_name # type: ignore + self.model_name = model if model is not None else 'Unknown' + elif hasattr(self.llm, 'model'): + model = self.llm.model # type: ignore + self.model_name = model if model is not None else 'Unknown' + + if self.settings.planner_llm: + if hasattr(self.settings.planner_llm, 'model_name'): + self.planner_model_name = self.settings.planner_llm.model_name # type: ignore + elif hasattr(self.settings.planner_llm, 'model'): + self.planner_model_name = self.settings.planner_llm.model # type: ignore + else: + self.planner_model_name = 'Unknown' + else: + self.planner_model_name = None + + def _setup_action_models(self) -> None: + """Setup dynamic action models from controller's registry""" + self.ActionModel = self.controller.registry.create_action_model() + # Create output model with the dynamic actions + self.AgentOutput = AgentOutput.type_with_custom_actions(self.ActionModel) + + # used to force the done action when max_steps is reached + self.DoneActionModel = self.controller.registry.create_action_model(include_actions=['done']) + self.DoneAgentOutput = AgentOutput.type_with_custom_actions(self.DoneActionModel) + + def _set_tool_calling_method(self) -> Optional[ToolCallingMethod]: + tool_calling_method = self.settings.tool_calling_method + if tool_calling_method == 'auto': + if 'deepseek-reasoner' in self.model_name or 'deepseek-r1' in self.model_name: + return 'raw' + elif self.chat_model_library == 'ChatGoogleGenerativeAI': + return None + elif self.chat_model_library == 'ChatOpenAI': + return 'function_calling' + elif self.chat_model_library == 'AzureChatOpenAI': + return 'function_calling' + else: + return None + else: + return tool_calling_method + + def add_new_task(self, new_task: str) -> None: + self._message_manager.add_new_task(new_task) + + async def _raise_if_stopped_or_paused(self) -> None: + """Utility function that 
raises an InterruptedError if the agent is stopped or paused.""" + + if self.register_external_agent_status_raise_error_callback: + if await self.register_external_agent_status_raise_error_callback(): + raise InterruptedError + + if self.state.stopped or self.state.paused: + logger.debug('Agent paused after getting state') + raise InterruptedError + + # @observe(name='agent.step', ignore_output=True, ignore_input=True) + @time_execution_async('--step (agent)') + async def step(self, step_info: Optional[AgentStepInfo] = None) -> None: + """Execute one step of the task""" + logger.info(f'πŸ“ Step {self.state.n_steps}') + state = None + model_output = None + result: list[ActionResult] = [] + step_start_time = time.time() + tokens = 0 + + try: + state = await self.browser_context.get_state() + + await self._raise_if_stopped_or_paused() + + self._message_manager.add_state_message(state, self.state.last_result, step_info, self.settings.use_vision) + + # Run planner at specified intervals if planner is configured + if self.settings.planner_llm and self.state.n_steps % self.settings.planner_interval == 0: + plan = await self._run_planner() + # add plan before last state message + self._message_manager.add_plan(plan, position=-1) + + if step_info and step_info.is_last_step(): + # Add last step warning if needed + msg = 'Now comes your last step. Use only the "done" action now. No other actions - so here your action sequence must have length 1.' + msg += '\nIf the task is not yet fully finished as requested by the user, set success in "done" to false! E.g. if not all steps are fully completed.' + msg += '\nIf the task is fully finished, set success in "done" to true.' + msg += '\nInclude everything you found out for the ultimate task in the done text.' 
+ logger.info('Last step finishing up') + self._message_manager._add_message_with_tokens(HumanMessage(content=msg)) + self.AgentOutput = self.DoneAgentOutput + + input_messages = self._message_manager.get_messages() + tokens = self._message_manager.state.history.current_tokens + + try: + model_output = await self.get_next_action(input_messages) + + self.state.n_steps += 1 + + if self.register_new_step_callback: + await self.register_new_step_callback(state, model_output, self.state.n_steps) + + if self.settings.save_conversation_path: + target = self.settings.save_conversation_path + f'_{self.state.n_steps}.txt' + save_conversation(input_messages, model_output, target, self.settings.save_conversation_path_encoding) + + self._message_manager._remove_last_state_message() # we dont want the whole state in the chat history + + await self._raise_if_stopped_or_paused() + + self._message_manager.add_model_output(model_output) + except Exception as e: + # model call failed, remove last state message from history + self._message_manager._remove_last_state_message() + raise e + + result: list[ActionResult] = await self.multi_act(model_output.action) + + self.state.last_result = result + + if len(result) > 0 and result[-1].is_done: + logger.info(f'πŸ“„ Result: {result[-1].extracted_content}') + + self.state.consecutive_failures = 0 + + except InterruptedError: + logger.debug('Agent paused') + self.state.last_result = [ + ActionResult( + error='The agent was paused - now continuing actions might need to be repeated', include_in_memory=True + ) + ] + return + except Exception as e: + result = await self._handle_step_error(e) + self.state.last_result = result + + finally: + step_end_time = time.time() + actions = [a.model_dump(exclude_unset=True) for a in model_output.action] if model_output else [] + self.telemetry.capture( + AgentStepTelemetryEvent( + agent_id=self.state.agent_id, + step=self.state.n_steps, + actions=actions, + 
consecutive_failures=self.state.consecutive_failures, + step_error=[r.error for r in result if r.error] if result else ['No result'], + ) + ) + if not result: + return + + if state: + metadata = StepMetadata( + step_number=self.state.n_steps, + step_start_time=step_start_time, + step_end_time=step_end_time, + input_tokens=tokens, + ) + self._make_history_item(model_output, state, result, metadata) + + @time_execution_async('--handle_step_error (agent)') + async def _handle_step_error(self, error: Exception) -> list[ActionResult]: + """Handle all types of errors that can occur during a step""" + include_trace = logger.isEnabledFor(logging.DEBUG) + error_msg = AgentError.format_error(error, include_trace=include_trace) + prefix = f'❌ Result failed {self.state.consecutive_failures + 1}/{self.settings.max_failures} times:\n ' + + if isinstance(error, (ValidationError, ValueError)): + logger.error(f'{prefix}{error_msg}') + if 'Max token limit reached' in error_msg: + # cut tokens from history + self._message_manager.settings.max_input_tokens = self.settings.max_input_tokens - 500 + logger.info( + f'Cutting tokens from history - new max input tokens: {self._message_manager.settings.max_input_tokens}' + ) + self._message_manager.cut_messages() + elif 'Could not parse response' in error_msg: + # give model a hint how output should look like + error_msg += '\n\nReturn a valid JSON object with the required fields.' 
+ + self.state.consecutive_failures += 1 + else: + from google.api_core.exceptions import ResourceExhausted + from openai import RateLimitError + + if isinstance(error, RateLimitError) or isinstance(error, ResourceExhausted): + logger.warning(f'{prefix}{error_msg}') + await asyncio.sleep(self.settings.retry_delay) + self.state.consecutive_failures += 1 + else: + logger.error(f'{prefix}{error_msg}') + self.state.consecutive_failures += 1 + + return [ActionResult(error=error_msg, include_in_memory=True)] + + def _make_history_item( + self, + model_output: AgentOutput | None, + state: BrowserState, + result: list[ActionResult], + metadata: Optional[StepMetadata] = None, + ) -> None: + """Create and store history item""" + + if model_output: + interacted_elements = AgentHistory.get_interacted_element(model_output, state.selector_map) + else: + interacted_elements = [None] + + state_history = BrowserStateHistory( + url=state.url, + title=state.title, + tabs=state.tabs, + interacted_element=interacted_elements, + screenshot=state.screenshot, + ) + + history_item = AgentHistory(model_output=model_output, result=result, state=state_history, metadata=metadata) + + self.state.history.history.append(history_item) + + THINK_TAGS = re.compile(r'.*?', re.DOTALL) + STRAY_CLOSE_TAG = re.compile(r'.*?', re.DOTALL) + + def _remove_think_tags(self, text: str) -> str: + # Step 1: Remove well-formed ... + text = re.sub(self.THINK_TAGS, '', text) + # Step 2: If there's an unmatched closing tag , + # remove everything up to and including that. 
+ text = re.sub(self.STRAY_CLOSE_TAG, '', text) + return text.strip() + + def _convert_input_messages(self, input_messages: list[BaseMessage]) -> list[BaseMessage]: + """Convert input messages to the correct format""" + if self.model_name == 'deepseek-reasoner' or 'deepseek-r1' in self.model_name: + return convert_input_messages(input_messages, self.model_name) + else: + return input_messages + + @time_execution_async('--get_next_action (agent)') + async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput: + """Get next action from LLM based on current state""" + input_messages = self._convert_input_messages(input_messages) + + if self.tool_calling_method == 'raw': + output = self.llm.invoke(input_messages) + # TODO: currently invoke does not return reasoning_content, we should override invoke + output.content = self._remove_think_tags(str(output.content)) + try: + parsed_json = extract_json_from_model_output(output.content) + parsed = self.AgentOutput(**parsed_json) + except (ValueError, ValidationError) as e: + logger.warning(f'Failed to parse model output: {output} {str(e)}') + raise ValueError('Could not parse response.') + + elif self.tool_calling_method is None: + structured_llm = self.llm.with_structured_output(self.AgentOutput, include_raw=True) + response: dict[str, Any] = await structured_llm.ainvoke(input_messages) # type: ignore + parsed: AgentOutput | None = response['parsed'] + else: + structured_llm = self.llm.with_structured_output(self.AgentOutput, include_raw=True, method=self.tool_calling_method) + response: dict[str, Any] = await structured_llm.ainvoke(input_messages) # type: ignore + parsed: AgentOutput | None = response['parsed'] + + if parsed is None: + raise ValueError('Could not parse response.') + + # cut the number of actions to max_actions_per_step if needed + if len(parsed.action) > self.settings.max_actions_per_step: + parsed.action = parsed.action[: self.settings.max_actions_per_step] + + log_response(parsed) + 
async def take_step(self) -> tuple[bool, bool]:
    """Run a single agent step and report completion.

    Returns:
        Tuple[bool, bool]: ``(is_done, is_valid)``. ``(False, False)`` while the
        task is not done, ``(True, False)`` when it is done but output validation
        rejected it, ``(True, True)`` when it is done and accepted.
    """
    await self.step()

    if not self.state.history.is_done():
        return False, False

    if self.settings.validate_output and not await self._validate_output():
        return True, False

    # log_completion() itself awaits register_done_callback, so it must not be
    # awaited again here; doing so notified the caller twice per completed task.
    await self.log_completion()
    return True, True
# @observe(name='controller.multi_act')
@time_execution_async('--multi-act (agent)')
async def multi_act(
    self,
    actions: list[ActionModel],
    check_for_new_elements: bool = True,
) -> list[ActionResult]:
    """Execute a sequence of actions against the browser context.

    Args:
        actions: Actions to run, in order.
        check_for_new_elements: When True, stop before an index-based action
            (any but the first) if the page now contains elements whose branch
            path hash was not in the selector map captured at the start.

    Returns:
        The results collected so far, in execution order. Execution stops early
        when a result is done or has an error, or when new elements were detected
        (in which case an informational result is appended instead of running the
        action).

    Raises:
        Whatever ``_raise_if_stopped_or_paused`` raises when the agent is stopped
        or paused between actions.
    """
    results = []

    # Snapshot of the elements present before any action runs.
    cached_selector_map = await self.browser_context.get_selector_map()
    cached_path_hashes = set(e.hash.branch_path_hash for e in cached_selector_map.values())

    await self.browser_context.remove_highlights()

    for i, action in enumerate(actions):
        # Only index-based actions after the first can be invalidated by page changes.
        if action.get_index() is not None and i != 0:
            new_state = await self.browser_context.get_state()
            new_path_hashes = set(e.hash.branch_path_hash for e in new_state.selector_map.values())
            if check_for_new_elements and not new_path_hashes.issubset(cached_path_hashes):
                # next action requires index but there are new elements on the page
                msg = f'Something new appeared after action {i} / {len(actions)}'
                logger.info(msg)
                results.append(ActionResult(extracted_content=msg, include_in_memory=True))
                break

        # Checked before every action so a stop/pause request interrupts the sequence.
        await self._raise_if_stopped_or_paused()

        result = await self.controller.act(
            action,
            self.browser_context,
            self.settings.page_extraction_llm,
            self.sensitive_data,
            self.settings.available_file_paths,
            context=self.context,
        )

        results.append(result)

        logger.debug(f'Executed action {i + 1} / {len(actions)}')
        # Stop on completion, on error, or after the last action (skips the trailing sleep).
        if results[-1].is_done or results[-1].error or i == len(actions) - 1:
            break

        await asyncio.sleep(self.browser_context.config.wait_between_actions)
        # hash all elements. if it is a subset of cached_state its fine - else break (new elements on page)

    return results
async def rerun_history(
    self,
    history: AgentHistoryList,
    max_retries: int = 3,
    skip_failures: bool = True,
    delay_between_actions: float = 2.0,
) -> list[ActionResult]:
    """
    Replay a saved history step by step, retrying steps that fail.

    Args:
        history: The history to replay
        max_retries: Maximum number of attempts per step
        skip_failures: When False, a step that exhausts its attempts raises RuntimeError;
            when True the step is only logged and replay moves on
        delay_between_actions: Seconds passed to each step and slept between retries

    Returns:
        Results of all replayed steps; steps without actions contribute an error result
    """
    # Run the configured initial actions first, if any.
    if self.initial_actions:
        initial_result = await self.multi_act(self.initial_actions)
        self.state.last_result = initial_result

    results = []

    for position, item in enumerate(history.history):
        step_no = position + 1
        output = item.model_output
        goal = output.current_state.next_goal if output else ''
        logger.info(f'Replaying step {step_no}/{len(history.history)}: goal: {goal}')

        replayable = bool(output) and bool(output.action) and output.action != [None]
        if not replayable:
            logger.warning(f'Step {step_no}: No action to replay, skipping')
            results.append(ActionResult(error='No action to replay'))
            continue

        attempts = 0
        while attempts < max_retries:
            try:
                step_results = await self._execute_history_step(item, delay_between_actions)
                results.extend(step_results)
                break
            except Exception as exc:
                attempts += 1
                if attempts != max_retries:
                    logger.warning(f'Step {step_no} failed (attempt {attempts}/{max_retries}), retrying...')
                    await asyncio.sleep(delay_between_actions)
                    continue

                error_msg = f'Step {step_no} failed after {max_retries} attempts: {str(exc)}'
                logger.error(error_msg)
                if not skip_failures:
                    results.append(ActionResult(error=error_msg))
                    raise RuntimeError(error_msg)

    return results
self._update_action_indices( + history_item.state.interacted_element[i], + action, + state, + ) + updated_actions.append(updated_action) + + if updated_action is None: + raise ValueError(f'Could not find matching element {i} in current page') + + result = await self.multi_act(updated_actions) + + await asyncio.sleep(delay) + return result + + async def _update_action_indices( + self, + historical_element: Optional[DOMHistoryElement], + action: ActionModel, # Type this properly based on your action model + current_state: BrowserState, + ) -> Optional[ActionModel]: + """ + Update action indices based on current page state. + Returns updated action or None if element cannot be found. + """ + if not historical_element or not current_state.element_tree: + return action + + current_element = HistoryTreeProcessor.find_history_element_in_tree(historical_element, current_state.element_tree) + + if not current_element or current_element.highlight_index is None: + return None + + old_index = action.get_index() + if old_index != current_element.highlight_index: + action.set_index(current_element.highlight_index) + logger.info(f'Element moved in DOM, updated index from {old_index} to {current_element.highlight_index}') + + return action + + async def load_and_rerun(self, history_file: Optional[str | Path] = None, **kwargs) -> list[ActionResult]: + """ + Load history from file and rerun it. 
+ + Args: + history_file: Path to the history file + **kwargs: Additional arguments passed to rerun_history + """ + if not history_file: + history_file = 'AgentHistory.json' + history = AgentHistoryList.load_from_file(history_file, self.AgentOutput) + return await self.rerun_history(history, **kwargs) + + def save_history(self, file_path: Optional[str | Path] = None) -> None: + """Save the history to a file""" + if not file_path: + file_path = 'AgentHistory.json' + self.state.history.save_to_file(file_path) + + def pause(self) -> None: + """Pause the agent before the next step""" + logger.info('πŸ”„ pausing Agent ') + self.state.paused = True + + def resume(self) -> None: + """Resume the agent""" + logger.info('▢️ Agent resuming') + self.state.paused = False + + def stop(self) -> None: + """Stop the agent""" + logger.info('⏹️ Agent stopping') + self.state.stopped = True + + def _convert_initial_actions(self, actions: List[Dict[str, Dict[str, Any]]]) -> List[ActionModel]: + """Convert dictionary-based actions to ActionModel instances""" + converted_actions = [] + action_model = self.ActionModel + for action_dict in actions: + # Each action_dict should have a single key-value pair + action_name = next(iter(action_dict)) + params = action_dict[action_name] + + # Get the parameter model for this action from registry + action_info = self.controller.registry.registry.actions[action_name] + param_model = action_info.param_model + + # Create validated parameters using the appropriate param model + validated_params = param_model(**params) + + # Create ActionModel instance with the validated parameters + action_model = self.ActionModel(**{action_name: validated_params}) + converted_actions.append(action_model) + + return converted_actions + + async def _run_planner(self) -> Optional[str]: + """Run the planner to analyze state and suggest next steps""" + # Skip planning if no planner_llm is set + if not self.settings.planner_llm: + return None + + # Create planner message 
history using full message history + planner_messages = [ + PlannerPrompt(self.controller.registry.get_prompt_description()).get_system_message(), + *self._message_manager.get_messages()[1:], # Use full message history except the first + ] + + if not self.settings.use_vision_for_planner and self.settings.use_vision: + last_state_message: HumanMessage = planner_messages[-1] + # remove image from last state message + new_msg = '' + if isinstance(last_state_message.content, list): + for msg in last_state_message.content: + if msg['type'] == 'text': # type: ignore + new_msg += msg['text'] # type: ignore + elif msg['type'] == 'image_url': # type: ignore + continue # type: ignore + else: + new_msg = last_state_message.content + + planner_messages[-1] = HumanMessage(content=new_msg) + + planner_messages = convert_input_messages(planner_messages, self.planner_model_name) + + # Get planner output + response = await self.settings.planner_llm.ainvoke(planner_messages) + plan = str(response.content) + # if deepseek-reasoner, remove think tags + if self.planner_model_name and ('deepseek-r1' in self.planner_model_name or 'deepseek-reasoner' in self.planner_model_name): + plan = self._remove_think_tags(plan) + try: + plan_json = json.loads(plan) + logger.info(f'Planning Analysis:\n{json.dumps(plan_json, indent=4)}') + except json.JSONDecodeError: + logger.info(f'Planning Analysis:\n{plan}') + except Exception as e: + logger.debug(f'Error parsing planning analysis: {e}') + logger.info(f'Plan: {plan}') + + return plan + + @property + def message_manager(self) -> MessageManager: + return self._message_manager diff --git a/browser_use/agent/system_prompt.md b/browser_use/agent/system_prompt.md new file mode 100644 index 0000000000000000000000000000000000000000..e70ae4952fca15c61c532960d3391c6dc423ff69 --- /dev/null +++ b/browser_use/agent/system_prompt.md @@ -0,0 +1,69 @@ +You are an AI agent designed to automate browser tasks. 
Your goal is to accomplish the ultimate task following the rules. + +# Input Format +Task +Previous steps +Current URL +Open Tabs +Interactive Elements +[index]<type>text</type> +- index: Numeric identifier for interaction +- type: HTML element type (button, input, etc.) +- text: Element description +Example: +[33]<button>Submit Form</button> + +- Only elements with numeric indexes in [] are interactive +- elements without [] provide only context + +# Response Rules +1. RESPONSE FORMAT: You must ALWAYS respond with valid JSON in this exact format: +{{"current_state": {{"evaluation_previous_goal": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Mention if something unexpected happened. Shortly state why/why not", +"memory": "Description of what has been done and what you need to remember. Be very specific. Count here ALWAYS how many times you have done something and how many remain. E.g. 0 out of 10 websites analyzed. Continue with abc and xyz", +"next_goal": "What needs to be done with the next immediate action"}}, +"action":[{{"one_action_name": {{// action-specific parameter}}}}, // ... more actions in sequence]}} + +2. ACTIONS: You can specify multiple actions in the list to be executed in sequence. But always specify only one action name per item. Use maximum {{max_actions}} actions per sequence. +Common action sequences: +- Form filling: [{{"input_text": {{"index": 1, "text": "username"}}}}, {{"input_text": {{"index": 2, "text": "password"}}}}, {{"click_element": {{"index": 3}}}}] +- Navigation and extraction: [{{"go_to_url": {{"url": "https://example.com"}}}}, {{"extract_content": {{"goal": "extract the names"}}}}] +- Actions are executed in the given order +- If the page changes after an action, the sequence is interrupted and you get the new state. +- Only provide the action sequence until an action which changes the page state significantly. +- Try to be efficient, e.g.
fill forms at once, or chain actions where nothing changes on the page +- only use multiple actions if it makes sense. + +3. ELEMENT INTERACTION: +- Only use indexes of the interactive elements +- Elements marked with "[]Non-interactive text" are non-interactive + +4. NAVIGATION & ERROR HANDLING: +- If no suitable elements exist, use other functions to complete the task +- If stuck, try alternative approaches - like going back to a previous page, new search, new tab etc. +- Handle popups/cookies by accepting or closing them +- Use scroll to find elements you are looking for +- If you want to research something, open a new tab instead of using the current tab +- If captcha pops up, try to solve it - else try a different approach +- If the page is not fully loaded, use wait action + +5. TASK COMPLETION: +- Use the done action as the last action as soon as the ultimate task is complete +- Don't use "done" before you are done with everything the user asked you, unless you reach the last step of max_steps. +- If you reach your last step, use the done action even if the task is not fully finished. Provide all the information you have gathered so far. If the ultimate task is completely finished, set success to true. If not everything the user asked for is completed, set success in done to false! +- If you have to do something repeatedly, for example the task says "for each", or "for all", or "x times", always count inside "memory" how many times you have done it and how many remain. Don't stop until you have completed what the task asked you. Only call done after the last step. +- Don't hallucinate actions +- Make sure you include everything you found out for the ultimate task in the done text parameter. Do not just say you are done, but include the requested information of the task. + +6. VISUAL CONTEXT: +- When an image is provided, use it to understand the page layout +- Bounding boxes with labels on their top right corner correspond to element indexes + +7.
Form filling: +- If you fill an input field and your action sequence is interrupted, most often something changed e.g. suggestions popped up under the field. + +8. Long tasks: +- Keep track of the status and subresults in the memory. + +9. Extraction: +- If your task is to find information - call extract_content on the specific pages to get and store the information. +Your responses must be always JSON with the specified format. \ No newline at end of file diff --git a/browser_use/agent/tests.py b/browser_use/agent/tests.py new file mode 100644 index 0000000000000000000000000000000000000000..15c47357da7b549b8adb0f8e38f0e2224b994da3 --- /dev/null +++ b/browser_use/agent/tests.py @@ -0,0 +1,197 @@ +import pytest + +from browser_use.agent.views import ( + ActionResult, + AgentBrain, + AgentHistory, + AgentHistoryList, + AgentOutput, +) +from browser_use.browser.views import BrowserState, BrowserStateHistory, TabInfo +from browser_use.controller.registry.service import Registry +from browser_use.controller.views import ClickElementAction, DoneAction, ExtractPageContentAction +from browser_use.dom.views import DOMElementNode + + +@pytest.fixture +def sample_browser_state(): + return BrowserState( + url='https://example.com', + title='Example Page', + tabs=[TabInfo(url='https://example.com', title='Example Page', page_id=1)], + screenshot='screenshot1.png', + element_tree=DOMElementNode( + tag_name='root', + is_visible=True, + parent=None, + xpath='', + attributes={}, + children=[], + ), + selector_map={}, + ) + + +@pytest.fixture +def action_registry(): + registry = Registry() + + # Register the actions we need for testing + @registry.action(description='Click an element', param_model=ClickElementAction) + def click_element(params: ClickElementAction, browser=None): + pass + + @registry.action( + description='Extract page content', + param_model=ExtractPageContentAction, + ) + def extract_page_content(params: ExtractPageContentAction, browser=None): + pass + + 
@registry.action(description='Mark task as done', param_model=DoneAction) + def done(params: DoneAction): + pass + + # Create the dynamic ActionModel with all registered actions + return registry.create_action_model() + + +@pytest.fixture +def sample_history(action_registry): + # Create actions with nested params structure + click_action = action_registry(click_element={'index': 1}) + + extract_action = action_registry(extract_page_content={'value': 'text'}) + + done_action = action_registry(done={'text': 'Task completed'}) + + histories = [ + AgentHistory( + model_output=AgentOutput( + current_state=AgentBrain( + evaluation_previous_goal='None', + memory='Started task', + next_goal='Click button', + ), + action=[click_action], + ), + result=[ActionResult(is_done=False)], + state=BrowserStateHistory( + url='https://example.com', + title='Page 1', + tabs=[TabInfo(url='https://example.com', title='Page 1', page_id=1)], + screenshot='screenshot1.png', + interacted_element=[{'xpath': '//button[1]'}], + ), + ), + AgentHistory( + model_output=AgentOutput( + current_state=AgentBrain( + evaluation_previous_goal='Clicked button', + memory='Button clicked', + next_goal='Extract content', + ), + action=[extract_action], + ), + result=[ + ActionResult( + is_done=False, + extracted_content='Extracted text', + error='Failed to extract completely', + ) + ], + state=BrowserStateHistory( + url='https://example.com/page2', + title='Page 2', + tabs=[TabInfo(url='https://example.com/page2', title='Page 2', page_id=2)], + screenshot='screenshot2.png', + interacted_element=[{'xpath': '//div[1]'}], + ), + ), + AgentHistory( + model_output=AgentOutput( + current_state=AgentBrain( + evaluation_previous_goal='Extracted content', + memory='Content extracted', + next_goal='Finish task', + ), + action=[done_action], + ), + result=[ActionResult(is_done=True, extracted_content='Task completed', error=None)], + state=BrowserStateHistory( + url='https://example.com/page2', + title='Page 2', + 
tabs=[TabInfo(url='https://example.com/page2', title='Page 2', page_id=2)], + screenshot='screenshot3.png', + interacted_element=[{'xpath': '//div[1]'}], + ), + ), + ] + return AgentHistoryList(history=histories) + + +def test_last_model_output(sample_history: AgentHistoryList): + last_output = sample_history.last_action() + print(last_output) + assert last_output == {'done': {'text': 'Task completed'}} + + +def test_get_errors(sample_history: AgentHistoryList): + errors = sample_history.errors() + assert len(errors) == 1 + assert errors[0] == 'Failed to extract completely' + + +def test_final_result(sample_history: AgentHistoryList): + assert sample_history.final_result() == 'Task completed' + + +def test_is_done(sample_history: AgentHistoryList): + assert sample_history.is_done() == True + + +def test_urls(sample_history: AgentHistoryList): + urls = sample_history.urls() + assert 'https://example.com' in urls + assert 'https://example.com/page2' in urls + + +def test_all_screenshots(sample_history: AgentHistoryList): + screenshots = sample_history.screenshots() + assert len(screenshots) == 3 + assert screenshots == ['screenshot1.png', 'screenshot2.png', 'screenshot3.png'] + + +def test_all_model_outputs(sample_history: AgentHistoryList): + outputs = sample_history.model_actions() + print(f'DEBUG: {outputs[0]}') + assert len(outputs) == 3 + # get first key value pair + assert dict([next(iter(outputs[0].items()))]) == {'click_element': {'index': 1}} + assert dict([next(iter(outputs[1].items()))]) == {'extract_page_content': {'value': 'text'}} + assert dict([next(iter(outputs[2].items()))]) == {'done': {'text': 'Task completed'}} + + +def test_all_model_outputs_filtered(sample_history: AgentHistoryList): + filtered = sample_history.model_actions_filtered(include=['click_element']) + assert len(filtered) == 1 + assert filtered[0]['click_element']['index'] == 1 + + +def test_empty_history(): + empty_history = AgentHistoryList(history=[]) + assert 
empty_history.last_action() is None + assert empty_history.final_result() is None + assert empty_history.is_done() == False + assert len(empty_history.urls()) == 0 + + +# Add a test to verify action creation +def test_action_creation(action_registry): + click_action = action_registry(click_element={'index': 1}) + + assert click_action.model_dump(exclude_none=True) == {'click_element': {'index': 1}} + + +# run this with: +# pytest browser_use/agent/tests.py diff --git a/browser_use/agent/views.py b/browser_use/agent/views.py new file mode 100644 index 0000000000000000000000000000000000000000..6d8249727b90e10c71c2b2c433fa9a504fc1ae52 --- /dev/null +++ b/browser_use/agent/views.py @@ -0,0 +1,393 @@ +from __future__ import annotations + +import json +import traceback +import uuid +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, List, Literal, Optional, Type + +from langchain_core.language_models.chat_models import BaseChatModel +from openai import RateLimitError +from pydantic import BaseModel, ConfigDict, Field, ValidationError, create_model + +from browser_use.agent.message_manager.views import MessageManagerState +from browser_use.browser.views import BrowserStateHistory +from browser_use.controller.registry.views import ActionModel +from browser_use.dom.history_tree_processor.service import ( + DOMElementNode, + DOMHistoryElement, + HistoryTreeProcessor, +) +from browser_use.dom.views import SelectorMap + +ToolCallingMethod = Literal['function_calling', 'json_mode', 'raw', 'auto'] + + +class AgentSettings(BaseModel): + """Options for the agent""" + + use_vision: bool = True + use_vision_for_planner: bool = False + save_conversation_path: Optional[str] = None + save_conversation_path_encoding: Optional[str] = 'utf-8' + max_failures: int = 3 + retry_delay: int = 10 + max_input_tokens: int = 128000 + validate_output: bool = False + message_context: Optional[str] = None + generate_gif: bool | str = False + 
available_file_paths: Optional[list[str]] = None + override_system_message: Optional[str] = None + extend_system_message: Optional[str] = None + include_attributes: list[str] = [ + 'title', + 'type', + 'name', + 'role', + 'tabindex', + 'aria-label', + 'placeholder', + 'value', + 'alt', + 'aria-expanded', + ] + max_actions_per_step: int = 10 + + tool_calling_method: Optional[ToolCallingMethod] = 'auto' + page_extraction_llm: Optional[BaseChatModel] = None + planner_llm: Optional[BaseChatModel] = None + planner_interval: int = 1 # Run planner every N steps + + +class AgentState(BaseModel): + """Holds all state information for an Agent""" + + agent_id: str = Field(default_factory=lambda: str(uuid.uuid4())) + n_steps: int = 1 + consecutive_failures: int = 0 + last_result: Optional[List['ActionResult']] = None + history: AgentHistoryList = Field(default_factory=lambda: AgentHistoryList(history=[])) + last_plan: Optional[str] = None + paused: bool = False + stopped: bool = False + + message_manager_state: MessageManagerState = Field(default_factory=MessageManagerState) + + # class Config: + # arbitrary_types_allowed = True + + +@dataclass +class AgentStepInfo: + step_number: int + max_steps: int + + def is_last_step(self) -> bool: + """Check if this is the last step""" + return self.step_number >= self.max_steps - 1 + + +class ActionResult(BaseModel): + """Result of executing an action""" + + is_done: Optional[bool] = False + success: Optional[bool] = None + extracted_content: Optional[str] = None + error: Optional[str] = None + include_in_memory: bool = False # whether to include in past messages as context or not + + +class StepMetadata(BaseModel): + """Metadata for a single step including timing and token information""" + + step_start_time: float + step_end_time: float + input_tokens: int # Approximate tokens from message manager for this step + step_number: int + + @property + def duration_seconds(self) -> float: + """Calculate step duration in seconds""" + return 
self.step_end_time - self.step_start_time + + +class AgentBrain(BaseModel): + """Current state of the agent""" + + evaluation_previous_goal: str + memory: str + next_goal: str + + +class AgentOutput(BaseModel): + """Output model for agent + + @dev note: this model is extended with custom actions in AgentService. You can also use some fields that are not in this model as provided by the linter, as long as they are registered in the DynamicActions model. + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + + current_state: AgentBrain + action: list[ActionModel] = Field( + ..., + description='List of actions to execute', + json_schema_extra={'min_items': 1}, # Ensure at least one action is provided + ) + + @staticmethod + def type_with_custom_actions(custom_actions: Type[ActionModel]) -> Type['AgentOutput']: + """Extend actions with custom actions""" + model_ = create_model( + 'AgentOutput', + __base__=AgentOutput, + action=( + list[custom_actions], + Field(..., description='List of actions to execute', json_schema_extra={'min_items': 1}), + ), + __module__=AgentOutput.__module__, + ) + model_.__doc__ = 'AgentOutput model with custom actions' + return model_ + + +class AgentHistory(BaseModel): + """History item for agent actions""" + + model_output: AgentOutput | None + result: list[ActionResult] + state: BrowserStateHistory + metadata: Optional[StepMetadata] = None + + model_config = ConfigDict(arbitrary_types_allowed=True, protected_namespaces=()) + + @staticmethod + def get_interacted_element(model_output: AgentOutput, selector_map: SelectorMap) -> list[DOMHistoryElement | None]: + elements = [] + for action in model_output.action: + index = action.get_index() + if index and index in selector_map: + el: DOMElementNode = selector_map[index] + elements.append(HistoryTreeProcessor.convert_dom_element_to_history_element(el)) + else: + elements.append(None) + return elements + + def model_dump(self, **kwargs) -> Dict[str, Any]: + """Custom serialization 
handling circular references""" + + # Handle action serialization + model_output_dump = None + if self.model_output: + action_dump = [action.model_dump(exclude_none=True) for action in self.model_output.action] + model_output_dump = { + 'current_state': self.model_output.current_state.model_dump(), + 'action': action_dump, # This preserves the actual action data + } + + return { + 'model_output': model_output_dump, + 'result': [r.model_dump(exclude_none=True) for r in self.result], + 'state': self.state.to_dict(), + 'metadata': self.metadata.model_dump() if self.metadata else None, + } + + +class AgentHistoryList(BaseModel): + """List of agent history items""" + + history: list[AgentHistory] + + def total_duration_seconds(self) -> float: + """Get total duration of all steps in seconds""" + total = 0.0 + for h in self.history: + if h.metadata: + total += h.metadata.duration_seconds + return total + + def total_input_tokens(self) -> int: + """ + Get total tokens used across all steps. + Note: These are from the approximate token counting of the message manager. + For accurate token counting, use tools like LangChain Smith or OpenAI's token counters. 
+ """ + total = 0 + for h in self.history: + if h.metadata: + total += h.metadata.input_tokens + return total + + def input_token_usage(self) -> list[int]: + """Get token usage for each step""" + return [h.metadata.input_tokens for h in self.history if h.metadata] + + def __str__(self) -> str: + """Representation of the AgentHistoryList object""" + return f'AgentHistoryList(all_results={self.action_results()}, all_model_outputs={self.model_actions()})' + + def __repr__(self) -> str: + """Representation of the AgentHistoryList object""" + return self.__str__() + + def save_to_file(self, filepath: str | Path) -> None: + """Save history to JSON file with proper serialization""" + try: + Path(filepath).parent.mkdir(parents=True, exist_ok=True) + data = self.model_dump() + with open(filepath, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2) + except Exception as e: + raise e + + def model_dump(self, **kwargs) -> Dict[str, Any]: + """Custom serialization that properly uses AgentHistory's model_dump""" + return { + 'history': [h.model_dump(**kwargs) for h in self.history], + } + + @classmethod + def load_from_file(cls, filepath: str | Path, output_model: Type[AgentOutput]) -> 'AgentHistoryList': + """Load history from JSON file""" + with open(filepath, 'r', encoding='utf-8') as f: + data = json.load(f) + # loop through history and validate output_model actions to enrich with custom actions + for h in data['history']: + if h['model_output']: + if isinstance(h['model_output'], dict): + h['model_output'] = output_model.model_validate(h['model_output']) + else: + h['model_output'] = None + if 'interacted_element' not in h['state']: + h['state']['interacted_element'] = None + history = cls.model_validate(data) + return history + + def last_action(self) -> None | dict: + """Last action in history""" + if self.history and self.history[-1].model_output: + return self.history[-1].model_output.action[-1].model_dump(exclude_none=True) + return None + + def errors(self) -> 
list[str | None]: + """Get all errors from history, with None for steps without errors""" + errors = [] + for h in self.history: + step_errors = [r.error for r in h.result if r.error] + + # each step can have only one error + errors.append(step_errors[0] if step_errors else None) + return errors + + def final_result(self) -> None | str: + """Final result from history""" + if self.history and self.history[-1].result[-1].extracted_content: + return self.history[-1].result[-1].extracted_content + return None + + def is_done(self) -> bool: + """Check if the agent is done""" + if self.history and len(self.history[-1].result) > 0: + last_result = self.history[-1].result[-1] + return last_result.is_done is True + return False + + def is_successful(self) -> bool | None: + """Check if the agent completed successfully - the agent decides in the last step if it was successful or not. None if not done yet.""" + if self.history and len(self.history[-1].result) > 0: + last_result = self.history[-1].result[-1] + if last_result.is_done is True: + return last_result.success + return None + + def has_errors(self) -> bool: + """Check if the agent has any non-None errors""" + return any(error is not None for error in self.errors()) + + def urls(self) -> list[str | None]: + """Get all unique URLs from history""" + return [h.state.url if h.state.url is not None else None for h in self.history] + + def screenshots(self) -> list[str | None]: + """Get all screenshots from history""" + return [h.state.screenshot if h.state.screenshot is not None else None for h in self.history] + + def action_names(self) -> list[str]: + """Get all action names from history""" + action_names = [] + for action in self.model_actions(): + actions = list(action.keys()) + if actions: + action_names.append(actions[0]) + return action_names + + def model_thoughts(self) -> list[AgentBrain]: + """Get all thoughts from history""" + return [h.model_output.current_state for h in self.history if h.model_output] + + def 
model_outputs(self) -> list[AgentOutput]: + """Get all model outputs from history""" + return [h.model_output for h in self.history if h.model_output] + + # get all actions with params + def model_actions(self) -> list[dict]: + """Get all actions from history""" + outputs = [] + + for h in self.history: + if h.model_output: + for action, interacted_element in zip(h.model_output.action, h.state.interacted_element): + output = action.model_dump(exclude_none=True) + output['interacted_element'] = interacted_element + outputs.append(output) + return outputs + + def action_results(self) -> list[ActionResult]: + """Get all results from history""" + results = [] + for h in self.history: + results.extend([r for r in h.result if r]) + return results + + def extracted_content(self) -> list[str]: + """Get all extracted content from history""" + content = [] + for h in self.history: + content.extend([r.extracted_content for r in h.result if r.extracted_content]) + return content + + def model_actions_filtered(self, include: list[str] | None = None) -> list[dict]: + """Get all model actions from history as JSON""" + if include is None: + include = [] + outputs = self.model_actions() + result = [] + for o in outputs: + for i in include: + if i == list(o.keys())[0]: + result.append(o) + return result + + def number_of_steps(self) -> int: + """Get the number of steps in the history""" + return len(self.history) + + +class AgentError: + """Container for agent error handling""" + + VALIDATION_ERROR = 'Invalid model output format. Please follow the correct schema.' + RATE_LIMIT_ERROR = 'Rate limit reached. Waiting before retry.' 
+ NO_VALID_ACTION = 'No valid action found' + + @staticmethod + def format_error(error: Exception, include_trace: bool = False) -> str: + """Format error message based on error type and optionally include trace""" + message = '' + if isinstance(error, ValidationError): + return f'{AgentError.VALIDATION_ERROR}\nDetails: {str(error)}' + if isinstance(error, RateLimitError): + return AgentError.RATE_LIMIT_ERROR + if include_trace: + return f'{str(error)}\nStacktrace:\n{traceback.format_exc()}' + return f'{str(error)}' diff --git a/browser_use/browser/browser.py b/browser_use/browser/browser.py new file mode 100644 index 0000000000000000000000000000000000000000..9278ac34c9dffb7937897dab3af1c6eefe416226 --- /dev/null +++ b/browser_use/browser/browser.py @@ -0,0 +1,253 @@ +""" +Playwright browser on steroids. +""" + +import asyncio +import gc +import logging +from dataclasses import dataclass, field + +from playwright._impl._api_structures import ProxySettings +from playwright.async_api import Browser as PlaywrightBrowser +from playwright.async_api import ( + Playwright, + async_playwright, +) + +from browser_use.browser.context import BrowserContext, BrowserContextConfig +from browser_use.utils import time_execution_async + +logger = logging.getLogger(__name__) + + +@dataclass +class BrowserConfig: + r""" + Configuration for the Browser. + + Default values: + headless: True + Whether to run browser in headless mode + + disable_security: True + Disable browser security features + + extra_chromium_args: [] + Extra arguments to pass to the browser + + wss_url: None + Connect to a browser instance via WebSocket + + cdp_url: None + Connect to a browser instance via CDP + + chrome_instance_path: None + Path to a Chrome instance to use to connect to your normal browser + e.g. 
'/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome' + """ + + headless: bool = False + disable_security: bool = True + extra_chromium_args: list[str] = field(default_factory=list) + chrome_instance_path: str | None = None + wss_url: str | None = None + cdp_url: str | None = None + + proxy: ProxySettings | None = field(default=None) + new_context_config: BrowserContextConfig = field(default_factory=BrowserContextConfig) + + _force_keep_browser_alive: bool = False + + +# @singleton: TODO - think about id singleton makes sense here +# @dev By default this is a singleton, but you can create multiple instances if you need to. +class Browser: + """ + Playwright browser on steroids. + + This is persistant browser factory that can spawn multiple browser contexts. + It is recommended to use only one instance of Browser per your application (RAM usage will grow otherwise). + """ + + def __init__( + self, + config: BrowserConfig = BrowserConfig(), + ): + logger.debug('Initializing new browser') + self.config = config + self.playwright: Playwright | None = None + self.playwright_browser: PlaywrightBrowser | None = None + + self.disable_security_args = [] + if self.config.disable_security: + self.disable_security_args = [ + '--disable-web-security', + '--disable-site-isolation-trials', + '--disable-features=IsolateOrigins,site-per-process', + ] + + async def new_context(self, config: BrowserContextConfig = BrowserContextConfig()) -> BrowserContext: + """Create a browser context""" + return BrowserContext(config=config, browser=self) + + async def get_playwright_browser(self) -> PlaywrightBrowser: + """Get a browser context""" + if self.playwright_browser is None: + return await self._init() + + return self.playwright_browser + + @time_execution_async('--init (browser)') + async def _init(self): + """Initialize the browser session""" + playwright = await async_playwright().start() + browser = await self._setup_browser(playwright) + + self.playwright = playwright + 
async def _setup_browser_with_instance(self, playwright: Playwright) -> PlaywrightBrowser:
    """Connect to the user's own Chrome binary over its remote-debugging port.

    If something already answers on ``http://localhost:9222`` it is reused.
    Otherwise the executable at ``config.chrome_instance_path`` is launched with
    ``--remote-debugging-port=9222`` (plus ``config.extra_chromium_args``) and
    the endpoint is polled for up to ten attempts before connecting over CDP.

    Args:
        playwright: Started Playwright driver used to open the CDP connection.

    Returns:
        The Playwright browser connected to the Chrome instance.

    Raises:
        ValueError: ``config.chrome_instance_path`` is not set.
        RuntimeError: The CDP connection to the freshly started Chrome failed.
    """
    if not self.config.chrome_instance_path:
        raise ValueError('Chrome instance path is required')
    import subprocess

    import requests

    debug_endpoint = 'http://localhost:9222'

    def _endpoint_is_up() -> bool:
        """Return True when the DevTools HTTP endpoint answers with 200."""
        try:
            response = requests.get(f'{debug_endpoint}/json/version', timeout=2)
        except requests.RequestException:
            # Refused connections *and* timeouts both mean "not usable yet".
            return False
        return response.status_code == 200

    # requests is blocking; run the probe in a worker thread so the event loop
    # keeps servicing other tasks while we wait.
    if await asyncio.to_thread(_endpoint_is_up):
        logger.info('Reusing existing Chrome instance')
        return await playwright.chromium.connect_over_cdp(
            endpoint_url=debug_endpoint,
            timeout=20000,  # 20 second timeout for connection
        )

    logger.debug('No existing Chrome instance found, starting a new one')

    # Start a new Chrome instance
    subprocess.Popen(
        [
            self.config.chrome_instance_path,
            '--remote-debugging-port=9222',
        ]
        + self.config.extra_chromium_args,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    # Give the new process up to ten probes (one second apart) to open the port.
    for _ in range(10):
        if await asyncio.to_thread(_endpoint_is_up):
            break
        await asyncio.sleep(1)

    # Connect even if the probes never succeeded; the failure path below turns
    # that into an actionable error for the user.
    try:
        return await playwright.chromium.connect_over_cdp(
            endpoint_url=debug_endpoint,
            timeout=20000,  # 20 second timeout for connection
        )
    except Exception as e:
        logger.error(f'Failed to start a new Chrome instance.: {str(e)}')
        raise RuntimeError(
            ' To start chrome in Debug mode, you need to close all existing Chrome instances and try again otherwise we can not connect to the instance.'
        ) from e
def __del__(self):
    """Best-effort cleanup of the browser when the object is garbage-collected.

    If an event loop is running in the current thread, ``close()`` is scheduled
    on it; otherwise a temporary loop is created with ``asyncio.run``. Any
    failure is logged at debug level and never propagated, because exceptions
    escaping a destructor are only printed as warnings by the interpreter.
    """
    try:
        # getattr: __init__ may have raised before these attributes existed.
        if not (getattr(self, 'playwright_browser', None) or getattr(self, 'playwright', None)):
            return

        try:
            # Raises RuntimeError (rather than returning a stopped loop) when
            # no loop is running, so the fallback must live in the except arm.
            loop = asyncio.get_running_loop()
        except RuntimeError:
            asyncio.run(self.close())
        else:
            loop.create_task(self.close())
    except Exception as e:
        logger.debug(f'Failed to cleanup browser in destructor: {e}')
+""" + +import asyncio +import base64 +import gc +import json +import logging +import os +import re +import time +import uuid +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Optional, TypedDict + +from playwright._impl._errors import TimeoutError +from playwright.async_api import Browser as PlaywrightBrowser +from playwright.async_api import ( + BrowserContext as PlaywrightBrowserContext, +) +from playwright.async_api import ( + ElementHandle, + FrameLocator, + Page, +) + +from browser_use.browser.views import ( + BrowserError, + BrowserState, + TabInfo, + URLNotAllowedError, +) +from browser_use.dom.service import DomService +from browser_use.dom.views import DOMElementNode, SelectorMap +from browser_use.utils import time_execution_async, time_execution_sync + +if TYPE_CHECKING: + from browser_use.browser.browser import Browser + +logger = logging.getLogger(__name__) + + +class BrowserContextWindowSize(TypedDict): + width: int + height: int + + +@dataclass +class BrowserContextConfig: + """ + Configuration for the BrowserContext. + + Default values: + cookies_file: None + Path to cookies file for persistence + + disable_security: True + Disable browser security features + + minimum_wait_page_load_time: 0.5 + Minimum time to wait before getting page state for LLM input + + wait_for_network_idle_page_load_time: 1.0 + Time to wait for network requests to finish before getting page state. + Lower values may result in incomplete page loads. + + maximum_wait_page_load_time: 5.0 + Maximum time to wait for page load before proceeding anyway + + wait_between_actions: 1.0 + Time to wait between multiple per step actions + + browser_window_size: { + 'width': 1280, + 'height': 1100, + } + Default browser window size + + no_viewport: False + Disable viewport + + save_recording_path: None + Path to save video recordings + + save_downloads_path: None + Path to save downloads to + + trace_path: None + Path to save trace files. 
It will auto name the file with the TRACE_PATH/{context_id}.zip + + locale: None + Specify user locale, for example en-GB, de-DE, etc. Locale will affect navigator.language value, Accept-Language request header value as well as number and date formatting rules. If not provided, defaults to the system default locale. + + user_agent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36' + custom user agent to use. + + highlight_elements: True + Highlight elements in the DOM on the screen + + viewport_expansion: 500 + Viewport expansion in pixels. This amount will increase the number of elements which are included in the state what the LLM will see. If set to -1, all elements will be included (this leads to high token usage). If set to 0, only the elements which are visible in the viewport will be included. + + allowed_domains: None + List of allowed domains that can be accessed. If None, all domains are allowed. + Example: ['example.com', 'api.example.com'] + + include_dynamic_attributes: bool = True + Include dynamic attributes in the CSS selector. If you want to reuse the css_selectors, it might be better to set this to False. 
+ """ + + cookies_file: str | None = None + minimum_wait_page_load_time: float = 0.25 + wait_for_network_idle_page_load_time: float = 0.5 + maximum_wait_page_load_time: float = 5 + wait_between_actions: float = 0.5 + + disable_security: bool = True + + browser_window_size: BrowserContextWindowSize = field(default_factory=lambda: {'width': 1280, 'height': 1100}) + no_viewport: Optional[bool] = None + + save_recording_path: str | None = None + save_downloads_path: str | None = None + trace_path: str | None = None + locale: str | None = None + user_agent: str = ( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36' + ) + + highlight_elements: bool = True + viewport_expansion: int = 500 + allowed_domains: list[str] | None = None + include_dynamic_attributes: bool = True + + _force_keep_context_alive: bool = False + + +@dataclass +class BrowserSession: + context: PlaywrightBrowserContext + cached_state: BrowserState | None + + +@dataclass +class BrowserContextState: + """ + State of the browser context + """ + + target_id: str | None = None # CDP target ID + + +class BrowserContext: + def __init__( + self, + browser: 'Browser', + config: BrowserContextConfig = BrowserContextConfig(), + state: Optional[BrowserContextState] = None, + ): + self.context_id = str(uuid.uuid4()) + logger.debug(f'Initializing new browser context with id: {self.context_id}') + + self.config = config + self.browser = browser + + self.state = state or BrowserContextState() + + # Initialize these as None - they'll be set up when needed + self.session: BrowserSession | None = None + + async def __aenter__(self): + """Async context manager entry""" + await self._initialize_session() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + """Async context manager exit""" + await self.close() + + @time_execution_async('--close') + async def close(self): + """Close the browser instance""" + logger.debug('Closing browser 
def __del__(self):
    """Last-resort cleanup if the context was never closed explicitly.

    Does nothing when the session is already gone or when
    ``config._force_keep_context_alive`` is set. Otherwise the underlying
    Playwright context is closed: scheduled on the running event loop when
    there is one, or driven to completion with ``asyncio.run`` when there is
    not. Failures are logged as warnings and never raised.
    """
    # __init__ may have failed before these attributes were assigned.
    config = getattr(self, 'config', None)
    session = getattr(self, 'session', None)
    if config is None or session is None or config._force_keep_context_alive:
        return

    logger.debug('BrowserContext was not properly closed before destruction')
    try:
        if hasattr(session.context, '_impl_obj'):
            # NOTE(review): _impl_obj is Playwright's private implementation
            # object; its close() is treated as a coroutine here, as the
            # original asyncio.run(...) call did.
            close_coro = session.context._impl_obj.close()
            try:
                loop = asyncio.get_running_loop()
            except RuntimeError:
                # No loop in this thread: safe to spin up a temporary one.
                asyncio.run(close_coro)
            else:
                # asyncio.run() refuses to start inside a running loop.
                loop.create_task(close_coro)

        self.session = None
        gc.collect()
    except Exception as e:
        logger.warning(f'Failed to force close browser context: {e}')
# If we have a saved target ID, try to find and activate it + if self.state.target_id: + targets = await self._get_cdp_targets() + for target in targets: + if target['targetId'] == self.state.target_id: + # Find matching page by URL + for page in pages: + if page.url == target['url']: + active_page = page + break + break + + # If no target ID or couldn't find it, use existing page or create new + if not active_page: + if pages: + active_page = pages[0] + logger.debug('Using existing page') + else: + active_page = await context.new_page() + logger.debug('Created new page') + + # Get target ID for the active page + if self.browser.config.cdp_url: + targets = await self._get_cdp_targets() + for target in targets: + if target['url'] == active_page.url: + self.state.target_id = target['targetId'] + break + + # Bring page to front + await active_page.bring_to_front() + await active_page.wait_for_load_state('load') + + return self.session + + def _add_new_page_listener(self, context: PlaywrightBrowserContext): + async def on_page(page: Page): + if self.browser.config.cdp_url: + await page.reload() # Reload the page to avoid timeout errors + await page.wait_for_load_state() + logger.debug(f'New page opened: {page.url}') + if self.session is not None: + self.state.target_id = None + + self._page_event_handler = on_page + context.on('page', on_page) + + async def get_session(self) -> BrowserSession: + """Lazy initialization of the browser and related components""" + if self.session is None: + return await self._initialize_session() + return self.session + + async def get_current_page(self) -> Page: + """Get the current page""" + session = await self.get_session() + return await self._get_current_page(session) + + async def _create_context(self, browser: PlaywrightBrowser): + """Creates a new browser context with anti-detection measures and loads cookies if available.""" + if self.browser.config.cdp_url and len(browser.contexts) > 0: + context = browser.contexts[0] + elif 
self.browser.config.chrome_instance_path and len(browser.contexts) > 0: + # Connect to existing Chrome instance instead of creating new one + context = browser.contexts[0] + else: + # Original code for creating new context + context = await browser.new_context( + viewport=self.config.browser_window_size, + no_viewport=False, + user_agent=self.config.user_agent, + java_script_enabled=True, + bypass_csp=self.config.disable_security, + ignore_https_errors=self.config.disable_security, + record_video_dir=self.config.save_recording_path, + record_video_size=self.config.browser_window_size, + locale=self.config.locale, + ) + + if self.config.trace_path: + await context.tracing.start(screenshots=True, snapshots=True, sources=True) + + # Load cookies if they exist + if self.config.cookies_file and os.path.exists(self.config.cookies_file): + with open(self.config.cookies_file, 'r') as f: + cookies = json.load(f) + logger.info(f'Loaded {len(cookies)} cookies from {self.config.cookies_file}') + await context.add_cookies(cookies) + + # Expose anti-detection scripts + await context.add_init_script( + """ + // Webdriver property + Object.defineProperty(navigator, 'webdriver', { + get: () => undefined + }); + + // Languages + Object.defineProperty(navigator, 'languages', { + get: () => ['en-US'] + }); + + // Plugins + Object.defineProperty(navigator, 'plugins', { + get: () => [1, 2, 3, 4, 5] + }); + + // Chrome runtime + window.chrome = { runtime: {} }; + + // Permissions + const originalQuery = window.navigator.permissions.query; + window.navigator.permissions.query = (parameters) => ( + parameters.name === 'notifications' ? 
+ Promise.resolve({ state: Notification.permission }) : + originalQuery(parameters) + ); + (function () { + const originalAttachShadow = Element.prototype.attachShadow; + Element.prototype.attachShadow = function attachShadow(options) { + return originalAttachShadow.call(this, { ...options, mode: "open" }); + }; + })(); + """ + ) + + return context + + async def _wait_for_stable_network(self): + page = await self.get_current_page() + + pending_requests = set() + last_activity = asyncio.get_event_loop().time() + + # Define relevant resource types and content types + RELEVANT_RESOURCE_TYPES = { + 'document', + 'stylesheet', + 'image', + 'font', + 'script', + 'iframe', + } + + RELEVANT_CONTENT_TYPES = { + 'text/html', + 'text/css', + 'application/javascript', + 'image/', + 'font/', + 'application/json', + } + + # Additional patterns to filter out + IGNORED_URL_PATTERNS = { + # Analytics and tracking + 'analytics', + 'tracking', + 'telemetry', + 'beacon', + 'metrics', + # Ad-related + 'doubleclick', + 'adsystem', + 'adserver', + 'advertising', + # Social media widgets + 'facebook.com/plugins', + 'platform.twitter', + 'linkedin.com/embed', + # Live chat and support + 'livechat', + 'zendesk', + 'intercom', + 'crisp.chat', + 'hotjar', + # Push notifications + 'push-notifications', + 'onesignal', + 'pushwoosh', + # Background sync/heartbeat + 'heartbeat', + 'ping', + 'alive', + # WebRTC and streaming + 'webrtc', + 'rtmp://', + 'wss://', + # Common CDNs for dynamic content + 'cloudfront.net', + 'fastly.net', + } + + async def on_request(request): + # Filter by resource type + if request.resource_type not in RELEVANT_RESOURCE_TYPES: + return + + # Filter out streaming, websocket, and other real-time requests + if request.resource_type in { + 'websocket', + 'media', + 'eventsource', + 'manifest', + 'other', + }: + return + + # Filter out by URL patterns + url = request.url.lower() + if any(pattern in url for pattern in IGNORED_URL_PATTERNS): + return + + # Filter out data 
URLs and blob URLs + if url.startswith(('data:', 'blob:')): + return + + # Filter out requests with certain headers + headers = request.headers + if headers.get('purpose') == 'prefetch' or headers.get('sec-fetch-dest') in [ + 'video', + 'audio', + ]: + return + + nonlocal last_activity + pending_requests.add(request) + last_activity = asyncio.get_event_loop().time() + # logger.debug(f'Request started: {request.url} ({request.resource_type})') + + async def on_response(response): + request = response.request + if request not in pending_requests: + return + + # Filter by content type if available + content_type = response.headers.get('content-type', '').lower() + + # Skip if content type indicates streaming or real-time data + if any( + t in content_type + for t in [ + 'streaming', + 'video', + 'audio', + 'webm', + 'mp4', + 'event-stream', + 'websocket', + 'protobuf', + ] + ): + pending_requests.remove(request) + return + + # Only process relevant content types + if not any(ct in content_type for ct in RELEVANT_CONTENT_TYPES): + pending_requests.remove(request) + return + + # Skip if response is too large (likely not essential for page load) + content_length = response.headers.get('content-length') + if content_length and int(content_length) > 5 * 1024 * 1024: # 5MB + pending_requests.remove(request) + return + + nonlocal last_activity + pending_requests.remove(request) + last_activity = asyncio.get_event_loop().time() + # logger.debug(f'Request resolved: {request.url} ({content_type})') + + # Attach event listeners + page.on('request', on_request) + page.on('response', on_response) + + try: + # Wait for idle time + start_time = asyncio.get_event_loop().time() + while True: + await asyncio.sleep(0.1) + now = asyncio.get_event_loop().time() + if len(pending_requests) == 0 and (now - last_activity) >= self.config.wait_for_network_idle_page_load_time: + break + if now - start_time > self.config.maximum_wait_page_load_time: + logger.debug( + f'Network timeout after 
{self.config.maximum_wait_page_load_time}s with {len(pending_requests)} ' + f'pending requests: {[r.url for r in pending_requests]}' + ) + break + + finally: + # Clean up event listeners + page.remove_listener('request', on_request) + page.remove_listener('response', on_response) + + logger.debug(f'Network stabilized for {self.config.wait_for_network_idle_page_load_time} seconds') + + async def _wait_for_page_and_frames_load(self, timeout_overwrite: float | None = None): + """ + Ensures page is fully loaded before continuing. + Waits for either network to be idle or minimum WAIT_TIME, whichever is longer. + Also checks if the loaded URL is allowed. + """ + # Start timing + start_time = time.time() + + # Wait for page load + try: + await self._wait_for_stable_network() + + # Check if the loaded URL is allowed + page = await self.get_current_page() + await self._check_and_handle_navigation(page) + except URLNotAllowedError as e: + raise e + except Exception: + logger.warning('Page load failed, continuing...') + pass + + # Calculate remaining time to meet minimum WAIT_TIME + elapsed = time.time() - start_time + remaining = max((timeout_overwrite or self.config.minimum_wait_page_load_time) - elapsed, 0) + + logger.debug(f'--Page loaded in {elapsed:.2f} seconds, waiting for additional {remaining:.2f} seconds') + + # Sleep remaining time if needed + if remaining > 0: + await asyncio.sleep(remaining) + + def _is_url_allowed(self, url: str) -> bool: + """Check if a URL is allowed based on the whitelist configuration.""" + if not self.config.allowed_domains: + return True + + try: + from urllib.parse import urlparse + + parsed_url = urlparse(url) + domain = parsed_url.netloc.lower() + + # Remove port number if present + if ':' in domain: + domain = domain.split(':')[0] + + # Check if domain matches any allowed domain pattern + return any( + domain == allowed_domain.lower() or domain.endswith('.' 
+ allowed_domain.lower()) + for allowed_domain in self.config.allowed_domains + ) + except Exception as e: + logger.error(f'Error checking URL allowlist: {str(e)}') + return False + + async def _check_and_handle_navigation(self, page: Page) -> None: + """Check if current page URL is allowed and handle if not.""" + if not self._is_url_allowed(page.url): + logger.warning(f'Navigation to non-allowed URL detected: {page.url}') + try: + await self.go_back() + except Exception as e: + logger.error(f'Failed to go back after detecting non-allowed URL: {str(e)}') + raise URLNotAllowedError(f'Navigation to non-allowed URL: {page.url}') + + async def navigate_to(self, url: str): + """Navigate to a URL""" + if not self._is_url_allowed(url): + raise BrowserError(f'Navigation to non-allowed URL: {url}') + + page = await self.get_current_page() + await page.goto(url) + await page.wait_for_load_state() + + async def refresh_page(self): + """Refresh the current page""" + page = await self.get_current_page() + await page.reload() + await page.wait_for_load_state() + + async def go_back(self): + """Navigate back in history""" + page = await self.get_current_page() + try: + # 10 ms timeout + await page.go_back(timeout=10, wait_until='domcontentloaded') + # await self._wait_for_page_and_frames_load(timeout_overwrite=1.0) + except Exception as e: + # Continue even if its not fully loaded, because we wait later for the page to load + logger.debug(f'During go_back: {e}') + + async def go_forward(self): + """Navigate forward in history""" + page = await self.get_current_page() + try: + await page.go_forward(timeout=10, wait_until='domcontentloaded') + except Exception as e: + # Continue even if its not fully loaded, because we wait later for the page to load + logger.debug(f'During go_forward: {e}') + + async def close_current_tab(self): + """Close the current tab""" + session = await self.get_session() + page = await self._get_current_page(session) + await page.close() + + # Switch to the 
first available tab if any exist + if session.context.pages: + await self.switch_to_tab(0) + + # otherwise the browser will be closed + + async def get_page_html(self) -> str: + """Get the current page HTML content""" + page = await self.get_current_page() + return await page.content() + + async def execute_javascript(self, script: str): + """Execute JavaScript code on the page""" + page = await self.get_current_page() + return await page.evaluate(script) + + async def get_page_structure(self) -> str: + """Get a debug view of the page structure including iframes""" + debug_script = """(() => { + function getPageStructure(element = document, depth = 0, maxDepth = 10) { + if (depth >= maxDepth) return ''; + + const indent = ' '.repeat(depth); + let structure = ''; + + // Skip certain elements that clutter the output + const skipTags = new Set(['script', 'style', 'link', 'meta', 'noscript']); + + // Add current element info if it's not the document + if (element !== document) { + const tagName = element.tagName.toLowerCase(); + + // Skip uninteresting elements + if (skipTags.has(tagName)) return ''; + + const id = element.id ? `#${element.id}` : ''; + const classes = element.className && typeof element.className === 'string' ? + `.${element.className.split(' ').filter(c => c).join('.')}` : ''; + + // Get additional useful attributes + const attrs = []; + if (element.getAttribute('role')) attrs.push(`role="${element.getAttribute('role')}"`); + if (element.getAttribute('aria-label')) attrs.push(`aria-label="${element.getAttribute('aria-label')}"`); + if (element.getAttribute('type')) attrs.push(`type="${element.getAttribute('type')}"`); + if (element.getAttribute('name')) attrs.push(`name="${element.getAttribute('name')}"`); + if (element.getAttribute('src')) { + const src = element.getAttribute('src'); + attrs.push(`src="${src.substring(0, 50)}${src.length > 50 ? '...' 
@time_execution_async('--get_state')
async def get_state(self) -> BrowserState:
    """Wait for the page to settle, rebuild the browser state and cache it.

    When ``config.cookies_file`` is set, cookies are saved in the background
    so the caller is not delayed by the save.

    Returns:
        The freshly built state, also stored on ``session.cached_state``.
    """
    await self._wait_for_page_and_frames_load()
    session = await self.get_session()
    session.cached_state = await self._update_state()

    # Save cookies if a file is specified
    if self.config.cookies_file:
        # The event loop only keeps weak references to tasks, so hold a strong
        # one until the save finishes or it may be garbage-collected mid-run.
        pending_saves = getattr(self, '_pending_cookie_saves', None)
        if pending_saves is None:
            pending_saves = self._pending_cookie_saves = set()
        save_task = asyncio.create_task(self.save_cookies())
        pending_saves.add(save_task)
        save_task.add_done_callback(pending_saves.discard)

    return session.cached_state
if pages: + self.state.target_id = None + page = await self._get_current_page(session) + logger.debug(f'Switched to page: {await page.title()}') + else: + raise BrowserError('Browser closed: no valid pages available') + + try: + await self.remove_highlights() + dom_service = DomService(page) + content = await dom_service.get_clickable_elements( + focus_element=focus_element, + viewport_expansion=self.config.viewport_expansion, + highlight_elements=self.config.highlight_elements, + ) + + screenshot_b64 = await self.take_screenshot() + pixels_above, pixels_below = await self.get_scroll_info(page) + + self.current_state = BrowserState( + element_tree=content.element_tree, + selector_map=content.selector_map, + url=page.url, + title=await page.title(), + tabs=await self.get_tabs_info(), + screenshot=screenshot_b64, + pixels_above=pixels_above, + pixels_below=pixels_below, + ) + + return self.current_state + except Exception as e: + logger.error(f'Failed to update state: {str(e)}') + # Return last known good state if available + if hasattr(self, 'current_state'): + return self.current_state + raise + + # region - Browser Actions + @time_execution_async('--take_screenshot') + async def take_screenshot(self, full_page: bool = False) -> str: + """ + Returns a base64 encoded screenshot of the current page. + """ + page = await self.get_current_page() + + await page.bring_to_front() + await page.wait_for_load_state() + + screenshot = await page.screenshot( + full_page=full_page, + animations='disabled', + ) + + screenshot_b64 = base64.b64encode(screenshot).decode('utf-8') + + # await self.remove_highlights() + + return screenshot_b64 + + @time_execution_async('--remove_highlights') + async def remove_highlights(self): + """ + Removes all highlight overlays and labels created by the highlightElement function. + Handles cases where the page might be closed or inaccessible. 
+ """ + try: + page = await self.get_current_page() + await page.evaluate( + """ + try { + // Remove the highlight container and all its contents + const container = document.getElementById('playwright-highlight-container'); + if (container) { + container.remove(); + } + + // Remove highlight attributes from elements + const highlightedElements = document.querySelectorAll('[browser-user-highlight-id^="playwright-highlight-"]'); + highlightedElements.forEach(el => { + el.removeAttribute('browser-user-highlight-id'); + }); + } catch (e) { + console.error('Failed to remove highlights:', e); + } + """ + ) + except Exception as e: + logger.debug(f'Failed to remove highlights (this is usually ok): {str(e)}') + # Don't raise the error since this is not critical functionality + pass + + # endregion + + # region - User Actions + + @classmethod + def _convert_simple_xpath_to_css_selector(cls, xpath: str) -> str: + """Converts simple XPath expressions to CSS selectors.""" + if not xpath: + return '' + + # Remove leading slash if present + xpath = xpath.lstrip('/') + + # Split into parts + parts = xpath.split('/') + css_parts = [] + + for part in parts: + if not part: + continue + + # Handle index notation [n] + if '[' in part: + base_part = part[: part.find('[')] + index_part = part[part.find('[') :] + + # Handle multiple indices + indices = [i.strip('[]') for i in index_part.split(']')[:-1]] + + for idx in indices: + try: + # Handle numeric indices + if idx.isdigit(): + index = int(idx) - 1 + base_part += f':nth-of-type({index + 1})' + # Handle last() function + elif idx == 'last()': + base_part += ':last-of-type' + # Handle position() functions + elif 'position()' in idx: + if '>1' in idx: + base_part += ':nth-of-type(n+2)' + except ValueError: + continue + + css_parts.append(base_part) + else: + css_parts.append(part) + + base_selector = ' > '.join(css_parts) + return base_selector + + @classmethod + @time_execution_sync('--enhanced_css_selector_for_element') + def 
_enhanced_css_selector_for_element(cls, element: DOMElementNode, include_dynamic_attributes: bool = True) -> str: + """ + Creates a CSS selector for a DOM element, handling various edge cases and special characters. + + Args: + element: The DOM element to create a selector for + + Returns: + A valid CSS selector string + """ + try: + # Get base selector from XPath + css_selector = cls._convert_simple_xpath_to_css_selector(element.xpath) + + # Handle class attributes + if 'class' in element.attributes and element.attributes['class'] and include_dynamic_attributes: + # Define a regex pattern for valid class names in CSS + valid_class_name_pattern = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_-]*$') + + # Iterate through the class attribute values + classes = element.attributes['class'].split() + for class_name in classes: + # Skip empty class names + if not class_name.strip(): + continue + + # Check if the class name is valid + if valid_class_name_pattern.match(class_name): + # Append the valid class name to the CSS selector + css_selector += f'.{class_name}' + else: + # Skip invalid class names + continue + + # Expanded set of safe attributes that are stable and useful for selection + SAFE_ATTRIBUTES = { + # Data attributes (if they're stable in your application) + 'id', + # Standard HTML attributes + 'name', + 'type', + 'placeholder', + # Accessibility attributes + 'aria-label', + 'aria-labelledby', + 'aria-describedby', + 'role', + # Common form attributes + 'for', + 'autocomplete', + 'required', + 'readonly', + # Media attributes + 'alt', + 'title', + 'src', + # Custom stable attributes (add any application-specific ones) + 'href', + 'target', + } + + if include_dynamic_attributes: + dynamic_attributes = { + 'data-id', + 'data-qa', + 'data-cy', + 'data-testid', + } + SAFE_ATTRIBUTES.update(dynamic_attributes) + + # Handle other attributes + for attribute, value in element.attributes.items(): + if attribute == 'class': + continue + + # Skip invalid attribute names + if not 
attribute.strip(): + continue + + if attribute not in SAFE_ATTRIBUTES: + continue + + # Escape special characters in attribute names + safe_attribute = attribute.replace(':', r'\:') + + # Handle different value cases + if value == '': + css_selector += f'[{safe_attribute}]' + elif any(char in value for char in '"\'<>`\n\r\t'): + # Use contains for values with special characters + # Regex-substitute *any* whitespace with a single space, then strip. + collapsed_value = re.sub(r'\s+', ' ', value).strip() + # Escape embedded double-quotes. + safe_value = collapsed_value.replace('"', '\\"') + css_selector += f'[{safe_attribute}*="{safe_value}"]' + else: + css_selector += f'[{safe_attribute}="{value}"]' + + return css_selector + + except Exception: + # Fallback to a more basic selector if something goes wrong + tag_name = element.tag_name or '*' + return f"{tag_name}[highlight_index='{element.highlight_index}']" + + @time_execution_async('--get_locate_element') + async def get_locate_element(self, element: DOMElementNode) -> Optional[ElementHandle]: + current_frame = await self.get_current_page() + + # Start with the target element and collect all parents + parents: list[DOMElementNode] = [] + current = element + while current.parent is not None: + parent = current.parent + parents.append(parent) + current = parent + + # Reverse the parents list to process from top to bottom + parents.reverse() + + # Process all iframe parents in sequence + iframes = [item for item in parents if item.tag_name == 'iframe'] + for parent in iframes: + css_selector = self._enhanced_css_selector_for_element( + parent, + include_dynamic_attributes=self.config.include_dynamic_attributes, + ) + current_frame = current_frame.frame_locator(css_selector) + + css_selector = self._enhanced_css_selector_for_element( + element, include_dynamic_attributes=self.config.include_dynamic_attributes + ) + + try: + if isinstance(current_frame, FrameLocator): + element_handle = await 
@time_execution_async('--input_text_element_node')
async def _input_text_element_node(self, element_node: DOMElementNode, text: str):
    """
    Input text into an element, choosing between keystroke typing and `fill()`.

    Content-editable elements and `<input>` tags that are neither read-only nor
    disabled are cleared and then typed into key by key; every other element is
    handed to `fill()`.

    Args:
        element_node: DOM node to type into; resolved through `get_locate_element`.
        text: The text to enter.

    Raises:
        BrowserError: If the element cannot be located or any step of the input
            fails. The underlying exception is chained as `__cause__`.
    """
    try:
        # Highlight before typing
        # if element_node.highlight_index is not None:
        # 	await self._update_state(focus_element=element_node.highlight_index)

        element_handle = await self.get_locate_element(element_node)

        if element_handle is None:
            raise BrowserError(f'Element: {repr(element_node)} not found')

        # Best effort only: an element that never reports "stable" or cannot be
        # scrolled may still accept input, so failures here are deliberately ignored.
        try:
            await element_handle.wait_for_element_state('stable', timeout=1000)
            await element_handle.scroll_into_view_if_needed(timeout=1000)
        except Exception:
            pass

        # Get element properties to determine input method
        tag_handle = await element_handle.get_property("tagName")
        tag_name = (await tag_handle.json_value()).lower()
        is_contenteditable = await element_handle.get_property('isContentEditable')
        readonly_handle = await element_handle.get_property("readOnly")
        disabled_handle = await element_handle.get_property("disabled")

        readonly = await readonly_handle.json_value() if readonly_handle else False
        disabled = await disabled_handle.json_value() if disabled_handle else False

        if (await is_contenteditable.json_value() or tag_name == 'input') and not (readonly or disabled):
            # Resetting textContent alone leaves an <input>'s current value in place,
            # so the typed text would be appended to it. Clear `value` as well so this
            # branch replaces existing content just like the `fill()` branch does.
            await element_handle.evaluate('el => { el.textContent = ""; if ("value" in el) { el.value = ""; } }')
            await element_handle.type(text, delay=5)
        else:
            await element_handle.fill(text)

    except Exception as e:
        logger.debug(f'Failed to input text into element: {repr(element_node)}. Error: {str(e)}')
        # Chain the original error so the root cause survives in tracebacks.
        raise BrowserError(f'Failed to input text into index {element_node.highlight_index}') from e
@time_execution_async('--switch_to_tab')
async def switch_to_tab(self, page_id: int) -> None:
    """Switch to a specific tab by its page_id.

    `page_id` is an index into the context's page list. Negative values index
    from the end of that list (the click action passes -1 to select the tab
    that was opened last).

    Args:
        page_id: Index of the tab to activate.

    Raises:
        BrowserError: If `page_id` is outside the range of open tabs (in either
            direction) or the tab's URL is not allowed.
    """
    session = await self.get_session()
    pages = session.context.pages

    # Validate both ends of the range: a too-negative id (or any id when no page
    # is open) would otherwise surface as a bare IndexError from the lookup below.
    if not -len(pages) <= page_id < len(pages):
        raise BrowserError(f'No tab found with page_id: {page_id}')

    page = pages[page_id]

    # Check if the tab's URL is allowed before switching
    if not self._is_url_allowed(page.url):
        raise BrowserError(f'Cannot switch to tab with non-allowed URL: {page.url}')

    # Update target ID if using CDP; targets are matched to the page by URL
    if self.browser.config.cdp_url:
        targets = await self._get_cdp_targets()
        for target in targets:
            if target['url'] == page.url:
                self.state.target_id = target['targetId']
                break

    await page.bring_to_front()
    await page.wait_for_load_state()
@time_execution_async('--create_new_tab') + async def create_new_tab(self, url: str | None = None) -> None: + """Create a new tab and optionally navigate to a URL""" + if url and not self._is_url_allowed(url): + raise BrowserError(f'Cannot create new tab with non-allowed URL: {url}') + + session = await self.get_session() + new_page = await session.context.new_page() + await new_page.wait_for_load_state() + + if url: + await new_page.goto(url) + await self._wait_for_page_and_frames_load(timeout_overwrite=1) + + # Get target ID for new page if using CDP + if self.browser.config.cdp_url: + targets = await self._get_cdp_targets() + for target in targets: + if target['url'] == new_page.url: + self.state.target_id = target['targetId'] + break + + # endregion + + # region - Helper methods for easier access to the DOM + async def _get_current_page(self, session: BrowserSession) -> Page: + pages = session.context.pages + + # Try to find page by target ID if using CDP + if self.browser.config.cdp_url and self.state.target_id: + targets = await self._get_cdp_targets() + for target in targets: + if target['targetId'] == self.state.target_id: + for page in pages: + if page.url == target['url']: + return page + + # Fallback to last page + return pages[-1] if pages else await session.context.new_page() + + async def get_selector_map(self) -> SelectorMap: + session = await self.get_session() + if session.cached_state is None: + return {} + return session.cached_state.selector_map + + async def get_element_by_index(self, index: int) -> ElementHandle | None: + selector_map = await self.get_selector_map() + element_handle = await self.get_locate_element(selector_map[index]) + return element_handle + + async def get_dom_element_by_index(self, index: int) -> DOMElementNode: + selector_map = await self.get_selector_map() + return selector_map[index] + + async def save_cookies(self): + """Save current cookies to file""" + if self.session and self.session.context and 
async def get_scroll_info(self, page: Page) -> tuple[int, int]:
    """Report how far the page can still be scrolled in each direction.

    Returns:
        A `(pixels_above, pixels_below)` tuple: the current vertical scroll
        offset, and the document height minus the bottom edge of the viewport.
    """
    measurements = []
    for expression in (
        'window.scrollY',
        'window.innerHeight',
        'document.documentElement.scrollHeight',
    ):
        measurements.append(await page.evaluate(expression))

    offset_top, visible_height, document_height = measurements
    return offset_top, document_height - (offset_top + visible_height)
+ + pages = session.context.pages + for page in pages: + await page.close() + + session.cached_state = None + self.state.target_id = None + + async def _get_unique_filename(self, directory, filename): + """Generate a unique filename by appending (1), (2), etc., if a file already exists.""" + base, ext = os.path.splitext(filename) + counter = 1 + new_filename = filename + while os.path.exists(os.path.join(directory, new_filename)): + new_filename = f'{base} ({counter}){ext}' + counter += 1 + return new_filename + + async def _get_cdp_targets(self) -> list[dict]: + """Get all CDP targets directly using CDP protocol""" + if not self.browser.config.cdp_url or not self.session: + return [] + + try: + pages = self.session.context.pages + if not pages: + return [] + + cdp_session = await pages[0].context.new_cdp_session(pages[0]) + result = await cdp_session.send('Target.getTargets') + await cdp_session.detach() + return result.get('targetInfos', []) + except Exception as e: + logger.debug(f'Failed to get CDP targets: {e}') + return [] diff --git a/browser_use/browser/tests/screenshot_test.py b/browser_use/browser/tests/screenshot_test.py new file mode 100644 index 0000000000000000000000000000000000000000..7255ccb615e9a89d2a2e87b4949ae9b581e5c57f --- /dev/null +++ b/browser_use/browser/tests/screenshot_test.py @@ -0,0 +1,37 @@ +import base64 + +import pytest + +from browser_use.browser.browser import Browser, BrowserConfig + + +@pytest.fixture +async def browser(): + browser_service = Browser(config=BrowserConfig(headless=True)) + yield browser_service + + await browser_service.close() + + +# @pytest.mark.skip(reason='takes too long') +def test_take_full_page_screenshot(browser): + # Go to a test page + browser.go_to_url('https://example.com') + + # Take full page screenshot + screenshot_b64 = browser.take_screenshot(full_page=True) + + # Verify screenshot is not empty and is valid base64 + assert screenshot_b64 is not None + assert isinstance(screenshot_b64, str) + 
class ElementTreeSerializer:
    """Turns a DOM element tree into plain dictionaries that `json.dump` can write."""

    @staticmethod
    def _serialize(node: DOMBaseNode) -> dict:
        """Convert one node, recursing into the children of element nodes."""
        if isinstance(node, DOMTextNode):
            return {'type': 'text', 'text': node.text}

        if not isinstance(node, DOMElementNode):
            # Anything that is neither text nor element serializes to an empty dict
            return {}

        serialized_children = []
        for child in node.children:
            serialized_children.append(ElementTreeSerializer._serialize(child))

        return {
            'type': 'element',
            'tag_name': node.tag_name,
            'attributes': node.attributes,
            'highlight_index': node.highlight_index,
            'children': serialized_children,
        }

    @staticmethod
    def dom_element_node_to_json(element_tree: DOMElementNode) -> dict:
        """Serialize the tree rooted at `element_tree` into nested dictionaries."""
        return ElementTreeSerializer._serialize(element_tree)
page.goto('https://www.w3schools.com/tags/tryit.asp?filename=tryhtml_iframe') + # await page.goto('https://dictionary.cambridge.org') + # await page.goto('https://github.com') + await page.goto('https://huggingface.co/') + + await asyncio.sleep(1) + + while True: + try: + # await asyncio.sleep(10) + state = await context.get_state() + + with open('./tmp/page.json', 'w') as f: + json.dump( + ElementTreeSerializer.dom_element_node_to_json(state.element_tree), + f, + indent=1, + ) + + # await time_execution_sync('highlight_selector_map_elements')( + # browser.highlight_selector_map_elements + # )(state.selector_map) + + # Find and print duplicate XPaths + xpath_counts = {} + if not state.selector_map: + continue + for selector in state.selector_map.values(): + xpath = selector.xpath + if xpath in xpath_counts: + xpath_counts[xpath] += 1 + else: + xpath_counts[xpath] = 1 + + print('\nDuplicate XPaths found:') + for xpath, count in xpath_counts.items(): + if count > 1: + print(f'XPath: {xpath}') + print(f'Count: {count}\n') + + print(list(state.selector_map.keys()), 'Selector map keys') + print(state.element_tree.clickable_elements_to_string()) + action = input('Select next action: ') + + await time_execution_sync('remove_highlight_elements')(context.remove_highlights)() + + node_element = state.selector_map[int(action)] + + # check if index of selector map are the same as index of items in dom_items + + await context._click_element_node(node_element) + + except Exception as e: + print(e) diff --git a/browser_use/browser/views.py b/browser_use/browser/views.py new file mode 100644 index 0000000000000000000000000000000000000000..3434d86e2691caba5715f4028bd3d849d07bf9e7 --- /dev/null +++ b/browser_use/browser/views.py @@ -0,0 +1,53 @@ +from dataclasses import dataclass, field +from typing import Any, Optional + +from pydantic import BaseModel + +from browser_use.dom.history_tree_processor.service import DOMHistoryElement +from browser_use.dom.views import DOMState + + +# 
@dataclass
class BrowserStateHistory:
    """Snapshot of browser state kept in history, convertible to a plain dict."""

    url: str
    title: str
    tabs: list[TabInfo]
    interacted_element: list[DOMHistoryElement | None] | list[None]
    screenshot: Optional[str] = None

    def to_dict(self) -> dict[str, Any]:
        """Return the snapshot as a dictionary.

        Each tab is converted with `model_dump()` and each non-empty interacted
        element with `to_dict()`; empty entries are kept as `None`.
        """
        return {
            'tabs': [tab_info.model_dump() for tab_info in self.tabs],
            'screenshot': self.screenshot,
            'interacted_element': [
                element.to_dict() if element else None for element in self.interacted_element
            ],
            'url': self.url,
            'title': self.title,
        }
class Registry(Generic[Context]):
    """Service for registering and managing actions.

    Actions are registered with the `action` decorator, stored in an
    `ActionRegistry`, and later invoked by name through `execute_action`, which
    validates the parameters and injects runtime dependencies (browser,
    extraction LLM, file paths, user context) that the action asks for by
    parameter name.
    """

    # Parameter names that `execute_action` supplies itself. They must never become
    # fields of an auto-generated parameter model: a field with the same name would
    # be passed a second time as a keyword when the action is invoked.
    _INJECTED_PARAMETER_NAMES = frozenset({'browser', 'page_extraction_llm', 'available_file_paths', 'context'})

    def __init__(self, exclude_actions: list[str] | None = None):
        """
        Args:
            exclude_actions: Names of functions that the `action` decorator should
                leave unregistered. `None` means nothing is excluded.
        """
        self.registry = ActionRegistry()
        self.telemetry = ProductTelemetry()
        self.exclude_actions = exclude_actions if exclude_actions is not None else []

    @time_execution_sync('--create_param_model')
    def _create_param_model(self, function: Callable) -> Type[BaseModel]:
        """Creates a Pydantic model from function signature.

        Every parameter except the runtime-injected ones becomes a model field;
        parameters without a default are required.
        """
        sig = signature(function)
        params = {
            # `empty` is a sentinel, so compare by identity rather than equality
            name: (param.annotation, ... if param.default is param.empty else param.default)
            for name, param in sig.parameters.items()
            if name not in self._INJECTED_PARAMETER_NAMES
        }
        # TODO: make the types here work
        return create_model(
            f'{function.__name__}_parameters',
            __base__=ActionModel,
            **params,  # type: ignore
        )

    def action(
        self,
        description: str,
        param_model: Optional[Type[BaseModel]] = None,
    ):
        """Decorator for registering actions.

        Args:
            description: Text describing the action; stored on the registered action.
            param_model: Explicit parameter model. When omitted, one is derived
                from the function signature.

        Returns:
            A decorator that registers the function under its `__name__` and
            returns the original function unchanged.
        """

        def decorator(func: Callable):
            # Skip registration if action is in exclude_actions
            if func.__name__ in self.exclude_actions:
                return func

            # Create param model from function if not provided
            actual_param_model = param_model or self._create_param_model(func)

            # Wrap sync functions to make them async
            if not iscoroutinefunction(func):

                async def async_wrapper(*args, **kwargs):
                    return await asyncio.to_thread(func, *args, **kwargs)

                # Copy the signature and other metadata from the original function
                # so that `execute_action` can still inspect the real parameters
                async_wrapper.__signature__ = signature(func)
                async_wrapper.__name__ = func.__name__
                async_wrapper.__annotations__ = func.__annotations__
                wrapped_func = async_wrapper
            else:
                wrapped_func = func

            action = RegisteredAction(
                name=func.__name__,
                description=description,
                function=wrapped_func,
                param_model=actual_param_model,
            )
            self.registry.actions[func.__name__] = action
            return func

        return decorator

    @time_execution_async('--execute_action')
    async def execute_action(
        self,
        action_name: str,
        params: dict,
        browser: Optional[BrowserContext] = None,
        page_extraction_llm: Optional[BaseChatModel] = None,
        sensitive_data: Optional[Dict[str, str]] = None,
        available_file_paths: Optional[list[str]] = None,
        #
        context: Context | None = None,
    ) -> Any:
        """Execute a registered action.

        Args:
            action_name: Name the action was registered under.
            params: Raw parameters, validated against the action's parameter model.
            browser: Passed to the action if it declares a `browser` parameter.
            page_extraction_llm: Passed if the action declares `page_extraction_llm`.
            sensitive_data: Mapping of placeholder name to secret value; when
                given, placeholders in the validated parameters are substituted.
            available_file_paths: Passed if the action declares `available_file_paths`.
            context: Passed if the action declares `context`.

        Returns:
            Whatever the action returns.

        Raises:
            ValueError: If no action with `action_name` is registered.
            RuntimeError: If validation, a missing required dependency, or the
                action itself fails; the original exception is chained.
        """
        if action_name not in self.registry.actions:
            raise ValueError(f'Action {action_name} not found')

        action = self.registry.actions[action_name]
        try:
            # Create the validated Pydantic model
            validated_params = action.param_model(**params)

            # Check if the first parameter is a Pydantic model
            sig = signature(action.function)
            parameters = list(sig.parameters.values())
            first_annotation = parameters[0].annotation if parameters else None
            # issubclass() raises TypeError for annotations that are not classes
            # (e.g. list[str] or Optional[str]), so verify it is a class first.
            is_pydantic = isinstance(first_annotation, type) and issubclass(first_annotation, BaseModel)
            parameter_names = [param.name for param in parameters]

            if sensitive_data:
                validated_params = self._replace_sensitive_data(validated_params, sensitive_data)

            # Check if the action requires browser
            if 'browser' in parameter_names and not browser:
                raise ValueError(f'Action {action_name} requires browser but none provided.')
            if 'page_extraction_llm' in parameter_names and not page_extraction_llm:
                raise ValueError(f'Action {action_name} requires page_extraction_llm but none provided.')
            if 'available_file_paths' in parameter_names and not available_file_paths:
                raise ValueError(f'Action {action_name} requires available_file_paths but none provided.')

            if 'context' in parameter_names and not context:
                raise ValueError(f'Action {action_name} requires context but none provided.')

            # Prepare arguments based on parameter type
            extra_args = {}
            if 'context' in parameter_names:
                extra_args['context'] = context
            if 'browser' in parameter_names:
                extra_args['browser'] = browser
            if 'page_extraction_llm' in parameter_names:
                extra_args['page_extraction_llm'] = page_extraction_llm
            if 'available_file_paths' in parameter_names:
                extra_args['available_file_paths'] = available_file_paths
            if action_name == 'input_text' and sensitive_data:
                extra_args['has_sensitive_data'] = True
            if is_pydantic:
                return await action.function(validated_params, **extra_args)
            return await action.function(**validated_params.model_dump(), **extra_args)

        except Exception as e:
            raise RuntimeError(f'Error executing action {action_name}: {str(e)}') from e

    def _replace_sensitive_data(self, params: BaseModel, sensitive_data: Dict[str, str]) -> BaseModel:
        """Replaces the sensitive data in the params.

        Strings (also inside nested dicts and lists) are scanned for
        `<secret>name</secret>` placeholders; each placeholder whose name is a key
        of `sensitive_data` is replaced by the corresponding value. Unknown
        placeholders are left untouched. `params` is modified in place and returned.
        """
        import re

        # NOTE(review): a pattern without delimiters can only ever match empty
        # strings, so nothing would be substituted. The <secret>...</secret>
        # delimiters follow the project's placeholder convention - confirm.
        secret_pattern = re.compile(r'<secret>(.*?)</secret>')

        def replace_secrets(value):
            if isinstance(value, str):
                matches = secret_pattern.findall(value)
                for placeholder in matches:
                    if placeholder in sensitive_data:
                        value = value.replace(f'<secret>{placeholder}</secret>', sensitive_data[placeholder])
                return value
            elif isinstance(value, dict):
                return {k: replace_secrets(v) for k, v in value.items()}
            elif isinstance(value, list):
                return [replace_secrets(v) for v in value]
            return value

        for key, value in params.model_dump().items():
            params.__dict__[key] = replace_secrets(value)
        return params

    @time_execution_sync('--create_action_model')
    def create_action_model(self, include_actions: Optional[list[str]] = None) -> Type[ActionModel]:
        """Creates a Pydantic model from registered actions.

        The model has one optional field per registered action, typed with that
        action's parameter model. A telemetry event listing the included actions
        is captured as a side effect.

        Args:
            include_actions: If given, only actions with these names are included.
        """
        fields = {
            name: (
                Optional[action.param_model],
                Field(default=None, description=action.description),
            )
            for name, action in self.registry.actions.items()
            if include_actions is None or name in include_actions
        }

        self.telemetry.capture(
            ControllerRegisteredFunctionsTelemetryEvent(
                registered_functions=[
                    RegisteredFunction(name=name, params=action.param_model.model_json_schema())
                    for name, action in self.registry.actions.items()
                    if include_actions is None or name in include_actions
                ]
            )
        )

        return create_model('ActionModel', __base__=ActionModel, **fields)  # type:ignore

    def get_prompt_description(self) -> str:
        """Get a description of all actions for the prompt"""
        return self.registry.get_prompt_description()
class ActionRegistry(BaseModel):
    """Holds every registered action, keyed by action name."""

    actions: Dict[str, RegisteredAction] = {}

    def get_prompt_description(self) -> str:
        """Join the prompt descriptions of all registered actions, one per line."""
        descriptions = []
        for registered_action in self.actions.values():
            descriptions.append(registered_action.prompt_description())
        return '\n'.join(descriptions)
InputTextAction, + NoParamsAction, + OpenTabAction, + ScrollAction, + SearchGoogleAction, + SendKeysAction, + SwitchTabAction, +) +from browser_use.utils import time_execution_sync + +logger = logging.getLogger(__name__) + + +Context = TypeVar('Context') + + +class Controller(Generic[Context]): + def __init__( + self, + exclude_actions: list[str] = [], + output_model: Optional[Type[BaseModel]] = None, + ): + self.registry = Registry[Context](exclude_actions) + + """Register all default browser actions""" + + if output_model is not None: + # Create a new model that extends the output model with success parameter + class ExtendedOutputModel(BaseModel): # type: ignore + success: bool = True + data: output_model + + @self.registry.action( + 'Complete task - with return text and if the task is finished (success=True) or not yet completly finished (success=False), because last step is reached', + param_model=ExtendedOutputModel, + ) + async def done(params: ExtendedOutputModel): + # Exclude success from the output JSON since it's an internal parameter + output_dict = params.data.model_dump() + + # Enums are not serializable, convert to string + for key, value in output_dict.items(): + if isinstance(value, enum.Enum): + output_dict[key] = value.value + + return ActionResult(is_done=True, success=params.success, extracted_content=json.dumps(output_dict)) + else: + + @self.registry.action( + 'Complete task - with return text and if the task is finished (success=True) or not yet completly finished (success=False), because last step is reached', + param_model=DoneAction, + ) + async def done(params: DoneAction): + return ActionResult(is_done=True, success=params.success, extracted_content=params.text) + + # Basic Navigation Actions + @self.registry.action( + 'Search the query in Google in the current tab, the query should be a search query like humans search in Google, concrete and not vague or super long. More the single most important items. 
', + param_model=SearchGoogleAction, + ) + async def search_google(params: SearchGoogleAction, browser: BrowserContext): + page = await browser.get_current_page() + await page.goto(f'https://www.google.com/search?q={params.query}&udm=14') + await page.wait_for_load_state() + msg = f'πŸ” Searched for "{params.query}" in Google' + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + + @self.registry.action('Navigate to URL in the current tab', param_model=GoToUrlAction) + async def go_to_url(params: GoToUrlAction, browser: BrowserContext): + page = await browser.get_current_page() + await page.goto(params.url) + await page.wait_for_load_state() + msg = f'πŸ”— Navigated to {params.url}' + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + + @self.registry.action('Go back', param_model=NoParamsAction) + async def go_back(_: NoParamsAction, browser: BrowserContext): + await browser.go_back() + msg = 'πŸ”™ Navigated back' + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + + # wait for x seconds + @self.registry.action('Wait for x seconds default 3') + async def wait(seconds: int = 3): + msg = f'πŸ•’ Waiting for {seconds} seconds' + logger.info(msg) + await asyncio.sleep(seconds) + return ActionResult(extracted_content=msg, include_in_memory=True) + + # Element Interaction Actions + @self.registry.action('Click element', param_model=ClickElementAction) + async def click_element(params: ClickElementAction, browser: BrowserContext): + session = await browser.get_session() + + if params.index not in await browser.get_selector_map(): + raise Exception(f'Element with index {params.index} does not exist - retry or use alternative actions') + + element_node = await browser.get_dom_element_by_index(params.index) + initial_pages = len(session.context.pages) + + # if element has file uploader then dont click + if await browser.is_file_uploader(element_node): + msg = 
f'Index {params.index} - has an element which opens file upload dialog. To upload files please use a specific function to upload files ' + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + + msg = None + + try: + download_path = await browser._click_element_node(element_node) + if download_path: + msg = f'πŸ’Ύ Downloaded file to {download_path}' + else: + msg = f'πŸ–±οΈ Clicked button with index {params.index}: {element_node.get_all_text_till_next_clickable_element(max_depth=2)}' + + logger.info(msg) + logger.debug(f'Element xpath: {element_node.xpath}') + if len(session.context.pages) > initial_pages: + new_tab_msg = 'New tab opened - switching to it' + msg += f' - {new_tab_msg}' + logger.info(new_tab_msg) + await browser.switch_to_tab(-1) + return ActionResult(extracted_content=msg, include_in_memory=True) + except Exception as e: + logger.warning(f'Element not clickable with index {params.index} - most likely the page changed') + return ActionResult(error=str(e)) + + @self.registry.action( + 'Input text into a input interactive element', + param_model=InputTextAction, + ) + async def input_text(params: InputTextAction, browser: BrowserContext, has_sensitive_data: bool = False): + if params.index not in await browser.get_selector_map(): + raise Exception(f'Element index {params.index} does not exist - retry or use alternative actions') + + element_node = await browser.get_dom_element_by_index(params.index) + await browser._input_text_element_node(element_node, params.text) + if not has_sensitive_data: + msg = f'⌨️ Input {params.text} into index {params.index}' + else: + msg = f'⌨️ Input sensitive data into index {params.index}' + logger.info(msg) + logger.debug(f'Element xpath: {element_node.xpath}') + return ActionResult(extracted_content=msg, include_in_memory=True) + + # Tab Management Actions + @self.registry.action('Switch tab', param_model=SwitchTabAction) + async def switch_tab(params: SwitchTabAction, browser: 
BrowserContext): + await browser.switch_to_tab(params.page_id) + # Wait for tab to be ready + page = await browser.get_current_page() + await page.wait_for_load_state() + msg = f'πŸ”„ Switched to tab {params.page_id}' + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + + @self.registry.action('Open url in new tab', param_model=OpenTabAction) + async def open_tab(params: OpenTabAction, browser: BrowserContext): + await browser.create_new_tab(params.url) + msg = f'πŸ”— Opened new tab with {params.url}' + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + + # Content Actions + @self.registry.action( + 'Extract page content to retrieve specific information from the page, e.g. all company names, a specifc description, all information about, links with companies in structured format or simply links', + ) + async def extract_content(goal: str, browser: BrowserContext, page_extraction_llm: BaseChatModel): + page = await browser.get_current_page() + import markdownify + + content = markdownify.markdownify(await page.content()) + + prompt = 'Your task is to extract the content of the page. You will be given a page and a goal and you should extract all relevant information around this goal from the page. If the goal is vague, summarize the page. Respond in json format. 
Extraction goal: {goal}, Page: {page}' + template = PromptTemplate(input_variables=['goal', 'page'], template=prompt) + try: + output = page_extraction_llm.invoke(template.format(goal=goal, page=content)) + msg = f'πŸ“„ Extracted from page\n: {output.content}\n' + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + except Exception as e: + logger.debug(f'Error extracting content: {e}') + msg = f'πŸ“„ Extracted from page\n: {content}\n' + logger.info(msg) + return ActionResult(extracted_content=msg) + + @self.registry.action( + 'Scroll down the page by pixel amount - if no amount is specified, scroll down one page', + param_model=ScrollAction, + ) + async def scroll_down(params: ScrollAction, browser: BrowserContext): + page = await browser.get_current_page() + if params.amount is not None: + await page.evaluate(f'window.scrollBy(0, {params.amount});') + else: + await page.evaluate('window.scrollBy(0, window.innerHeight);') + + amount = f'{params.amount} pixels' if params.amount is not None else 'one page' + msg = f'πŸ” Scrolled down the page by {amount}' + logger.info(msg) + return ActionResult( + extracted_content=msg, + include_in_memory=True, + ) + + # scroll up + @self.registry.action( + 'Scroll up the page by pixel amount - if no amount is specified, scroll up one page', + param_model=ScrollAction, + ) + async def scroll_up(params: ScrollAction, browser: BrowserContext): + page = await browser.get_current_page() + if params.amount is not None: + await page.evaluate(f'window.scrollBy(0, -{params.amount});') + else: + await page.evaluate('window.scrollBy(0, -window.innerHeight);') + + amount = f'{params.amount} pixels' if params.amount is not None else 'one page' + msg = f'πŸ” Scrolled up the page by {amount}' + logger.info(msg) + return ActionResult( + extracted_content=msg, + include_in_memory=True, + ) + + # send keys + @self.registry.action( + 'Send strings of special keys like Escape,Backspace, Insert, PageDown, Delete, 
Enter, Shortcuts such as `Control+o`, `Control+Shift+T` are supported as well. This gets used in keyboard.press. ', + param_model=SendKeysAction, + ) + async def send_keys(params: SendKeysAction, browser: BrowserContext): + page = await browser.get_current_page() + + try: + await page.keyboard.press(params.keys) + except Exception as e: + if 'Unknown key' in str(e): + # loop over the keys and try to send each one + for key in params.keys: + try: + await page.keyboard.press(key) + except Exception as e: + logger.debug(f'Error sending key {key}: {str(e)}') + raise e + else: + raise e + msg = f'⌨️ Sent keys: {params.keys}' + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + + @self.registry.action( + description='If you dont find something which you want to interact with, scroll to it', + ) + async def scroll_to_text(text: str, browser: BrowserContext): # type: ignore + page = await browser.get_current_page() + try: + # Try different locator strategies + locators = [ + page.get_by_text(text, exact=False), + page.locator(f'text={text}'), + page.locator(f"//*[contains(text(), '{text}')]"), + ] + + for locator in locators: + try: + # First check if element exists and is visible + if await locator.count() > 0 and await locator.first.is_visible(): + await locator.first.scroll_into_view_if_needed() + await asyncio.sleep(0.5) # Wait for scroll to complete + msg = f'πŸ” Scrolled to text: {text}' + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + except Exception as e: + logger.debug(f'Locator attempt failed: {str(e)}') + continue + + msg = f"Text '{text}' not found or not visible on page" + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + + except Exception as e: + msg = f"Failed to scroll to text '{text}': {str(e)}" + logger.error(msg) + return ActionResult(error=msg, include_in_memory=True) + + @self.registry.action( + description='Get all options from a 
native dropdown', + ) + async def get_dropdown_options(index: int, browser: BrowserContext) -> ActionResult: + """Get all options from a native dropdown""" + page = await browser.get_current_page() + selector_map = await browser.get_selector_map() + dom_element = selector_map[index] + + try: + # Frame-aware approach since we know it works + all_options = [] + frame_index = 0 + + for frame in page.frames: + try: + options = await frame.evaluate( + """ + (xpath) => { + const select = document.evaluate(xpath, document, null, + XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue; + if (!select) return null; + + return { + options: Array.from(select.options).map(opt => ({ + text: opt.text, //do not trim, because we are doing exact match in select_dropdown_option + value: opt.value, + index: opt.index + })), + id: select.id, + name: select.name + }; + } + """, + dom_element.xpath, + ) + + if options: + logger.debug(f'Found dropdown in frame {frame_index}') + logger.debug(f'Dropdown ID: {options["id"]}, Name: {options["name"]}') + + formatted_options = [] + for opt in options['options']: + # encoding ensures AI uses the exact string in select_dropdown_option + encoded_text = json.dumps(opt['text']) + formatted_options.append(f'{opt["index"]}: text={encoded_text}') + + all_options.extend(formatted_options) + + except Exception as frame_e: + logger.debug(f'Frame {frame_index} evaluation failed: {str(frame_e)}') + + frame_index += 1 + + if all_options: + msg = '\n'.join(all_options) + msg += '\nUse the exact text string in select_dropdown_option' + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + else: + msg = 'No options found in any frame for dropdown' + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + + except Exception as e: + logger.error(f'Failed to get dropdown options: {str(e)}') + msg = f'Error getting options: {str(e)}' + logger.info(msg) + return 
ActionResult(extracted_content=msg, include_in_memory=True) + + @self.registry.action( + description='Select dropdown option for interactive element index by the text of the option you want to select', + ) + async def select_dropdown_option( + index: int, + text: str, + browser: BrowserContext, + ) -> ActionResult: + """Select dropdown option by the text of the option you want to select""" + page = await browser.get_current_page() + selector_map = await browser.get_selector_map() + dom_element = selector_map[index] + + # Validate that we're working with a select element + if dom_element.tag_name != 'select': + logger.error(f'Element is not a select! Tag: {dom_element.tag_name}, Attributes: {dom_element.attributes}') + msg = f'Cannot select option: Element with index {index} is a {dom_element.tag_name}, not a select' + return ActionResult(extracted_content=msg, include_in_memory=True) + + logger.debug(f"Attempting to select '{text}' using xpath: {dom_element.xpath}") + logger.debug(f'Element attributes: {dom_element.attributes}') + logger.debug(f'Element tag: {dom_element.tag_name}') + + xpath = '//' + dom_element.xpath + + try: + frame_index = 0 + for frame in page.frames: + try: + logger.debug(f'Trying frame {frame_index} URL: {frame.url}') + + # First verify we can find the dropdown in this frame + find_dropdown_js = """ + (xpath) => { + try { + const select = document.evaluate(xpath, document, null, + XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue; + if (!select) return null; + if (select.tagName.toLowerCase() !== 'select') { + return { + error: `Found element but it's a ${select.tagName}, not a SELECT`, + found: false + }; + } + return { + id: select.id, + name: select.name, + found: true, + tagName: select.tagName, + optionCount: select.options.length, + currentValue: select.value, + availableOptions: Array.from(select.options).map(o => o.text.trim()) + }; + } catch (e) { + return {error: e.toString(), found: false}; + } + } + """ + + dropdown_info 
= await frame.evaluate(find_dropdown_js, dom_element.xpath) + + if dropdown_info: + if not dropdown_info.get('found'): + logger.error(f'Frame {frame_index} error: {dropdown_info.get("error")}') + continue + + logger.debug(f'Found dropdown in frame {frame_index}: {dropdown_info}') + + # "label" because we are selecting by text + # nth(0) to disable error thrown by strict mode + # timeout=1000 because we are already waiting for all network events, therefore ideally we don't need to wait a lot here (default 30s) + selected_option_values = ( + await frame.locator('//' + dom_element.xpath).nth(0).select_option(label=text, timeout=1000) + ) + + msg = f'selected option {text} with value {selected_option_values}' + logger.info(msg + f' in frame {frame_index}') + + return ActionResult(extracted_content=msg, include_in_memory=True) + + except Exception as frame_e: + logger.error(f'Frame {frame_index} attempt failed: {str(frame_e)}') + logger.error(f'Frame type: {type(frame)}') + logger.error(f'Frame URL: {frame.url}') + + frame_index += 1 + + msg = f"Could not select option '{text}' in any frame" + logger.info(msg) + return ActionResult(extracted_content=msg, include_in_memory=True) + + except Exception as e: + msg = f'Selection failed: {str(e)}' + logger.error(msg) + return ActionResult(error=msg, include_in_memory=True) + + # Register --------------------------------------------------------------- + + def action(self, description: str, **kwargs): + """Decorator for registering custom actions + + @param description: Describe the LLM what the function does (better description == better function calling) + """ + return self.registry.action(description, **kwargs) + + # Act -------------------------------------------------------------------- + + @time_execution_sync('--act') + async def act( + self, + action: ActionModel, + browser_context: BrowserContext, + # + page_extraction_llm: Optional[BaseChatModel] = None, + sensitive_data: Optional[Dict[str, str]] = None, + 
available_file_paths: Optional[list[str]] = None, + # + context: Context | None = None, + ) -> ActionResult: + """Execute an action""" + + try: + for action_name, params in action.model_dump(exclude_unset=True).items(): + if params is not None: + # with Laminar.start_as_current_span( + # name=action_name, + # input={ + # 'action': action_name, + # 'params': params, + # }, + # span_type='TOOL', + # ): + result = await self.registry.execute_action( + action_name, + params, + browser=browser_context, + page_extraction_llm=page_extraction_llm, + sensitive_data=sensitive_data, + available_file_paths=available_file_paths, + context=context, + ) + + # Laminar.set_span_output(result) + + if isinstance(result, str): + return ActionResult(extracted_content=result) + elif isinstance(result, ActionResult): + return result + elif result is None: + return ActionResult() + else: + raise ValueError(f'Invalid action result type: {type(result)} of {result}') + return ActionResult() + except Exception as e: + raise e diff --git a/browser_use/controller/views.py b/browser_use/controller/views.py new file mode 100644 index 0000000000000000000000000000000000000000..82995c9e311578ca1c9c133509b70c3eede2bb40 --- /dev/null +++ b/browser_use/controller/views.py @@ -0,0 +1,65 @@ +from typing import Optional + +from pydantic import BaseModel, model_validator + + +# Action Input Models +class SearchGoogleAction(BaseModel): + query: str + + +class GoToUrlAction(BaseModel): + url: str + + +class ClickElementAction(BaseModel): + index: int + xpath: Optional[str] = None + + +class InputTextAction(BaseModel): + index: int + text: str + xpath: Optional[str] = None + + +class DoneAction(BaseModel): + text: str + success: bool + + +class SwitchTabAction(BaseModel): + page_id: int + + +class OpenTabAction(BaseModel): + url: str + + +class ScrollAction(BaseModel): + amount: Optional[int] = None # The number of pixels to scroll. 
If None, scroll down/up one page + + +class SendKeysAction(BaseModel): + keys: str + + +class ExtractPageContentAction(BaseModel): + value: str + + +class NoParamsAction(BaseModel): + """ + Accepts absolutely anything in the incoming data + and discards it, so the final parsed model is empty. + """ + + @model_validator(mode='before') + def ignore_all_inputs(cls, values): + # No matter what the user sends, discard it and return empty. + return {} + + class Config: + # If you want to silently allow unknown fields at top-level, + # set extra = 'allow' as well: + extra = 'allow' diff --git a/browser_use/dom/__init__.py b/browser_use/dom/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/browser_use/dom/buildDomTree.js b/browser_use/dom/buildDomTree.js new file mode 100644 index 0000000000000000000000000000000000000000..539c762259b044776229a7fe9542059af84c0ab9 --- /dev/null +++ b/browser_use/dom/buildDomTree.js @@ -0,0 +1,1055 @@ +( + args = { + doHighlightElements: true, + focusHighlightIndex: -1, + viewportExpansion: 0, + debugMode: false, + } +) => { + const { doHighlightElements, focusHighlightIndex, viewportExpansion, debugMode } = args; + let highlightIndex = 0; // Reset highlight index + + // Add timing stack to handle recursion + const TIMING_STACK = { + nodeProcessing: [], + treeTraversal: [], + highlighting: [], + current: null + }; + + function pushTiming(type) { + TIMING_STACK[type] = TIMING_STACK[type] || []; + TIMING_STACK[type].push(performance.now()); + } + + function popTiming(type) { + const start = TIMING_STACK[type].pop(); + const duration = performance.now() - start; + return duration; + } + + // Only initialize performance tracking if in debug mode + const PERF_METRICS = debugMode ? 
{ + buildDomTreeCalls: 0, + timings: { + buildDomTree: 0, + highlightElement: 0, + isInteractiveElement: 0, + isElementVisible: 0, + isTopElement: 0, + isInExpandedViewport: 0, + isTextNodeVisible: 0, + getEffectiveScroll: 0, + }, + cacheMetrics: { + boundingRectCacheHits: 0, + boundingRectCacheMisses: 0, + computedStyleCacheHits: 0, + computedStyleCacheMisses: 0, + getBoundingClientRectTime: 0, + getComputedStyleTime: 0, + boundingRectHitRate: 0, + computedStyleHitRate: 0, + overallHitRate: 0, + }, + nodeMetrics: { + totalNodes: 0, + processedNodes: 0, + skippedNodes: 0, + }, + buildDomTreeBreakdown: { + totalTime: 0, + totalSelfTime: 0, + buildDomTreeCalls: 0, + domOperations: { + getBoundingClientRect: 0, + getComputedStyle: 0, + }, + domOperationCounts: { + getBoundingClientRect: 0, + getComputedStyle: 0, + } + } + } : null; + + // Simple timing helper that only runs in debug mode + function measureTime(fn) { + if (!debugMode) return fn; + return function (...args) { + const start = performance.now(); + const result = fn.apply(this, args); + const duration = performance.now() - start; + return result; + }; + } + + // Helper to measure DOM operations + function measureDomOperation(operation, name) { + if (!debugMode) return operation(); + + const start = performance.now(); + const result = operation(); + const duration = performance.now() - start; + + if (PERF_METRICS && name in PERF_METRICS.buildDomTreeBreakdown.domOperations) { + PERF_METRICS.buildDomTreeBreakdown.domOperations[name] += duration; + PERF_METRICS.buildDomTreeBreakdown.domOperationCounts[name]++; + } + + return result; + } + + // Add caching mechanisms at the top level + const DOM_CACHE = { + boundingRects: new WeakMap(), + computedStyles: new WeakMap(), + clearCache: () => { + DOM_CACHE.boundingRects = new WeakMap(); + DOM_CACHE.computedStyles = new WeakMap(); + } + }; + + // Cache helper functions + function getCachedBoundingRect(element) { + if (!element) return null; + + if 
(DOM_CACHE.boundingRects.has(element)) { + if (debugMode && PERF_METRICS) { + PERF_METRICS.cacheMetrics.boundingRectCacheHits++; + } + return DOM_CACHE.boundingRects.get(element); + } + + if (debugMode && PERF_METRICS) { + PERF_METRICS.cacheMetrics.boundingRectCacheMisses++; + } + + let rect; + if (debugMode) { + const start = performance.now(); + rect = element.getBoundingClientRect(); + const duration = performance.now() - start; + if (PERF_METRICS) { + PERF_METRICS.buildDomTreeBreakdown.domOperations.getBoundingClientRect += duration; + PERF_METRICS.buildDomTreeBreakdown.domOperationCounts.getBoundingClientRect++; + } + } else { + rect = element.getBoundingClientRect(); + } + + if (rect) { + DOM_CACHE.boundingRects.set(element, rect); + } + return rect; + } + + function getCachedComputedStyle(element) { + if (!element) return null; + + if (DOM_CACHE.computedStyles.has(element)) { + if (debugMode && PERF_METRICS) { + PERF_METRICS.cacheMetrics.computedStyleCacheHits++; + } + return DOM_CACHE.computedStyles.get(element); + } + + if (debugMode && PERF_METRICS) { + PERF_METRICS.cacheMetrics.computedStyleCacheMisses++; + } + + let style; + if (debugMode) { + const start = performance.now(); + style = window.getComputedStyle(element); + const duration = performance.now() - start; + if (PERF_METRICS) { + PERF_METRICS.buildDomTreeBreakdown.domOperations.getComputedStyle += duration; + PERF_METRICS.buildDomTreeBreakdown.domOperationCounts.getComputedStyle++; + } + } else { + style = window.getComputedStyle(element); + } + + if (style) { + DOM_CACHE.computedStyles.set(element, style); + } + return style; + } + + /** + * Hash map of DOM nodes indexed by their highlight index. + * + * @type {Object} + */ + const DOM_HASH_MAP = {}; + + const ID = { current: 0 }; + + const HIGHLIGHT_CONTAINER_ID = "playwright-highlight-container"; + + /** + * Highlights an element in the DOM and returns the index of the next element. 
+ */ + function highlightElement(element, index, parentIframe = null) { + if (!element) return index; + + try { + // Create or get highlight container + let container = document.getElementById(HIGHLIGHT_CONTAINER_ID); + if (!container) { + container = document.createElement("div"); + container.id = HIGHLIGHT_CONTAINER_ID; + container.style.position = "fixed"; + container.style.pointerEvents = "none"; + container.style.top = "0"; + container.style.left = "0"; + container.style.width = "100%"; + container.style.height = "100%"; + container.style.zIndex = "2147483647"; + document.body.appendChild(container); + } + + // Get element position + const rect = measureDomOperation( + () => element.getBoundingClientRect(), + 'getBoundingClientRect' + ); + + if (!rect) return index; + + // Generate a color based on the index + const colors = [ + "#FF0000", + "#00FF00", + "#0000FF", + "#FFA500", + "#800080", + "#008080", + "#FF69B4", + "#4B0082", + "#FF4500", + "#2E8B57", + "#DC143C", + "#4682B4", + ]; + const colorIndex = index % colors.length; + const baseColor = colors[colorIndex]; + const backgroundColor = baseColor + "1A"; // 10% opacity version of the color + + // Create highlight overlay + const overlay = document.createElement("div"); + overlay.style.position = "fixed"; + overlay.style.border = `2px solid ${baseColor}`; + overlay.style.backgroundColor = backgroundColor; + overlay.style.pointerEvents = "none"; + overlay.style.boxSizing = "border-box"; + + // Get element position + let iframeOffset = { x: 0, y: 0 }; + + // If element is in an iframe, calculate iframe offset + if (parentIframe) { + const iframeRect = parentIframe.getBoundingClientRect(); + iframeOffset.x = iframeRect.left; + iframeOffset.y = iframeRect.top; + } + + // Calculate position + const top = rect.top + iframeOffset.y; + const left = rect.left + iframeOffset.x; + + overlay.style.top = `${top}px`; + overlay.style.left = `${left}px`; + overlay.style.width = `${rect.width}px`; + overlay.style.height = 
`${rect.height}px`; + + // Create and position label + const label = document.createElement("div"); + label.className = "playwright-highlight-label"; + label.style.position = "fixed"; + label.style.background = baseColor; + label.style.color = "white"; + label.style.padding = "1px 4px"; + label.style.borderRadius = "4px"; + label.style.fontSize = `${Math.min(12, Math.max(8, rect.height / 2))}px`; + label.textContent = index; + + const labelWidth = 20; + const labelHeight = 16; + + let labelTop = top + 2; + let labelLeft = left + rect.width - labelWidth - 2; + + if (rect.width < labelWidth + 4 || rect.height < labelHeight + 4) { + labelTop = top - labelHeight - 2; + labelLeft = left + rect.width - labelWidth; + } + + label.style.top = `${labelTop}px`; + label.style.left = `${labelLeft}px`; + + // Add to container + container.appendChild(overlay); + container.appendChild(label); + + // Update positions on scroll + const updatePositions = () => { + const newRect = element.getBoundingClientRect(); + let newIframeOffset = { x: 0, y: 0 }; + + if (parentIframe) { + const iframeRect = parentIframe.getBoundingClientRect(); + newIframeOffset.x = iframeRect.left; + newIframeOffset.y = iframeRect.top; + } + + const newTop = newRect.top + newIframeOffset.y; + const newLeft = newRect.left + newIframeOffset.x; + + overlay.style.top = `${newTop}px`; + overlay.style.left = `${newLeft}px`; + overlay.style.width = `${newRect.width}px`; + overlay.style.height = `${newRect.height}px`; + + let newLabelTop = newTop + 2; + let newLabelLeft = newLeft + newRect.width - labelWidth - 2; + + if (newRect.width < labelWidth + 4 || newRect.height < labelHeight + 4) { + newLabelTop = newTop - labelHeight - 2; + newLabelLeft = newLeft + newRect.width - labelWidth; + } + + label.style.top = `${newLabelTop}px`; + label.style.left = `${newLabelLeft}px`; + }; + + window.addEventListener('scroll', updatePositions); + window.addEventListener('resize', updatePositions); + + return index + 1; + } finally { 
+ popTiming('highlighting'); + } + } + + /** + * Returns an XPath tree string for an element. + */ + function getXPathTree(element, stopAtBoundary = true) { + const segments = []; + let currentElement = element; + + while (currentElement && currentElement.nodeType === Node.ELEMENT_NODE) { + // Stop if we hit a shadow root or iframe + if ( + stopAtBoundary && + (currentElement.parentNode instanceof ShadowRoot || + currentElement.parentNode instanceof HTMLIFrameElement) + ) { + break; + } + + let index = 0; + let sibling = currentElement.previousSibling; + while (sibling) { + if ( + sibling.nodeType === Node.ELEMENT_NODE && + sibling.nodeName === currentElement.nodeName + ) { + index++; + } + sibling = sibling.previousSibling; + } + + const tagName = currentElement.nodeName.toLowerCase(); + const xpathIndex = index > 0 ? `[${index + 1}]` : ""; + segments.unshift(`${tagName}${xpathIndex}`); + + currentElement = currentElement.parentNode; + } + + return segments.join("/"); + } + + /** + * Checks if a text node is visible. 
+ */ + function isTextNodeVisible(textNode) { + try { + const range = document.createRange(); + range.selectNodeContents(textNode); + const rect = range.getBoundingClientRect(); + + // Simple size check + if (rect.width === 0 || rect.height === 0) { + return false; + } + + // Simple viewport check without scroll calculations + const isInViewport = !( + rect.bottom < -viewportExpansion || + rect.top > window.innerHeight + viewportExpansion || + rect.right < -viewportExpansion || + rect.left > window.innerWidth + viewportExpansion + ); + + // Check parent visibility + const parentElement = textNode.parentElement; + if (!parentElement) return false; + + try { + return isInViewport && parentElement.checkVisibility({ + checkOpacity: true, + checkVisibilityCSS: true, + }); + } catch (e) { + // Fallback if checkVisibility is not supported + const style = window.getComputedStyle(parentElement); + return isInViewport && + style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0'; + } + } catch (e) { + console.warn('Error checking text node visibility:', e); + return false; + } + } + + // Helper function to check if element is accepted + function isElementAccepted(element) { + if (!element || !element.tagName) return false; + + // Always accept body and common container elements + const alwaysAccept = new Set([ + "body", "div", "main", "article", "section", "nav", "header", "footer" + ]); + const tagName = element.tagName.toLowerCase(); + + if (alwaysAccept.has(tagName)) return true; + + const leafElementDenyList = new Set([ + "svg", + "script", + "style", + "link", + "meta", + "noscript", + "template", + ]); + + return !leafElementDenyList.has(tagName); + } + + /** + * Checks if an element is visible. 
+ */ + function isElementVisible(element) { + const style = getCachedComputedStyle(element); + return ( + element.offsetWidth > 0 && + element.offsetHeight > 0 && + style.visibility !== "hidden" && + style.display !== "none" + ); + } + + /** + * Checks if an element is interactive. + */ + function isInteractiveElement(element) { + if (!element || element.nodeType !== Node.ELEMENT_NODE) { + return false; + } + + // Special handling for cookie banner elements + const isCookieBannerElement = + (typeof element.closest === 'function') && ( + element.closest('[id*="onetrust"]') || + element.closest('[class*="onetrust"]') || + element.closest('[data-nosnippet="true"]') || + element.closest('[aria-label*="cookie"]') + ); + + if (isCookieBannerElement) { + // Check if it's a button or interactive element within the banner + if ( + element.tagName.toLowerCase() === 'button' || + element.getAttribute('role') === 'button' || + element.onclick || + element.getAttribute('onclick') || + (element.classList && ( + element.classList.contains('ot-sdk-button') || + element.classList.contains('accept-button') || + element.classList.contains('reject-button') + )) || + element.getAttribute('aria-label')?.toLowerCase().includes('accept') || + element.getAttribute('aria-label')?.toLowerCase().includes('reject') + ) { + return true; + } + } + + // Base interactive elements and roles + const interactiveElements = new Set([ + "a", "button", "details", "embed", "input", "menu", "menuitem", + "object", "select", "textarea", "canvas", "summary", "dialog", + "banner" + ]); + + const interactiveRoles = new Set(['button-icon', 'dialog', 'button-text-icon-only', 'treeitem', 'alert', 'grid', 'progressbar', 'radio', 'checkbox', 'menuitem', 'option', 'switch', 'dropdown', 'scrollbar', 'combobox', 'a-button-text', 'button', 'region', 'textbox', 'tabpanel', 'tab', 'click', 'button-text', 'spinbutton', 'a-button-inner', 'link', 'menu', 'slider', 'listbox', 'a-dropdown-button', 'button-icon-only', 
'searchbox', 'menuitemradio', 'tooltip', 'tree', 'menuitemcheckbox']); + + const tagName = element.tagName.toLowerCase(); + const role = element.getAttribute("role"); + const ariaRole = element.getAttribute("aria-role"); + const tabIndex = element.getAttribute("tabindex"); + + // Add check for specific class + const hasAddressInputClass = element.classList && ( + element.classList.contains("address-input__container__input") || + element.classList.contains("nav-btn") || + element.classList.contains("pull-left") + ); + + // Added enhancement to capture dropdown interactive elements + if (element.classList && ( + element.classList.contains('dropdown-toggle') || + element.getAttribute('data-toggle') === 'dropdown' || + element.getAttribute('aria-haspopup') === 'true' + )) { + return true; + } + + // Basic role/attribute checks + const hasInteractiveRole = + hasAddressInputClass || + interactiveElements.has(tagName) || + interactiveRoles.has(role) || + interactiveRoles.has(ariaRole) || + (tabIndex !== null && + tabIndex !== "-1" && + element.parentElement?.tagName.toLowerCase() !== "body") || + element.getAttribute("data-action") === "a-dropdown-select" || + element.getAttribute("data-action") === "a-dropdown-button"; + + if (hasInteractiveRole) return true; + + // Additional checks for cookie banners and consent UI + const isCookieBanner = + element.id?.toLowerCase().includes('cookie') || + element.id?.toLowerCase().includes('consent') || + element.id?.toLowerCase().includes('notice') || + (element.classList && ( + element.classList.contains('otCenterRounded') || + element.classList.contains('ot-sdk-container') + )) || + element.getAttribute('data-nosnippet') === 'true' || + element.getAttribute('aria-label')?.toLowerCase().includes('cookie') || + element.getAttribute('aria-label')?.toLowerCase().includes('consent') || + (element.tagName.toLowerCase() === 'div' && ( + element.id?.includes('onetrust') || + (element.classList && ( + element.classList.contains('onetrust') 
|| + element.classList.contains('cookie') || + element.classList.contains('consent') + )) + )); + + if (isCookieBanner) return true; + + // Additional check for buttons in cookie banners + const isInCookieBanner = typeof element.closest === 'function' && element.closest( + '[id*="cookie"],[id*="consent"],[class*="cookie"],[class*="consent"],[id*="onetrust"]' + ); + + if (isInCookieBanner && ( + element.tagName.toLowerCase() === 'button' || + element.getAttribute('role') === 'button' || + (element.classList && element.classList.contains('button')) || + element.onclick || + element.getAttribute('onclick') + )) { + return true; + } + + // Get computed style + const style = window.getComputedStyle(element); + + // Check for event listeners + const hasClickHandler = + element.onclick !== null || + element.getAttribute("onclick") !== null || + element.hasAttribute("ng-click") || + element.hasAttribute("@click") || + element.hasAttribute("v-on:click"); + + // Helper function to safely get event listeners + function getEventListeners(el) { + try { + return window.getEventListeners?.(el) || {}; + } catch (e) { + const listeners = {}; + const eventTypes = [ + "click", + "mousedown", + "mouseup", + "touchstart", + "touchend", + "keydown", + "keyup", + "focus", + "blur", + ]; + + for (const type of eventTypes) { + const handler = el[`on${type}`]; + if (handler) { + listeners[type] = [{ listener: handler, useCapture: false }]; + } + } + return listeners; + } + } + + // Check for click-related events + const listeners = getEventListeners(element); + const hasClickListeners = + listeners && + (listeners.click?.length > 0 || + listeners.mousedown?.length > 0 || + listeners.mouseup?.length > 0 || + listeners.touchstart?.length > 0 || + listeners.touchend?.length > 0); + + // Check for ARIA properties + const hasAriaProps = + element.hasAttribute("aria-expanded") || + element.hasAttribute("aria-pressed") || + element.hasAttribute("aria-selected") || + 
element.hasAttribute("aria-checked"); + + const isContentEditable = element.getAttribute("contenteditable") === "true" || + element.isContentEditable || + element.id === "tinymce" || + element.classList.contains("mce-content-body") || + (element.tagName.toLowerCase() === "body" && element.getAttribute("data-id")?.startsWith("mce_")); + + // Check if element is draggable + const isDraggable = + element.draggable || element.getAttribute("draggable") === "true"; + + return ( + hasAriaProps || + hasClickHandler || + hasClickListeners || + isDraggable || + isContentEditable + ); + } + + /** + * Checks if an element is the topmost element at its position. + */ + function isTopElement(element) { + const rect = getCachedBoundingRect(element); + + // If element is not in viewport, consider it top + const isInViewport = ( + rect.left < window.innerWidth && + rect.right > 0 && + rect.top < window.innerHeight && + rect.bottom > 0 + ); + + if (!isInViewport) { + return true; + } + + // Find the correct document context and root element + let doc = element.ownerDocument; + + // If we're in an iframe, elements are considered top by default + if (doc !== window.document) { + return true; + } + + // For shadow DOM, we need to check within its own root context + const shadowRoot = element.getRootNode(); + if (shadowRoot instanceof ShadowRoot) { + const centerX = rect.left + rect.width / 2; + const centerY = rect.top + rect.height / 2; + + try { + const topEl = measureDomOperation( + () => shadowRoot.elementFromPoint(centerX, centerY), + 'elementFromPoint' + ); + if (!topEl) return false; + + let current = topEl; + while (current && current !== shadowRoot) { + if (current === element) return true; + current = current.parentElement; + } + return false; + } catch (e) { + return true; + } + } + + // For elements in viewport, check if they're topmost + const centerX = rect.left + rect.width / 2; + const centerY = rect.top + rect.height / 2; + + try { + const topEl = 
/**
 * Cheap pre-filter deciding whether an element is worth treating as a
 * potentially interactive node.
 *
 * Returns true for natively interactive tags, or for any element carrying an
 * `onclick`, `role`, `tabindex`, `data-action` or `aria-*` attribute.
 *
 * The previous version tested `hasAttribute("aria-")`, which only matches an
 * attribute whose full name is literally `aria-`; real ARIA attributes
 * (`aria-label`, `aria-expanded`, ...) were never detected. The attribute
 * names are now scanned for the `aria-` prefix.
 *
 * @param {Node} element - Node to test.
 * @returns {boolean} True when the node is an element that looks interactive.
 */
function isInteractiveCandidate(element) {
  if (!element || element.nodeType !== Node.ELEMENT_NODE) return false;

  // Fast path for natively interactive tags (no per-call allocation).
  switch (element.tagName.toLowerCase()) {
    case "a":
    case "button":
    case "input":
    case "select":
    case "textarea":
    case "details":
    case "summary":
      return true;
    default:
      break;
  }

  // Exact-name checks first: they are cheaper than enumerating attributes.
  if (
    element.hasAttribute("onclick") ||
    element.hasAttribute("role") ||
    element.hasAttribute("tabindex") ||
    element.hasAttribute("data-action")
  ) {
    return true;
  }

  // Any ARIA attribute marks the element as a candidate.
  const attributeNames = element.getAttributeNames?.() || [];
  return attributeNames.some((name) => name.startsWith("aria-"));
}
isTextNodeVisible(node), + }; + if (debugMode) PERF_METRICS.nodeMetrics.processedNodes++; + return id; + } + + // Quick checks for element nodes + if (node.nodeType === Node.ELEMENT_NODE && !isElementAccepted(node)) { + if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++; + return null; + } + + // Early viewport check - only filter out elements clearly outside viewport + if (viewportExpansion !== -1) { + const rect = getCachedBoundingRect(node); + const style = getCachedComputedStyle(node); + + // Skip viewport check for fixed/sticky elements as they may appear anywhere + const isFixedOrSticky = style && (style.position === 'fixed' || style.position === 'sticky'); + + // Check if element has actual dimensions + const hasSize = node.offsetWidth > 0 || node.offsetHeight > 0; + + if (!rect || (!isFixedOrSticky && !hasSize && ( + rect.bottom < -viewportExpansion || + rect.top > window.innerHeight + viewportExpansion || + rect.right < -viewportExpansion || + rect.left > window.innerWidth + viewportExpansion + ))) { + if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++; + return null; + } + } + + // Process element node + const nodeData = { + tagName: node.tagName.toLowerCase(), + attributes: {}, + xpath: getXPathTree(node, true), + children: [], + }; + + // Get attributes for interactive elements or potential text containers + if (isInteractiveCandidate(node) || node.tagName.toLowerCase() === 'iframe' || node.tagName.toLowerCase() === 'body') { + const attributeNames = node.getAttributeNames?.() || []; + for (const name of attributeNames) { + nodeData.attributes[name] = node.getAttribute(name); + } + } + + // if (isInteractiveCandidate(node)) { + + // Check interactivity + if (node.nodeType === Node.ELEMENT_NODE) { + nodeData.isVisible = isElementVisible(node); + if (nodeData.isVisible) { + nodeData.isTopElement = isTopElement(node); + if (nodeData.isTopElement) { + nodeData.isInteractive = isInteractiveElement(node); + if (nodeData.isInteractive) { + 
nodeData.isInViewport = true; + nodeData.highlightIndex = highlightIndex++; + + if (doHighlightElements) { + if (focusHighlightIndex >= 0) { + if (focusHighlightIndex === nodeData.highlightIndex) { + highlightElement(node, nodeData.highlightIndex, parentIframe); + } + } else { + highlightElement(node, nodeData.highlightIndex, parentIframe); + } + } + } + } + } + } + + // Process children, with special handling for iframes and rich text editors + if (node.tagName) { + const tagName = node.tagName.toLowerCase(); + + // Handle iframes + if (tagName === "iframe") { + try { + const iframeDoc = node.contentDocument || node.contentWindow?.document; + if (iframeDoc) { + for (const child of iframeDoc.childNodes) { + const domElement = buildDomTree(child, node); + if (domElement) nodeData.children.push(domElement); + } + } + } catch (e) { + console.warn("Unable to access iframe:", e); + } + } + // Handle rich text editors and contenteditable elements + else if ( + node.isContentEditable || + node.getAttribute("contenteditable") === "true" || + node.id === "tinymce" || + node.classList.contains("mce-content-body") || + (tagName === "body" && node.getAttribute("data-id")?.startsWith("mce_")) + ) { + // Process all child nodes to capture formatted text + for (const child of node.childNodes) { + const domElement = buildDomTree(child, parentIframe); + if (domElement) nodeData.children.push(domElement); + } + } + // Handle shadow DOM + else if (node.shadowRoot) { + nodeData.shadowRoot = true; + for (const child of node.shadowRoot.childNodes) { + const domElement = buildDomTree(child, parentIframe); + if (domElement) nodeData.children.push(domElement); + } + } + // Handle regular elements + else { + for (const child of node.childNodes) { + const domElement = buildDomTree(child, parentIframe); + if (domElement) nodeData.children.push(domElement); + } + } + } + + // Skip empty anchor tags + if (nodeData.tagName === 'a' && nodeData.children.length === 0 && !nodeData.attributes.href) { 
+ if (debugMode) PERF_METRICS.nodeMetrics.skippedNodes++; + return null; + } + + const id = `${ID.current++}`; + DOM_HASH_MAP[id] = nodeData; + if (debugMode) PERF_METRICS.nodeMetrics.processedNodes++; + return id; + } + + // After all functions are defined, wrap them with performance measurement + // Remove buildDomTree from here as we measure it separately + highlightElement = measureTime(highlightElement); + isInteractiveElement = measureTime(isInteractiveElement); + isElementVisible = measureTime(isElementVisible); + isTopElement = measureTime(isTopElement); + isInExpandedViewport = measureTime(isInExpandedViewport); + isTextNodeVisible = measureTime(isTextNodeVisible); + getEffectiveScroll = measureTime(getEffectiveScroll); + + const rootId = buildDomTree(document.body); + + // Clear the cache before starting + DOM_CACHE.clearCache(); + + // Only process metrics in debug mode + if (debugMode && PERF_METRICS) { + // Convert timings to seconds and add useful derived metrics + Object.keys(PERF_METRICS.timings).forEach(key => { + PERF_METRICS.timings[key] = PERF_METRICS.timings[key] / 1000; + }); + + Object.keys(PERF_METRICS.buildDomTreeBreakdown).forEach(key => { + if (typeof PERF_METRICS.buildDomTreeBreakdown[key] === 'number') { + PERF_METRICS.buildDomTreeBreakdown[key] = PERF_METRICS.buildDomTreeBreakdown[key] / 1000; + } + }); + + // Add some useful derived metrics + if (PERF_METRICS.buildDomTreeBreakdown.buildDomTreeCalls > 0) { + PERF_METRICS.buildDomTreeBreakdown.averageTimePerNode = + PERF_METRICS.buildDomTreeBreakdown.totalTime / PERF_METRICS.buildDomTreeBreakdown.buildDomTreeCalls; + } + + PERF_METRICS.buildDomTreeBreakdown.timeInChildCalls = + PERF_METRICS.buildDomTreeBreakdown.totalTime - PERF_METRICS.buildDomTreeBreakdown.totalSelfTime; + + // Add average time per operation to the metrics + Object.keys(PERF_METRICS.buildDomTreeBreakdown.domOperations).forEach(op => { + const time = PERF_METRICS.buildDomTreeBreakdown.domOperations[op]; + const count = 
PERF_METRICS.buildDomTreeBreakdown.domOperationCounts[op]; + if (count > 0) { + PERF_METRICS.buildDomTreeBreakdown.domOperations[`${op}Average`] = time / count; + } + }); + + // Calculate cache hit rates + const boundingRectTotal = PERF_METRICS.cacheMetrics.boundingRectCacheHits + PERF_METRICS.cacheMetrics.boundingRectCacheMisses; + const computedStyleTotal = PERF_METRICS.cacheMetrics.computedStyleCacheHits + PERF_METRICS.cacheMetrics.computedStyleCacheMisses; + + if (boundingRectTotal > 0) { + PERF_METRICS.cacheMetrics.boundingRectHitRate = PERF_METRICS.cacheMetrics.boundingRectCacheHits / boundingRectTotal; + } + + if (computedStyleTotal > 0) { + PERF_METRICS.cacheMetrics.computedStyleHitRate = PERF_METRICS.cacheMetrics.computedStyleCacheHits / computedStyleTotal; + } + + if ((boundingRectTotal + computedStyleTotal) > 0) { + PERF_METRICS.cacheMetrics.overallHitRate = + (PERF_METRICS.cacheMetrics.boundingRectCacheHits + PERF_METRICS.cacheMetrics.computedStyleCacheHits) / + (boundingRectTotal + computedStyleTotal); + } + } + + return debugMode ? 
class HistoryTreeProcessor:
    """
    Operations on the DOM elements

    Converts live DOM element nodes into history records and matches history
    records back to nodes by comparing hashes of the branch path, the
    attributes and the xpath.

    @dev be careful - text nodes can change even if elements stay the same
    """

    @staticmethod
    def convert_dom_element_to_history_element(dom_element: DOMElementNode) -> DOMHistoryElement:
        """Build a ``DOMHistoryElement`` snapshot from a DOM element node.

        Copies tag name, xpath, highlight index, attributes, shadow-root flag
        and coordinate/viewport data, and adds the branch path and a CSS
        selector computed for the element.
        """
        # Imported inside the function rather than at module level.
        # NOTE(review): presumably to avoid a circular import - confirm.
        from browser_use.browser.context import BrowserContext

        parent_branch_path = HistoryTreeProcessor._get_parent_branch_path(dom_element)
        css_selector = BrowserContext._enhanced_css_selector_for_element(dom_element)
        return DOMHistoryElement(
            dom_element.tag_name,
            dom_element.xpath,
            dom_element.highlight_index,
            parent_branch_path,
            dom_element.attributes,
            dom_element.shadow_root,
            css_selector=css_selector,
            page_coordinates=dom_element.page_coordinates,
            viewport_coordinates=dom_element.viewport_coordinates,
            viewport_info=dom_element.viewport_info,
        )

    @staticmethod
    def find_history_element_in_tree(dom_history_element: DOMHistoryElement, tree: DOMElementNode) -> Optional[DOMElementNode]:
        """Depth-first search of ``tree`` for the node matching a history element.

        Only nodes that carry a highlight index are compared. Returns the
        first node whose hash equals the history element's hash, or ``None``.
        """
        hashed_dom_history_element = HistoryTreeProcessor._hash_dom_history_element(dom_history_element)

        def process_node(node: DOMElementNode):
            if node.highlight_index is not None:
                hashed_node = HistoryTreeProcessor._hash_dom_element(node)
                if hashed_node == hashed_dom_history_element:
                    return node
            for child in node.children:
                # Text nodes have no children and no highlight index; skip them.
                if isinstance(child, DOMElementNode):
                    result = process_node(child)
                    if result is not None:
                        return result
            return None

        return process_node(tree)

    @staticmethod
    def compare_history_element_and_dom_element(dom_history_element: DOMHistoryElement, dom_element: DOMElementNode) -> bool:
        """Return True when the history element and the DOM element hash identically."""
        hashed_dom_history_element = HistoryTreeProcessor._hash_dom_history_element(dom_history_element)
        hashed_dom_element = HistoryTreeProcessor._hash_dom_element(dom_element)

        return hashed_dom_history_element == hashed_dom_element

    @staticmethod
    def _hash_dom_history_element(dom_history_element: DOMHistoryElement) -> HashedDomElement:
        """Hash a history element from its stored branch path, attributes and xpath."""
        branch_path_hash = HistoryTreeProcessor._parent_branch_path_hash(dom_history_element.entire_parent_branch_path)
        attributes_hash = HistoryTreeProcessor._attributes_hash(dom_history_element.attributes)
        xpath_hash = HistoryTreeProcessor._xpath_hash(dom_history_element.xpath)

        return HashedDomElement(branch_path_hash, attributes_hash, xpath_hash)

    @staticmethod
    def _hash_dom_element(dom_element: DOMElementNode) -> HashedDomElement:
        """Hash a live DOM element the same way as a history element.

        The branch path is recomputed from the node's parent chain; text is
        deliberately left out of the hash (see the class docstring).
        """
        parent_branch_path = HistoryTreeProcessor._get_parent_branch_path(dom_element)
        branch_path_hash = HistoryTreeProcessor._parent_branch_path_hash(parent_branch_path)
        attributes_hash = HistoryTreeProcessor._attributes_hash(dom_element.attributes)
        xpath_hash = HistoryTreeProcessor._xpath_hash(dom_element.xpath)
        # text_hash = DomTreeProcessor._text_hash(dom_element)

        return HashedDomElement(branch_path_hash, attributes_hash, xpath_hash)

    @staticmethod
    def _get_parent_branch_path(dom_element: DOMElementNode) -> list[str]:
        """Return tag names from the topmost non-root ancestor down to the element.

        The element itself is included; the root node (the one whose
        ``parent`` is ``None``) is not, because the walk stops before
        appending it.
        """
        parents: list[DOMElementNode] = []
        current_element: DOMElementNode = dom_element
        while current_element.parent is not None:
            parents.append(current_element)
            current_element = current_element.parent

        parents.reverse()

        return [parent.tag_name for parent in parents]

    @staticmethod
    def _parent_branch_path_hash(parent_branch_path: list[str]) -> str:
        """SHA-256 hex digest of the branch path joined with ``/``."""
        parent_branch_path_string = '/'.join(parent_branch_path)
        return hashlib.sha256(parent_branch_path_string.encode()).hexdigest()

    @staticmethod
    def _attributes_hash(attributes: dict[str, str]) -> str:
        """SHA-256 hex digest of the ``key=value`` pairs, ordered by key.

        Sorting makes the hash independent of dict insertion order, so the
        same set of attributes always identifies the same element even when
        the attributes were collected in a different order.
        """
        # Keys are unique, so sorting the items never has to compare values.
        attributes_string = ''.join(f'{key}={value}' for key, value in sorted(attributes.items()))
        return hashlib.sha256(attributes_string.encode()).hexdigest()

    @staticmethod
    def _xpath_hash(xpath: str) -> str:
        """SHA-256 hex digest of the xpath string."""
        return hashlib.sha256(xpath.encode()).hexdigest()

    @staticmethod
    def _text_hash(dom_element: DOMElementNode) -> str:
        """SHA-256 hex digest of the element's text up to the next clickable element.

        Currently not part of ``HashedDomElement``.
        """
        text_string = dom_element.get_all_text_till_next_clickable_element()
        return hashlib.sha256(text_string.encode()).hexdigest()
class DomService:
    """Builds a Python DOM tree for a page by running ``buildDomTree.js`` in it."""

    def __init__(self, page: 'Page'):
        """Store the page and load the DOM-extraction script from the package."""
        self.page = page
        self.xpath_cache = {}

        self.js_code = resources.read_text('browser_use.dom', 'buildDomTree.js')

    # region - Clickable elements
    @time_execution_async('--get_clickable_elements')
    async def get_clickable_elements(
        self,
        highlight_elements: bool = True,
        focus_element: int = -1,
        viewport_expansion: int = 0,
    ) -> DOMState:
        """Extract the DOM and return it with the map of highlighted elements.

        Args:
            highlight_elements: forwarded to the script as ``doHighlightElements``.
            focus_element: forwarded as ``focusHighlightIndex``.
            viewport_expansion: forwarded as ``viewportExpansion``.

        Returns:
            A ``DOMState`` holding the root element node and the selector map.
        """
        element_tree, selector_map = await self._build_dom_tree(highlight_elements, focus_element, viewport_expansion)
        return DOMState(element_tree=element_tree, selector_map=selector_map)

    @time_execution_async('--build_dom_tree')
    async def _build_dom_tree(
        self,
        highlight_elements: bool,
        focus_element: int,
        viewport_expansion: int,
    ) -> tuple[DOMElementNode, SelectorMap]:
        """Run the extraction script in the page and convert its result.

        Raises:
            ValueError: if the page cannot evaluate a trivial expression, or
                if the script result has no usable root element.
            Exception: any error raised while evaluating the script is logged
                and re-raised unchanged.
        """
        # Sanity check that the page can run JavaScript at all.
        if await self.page.evaluate('1+1') != 2:
            raise ValueError('The page cannot evaluate javascript code properly')

        # NOTE: We execute JS code in the browser to extract important DOM information.
        #       The returned hash map contains information about the DOM tree and the
        #       relationship between the DOM elements.
        debug_mode = logger.getEffectiveLevel() == logging.DEBUG
        args = {
            'doHighlightElements': highlight_elements,
            'focusHighlightIndex': focus_element,
            'viewportExpansion': viewport_expansion,
            'debugMode': debug_mode,
        }

        try:
            eval_page = await self.page.evaluate(self.js_code, args)
        except Exception as e:
            logger.error('Error evaluating JavaScript: %s', e)
            raise

        # Only log performance metrics in debug mode
        if debug_mode and 'perfMetrics' in eval_page:
            logger.debug('DOM Tree Building Performance Metrics:\n%s', json.dumps(eval_page['perfMetrics'], indent=2))

        return await self._construct_dom_tree(eval_page)

    @time_execution_async('--construct_dom_tree')
    async def _construct_dom_tree(
        self,
        eval_page: dict,
    ) -> tuple[DOMElementNode, SelectorMap]:
        """Turn the script's flat ``map`` / ``rootId`` result into linked nodes.

        Returns:
            The root ``DOMElementNode`` and a dict mapping highlight index to node.

        Raises:
            ValueError: if the root id is missing from the map or does not
                resolve to an element node.
        """
        js_node_map = eval_page['map']
        js_root_id = eval_page['rootId']

        selector_map = {}
        node_map = {}

        for node_id, node_data in js_node_map.items():
            node, children_ids = self._parse_node(node_data)
            if node is None:
                continue

            node_map[node_id] = node

            if isinstance(node, DOMElementNode) and node.highlight_index is not None:
                selector_map[node.highlight_index] = node

            # NOTE: We know that we are building the tree bottom up
            #       and all children are already processed.
            if isinstance(node, DOMElementNode):
                for child_id in children_ids:
                    if child_id not in node_map:
                        continue

                    child_node = node_map[child_id]

                    child_node.parent = node
                    node.children.append(child_node)

        # .get() instead of indexing: a missing root must reach the ValueError
        # below rather than escape as a KeyError.
        root_node = node_map.get(str(js_root_id))

        # Drop the intermediate maps before forcing a collection.
        del node_map
        del js_node_map
        del js_root_id

        gc.collect()

        if root_node is None or not isinstance(root_node, DOMElementNode):
            raise ValueError('Failed to parse HTML to dictionary')

        return root_node, selector_map

    def _parse_node(
        self,
        node_data: dict,
    ) -> tuple[Optional[DOMBaseNode], list[str]]:
        """Convert one entry of the script's node map into a DOM node.

        Returns:
            ``(None, [])`` for empty data, ``(DOMTextNode, [])`` for text
            entries, otherwise ``(DOMElementNode, children_ids)`` where the
            ids are keys of the same node map. Parent and child links are
            left empty; the caller wires them up.
        """
        if not node_data:
            return None, []

        # Process text nodes immediately
        if node_data.get('type') == 'TEXT_NODE':
            text_node = DOMTextNode(
                text=node_data['text'],
                is_visible=node_data['isVisible'],
                parent=None,
            )
            return text_node, []

        # Process coordinates if they exist for element nodes

        viewport_info = None

        if 'viewport' in node_data:
            viewport_info = ViewportInfo(
                width=node_data['viewport']['width'],
                height=node_data['viewport']['height'],
            )

        element_node = DOMElementNode(
            tag_name=node_data['tagName'],
            xpath=node_data['xpath'],
            attributes=node_data.get('attributes', {}),
            children=[],
            is_visible=node_data.get('isVisible', False),
            is_interactive=node_data.get('isInteractive', False),
            is_top_element=node_data.get('isTopElement', False),
            is_in_viewport=node_data.get('isInViewport', False),
            highlight_index=node_data.get('highlightIndex'),
            shadow_root=node_data.get('shadowRoot', False),
            parent=None,
            viewport_info=viewport_info,
        )

        children_ids = node_data.get('children', [])

        return element_node, children_ids
async def analyze_page_structure(url: str):
    """Open ``url`` in a visible browser and print debugging details about the page.

    Prints, in order: the viewport size and scroll offsets, every element whose
    id or class contains "cookie" or "consent", every element with fixed or
    sticky positioning, and the structure reported by
    ``ctx.get_page_structure()``. Then waits for Enter on stdin. The browser is
    closed in all cases.

    Args:
        url: Address of the page to load and analyze.
    """
    browser = Browser(
        config=BrowserConfig(
            headless=False,  # Set to True if you don't need to see the browser
        )
    )

    context = BrowserContext(browser=browser)

    try:
        async with context as ctx:
            # Navigate to the URL
            page = await ctx.get_current_page()
            await page.goto(url)
            # Wait until network activity settles before measuring anything.
            await page.wait_for_load_state('networkidle')

            # Get viewport dimensions
            viewport_info = await page.evaluate("""() => {
                return {
                    viewport: {
                        width: window.innerWidth,
                        height: window.innerHeight,
                        scrollX: window.scrollX,
                        scrollY: window.scrollY
                    }
                }
            }""")

            print('\nViewport Information:')
            print(f'Width: {viewport_info["viewport"]["width"]}')
            print(f'Height: {viewport_info["viewport"]["height"]}')
            print(f'ScrollX: {viewport_info["viewport"]["scrollX"]}')
            print(f'ScrollY: {viewport_info["viewport"]["scrollY"]}')

            # Enhanced debug information for cookie consent and fixed position elements
            # The script returns two lists of plain objects (tag, id, className,
            # position, rect, z-index, visibility, display, opacity).
            debug_info = await page.evaluate("""() => {
                function getElementInfo(element) {
                    const rect = element.getBoundingClientRect();
                    const style = window.getComputedStyle(element);
                    return {
                        tag: element.tagName.toLowerCase(),
                        id: element.id,
                        className: element.className,
                        position: style.position,
                        rect: {
                            top: rect.top,
                            right: rect.right,
                            bottom: rect.bottom,
                            left: rect.left,
                            width: rect.width,
                            height: rect.height
                        },
                        isFixed: style.position === 'fixed',
                        isSticky: style.position === 'sticky',
                        zIndex: style.zIndex,
                        visibility: style.visibility,
                        display: style.display,
                        opacity: style.opacity
                    };
                }

                // Find cookie-related elements
                const cookieElements = Array.from(document.querySelectorAll('[id*="cookie"], [id*="consent"], [class*="cookie"], [class*="consent"]'));
                const fixedElements = Array.from(document.querySelectorAll('*')).filter(el => {
                    const style = window.getComputedStyle(el);
                    return style.position === 'fixed' || style.position === 'sticky';
                });

                return {
                    cookieElements: cookieElements.map(getElementInfo),
                    fixedElements: fixedElements.map(getElementInfo)
                };
            }""")

            print('\nCookie-related Elements:')
            for elem in debug_info['cookieElements']:
                print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}')
                print(f'Position: {elem["position"]}')
                print(f'Rect: {elem["rect"]}')
                print(f'Z-Index: {elem["zIndex"]}')
                print(f'Visibility: {elem["visibility"]}')
                print(f'Display: {elem["display"]}')
                print(f'Opacity: {elem["opacity"]}')

            print('\nFixed/Sticky Position Elements:')
            for elem in debug_info['fixedElements']:
                print(f'\nElement: {elem["tag"]}#{elem["id"]} .{elem["className"]}')
                print(f'Position: {elem["position"]}')
                print(f'Rect: {elem["rect"]}')
                print(f'Z-Index: {elem["zIndex"]}')

            print(f'\nPage Structure for {url}:\n')
            structure = await ctx.get_page_structure()
            print(structure)

            # Keep the window open until the operator is done inspecting it.
            input('Press Enter to close the browser...')
    finally:
        # Runs on success and on error so the browser is not left running.
        await browser.close()
async def test_process_html_file():
    """Interactively compare DOM extraction results across viewport expansions.

    For each website, runs ``DomService.get_clickable_elements`` once per
    expansion value, prints the number of highlighted elements (and the token
    count when a token counter is available), waits for Enter between runs,
    and finally prints a summary relative to the first expansion.
    """
    config = BrowserContextConfig(
        cookies_file='cookies3.json',
        disable_security=True,
        wait_for_network_idle_page_load_time=2,
    )

    browser = Browser(
        config=BrowserConfig(
            # chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        )
    )
    context = BrowserContext(browser=browser, config=config)  # noqa: F821

    websites = [
        'https://kayak.com/flights',
        'https://immobilienscout24.de',
        'https://google.com',
        'https://amazon.com',
        'https://github.com',
    ]

    # NOTE(review): `count_string_tokens` is not imported by this module, so the
    # original unconditional call raised NameError on the first run. Look it up
    # defensively and report 'n/a' until the proper import is added.
    count_tokens = globals().get('count_string_tokens')

    def describe(expansion: int) -> str:
        # Only -1 disables viewport filtering in buildDomTree.js; any other
        # value, including other negatives, is a pixel margin.
        return 'All elements (-1)' if expansion == -1 else f'Expansion {expansion}px'

    async with context as context:
        page = await context.get_current_page()
        dom_service = DomService(page)

        for website in websites:
            print(f'\n{"=" * 50}\nTesting {website}\n{"=" * 50}')
            await page.goto(website)
            # Additional wait for dynamic content; asyncio.sleep keeps the event
            # loop (and therefore the browser connection) responsive.
            await asyncio.sleep(2)

            async def test_viewport(expansion: int, description: str):
                print(f'\n{description}:')
                # NOTE(review): time_execution_sync wraps an async method here, so
                # it presumably times only coroutine creation - confirm.
                dom_state = await time_execution_sync(f'get_clickable_elements ({description})')(
                    dom_service.get_clickable_elements
                )(highlight_elements=True, viewport_expansion=expansion)

                elements = dom_state.element_tree
                selector_map = dom_state.selector_map
                element_count = len(selector_map)
                if count_tokens is not None:
                    token_count = count_tokens(elements.clickable_elements_to_string(), model='gpt-4o')
                else:
                    token_count = 'n/a'

                print(f'Number of elements: {element_count}')
                print(f'Token count: {token_count}')
                return element_count, token_count

            expansions = [0, 100, 200, 300, 400, 500, 600, 1000, -1, -200]
            results = []

            for i, expansion in enumerate(expansions):
                description = f'{i + 1}. {describe(expansion)}'
                count, tokens = await test_viewport(expansion, description)
                results.append((count, tokens))
                input('Press Enter to continue...')
                await page.evaluate('document.getElementById("playwright-highlight-container")?.remove()')

            # Print comparison summary
            print('\nComparison Summary:')
            initial_count = results[0][0]
            for expansion, (count, tokens) in zip(expansions, results):
                print(f'{describe(expansion)}: {count} elements (+{count - initial_count}), {tokens} tokens')

            input('\nPress Enter to continue to next website...')

            # Clear highlights before next website
            await page.evaluate('document.getElementById("playwright-highlight-container")?.remove()')
async def test_process_dom():
    """Time one run of ``buildDomTree.js`` on a live page and dump the result.

    Opens a visible browser, loads the page, evaluates the script read from
    ``browser_use/dom/buildDomTree.js``, prints the elapsed time, writes the
    returned value to ``./tmp/dom.json`` and waits for Enter. The browser is
    closed afterwards, even if a step fails.
    """
    # Local import: this module does not import asyncio at the top.
    import asyncio

    browser = Browser(config=BrowserConfig(headless=False))

    try:
        async with await browser.new_context() as context:
            page = await context.get_current_page()
            await page.goto('https://kayak.com/flights')
            # await page.goto('https://google.com/flights')
            # await page.goto('https://immobilienscout24.de')
            # await page.goto('https://seleniumbase.io/w3schools/iframes')

            # Give dynamic content time to load without blocking the event loop.
            await asyncio.sleep(3)

            with open('browser_use/dom/buildDomTree.js', 'r', encoding='utf-8') as f:
                js_code = f.read()

            # perf_counter is monotonic, so the interval cannot be skewed by
            # wall-clock adjustments.
            start = time.perf_counter()
            dom_tree = await page.evaluate(js_code)
            end = time.perf_counter()

            # print(dom_tree)
            print(f'Time: {end - start:.2f}s')

            os.makedirs('./tmp', exist_ok=True)
            with open('./tmp/dom.json', 'w', encoding='utf-8') as f:
                json.dump(dom_tree, f, indent=1)

            # both of these work for immobilienscout24.de
            # await page.click('.sc-dcJsrY.ezjNCe')
            # await page.click(
            # 	'div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div:nth-of-type(2) > div > div > div > button:nth-of-type(2)'
            # )

            input('Press Enter to continue...')
    finally:
        # The original never closed the browser it launched.
        await browser.close()
OR document if no shadow root or iframe). + To properly reference the element we need to recursively switch the root node until we find the element (work you way up the tree with `.parent`) + """ + + tag_name: str + xpath: str + attributes: Dict[str, str] + children: List[DOMBaseNode] + is_interactive: bool = False + is_top_element: bool = False + is_in_viewport: bool = False + shadow_root: bool = False + highlight_index: Optional[int] = None + viewport_coordinates: Optional[CoordinateSet] = None + page_coordinates: Optional[CoordinateSet] = None + viewport_info: Optional[ViewportInfo] = None + + def __repr__(self) -> str: + tag_str = f'<{self.tag_name}' + + # Add attributes + for key, value in self.attributes.items(): + tag_str += f' {key}="{value}"' + tag_str += '>' + + # Add extra info + extras = [] + if self.is_interactive: + extras.append('interactive') + if self.is_top_element: + extras.append('top') + if self.shadow_root: + extras.append('shadow-root') + if self.highlight_index is not None: + extras.append(f'highlight:{self.highlight_index}') + if self.is_in_viewport: + extras.append('in-viewport') + + if extras: + tag_str += f' [{", ".join(extras)}]' + + return tag_str + + @cached_property + def hash(self) -> HashedDomElement: + from browser_use.dom.history_tree_processor.service import ( + HistoryTreeProcessor, + ) + + return HistoryTreeProcessor._hash_dom_element(self) + + def get_all_text_till_next_clickable_element(self, max_depth: int = -1) -> str: + text_parts = [] + + def collect_text(node: DOMBaseNode, current_depth: int) -> None: + if max_depth != -1 and current_depth > max_depth: + return + + # Skip this branch if we hit a highlighted element (except for the current node) + if isinstance(node, DOMElementNode) and node != self and node.highlight_index is not None: + return + + if isinstance(node, DOMTextNode): + text_parts.append(node.text) + elif isinstance(node, DOMElementNode): + for child in node.children: + collect_text(child, current_depth + 
1) + + collect_text(self, 0) + return '\n'.join(text_parts).strip() + + @time_execution_sync('--clickable_elements_to_string') + def clickable_elements_to_string(self, include_attributes: list[str] | None = None) -> str: + """Convert the processed DOM content to HTML.""" + formatted_text = [] + + def process_node(node: DOMBaseNode, depth: int) -> None: + if isinstance(node, DOMElementNode): + # Add element with highlight_index + if node.highlight_index is not None: + attributes_str = '' + text = node.get_all_text_till_next_clickable_element() + if include_attributes: + attributes = list( + set( + [ + str(value) + for key, value in node.attributes.items() + if key in include_attributes and value != node.tag_name + ] + ) + ) + if text in attributes: + attributes.remove(text) + attributes_str = ';'.join(attributes) + line = f'[{node.highlight_index}]<{node.tag_name} ' + if attributes_str: + line += f'{attributes_str}' + if text: + if attributes_str: + line += f'>{text}' + else: + line += f'{text}' + line += '/>' + formatted_text.append(line) + + # Process children regardless + for child in node.children: + process_node(child, depth + 1) + + elif isinstance(node, DOMTextNode): + # Add text only if it doesn't have a highlighted parent + if not node.has_parent_with_highlight_index() and node.is_visible: # and node.is_parent_top_element() + formatted_text.append(f'{node.text}') + + process_node(self, 0) + return '\n'.join(formatted_text) + + def get_file_upload_element(self, check_siblings: bool = True) -> Optional['DOMElementNode']: + # Check if current element is a file input + if self.tag_name == 'input' and self.attributes.get('type') == 'file': + return self + + # Check children + for child in self.children: + if isinstance(child, DOMElementNode): + result = child.get_file_upload_element(check_siblings=False) + if result: + return result + + # Check siblings only for the initial call + if check_siblings and self.parent: + for sibling in self.parent.children: + if 
sibling is not self and isinstance(sibling, DOMElementNode): + result = sibling.get_file_upload_element(check_siblings=False) + if result: + return result + + return None + + +SelectorMap = dict[int, DOMElementNode] + + +@dataclass +class DOMState: + element_tree: DOMElementNode + selector_map: SelectorMap diff --git a/browser_use/logging_config.py b/browser_use/logging_config.py new file mode 100644 index 0000000000000000000000000000000000000000..043252bd78d0ab711c206be30b96f1a9ff36537f --- /dev/null +++ b/browser_use/logging_config.py @@ -0,0 +1,132 @@ +import logging +import os +import sys + +from dotenv import load_dotenv + +load_dotenv() + + +def addLoggingLevel(levelName, levelNum, methodName=None): + """ + Comprehensively adds a new logging level to the `logging` module and the + currently configured logging class. + + `levelName` becomes an attribute of the `logging` module with the value + `levelNum`. `methodName` becomes a convenience method for both `logging` + itself and the class returned by `logging.getLoggerClass()` (usually just + `logging.Logger`). If `methodName` is not specified, `levelName.lower()` is + used. 
+ + To avoid accidental clobberings of existing attributes, this method will + raise an `AttributeError` if the level name is already an attribute of the + `logging` module or if the method name is already present + + Example + ------- + >>> addLoggingLevel('TRACE', logging.DEBUG - 5) + >>> logging.getLogger(__name__).setLevel('TRACE') + >>> logging.getLogger(__name__).trace('that worked') + >>> logging.trace('so did this') + >>> logging.TRACE + 5 + + """ + if not methodName: + methodName = levelName.lower() + + if hasattr(logging, levelName): + raise AttributeError('{} already defined in logging module'.format(levelName)) + if hasattr(logging, methodName): + raise AttributeError('{} already defined in logging module'.format(methodName)) + if hasattr(logging.getLoggerClass(), methodName): + raise AttributeError('{} already defined in logger class'.format(methodName)) + + # This method was inspired by the answers to Stack Overflow post + # http://stackoverflow.com/q/2183233/2988730, especially + # http://stackoverflow.com/a/13638084/2988730 + def logForLevel(self, message, *args, **kwargs): + if self.isEnabledFor(levelNum): + self._log(levelNum, message, args, **kwargs) + + def logToRoot(message, *args, **kwargs): + logging.log(levelNum, message, *args, **kwargs) + + logging.addLevelName(levelNum, levelName) + setattr(logging, levelName, levelNum) + setattr(logging.getLoggerClass(), methodName, logForLevel) + setattr(logging, methodName, logToRoot) + + +def setup_logging(): + # Try to add RESULT level, but ignore if it already exists + try: + addLoggingLevel('RESULT', 35) # This allows ERROR, FATAL and CRITICAL + except AttributeError: + pass # Level already exists, which is fine + + log_type = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower() + + # Check if handlers are already set up + if logging.getLogger().hasHandlers(): + return + + # Clear existing handlers + root = logging.getLogger() + root.handlers = [] + + class BrowserUseFormatter(logging.Formatter): 
+ def format(self, record): + if type(record.name) == str and record.name.startswith('browser_use.'): + record.name = record.name.split('.')[-2] + return super().format(record) + + # Setup single handler for all loggers + console = logging.StreamHandler(sys.stdout) + + # adittional setLevel here to filter logs + if log_type == 'result': + console.setLevel('RESULT') + console.setFormatter(BrowserUseFormatter('%(message)s')) + else: + console.setFormatter(BrowserUseFormatter('%(levelname)-8s [%(name)s] %(message)s')) + + # Configure root logger only + root.addHandler(console) + + # switch cases for log_type + if log_type == 'result': + root.setLevel('RESULT') # string usage to avoid syntax error + elif log_type == 'debug': + root.setLevel(logging.DEBUG) + else: + root.setLevel(logging.INFO) + + # Configure browser_use logger + browser_use_logger = logging.getLogger('browser_use') + browser_use_logger.propagate = False # Don't propagate to root logger + browser_use_logger.addHandler(console) + browser_use_logger.setLevel(root.level) # Set same level as root logger + + logger = logging.getLogger('browser_use') + logger.info('BrowserUse logging setup complete with level %s', log_type) + # Silence third-party loggers + for logger in [ + 'WDM', + 'httpx', + 'selenium', + 'playwright', + 'urllib3', + 'asyncio', + 'langchain', + 'openai', + 'httpcore', + 'charset_normalizer', + 'anthropic._base_client', + 'PIL.PngImagePlugin', + 'trafilatura.htmlprocessing', + 'trafilatura', + ]: + third_party = logging.getLogger(logger) + third_party.setLevel(logging.ERROR) + third_party.propagate = False diff --git a/browser_use/telemetry/service.py b/browser_use/telemetry/service.py new file mode 100644 index 0000000000000000000000000000000000000000..6a2e82e45801516de284816f38de79bfd67a0f84 --- /dev/null +++ b/browser_use/telemetry/service.py @@ -0,0 +1,105 @@ +import logging +import os +import uuid +from pathlib import Path + +from dotenv import load_dotenv +from posthog import Posthog 
+ +from browser_use.telemetry.views import BaseTelemetryEvent +from browser_use.utils import singleton + +load_dotenv() + + +logger = logging.getLogger(__name__) + + +POSTHOG_EVENT_SETTINGS = { + 'process_person_profile': True, +} + + +@singleton +class ProductTelemetry: + """ + Service for capturing anonymized telemetry data. + + If the environment variable `ANONYMIZED_TELEMETRY=False`, anonymized telemetry will be disabled. + """ + + USER_ID_PATH = str(Path.home() / '.cache' / 'browser_use' / 'telemetry_user_id') + PROJECT_API_KEY = 'phc_F8JMNjW1i2KbGUTaW1unnDdLSPCoyc52SGRU0JecaUh' + HOST = 'https://eu.i.posthog.com' + UNKNOWN_USER_ID = 'UNKNOWN' + + _curr_user_id = None + + def __init__(self) -> None: + telemetry_disabled = os.getenv('ANONYMIZED_TELEMETRY', 'true').lower() == 'false' + self.debug_logging = os.getenv('BROWSER_USE_LOGGING_LEVEL', 'info').lower() == 'debug' + + if telemetry_disabled: + self._posthog_client = None + else: + logging.info( + 'Anonymized telemetry enabled. See https://docs.browser-use.com/development/telemetry for more information.' 
+ ) + self._posthog_client = Posthog( + project_api_key=self.PROJECT_API_KEY, + host=self.HOST, + disable_geoip=False, + ) + + # Silence posthog's logging + if not self.debug_logging: + posthog_logger = logging.getLogger('posthog') + posthog_logger.disabled = True + + if self._posthog_client is None: + logger.debug('Telemetry disabled') + + def capture(self, event: BaseTelemetryEvent) -> None: + if self._posthog_client is None: + return + + if self.debug_logging: + logger.debug(f'Telemetry event: {event.name} {event.properties}') + self._direct_capture(event) + + def _direct_capture(self, event: BaseTelemetryEvent) -> None: + """ + Should not be thread blocking because posthog magically handles it + """ + if self._posthog_client is None: + return + + try: + self._posthog_client.capture( + self.user_id, + event.name, + {**event.properties, **POSTHOG_EVENT_SETTINGS}, + ) + except Exception as e: + logger.error(f'Failed to send telemetry event {event.name}: {e}') + + @property + def user_id(self) -> str: + if self._curr_user_id: + return self._curr_user_id + + # File access may fail due to permissions or other reasons. We don't want to + # crash so we catch all exceptions. 
+ try: + if not os.path.exists(self.USER_ID_PATH): + os.makedirs(os.path.dirname(self.USER_ID_PATH), exist_ok=True) + with open(self.USER_ID_PATH, 'w') as f: + new_user_id = str(uuid.uuid4()) + f.write(new_user_id) + self._curr_user_id = new_user_id + else: + with open(self.USER_ID_PATH, 'r') as f: + self._curr_user_id = f.read() + except Exception: + self._curr_user_id = 'UNKNOWN_USER_ID' + return self._curr_user_id diff --git a/browser_use/telemetry/views.py b/browser_use/telemetry/views.py new file mode 100644 index 0000000000000000000000000000000000000000..fdba27303109e6ec12f763b537b2bdee9dca9d95 --- /dev/null +++ b/browser_use/telemetry/views.py @@ -0,0 +1,63 @@ +from abc import ABC, abstractmethod +from dataclasses import asdict, dataclass +from typing import Any, Dict, Sequence + + +@dataclass +class BaseTelemetryEvent(ABC): + @property + @abstractmethod + def name(self) -> str: + pass + + @property + def properties(self) -> Dict[str, Any]: + return {k: v for k, v in asdict(self).items() if k != 'name'} + + +@dataclass +class RegisteredFunction: + name: str + params: dict[str, Any] + + +@dataclass +class ControllerRegisteredFunctionsTelemetryEvent(BaseTelemetryEvent): + registered_functions: list[RegisteredFunction] + name: str = 'controller_registered_functions' + + +@dataclass +class AgentStepTelemetryEvent(BaseTelemetryEvent): + agent_id: str + step: int + step_error: list[str] + consecutive_failures: int + actions: list[dict] + name: str = 'agent_step' + + +@dataclass +class AgentRunTelemetryEvent(BaseTelemetryEvent): + agent_id: str + use_vision: bool + task: str + model_name: str + chat_model_library: str + version: str + source: str + name: str = 'agent_run' + + +@dataclass +class AgentEndTelemetryEvent(BaseTelemetryEvent): + agent_id: str + steps: int + max_steps_reached: bool + is_done: bool + success: bool | None + total_input_tokens: int + total_duration_seconds: float + + errors: Sequence[str | None] + name: str = 'agent_end' diff --git 
a/browser_use/utils.py b/browser_use/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..860b35a320d83f924483dbba53ed80bfcea27ff2 --- /dev/null +++ b/browser_use/utils.py @@ -0,0 +1,54 @@ +import logging +import time +from functools import wraps +from typing import Any, Callable, Coroutine, ParamSpec, TypeVar + +logger = logging.getLogger(__name__) + + +# Define generic type variables for return type and parameters +R = TypeVar('R') +P = ParamSpec('P') + + +def time_execution_sync(additional_text: str = '') -> Callable[[Callable[P, R]], Callable[P, R]]: + def decorator(func: Callable[P, R]) -> Callable[P, R]: + @wraps(func) + def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + start_time = time.time() + result = func(*args, **kwargs) + execution_time = time.time() - start_time + logger.debug(f'{additional_text} Execution time: {execution_time:.2f} seconds') + return result + + return wrapper + + return decorator + + +def time_execution_async( + additional_text: str = '', +) -> Callable[[Callable[P, Coroutine[Any, Any, R]]], Callable[P, Coroutine[Any, Any, R]]]: + def decorator(func: Callable[P, Coroutine[Any, Any, R]]) -> Callable[P, Coroutine[Any, Any, R]]: + @wraps(func) + async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + start_time = time.time() + result = await func(*args, **kwargs) + execution_time = time.time() - start_time + logger.debug(f'{additional_text} Execution time: {execution_time:.2f} seconds') + return result + + return wrapper + + return decorator + + +def singleton(cls): + instance = [None] + + def wrapper(*args, **kwargs): + if instance[0] is None: + instance[0] = cls(*args, **kwargs) + return instance[0] + + return wrapper diff --git a/codebeaver.yml b/codebeaver.yml new file mode 100644 index 0000000000000000000000000000000000000000..c6a0739f0ff1fa9362996665ef33c5ec2ed9edaa --- /dev/null +++ b/codebeaver.yml @@ -0,0 +1,4 @@ +environment: +- OPENAI_API_KEY=empty +- AZURE_OPENAI_API_KEY=empty +from: pytest 
diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..297403f266c035835c5c85c31dd66fabe3dd8b0b --- /dev/null +++ b/conftest.py @@ -0,0 +1,10 @@ +import os +import sys + +from browser_use.logging_config import setup_logging + +# Get the absolute path to the project root +project_root = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, project_root) + +setup_logging() diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..10f09abffd8d6d4cce4455f33152a4f616312dcb --- /dev/null +++ b/docs/README.md @@ -0,0 +1,17 @@ +# Docs + +The official documentation for Browser Use. The docs are published to [Browser Use Docs](https://docs.browser-use.com). + +### Development + +Install the [Mintlify CLI](https://www.npmjs.com/package/mintlify) to preview the documentation changes locally. To install, use the following command + +``` +npm i -g mintlify +``` + +Run the following command at the root of your documentation (where mint.json is) + +``` +mintlify dev +``` diff --git a/docs/cloud/implementation.mdx b/docs/cloud/implementation.mdx new file mode 100644 index 0000000000000000000000000000000000000000..8c894017709f692c9bf319e561318dfd5d0dfdf4 --- /dev/null +++ b/docs/cloud/implementation.mdx @@ -0,0 +1,107 @@ +--- +title: "Implementing the API" +description: "Learn how to implement the Browser Use API in Python" +icon: "code" +--- + +This guide shows how to implement common API patterns using Python. We'll create a complete example that creates and monitors a browser automation task. + +## Basic Implementation + +For all settings see [Run Task](cloud/api-v10/run-task). 
+ +Here's a simple implementation using Python's `requests` library to stream the task steps: + +```python +import json +import time + +import requests + +API_KEY = 'your_api_key_here' +BASE_URL = 'https://api.browser-use.com/api/v1' +HEADERS = {'Authorization': f'Bearer {API_KEY}'} + + +def create_task(instructions: str): + """Create a new browser automation task""" + response = requests.post(f'{BASE_URL}/run-task', headers=HEADERS, json={'task': instructions}) + return response.json()['id'] + + +def get_task_status(task_id: str): + """Get current task status""" + response = requests.get(f'{BASE_URL}/task/{task_id}/status', headers=HEADERS) + return response.json() + + +def get_task_details(task_id: str): + """Get full task details including output""" + response = requests.get(f'{BASE_URL}/task/{task_id}', headers=HEADERS) + return response.json() + + +def wait_for_completion(task_id: str, poll_interval: int = 2): + """Poll task status until completion""" + count = 0 + unique_steps = [] + while True: + details = get_task_details(task_id) + new_steps = details['steps'] + # use only the new steps that are not in unique_steps. 
+ if new_steps != unique_steps: + for step in new_steps: + if step not in unique_steps: + print(json.dumps(step, indent=4)) + unique_steps = new_steps + count += 1 + status = details['status'] + + if status in ['finished', 'failed', 'stopped']: + return details + time.sleep(poll_interval) + + +def main(): + task_id = create_task('Open https://www.google.com and search for openai') + print(f'Task created with ID: {task_id}') + task_details = wait_for_completion(task_id) + print(f"Final output: {task_details['output']}") + + +if __name__ == '__main__': + main() + +``` + +## Task Control Example + +Here's how to implement task control with pause/resume functionality: + +```python +def control_task(): + # Create a new task + task_id = create_task("Go to google.com and search for Browser Use") + + # Wait for 5 seconds + time.sleep(5) + + # Pause the task + requests.put(f"{BASE_URL}/pause-task?task_id={task_id}", headers=HEADERS) + print("Task paused! Check the live preview.") + + # Wait for user input + input("Press Enter to resume...") + + # Resume the task + requests.put(f"{BASE_URL}/resume-task?task_id={task_id}", headers=HEADERS) + + # Wait for completion + result = wait_for_completion(task_id) + print(f"Task completed with output: {result['output']}") +``` + + + Remember to handle your API key securely and implement proper error handling + in production code. + diff --git a/docs/cloud/quickstart.mdx b/docs/cloud/quickstart.mdx new file mode 100644 index 0000000000000000000000000000000000000000..68ef795f9f7e0795afb5bedfabba5d09820b533c --- /dev/null +++ b/docs/cloud/quickstart.mdx @@ -0,0 +1,103 @@ +--- +title: "Quickstart" +description: "Learn how to get started with the Browser Use Cloud API" +icon: "cloud" +--- + +The Browser Use Cloud API lets you create and manage browser automation agents programmatically. Each agent can execute tasks and provide real-time feedback through a live preview URL. 
+ +## Prerequisites + + + You need an active subscription and an API key from + [cloud.browser-use.com/billing](https://cloud.browser-use.com/billing) + + +## Pricing + +The Browser Use Cloud API is priced at $0.05 per step that the agent executes. + + + Since Browser Use can execute multiple steps at the same time, + the price for filling out forms is much lower than other services. + + +## Creating Your First Agent + +Create a new browser automation task by providing instructions in natural language: + +```bash +curl -X POST https://api.browser-use.com/api/v1/run-task \ + -H "Authorization: Bearer your_api_key_here" \ + -H "Content-Type: application/json" \ + -d '{ + "task": "Go to google.com and search for Browser Use" + }' +``` + +The API returns a task ID that you can use to manage the task and check the live preview URL. + + + The task response includes a `live_url` that you can embed in an iframe to + watch and control the agent in real-time. + + +## Managing Tasks + +Control running tasks with these operations: + + + + Temporarily pause task execution with [`/api/v1/pause-task`](/cloud/api-v1/pause-task) and resume with + [`/api/v1/resume-task`](/cloud/api-v1/resume-task). Useful for manual inspection or intervention. + + + + Permanently stop a task using [`/api/v1/stop-task`](/cloud/api-v1/stop-task). The task cannot be + resumed after being stopped. + + + +For detailed API documentation, see the tabs on the left, which include the full coverage of the API. + +## Building your own client (OpenAPI) + + + We recommend this only if you don't need control and only need to run simple + tasks. + + +The best way to build your own client is to use our [OpenAPI specification](http://api.browser-use.com/openapi.json) to generate a type-safe client library. 
+ +### Python + +Use [openapi-python-client](https://github.com/openapi-generators/openapi-python-client) to generate a modern Python client: + +```bash +# Install the generator +pipx install openapi-python-client --include-deps + +# Generate the client +openapi-python-client generate --url http://api.browser-use.com/openapi.json +``` + +This will create a Python package with full type hints, modern dataclasses, and async support. + +### TypeScript/JavaScript + +For TypeScript projects, use [openapi-typescript](https://www.npmjs.com/package/openapi-typescript) to generate type definitions: + +```bash +# Install the generator +npm install -D openapi-typescript + +# Generate the types +npx openapi-typescript http://api.browser-use.com/openapi.json -o browser-use-api.ts +``` + +This will create TypeScript definitions you can use with your preferred HTTP client. + + + Need help? Contact our support team at support@browser-use.com or join our + [Discord community](https://link.browser-use.com/discord) + diff --git a/docs/customize/agent-settings.mdx b/docs/customize/agent-settings.mdx new file mode 100644 index 0000000000000000000000000000000000000000..58371705b09fa69955709bf65528bbba3666ab75 --- /dev/null +++ b/docs/customize/agent-settings.mdx @@ -0,0 +1,215 @@ +--- +title: "Agent Settings" +description: "Learn how to configure the agent" +icon: "gear" +--- + +## Overview + +The `Agent` class is the core component of Browser Use that handles browser automation. Here are the main configuration options you can use when initializing an agent. + +## Basic Settings + +```python +from browser_use import Agent +from langchain_openai import ChatOpenAI + +agent = Agent( + task="Search for latest news about AI", + llm=ChatOpenAI(model="gpt-4o"), +) +``` + +### Required Parameters + +- `task`: The instruction for the agent to execute +- `llm`: A LangChain chat model instance. See LangChain Models for supported models. 
+ +## Agent Behavior + +Control how the agent operates: + +```python +agent = Agent( + task="your task", + llm=llm, + controller=custom_controller, # For custom tool calling + use_vision=True, # Enable vision capabilities + save_conversation_path="logs/conversation" # Save chat logs +) +``` + +### Behavior Parameters + +- `controller`: Registry of functions the agent can call. Defaults to base Controller. See Custom Functions for details. +- `use_vision`: Enable/disable vision capabilities. Defaults to `True`. + - When enabled, the model processes visual information from web pages + - Disable to reduce costs or use models without vision support + - For GPT-4o, image processing costs approximately 800-1000 tokens (~$0.002 USD) per image (but this depends on the defined screen size) +- `save_conversation_path`: Path to save the complete conversation history. Useful for debugging. +- `system_prompt_class`: Custom system prompt class. See System Prompt for customization options. + + + Vision capabilities are recommended for better web interaction understanding, + but can be disabled to reduce costs or when using models without vision + support. + + +## (Reuse) Browser Configuration + +You can configure how the agent interacts with the browser. To see more `Browser` options refer to the Browser Settings documentation. + +### Reuse Existing Browser + +`browser`: A Browser Use Browser instance. When provided, the agent will reuse this browser instance and automatically create new contexts for each `run()`. + +```python +from browser_use import Agent, Browser +from browser_use.browser.context import BrowserContext + +# Reuse existing browser +browser = Browser() +agent = Agent( + task=task1, + llm=llm, + browser=browser # Browser instance will be reused +) + +await agent.run() + +# Manually close the browser +await browser.close() +``` + + + Remember: in this scenario the `Browser` will not be closed automatically. 
+ + +### Reuse Existing Browser Context + +`browser_context`: A Playwright browser context. Useful for maintaining persistent sessions. See Persistent Browser for more details. + +```python +from browser_use import Agent, Browser +from playwright.async_api import BrowserContext + +# Use specific browser context (preferred method) +async with await browser.new_context() as context: + agent = Agent( + task=task2, + llm=llm, + browser_context=context # Use persistent context + ) + + # Run the agent + await agent.run() + + # Pass the context to the next agent + next_agent = Agent( + task=task2, + llm=llm, + browser_context=context + ) + + ... + +await browser.close() +``` + +For more information about how browser context works, refer to the [Playwright +documentation](https://playwright.dev/docs/api/class-browsercontext). + + + You can reuse the same context for multiple agents. If you do nothing, the + browser will be automatically created and closed on `run()` completion. + + +## Running the Agent + +The agent is executed using the async `run()` method: + +- `max_steps` (default: `100`) + Maximum number of steps the agent can take during execution. This prevents infinite loops and helps control execution time. + +## Agent History + +The method returns an `AgentHistoryList` object containing the complete execution history. This history is invaluable for debugging, analysis, and creating reproducible scripts. 
+ +```python +# Example of accessing history +history = await agent.run() + +# Access (some) useful information +history.urls() # List of visited URLs +history.screenshots() # List of screenshot paths +history.action_names() # Names of executed actions +history.extracted_content() # Content extracted during execution +history.errors() # Any errors that occurred +history.model_actions() # All actions with their parameters +``` + +The `AgentHistoryList` provides many helper methods to analyze the execution: + +- `final_result()`: Get the final extracted content +- `is_done()`: Check if the agent completed successfully +- `has_errors()`: Check if any errors occurred +- `model_thoughts()`: Get the agent's reasoning process +- `action_results()`: Get results of all actions + + + For a complete list of helper methods and detailed history analysis + capabilities, refer to the [AgentHistoryList source + code](https://github.com/browser-use/browser-use/blob/main/browser_use/agent/views.py#L111). + + +## Run initial actions without LLM +With [this example](https://github.com/browser-use/browser-use/blob/main/examples/features/initial_actions.py) you can run initial actions without the LLM. +Specify the action as a dictionary where the key is the action name and the value is the action parameters. You can find all our actions in the [Controller](https://github.com/browser-use/browser-use/blob/main/browser_use/controller/service.py) source code. 
+```python + +initial_actions = [ + {'open_tab': {'url': 'https://www.google.com'}}, + {'open_tab': {'url': 'https://en.wikipedia.org/wiki/Randomness'}}, + {'scroll_down': {'amount': 1000}}, +] +agent = Agent( + task='What theories are displayed on the page?', + initial_actions=initial_actions, + llm=llm, +) +``` + +## Run with planner model + +You can configure the agent to use a separate planner model for high-level task planning: + +```python +from langchain_openai import ChatOpenAI + +# Initialize models +llm = ChatOpenAI(model='gpt-4o') +planner_llm = ChatOpenAI(model='o3-mini') + +agent = Agent( + task="your task", + llm=llm, + planner_llm=planner_llm, # Separate model for planning + use_vision_for_planner=False, # Disable vision for planner + planner_interval=4 # Plan every 4 steps +) +``` + +### Planner Parameters + +- `planner_llm`: A LangChain chat model instance used for high-level task planning. Can be a smaller/cheaper model than the main LLM. +- `use_vision_for_planner`: Enable/disable vision capabilities for the planner model. Defaults to `True`. +- `planner_interval`: Number of steps between planning phases. Defaults to `1`. + +Using a separate planner model can help: +- Reduce costs by using a smaller model for high-level planning +- Improve task decomposition and strategic thinking +- Better handle complex, multi-step tasks + + + The planner model is optional. If not specified, the agent will not use the planner model. + diff --git a/docs/customize/browser-settings.mdx b/docs/customize/browser-settings.mdx new file mode 100644 index 0000000000000000000000000000000000000000..41995f9e1eb0c9b3de9abd8d99e65d7e18da53a6 --- /dev/null +++ b/docs/customize/browser-settings.mdx @@ -0,0 +1,179 @@ +--- +title: "Browser Settings" +description: "Configure browser behavior and context settings" +icon: "globe" +--- + +Browser Use allows you to customize the browser's behavior through two main configuration classes: `BrowserConfig` and `BrowserContextConfig`. 
These settings control everything from headless mode to proxy settings and page load behavior. + + + We are currently working on improving how browser contexts are managed. The + system will soon transition to a "1 agent, 1 browser, 1 context" model for + better stability and developer experience. + + +# Browser Configuration + +The `BrowserConfig` class controls the core browser behavior and connection settings. + +```python +from browser_use import BrowserConfig + +# Basic configuration +config = BrowserConfig( + headless=False, + disable_security=True +) + +browser = Browser(config=config) + +agent = Agent( + browser=browser, + # ... +) +``` + +## Core Settings + +- **headless** (default: `False`) + Runs the browser without a visible UI. Note that some websites may detect headless mode. + +- **disable_security** (default: `True`) + Disables browser security features. While this can fix certain functionality issues (like cross-site iFrames), it should be used cautiously, especially when visiting untrusted websites. + +### Additional Settings + +- **extra_chromium_args** (default: `[]`) + Additional arguments are passed to the browser at launch. See the [full list of available arguments](https://github.com/browser-use/browser-use/blob/main/browser_use/browser/browser.py#L180). + +- **proxy** (default: `None`) + Standard Playwright proxy settings for using external proxy services. + +- **new_context_config** (default: `BrowserContextConfig()`) + Default settings for new browser contexts. See Context Configuration below. + + + For web scraping tasks on sites that restrict automated access, we recommend + using external browser or proxy providers for better reliability. + + +## Alternative Initialization + +These settings allow you to connect to external browser providers or use a local Chrome instance. + +### External Browser Provider (wss) + +Connect to cloud-based browser services for enhanced reliability and proxy capabilities. 
+ +```python +config = BrowserConfig( + wss_url="wss://your-browser-provider.com/ws" +) +``` + +- **wss_url** (default: `None`) + WebSocket URL for connecting to external browser providers (e.g., anchorbrowser.com, steel.dev, browserbase.com, browserless.io). + + + This overrides local browser settings and uses the provider's configuration. + Refer to their documentation for settings. + + +### External Browser Provider (cdp) + +Connect to cloud or local Chrome instances using Chrome DevTools Protocol (CDP) for use with tools like `headless-shell` or `browserless`. + +```python +config = BrowserConfig( + cdp_url="http://localhost:9222" +) +``` + +- **cdp_url** (default: `None`) + URL for connecting to a Chrome instance via CDP. Commonly used for debugging or connecting to locally running Chrome instances. + +### Local Chrome Instance (binary) + +Connect to your existing Chrome installation to access saved states and cookies. + +```python +config = BrowserConfig( + chrome_instance_path="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" +) +``` + +- **chrome_instance_path** (default: `None`) + Path to connect to an existing Chrome installation. Particularly useful for workflows requiring existing login states or browser preferences. + +This will overwrite other browser settings. + +# Context Configuration + +The `BrowserContextConfig` class controls settings for individual browser contexts. 
+ +```python +from browser_use.browser.context import BrowserContextConfig + +config = BrowserContextConfig( + cookies_file="path/to/cookies.json", + wait_for_network_idle_page_load_time=3.0, + browser_window_size={'width': 1280, 'height': 1100}, + locale='en-US', + user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36', + highlight_elements=True, + viewport_expansion=500, + allowed_domains=['google.com', 'wikipedia.org'], +) + +browser = Browser() +context = BrowserContext(browser=browser, config=config) + + +async def run_search(): + agent = Agent( + browser_context=context, + task='Your task', + llm=llm) +``` + +## Configuration Options + +### Page Load Settings + +- **minimum_wait_page_load_time** (default: `0.5`) + Minimum time to wait before capturing page state for LLM input. + +- **wait_for_network_idle_page_load_time** (default: `1.0`) + Time to wait for network activity to cease. Increase to 3-5s for slower websites. This tracks essential content loading, not dynamic elements like videos. + +- **maximum_wait_page_load_time** (default: `5.0`) + Maximum time to wait for page load before proceeding. + +### Display Settings + +- **browser_window_size** (default: `{'width': 1280, 'height': 1100}`) + Browser window dimensions. The default size is optimized for general use cases and interaction with common UI elements like cookie banners. + +- **locale** (default: `None`) + Specify user locale, for example en-GB, de-DE, etc. Locale will affect the navigator. Language value, Accept-Language request header value as well as number and date formatting rules. If not provided, defaults to the system default locale. + +- **highlight_elements** (default: `True`) + Highlight interactive elements on the screen with colorful bounding boxes. + +- **viewport_expansion** (default: `500`) + Viewport expansion in pixels. 
With this you can control how much of the page is included in the context of the LLM. If set to -1, all elements from the entire page will be included (this leads to high token usage). If set to 0, only the elements which are visible in the viewport will be included. + The default is 500 pixels, which means that we include a little bit more than the visible viewport inside the context. + +### Restrict URLs + +- **allowed_domains** (default: `None`) + List of allowed domains that the agent can access. If None, all domains are allowed. + Example: ['google.com', 'wikipedia.org'] - Here the agent will only be able to access google and wikipedia. + +### Debug and Recording + +- **save_recording_path** (default: `None`) + Directory path for saving video recordings. + +- **trace_path** (default: `None`) + Directory path for saving trace files. Files are automatically named as `{trace_path}/{context_id}.zip`. diff --git a/docs/customize/custom-functions.mdx b/docs/customize/custom-functions.mdx new file mode 100644 index 0000000000000000000000000000000000000000..5e3ceb95c263d8714ad095bc278c7faae691e3a0 --- /dev/null +++ b/docs/customize/custom-functions.mdx @@ -0,0 +1,128 @@ +--- +title: "Custom Functions" +description: "Extend default agent and write custom function calls" +icon: "function" +--- + +## Basic Function Registration + +Functions can be either `sync` or `async`. Keep them focused and single-purpose. + +```python +from browser_use import Controller, ActionResult +# Initialize the controller +controller = Controller() + +@controller.action('Ask user for information') +def ask_human(question: str) -> str: + answer = input(f'\n{question}\nInput: ') + return ActionResult(extracted_content=answer) +``` + + + Basic `Controller` has all basic functionality you might need to interact with + the browser already implemented. + + +```python +# ...
then pass controller to the agent +agent = Agent( + task=task, + llm=llm, + controller=controller +) +``` + + + Keep the function name and description short and concise. The Agent uses the + function solely based on the name and description. The stringified output of + the action is passed to the Agent. + + +## Browser-Aware Functions + +For actions that need browser access, simply add the `browser` parameter inside the function parameters: + +```python +from browser_use import Browser, Controller, ActionResult + +controller = Controller() +@controller.action('Open website') +async def open_website(url: str, browser: Browser): + page = browser.get_current_page() + await page.goto(url) + return ActionResult(extracted_content='Website opened') +``` + +## Structured Parameters with Pydantic + +For complex actions, you can define parameter schemas using Pydantic models: + +```python +from pydantic import BaseModel +from typing import Optional +from browser_use import Controller, ActionResult, Browser + +controller = Controller() + +class JobDetails(BaseModel): + title: str + company: str + job_link: str + salary: Optional[str] = None + +@controller.action( + 'Save job details which you found on page', + param_model=JobDetails +) +async def save_job(params: JobDetails, browser: Browser): + print(f"Saving job: {params.title} at {params.company}") + + # Access browser if needed + page = browser.get_current_page() + await page.goto(params.job_link) +``` + +## Using Custom Actions with multiple agents + +You can use the same controller for multiple agents. + +```python +controller = Controller() + +# ...
register actions to the controller + +agent = Agent( + task="Go to website X and find the latest news", + llm=llm, + controller=controller +) + +# Run the agent +await agent.run() + +agent2 = Agent( + task="Go to website Y and find the latest news", + llm=llm, + controller=controller +) + +await agent2.run() +``` + + + The controller is stateless and can be used to register multiple actions and + multiple agents. + + + + +## Exclude functions +If you want fewer actions to be used by the agent, you can exclude them from the controller. +```python +controller = Controller(exclude_actions=['open_tab', 'search_google']) +``` + + +For more examples like file upload or notifications, visit [examples/custom-functions](https://github.com/browser-use/browser-use/tree/main/examples/custom-functions). + diff --git a/docs/customize/output-format.mdx b/docs/customize/output-format.mdx new file mode 100644 index 0000000000000000000000000000000000000000..d893a7513b9ae0bf1f1957e84d169512df22b350 --- /dev/null +++ b/docs/customize/output-format.mdx @@ -0,0 +1,50 @@ +--- +title: "Output Format" +description: "The default is text. But you can define a structured output format to make post-processing easier." +icon: "code" +--- + +## Custom output format +With [this example](https://github.com/browser-use/browser-use/blob/main/examples/features/custom_output.py) you can define what output format the agent should return to you.
+ +```python +from pydantic import BaseModel +# Define the output format as a Pydantic model +class Post(BaseModel): + post_title: str + post_url: str + num_comments: int + hours_since_post: int + + +class Posts(BaseModel): + posts: List[Post] + + +controller = Controller(output_model=Posts) + + +async def main(): + task = 'Go to hackernews show hn and give me the first 5 posts' + model = ChatOpenAI(model='gpt-4o') + agent = Agent(task=task, llm=model, controller=controller) + + history = await agent.run() + + result = history.final_result() + if result: + parsed: Posts = Posts.model_validate_json(result) + + for post in parsed.posts: + print('\n--------------------------------') + print(f'Title: {post.post_title}') + print(f'URL: {post.post_url}') + print(f'Comments: {post.num_comments}') + print(f'Hours since post: {post.hours_since_post}') + else: + print('No result') + + +if __name__ == '__main__': + asyncio.run(main()) +``` \ No newline at end of file diff --git a/docs/customize/real-browser.mdx b/docs/customize/real-browser.mdx new file mode 100644 index 0000000000000000000000000000000000000000..aafb92f9129d80908c33161403eae822d9a88d0b --- /dev/null +++ b/docs/customize/real-browser.mdx @@ -0,0 +1,53 @@ +--- +title: "Connect to your Browser" +description: "With this you can connect to your real browser, where you are logged in with all your accounts." +icon: "computer" +--- + +## Overview + +You can connect the agent to your real Chrome browser instance, allowing it to access your existing browser profile with all your logged-in accounts and settings. This is particularly useful when you want the agent to interact with services where you're already authenticated. + + + First make sure to close all running Chrome instances. 
+ + +## Basic Configuration + +To connect to your real Chrome browser, you'll need to specify the path to your Chrome executable when creating the Browser instance: + +```python +from browser_use import Agent, Browser, BrowserConfig +from langchain_openai import ChatOpenAI +import asyncio +# Configure the browser to connect to your Chrome instance +browser = Browser( + config=BrowserConfig( + # Specify the path to your Chrome executable + chrome_instance_path='/Applications/Google Chrome.app/Contents/MacOS/Google Chrome', # macOS path + # For Windows, typically: 'C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe' + # For Linux, typically: '/usr/bin/google-chrome' + ) +) + +# Create the agent with your configured browser +agent = Agent( + task="Your task here", + llm=ChatOpenAI(model='gpt-4o'), + browser=browser, +) + +async def main(): + await agent.run() + + input('Press Enter to close the browser...') + await browser.close() + +if __name__ == '__main__': + asyncio.run(main()) +``` + + + + When using your real browser, the agent will have access to all your logged-in sessions. Make sure to ALWAYS review the task you're giving to the agent and ensure it aligns with your security requirements! + diff --git a/docs/customize/sensitive-data.mdx b/docs/customize/sensitive-data.mdx new file mode 100644 index 0000000000000000000000000000000000000000..4130eff4d110498dcf733bdc651d20aac04f21e7 --- /dev/null +++ b/docs/customize/sensitive-data.mdx @@ -0,0 +1,50 @@ +--- +title: "Sensitive Data" +description: "Handle sensitive information securely by preventing the model from seeing actual passwords." +icon: "shield" +--- + +## Handling Sensitive Data + +When working with sensitive information like passwords, you can use the `sensitive_data` parameter to prevent the model from seeing the actual values while still allowing it to reference them in its actions. 
+ +Here's an example of how to use sensitive data: + +```python +from dotenv import load_dotenv +from langchain_openai import ChatOpenAI +from browser_use import Agent + +load_dotenv() + +# Initialize the model +llm = ChatOpenAI( + model='gpt-4o', + temperature=0.0, +) + +# Define sensitive data +# The model will only see the keys (x_name, x_password) but never the actual values +sensitive_data = {'x_name': 'magnus', 'x_password': '12345678'} + +# Use the placeholder names in your task description +task = 'go to x.com and login with x_name and x_password then write a post about the meaning of life' + +# Pass the sensitive data to the agent +agent = Agent(task=task, llm=llm, sensitive_data=sensitive_data) + +async def main(): + await agent.run() + +if __name__ == '__main__': + asyncio.run(main()) +``` + +In this example: +1. The model only sees `x_name` and `x_password` as placeholders. +2. When the model wants to use your password it outputs x_password - and we replace it with the actual value. +3. When your password is visible on the current page, we replace it in the LLM input - so that the model never has it in its state. + +Warning: Vision models still see the image of the page - where the sensitive data might be visible. + +This approach ensures that sensitive information remains secure while still allowing the agent to perform tasks that require authentication. \ No newline at end of file diff --git a/docs/customize/supported-models.mdx b/docs/customize/supported-models.mdx new file mode 100644 index 0000000000000000000000000000000000000000..1798cfbfbc3dfd3828cf128c02d48c7cf98c448c --- /dev/null +++ b/docs/customize/supported-models.mdx @@ -0,0 +1,223 @@ +--- +title: "Supported Models" +description: "Guide to using different LangChain chat models with Browser Use" +icon: "robot" +--- + +## Overview + +Browser Use supports various LangChain chat models. Here's how to configure and use the most popular ones.
The full list is available in the [LangChain documentation](https://python.langchain.com/docs/integrations/chat/). + +## Model Recommendations + +We have yet to test performance across all models. Currently, we achieve the best results using GPT-4o with 89% accuracy on the [WebVoyager Dataset](https://browser-use.com/posts/sota-technical-report). DeepSeek-V3 is 30 times cheaper than GPT-4o. Gemini-2.0-exp is also gaining popularity in the community because it is currently free. +We also support local models, like Qwen 2.5, but be aware that small models often return the wrong output structure, which leads to parsing errors. We believe that local models will improve significantly this year. + + + + All models require their respective API keys. Make sure to set them in your + environment variables before running the agent. + + +## Supported Models + +All LangChain chat models that support tool-calling are available. We will document the most popular ones here. + +### OpenAI + +OpenAI's GPT-4o models are recommended for best performance.
+ +```python +from langchain_openai import ChatOpenAI +from browser_use import Agent + +# Initialize the model +llm = ChatOpenAI( + model="gpt-4o", + temperature=0.0, +) + +# Create agent with the model +agent = Agent( + task="Your task here", + llm=llm +) +``` + +Required environment variables: + +```bash .env +OPENAI_API_KEY= +``` + +### Anthropic + + +```python +from langchain_anthropic import ChatAnthropic +from browser_use import Agent + +# Initialize the model +llm = ChatAnthropic( + model_name="claude-3-5-sonnet-20240620", + temperature=0.0, + timeout=100, # Increase for complex tasks +) + +# Create agent with the model +agent = Agent( + task="Your task here", + llm=llm +) +``` + +And add the variable: + +```bash .env +ANTHROPIC_API_KEY= +``` + +### Azure OpenAI + +```python +from langchain_openai import AzureChatOpenAI +from browser_use import Agent +from pydantic import SecretStr +import os + +# Initialize the model +llm = AzureChatOpenAI( + model="gpt-4o", + api_version='2024-10-21', + azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT', ''), + api_key=SecretStr(os.getenv('AZURE_OPENAI_KEY', '')), +) + +# Create agent with the model +agent = Agent( + task="Your task here", + llm=llm +) +``` + +Required environment variables: + +```bash .env +AZURE_OPENAI_ENDPOINT=https://your-endpoint.openai.azure.com/ +AZURE_OPENAI_KEY= +``` + + +### Gemini + +```python +from langchain_google_genai import ChatGoogleGenerativeAI +from browser_use import Agent +from pydantic import SecretStr +import os +from dotenv import load_dotenv +load_dotenv() + +api_key = os.getenv("GEMINI_API_KEY") + +# Initialize the model +llm = ChatGoogleGenerativeAI(model='gemini-2.0-flash-exp', api_key=SecretStr(os.getenv('GEMINI_API_KEY'))) + +# Create agent with the model +agent = Agent( + task="Your task here", + llm=llm +) +``` + +Required environment variables: + +```bash .env +GEMINI_API_KEY= +``` + + +### DeepSeek-V3 +The community likes DeepSeek-V3 for its low price, no rate limits, 
open-source nature, and good performance. +The example is available [here](https://github.com/browser-use/browser-use/blob/main/examples/models/deepseek.py). + +```python +from langchain_openai import ChatOpenAI +from browser_use import Agent +from pydantic import SecretStr + + +# Initialize the model +llm=ChatOpenAI(base_url='https://api.deepseek.com/v1', model='deepseek-chat', api_key=SecretStr(api_key)) + +# Create agent with the model +agent = Agent( + task="Your task here", + llm=llm, + use_vision=False +) +``` + +Required environment variables: + +```bash .env +DEEPSEEK_API_KEY= +``` + +### DeepSeek-R1 +We support DeepSeek-R1. It's not fully tested yet; more functionality will be added over time, e.g. the output of its reasoning content. +The example is available [here](https://github.com/browser-use/browser-use/blob/main/examples/models/deepseek-r1.py). +It does not support vision. The model is open-source so you could also use it with Ollama, but we have not tested it. +```python +from langchain_openai import ChatOpenAI +from browser_use import Agent +from pydantic import SecretStr + + +# Initialize the model +llm=ChatOpenAI(base_url='https://api.deepseek.com/v1', model='deepseek-reasoner', api_key=SecretStr(api_key)) + +# Create agent with the model +agent = Agent( + task="Your task here", + llm=llm, + use_vision=False +) +``` + +Required environment variables: + +```bash .env +DEEPSEEK_API_KEY= +``` + +### Ollama +Many users asked for local models. Here they are. + +1. Download Ollama from [here](https://ollama.ai/download) +2. Run `ollama pull model_name`. Pick a model which supports tool-calling from [here](https://ollama.com/search?c=tools) +3.
Run `ollama start` + +```python +from langchain_ollama import ChatOllama +from browser_use import Agent +from pydantic import SecretStr + + +# Initialize the model +llm=ChatOllama(model="qwen2.5", num_ctx=32000) + +# Create agent with the model +agent = Agent( + task="Your task here", + llm=llm +) +``` + +Required environment variables: None! + +## Coming soon +(We are working on it) +- Groq +- Github +- Fine-tuned models diff --git a/docs/customize/system-prompt.mdx b/docs/customize/system-prompt.mdx new file mode 100644 index 0000000000000000000000000000000000000000..ef7b5343a021eb928aa32fc53a614cb1c4167a87 --- /dev/null +++ b/docs/customize/system-prompt.mdx @@ -0,0 +1,66 @@ +--- +title: "System Prompt" +description: "Customize the system prompt to control agent behavior and capabilities" +icon: "message" +--- + +## Overview + +You can customize the system prompt in two ways: + +1. Extend the default system prompt with additional instructions +2. Override the default system prompt entirely + + + Custom system prompts allow you to modify the agent's behavior at a + fundamental level. Use this feature carefully as it can significantly impact + the agent's performance and reliability. + + +### Extend System Prompt (recommended) + +To add additional instructions to the default system prompt: + +```python +from browser_use import Agent +from langchain_openai import ChatOpenAI + +# Add your custom instructions +extend_system_message = """ +REMEMBER the most important RULE: +ALWAYS open first a new tab and go first to url wikipedia.com no matter the task!!! +""" + +# Create agent with extended system prompt +agent = Agent( + task="Your task here", + llm=ChatOpenAI(model='gpt-4'), + extend_system_message=extend_system_message +) +``` + +### Override System Prompt + + + Not recommended! If you must override the [default system + prompt](https://github.com/browser-use/browser-use/blob/main/browser_use/agent/system_prompt.md), + make sure to test the agent yourself. 
+ + +Anyway, to override the default system prompt: + +```python +# Define your complete custom prompt +override_system_message = """ +You are an AI agent that helps users with web browsing tasks. + +[Your complete custom instructions here...] +""" + +# Create agent with custom system prompt +agent = Agent( + task="Your task here", + llm=ChatOpenAI(model='gpt-4'), + override_system_message=override_system_message +) +``` diff --git a/docs/development.mdx b/docs/development.mdx new file mode 100644 index 0000000000000000000000000000000000000000..c2f2ccdae147db3df4b46d491b3fc6d4fcb2d2f5 --- /dev/null +++ b/docs/development.mdx @@ -0,0 +1,106 @@ +--- +title: 'Development' +description: 'Preview changes locally to update your docs' +--- + + + **Prerequisite**: Please install Node.js (version 19 or higher) before proceeding. + + +Follow these steps to install and run Mintlify on your operating system: + +**Step 1**: Install Mintlify: + + + + ```bash npm + npm i -g mintlify + ``` + +```bash yarn +yarn global add mintlify +``` + + + +**Step 2**: Navigate to the docs directory (where the `mint.json` file is located) and execute the following command: + +```bash +mintlify dev +``` + +A local preview of your documentation will be available at `http://localhost:3000`. + +### Custom Ports + +By default, Mintlify uses port 3000. You can customize the port Mintlify runs on by using the `--port` flag. To run Mintlify on port 3333, for instance, use this command: + +```bash +mintlify dev --port 3333 +``` + +If you attempt to run Mintlify on a port that's already in use, it will use the next available port: + +```md +Port 3000 is already in use. Trying 3001 instead. +``` + +## Mintlify Versions + +Please note that each CLI release is associated with a specific version of Mintlify. 
If your local website doesn't align with the production version, please update the CLI: + + + +```bash npm +npm i -g mintlify@latest +``` + +```bash yarn +yarn global upgrade mintlify +``` + + + +## Validating Links + +The CLI can assist with validating reference links made in your documentation. To identify any broken links, use the following command: + +```bash +mintlify broken-links +``` + +## Deployment + + + Unlimited editors available under the [Pro + Plan](https://mintlify.com/pricing) and above. + + +If the deployment is successful, you should see the following: + + + + + +## Code Formatting + +We suggest using extensions on your IDE to recognize and format MDX. If you're a VSCode user, consider the [MDX VSCode extension](https://marketplace.visualstudio.com/items?itemName=unifiedjs.vscode-mdx) for syntax highlighting, and [Prettier](https://marketplace.visualstudio.com/items?itemName=esbenp.prettier-vscode) for code formatting. + +## Troubleshooting + + + + + This may be due to an outdated version of node. Try the following: + 1. Remove the currently-installed version of mintlify: `npm remove -g mintlify` + 2. Upgrade to Node v19 or higher. + 3. Reinstall mintlify: `npm install -g mintlify` + + + + + Solution: Go to the root of your device and delete the \~/.mintlify folder. Afterwards, run `mintlify dev` again. + + + +Curious about what changed in the CLI version? [Check out the CLI changelog.](https://www.npmjs.com/package/mintlify?activeTab=versions) diff --git a/docs/development/contribution-guide.mdx b/docs/development/contribution-guide.mdx new file mode 100644 index 0000000000000000000000000000000000000000..37b551953577b1168c41086f3a381f4214fec9c8 --- /dev/null +++ b/docs/development/contribution-guide.mdx @@ -0,0 +1,7 @@ +--- +title: "Contribution Guide" +description: "Learn how to contribute to Browser Use" +icon: "code-pull-request" +--- + +Working on it! 
diff --git a/docs/development/local-setup.mdx b/docs/development/local-setup.mdx new file mode 100644 index 0000000000000000000000000000000000000000..da2bc3f7118fd1a7f3b468a7d63c65fa4d8d96af --- /dev/null +++ b/docs/development/local-setup.mdx @@ -0,0 +1,82 @@ +--- +title: "Local Setup" +description: "Set up Browser Use development environment locally" +icon: "laptop-code" +--- + +## Prerequisites + +Browser Use requires Python 3.11 or higher. We recommend using [uv](https://docs.astral.sh/uv/) for Python environment management. + +## Clone the Repository + +First, clone the Browser Use repository: + +```bash +git clone https://github.com/browser-use/browser-use +cd browser-use +``` + +## Environment Setup + +1. Create a virtual environment: + +```bash +uv venv --python 3.11 +``` + +2. Install dependencies: + +```bash +# Install the package in editable mode with all development dependencies +uv pip install -e ".[dev]" +``` + + + The `-e` flag installs the package in "editable" mode, which means your local code changes + will be reflected immediately without requiring reinstallation. The `[dev]` part installs + additional dependencies needed for development. + + +## Configuration + +Set up your environment variables: + +```bash +# Copy the example environment file +cp .env.example .env +``` + +Or manually create a `.env` file with your API keys: + +```bash .env +OPENAI_API_KEY= +ANTHROPIC_API_KEY= +``` + + + You can use any LLM model supported by LangChain. See [LangChain + Models](/customize/supported-models) for available options and their specific + API key requirements. + + +## Development + +After setup, you can: + +- Run tests with `pytest` +- Build the package with `hatch build` +- Try the examples in the `examples/` directory + +## Getting Help + +If you run into any issues: + +1. Check our [GitHub Issues](https://github.com/browser-use/browser-use/issues) +2. 
Join our [Discord community](https://link.browser-use.com/discord) for support + + + We welcome contributions! See our [Contribution + Guide](/development/contribution-guide) for guidelines on how to help improve + Browser Use. + diff --git a/docs/development/observability.mdx b/docs/development/observability.mdx new file mode 100644 index 0000000000000000000000000000000000000000..955e0388eb2669a85fc5f0526ba96cedca55d82d --- /dev/null +++ b/docs/development/observability.mdx @@ -0,0 +1,66 @@ +--- +title: "Observability" +description: "Trace Browser Use's agent execution steps and browser sessions" +icon: "eye" +--- + +## Overview + +Browser Use has a native integration with [Laminar](https://lmnr.ai) - open-source platform for tracing, evals and labeling of AI agents. +Read more about Laminar in the [Laminar docs](https://docs.lmnr.ai). + + + Laminar excels at tracing browser agents by providing unified visibility into both browser session recordings and agent execution steps. + + +## Setup + +To setup Laminar, you need to install the `lmnr` package and set the `LMNR_PROJECT_API_KEY` environment variable. + +To get your project API key, you can either: +- Register on [Laminar Cloud](https://lmnr.ai) and get the key from your project settings +- Or spin up a local Laminar instance and get the key from the settings page + +```bash +pip install 'lmnr[all]' +export LMNR_PROJECT_API_KEY= +``` + +## Usage + +Then, you simply initialize the Laminar at the top of your project and both Browser Use and session recordings will be automatically traced. 
+ +```python {5-8} +from langchain_openai import ChatOpenAI +from browser_use import Agent +import asyncio + +from lmnr import Laminar +# this line auto-instruments Browser Use and any browser you use (local or remote) +Laminar.initialize(project_api_key="...") # you can also pass project api key here + +async def main(): + agent = Agent( + task="open google, search Laminar AI", + llm=ChatOpenAI(model="gpt-4o-mini"), + ) + result = await agent.run() + print(result) + +asyncio.run(main()) +``` + +## Viewing Traces + +You can view traces in the Laminar UI by going to the traces tab in your project. +When you select a trace, you can see both the browser session recording and the agent execution steps. + +Timeline of the browser session is synced with the agent execution steps, timeline highlights indicate the agent's current step synced with the browser session. +In the trace view, you can also see the agent's current step, the tool it's using, and the tool's input and output. Tools are highlighted in the timeline with a yellow color. + +Laminar + + +## Laminar + +To learn more about tracing and evaluating your browser agents, check out the [Laminar docs](https://docs.lmnr.ai). \ No newline at end of file diff --git a/docs/development/roadmap.mdx b/docs/development/roadmap.mdx new file mode 100644 index 0000000000000000000000000000000000000000..34f05f5a407acbb052acf92966e1fc586acfb634 --- /dev/null +++ b/docs/development/roadmap.mdx @@ -0,0 +1,7 @@ +--- +title: "Roadmap" +description: "Future plans and upcoming features for Browser Use" +icon: "road" +--- + +Big things coming soon! 
diff --git a/docs/development/telemetry.mdx b/docs/development/telemetry.mdx new file mode 100644 index 0000000000000000000000000000000000000000..fe4f7cb54c5c3095ebeccc77b6161a78bf3d96f4 --- /dev/null +++ b/docs/development/telemetry.mdx @@ -0,0 +1,39 @@ +--- +title: "Telemetry" +description: "Understanding Browser Use's telemetry and privacy settings" +icon: "chart-mixed" +--- + +## Overview + +Browser Use collects anonymous usage data to help us understand how the library is being used and to improve the user experience. It also helps us fix bugs faster and prioritize feature development. + +## Data Collection + +We use [PostHog](https://posthog.com) for telemetry collection. The data is completely anonymized and contains no personally identifiable information. + + + We never collect personal information, credentials, or specific content from + your browser automation tasks. + + +## Opting Out + +You can disable telemetry by setting an environment variable: + +```bash .env +ANONYMIZED_TELEMETRY=false +``` + +Or in your Python code: + +```python +import os +os.environ["ANONYMIZED_TELEMETRY"] = "false" +``` + + + Even when enabled, telemetry has zero impact on the library's performance or + functionality. Code is available in [Telemetry + Service](https://github.com/browser-use/browser-use/tree/main/browser_use/telemetry). 
+ diff --git a/docs/favicon.svg b/docs/favicon.svg new file mode 100644 index 0000000000000000000000000000000000000000..59f98742e385191e7338bfb7a5bbd70a24784865 --- /dev/null +++ b/docs/favicon.svg @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/docs/images/browser-use.png b/docs/images/browser-use.png new file mode 100644 index 0000000000000000000000000000000000000000..54685c4f4b3c721f74bbb09cfc54c6645216ae70 Binary files /dev/null and b/docs/images/browser-use.png differ diff --git a/docs/images/checks-passed.png b/docs/images/checks-passed.png new file mode 100644 index 0000000000000000000000000000000000000000..97603943defbd694eacebf9559b61ddc13020f60 --- /dev/null +++ b/docs/images/checks-passed.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93add382731d1e6d443b128bbe1ac747b62d0efa1b8372ee3fcd37a59d86da30 +size 160724 diff --git a/docs/images/laminar.png b/docs/images/laminar.png new file mode 100644 index 0000000000000000000000000000000000000000..8ec7136ea6e0d344b4b9054220c48216011ed813 --- /dev/null +++ b/docs/images/laminar.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3852ced4a5dee49a567fac0afa8e8ee7a843d5b0f0729cb77d0578cde76d0956 +size 979079 diff --git a/docs/introduction.mdx b/docs/introduction.mdx new file mode 100644 index 0000000000000000000000000000000000000000..72d7b463d637a121dd07b1cd7a03c8900b77173c --- /dev/null +++ b/docs/introduction.mdx @@ -0,0 +1,101 @@ +--- +title: "Introduction" +description: "Welcome to Browser Use - We enable AI to control your browser" +icon: "book-open" +--- + +Browser Use + +## Overview + +Browser Use is the easiest way to connect your AI agents with the browser. It makes websites accessible for AI agents by providing a powerful, yet simple interface for browser automation. + + + If you have used Browser Use for your project, feel free to show it off in our + [Discord community](https://link.browser-use.com/discord)! 
+ + +## Getting Started + + + + Get up and running with Browser Use in minutes + + + Configure different LLMs for your agents + + + Learn how to configure and customize your agents + + + Extend functionality with custom actions + + + +## Fancy Demos + +### Writing in Google Docs + +Task: Write a letter in Google Docs to my Papa, thanking him for everything, and save the document as a PDF. + + + + + +### Job Applications + +Task: Read my CV & find ML jobs, save them to a file, and then start applying for them in new tabs. + + +