# CI workflow: discovers every tests/ci/test_*.py file, runs each one as its
# own matrix job, and separately runs the agent task evaluation script.
name: test

permissions:
  actions: read
  contents: write
  pull-requests: write  # Allow writing comments on PRs
  issues: write  # Allow writing comments on issues
  statuses: write  # Allow writing statuses on PRs
  discussions: write

# Cancel in-progress runs when a new commit is pushed to the same branch/PR
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

on:
  push:
    branches:
      - main
      - stable
      - 'releases/**'
    tags:
      - '*'
  pull_request:
  workflow_dispatch:

jobs:
  # Populates the Playwright Chromium cache once so that the jobs which
  # `needs:` this one can restore it instead of downloading the browser.
  setup-chromium:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v6

      # The cache key embeds year + week-of-year, so the cached browser is
      # refreshed at most once per week.
      - name: Get week number for cache key
        id: week
        run: echo "number=$(date +%Y-W%U)" >> "$GITHUB_OUTPUT"

      - name: Cache chromium binaries
        id: cache-chromium
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/ms-playwright
          key: ${{ runner.os }}-${{ runner.arch }}-chromium-${{ steps.week.outputs.number }}
          restore-keys: |
            ${{ runner.os }}-${{ runner.arch }}-chromium-

      - name: Install Chromium if not cached
        if: steps.cache-chromium.outputs.cache-hit != 'true'
        run: uvx playwright install chromium --with-deps --no-shell

  # Builds the test matrix: a JSON array of test module names (path prefix
  # and .py suffix stripped) exposed as the TEST_FILENAMES job output.
  find_tests:
    runs-on: ubuntu-latest
    timeout-minutes: 5  # Prevent hanging
    outputs:
      TEST_FILENAMES: ${{ steps.lsgrep.outputs.TEST_FILENAMES }}
      # ["test_browser", "test_tools", "test_browser_session", "test_tab_management", ...]
    steps:
      - uses: actions/checkout@v4
        with:
          # Force fresh checkout to avoid any caching issues
          fetch-depth: 1

      # https://code.dblock.org/2021/09/03/generating-task-matrix-by-looping-over-repo-files-with-github-actions.html
      - id: lsgrep
        run: |
          echo "🔍 Discovering test files at $(date)"
          echo "Git commit: $(git rev-parse HEAD)"
          echo "Git branch: $(git branch --show-current)"
          echo ""

          TEST_FILENAMES="$(find tests/ci -name 'test_*.py' -type f | sed 's|^tests/ci/||' | sed 's|\.py$||' | jq -R -s -c 'split("\n")[:-1]')"
          echo "TEST_FILENAMES=${TEST_FILENAMES}" >> "$GITHUB_OUTPUT"
          echo "📋 Test matrix: $TEST_FILENAMES"

      - name: Check that at least one test file is found
        env:
          # Passed through the environment instead of being interpolated into
          # the script: the value is JSON containing double quotes.
          TEST_FILENAMES: ${{ steps.lsgrep.outputs.TEST_FILENAMES }}
        run: |
          # When nothing matches, the jq pipeline above emits the literal "[]"
          # rather than an empty string, so both cases must be rejected.
          if [ -z "$TEST_FILENAMES" ] || [ "$TEST_FILENAMES" = "[]" ]; then
            echo "Failed to find any test_*.py files in tests/ci/ folder!" >&2
            exit 1
          fi

  # Runs each discovered test file in its own matrix job.
  tests:
    needs: [setup-chromium, find_tests]
    runs-on: ubuntu-latest
    # NOTE(review): this job-level limit equals the retry step's own
    # timeout_minutes below, and it also has to cover the setup steps.
    timeout-minutes: 4  # Reduced timeout - tests should complete quickly or retry
    env:
      # NOTE(review): evaluate-tasks sets IN_DOCKER to lowercase 'true';
      # confirm the consumer treats both spellings the same.
      IN_DOCKER: 'True'
      ANONYMIZED_TELEMETRY: 'false'
      BROWSER_USE_LOGGING_LEVEL: 'DEBUG'
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
      GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      AZURE_OPENAI_KEY: ${{ secrets.AZURE_OPENAI_KEY }}
      AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
      BROWSER_USE_API_KEY: ${{ secrets.BROWSER_USE_API_KEY }}
      OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
    strategy:
      matrix:
        # Falls back to a sentinel entry when find_tests produced no output,
        # so the first step below can fail with a readable message.
        test_filename: ${{ fromJson(needs.find_tests.outputs.TEST_FILENAMES || '["FAILED_TO_DISCOVER_TESTS"]') }}
        # autodiscovers all the files in tests/ci/test_*.py
        # - test_browser
        # - test_tools
        # - test_browser_session
        # - test_tab_management
        # ... and more
    name: ${{ matrix.test_filename }}
    steps:
      - name: Check that the previous step managed to find some test files for us to run
        env:
          TEST_FILENAME: ${{ matrix.test_filename }}
        run: |
          if [[ "$TEST_FILENAME" == "FAILED_TO_DISCOVER_TESTS" ]]; then
            echo "Failed get list of test files in tests/ci/test_*.py from find_tests job" >&2
            exit 1
          fi

      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
          activate-environment: true

      - name: Cache uv packages and venv
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/uv
            .venv
          key: ${{ runner.os }}-uv-venv-${{ hashFiles('pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-uv-venv-

      - run: uv sync --dev --all-extras

      - name: Get week number for cache key
        id: week
        run: echo "number=$(date +%Y-W%U)" >> "$GITHUB_OUTPUT"

      - name: Cache chromium binaries
        id: cache-chromium
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/ms-playwright
          key: ${{ runner.os }}-${{ runner.arch }}-chromium-${{ steps.week.outputs.number }}
          restore-keys: |
            ${{ runner.os }}-${{ runner.arch }}-chromium-

      - name: Install Chromium browser if not cached
        if: steps.cache-chromium.outputs.cache-hit != 'true'
        run: uvx playwright install chromium --with-deps --no-shell

      - name: Cache browser-use extensions
        uses: actions/cache@v4
        with:
          path: |
            ~/.config/browseruse/extensions
          key: ${{ runner.os }}-browseruse-extensions-${{ hashFiles('browser_use/browser/profile.py') }}
          restore-keys: |
            ${{ runner.os }}-browseruse-extensions-

      # Records whether the matrix entry still maps to a file in this
      # checkout; the test step below is skipped when it does not.
      - name: Check if test file exists
        id: check-file
        env:
          TEST_FILE: tests/ci/${{ matrix.test_filename }}.py
        run: |
          if [ -f "$TEST_FILE" ]; then
            echo "exists=true" >> "$GITHUB_OUTPUT"
            echo "✅ Test file found: $TEST_FILE"
          else
            echo "exists=false" >> "$GITHUB_OUTPUT"
            echo "❌ Test file not found: $TEST_FILE"
            echo "This file may have been renamed or removed. Current test files:"
            find tests/ci -name 'test_*.py' -type f | sed 's|tests/ci/||' | sed 's|\.py$||' | sort
          fi

      # NOTE(review): max_attempts: 1 means a single attempt, so no retry
      # actually happens despite the step name.
      - name: Run test with retry
        if: steps.check-file.outputs.exists == 'true'
        uses: nick-fields/retry@v3
        with:
          timeout_minutes: 4
          max_attempts: 1
          retry_on: error
          command: pytest "tests/ci/${{ matrix.test_filename }}.py"

  # Runs tests/ci/evaluate_tasks.py and exports the PASSED / TOTAL /
  # DETAILED_RESULTS lines it prints as environment variables for later steps.
  evaluate-tasks:
    needs: setup-chromium
    runs-on: ubuntu-latest
    timeout-minutes: 8  # Allow more time for agent eval
    env:
      IN_DOCKER: 'true'
      BROWSER_USE_CLOUD_SYNC: 'false'
      ANONYMIZED_TELEMETRY: 'false'
      BROWSER_USE_LOGGING_LEVEL: 'DEBUG'
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
      GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      BROWSER_USE_API_KEY: ${{ secrets.BROWSER_USE_API_KEY }}
    steps:
      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
          activate-environment: true

      - name: Cache uv packages and venv
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/uv
            .venv
          key: ${{ runner.os }}-uv-venv-${{ hashFiles('pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-uv-venv-

      - run: uv sync --dev --all-extras

      - name: Get week number for cache key
        id: week
        run: echo "number=$(date +%Y-W%U)" >> "$GITHUB_OUTPUT"

      - name: Cache chromium binaries
        id: cache-chromium
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/ms-playwright
          key: ${{ runner.os }}-${{ runner.arch }}-chromium-${{ steps.week.outputs.number }}
          restore-keys: |
            ${{ runner.os }}-${{ runner.arch }}-chromium-

      - name: Install Chromium browser if not cached
        if: steps.cache-chromium.outputs.cache-hit != 'true'
        run: uvx playwright install chromium --with-deps --no-shell

      - name: Cache browser-use extensions
        uses: actions/cache@v4
        with:
          path: |
            ~/.config/browseruse/extensions
          key: ${{ runner.os }}-browseruse-extensions-${{ hashFiles('browser_use/browser/profile.py') }}
          restore-keys: |
            ${{ runner.os }}-browseruse-extensions-

      - name: Run agent tasks evaluation and capture score
        id: eval
        uses: nick-fields/retry@v3
        with:
          timeout_minutes: 4
          max_attempts: 1
          retry_on: error
          command: |
            python tests/ci/evaluate_tasks.py > result.txt
            cat result.txt
            echo "PASSED=$(grep '^PASSED=' result.txt | cut -d= -f2)" >> "$GITHUB_ENV"
            echo "TOTAL=$(grep '^TOTAL=' result.txt | cut -d= -f2)" >> "$GITHUB_ENV"
            echo "DETAILED_RESULTS=$(grep '^DETAILED_RESULTS=' result.txt | cut -d= -f2-)" >> "$GITHUB_ENV"

      - name: Print agent evaluation summary
        run: |
          echo "Agent tasks passed: $PASSED / $TOTAL"

      - name: Write agent evaluation summary to workflow overview
        run: |
          if [ "$PASSED" = "$TOTAL" ]; then
            COLOR="green"
          else
            COLOR="yellow"
          fi
          echo "