---
name: test

permissions:
  actions: read
  contents: write
  pull-requests: write # Allow writing comments on PRs
  issues: write # Allow writing comments on issues
  statuses: write # Allow writing statuses on PRs
  discussions: write

# Cancel in-progress runs when a new commit is pushed to the same branch/PR
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

on:
  push:
    branches:
      - main
      - stable
      - 'releases/**'
    tags:
      - '*'
  pull_request:
  workflow_dispatch:

jobs:
  # Warms the weekly Chromium cache so the matrix jobs below can restore it.
  setup-chromium:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v6
      - name: Get week number for cache key
        id: week
        run: echo "number=$(date +%Y-W%U)" >> $GITHUB_OUTPUT
      - name: Cache chromium binaries
        id: cache-chromium
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/ms-playwright
          key: ${{ runner.os }}-${{ runner.arch }}-chromium-${{ steps.week.outputs.number }}
          restore-keys: |
            ${{ runner.os }}-${{ runner.arch }}-chromium-
      - name: Install Chromium if not cached
        if: steps.cache-chromium.outputs.cache-hit != 'true'
        run: uvx playwright install chromium --with-deps --no-shell

  # Discovers tests/ci/test_*.py and publishes the list as a JSON array
  # consumed by the `tests` job's matrix.
  find_tests:
    runs-on: ubuntu-latest
    timeout-minutes: 5 # Prevent hanging
    outputs:
      TEST_FILENAMES: ${{ steps.lsgrep.outputs.TEST_FILENAMES }} # ["test_browser", "test_tools", "test_browser_session", "test_tab_management", ...]
    steps:
      - uses: actions/checkout@v4
        with:
          # Force fresh checkout to avoid any caching issues
          fetch-depth: 1
      - id: lsgrep
        run: |
          echo "🔍 Discovering test files at $(date)"
          echo "Git commit: $(git rev-parse HEAD)"
          echo "Git branch: $(git branch --show-current)"
          echo ""
          TEST_FILENAMES="$(find tests/ci -name 'test_*.py' -type f | sed 's|^tests/ci/||' | sed 's|\.py$||' | jq -R -s -c 'split("\n")[:-1]')"
          echo "TEST_FILENAMES=${TEST_FILENAMES}" >> "$GITHUB_OUTPUT"
          echo "📋 Test matrix: $TEST_FILENAMES"
          # https://code.dblock.org/2021/09/03/generating-task-matrix-by-looping-over-repo-files-with-github-actions.html
      - name: Check that at least one test file is found
        run: |
          if [ -z "${{ steps.lsgrep.outputs.TEST_FILENAMES }}" ]; then
            echo "Failed to find any test_*.py files in tests/ci/ folder!" > /dev/stderr
            exit 1
          fi

  # One matrix job per discovered test file.
  tests:
    needs: [setup-chromium, find_tests]
    runs-on: ubuntu-latest
    timeout-minutes: 4 # Reduced timeout - tests should complete quickly or retry
    env:
      IN_DOCKER: 'True'
      ANONYMIZED_TELEMETRY: 'false'
      BROWSER_USE_LOGGING_LEVEL: 'DEBUG'
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
      GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      AZURE_OPENAI_KEY: ${{ secrets.AZURE_OPENAI_KEY }}
      AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
      BROWSER_USE_API_KEY: ${{ secrets.BROWSER_USE_API_KEY }}
      OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
    strategy:
      matrix:
        # Falls back to a sentinel entry when discovery produced no output,
        # so the failure is reported loudly instead of the job silently vanishing.
        test_filename: ${{ fromJson(needs.find_tests.outputs.TEST_FILENAMES || '["FAILED_TO_DISCOVER_TESTS"]') }}
        # autodiscovers all the files in tests/ci/test_*.py
        # - test_browser
        # - test_tools
        # - test_browser_session
        # - test_tab_management
        # ... and more
    name: ${{ matrix.test_filename }}
    steps:
      - name: Check that the previous step managed to find some test files for us to run
        run: |
          if [[ "${{ matrix.test_filename }}" == "FAILED_TO_DISCOVER_TESTS" ]]; then
            echo "Failed get list of test files in tests/ci/test_*.py from find_tests job" > /dev/stderr
            exit 1
          fi
      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
          activate-environment: true
      - name: Cache uv packages and venv
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/uv
            .venv
          key: ${{ runner.os }}-uv-venv-${{ hashFiles('pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-uv-venv-
      - run: uv sync --dev --all-extras
      - name: Get week number for cache key
        id: week
        run: echo "number=$(date +%Y-W%U)" >> $GITHUB_OUTPUT
      - name: Cache chromium binaries
        id: cache-chromium
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/ms-playwright
          key: ${{ runner.os }}-${{ runner.arch }}-chromium-${{ steps.week.outputs.number }}
          restore-keys: |
            ${{ runner.os }}-${{ runner.arch }}-chromium-
      - name: Install Chromium browser if not cached
        if: steps.cache-chromium.outputs.cache-hit != 'true'
        run: uvx playwright install chromium --with-deps --no-shell
      - name: Cache browser-use extensions
        uses: actions/cache@v4
        with:
          path: |
            ~/.config/browseruse/extensions
          key: ${{ runner.os }}-browseruse-extensions-${{ hashFiles('browser_use/browser/profile.py') }}
          restore-keys: |
            ${{ runner.os }}-browseruse-extensions-
      - name: Check if test file exists
        id: check-file
        run: |
          TEST_FILE="tests/ci/${{ matrix.test_filename }}.py"
          if [ -f "$TEST_FILE" ]; then
            echo "exists=true" >> $GITHUB_OUTPUT
            echo "✅ Test file found: $TEST_FILE"
          else
            echo "exists=false" >> $GITHUB_OUTPUT
            echo "❌ Test file not found: $TEST_FILE"
            echo "This file may have been renamed or removed. Current test files:"
            find tests/ci -name 'test_*.py' -type f | sed 's|tests/ci/||' | sed 's|\.py$||' | sort
          fi
      - name: Run test with retry
        if: steps.check-file.outputs.exists == 'true'
        uses: nick-fields/retry@v3
        with:
          timeout_minutes: 4
          # NOTE(review): max_attempts of 1 means no retry actually happens,
          # contradicting the step name — presumably kept at 1 because the
          # job-level timeout-minutes (4) leaves no room for a second attempt.
          # Confirm intent before raising.
          max_attempts: 1
          retry_on: error
          command: pytest "tests/ci/${{ matrix.test_filename }}.py"

  # Runs the agent task evaluation and reports the score on the PR.
  evaluate-tasks:
    needs: setup-chromium
    runs-on: ubuntu-latest
    timeout-minutes: 8 # Allow more time for agent eval
    env:
      IN_DOCKER: 'true'
      BROWSER_USE_CLOUD_SYNC: 'false'
      ANONYMIZED_TELEMETRY: 'false'
      BROWSER_USE_LOGGING_LEVEL: 'DEBUG'
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
      GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
      BROWSER_USE_API_KEY: ${{ secrets.BROWSER_USE_API_KEY }}
    steps:
      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
          activate-environment: true
      - name: Cache uv packages and venv
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/uv
            .venv
          key: ${{ runner.os }}-uv-venv-${{ hashFiles('pyproject.toml') }}
          restore-keys: |
            ${{ runner.os }}-uv-venv-
      - run: uv sync --dev --all-extras
      - name: Get week number for cache key
        id: week
        run: echo "number=$(date +%Y-W%U)" >> $GITHUB_OUTPUT
      - name: Cache chromium binaries
        id: cache-chromium
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/ms-playwright
          key: ${{ runner.os }}-${{ runner.arch }}-chromium-${{ steps.week.outputs.number }}
          restore-keys: |
            ${{ runner.os }}-${{ runner.arch }}-chromium-
      - name: Install Chromium browser if not cached
        if: steps.cache-chromium.outputs.cache-hit != 'true'
        run: uvx playwright install chromium --with-deps --no-shell
      - name: Cache browser-use extensions
        uses: actions/cache@v4
        with:
          path: |
            ~/.config/browseruse/extensions
          key: ${{ runner.os }}-browseruse-extensions-${{ hashFiles('browser_use/browser/profile.py') }}
          restore-keys: |
            ${{ runner.os }}-browseruse-extensions-
      - name: Run agent tasks evaluation and capture score
        id: eval
        uses: nick-fields/retry@v3
        with:
          timeout_minutes: 4
          max_attempts: 1
          retry_on: error
          # evaluate_tasks.py prints PASSED=, TOTAL= and DETAILED_RESULTS= lines;
          # export them to GITHUB_ENV for the reporting steps below.
          command: |
            python tests/ci/evaluate_tasks.py > result.txt
            cat result.txt
            echo "PASSED=$(grep '^PASSED=' result.txt | cut -d= -f2)" >> $GITHUB_ENV
            echo "TOTAL=$(grep '^TOTAL=' result.txt | cut -d= -f2)" >> $GITHUB_ENV
            echo "DETAILED_RESULTS=$(grep '^DETAILED_RESULTS=' result.txt | cut -d= -f2-)" >> $GITHUB_ENV
      - name: Print agent evaluation summary
        run: |
          echo "Agent tasks passed: $PASSED / $TOTAL"
      - name: Write agent evaluation summary to workflow overview
        run: |
          if [ "$PASSED" = "$TOTAL" ]; then
            COLOR="green"
          else
            COLOR="yellow"
          fi
          # NOTE(review): the HTML markup in this summary was lost in transit;
          # reconstructed as a colored heading — confirm against the original.
          echo "<h3 style=\"color: $COLOR\">Agent Tasks Score: $PASSED/$TOTAL</h3>" >> $GITHUB_STEP_SUMMARY
      - name: Comment PR with agent evaluation results
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        continue-on-error: true
        with:
          script: |
            const passed = parseInt(process.env.PASSED);
            const total = parseInt(process.env.TOTAL);
            const detailedResults = JSON.parse(process.env.DETAILED_RESULTS);
            const score = `${passed}/${total}`;
            // Guard against total being 0 or unset: NaN would make the
            // `percentage === 0` failure check below silently pass.
            const percentage = total > 0 ? Math.round((passed / total) * 100) : 0;

            // Fail the workflow if 0% pass rate
            if (percentage === 0) {
              core.setFailed(`Evaluation failed: 0% pass rate (${passed}/${total})`);
            }

            // Create detailed table
            let tableRows = '';
            detailedResults.forEach(result => {
              const emoji = result.success ? '✅' : '❌';
              const status = result.success ? 'Pass' : 'Fail';
              tableRows += `| ${result.task} | ${emoji} ${status} | ${result.reason} |\n`;
            });

            // NOTE(review): the <details>/<summary> wrapper was stripped from the
            // original; reconstructed from the orphaned "View detailed results" text.
            const comment = `## Agent Task Evaluation Results: ${score} (${percentage}%)

            <details>
            <summary>View detailed results</summary>

            | Task | Result | Reason |
            |------|--------|--------|
            ${tableRows}
            </details>

            Check the [evaluate-tasks job](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for detailed task execution logs.
            `;

            // Find existing comment to update or create new one
            const { data: comments } = await github.rest.issues.listComments({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
            });

            const botComment = comments.find(comment =>
              comment.user.type === 'Bot' &&
              comment.body.includes('Agent Task Evaluation Results')
            );

            if (botComment) {
              // Update existing comment
              await github.rest.issues.updateComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                comment_id: botComment.id,
                body: comment
              });
            } else {
              // Create new comment
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: context.issue.number,
                body: comment
              });
            }