# CI workflow: Playwright/Chromium cache warm-up, per-file test matrix, and agent task evaluation.
# (Recovered from a table-wrapped scrape; non-workflow page chrome removed.)
name: test

# Workflow-wide permissions (least privilege for what the jobs below need).
permissions:
  actions: read
  contents: write
  pull-requests: write # Allow writing comments on PRs
  issues: write # Allow writing comments on issues
  statuses: write # Allow writing statuses on PRs
  discussions: write

# Cancel in-progress runs when a new commit is pushed to the same branch/PR
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

on:
  push:
    branches:
      - main
      - stable
      - 'releases/**'
    tags:
      - '*'
  pull_request:
  workflow_dispatch:
jobs:
  # Warm the weekly Chromium cache so the matrix jobs below get cache hits
  # instead of each downloading the browser independently.
  setup-chromium:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v4
      - uses: astral-sh/setup-uv@v6
      - name: Get week number for cache key
        id: week
        # Weekly-rotating key: browser binaries refresh at most once per week.
        run: echo "number=$(date +%Y-W%U)" >> "$GITHUB_OUTPUT"
      - name: Cache chromium binaries
        id: cache-chromium
        uses: actions/cache@v4
        with:
          path: |
            ~/.cache/ms-playwright
          key: ${{ runner.os }}-${{ runner.arch }}-chromium-${{ steps.week.outputs.number }}
          restore-keys: |
            ${{ runner.os }}-${{ runner.arch }}-chromium-
      - name: Install Chromium if not cached
        if: steps.cache-chromium.outputs.cache-hit != 'true'
        run: uvx playwright install chromium --with-deps --no-shell
| find_tests: | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 5 # Prevent hanging | |
| outputs: | |
| TEST_FILENAMES: ${{ steps.lsgrep.outputs.TEST_FILENAMES }} | |
| # ["test_browser", "test_tools", "test_browser_session", "test_tab_management", ...] | |
| steps: | |
| - uses: actions/checkout@v4 | |
| with: | |
| # Force fresh checkout to avoid any caching issues | |
| fetch-depth: 1 | |
| - id: lsgrep | |
| run: | | |
| echo "π Discovering test files at $(date)" | |
| echo "Git commit: $(git rev-parse HEAD)" | |
| echo "Git branch: $(git branch --show-current)" | |
| echo "" | |
| TEST_FILENAMES="$(find tests/ci -name 'test_*.py' -type f | sed 's|^tests/ci/||' | sed 's|\.py$||' | jq -R -s -c 'split("\n")[:-1]')" | |
| echo "TEST_FILENAMES=${TEST_FILENAMES}" >> "$GITHUB_OUTPUT" | |
| echo "π Test matrix: $TEST_FILENAMES" | |
| # https://code.dblock.org/2021/09/03/generating-task-matrix-by-looping-over-repo-files-with-github-actions.html | |
| - name: Check that at least one test file is found | |
| run: | | |
| if [ -z "${{ steps.lsgrep.outputs.TEST_FILENAMES }}" ]; then | |
| echo "Failed to find any test_*.py files in tests/ci/ folder!" > /dev/stderr | |
| exit 1 | |
| fi | |
| tests: | |
| needs: [setup-chromium, find_tests] | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 4 # Reduced timeout - tests should complete quickly or retry | |
| env: | |
| IN_DOCKER: 'True' | |
| ANONYMIZED_TELEMETRY: 'false' | |
| BROWSER_USE_LOGGING_LEVEL: 'DEBUG' | |
| OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} | |
| PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }} | |
| ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} | |
| GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} | |
| GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} | |
| AZURE_OPENAI_KEY: ${{ secrets.AZURE_OPENAI_KEY }} | |
| AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }} | |
| BROWSER_USE_API_KEY: ${{ secrets.BROWSER_USE_API_KEY }} | |
| OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} | |
| strategy: | |
| matrix: | |
| test_filename: ${{ fromJson(needs.find_tests.outputs.TEST_FILENAMES || '["FAILED_TO_DISCOVER_TESTS"]') }} | |
| # autodiscovers all the files in tests/ci/test_*.py | |
| # - test_browser | |
| # - test_tools | |
| # - test_browser_session | |
| # - test_tab_management | |
| # ... and more | |
| name: ${{ matrix.test_filename }} | |
| steps: | |
| - name: Check that the previous step managed to find some test files for us to run | |
| run: | | |
| if [[ "${{ matrix.test_filename }}" == "FAILED_TO_DISCOVER_TESTS" ]]; then | |
| echo "Failed get list of test files in tests/ci/test_*.py from find_tests job" > /dev/stderr | |
| exit 1 | |
| fi | |
| - uses: actions/checkout@v4 | |
| - uses: astral-sh/setup-uv@v6 | |
| with: | |
| enable-cache: true | |
| activate-environment: true | |
| - name: Cache uv packages and venv | |
| uses: actions/cache@v4 | |
| with: | |
| path: | | |
| ~/.cache/uv | |
| .venv | |
| key: ${{ runner.os }}-uv-venv-${{ hashFiles('pyproject.toml') }} | |
| restore-keys: | | |
| ${{ runner.os }}-uv-venv- | |
| - run: uv sync --dev --all-extras | |
| - name: Get week number for cache key | |
| id: week | |
| run: echo "number=$(date +%Y-W%U)" >> $GITHUB_OUTPUT | |
| - name: Cache chromium binaries | |
| id: cache-chromium | |
| uses: actions/cache@v4 | |
| with: | |
| path: | | |
| ~/.cache/ms-playwright | |
| key: ${{ runner.os }}-${{ runner.arch }}-chromium-${{ steps.week.outputs.number }} | |
| restore-keys: | | |
| ${{ runner.os }}-${{ runner.arch }}-chromium- | |
| - name: Install Chromium browser if not cached | |
| if: steps.cache-chromium.outputs.cache-hit != 'true' | |
| run: uvx playwright install chromium --with-deps --no-shell | |
| - name: Cache browser-use extensions | |
| uses: actions/cache@v4 | |
| with: | |
| path: | | |
| ~/.config/browseruse/extensions | |
| key: ${{ runner.os }}-browseruse-extensions-${{ hashFiles('browser_use/browser/profile.py') }} | |
| restore-keys: | | |
| ${{ runner.os }}-browseruse-extensions- | |
| - name: Check if test file exists | |
| id: check-file | |
| run: | | |
| TEST_FILE="tests/ci/${{ matrix.test_filename }}.py" | |
| if [ -f "$TEST_FILE" ]; then | |
| echo "exists=true" >> $GITHUB_OUTPUT | |
| echo "β Test file found: $TEST_FILE" | |
| else | |
| echo "exists=false" >> $GITHUB_OUTPUT | |
| echo "β Test file not found: $TEST_FILE" | |
| echo "This file may have been renamed or removed. Current test files:" | |
| find tests/ci -name 'test_*.py' -type f | sed 's|tests/ci/||' | sed 's|\.py$||' | sort | |
| fi | |
| - name: Run test with retry | |
| if: steps.check-file.outputs.exists == 'true' | |
| uses: nick-fields/retry@v3 | |
| with: | |
| timeout_minutes: 4 | |
| max_attempts: 1 | |
| retry_on: error | |
| command: pytest "tests/ci/${{ matrix.test_filename }}.py" | |
| evaluate-tasks: | |
| needs: setup-chromium | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 8 # Allow more time for agent eval | |
| env: | |
| IN_DOCKER: 'true' | |
| BROWSER_USE_CLOUD_SYNC: 'false' | |
| ANONYMIZED_TELEMETRY: 'false' | |
| BROWSER_USE_LOGGING_LEVEL: 'DEBUG' | |
| OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} | |
| PERPLEXITY_API_KEY: ${{ secrets.PERPLEXITY_API_KEY }} | |
| ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} | |
| GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} | |
| GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} | |
| BROWSER_USE_API_KEY: ${{ secrets.BROWSER_USE_API_KEY }} | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - uses: astral-sh/setup-uv@v6 | |
| with: | |
| enable-cache: true | |
| activate-environment: true | |
| - name: Cache uv packages and venv | |
| uses: actions/cache@v4 | |
| with: | |
| path: | | |
| ~/.cache/uv | |
| .venv | |
| key: ${{ runner.os }}-uv-venv-${{ hashFiles('pyproject.toml') }} | |
| restore-keys: | | |
| ${{ runner.os }}-uv-venv- | |
| - run: uv sync --dev --all-extras | |
| - name: Get week number for cache key | |
| id: week | |
| run: echo "number=$(date +%Y-W%U)" >> $GITHUB_OUTPUT | |
| - name: Cache chromium binaries | |
| id: cache-chromium | |
| uses: actions/cache@v4 | |
| with: | |
| path: | | |
| ~/.cache/ms-playwright | |
| key: ${{ runner.os }}-${{ runner.arch }}-chromium-${{ steps.week.outputs.number }} | |
| restore-keys: | | |
| ${{ runner.os }}-${{ runner.arch }}-chromium- | |
| - name: Install Chromium browser if not cached | |
| if: steps.cache-chromium.outputs.cache-hit != 'true' | |
| run: uvx playwright install chromium --with-deps --no-shell | |
| - name: Cache browser-use extensions | |
| uses: actions/cache@v4 | |
| with: | |
| path: | | |
| ~/.config/browseruse/extensions | |
| key: ${{ runner.os }}-browseruse-extensions-${{ hashFiles('browser_use/browser/profile.py') }} | |
| restore-keys: | | |
| ${{ runner.os }}-browseruse-extensions- | |
| - name: Run agent tasks evaluation and capture score | |
| id: eval | |
| uses: nick-fields/retry@v3 | |
| with: | |
| timeout_minutes: 4 | |
| max_attempts: 1 | |
| retry_on: error | |
| command: | | |
| python tests/ci/evaluate_tasks.py > result.txt | |
| cat result.txt | |
| echo "PASSED=$(grep '^PASSED=' result.txt | cut -d= -f2)" >> $GITHUB_ENV | |
| echo "TOTAL=$(grep '^TOTAL=' result.txt | cut -d= -f2)" >> $GITHUB_ENV | |
| echo "DETAILED_RESULTS=$(grep '^DETAILED_RESULTS=' result.txt | cut -d= -f2-)" >> $GITHUB_ENV | |
| - name: Print agent evaluation summary | |
| run: | | |
| echo "Agent tasks passed: $PASSED / $TOTAL" | |
| - name: Write agent evaluation summary to workflow overview | |
| run: | | |
| if [ "$PASSED" = "$TOTAL" ]; then | |
| COLOR="green" | |
| else | |
| COLOR="yellow" | |
| fi | |
| echo "<h2>Agent Tasks Score: <span style='color:$COLOR;'>$PASSED/$TOTAL</span></h2>" >> $GITHUB_STEP_SUMMARY | |
| - name: Comment PR with agent evaluation results | |
| if: github.event_name == 'pull_request' | |
| uses: actions/github-script@v7 | |
| continue-on-error: true | |
| with: | |
| script: | | |
| const passed = parseInt(process.env.PASSED); | |
| const total = parseInt(process.env.TOTAL); | |
| const detailedResults = JSON.parse(process.env.DETAILED_RESULTS); | |
| const score = `${passed}/${total}`; | |
| const percentage = Math.round((passed / total) * 100); | |
| // Fail the workflow if 0% pass rate | |
| if (percentage === 0) { | |
| core.setFailed(`Evaluation failed: 0% pass rate (${passed}/${total})`); | |
| } | |
| // Create detailed table | |
| let tableRows = ''; | |
| detailedResults.forEach(result => { | |
| const emoji = result.success ? 'β ' : 'β'; | |
| const status = result.success ? 'Pass' : 'Fail'; | |
| tableRows += `| ${result.task} | ${emoji} ${status} | ${result.reason} |\n`; | |
| }); | |
| const comment = `## Agent Task Evaluation Results: ${score} (${percentage}%) | |
| <details> | |
| <summary>View detailed results</summary> | |
| | Task | Result | Reason | | |
| |------|--------|--------| | |
| ${tableRows} | |
| Check the [evaluate-tasks job](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for detailed task execution logs. | |
| </details>`; | |
| // Find existing comment to update or create new one | |
| const { data: comments } = await github.rest.issues.listComments({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| issue_number: context.issue.number, | |
| }); | |
| const botComment = comments.find(comment => | |
| comment.user.type === 'Bot' && | |
| comment.body.includes('Agent Task Evaluation Results') | |
| ); | |
| if (botComment) { | |
| // Update existing comment | |
| await github.rest.issues.updateComment({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| comment_id: botComment.id, | |
| body: comment | |
| }); | |
| } else { | |
| // Create new comment | |
| await github.rest.issues.createComment({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| issue_number: context.issue.number, | |
| body: comment | |
| }); | |
| } | |