# GitHub Actions workflow "model": runs the model test suites on pushes to
# main / v0.* and on pull requests against those branches.
name: model

# Triggers: every push to main or a v0.* branch, and pull requests targeting
# those branches when they change at least one of the listed paths.
on:
  push:
    branches: [main, 'v0.*']
  pull_request:
    branches: [main, 'v0.*']
    paths:
      - 'verl/**/*.py'
      - '.github/workflows/model.yml'
      - 'tests/special_distributed/test_fsdp_ckpt.py'
      - 'tests/special_distributed/test_tensor_dict.py'
      - 'tests/models/**'
      - 'tests/special_distributed/run_all.sh'

# Workflow-wide GITHUB_TOKEN scope: read-only access to repository contents.
permissions:
  contents: read

# Runs are grouped per workflow + ref. A newer run cancels the in-progress one
# on every ref except main.
concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

# Workflow-level values consumed by the setup and cleanup jobs.
env:
  # Container image passed to the runner-creation action as `mlp-image`.
  IMAGE: 'verl-ci-cn-beijing.cr.volces.com/verlai/verl:vllm017.dev2'
  # Endpoint passed to the runner action as `faas-url`.
  DYNAMIC_RUNNER_ENDPOINT: 'https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner'

jobs:
  # Calls the runner action in "create" mode and exposes the resulting runner
  # label and task id to the downstream jobs. Only runs in the upstream org.
  setup:
    if: github.repository_owner == 'verl-project'
    runs-on: ubuntu-latest
    outputs:
      runner-label: ${{ steps.create-runner.outputs.runner-label }}
      mlp-task-id: ${{ steps.create-runner.outputs.mlp-task-id }}
    steps:
      # Pinned to the same commit the test jobs below use, instead of the
      # floating v4 tag.
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
      - id: create-runner
        uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "create"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-image: "${{ env.IMAGE }}"

  # Remove-padding model tests, FSDP checkpoint test, Ulysses tests and the
  # distributed test script, on the runner created by `setup`
  # (falls back to the 'L20x8' label when no runner label is available).
  model_rmpad:
    needs: setup
    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
    timeout-minutes: 20
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0"
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
      - name: Install the current repository and upgrade transformers (<5.0.0)
        run: |
          pip3 install -r requirements-test.txt
          pip3 install --no-deps -e .
          pip3 install --upgrade "transformers<5.0.0"
      # This suite used to be run twice back-to-back with no package change in
      # between; a single run gives the same coverage.
      - name: Running rmpad model tests on 8 L20 GPUs
        run: |
          pytest -s tests/models/test_transformer.py
      - name: Running FSDP rmpad model tests on 8 L20 GPUs + latest flash_attn
        run: |
          STRATEGY=fsdp torchrun --nproc_per_node=8 tests/special_distributed/test_fsdp_ckpt.py
      - name: Running transformers ulysses tests on 8 L20 GPUs + latest transformers
        run: |
          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
      # NOTE: this step pins transformers to 4.54.1, so every later step in
      # this job (including the distributed test) runs against 4.54.1.
      - name: Running transformers ulysses tests on 8 L20 GPUs + transformers 4.54.1
        run: |
          pip3 install transformers==4.54.1
          torchrun --nproc_per_node=8 -m pytest tests/models/test_transformers_ulysses.py
      - name: Run distributed test
        run: |
          bash tests/special_distributed/run_all.sh

  # Same checkpoint test as above with STRATEGY=fsdp2, isolated in its own job.
  model_rmpad_fsdp2_unstable:
    needs: setup
    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
    timeout-minutes: 20
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0"
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
      # No package is upgraded here; only the test requirements and the
      # repository itself are installed.
      - name: Install the current repository
        run: |
          pip3 install -r requirements-test.txt
          pip3 install --no-deps -e .
      - name: Running FSDP2 rmpad model tests on 8 L20 GPUs + latest flash_attn
        run: |
          STRATEGY=fsdp2 torchrun --nproc_per_node=8 tests/special_distributed/test_fsdp_ckpt.py

  # Engine tests; downloads the Qwen2.5-0.5B-Instruct files first.
  model_engine:
    needs: setup
    runs-on: ["${{ needs.setup.outputs.runner-label || 'L20x8' }}"]
    timeout-minutes: 20
    env:
      HTTP_PROXY: ${{ secrets.PROXY_HTTP }}
      HTTPS_PROXY: ${{ secrets.PROXY_HTTPS }}
      NO_PROXY: "localhost,127.0.0.1,hf-mirror.com"
      HF_ENDPOINT: "https://hf-mirror.com"
      HF_HUB_ENABLE_HF_TRANSFER: "0"
    steps:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
        with:
          fetch-depth: 0
      - name: Install the current repository
        run: |
          pip3 install -r requirements-test.txt
          pip3 install --no-deps -e .
      - name: Download model config files
        run: |
          hf download Qwen/Qwen2.5-0.5B-Instruct --local-dir "$HOME/models/Qwen/Qwen2.5-0.5B-Instruct"
      - name: Running mcore engine tests on 8 L20 GPUs
        # `ray stop --force` presumably clears Ray processes left on the
        # runner before the tests start; confirm with the test owners.
        run: |
          ray stop --force
          pytest -s -x tests/models/test_engine.py

  # Calls the runner action in "destroy" mode for the task created by `setup`.
  # always() keeps this running after failed or cancelled test jobs. The owner
  # guard mirrors `setup`, so forks (where `setup` is skipped and no task id
  # exists) do not call the destroy endpoint with an empty id.
  cleanup:
    runs-on: ubuntu-latest
    needs: [setup, model_rmpad, model_rmpad_fsdp2_unstable, model_engine]
    if: always() && github.repository_owner == 'verl-project'
    steps:
      - id: destroy-runner
        uses: volcengine/vemlp-github-runner@v1
        with:
          mode: "destroy"
          faas-url: "${{ env.DYNAMIC_RUNNER_ENDPOINT }}"
          mlp-task-id: "${{ needs.setup.outputs.mlp-task-id }}"