# Workflow title as listed in the Actions UI.
name: "PR Test (AMD)"

# Title of an individual run:
#   "[<stage>] <sha>"  when a target stage and a PR head SHA are supplied,
#   "[<stage>]"        when only a target stage is supplied,
#   ""                 otherwise.
# The typed `target_stage` takes precedence over the dropdown value.
run-name: ${{ (inputs.target_stage || inputs.target_stage_select) && (inputs.pr_head_sha && format('[{0}] {1}', inputs.target_stage || inputs.target_stage_select, inputs.pr_head_sha) || format('[{0}]', inputs.target_stage || inputs.target_stage_select)) || '' }}
| |
# Triggers: pushes and pull requests against main that touch AMD-relevant
# paths, manual dispatch (optionally restricted to specific stages), and
# reusable-workflow calls from other workflows.
on:
  push:
    branches:
      - main
    paths:
      - "python/**"
      - "scripts/ci/**"
      - "test/**"
      - "sgl-kernel/**"
      - ".github/workflows/pr-test-amd.yml"
      - "docker/rocm.Dockerfile"

  pull_request:
    branches:
      - main
    paths:
      - "python/**"
      - "scripts/ci/**"
      - "test/**"
      - "sgl-kernel/**"
      - ".github/workflows/pr-test-amd.yml"
      - "docker/rocm.Dockerfile"

  workflow_dispatch:
    inputs:
      # Dropdown selection of a single stage; the empty entry means "no
      # explicit stage".
      target_stage_select:
        description: "Select a stage to run from dropdown (leave empty for auto-detect)"
        required: false
        type: choice
        default: ""
        options:
          - ""
          - sgl-kernel-unit-test-amd
          - sgl-kernel-unit-test-2-gpu-amd
          - stage-a-test-1-amd
          - jit-kernel-unit-test-amd
          - stage-b-test-small-1-gpu-amd
          - stage-b-test-small-1-gpu-amd-nondeterministic
          - stage-b-test-small-1-gpu-amd-mi35x
          - stage-b-test-large-1-gpu-amd
          - stage-b-test-large-2-gpu-amd
          - multimodal-gen-test-1-gpu-amd
          - multimodal-gen-test-2-gpu-amd
          - stage-c-test-large-8-gpu-amd
          - stage-c-test-large-8-gpu-amd-mi35x
          - stage-b-test-large-8-gpu-disaggregation-amd
      # Free-form alternative to the dropdown; job conditions read this value
      # first and only fall back to the dropdown when it is empty.
      target_stage:
        description: "Or type comma-separated stage names (overrides dropdown if non-empty)"
        required: false
        type: string
        default: ""
      pr_head_sha:
        description: "PR head SHA to checkout (for /rerun-stage on fork PRs)"
        required: false
        type: string
        default: ""
      aiter_ref:
        description: "Override AITER commit (optional, leave empty to use Dockerfile default)"
        required: false
        type: string
        default: ""
      continue_on_error:
        description: "Continue on error (do not fail the workflow on test failures)"
        required: false
        type: boolean
        default: false

  workflow_call:
    inputs:
      ref:
        description: "Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch."
        required: false
        type: string
        default: ""
      run_all_tests:
        description: "Run all tests (for releasing or testing purpose)"
        required: false
        type: boolean
        default: false
      aiter_ref:
        description: "Override AITER commit (optional, leave empty to use Dockerfile default)"
        required: false
        type: string
        default: ""
      continue_on_error:
        description: "Continue on error (do not fail the workflow on test failures)"
        required: false
        type: boolean
        default: false
|
|
# Workflow-wide environment: exposes the optional `aiter_ref` input to every
# job as AITER_COMMIT_OVERRIDE (empty when the input is not provided).
env:
  AITER_COMMIT_OVERRIDE: "${{ inputs.aiter_ref }}"
|
|
concurrency:
  # Grouping key:
  #   - run_all_tests: a unique "full-<run_id>" group, so full runs never
  #     share a group with anything else;
  #   - otherwise the first non-empty of pr_head_sha, the workflow_call `ref`
  #     input, and github.ref.
  group: pr-test-amd-${{ inputs.run_all_tests && format('full-{0}', github.run_id) || inputs.pr_head_sha || inputs.ref || github.ref }}
  # Cancel superseded runs, except for full runs and reusable-workflow calls.
  # Inside a called workflow github.event_name is the caller's event, so a
  # comparison against 'workflow_call' never matches; the `ref` input (only
  # declared for workflow_call) is used to recognise such calls instead.
  cancel-in-progress: ${{ !inputs.run_all_tests && !inputs.ref }}
|
|
jobs:
  # Entry job: runs the repository's reusable pr-gate workflow, forwarding all
  # secrets of this workflow to it. check-changes waits on this job.
  call-gate:
    secrets: inherit
    uses: ./.github/workflows/pr-gate.yml
# Job (under `jobs:`): decides which test groups need to run.
# Each output is the corresponding paths-filter result when the filter step
# ran; when the filter step is skipped (run_all_tests == 'true') its outputs
# are empty and the expression falls back to the run-mode value.
check-changes:
  runs-on: ubuntu-latest
  needs: [call-gate]
  outputs:
    main_package: ${{ steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }}
    sgl_kernel: ${{ steps.filter.outputs.sgl_kernel || steps.run-mode.outputs.run_all_tests }}
    jit_kernel: ${{ steps.filter.outputs.jit_kernel || steps.run-mode.outputs.run_all_tests }}
    multimodal_gen: ${{ steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Precedence: explicit PR head SHA, then the workflow_call ref, then
        # the commit that triggered the run.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Determine run mode
      id: run-mode
      env:
        RUN_ALL_TESTS: ${{ inputs.run_all_tests }}
        EVENT_NAME: ${{ github.event_name }}
      run: |
        # All tests run only when the run_all_tests input is true; every
        # other trigger uses path-based filtering.
        # Note: github.event_name is inherited from the caller for
        # workflow_call, so it cannot be used to detect a reusable call.
        if [[ "${RUN_ALL_TESTS}" == "true" ]]; then
          echo "run_all_tests=true" >> "$GITHUB_OUTPUT"
          echo "Run mode: ALL TESTS (run_all_tests=${RUN_ALL_TESTS})"
        else
          echo "run_all_tests=false" >> "$GITHUB_OUTPUT"
          echo "Run mode: FILTERED (triggered by ${EVENT_NAME})"
        fi

    # Skipped in run-all mode; the job outputs then fall back to 'true'.
    - name: Detect file changes
      id: filter
      if: steps.run-mode.outputs.run_all_tests != 'true'
      uses: dorny/paths-filter@v3
      with:
        filters: |
          main_package:
            - "python/sglang/!(multimodal_gen)/**"
            - "python/pyproject_rocm.toml"
            - "python/pyproject_other.toml"
            - "scripts/ci/amd/*"
            - "scripts/ci/utils/*"
            - "test/**"
            - ".github/workflows/pr-test-amd.yml"
          sgl_kernel:
            - "sgl-kernel/**"
            - ".github/workflows/pr-test-amd.yml"
          jit_kernel:
            - "python/sglang/jit_kernel/**"
            - ".github/workflows/pr-test-amd.yml"
          multimodal_gen:
            - "python/sglang/multimodal_gen/**"
            - "python/sglang/cli/**"
            - "python/sglang/jit_kernel/diffusion/**"
            - "python/pyproject_rocm.toml"
            - "python/pyproject_other.toml"
| |
| |
# Job (under `jobs:`): sgl-kernel unit tests on a single MI325 GPU.
# Runs when this stage is explicitly selected, or - with no stage selected -
# when check-changes reports sgl_kernel changes.
sgl-kernel-unit-test-amd:
  needs: [check-changes]
  if: |
    always() && (
      contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-amd,') ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.sgl_kernel == 'true'
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 14
      run: |
        # Run one pytest file inside the CI container.
        #   $1: directory below /sglang-checkout/sgl-kernel
        #   $2: test file name
        run_pytest() {
          docker exec -w "/sglang-checkout/sgl-kernel/$1" ci_sglang python3 -m pytest "$2"
        }

        run_pytest tests test_moe_align.py
        run_pytest tests test_moe_topk_softmax.py
        run_pytest tests/speculative test_eagle_utils.py
        run_pytest tests test_apply_token_bitmask_inplace.py
        run_pytest tests test_activation.py
        run_pytest tests test_topk.py
        run_pytest tests test_kvcacheio.py
        run_pytest tests test_moe_topk_sigmoid.py
        run_pytest tests test_torch_defaults_reset.py
| |
# Job (under `jobs:`): sgl-kernel all-reduce determinism tests on a 2-GPU
# MI325 runner. Same trigger rule as the 1-GPU sgl-kernel job: explicit stage
# selection, or sgl_kernel changes when no stage is selected.
sgl-kernel-unit-test-2-gpu-amd:
  needs: [check-changes]
  if: |
    always() && (
      contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-2-gpu-amd,') ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.sgl_kernel == 'true'
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-2gpu-sglang
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 20
      run: |
        # Both files live in sgl-kernel/tests and are run one after the other.
        for test_file in \
          test_amd_deterministic_custom_allreduce.py \
          test_amd_nccl_allreduce_determinism.py
        do
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest "$test_file"
        done
| |
| |
|
|
# Job (under `jobs:`): stage A suite on a single MI325 GPU.
# Runs when explicitly selected; otherwise only if nothing upstream failed or
# was cancelled and main_package or sgl_kernel changed.
stage-a-test-1-amd:
  needs: [check-changes]
  if: |
    always() && (
      contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-a-test-1-amd,') ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        !failure() && !cancelled() &&
        (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true')
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 10
      run: |
        # --continue-on-error is appended only when the input is true.
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" \
          python3 run_suite.py --hw amd --suite stage-a-test-1-amd \
          ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
| |
# Job (under `jobs:`): JIT kernel unit test on a single MI325 GPU.
# Runs when explicitly selected, or - with no stage selected - when
# check-changes reports jit_kernel changes.
jit-kernel-unit-test-amd:
  needs: [check-changes]
  if: |
    always() && (
      contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',jit-kernel-unit-test-amd,') ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.jit_kernel == 'true'
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run JIT kernel unit tests
      timeout-minutes: 10
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout" \
          python3 -m pytest -q python/sglang/jit_kernel/tests/test_store_cache.py
| |
# Job (under `jobs:`): stage B "small" suite on single-GPU MI325 runners,
# split into 14 partitions (matrix.part 0-13 with --auto-partition-size 14).
# Runs when explicitly selected; otherwise only after stage A without
# failures/cancellations and when main_package or sgl_kernel changed.
stage-b-test-small-1-gpu-amd:
  needs: [check-changes, stage-a-test-1-amd]
  if: |
    always() && (
      contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-small-1-gpu-amd,') ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        !failure() && !cancelled() &&
        (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true')
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
      # Keep the number of entries in sync with --auto-partition-size below.
      part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" \
          python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd \
          --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 \
          --timeout-per-file 1800 \
          ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
| |
# Job (under `jobs:`): stage B "small, nondeterministic" suite on a single
# MI325 GPU (not partitioned). Runs when explicitly selected; otherwise only
# after stage A without failures/cancellations and when main_package or
# sgl_kernel changed.
stage-b-test-small-1-gpu-amd-nondeterministic:
  needs: [check-changes, stage-a-test-1-amd]
  if: |
    always() && (
      contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-small-1-gpu-amd-nondeterministic,') ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        !failure() && !cancelled() &&
        (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true')
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" \
          python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic \
          --timeout-per-file 1800 \
          ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
| |
# Job (under `jobs:`): stage B "small" suite on a single-GPU MI35x runner.
# Runs when explicitly selected; otherwise only after stage A without
# failures/cancellations and when main_package or sgl_kernel changed.
stage-b-test-small-1-gpu-amd-mi35x:
  needs: [check-changes, stage-a-test-1-amd]
  if: |
    always() && (
      contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-small-1-gpu-amd-mi35x,') ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        !failure() && !cancelled() &&
        (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true')
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi35x-gpu-1
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" \
          python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-mi35x \
          ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
| |
# Job (under `jobs:`): stage B "large" suite on single-GPU MI325 runners,
# split into 2 partitions. Runs when explicitly selected; otherwise only
# after stage A without failures/cancellations and when main_package or
# sgl_kernel changed.
stage-b-test-large-1-gpu-amd:
  needs: [check-changes, stage-a-test-1-amd]
  if: |
    always() && (
      contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-1-gpu-amd,') ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        !failure() && !cancelled() &&
        (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true')
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
      # Keep the number of entries in sync with --auto-partition-size below.
      part: [0, 1]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" \
          python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-amd \
          --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 \
          --timeout-per-file 1800 \
          ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
| |
# Job (under `jobs:`): stage B "large" suite on 2-GPU MI325 runners, split
# into 2 partitions. Runs when explicitly selected; otherwise only after
# stage A without failures/cancellations and when main_package or sgl_kernel
# changed.
stage-b-test-large-2-gpu-amd:
  needs: [check-changes, stage-a-test-1-amd]
  if: |
    always() && (
      contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-2-gpu-amd,') ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        !failure() && !cancelled() &&
        (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true')
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-2gpu-sglang
      # Keep the number of entries in sync with --auto-partition-size below.
      part: [0, 1]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" \
          python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd \
          --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 \
          --timeout-per-file 1800 \
          ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
| |
# Job (under `jobs:`): diffusion (multimodal_gen) server tests on a single
# MI325 GPU, 2 partitions executed one at a time (max-parallel: 1).
# Runs when explicitly selected, or - with no stage selected - when
# check-changes reports multimodal_gen changes.
multimodal-gen-test-1-gpu-amd:
  needs: [check-changes]
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-1-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.multimodal_gen == 'true'
      )
    )
  strategy:
    fail-fast: false
    max-parallel: 1
    matrix:
      runner: [linux-mi325-1gpu-sglang]
      part: [0, 1]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ensure_vram_clear.sh rocm

    # NOTE(review): the pattern names a CUDA wheel artifact and no job in this
    # workflow uploads artifacts - confirm this step is still needed on AMD.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion

    - name: Setup kernel caches
      run: |
        # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
        # This directory persists across container restarts on the self-hosted runner
        docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub

        # Remove pre-built AITER kernels so they are rebuilt on first use.
        # The glob must be expanded by a shell INSIDE the container: an
        # unquoted *.so on this line would be expanded by the host shell, and
        # `docker exec` runs `rm` without a shell, so nothing would be removed.
        echo "Clearing pre-built AITER kernels from Docker image..."
        docker exec ci_sglang bash -c 'rm -rf /sgl-workspace/aiter/aiter/jit/*.so' 2>/dev/null || true
        docker exec ci_sglang bash -c 'rm -rf /sgl-data/aiter-kernels/*.so' 2>/dev/null || true
        echo "AITER kernels cleared - will be rebuilt on first use"

        # Drop a marker file only when /sgl-data is a real mount point.
        if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
          docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
          echo "Created .persistent_cache marker - HF cache will persist"
        else
          echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
        fi

        # Report how many MIOpen cache files already exist (informational).
        miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0")
        echo "Found ${miopen_files} MIOpen cache files"

    - name: Diagnose HF cache and system resources
      run: |
        echo "=== System Memory Status ==="
        free -h
        echo ""
        echo "=== Disk Space ==="
        df -h /home/runner/sgl-data 2>/dev/null || df -h
        echo ""
        echo "=== HF Cache Directory Structure ==="
        docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found"
        docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found"
        echo ""
        echo "=== Checking for cached diffusion models (1-GPU tests) ==="
        # Models used in 1-GPU tests: Wan2.1-T2V-1.3B, HunyuanVideo, Qwen-Image, FLUX.1, FLUX.2
        for model in "Wan-AI--Wan2.1-T2V-1.3B-Diffusers" "tencent--HunyuanVideo" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev" "black-forest-labs--FLUX.2-dev"; do
          cache_path="/sgl-data/hf-cache/hub/models--${model}"
          if docker exec ci_sglang test -d "$cache_path"; then
            size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1)
            echo "✓ CACHED: $model ($size)"
          else
            echo "✗ NOT CACHED: $model"
          fi
        done
        echo ""
        echo "=== GPU Memory Status ==="
        docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available"

    - name: Run diffusion server tests (1-GPU)
      timeout-minutes: 70
      run: |
        # AMD CI: All 1-GPU tests except FLUX.2 (FLUX.1 covers same code path)
        # Tests: T2V, T2I, I2V, LoRA
        #
        # HF download env vars:
        # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
        # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
        docker exec \
          -e SGLANG_E2E_TOLERANCE=0.3 \
          -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
          -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
          -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
          -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
          -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
          -e HF_HUB_ENABLE_HF_TRANSFER=1 \
          -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
          -w /sglang-checkout/python \
          ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
          --suite 1-gpu \
          --partition-id ${{ matrix.part }} \
          --total-partitions 2 \
          -k "not flux_2"

        echo "=== Post-test System Memory Status ==="
        free -h
|
|
# Job (under `jobs:`): diffusion (multimodal_gen) server tests on a 2-GPU
# MI325 runner, 2 partitions executed one at a time (max-parallel: 1).
# Runs when explicitly selected, or - with no stage selected - when
# check-changes reports multimodal_gen changes.
multimodal-gen-test-2-gpu-amd:
  needs: [check-changes]
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-2-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.multimodal_gen == 'true'
      )
    )
  strategy:
    fail-fast: false
    max-parallel: 1
    matrix:
      runner: [linux-mi325-2gpu-sglang]
      part: [0, 1]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ensure_vram_clear.sh rocm

    # NOTE(review): the pattern names a CUDA wheel artifact and no job in this
    # workflow uploads artifacts - confirm this step is still needed on AMD.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion

    - name: Setup kernel caches
      run: |
        # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
        docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub

        # Remove pre-built AITER kernels so they are rebuilt on first use.
        # The glob must be expanded by a shell INSIDE the container: an
        # unquoted *.so on this line would be expanded by the host shell, and
        # `docker exec` runs `rm` without a shell, so nothing would be removed.
        echo "Clearing pre-built AITER kernels from Docker image..."
        docker exec ci_sglang bash -c 'rm -rf /sgl-workspace/aiter/aiter/jit/*.so' 2>/dev/null || true
        docker exec ci_sglang bash -c 'rm -rf /sgl-data/aiter-kernels/*.so' 2>/dev/null || true
        echo "AITER kernels cleared - will be rebuilt on first use"

        # Drop a marker file only when /sgl-data is a real mount point.
        if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
          docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
          echo "Created .persistent_cache marker - HF cache will persist"
        else
          echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
        fi

        # Report how many MIOpen cache files already exist (informational).
        miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0")
        echo "Found ${miopen_files} MIOpen cache files"

    - name: Diagnose HF cache and system resources
      run: |
        echo "=== System Memory Status ==="
        free -h
        echo ""
        echo "=== Disk Space ==="
        df -h /home/runner/sgl-data 2>/dev/null || df -h
        echo ""
        echo "=== HF Cache Directory Structure ==="
        docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found"
        docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found"
        echo ""
        echo "=== Checking for cached diffusion models (2-GPU tests) ==="
        # Models used in 2-GPU tests: Wan2.2-T2V-A14B, Wan2.1-T2V-14B, Qwen-Image, FLUX.1
        for model in "Wan-AI--Wan2.2-T2V-A14B-Diffusers" "Wan-AI--Wan2.1-T2V-14B-Diffusers" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev"; do
          cache_path="/sgl-data/hf-cache/hub/models--${model}"
          if docker exec ci_sglang test -d "$cache_path"; then
            size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1)
            echo "✓ CACHED: $model ($size)"
          else
            echo "✗ NOT CACHED: $model"
          fi
        done
        echo ""
        echo "=== GPU Memory Status ==="
        docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available"

    - name: Run diffusion server tests (2-GPU)
      timeout-minutes: 80
      run: |
        # AMD CI: All 2-GPU tests including LoRA
        # Tests: T2V, T2I, I2V, LoRA
        #
        # HF download env vars:
        # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
        # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
        docker exec \
          -e SGLANG_E2E_TOLERANCE=0.3 \
          -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
          -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
          -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
          -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
          -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
          -e HF_HUB_ENABLE_HF_TRANSFER=1 \
          -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
          -w /sglang-checkout/python \
          ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
          --suite 2-gpu \
          --partition-id ${{ matrix.part }} \
          --total-partitions 2

        echo "=== Post-test System Memory Status ==="
        free -h
|
|
|
|
# Job (under `jobs:`): stage C "large" suite on 8-GPU MI325 runners, split
# into 3 partitions. Runs when explicitly selected; otherwise only after the
# listed stage B jobs without failures/cancellations and when main_package or
# sgl_kernel changed.
stage-c-test-large-8-gpu-amd:
  needs:
    - check-changes
    - call-gate
    - stage-b-test-small-1-gpu-amd
    - stage-b-test-large-2-gpu-amd
  if: |
    always() && (
      contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd,') ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        !failure() && !cancelled() &&
        (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true')
      )
    )
  runs-on: ${{ matrix.runner }}
  env:
    RUNNER_LABELS: linux-mi325-8gpu-sglang
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-8gpu-sglang
      # Keep the number of entries in sync with --auto-partition-size below.
      part: [0, 1, 2]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    # Multi-GPU communication check across all 8 GPUs before the suite runs.
    - name: Test RCCL multi-GPU communication
      timeout-minutes: 5
      run: |
        echo "Testing RCCL multi-GPU communication with debug info..."
        docker exec ci_sglang bash -c "cd /sglang-checkout && NCCL_DEBUG=INFO RCCL_DEBUG=INFO torchrun --nproc_per_node=8 scripts/ci/amd/test_rccl_multi_gpu.py"

    - name: Run test
      timeout-minutes: 60
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" \
          python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd \
          --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 \
          --timeout-per-file 3600 \
          ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
| |
# Job (under `jobs:`): stage C "large" suite on an 8-GPU MI35x runner (single
# partition). Runs when explicitly selected; otherwise only after the listed
# stage B jobs without failures/cancellations and when main_package or
# sgl_kernel changed.
stage-c-test-large-8-gpu-amd-mi35x:
  needs:
    - check-changes
    - call-gate
    - stage-b-test-small-1-gpu-amd
    - stage-b-test-large-2-gpu-amd
  if: |
    always() && (
      contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd-mi35x,') ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        !failure() && !cancelled() &&
        (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true')
      )
    )
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi35x-gpu-8
      # Keep the number of entries in sync with --auto-partition-size below.
      part: [0]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 60
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" \
          python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x \
          --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 \
          --timeout-per-file 3600 \
          ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
| |
| |
# Job (under `jobs:`): disaggregation suite on an 8-GPU MI35x runner with
# RDMA diagnostics before the tests.
# Runs when explicitly selected; otherwise only after stage A without
# failures/cancellations and when main_package or sgl_kernel changed.
# Two stage names are accepted for explicit selection: the historical
# dropdown value (without "35x") and this job's actual id, so that typing the
# job id into `target_stage` also selects it.
stage-b-test-large-8-gpu-35x-disaggregation-amd:
  needs: [check-changes, stage-a-test-1-amd]
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-8-gpu-disaggregation-amd,')) ||
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-8-gpu-35x-disaggregation-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi35x-gpu-8.fabric]

  runs-on: ${{ matrix.runner }}

  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ensure_vram_clear.sh rocm

    # Diagnostic only: `set +e` keeps the step going when a probe fails.
    # Exports SGLANG_TEST_RDMA_DEVICE (comma-separated device names, or empty)
    # to later steps through GITHUB_ENV.
    - name: Check Host RDMA Environment
      id: rdma_detect
      run: |
        set +e
        echo "=== Checking Host RDMA Environment ==="

        echo ""
        echo "=== 1. Ionic driver library check ==="
        ls -l /usr/lib/x86_64-linux-gnu/libibverbs/libionic* 2>/dev/null || echo "libionic not found in standard path"

        echo ""
        echo "=== 2. Infiniband devices ==="
        ls -la /dev/infiniband/ 2>/dev/null || echo "/dev/infiniband not found"
        ls -la /sys/class/infiniband/ 2>/dev/null || echo "/sys/class/infiniband not found"

        echo ""
        echo "=== 3. ibv_devinfo ==="
        which ibv_devinfo 2>/dev/null && ibv_devinfo 2>&1 || echo "ibv_devinfo not available"

        echo ""
        echo "=== 4. Kernel modules ==="
        lsmod 2>/dev/null | grep -E "ib_|rdma|ionic" || echo "No RDMA kernel modules loaded"

        echo ""
        echo "=== 5. Detect RDMA Devices for test environment ==="
        if [ -d "/sys/class/infiniband" ]; then
          # Join the device names with commas.
          RDMA_DEVS=$(ls /sys/class/infiniband | paste -sd "," -)
          echo "Detected RDMA Devices: $RDMA_DEVS"
          echo "SGLANG_TEST_RDMA_DEVICE=$RDMA_DEVS" >> $GITHUB_ENV
        else
          echo "No RDMA devices found in /sys/class/infiniband"
          echo "SGLANG_TEST_RDMA_DEVICE=" >> $GITHUB_ENV
        fi

        echo ""
        echo "=== Host RDMA Check Complete ==="

    # Uses the disaggregation-specific container start script instead of the
    # one used by the other jobs.
    - name: Start Special Container
      run: bash scripts/ci/amd/amd_ci_start_container_disagg.sh
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    # Prints RDMA state as seen from inside the container; a zero HCA count
    # only produces a warning.
    - name: Verify RDMA in Container
      run: |
        docker exec -u root ci_sglang bash -c '
          echo "=== Container RDMA Verification ==="
          echo "Device nodes:"
          ls -la /dev/infiniband/
          echo ""
          echo "Provider libraries:"
          ls /usr/lib/x86_64-linux-gnu/libibverbs/ | grep -E "ionic|mlx" || echo "No Ionic/Mellanox providers"
          echo ""
          echo "HCA devices:"
          HCA_COUNT=$(ibv_devinfo -list 2>&1 | grep -oE "^[0-9]+ HCAs? found" | grep -oE "^[0-9]+" || echo "0")
          ibv_devinfo -list
          if [ "$HCA_COUNT" -gt 0 ]; then
            echo ""
            echo "=== SUCCESS: RDMA setup complete. Found $HCA_COUNT HCA(s) ==="
          else
            echo ""
            echo "=== WARNING: No HCAs detected. RDMA tests may fail ==="
          fi
        '

    - name: Run Aiter Op Test (RMSNorm)
      timeout-minutes: 10
      run: |
        echo "Running pre-check: test_rmsnorm2d.py"
        docker exec \
          -e MAX_JOBS=192 \
          ci_sglang \
          python /sgl-workspace/aiter/op_tests/test_rmsnorm2d.py

    - name: Run test_disaggregation
      timeout-minutes: 60
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh \
          -e SGLANG_TEST_RDMA_DEVICE="${{ env.SGLANG_TEST_RDMA_DEVICE }}" \
          -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-8-gpu-35x-disaggregation-amd --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}
| |
# Job (under `jobs:`): final aggregation job. Always runs, prints the result
# of every job it depends on, and fails as soon as it meets one whose result
# is "failure" or "cancelled" (skipped jobs do not fail the workflow).
pr-test-amd-finish:
  needs:
    - call-gate
    - check-changes
    - sgl-kernel-unit-test-amd
    - sgl-kernel-unit-test-2-gpu-amd
    - multimodal-gen-test-1-gpu-amd
    - multimodal-gen-test-2-gpu-amd
    - stage-a-test-1-amd
    - jit-kernel-unit-test-amd
    - stage-b-test-small-1-gpu-amd
    - stage-b-test-small-1-gpu-amd-nondeterministic
    - stage-b-test-small-1-gpu-amd-mi35x
    - stage-b-test-large-1-gpu-amd
    - stage-b-test-large-2-gpu-amd
    - stage-b-test-large-8-gpu-35x-disaggregation-amd
    - stage-c-test-large-8-gpu-amd
    - stage-c-test-large-8-gpu-amd-mi35x
  if: always()
  runs-on: ubuntu-latest
  steps:
    - name: Check all dependent job statuses
      env:
        # The `needs` context is handed over as an environment variable rather
        # than being pasted into the script text, so quotes or other shell
        # metacharacters in job outputs cannot break or alter the script.
        NEEDS_JSON: ${{ toJson(needs) }}
      run: |
        # Job ids in the order they appear in the needs context.
        job_names=$(jq -r 'keys_unsorted[]' <<<"$NEEDS_JSON")

        for job in $job_names; do
          # Result of this job: success, failure, cancelled or skipped.
          result=$(jq -r --arg j "$job" '.[$j].result' <<<"$NEEDS_JSON")

          echo "$job: $result"

          # Stop at the first job that failed or was cancelled.
          if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
            echo "The above jobs failed."
            exit 1
          fi
        done

        echo "All jobs completed successfully"
        exit 0
|
|