# AMD (ROCm) PR test pipeline.
#
# Flow: call-gate -> check-changes -> kernel / stage-a / stage-b / stage-c / multimodal
# jobs -> pr-test-amd-finish (aggregates results).
#
# Every test job's `if:` has two modes:
#   * targeted:    a stage name was passed via `target_stage` / `target_stage_select`;
#                  only the jobs whose name appears in that comma-separated list run.
#   * auto-detect: no stage was passed; a job runs when check-changes reports that the
#                  relevant paths changed (or run_all_tests is set) and no ancestor job
#                  failed or was cancelled.
name: PR Test (AMD)
# Dynamic run-name for /rerun-stage commands to enable URL lookup
# Format: "[stage-name] sha" for fork PRs, "[stage-name]" for non-fork, default for normal runs
run-name: ${{ (inputs.target_stage || inputs.target_stage_select) && (inputs.pr_head_sha && format('[{0}] {1}', inputs.target_stage || inputs.target_stage_select, inputs.pr_head_sha) || format('[{0}]', inputs.target_stage || inputs.target_stage_select)) || '' }}

on:
  push:
    branches: [ main ]
    paths:
      - "python/**"
      - "scripts/ci/**"
      - "test/**"
      - "sgl-kernel/**"
      - ".github/workflows/pr-test-amd.yml"
      - "docker/rocm.Dockerfile"
  pull_request:
    branches: [ main ]
    paths:
      - "python/**"
      - "scripts/ci/**"
      - "test/**"
      - "sgl-kernel/**"
      - ".github/workflows/pr-test-amd.yml"
      - "docker/rocm.Dockerfile"
  workflow_dispatch:
    inputs:
      target_stage_select:
        description: "Select a stage to run from dropdown (leave empty for auto-detect)"
        required: false
        type: choice
        default: ''
        options:
          - ''
          - sgl-kernel-unit-test-amd
          - sgl-kernel-unit-test-2-gpu-amd
          - stage-a-test-1-amd
          - jit-kernel-unit-test-amd
          - stage-b-test-small-1-gpu-amd
          - stage-b-test-small-1-gpu-amd-nondeterministic
          - stage-b-test-small-1-gpu-amd-mi35x
          - stage-b-test-large-1-gpu-amd
          - stage-b-test-large-2-gpu-amd
          - multimodal-gen-test-1-gpu-amd
          - multimodal-gen-test-2-gpu-amd
          - stage-c-test-large-8-gpu-amd
          - stage-c-test-large-8-gpu-amd-mi35x
          # Real job id / suite name of the disaggregation stage.
          - stage-b-test-large-8-gpu-35x-disaggregation-amd
          # Older alias for the same job; kept so existing callers keep working.
          - stage-b-test-large-8-gpu-disaggregation-amd
      target_stage:
        description: "Or type comma-separated stage names (overrides dropdown if non-empty)"
        required: false
        type: string
        default: ""
      pr_head_sha:
        description: "PR head SHA to checkout (for /rerun-stage on fork PRs)"
        required: false
        type: string
        default: ""
      aiter_ref:
        description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
        required: false
        type: string
        default: ''
      continue_on_error:
        description: 'Continue on error (do not fail the workflow on test failures)'
        required: false
        type: boolean
        default: false
  workflow_call:
    inputs:
      ref:
        description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.'
        required: false
        type: string
        default: ''
      run_all_tests:
        description: "Run all tests (for releasing or testing purpose)"
        required: false
        type: boolean
        default: false
      aiter_ref:
        description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
        required: false
        type: string
        default: ''
      continue_on_error:
        description: 'Continue on error (do not fail the workflow on test failures)'
        required: false
        type: boolean
        default: false

env:
  AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }}

concurrency:
  # When called via workflow_call with run_all_tests=true, use a unique group per run to
  # avoid collisions with direct push/PR triggers. We use run_all_tests (not github.event_name)
  # to detect this, because github.event_name inherits from the caller in workflow_call.
  group: pr-test-amd-${{ inputs.run_all_tests && format('full-{0}', github.run_id) || inputs.pr_head_sha || inputs.ref || github.ref }}
  cancel-in-progress: ${{ !inputs.run_all_tests && github.event_name != 'workflow_call' }}

jobs:
  call-gate:
    uses: ./.github/workflows/pr-gate.yml
    secrets: inherit

  # Decides which test groups are relevant. Each output is the paths-filter result,
  # or the run_all_tests flag when the filter step was skipped.
  check-changes:
    needs: [call-gate]
    runs-on: ubuntu-latest
    outputs:
      main_package: ${{ steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }}
      sgl_kernel: ${{ steps.filter.outputs.sgl_kernel || steps.run-mode.outputs.run_all_tests }}
      jit_kernel: ${{ steps.filter.outputs.jit_kernel || steps.run-mode.outputs.run_all_tests }}
      multimodal_gen: ${{ steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Determine run mode
        id: run-mode
        run: |
          # Run all tests when the caller sets run_all_tests=true.
          # Note: github.event_name is inherited from the caller, so it cannot be used to
          # detect workflow_call; the explicit run_all_tests input is checked instead.
          if [[ "${{ inputs.run_all_tests }}" == "true" ]]; then
            echo "run_all_tests=true" >> $GITHUB_OUTPUT
            echo "Run mode: ALL TESTS (run_all_tests=${{ inputs.run_all_tests }})"
          else
            echo "run_all_tests=false" >> $GITHUB_OUTPUT
            echo "Run mode: FILTERED (triggered by ${{ github.event_name }})"
          fi

      - name: Detect file changes
        id: filter
        uses: dorny/paths-filter@v3
        if: steps.run-mode.outputs.run_all_tests != 'true'
        with:
          filters: |
            main_package:
              - "python/sglang/!(multimodal_gen)/**"
              - "python/pyproject_rocm.toml"
              - "python/pyproject_other.toml"
              - "scripts/ci/amd/*"
              - "scripts/ci/utils/*"
              - "test/**"
              - ".github/workflows/pr-test-amd.yml"
            sgl_kernel:
              - "sgl-kernel/**"
              - ".github/workflows/pr-test-amd.yml"
            jit_kernel:
              - "python/sglang/jit_kernel/**"
              - ".github/workflows/pr-test-amd.yml"
            multimodal_gen:
              - "python/sglang/multimodal_gen/**"
              - "python/sglang/cli/**"
              - "python/sglang/jit_kernel/diffusion/**"
              - "python/pyproject_rocm.toml"
              - "python/pyproject_other.toml"

  # =============================================== sgl-kernel ====================================================
  sgl-kernel-unit-test-amd:
    needs: [check-changes]
    # In auto-detect mode the job is guarded by !failure() && !cancelled(), like the
    # stage-* jobs, so that cancelling the run (e.g. cancel-in-progress) stops it.
    if: |
      always() &&
      (
        (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-amd,')) ||
        (
          !(inputs.target_stage || inputs.target_stage_select) &&
          (!failure() && !cancelled()) &&
          needs.check-changes.outputs.sgl_kernel == 'true'
        )
      )
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi325-1gpu-sglang]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: |
          bash scripts/ci/amd/amd_ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 14
        run: |
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
          docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_apply_token_bitmask_inplace.py
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_activation.py
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_topk.py
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_kvcacheio.py
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_sigmoid.py
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_torch_defaults_reset.py

  sgl-kernel-unit-test-2-gpu-amd:
    needs: [check-changes]
    # Auto-detect branch is guarded by !failure() && !cancelled() so the job is cancellable.
    if: |
      always() &&
      (
        (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-2-gpu-amd,')) ||
        (
          !(inputs.target_stage || inputs.target_stage_select) &&
          (!failure() && !cancelled()) &&
          needs.check-changes.outputs.sgl_kernel == 'true'
        )
      )
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi325-2gpu-sglang]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: |
          bash scripts/ci/amd/amd_ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 20
        run: |
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_amd_deterministic_custom_allreduce.py
          docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_amd_nccl_allreduce_determinism.py

  # =============================================== primary ====================================================

  stage-a-test-1-amd:
    needs: [check-changes]
    if: |
      always() &&
      (
        (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-a-test-1-amd,')) ||
        (
          !(inputs.target_stage || inputs.target_stage_select) &&
          (!failure() && !cancelled()) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi325-1gpu-sglang]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: |
          bash scripts/ci/amd/amd_ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 10
        run: |
          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-amd ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

  jit-kernel-unit-test-amd:
    needs: [check-changes]
    # Auto-detect branch is guarded by !failure() && !cancelled() so the job is cancellable.
    if: |
      always() &&
      (
        (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',jit-kernel-unit-test-amd,')) ||
        (
          !(inputs.target_stage || inputs.target_stage_select) &&
          (!failure() && !cancelled()) &&
          needs.check-changes.outputs.jit_kernel == 'true'
        )
      )
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi325-1gpu-sglang]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: |
          bash scripts/ci/amd/amd_ci_install_dependency.sh

      - name: Run JIT kernel unit tests
        timeout-minutes: 10
        run: |
          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout" python3 -m pytest -q python/sglang/jit_kernel/tests/test_store_cache.py

  stage-b-test-small-1-gpu-amd:
    needs: [check-changes, stage-a-test-1-amd]
    if: |
      always() &&
      (
        (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-small-1-gpu-amd,')) ||
        (
          !(inputs.target_stage || inputs.target_stage_select) &&
          (!failure() && !cancelled()) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi325-1gpu-sglang]
        # Must stay in sync with --auto-partition-size below.
        part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd/amd_ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 30
        run: |
          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

  stage-b-test-small-1-gpu-amd-nondeterministic:
    needs: [check-changes, stage-a-test-1-amd]
    if: |
      always() &&
      (
        (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-small-1-gpu-amd-nondeterministic,')) ||
        (
          !(inputs.target_stage || inputs.target_stage_select) &&
          (!failure() && !cancelled()) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi325-1gpu-sglang]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd/amd_ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 30
        run: |
          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-nondeterministic --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

  stage-b-test-small-1-gpu-amd-mi35x:
    needs: [check-changes, stage-a-test-1-amd]
    if: |
      always() &&
      (
        (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-small-1-gpu-amd-mi35x,')) ||
        (
          !(inputs.target_stage || inputs.target_stage_select) &&
          (!failure() && !cancelled()) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi35x-gpu-1]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd/amd_ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 30
        run: |
          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-small-1-gpu-amd-mi35x ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

  stage-b-test-large-1-gpu-amd:
    needs: [check-changes, stage-a-test-1-amd]
    if: |
      always() &&
      (
        (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-1-gpu-amd,')) ||
        (
          !(inputs.target_stage || inputs.target_stage_select) &&
          (!failure() && !cancelled()) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi325-1gpu-sglang]
        part: [0, 1]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd/amd_ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 30
        run: |
          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

  stage-b-test-large-2-gpu-amd:
    needs: [check-changes, stage-a-test-1-amd]
    if: |
      always() &&
      (
        (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-2-gpu-amd,')) ||
        (
          !(inputs.target_stage || inputs.target_stage_select) &&
          (!failure() && !cancelled()) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi325-2gpu-sglang]
        part: [0, 1]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd/amd_ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 30
        run: |
          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

  multimodal-gen-test-1-gpu-amd:
    needs: [check-changes]
    # Auto-detect branch is guarded by !failure() && !cancelled() so the job is cancellable.
    if: |
      always() &&
      (
        (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-1-gpu-amd,')) ||
        (
          !(inputs.target_stage || inputs.target_stage_select) &&
          (!failure() && !cancelled()) &&
          needs.check-changes.outputs.multimodal_gen == 'true'
        )
      )
    strategy:
      fail-fast: false
      max-parallel: 1  # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT
      matrix:
        runner: [linux-mi325-1gpu-sglang]
        part: [0, 1]  # 2 partitions: 11 tests ÷ 2 = ~5-6 tests each
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      # NOTE(review): this requests a CUDA wheel artifact, but no job visible in this
      # workflow uploads it. It looks like a leftover from the NVIDIA workflow; confirm
      # whether the step is still needed on AMD.
      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Start CI container
        run: bash scripts/ci/amd/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: |
          bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion

      - name: Setup kernel caches
        run: |
          # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
          # This directory persists across container restarts on the self-hosted runner
          docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub

          # Clear pre-built AITER kernels from Docker image to avoid segfaults
          # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/
          echo "Clearing pre-built AITER kernels from Docker image..."
          docker exec ci_sglang rm -rf /sgl-workspace/aiter/aiter/jit/*.so 2>/dev/null || true
          docker exec ci_sglang rm -rf /sgl-data/aiter-kernels/*.so 2>/dev/null || true
          echo "AITER kernels cleared - will be rebuilt on first use"

          # Create persistent cache marker if /sgl-data is a real mount (not ephemeral)
          # This tells the test cleanup code to NOT delete downloaded models
          if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
            docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
            echo "Created .persistent_cache marker - HF cache will persist"
          else
            echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
          fi

          # Check MIOpen cache (VAE convolution kernels)
          miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0")
          echo "Found ${miopen_files} MIOpen cache files"

      - name: Diagnose HF cache and system resources
        run: |
          echo "=== System Memory Status ==="
          free -h
          echo ""
          echo "=== Disk Space ==="
          df -h /home/runner/sgl-data 2>/dev/null || df -h
          echo ""
          echo "=== HF Cache Directory Structure ==="
          docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found"
          docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found"
          echo ""
          echo "=== Checking for cached diffusion models (1-GPU tests) ==="
          # Models used in 1-GPU tests: Wan2.1-T2V-1.3B, HunyuanVideo, Qwen-Image, FLUX.1, FLUX.2
          for model in "Wan-AI--Wan2.1-T2V-1.3B-Diffusers" "tencent--HunyuanVideo" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev" "black-forest-labs--FLUX.2-dev"; do
            cache_path="/sgl-data/hf-cache/hub/models--${model}"
            if docker exec ci_sglang test -d "$cache_path"; then
              size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1)
              echo "✓ CACHED: $model ($size)"
            else
              echo "✗ NOT CACHED: $model"
            fi
          done
          echo ""
          echo "=== GPU Memory Status ==="
          docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available"

      - name: Run diffusion server tests (1-GPU)
        timeout-minutes: 70
        run: |
          # AMD CI: All 1-GPU tests except FLUX.2 (FLUX.1 covers same code path)
          # Tests: T2V, T2I, I2V, LoRA
          #
          # HF download env vars:
          # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
          # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
          docker exec \
            -e SGLANG_E2E_TOLERANCE=0.3 \
            -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
            -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
            -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
            -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
            -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
            -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
            -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
            -e HF_HUB_ENABLE_HF_TRANSFER=1 \
            -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
            -w /sglang-checkout/python \
            ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
              --suite 1-gpu \
              --partition-id ${{ matrix.part }} \
              --total-partitions 2 \
              -k "not flux_2"

          # Post-test diagnostics
          echo "=== Post-test System Memory Status ==="
          free -h

  multimodal-gen-test-2-gpu-amd:
    needs: [check-changes]
    # Auto-detect branch is guarded by !failure() && !cancelled() so the job is cancellable.
    if: |
      always() &&
      (
        (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-2-gpu-amd,')) ||
        (
          !(inputs.target_stage || inputs.target_stage_select) &&
          (!failure() && !cancelled()) &&
          needs.check-changes.outputs.multimodal_gen == 'true'
        )
      )
    strategy:
      fail-fast: false
      max-parallel: 1  # Run one at a time to avoid eviction from resource exhaustion during AITER kernel JIT
      matrix:
        runner: [linux-mi325-2gpu-sglang]
        part: [0, 1]  # 2 partitions: 9 tests ÷ 2 = ~4-5 tests each
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      # NOTE(review): this requests a CUDA wheel artifact, but no job visible in this
      # workflow uploads it. Confirm whether the step is still needed on AMD.
      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Start CI container
        run: bash scripts/ci/amd/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: |
          bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion

      - name: Setup kernel caches
        run: |
          # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
          docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub

          # Clear pre-built AITER kernels from Docker image to avoid segfaults
          # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/
          echo "Clearing pre-built AITER kernels from Docker image..."
          docker exec ci_sglang rm -rf /sgl-workspace/aiter/aiter/jit/*.so 2>/dev/null || true
          docker exec ci_sglang rm -rf /sgl-data/aiter-kernels/*.so 2>/dev/null || true
          echo "AITER kernels cleared - will be rebuilt on first use"

          # Create persistent cache marker if /sgl-data is a real mount (not ephemeral)
          # This tells the test cleanup code to NOT delete downloaded models
          if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
            docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
            echo "Created .persistent_cache marker - HF cache will persist"
          else
            echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
          fi

          # Check MIOpen cache (VAE convolution kernels)
          miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0")
          echo "Found ${miopen_files} MIOpen cache files"

      - name: Diagnose HF cache and system resources
        run: |
          echo "=== System Memory Status ==="
          free -h
          echo ""
          echo "=== Disk Space ==="
          df -h /home/runner/sgl-data 2>/dev/null || df -h
          echo ""
          echo "=== HF Cache Directory Structure ==="
          docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found"
          docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found"
          echo ""
          echo "=== Checking for cached diffusion models (2-GPU tests) ==="
          # Models used in 2-GPU tests: Wan2.2-T2V-A14B, Wan2.1-T2V-14B, Qwen-Image, FLUX.1
          for model in "Wan-AI--Wan2.2-T2V-A14B-Diffusers" "Wan-AI--Wan2.1-T2V-14B-Diffusers" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev"; do
            cache_path="/sgl-data/hf-cache/hub/models--${model}"
            if docker exec ci_sglang test -d "$cache_path"; then
              size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1)
              echo "✓ CACHED: $model ($size)"
            else
              echo "✗ NOT CACHED: $model"
            fi
          done
          echo ""
          echo "=== GPU Memory Status ==="
          docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available"

      - name: Run diffusion server tests (2-GPU)
        timeout-minutes: 80
        run: |
          # AMD CI: All 2-GPU tests including LoRA
          # Tests: T2V, T2I, I2V, LoRA
          #
          # HF download env vars:
          # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
          # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
          docker exec \
            -e SGLANG_E2E_TOLERANCE=0.3 \
            -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
            -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
            -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
            -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
            -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
            -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
            -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
            -e HF_HUB_ENABLE_HF_TRANSFER=1 \
            -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
            -w /sglang-checkout/python \
            ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
              --suite 2-gpu \
              --partition-id ${{ matrix.part }} \
              --total-partitions 2

          # Post-test diagnostics
          echo "=== Post-test System Memory Status ==="
          free -h

  stage-c-test-large-8-gpu-amd:
    needs: [check-changes, call-gate, stage-b-test-small-1-gpu-amd, stage-b-test-large-2-gpu-amd]
    if: |
      always() &&
      (
        (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd,')) ||
        (
          !(inputs.target_stage || inputs.target_stage_select) &&
          (!failure() && !cancelled()) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    env:
      RUNNER_LABELS: linux-mi325-8gpu-sglang
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi325-8gpu-sglang]
        part: [0, 1, 2]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd/amd_ci_install_dependency.sh

      - name: Test RCCL multi-GPU communication
        timeout-minutes: 5
        run: |
          echo "Testing RCCL multi-GPU communication with debug info..."
          docker exec ci_sglang bash -c "cd /sglang-checkout && NCCL_DEBUG=INFO RCCL_DEBUG=INFO torchrun --nproc_per_node=8 scripts/ci/amd/test_rccl_multi_gpu.py"

      - name: Run test
        timeout-minutes: 60
        run: |
          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

  stage-c-test-large-8-gpu-amd-mi35x:
    needs: [check-changes, call-gate, stage-b-test-small-1-gpu-amd, stage-b-test-large-2-gpu-amd]
    if: |
      always() &&
      (
        (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd-mi35x,')) ||
        (
          !(inputs.target_stage || inputs.target_stage_select) &&
          (!failure() && !cancelled()) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi35x-gpu-8]
        part: [0]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Start CI container
        run: bash scripts/ci/amd/amd_ci_start_container.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd/amd_ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 60
        run: |
          bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 1 --timeout-per-file 3600 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

  # =============================================== Disaggregation ====================================================
  stage-b-test-large-8-gpu-35x-disaggregation-amd:
    needs: [check-changes, stage-a-test-1-amd]
    # The targeted branch accepts both the real job id ("...-8-gpu-35x-disaggregation-amd")
    # and the older alias without "35x", so a stage typed by job name is not silently skipped.
    if: |
      always() &&
      (
        (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-8-gpu-35x-disaggregation-amd,')) ||
        (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-8-gpu-disaggregation-amd,')) ||
        (
          !(inputs.target_stage || inputs.target_stage_select) &&
          (!failure() && !cancelled()) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    strategy:
      fail-fast: false
      matrix:
        runner: [linux-mi35x-gpu-8.fabric]
    runs-on: ${{matrix.runner}}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Ensure VRAM is clear
        run: bash scripts/ensure_vram_clear.sh rocm

      - name: Check Host RDMA Environment
        id: rdma_detect
        run: |
          # Purely diagnostic: never fail the job from this step.
          set +e
          echo "=== Checking Host RDMA Environment ==="

          echo ""
          echo "=== 1. Ionic driver library check ==="
          ls -l /usr/lib/x86_64-linux-gnu/libibverbs/libionic* 2>/dev/null || echo "libionic not found in standard path"

          echo ""
          echo "=== 2. Infiniband devices ==="
          ls -la /dev/infiniband/ 2>/dev/null || echo "/dev/infiniband not found"
          ls -la /sys/class/infiniband/ 2>/dev/null || echo "/sys/class/infiniband not found"

          echo ""
          echo "=== 3. ibv_devinfo ==="
          which ibv_devinfo 2>/dev/null && ibv_devinfo 2>&1 || echo "ibv_devinfo not available"

          echo ""
          echo "=== 4. Kernel modules ==="
          lsmod 2>/dev/null | grep -E "ib_|rdma|ionic" || echo "No RDMA kernel modules loaded"

          echo ""
          echo "=== 5. Detect RDMA Devices for test environment ==="
          if [ -d "/sys/class/infiniband" ]; then
            RDMA_DEVS=$(ls /sys/class/infiniband | paste -sd "," -)
            echo "Detected RDMA Devices: $RDMA_DEVS"
            echo "SGLANG_TEST_RDMA_DEVICE=$RDMA_DEVS" >> $GITHUB_ENV
          else
            echo "No RDMA devices found in /sys/class/infiniband"
            echo "SGLANG_TEST_RDMA_DEVICE=" >> $GITHUB_ENV
          fi

          echo ""
          echo "=== Host RDMA Check Complete ==="

      - name: Start Special Container
        run: bash scripts/ci/amd/amd_ci_start_container_disagg.sh
        env:
          GITHUB_WORKSPACE: ${{ github.workspace }}

      - name: Install dependencies
        run: bash scripts/ci/amd/amd_ci_install_dependency.sh

      - name: Verify RDMA in Container
        run: |
          docker exec -u root ci_sglang bash -c '
            echo "=== Container RDMA Verification ==="
            echo "Device nodes:"
            ls -la /dev/infiniband/
            echo ""
            echo "Provider libraries:"
            ls /usr/lib/x86_64-linux-gnu/libibverbs/ | grep -E "ionic|mlx" || echo "No Ionic/Mellanox providers"
            echo ""
            echo "HCA devices:"
            HCA_COUNT=$(ibv_devinfo -list 2>&1 | grep -oE "^[0-9]+ HCAs? found" | grep -oE "^[0-9]+" || echo "0")
            ibv_devinfo -list
            if [ "$HCA_COUNT" -gt 0 ]; then
              echo ""
              echo "=== SUCCESS: RDMA setup complete. Found $HCA_COUNT HCA(s) ==="
            else
              echo ""
              echo "=== WARNING: No HCAs detected. RDMA tests may fail ==="
            fi
          '

      - name: Run Aiter Op Test (RMSNorm)
        timeout-minutes: 10
        run: |
          echo "Running pre-check: test_rmsnorm2d.py"
          docker exec \
            -e MAX_JOBS=192 \
            ci_sglang \
            python /sgl-workspace/aiter/op_tests/test_rmsnorm2d.py

      - name: Run test_disaggregation
        timeout-minutes: 60
        run: |
          bash scripts/ci/amd/amd_ci_exec.sh \
            -e SGLANG_TEST_RDMA_DEVICE="${{ env.SGLANG_TEST_RDMA_DEVICE }}" \
            -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-8-gpu-35x-disaggregation-amd --timeout-per-file 1800 ${{ inputs.continue_on_error && '--continue-on-error' || '' }}

  # Aggregation job: fails if any needed job failed or was cancelled; skipped jobs pass.
  pr-test-amd-finish:
    needs:
      [
        call-gate,
        check-changes,

        sgl-kernel-unit-test-amd,
        sgl-kernel-unit-test-2-gpu-amd,
        multimodal-gen-test-1-gpu-amd,
        multimodal-gen-test-2-gpu-amd,

        stage-a-test-1-amd,
        jit-kernel-unit-test-amd,
        stage-b-test-small-1-gpu-amd,
        stage-b-test-small-1-gpu-amd-nondeterministic,
        stage-b-test-small-1-gpu-amd-mi35x,
        stage-b-test-large-1-gpu-amd,
        stage-b-test-large-2-gpu-amd,
        stage-b-test-large-8-gpu-35x-disaggregation-amd,
        stage-c-test-large-8-gpu-amd,
        stage-c-test-large-8-gpu-amd-mi35x,
      ]
    if: always()
    runs-on: ubuntu-latest
    steps:
      - name: Check all dependent job statuses
        env:
          # The needs context is passed through the environment rather than spliced into
          # the script text, so quotes or shell metacharacters in job outputs cannot
          # break (or inject into) the script.
          NEEDS_JSON: ${{ toJson(needs) }}
        run: |
          # Get a list of all job names from the JSON keys
          job_names=$(echo "$NEEDS_JSON" | jq -r 'keys_unsorted[]')

          for job in $job_names; do
            # For each job, extract its result
            result=$(echo "$NEEDS_JSON" | jq -r --arg j "$job" '.[$j].result')

            # Print the job name and its result
            echo "$job: $result"

            # Check for failure or cancellation and exit if found
            if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
              echo "The above jobs failed."
              exit 1
            fi
          done

          # If the loop completes, all jobs were successful
          echo "All jobs completed successfully"
          exit 0