# PR Test workflow.
# Triggers: schedule (every 6 hours), pull_request against main, manual workflow_dispatch
# (also used by /rerun-stage), and workflow_call from other workflows.
name: PR Test
# Dynamic run-name for /rerun-stage commands to enable URL lookup
# Format: "[stage-name] sha" for fork PRs, "[stage-name]" for non-fork, default for normal runs
run-name: ${{ inputs.target_stage && (inputs.pr_head_sha && format('[{0}] {1}', inputs.target_stage, inputs.pr_head_sha) || format('[{0}]', inputs.target_stage)) || '' }}

on:
  schedule:
    - cron: '0 */6 * * *'  # Run every 6 hours
  pull_request:
    branches: [main]
  workflow_dispatch:
    inputs:
      version:
        description: "FlashInfer version"
        required: true
        type: choice
        default: "release"
        options:
          - "release"
          - "nightly"
      target_stage:
        description: "Specific stage to run (optional, for quick testing)"
        required: false
        type: string
        default: ""
      force_continue_on_error:
        description: "Force continue-on-error (test scheduled CI behavior)"
        required: false
        type: boolean
        default: false
      pr_head_sha:
        description: "PR head SHA to checkout (for /rerun-stage on fork PRs)"
        required: false
        type: string
        default: ""
      test_parallel_dispatch:
        description: "Test parallel dispatch behavior (simulates scheduled run)"
        required: false
        type: boolean
        default: false
  workflow_call:
    inputs:
      ref:
        description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.'
        required: false
        type: string
        default: ''
      run_all_tests:
        description: "Run all tests (for releasing or testing purpose)"
        required: false
        type: boolean
        default: false

concurrency:
  # Concurrency group structure: pr-test-{event}-{branch}-{pr_sha}-{stage}
  # - event_name prevents scheduled runs from colliding with fork PRs whose branch is named 'main'
  #   (without it, both resolve the branch segment to 'main' and block each other)
  # - github.head_ref (pull_request) or github.ref_name (workflow_dispatch) normalizes to branch name
  # - pr_head_sha isolates /rerun-stage from main branch runs
  # - target_stage allows parallel stage dispatches to run independently
  group: pr-test-${{ github.event_name }}-${{ github.head_ref || github.ref_name || 'default' }}-${{ inputs.pr_head_sha || 'current' }}-${{ inputs.target_stage || inputs.ref || 'all' }}
  # NOTE(review): github.event_name is inherited from the caller in a called workflow (see the
  # "Determine run mode" step), so this comparison looks like it is always true and runs started
  # via workflow_call are cancelled like any other. Confirm the intended behavior before changing.
  cancel-in-progress: ${{ github.event_name != 'workflow_call' }}

env:
  SGLANG_IS_IN_CI: true
  SGLANG_CUDA_COREDUMP: "1"
  SGLANG_JIT_DEEPGEMM_FAST_WARMUP: true

permissions:
  actions: write
  contents: read
  pull-requests: read

jobs:
  # =============================================== check changes ====================================================
  # Decides which components changed and publishes the knobs (parallelism, runner tag, retry,
  # continue-on-error) that the downstream jobs read through needs.check-changes.outputs.*.
  check-changes:
    runs-on: ubuntu-latest
    outputs:
      # Use API-based detection for target_stage mode (filter-api), otherwise use dorny/paths-filter (filter)
      main_package: ${{ steps.filter-api.outputs.main_package || steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }}
      # sgl_kernel is forced to false when target_stage is set, since sgl-kernel-build-wheels won't run
      # This prevents CUSTOM_BUILD_SGL_KERNEL=true when the wheel artifacts aren't available
      # Note: If PR has kernel changes AND target_stage is set, the validate-target-stage step will fail
      sgl_kernel: ${{ !inputs.target_stage && (steps.filter-api.outputs.sgl_kernel || steps.filter.outputs.sgl_kernel) }}
      # Raw sgl_kernel value before target_stage override (used for validation)
      sgl_kernel_raw: ${{ steps.filter-api.outputs.sgl_kernel || steps.filter.outputs.sgl_kernel }}
      jit_kernel: ${{ steps.filter-api.outputs.jit_kernel || steps.filter.outputs.jit_kernel || steps.run-mode.outputs.run_all_tests }}
      multimodal_gen: ${{ steps.filter-api.outputs.multimodal_gen || steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }}
      max_parallel: ${{ steps.set-parallel.outputs.max_parallel }}
      b200_runner: ${{ steps.set-runner.outputs.b200_runner }}
      enable_retry: ${{ steps.set-retry.outputs.enable_retry }}
      continue_on_error: ${{ steps.set-continue-on-error.outputs.continue_on_error }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Determine run mode
        id: run-mode
        run: |
          # Run all tests for scheduled runs and when the run_all_tests input is true
          # Note: github.event_name is inherited from the caller, so a workflow_call run cannot be
          # recognised by its event name; callers opt in through inputs.run_all_tests instead
          if [[ "${{ github.event_name }}" == "schedule" || "${{ inputs.run_all_tests }}" == "true" ]]; then
            echo "run_all_tests=true" >> $GITHUB_OUTPUT
            echo "Run mode: ALL TESTS (schedule=${{ github.event_name == 'schedule' }}, run_all_tests=${{ inputs.run_all_tests }})"
          else
            echo "run_all_tests=false" >> $GITHUB_OUTPUT
            echo "Run mode: FILTERED (triggered by ${{ github.event_name }})"
          fi

      - name: Detect file changes
        id: filter
        uses: dorny/paths-filter@v3
        # Only use paths-filter for pull_request events (where it works correctly)
        # For workflow_dispatch with target_stage, we use GitHub API in the next step
        if: steps.run-mode.outputs.run_all_tests != 'true' && !inputs.target_stage
        with:
          filters: |
            main_package:
              - "python/sglang/!(multimodal_gen)/**"
              - "python/pyproject.toml"
              - "scripts/ci/cuda/*"
              - "scripts/ci/utils/*"
              - "test/**"
              - ".github/workflows/pr-test.yml"
            sgl_kernel:
              - "sgl-kernel/**"
            jit_kernel:
              - "python/sglang/jit_kernel/**"
              - "python/pyproject.toml"
              - ".github/workflows/pr-test.yml"
            multimodal_gen:
              - "python/sglang/multimodal_gen/**"
              - "python/sglang/jit_kernel/**"
              - "python/sglang/cli/**"
              - "python/pyproject.toml"
              - ".github/workflows/pr-test.yml"

      # For /rerun-stage (workflow_dispatch with target_stage), dorny/paths-filter doesn't work
      # correctly because it falls back to "last commit" detection which breaks for merge commits.
      # Instead, we use the GitHub API to compare the PR commit against main.
      # The path patterns below must stay in sync with the paths-filter lists above.
      - name: Detect file changes via API (for target_stage)
        id: filter-api
        if: inputs.target_stage && inputs.pr_head_sha
        env:
          GH_TOKEN: ${{ github.token }}
          # Passed through the environment instead of being interpolated into the script text,
          # so a crafted input value cannot be executed as shell code.
          PR_HEAD_SHA: ${{ inputs.pr_head_sha }}
        run: |
          echo "Detecting file changes via GitHub API for target_stage mode..."
          echo "PR head SHA: ${PR_HEAD_SHA}"

          # Get the list of changed files by comparing PR commit against main
          # This correctly handles merge commits by looking at the actual PR diff
          CHANGED_FILES=$(gh api "repos/${{ github.repository }}/compare/main...${PR_HEAD_SHA}" \
            --jq '[.files[].filename] | .[]' 2>/dev/null || echo "")

          if [ -z "$CHANGED_FILES" ]; then
            echo "Warning: Could not fetch changed files from API, assuming no changes"
            echo "sgl_kernel=false" >> $GITHUB_OUTPUT
            echo "main_package=false" >> $GITHUB_OUTPUT
            echo "jit_kernel=false" >> $GITHUB_OUTPUT
            echo "multimodal_gen=false" >> $GITHUB_OUTPUT
            exit 0
          fi

          echo "Changed files:"
          echo "$CHANGED_FILES" | head -20
          echo "..."

          # Check for sgl-kernel changes
          if echo "$CHANGED_FILES" | grep -q "^sgl-kernel/"; then
            echo "sgl_kernel=true" >> $GITHUB_OUTPUT
            echo "Detected sgl-kernel changes"
          else
            echo "sgl_kernel=false" >> $GITHUB_OUTPUT
          fi

          # Check for main_package changes (excluding multimodal_gen)
          # Note: Need to filter out multimodal_gen before checking, not pipe grep -q output
          MAIN_PKG_FILES=$(echo "$CHANGED_FILES" | grep -E "^(python/sglang/|python/pyproject\.toml|scripts/ci/cuda/|scripts/ci/utils/|test/|\.github/workflows/pr-test\.yml)" | grep -v "^python/sglang/multimodal_gen/" || true)
          if [ -n "$MAIN_PKG_FILES" ]; then
            echo "main_package=true" >> $GITHUB_OUTPUT
            echo "Detected main_package changes"
          else
            echo "main_package=false" >> $GITHUB_OUTPUT
          fi

          # Check for jit_kernel changes
          if echo "$CHANGED_FILES" | grep -qE "^(python/sglang/jit_kernel/|python/pyproject\.toml|\.github/workflows/pr-test\.yml)"; then
            echo "jit_kernel=true" >> $GITHUB_OUTPUT
            echo "Detected jit_kernel changes"
          else
            echo "jit_kernel=false" >> $GITHUB_OUTPUT
          fi

          # Check for multimodal_gen changes
          # python/sglang/jit_kernel/ is included to match the paths-filter multimodal_gen list
          if echo "$CHANGED_FILES" | grep -qE "^(python/sglang/multimodal_gen/|python/sglang/jit_kernel/|python/sglang/cli/|python/pyproject\.toml|\.github/workflows/pr-test\.yml)"; then
            echo "multimodal_gen=true" >> $GITHUB_OUTPUT
            echo "Detected multimodal_gen changes"
          else
            echo "multimodal_gen=false" >> $GITHUB_OUTPUT
          fi

      - name: Set max-parallel based on run type
        id: set-parallel
        env:
          GH_TOKEN: ${{ github.token }}
          # User-influenced values are read from the environment rather than expanded into the script
          TARGET_STAGE: ${{ inputs.target_stage }}
          PR_HEAD_SHA: ${{ inputs.pr_head_sha }}
          REF_NAME: ${{ github.ref_name }}
        run: |
          # Scheduled runs and high-priority PRs get full parallelism
          if [[ "${{ github.event_name }}" == "schedule" ]]; then
            echo "max_parallel=14" >> $GITHUB_OUTPUT
            echo "Scheduled run detected, setting max_parallel to 14"
          elif [[ "${{ github.event_name }}" == "pull_request" && "${{ contains(github.event.pull_request.labels.*.name, 'high priority') }}" == "true" ]]; then
            echo "max_parallel=14" >> $GITHUB_OUTPUT
            echo "High priority PR detected, setting max_parallel to 14"
          elif [[ -n "$TARGET_STAGE" ]]; then
            # /rerun-stage (workflow_dispatch): query PR labels via GitHub API
            # Try SHA lookup first (fork PRs), fallback to branch name (non-fork PRs)
            LABELS=""
            if [[ -n "$PR_HEAD_SHA" ]]; then
              LABELS=$(gh api "repos/${{ github.repository }}/commits/${PR_HEAD_SHA}/pulls" \
                --jq '.[0].labels[].name' 2>/dev/null || true)
            fi
            if [[ -z "$LABELS" ]]; then
              LABELS=$(gh pr list --head "$REF_NAME" --repo "${{ github.repository }}" \
                --json labels --jq '.[0].labels[].name' 2>/dev/null || true)
            fi
            echo "PR labels: ${LABELS:-"(none)"}"
            if echo "$LABELS" | grep -Fxq "high priority"; then
              echo "max_parallel=14" >> $GITHUB_OUTPUT
              echo "High priority PR detected via API (/rerun-stage), setting max_parallel to 14"
            else
              echo "max_parallel=3" >> $GITHUB_OUTPUT
              echo "Using default max_parallel of 3 (/rerun-stage, no high priority label)"
            fi
          else
            echo "max_parallel=3" >> $GITHUB_OUTPUT
            echo "Using default max_parallel of 3"
          fi

      - name: Set B200 runner tag
        id: set-runner
        env:
          TARGET_STAGE: ${{ inputs.target_stage }}
        run: |
          # Use kernel-build runner only when sgl_kernel changes are detected AND we're not in target_stage mode
          # (target_stage skips wheel builds, so we can't use custom kernels)
          # Use API-based detection (filter-api) for target_stage mode, otherwise use dorny/paths-filter (filter)
          sgl_kernel="${{ steps.filter-api.outputs.sgl_kernel || steps.filter.outputs.sgl_kernel || steps.run-mode.outputs.run_all_tests }}"
          target_stage="$TARGET_STAGE"
          if [[ "$sgl_kernel" == "true" && -z "$target_stage" ]]; then
            echo "b200_runner=4-gpu-b200-kernel" >> $GITHUB_OUTPUT
          else
            echo "b200_runner=4-gpu-b200" >> $GITHUB_OUTPUT
          fi

      - name: Enable retry for CI
        id: set-retry
        run: |
          echo "enable_retry=true" >> $GITHUB_OUTPUT
          echo "Retry logic enabled for CI"

      - name: Set continue-on-error for full test runs
        id: set-continue-on-error
        run: |
          if [[ "${{ steps.run-mode.outputs.run_all_tests }}" == "true" || "${{ inputs.force_continue_on_error }}" == "true" ]]; then
            echo "continue_on_error=true" >> $GITHUB_OUTPUT
            echo "Full test run or force flag detected, enabling continue-on-error to run all tests"
          else
            echo "continue_on_error=false" >> $GITHUB_OUTPUT
            echo "Filtered run, continue-on-error disabled"
          fi

      - name: Validate target_stage with kernel changes
        # Use API-based detection (filter-api) for target_stage mode, otherwise use dorny/paths-filter (filter)
        if: inputs.target_stage && (steps.filter-api.outputs.sgl_kernel == 'true' || steps.filter.outputs.sgl_kernel == 'true')
        run: |
          echo "::error::Cannot use /rerun-stage when PR has sgl-kernel changes."
          echo "::error::The sgl-kernel-build-wheels job is skipped in target_stage mode, but this PR modifies sgl-kernel/ files."
          echo "::error::Please use /tag-and-rerun-ci to run the full workflow including kernel builds."
          echo ""
          echo "ERROR: Cannot use /rerun-stage when PR has sgl-kernel changes."
          echo ""
          echo "This PR modifies files in sgl-kernel/, which requires building custom kernel wheels."
          echo "The /rerun-stage command skips the wheel build job, so the test would run against"
          echo "the wrong (PyPI) version of sgl-kernel instead of your changes."
          echo ""
          echo "To properly test your kernel changes, use one of these commands instead:"
          echo " /tag-and-rerun-ci - Re-run the full workflow including kernel builds"
          echo " /rerun-ci - Re-run the full workflow"
          echo ""
          exit 1

      - name: Show filter results in summary (table)
        env:
          # Rendered here so the free-form input never becomes part of the script text
          TARGET_STAGE_DISPLAY: ${{ inputs.target_stage || '(none)' }}
        run: |
          {
            echo "## Change Detection"
            echo ""
            echo "| Component | Changed |"
            echo "|-------------------|---------|"
            echo "| main_package | ${{ steps.filter-api.outputs.main_package || steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }} |"
            echo "| sgl_kernel (raw) | ${{ steps.filter-api.outputs.sgl_kernel || steps.filter.outputs.sgl_kernel }} |"
            echo "| sgl_kernel (used) | ${{ !inputs.target_stage && (steps.filter-api.outputs.sgl_kernel || steps.filter.outputs.sgl_kernel) }} |"
            echo "| jit_kernel | ${{ steps.filter-api.outputs.jit_kernel || steps.filter.outputs.jit_kernel || steps.run-mode.outputs.run_all_tests }} |"
            echo "| multimodal_gen | ${{ steps.filter-api.outputs.multimodal_gen || steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }} |"
            echo "| target_stage | ${TARGET_STAGE_DISPLAY} |"
            echo "| detection_method | ${{ inputs.target_stage && 'GitHub API' || 'dorny/paths-filter' }} |"
            echo "| max_parallel | ${{ steps.set-parallel.outputs.max_parallel }} |"
            echo "| b200_runner | ${{ steps.set-runner.outputs.b200_runner }} |"
            echo "| enable_retry | ${{ steps.set-retry.outputs.enable_retry }} |"
            echo "| continue_on_error | ${{ steps.set-continue-on-error.outputs.continue_on_error }} |"
          } >> $GITHUB_STEP_SUMMARY

  # =============================================== Wait Jobs for Sequential PR Execution ====================================================
  # These jobs poll GitHub API to wait for previous stages to complete.
  # For PR runs: wait jobs run and enforce sequential execution via polling.
  # For scheduled runs: wait jobs are skipped, enabling parallel execution for easier retry.
# The jobs below are entries of the top-level `jobs:` mapping and are indented accordingly.
  # Polls this run's job list until stage-a-test-1 finishes; exposes the outcome as stage_a_result.
  wait-for-stage-a:
    needs: [check-changes, call-gate]
    # Only run for PRs (not scheduled) and when not targeting a specific stage
    # Skip if call-gate failed (stage-a jobs will be skipped, nothing to wait for)
    # !cancelled() ensures this job respects workflow cancellation from concurrency group
    if: |
      always() &&
      !cancelled() &&
      github.event_name == 'pull_request' &&
      !inputs.target_stage &&
      inputs.test_parallel_dispatch != true &&
      (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true') &&
      (needs.call-gate.result == 'success' || needs.call-gate.result == 'skipped')
    runs-on: ubuntu-latest
    outputs:
      stage_a_result: ${{ steps.wait.outputs.result }}
    steps:
      - name: Wait for stage-a-test-1 to complete
        id: wait
        uses: actions/github-script@v7
        with:
          script: |
            const maxWaitMinutes = 240;
            const pollIntervalSeconds = 120;  // 2 minutes to reduce GH API calls
            const maxAttempts = (maxWaitMinutes * 60) / pollIntervalSeconds;

            for (let attempt = 0; attempt < maxAttempts; attempt++) {
              const jobs = await github.paginate(github.rest.actions.listJobsForWorkflowRun, {
                owner: context.repo.owner,
                repo: context.repo.repo,
                run_id: context.runId,
                per_page: 100,
              });

              const stageAJob = jobs.find(job => job.name === 'stage-a-test-1');

              if (stageAJob) {
                console.log(`stage-a-test-1 status: ${stageAJob.status}, conclusion: ${stageAJob.conclusion}`);
                if (stageAJob.status === 'completed') {
                  if (stageAJob.conclusion === 'success' || stageAJob.conclusion === 'skipped') {
                    core.setOutput('result', stageAJob.conclusion === 'success' ? 'success' : 'skipped');
                    return;
                  } else {
                    core.setOutput('result', 'failure');
                    core.setFailed(`stage-a-test-1 ${stageAJob.conclusion}`);
                    return;
                  }
                }
              } else {
                console.log('stage-a-test-1 job not found yet');
              }

              console.log(`Waiting ${pollIntervalSeconds}s... (attempt ${attempt + 1}/${maxAttempts})`);
              await new Promise(resolve => setTimeout(resolve, pollIntervalSeconds * 1000));
            }

            core.setFailed('Timeout waiting for stage-a-test-1');
            core.setOutput('result', 'timeout');

  # Polls this run's job list until every stage-b job finishes; exposes the outcome as stage_b_result.
  wait-for-stage-b:
    needs: [check-changes, call-gate, wait-for-stage-a]
    # Only run for PRs (not scheduled) and when not targeting a specific stage
    # Skip if call-gate failed (stage-b jobs will be skipped, nothing to wait for)
    if: |
      always() &&
      !cancelled() &&
      github.event_name == 'pull_request' &&
      !inputs.target_stage &&
      inputs.test_parallel_dispatch != true &&
      (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true') &&
      (needs.wait-for-stage-a.result == 'success' || needs.wait-for-stage-a.result == 'skipped') &&
      (needs.call-gate.result == 'success' || needs.call-gate.result == 'skipped')
    runs-on: ubuntu-latest
    outputs:
      stage_b_result: ${{ steps.wait.outputs.result }}
    steps:
      - name: Wait for stage-b jobs to complete
        id: wait
        uses: actions/github-script@v7
        with:
          script: |
            const maxWaitMinutes = 480;
            const pollIntervalSeconds = 120;  // 2 minutes to reduce GH API calls
            const maxAttempts = (maxWaitMinutes * 60) / pollIntervalSeconds;

            // Stage-b jobs to wait for
            const stageBJobs = [
              { prefix: 'stage-b-test-small-1-gpu', expectedCount: 8 },   // partitions 0-7
              { prefix: 'stage-b-test-large-1-gpu', expectedCount: 14 },  // partitions 0-13
              { prefix: 'stage-b-test-large-2-gpu', expectedCount: 4 },   // partitions 0-3
              { prefix: 'stage-b-test-4-gpu-b200', expectedCount: 1 },
            ];
            const totalExpectedJobs = stageBJobs.reduce((sum, j) => sum + j.expectedCount, 0);  // 27 total

            // Helper to match job names exactly (prefix alone or prefix + " (N)" for matrix jobs)
            const matchesPrefix = (jobName, prefix) => {
              return jobName === prefix || jobName.startsWith(prefix + ' (');
            };

            // A job-level `if` is evaluated before the matrix is applied, so a matrix job that is
            // skipped shows up as ONE job named exactly like the prefix. Without this check the
            // loop would keep waiting for partitions that will never be created until the timeout.
            const isSkippedUnexpanded = (job, prefix) => {
              return job.name === prefix && job.status === 'completed' && job.conclusion === 'skipped';
            };

            for (let attempt = 0; attempt < maxAttempts; attempt++) {
              const jobs = await github.paginate(github.rest.actions.listJobsForWorkflowRun, {
                owner: context.repo.owner,
                repo: context.repo.repo,
                run_id: context.runId,
                per_page: 100,
              });

              let allCompleted = true;
              let anyFailed = false;
              let failedJobs = [];
              let completedCount = 0;
              let totalCount = 0;
              // Jobs accounted for: real jobs found, or the full expected count for a skipped matrix job
              let accountedCount = 0;

              for (const { prefix, expectedCount } of stageBJobs) {
                const matchingJobs = jobs.filter(job => matchesPrefix(job.name, prefix));

                // Check existing jobs for failures first (fail fast)
                for (const job of matchingJobs) {
                  totalCount++;
                  console.log(`${job.name}: status=${job.status}, conclusion=${job.conclusion}`);
                  if (job.status !== 'completed') {
                    allCompleted = false;
                  } else {
                    completedCount++;
                    if (job.conclusion !== 'success' && job.conclusion !== 'skipped') {
                      anyFailed = true;
                      failedJobs.push(job.name);
                    }
                  }
                }

                const wholeJobSkipped = matchingJobs.some(job => isSkippedUnexpanded(job, prefix));
                const accounted = wholeJobSkipped ? expectedCount : matchingJobs.length;
                accountedCount += accounted;

                if (wholeJobSkipped) {
                  console.log(`${prefix}: skipped before matrix expansion (nothing to wait for)`);
                } else if (accounted < expectedCount) {
                  console.log(`${prefix}: found ${matchingJobs.length}/${expectedCount} jobs (waiting for more)`);
                  allCompleted = false;
                }
              }

              console.log(`Progress: ${completedCount}/${totalCount} jobs completed (expected ${totalExpectedJobs})`);

              // Fail fast if any jobs failed (don't wait for all jobs to be created)
              if (anyFailed) {
                core.setOutput('result', 'failure');
                core.setFailed(`Stage-b jobs failed: ${failedJobs.join(', ')}`);
                return;
              }

              if (allCompleted && accountedCount >= totalExpectedJobs) {
                core.setOutput('result', 'success');
                return;
              }

              console.log(`Waiting ${pollIntervalSeconds}s... (attempt ${attempt + 1}/${maxAttempts})`);
              await new Promise(resolve => setTimeout(resolve, pollIntervalSeconds * 1000));
            }

            core.setFailed('Timeout waiting for stage-b jobs');
            core.setOutput('result', 'timeout');

  # =============================================== PR Gate ====================================================
  call-gate:
    needs: check-changes
    # Skip for scheduled runs (they run all tests) and when target_stage is specified
    if: |
      github.event_name != 'schedule' &&
      inputs.test_parallel_dispatch != true &&
      !inputs.target_stage &&
      (
        needs.check-changes.outputs.main_package == 'true' ||
        needs.check-changes.outputs.sgl_kernel == 'true' ||
        needs.check-changes.outputs.jit_kernel == 'true' ||
        needs.check-changes.outputs.multimodal_gen == 'true'
      )
    uses: ./.github/workflows/pr-gate.yml
    secrets: inherit

  # =============================================== sgl-kernel ====================================================
  # Builds the x64 sgl-kernel wheel and uploads it as artifact wheel-python<ver>-cuda<ver>.
  sgl-kernel-build-wheels:
    needs: [check-changes, call-gate]
    # Skip for scheduled runs (they run stages independently) and when target_stage is set
    if: github.event_name != 'schedule' && inputs.test_parallel_dispatch != true && !inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true'
    runs-on: x64-kernel-build-node
    timeout-minutes: 240
    strategy:
      matrix:
        include:
          - python-version: "3.10"
            cuda-version: "12.9"
          # Add back when CUDA 13.0 is supported on CI
          # - python-version: "3.10"
          #   cuda-version: "13.0"
    name: Build Wheel
    steps:
      - name: Cleanup
        run: |
          # Quoted so a workspace path containing spaces or glob characters is not word-split
          sudo rm -rf "$GITHUB_WORKSPACE"/* || true

      - uses: actions/checkout@v4
        with:
          submodules: "recursive"
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
        run: |
          cd sgl-kernel
          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
        env:
          USE_CCACHE: 1

      - name: Verify wheel artifacts
        run: |
          ls -alh sgl-kernel/dist
          ls -alh sgl-kernel/dist/*.whl

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
          path: sgl-kernel/dist/*
          if-no-files-found: error

  # Builds the aarch64 sgl-kernel wheel; artifact name carries an -aarch64 suffix.
  sgl-kernel-build-wheels-arm:
    needs: [check-changes, call-gate]
    # Skip for scheduled runs (they run stages independently) and when target_stage is set
    if: github.event_name != 'schedule' && inputs.test_parallel_dispatch != true && !inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true'
    runs-on: arm-kernel-build-node
    timeout-minutes: 240
    strategy:
      matrix:
        include:
          - python-version: "3.10"
            cuda-version: "12.9"
    name: Build Wheel Arm
    steps:
      - name: Cleanup
        run: |
          if [ -d "$GITHUB_WORKSPACE" ]; then
            sudo rm -rf "$GITHUB_WORKSPACE"/* || true
          else
            echo "$GITHUB_WORKSPACE does not exist, nothing to clean"
          fi

      - uses: actions/checkout@v4
        with:
          submodules: "recursive"
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
        run: |
          cd sgl-kernel
          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
        env:
          USE_CCACHE: 1

      - name: Verify wheel artifacts
        run: |
          ls -alh sgl-kernel/dist
          ls -alh sgl-kernel/dist/*.whl

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}-aarch64
          path: sgl-kernel/dist/*
          if-no-files-found: error

  # Runs the sgl-kernel pytest suite against the freshly built x64 wheel.
  sgl-kernel-unit-test:
    needs: [check-changes, call-gate, sgl-kernel-build-wheels]
    # Skip for scheduled runs and when target_stage is set
    if: |
      github.event_name != 'schedule' &&
      inputs.test_parallel_dispatch != true &&
      !inputs.target_stage &&
      needs.check-changes.outputs.sgl_kernel == 'true'
    runs-on: 1-gpu-runner
    timeout-minutes: 240
    env:
      RUNNER_LABELS: 1-gpu-runner
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Cleanup
        run: |
          ls -alh sgl-kernel/dist || true
          rm -rf sgl-kernel/dist/* || true

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh diffusion

      - name: Run test
        timeout-minutes: 30
        run: |
          cd sgl-kernel
          pytest tests/

  # Runs test/registered/mla/test_mla_deepseek_v3.py against the freshly built x64 wheel.
  sgl-kernel-mla-test:
    needs: [check-changes, call-gate, sgl-kernel-build-wheels]
    # Skip for scheduled runs and when target_stage is set
    if: |
      github.event_name != 'schedule' &&
      inputs.test_parallel_dispatch != true &&
      !inputs.target_stage &&
      needs.check-changes.outputs.sgl_kernel == 'true'
    runs-on: 1-gpu-runner
    timeout-minutes: 240
    env:
      RUNNER_LABELS: 1-gpu-runner
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Cleanup
        run: |
          ls -alh sgl-kernel/dist || true
          rm -rf sgl-kernel/dist/* || true

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 30
        run: |
          cd test/registered/mla
          python3 test_mla_deepseek_v3.py

  # Smoke-runs every sgl-kernel/benchmark/bench_*.py script; individual failures only warn.
  sgl-kernel-benchmark-test:
    needs: [check-changes, call-gate, sgl-kernel-build-wheels]
    # Skip for scheduled runs and when target_stage is set
    if: |
      github.event_name != 'schedule' &&
      inputs.test_parallel_dispatch != true &&
      !inputs.target_stage &&
      needs.check-changes.outputs.sgl_kernel == 'true'
    runs-on: 1-gpu-runner
    timeout-minutes: 240
    env:
      CI: true
      RUNNER_LABELS: 1-gpu-runner
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Cleanup
        run: |
          ls -alh sgl-kernel/dist || true
          rm -rf sgl-kernel/dist/* || true

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh

      - name: Run benchmark tests
        timeout-minutes: 45
        run: |
          cd sgl-kernel/benchmark
          echo "Running sgl-kernel benchmark tests in CI mode..."

          echo "CI environment variable: $CI"
          echo "GITHUB_ACTIONS environment variable: $GITHUB_ACTIONS"

          for bench_file in bench_*.py; do
            echo "Testing $bench_file..."
            # Each script gets 60 seconds; a timeout or failure is reported but does not fail the step
            timeout 60 python3 "$bench_file" || echo "Warning: $bench_file timed out or failed, continuing..."
            echo "Completed $bench_file"
            echo "---"
          done

          echo "All benchmark tests completed!"

  # Runs the sgl-kernel pytest suite on the B200 runner selected by check-changes.
  sgl-kernel-b200-test:
    needs: [check-changes, sgl-kernel-build-wheels]
    # Skip for scheduled runs and when target_stage is set
    if: |
      github.event_name != 'schedule' &&
      inputs.test_parallel_dispatch != true &&
      !inputs.target_stage &&
      needs.check-changes.outputs.sgl_kernel == 'true'
    runs-on: ${{ needs.check-changes.outputs.b200_runner }}
    timeout-minutes: 240
    env:
      RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }}
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Cleanup
        run: |
          ls -alh sgl-kernel/dist || true
          rm -rf sgl-kernel/dist/* || true

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/cuda/ci_install_dependency.sh diffusion

      - name: Run sgl-kernel unit tests on B200
        timeout-minutes: 30
        run: |
          cd sgl-kernel
          pytest tests/

  # Adding a single CUDA13 smoke test to verify that the kernel builds and runs
  # TODO: Add back this test when it can pass on CI
  # cuda13-kernel-smoke-test:
  #   needs: [check-changes, sgl-kernel-build-wheels]
  #   if: needs.check-changes.outputs.sgl_kernel == 'true'
  #   runs-on: x64-cu13-kernel-tests
  #   steps:
  #     - uses: actions/checkout@v4
  #     - name: Cleanup
  #       run: |
  #         ls -alh sgl-kernel/dist || true
  #         rm -rf sgl-kernel/dist/* || true
  #     - name: Download CUDA 13.0 artifacts
  #       uses: actions/download-artifact@v4
  #       with:
  #         path: sgl-kernel/dist/
  #         merge-multiple: true
  #         pattern: wheel-python3.10-cuda13.0
  #     - name: Install dependencies
  #       run: |
  #         CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh
  #     - name: Run kernel unit tests
  #       timeout-minutes: 30
  #       run: |
  #         cd sgl-kernel
  #         pytest tests/

  # =============================================== jit-kernel ====================================================
  jit-kernel-unit-test:
    needs: [check-changes, call-gate]
    # Skip for scheduled runs and when target_stage is set
    if: |
      github.event_name != 'schedule' &&
      inputs.test_parallel_dispatch != true &&
      !inputs.target_stage &&
      needs.check-changes.outputs.jit_kernel == 'true'
    runs-on: 1-gpu-runner
    timeout-minutes: 240
    env:
      RUNNER_LABELS: 1-gpu-runner
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          bash scripts/ci/cuda/ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 30
        run: |
          cd python/sglang/jit_kernel
          pytest tests/

  # =============================================== primary ====================================================
  # Runs either when explicitly targeted via target_stage, or as part of a normal run
  # (scheduled / parallel-dispatch runs ignore upstream failures; PR runs require none).
  stage-a-test-1:
    needs: [check-changes, call-gate, sgl-kernel-build-wheels]
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-a-test-1') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    runs-on: 1-gpu-runner
    timeout-minutes: 240
    env:
      RUNNER_LABELS: 1-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 10
        run: |
          cd test/
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 run_suite.py --hw cuda --suite stage-a-test-1 $CONTINUE_ON_ERROR_FLAG
          # temporarily put backend-independent cpu tests here
          python3 run_suite.py --hw cpu --suite default $CONTINUE_ON_ERROR_FLAG

      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()

  stage-a-cpu-only:
    needs: [check-changes, call-gate]
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-a-cpu-only') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          (needs.check-changes.outputs.main_package == 'true')
        )
      )
    runs-on: ubuntu-latest
    timeout-minutes: 240
    steps:
      - name: Free disk space
        run: |
          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
          df -h

      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          pip install -e "python/[dev]"

      - name: Run test
        timeout-minutes: 10
        run: |
          cd test/
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 run_suite.py --hw cpu --suite stage-a-cpu-only $CONTINUE_ON_ERROR_FLAG

  # Runs on 5090 (32GB, SM120)
  stage-b-test-small-1-gpu:
    needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels]
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-b-test-small-1-gpu') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    runs-on: 1-gpu-5090
    timeout-minutes: 240
    env:
      RUNNER_LABELS: 1-gpu-5090
      IS_BLACKWELL: "1"
    strategy:
      fail-fast: false
      max-parallel: 8
      matrix:
        # Must stay in sync with --auto-partition-size below and expectedCount in wait-for-stage-b
        partition: [0, 1, 2, 3, 4, 5, 6, 7]
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          source /etc/profile.d/sglang-ci.sh
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh
          git clone https://github.com/merrymercy/human-eval.git
          cd human-eval
          pip install -e . --no-build-isolation

      - name: Run test
        timeout-minutes: 30
        run: |
          source /etc/profile.d/sglang-ci.sh
          cd test/
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 run_suite.py --hw cuda --suite stage-b-test-small-1-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 8 $CONTINUE_ON_ERROR_FLAG

      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()
        with:
          artifact-suffix: ${{ matrix.partition }}

  # Runs on H100 (80GB, SM90) - tests that don't pass on 5090 (FA3, FP8, high VRAM, etc.)
  stage-b-test-large-1-gpu:
    needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels]
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-b-test-large-1-gpu') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    runs-on: 1-gpu-runner
    timeout-minutes: 240
    env:
      RUNNER_LABELS: 1-gpu-runner
    strategy:
      fail-fast: false
      # max_parallel is emitted as a string by check-changes; fromJson turns it into a number
      max-parallel: ${{ fromJson(needs.check-changes.outputs.max_parallel) }}
      matrix:
        # Must stay in sync with --auto-partition-size below and expectedCount in wait-for-stage-b
        partition: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 30
        run: |
          cd test/
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 run_suite.py --hw cuda --suite stage-b-test-large-1-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 14 --timeout-per-file 1800 $CONTINUE_ON_ERROR_FLAG

      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()
        with:
          artifact-suffix: ${{ matrix.partition }}

  stage-b-test-large-2-gpu:
    needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels]
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-b-test-large-2-gpu') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    runs-on: 2-gpu-runner
    timeout-minutes: 240
    env:
      RUNNER_LABELS: 2-gpu-runner
    strategy:
      fail-fast: false
      matrix:
        partition: [0, 1, 2, 3]
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh
          git clone https://github.com/merrymercy/human-eval.git
          cd human-eval
          pip install -e .
--no-build-isolation - name: Run test timeout-minutes: 30 run: | cd test/ CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 4 $CONTINUE_ON_ERROR_FLAG - uses: ./.github/actions/upload-cuda-coredumps if: always() with: artifact-suffix: ${{ matrix.partition }} stage-b-test-4-gpu-b200: needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels] if: | always() && ( (inputs.target_stage == 'stage-b-test-4-gpu-b200') || ( !inputs.target_stage && ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) runs-on: ${{ needs.check-changes.outputs.b200_runner }} timeout-minutes: 240 env: RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }} strategy: fail-fast: false steps: - name: Checkout code uses: actions/checkout@v4 with: ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - name: Download artifacts if: needs.check-changes.outputs.sgl_kernel == 'true' uses: actions/download-artifact@v6 with: path: sgl-kernel/dist/ merge-multiple: true pattern: wheel-python3.10-cuda12.9 - name: Install dependencies timeout-minutes: 20 run: | CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/cuda/ci_install_dependency.sh - name: Run test timeout-minutes: 30 run: | cd test CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-b-test-4-gpu-b200 $CONTINUE_ON_ERROR_FLAG - name: Run FA4 jit_kernel tests (SM100+) timeout-minutes: 10 run: | IS_BLACKWELL=1 python3 -m 
pytest -q python/sglang/jit_kernel/tests/test_flash_attention_4.py - uses: ./.github/actions/upload-cuda-coredumps if: always() multimodal-gen-test-1-gpu: needs: [check-changes, call-gate, sgl-kernel-build-wheels] if: | always() && ( (inputs.target_stage == 'multimodal-gen-test-1-gpu') || ( !inputs.target_stage && ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && needs.check-changes.outputs.multimodal_gen == 'true' ) ) runs-on: 1-gpu-runner timeout-minutes: 240 strategy: fail-fast: false matrix: part: [0, 1] steps: - name: Checkout code uses: actions/checkout@v4 with: ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - name: Download artifacts if: needs.check-changes.outputs.sgl_kernel == 'true' uses: actions/download-artifact@v4 with: path: sgl-kernel/dist/ merge-multiple: true pattern: wheel-python3.10-cuda12.9 - name: Install dependencies timeout-minutes: 20 run: | CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh diffusion - name: Run diffusion server tests timeout-minutes: 240 env: RUNAI_STREAMER_MEMORY_LIMIT: 0 run: | cd python CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi python3 sglang/multimodal_gen/test/run_suite.py \ --suite 1-gpu \ --partition-id ${{ matrix.part }} \ --total-partitions 2 \ $CONTINUE_ON_ERROR_FLAG - uses: ./.github/actions/upload-cuda-coredumps if: always() with: artifact-suffix: ${{ matrix.part }} multimodal-gen-test-2-gpu: needs: [check-changes, call-gate, sgl-kernel-build-wheels] if: | always() && ( (inputs.target_stage == 'multimodal-gen-test-2-gpu') || ( !inputs.target_stage && ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && needs.check-changes.outputs.multimodal_gen == 'true' ) ) runs-on: 2-gpu-runner timeout-minutes: 240 
strategy: fail-fast: false matrix: part: [0, 1] steps: - name: Checkout code uses: actions/checkout@v4 with: ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - name: Download artifacts if: needs.check-changes.outputs.sgl_kernel == 'true' uses: actions/download-artifact@v4 with: path: sgl-kernel/dist/ merge-multiple: true pattern: wheel-python3.10-cuda12.9 - name: Install dependencies timeout-minutes: 20 run: | CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh diffusion - name: Run diffusion server tests timeout-minutes: 240 env: RUNAI_STREAMER_MEMORY_LIMIT: 0 run: | cd python CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi python3 sglang/multimodal_gen/test/run_suite.py \ --suite 2-gpu \ --partition-id ${{ matrix.part }} \ --total-partitions 2 \ $CONTINUE_ON_ERROR_FLAG - uses: ./.github/actions/upload-cuda-coredumps if: always() with: artifact-suffix: ${{ matrix.part }} stage-c-test-4-gpu-h100: needs: [check-changes, call-gate, wait-for-stage-b] if: | always() && ( (inputs.target_stage == 'stage-c-test-4-gpu-h100') || ( !inputs.target_stage && ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) runs-on: 4-gpu-h100 timeout-minutes: 240 env: RUNNER_LABELS: 4-gpu-h100 strategy: fail-fast: false matrix: part: [0, 1, 2] steps: - name: Checkout code uses: actions/checkout@v4 with: ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - name: Download artifacts if: needs.check-changes.outputs.sgl_kernel == 'true' uses: actions/download-artifact@v4 with: path: sgl-kernel/dist/ merge-multiple: true pattern: wheel-python3.10-cuda12.9 - name: Install dependencies timeout-minutes: 20 run: | 
CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh - name: Run test timeout-minutes: 20 run: | cd test CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi python3 run_suite.py --hw cuda --suite stage-c-test-4-gpu-h100 --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 $CONTINUE_ON_ERROR_FLAG - uses: ./.github/actions/upload-cuda-coredumps if: always() with: artifact-suffix: ${{ matrix.part }} stage-c-test-8-gpu-h200: needs: [check-changes, call-gate, wait-for-stage-b] if: | always() && ( (inputs.target_stage == 'stage-c-test-8-gpu-h200') || ( !inputs.target_stage && ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) runs-on: 8-gpu-h200 timeout-minutes: 240 env: RUNNER_LABELS: 8-gpu-h200 strategy: fail-fast: false matrix: part: [0, 1, 2, 3] steps: - name: Checkout code uses: actions/checkout@v4 with: ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - name: Download artifacts if: needs.check-changes.outputs.sgl_kernel == 'true' uses: actions/download-artifact@v4 with: path: sgl-kernel/dist/ merge-multiple: true pattern: wheel-python3.10-cuda12.9 - name: Install dependencies timeout-minutes: 20 run: | CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh - name: Warmup DeepGEMM JIT Compilation timeout-minutes: 25 run: | python3 scripts/ci/cuda/warmup_deep_gemm.py \ deepseek-ai/DeepSeek-V3-0324:8 \ deepseek-ai/DeepSeek-V3.2-Exp:8 - name: Warmup Server CUDA Graphs timeout-minutes: 25 run: | python3 scripts/ci/cuda/warmup_server.py \ deepseek-ai/DeepSeek-V3-0324:8 \ inclusionAI/Ring-2.5-1T:8 - name: Run test timeout-minutes: 30 run: | cd test 
CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi python3 run_suite.py --hw cuda --suite stage-c-test-8-gpu-h200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 4 $CONTINUE_ON_ERROR_FLAG - uses: ./.github/actions/upload-cuda-coredumps if: always() with: artifact-suffix: ${{ matrix.part }} stage-c-test-8-gpu-h20: needs: [check-changes, call-gate, wait-for-stage-b] if: | always() && ( (inputs.target_stage == 'stage-c-test-8-gpu-h20') || ( !inputs.target_stage && ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) runs-on: 8-gpu-h20 timeout-minutes: 240 env: SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4" RUNNER_LABELS: 8-gpu-h20 strategy: fail-fast: false matrix: part: [0, 1] steps: - name: Checkout code uses: actions/checkout@v4 with: ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - name: Download artifacts if: needs.check-changes.outputs.sgl_kernel == 'true' uses: actions/download-artifact@v4 with: path: sgl-kernel/dist/ merge-multiple: true pattern: wheel-python3.10-cuda12.9 - name: Install dependencies timeout-minutes: 20 run: | CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_deepep.sh - name: Run test timeout-minutes: 20 run: | cd test CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi python3 run_suite.py --hw cuda --suite stage-c-test-8-gpu-h20 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 $CONTINUE_ON_ERROR_FLAG - uses: ./.github/actions/upload-cuda-coredumps if: always() with: artifact-suffix: ${{ matrix.part }} stage-c-test-deepep-4-gpu: needs: [check-changes, call-gate, 
wait-for-stage-b] if: | always() && ( (inputs.target_stage == 'stage-c-test-deepep-4-gpu') || ( !inputs.target_stage && ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) runs-on: 4-gpu-h100 timeout-minutes: 240 env: RUNNER_LABELS: 4-gpu-h100 steps: - name: Checkout code uses: actions/checkout@v4 with: ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - name: Download artifacts if: needs.check-changes.outputs.sgl_kernel == 'true' uses: actions/download-artifact@v4 with: path: sgl-kernel/dist/ merge-multiple: true pattern: wheel-python3.10-cuda12.9 - name: Install dependencies timeout-minutes: 20 run: | CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_deepep.sh - name: Warmup DeepGEMM JIT Compilation timeout-minutes: 25 run: | python3 scripts/ci/cuda/warmup_deep_gemm.py \ lmsys/sglang-ci-dsv3-test:4 - name: Warmup Server CUDA Graphs timeout-minutes: 25 run: | python3 scripts/ci/cuda/warmup_server.py \ lmsys/sglang-ci-dsv3-test:4 - name: Run test timeout-minutes: 20 run: | cd test CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi python3 run_suite.py --hw cuda --suite stage-c-test-deepep-4-gpu $CONTINUE_ON_ERROR_FLAG - uses: ./.github/actions/upload-cuda-coredumps if: always() stage-c-test-deepep-8-gpu-h200: needs: [check-changes, call-gate, wait-for-stage-b] if: | always() && ( (inputs.target_stage == 'stage-c-test-deepep-8-gpu-h200') || ( !inputs.target_stage && ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) runs-on: 8-gpu-h200 timeout-minutes: 240 env: 
RUNNER_LABELS: 8-gpu-h200 steps: - name: Checkout code uses: actions/checkout@v4 with: ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - name: Download artifacts if: needs.check-changes.outputs.sgl_kernel == 'true' uses: actions/download-artifact@v4 with: path: sgl-kernel/dist/ merge-multiple: true pattern: wheel-python3.10-cuda12.9 - name: Install dependencies timeout-minutes: 20 run: | CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_deepep.sh - name: Warmup DeepGEMM JIT Compilation timeout-minutes: 25 run: | python3 scripts/ci/cuda/warmup_deep_gemm.py \ deepseek-ai/DeepSeek-V3-0324:8 \ deepseek-ai/DeepSeek-V3.2-Exp:8 - name: Warmup Server CUDA Graphs timeout-minutes: 25 run: | python3 scripts/ci/cuda/warmup_server.py \ deepseek-ai/DeepSeek-V3-0324:8 - name: Run test timeout-minutes: 45 run: | cd test CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi python3 run_suite.py --hw cuda --suite stage-c-test-deepep-8-gpu-h200 $CONTINUE_ON_ERROR_FLAG - uses: ./.github/actions/upload-cuda-coredumps if: always() stage-c-test-4-gpu-b200: needs: [check-changes, call-gate, wait-for-stage-b] if: | always() && ( (inputs.target_stage == 'stage-c-test-4-gpu-b200') || ( !inputs.target_stage && ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) ) ) runs-on: ${{ needs.check-changes.outputs.b200_runner }} timeout-minutes: 240 env: RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }} strategy: fail-fast: false matrix: part: [0, 1, 2] steps: - name: Checkout code uses: actions/checkout@v4 with: ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} - name: Download artifacts if: needs.check-changes.outputs.sgl_kernel == 'true' uses: 
actions/download-artifact@v6 with: path: sgl-kernel/dist/ merge-multiple: true pattern: wheel-python3.10-cuda12.9 - name: Install dependencies timeout-minutes: 20 run: | CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/cuda/ci_install_dependency.sh - name: Run test timeout-minutes: 30 run: | cd test CONTINUE_ON_ERROR_FLAG="" if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then CONTINUE_ON_ERROR_FLAG="--continue-on-error" fi IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-c-test-4-gpu-b200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 1800 $CONTINUE_ON_ERROR_FLAG - uses: ./.github/actions/upload-cuda-coredumps if: always() with: artifact-suffix: ${{ matrix.part }} # NOTE: GB200 stage temporarily disabled — no company-owned GB200 runner available yet. # Re-enable when a 4-gpu-gb200 runner is provisioned. # stage-c-test-4-gpu-gb200: # needs: [check-changes, call-gate, wait-for-stage-b, sgl-kernel-build-wheels-arm] # if: | # always() && # ( # (inputs.target_stage == 'stage-c-test-4-gpu-gb200') || # ( # !inputs.target_stage && # ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) && # ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true')) # ) # ) # runs-on: 4-gpu-gb200 # timeout-minutes: 240 # env: # RUNNER_LABELS: 4-gpu-gb200 # strategy: # fail-fast: false # steps: # - name: Checkout code # uses: actions/checkout@v4 # with: # ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }} # # - name: Download artifacts # if: needs.check-changes.outputs.sgl_kernel == 'true' # uses: actions/download-artifact@v4 # with: # path: sgl-kernel/dist/ # merge-multiple: true # pattern: wheel-python3.10-cuda12.9-aarch64 # # - name: Install dependencies # timeout-minutes: 20 # run: | # CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} 
IS_BLACKWELL=1 GRACE_BLACKWELL=1 bash scripts/ci/cuda/ci_install_deepep.sh # # - name: Run test # timeout-minutes: 45 # run: | # cd test # CONTINUE_ON_ERROR_FLAG="" # if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then # CONTINUE_ON_ERROR_FLAG="--continue-on-error" # fi # python3 run_suite.py --hw cuda --suite stage-c-test-4-gpu-gb200 --timeout-per-file 3600 $CONTINUE_ON_ERROR_FLAG # # - uses: ./.github/actions/upload-cuda-coredumps # if: always() pr-test-finish: needs: [ call-gate, check-changes, sgl-kernel-build-wheels, sgl-kernel-unit-test, sgl-kernel-mla-test, sgl-kernel-benchmark-test, sgl-kernel-b200-test, wait-for-stage-a, wait-for-stage-b, jit-kernel-unit-test, multimodal-gen-test-1-gpu, multimodal-gen-test-2-gpu, stage-a-test-1, stage-a-cpu-only, stage-b-test-small-1-gpu, stage-b-test-large-1-gpu, stage-b-test-large-2-gpu, stage-b-test-4-gpu-b200, stage-c-test-4-gpu-h100, stage-c-test-8-gpu-h20, stage-c-test-8-gpu-h200, stage-c-test-deepep-4-gpu, stage-c-test-deepep-8-gpu-h200, stage-c-test-4-gpu-b200, # stage-c-test-4-gpu-gb200, # Temporarily disabled — no GB200 runner ] if: always() runs-on: ubuntu-latest steps: - name: Check all dependent job statuses run: | # Convert the 'needs' context to a JSON string json_needs='${{ toJson(needs) }}' # Get a list of all job names from the JSON keys job_names=$(echo "$json_needs" | jq -r 'keys_unsorted[]') for job in $job_names; do # For each job, extract its result result=$(echo "$json_needs" | jq -r --arg j "$job" '.[$j].result') # Print the job name and its result echo "$job: $result" # Check for failure or cancellation and exit if found if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then echo "The above jobs failed." exit 1 fi done # If the loop completes, all jobs were successful echo "All jobs completed successfully" exit 0