name: PR Test
# Dynamic run-name for /rerun-stage commands to enable URL lookup.
# - target_stage and pr_head_sha both set: "[stage-name] sha" (fork PRs)
# - only target_stage set:                 "[stage-name]"     (non-fork PRs)
# - target_stage empty: the expression yields '' (intended to leave the default run name in place)
run-name: ${{ inputs.target_stage && (inputs.pr_head_sha && format('[{0}] {1}', inputs.target_stage, inputs.pr_head_sha) || format('[{0}]', inputs.target_stage)) || '' }}

on:
  # Periodic run, every 6 hours.
  schedule:
    - cron: "0 */6 * * *"
  pull_request:
    branches:
      - main
  # Manual dispatch; /rerun-stage also arrives here (with target_stage set).
  workflow_dispatch:
    inputs:
      version:
        description: 'FlashInfer version'
        type: choice
        required: true
        default: 'release'
        options:
          - 'release'
          - 'nightly'
      target_stage:
        description: 'Specific stage to run (optional, for quick testing)'
        type: string
        required: false
        default: ''
      force_continue_on_error:
        description: 'Force continue-on-error (test scheduled CI behavior)'
        type: boolean
        required: false
        default: false
      pr_head_sha:
        description: 'PR head SHA to checkout (for /rerun-stage on fork PRs)'
        type: string
        required: false
        default: ''
      test_parallel_dispatch:
        description: 'Test parallel dispatch behavior (simulates scheduled run)'
        type: boolean
        required: false
        default: false
  # Entry point when this file is used as a reusable workflow.
  workflow_call:
    inputs:
      ref:
        description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.'
        type: string
        required: false
        default: ''
      run_all_tests:
        description: 'Run all tests (for releasing or testing purpose)'
        type: boolean
        required: false
        default: false

concurrency:
  # Concurrency group structure: pr-test-{event}-{branch}-{pr_sha}-{stage}
  # - event_name prevents scheduled runs from colliding with fork PRs whose branch is named 'main'
  #   (without it, both resolve the branch segment to 'main' and block each other)
  # - github.head_ref (pull_request) or github.ref_name (workflow_dispatch) normalizes to branch name
  # - pr_head_sha isolates /rerun-stage from main branch runs
  # - target_stage allows parallel stage dispatches to run independently
  #   (the last segment falls back to inputs.ref, then to 'all', when target_stage is empty)
  group: pr-test-${{ github.event_name }}-${{ github.head_ref || github.ref_name || 'default' }}-${{ inputs.pr_head_sha || 'current' }}-${{ inputs.target_stage || inputs.ref || 'all' }}
  # NOTE(review): inside a reusable workflow github.event_name is reported as the caller's event,
  # so this comparison may never see 'workflow_call' and would then always be true — confirm
  # whether runs started through workflow_call are really exempt from cancellation.
  cancel-in-progress: ${{ github.event_name != 'workflow_call' }}

# Workflow-wide environment. Values are written as explicit strings, which is
# how the runner hands them to processes anyway.
env:
  SGLANG_IS_IN_CI: "true"
  SGLANG_CUDA_COREDUMP: "1"
  SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "true"

# Token scopes granted to every job in this workflow.
permissions:
  # NOTE(review): the jobs visible in this part of the file only read from the Actions API
  # (job listing in the wait-for-stage-* jobs); presumably write is needed elsewhere — confirm.
  actions: write
  # Needed by actions/checkout.
  contents: read
  # Needed for the PR label lookups done with the gh CLI in check-changes.
  pull-requests: read

jobs:
  # =============================================== check changes ====================================================
  check-changes:
    # Works out which component groups a run has to test and publishes that, together with
    # run-wide settings (parallelism, B200 runner label, retry, continue-on-error), as job outputs.
    runs-on: ubuntu-latest
    outputs:
      # Use API-based detection for target_stage mode (filter-api), otherwise use dorny/paths-filter (filter)
      # The last operand (run_all_tests) is what remains when both detection steps were skipped.
      main_package: ${{ steps.filter-api.outputs.main_package || steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }}
      # sgl_kernel is forced to false when target_stage is set, since sgl-kernel-build-wheels won't run
      # This prevents CUSTOM_BUILD_SGL_KERNEL=true when the wheel artifacts aren't available
      # Note: If PR has kernel changes AND target_stage is set, the validate-target-stage step will fail
      # Unlike the other component flags, there is no run_all_tests fallback on this one.
      sgl_kernel: ${{ !inputs.target_stage && (steps.filter-api.outputs.sgl_kernel || steps.filter.outputs.sgl_kernel) }}
      # Raw sgl_kernel value before target_stage override (used for validation)
      sgl_kernel_raw: ${{ steps.filter-api.outputs.sgl_kernel || steps.filter.outputs.sgl_kernel }}
      jit_kernel: ${{ steps.filter-api.outputs.jit_kernel || steps.filter.outputs.jit_kernel || steps.run-mode.outputs.run_all_tests }}
      multimodal_gen: ${{ steps.filter-api.outputs.multimodal_gen || steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }}
      # Settings computed by the set-* steps of this job.
      max_parallel: ${{ steps.set-parallel.outputs.max_parallel }}
      b200_runner: ${{ steps.set-runner.outputs.b200_runner }}
      enable_retry: ${{ steps.set-retry.outputs.enable_retry }}
      continue_on_error: ${{ steps.set-continue-on-error.outputs.continue_on_error }}
    steps:
      # Ref precedence: explicit PR head SHA (/rerun-stage), then the workflow_call `ref` input,
      # then the commit that triggered this run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Determine run mode
        id: run-mode
        env:
          EVENT_NAME: ${{ github.event_name }}
          IS_SCHEDULE: ${{ github.event_name == 'schedule' }}
          RUN_ALL_TESTS_INPUT: ${{ inputs.run_all_tests }}
        run: |
          # Everything runs for scheduled events and whenever the run_all_tests input is true
          # (an input only the workflow_call trigger declares).
          # github.event_name is inherited from the caller, so it cannot be used to spot workflow_call.
          if [[ "$IS_SCHEDULE" != "true" && "$RUN_ALL_TESTS_INPUT" != "true" ]]; then
            echo "run_all_tests=false" >> "$GITHUB_OUTPUT"
            echo "Run mode: FILTERED (triggered by $EVENT_NAME)"
          else
            echo "run_all_tests=true" >> "$GITHUB_OUTPUT"
            echo "Run mode: ALL TESTS (schedule=$IS_SCHEDULE, run_all_tests=$RUN_ALL_TESTS_INPUT)"
          fi

      # Path-based change detection; one boolean output per filter name below.
      - name: Detect file changes
        id: filter
        uses: dorny/paths-filter@v3
        # Skipped when all tests run anyway (run_all_tests) and when target_stage is set.
        # For workflow_dispatch with target_stage, we use GitHub API in the next step
        # NOTE(review): the condition does not test the event type, so this step also runs for
        # non-PR triggers without target_stage — confirm paths-filter behaves as wanted there.
        if: steps.run-mode.outputs.run_all_tests != 'true' && !inputs.target_stage
        with:
          # Keep these lists in sync with the patterns in the API-based detection step.
          filters: |
            main_package:
              - "python/sglang/!(multimodal_gen)/**"
              - "python/pyproject.toml"
              - "scripts/ci/cuda/*"
              - "scripts/ci/utils/*"
              - "test/**"
              - ".github/workflows/pr-test.yml"
            sgl_kernel:
              - "sgl-kernel/**"
            jit_kernel:
              - "python/sglang/jit_kernel/**"
              - "python/pyproject.toml"
              - ".github/workflows/pr-test.yml"
            multimodal_gen:
              - "python/sglang/multimodal_gen/**"
              - "python/sglang/jit_kernel/**"
              - "python/sglang/cli/**"
              - "python/pyproject.toml"
              - ".github/workflows/pr-test.yml"

      # For /rerun-stage (workflow_dispatch with target_stage), dorny/paths-filter doesn't work
      # correctly because it falls back to "last commit" detection which breaks for merge commits.
      # Instead, we use the GitHub API to compare the PR commit against main.
      # The path patterns below are meant to mirror the `filters` of the paths-filter step.
      - name: Detect file changes via API (for target_stage)
        id: filter-api
        if: inputs.target_stage && inputs.pr_head_sha
        env:
          GH_TOKEN: ${{ github.token }}
          # pr_head_sha is a free-form dispatch input: hand it to the script through the
          # environment instead of splicing it into the shell source.
          PR_HEAD_SHA: ${{ inputs.pr_head_sha }}
          REPO: ${{ github.repository }}
        run: |
          echo "Detecting file changes via GitHub API for target_stage mode..."
          echo "PR head SHA: ${PR_HEAD_SHA}"

          # Get the list of changed files by comparing PR commit against main
          # This correctly handles merge commits by looking at the actual PR diff
          CHANGED_FILES=$(gh api "repos/${REPO}/compare/main...${PR_HEAD_SHA}" \
            --jq '[.files[].filename] | .[]' 2>/dev/null || echo "")

          # Best effort: an API error and an empty diff both end up here.
          if [ -z "$CHANGED_FILES" ]; then
            echo "Warning: Could not fetch changed files from API, assuming no changes"
            echo "sgl_kernel=false" >> "$GITHUB_OUTPUT"
            echo "main_package=false" >> "$GITHUB_OUTPUT"
            echo "jit_kernel=false" >> "$GITHUB_OUTPUT"
            echo "multimodal_gen=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          echo "Changed files:"
          echo "$CHANGED_FILES" | head -20
          echo "..."

          # Check for sgl-kernel changes
          if echo "$CHANGED_FILES" | grep -q "^sgl-kernel/"; then
            echo "sgl_kernel=true" >> "$GITHUB_OUTPUT"
            echo "Detected sgl-kernel changes"
          else
            echo "sgl_kernel=false" >> "$GITHUB_OUTPUT"
          fi

          # Check for main_package changes (excluding multimodal_gen)
          # Note: Need to filter out multimodal_gen before checking, not pipe grep -q output
          MAIN_PKG_FILES=$(echo "$CHANGED_FILES" | grep -E "^(python/sglang/|python/pyproject\.toml|scripts/ci/cuda/|scripts/ci/utils/|test/|\.github/workflows/pr-test\.yml)" | grep -v "^python/sglang/multimodal_gen/" || true)
          if [ -n "$MAIN_PKG_FILES" ]; then
            echo "main_package=true" >> "$GITHUB_OUTPUT"
            echo "Detected main_package changes"
          else
            echo "main_package=false" >> "$GITHUB_OUTPUT"
          fi

          # Check for jit_kernel changes
          if echo "$CHANGED_FILES" | grep -qE "^(python/sglang/jit_kernel/|python/pyproject\.toml|\.github/workflows/pr-test\.yml)"; then
            echo "jit_kernel=true" >> "$GITHUB_OUTPUT"
            echo "Detected jit_kernel changes"
          else
            echo "jit_kernel=false" >> "$GITHUB_OUTPUT"
          fi

          # Check for multimodal_gen changes
          # jit_kernel paths count as well, matching the multimodal_gen list of the paths-filter step
          if echo "$CHANGED_FILES" | grep -qE "^(python/sglang/multimodal_gen/|python/sglang/jit_kernel/|python/sglang/cli/|python/pyproject\.toml|\.github/workflows/pr-test\.yml)"; then
            echo "multimodal_gen=true" >> "$GITHUB_OUTPUT"
            echo "Detected multimodal_gen changes"
          else
            echo "multimodal_gen=false" >> "$GITHUB_OUTPUT"
          fi

      # Picks the matrix parallelism: 14 for scheduled runs and PRs labelled "high priority",
      # 3 otherwise.
      - name: Set max-parallel based on run type
        id: set-parallel
        env:
          GH_TOKEN: ${{ github.token }}
          # Everything the script needs comes in through the environment so that free-form
          # values (dispatch inputs, branch names) are never parsed as shell source.
          EVENT_NAME: ${{ github.event_name }}
          IS_HIGH_PRIORITY_PR: ${{ contains(github.event.pull_request.labels.*.name, 'high priority') }}
          TARGET_STAGE: ${{ inputs.target_stage }}
          PR_HEAD_SHA: ${{ inputs.pr_head_sha }}
          REF_NAME: ${{ github.ref_name }}
          REPO: ${{ github.repository }}
        run: |
          # Scheduled runs and high-priority PRs get full parallelism
          if [[ "$EVENT_NAME" == "schedule" ]]; then
            echo "max_parallel=14" >> "$GITHUB_OUTPUT"
            echo "Scheduled run detected, setting max_parallel to 14"
          elif [[ "$EVENT_NAME" == "pull_request" && "$IS_HIGH_PRIORITY_PR" == "true" ]]; then
            echo "max_parallel=14" >> "$GITHUB_OUTPUT"
            echo "High priority PR detected, setting max_parallel to 14"
          elif [[ -n "$TARGET_STAGE" ]]; then
            # /rerun-stage (workflow_dispatch): query PR labels via GitHub API
            # Try SHA lookup first (fork PRs), fallback to branch name (non-fork PRs)
            LABELS=""
            if [[ -n "$PR_HEAD_SHA" ]]; then
              LABELS=$(gh api "repos/${REPO}/commits/${PR_HEAD_SHA}/pulls" \
                --jq '.[0].labels[].name' 2>/dev/null || true)
            fi
            if [[ -z "$LABELS" ]]; then
              LABELS=$(gh pr list --head "$REF_NAME" --repo "$REPO" \
                --json labels --jq '.[0].labels[].name' 2>/dev/null || true)
            fi
            echo "PR labels: ${LABELS:-"(none)"}"
            # -Fx: the label has to match a whole line literally
            if echo "$LABELS" | grep -Fxq "high priority"; then
              echo "max_parallel=14" >> "$GITHUB_OUTPUT"
              echo "High priority PR detected via API (/rerun-stage), setting max_parallel to 14"
            else
              echo "max_parallel=3" >> "$GITHUB_OUTPUT"
              echo "Using default max_parallel of 3 (/rerun-stage, no high priority label)"
            fi
          else
            echo "max_parallel=3" >> "$GITHUB_OUTPUT"
            echo "Using default max_parallel of 3"
          fi

      - name: Set B200 runner tag
        id: set-runner
        env:
          # Use API-based detection (filter-api) for target_stage mode, otherwise use dorny/paths-filter (filter);
          # run_all_tests is what remains when both detection steps were skipped.
          SGL_KERNEL: ${{ steps.filter-api.outputs.sgl_kernel || steps.filter.outputs.sgl_kernel || steps.run-mode.outputs.run_all_tests }}
          # Free-form dispatch input: passed via env so it is never parsed as shell source.
          TARGET_STAGE: ${{ inputs.target_stage }}
        run: |
          # Use kernel-build runner only when sgl_kernel changes are detected AND we're not in target_stage mode
          # (target_stage skips wheel builds, so we can't use custom kernels)
          if [[ "$SGL_KERNEL" == "true" && -z "$TARGET_STAGE" ]]; then
            echo "b200_runner=4-gpu-b200-kernel" >> "$GITHUB_OUTPUT"
          else
            echo "b200_runner=4-gpu-b200" >> "$GITHUB_OUTPUT"
          fi

      # Retry is switched on unconditionally; the value is re-exported as the job's enable_retry output.
      - name: Enable retry for CI
        id: set-retry
        run: |
          echo "Retry logic enabled for CI"
          echo "enable_retry=true" >> "$GITHUB_OUTPUT"

      # continue_on_error is true for full test runs or when the force flag input is set.
      - name: Set continue-on-error for full test runs
        id: set-continue-on-error
        env:
          RUN_ALL_TESTS: ${{ steps.run-mode.outputs.run_all_tests }}
          FORCE_CONTINUE: ${{ inputs.force_continue_on_error }}
        run: |
          continue_on_error=false
          if [[ "$RUN_ALL_TESTS" == "true" || "$FORCE_CONTINUE" == "true" ]]; then
            continue_on_error=true
          fi

          echo "continue_on_error=${continue_on_error}" >> "$GITHUB_OUTPUT"

          if [[ "$continue_on_error" == "true" ]]; then
            echo "Full test run or force flag detected, enabling continue-on-error to run all tests"
          else
            echo "Filtered run, continue-on-error disabled"
          fi

      # Hard stop: a single-stage rerun cannot test sgl-kernel changes because the wheel build is skipped.
      - name: Validate target_stage with kernel changes
        # Use API-based detection (filter-api) for target_stage mode, otherwise use dorny/paths-filter (filter)
        # NOTE(review): the paths-filter step is skipped whenever target_stage is set, so only the
        # filter-api operand looks reachable here — confirm.
        if: inputs.target_stage && (steps.filter-api.outputs.sgl_kernel == 'true' || steps.filter.outputs.sgl_kernel == 'true')
        run: |
          # The ::error:: lines become annotations; the rest is the human-readable explanation.
          cat <<'EOF'
          ::error::Cannot use /rerun-stage when PR has sgl-kernel changes.
          ::error::The sgl-kernel-build-wheels job is skipped in target_stage mode, but this PR modifies sgl-kernel/ files.
          ::error::Please use /tag-and-rerun-ci to run the full workflow including kernel builds.

          ERROR: Cannot use /rerun-stage when PR has sgl-kernel changes.

          This PR modifies files in sgl-kernel/, which requires building custom kernel wheels.
          The /rerun-stage command skips the wheel build job, so the test would run against
          the wrong (PyPI) version of sgl-kernel instead of your changes.

          To properly test your kernel changes, use one of these commands instead:
            /tag-and-rerun-ci           - Re-run the full workflow including kernel builds
            /rerun-ci                   - Re-run the full workflow

          EOF
          exit 1

      # Writes the detection results and derived settings as a Markdown table to the job summary.
      - name: Show filter results in summary (table)
        env:
          # Free-form dispatch input: passed via env so it is never parsed as shell source.
          TARGET_STAGE: ${{ inputs.target_stage || '(none)' }}
          # Report the detection step that actually ran; both of them can be skipped.
          DETECTION_METHOD: ${{ steps.filter-api.outcome == 'success' && 'GitHub API' || steps.filter.outcome == 'success' && 'dorny/paths-filter' || '(none)' }}
        run: |
          {
            echo "## Change Detection"
            echo ""
            echo "| Component         | Changed |"
            echo "|-------------------|---------|"
            echo "| main_package      | ${{ steps.filter-api.outputs.main_package || steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }} |"
            echo "| sgl_kernel (raw)  | ${{ steps.filter-api.outputs.sgl_kernel || steps.filter.outputs.sgl_kernel }} |"
            echo "| sgl_kernel (used) | ${{ !inputs.target_stage && (steps.filter-api.outputs.sgl_kernel || steps.filter.outputs.sgl_kernel) }} |"
            echo "| jit_kernel        | ${{ steps.filter-api.outputs.jit_kernel || steps.filter.outputs.jit_kernel || steps.run-mode.outputs.run_all_tests }} |"
            echo "| multimodal_gen    | ${{ steps.filter-api.outputs.multimodal_gen || steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }} |"
            echo "| target_stage      | ${TARGET_STAGE} |"
            echo "| detection_method  | ${DETECTION_METHOD} |"
            echo "| max_parallel      | ${{ steps.set-parallel.outputs.max_parallel }} |"
            echo "| b200_runner       | ${{ steps.set-runner.outputs.b200_runner }} |"
            echo "| enable_retry      | ${{ steps.set-retry.outputs.enable_retry }} |"
            echo "| continue_on_error | ${{ steps.set-continue-on-error.outputs.continue_on_error }} |"
          } >> "$GITHUB_STEP_SUMMARY"

  # =============================================== Wait Jobs for Sequential PR Execution ====================================================
  # These jobs poll GitHub API to wait for previous stages to complete.
  # For PR runs: wait jobs run and enforce sequential execution via polling.
  # For scheduled runs: wait jobs are skipped, enabling parallel execution for easier retry.

  wait-for-stage-a:
    # Polls this run's job list until the job named 'stage-a-test-1' has completed.
    # Runs only for pull_request events without target_stage / test_parallel_dispatch, when
    # main_package or sgl_kernel changed, and when call-gate succeeded or was skipped.
    # !cancelled() keeps the job from running after a concurrency-group cancellation.
    needs: [check-changes, call-gate]
    runs-on: ubuntu-latest
    if: |
      always() &&
      !cancelled() &&
      github.event_name == 'pull_request' &&
      !inputs.target_stage &&
      inputs.test_parallel_dispatch != true &&
      (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true') &&
      (needs.call-gate.result == 'success' || needs.call-gate.result == 'skipped')
    outputs:
      # One of: success, skipped, failure, timeout
      stage_a_result: ${{ steps.wait.outputs.result }}
    steps:
      - name: Wait for stage-a-test-1 to complete
        id: wait
        uses: actions/github-script@v7
        with:
          script: |
            const maxWaitMinutes = 240;
            const pollIntervalSeconds = 120;  // 2 minutes to reduce GH API calls
            const maxAttempts = (maxWaitMinutes * 60) / pollIntervalSeconds;
            const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms));

            let attempt = 0;
            while (attempt < maxAttempts) {
              const runJobs = await github.paginate(github.rest.actions.listJobsForWorkflowRun, {
                owner: context.repo.owner,
                repo: context.repo.repo,
                run_id: context.runId,
                per_page: 100,
              });

              const target = runJobs.find(({ name }) => name === 'stage-a-test-1');

              if (!target) {
                console.log('stage-a-test-1 job not found yet');
              } else {
                console.log(`stage-a-test-1 status: ${target.status}, conclusion: ${target.conclusion}`);

                if (target.status === 'completed') {
                  switch (target.conclusion) {
                    case 'success':
                    case 'skipped':
                      // Pass the benign conclusion through unchanged.
                      core.setOutput('result', target.conclusion);
                      break;
                    default:
                      core.setOutput('result', 'failure');
                      core.setFailed(`stage-a-test-1 ${target.conclusion}`);
                  }
                  return;
                }
              }

              attempt += 1;
              console.log(`Waiting ${pollIntervalSeconds}s... (attempt ${attempt}/${maxAttempts})`);
              await sleep(pollIntervalSeconds * 1000);
            }

            // Polling budget used up without seeing a completed job.
            core.setFailed('Timeout waiting for stage-a-test-1');
            core.setOutput('result', 'timeout');

  wait-for-stage-b:
    needs: [check-changes, call-gate, wait-for-stage-a]
    # Polls this run's job list until every expected stage-b job has completed.
    # Only run for PRs (not scheduled) and when not targeting a specific stage
    # Skip if call-gate failed (stage-b jobs will be skipped, nothing to wait for)
    if: |
      always() &&
      !cancelled() &&
      github.event_name == 'pull_request' &&
      !inputs.target_stage &&
      inputs.test_parallel_dispatch != true &&
      (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true') &&
      (needs.wait-for-stage-a.result == 'success' || needs.wait-for-stage-a.result == 'skipped') &&
      (needs.call-gate.result == 'success' || needs.call-gate.result == 'skipped')
    runs-on: ubuntu-latest
    # Job limit made explicit (360 is also the default). The polling budget in the script must
    # stay below it, otherwise the runner ends the job before the script can report 'timeout'.
    timeout-minutes: 360
    outputs:
      # One of: success, failure, timeout
      stage_b_result: ${{ steps.wait.outputs.result }}
    steps:
      - name: Wait for stage-b jobs to complete
        id: wait
        uses: actions/github-script@v7
        with:
          script: |
            // Must stay below the job's timeout-minutes (360) so the timeout branch at the end is reachable.
            const maxWaitMinutes = 350;
            const pollIntervalSeconds = 120;  // 2 minutes to reduce GH API calls
            const maxAttempts = (maxWaitMinutes * 60) / pollIntervalSeconds;

            // Stage-b jobs to wait for
            // NOTE(review): expectedCount has to match the matrix sizes of the stage-b jobs defined
            // elsewhere in this workflow; a matrix job skipped as a whole presumably shows up as a
            // single entry and would keep this loop waiting — confirm.
            const stageBJobs = [
              { prefix: 'stage-b-test-small-1-gpu', expectedCount: 8 },              // partitions 0-7
              { prefix: 'stage-b-test-large-1-gpu', expectedCount: 14 },             // partitions 0-13
              { prefix: 'stage-b-test-large-2-gpu', expectedCount: 4 },              // partitions 0-3
              { prefix: 'stage-b-test-4-gpu-b200', expectedCount: 1 },
            ];
            const totalExpectedJobs = stageBJobs.reduce((sum, j) => sum + j.expectedCount, 0);  // 27 total

            // Helper to match job names exactly (prefix alone or prefix + " (N)" for matrix jobs)
            const matchesPrefix = (jobName, prefix) => {
              return jobName === prefix || jobName.startsWith(prefix + ' (');
            };

            for (let attempt = 0; attempt < maxAttempts; attempt++) {
              const jobs = await github.paginate(github.rest.actions.listJobsForWorkflowRun, {
                owner: context.repo.owner,
                repo: context.repo.repo,
                run_id: context.runId,
                per_page: 100,
              });

              let allCompleted = true;
              let anyFailed = false;
              let failedJobs = [];
              let completedCount = 0;
              let totalCount = 0;

              for (const { prefix, expectedCount } of stageBJobs) {
                const matchingJobs = jobs.filter(job => matchesPrefix(job.name, prefix));

                // Check existing jobs for failures first (fail fast)
                for (const job of matchingJobs) {
                  totalCount++;
                  console.log(`${job.name}: status=${job.status}, conclusion=${job.conclusion}`);

                  if (job.status !== 'completed') {
                    allCompleted = false;
                  } else {
                    completedCount++;
                    if (job.conclusion !== 'success' && job.conclusion !== 'skipped') {
                      anyFailed = true;
                      failedJobs.push(job.name);
                    }
                  }
                }

                if (matchingJobs.length < expectedCount) {
                  console.log(`${prefix}: found ${matchingJobs.length}/${expectedCount} jobs (waiting for more)`);
                  allCompleted = false;
                }
              }

              console.log(`Progress: ${completedCount}/${totalCount} jobs completed (expected ${totalExpectedJobs})`);

              // Fail fast if any jobs failed (don't wait for all jobs to be created)
              if (anyFailed) {
                core.setOutput('result', 'failure');
                core.setFailed(`Stage-b jobs failed: ${failedJobs.join(', ')}`);
                return;
              }

              if (allCompleted && totalCount >= totalExpectedJobs) {
                core.setOutput('result', 'success');
                return;
              }

              // No point sleeping after the final poll; fall through to the timeout report instead.
              if (attempt + 1 < maxAttempts) {
                console.log(`Waiting ${pollIntervalSeconds}s... (attempt ${attempt + 1}/${maxAttempts})`);
                await new Promise(resolve => setTimeout(resolve, pollIntervalSeconds * 1000));
              }
            }

            core.setFailed('Timeout waiting for stage-b jobs');
            core.setOutput('result', 'timeout');

  # =============================================== PR Gate ====================================================
  call-gate:
    # Delegates to the reusable pr-gate workflow.
    # Not run for scheduled events, parallel-dispatch tests, or target_stage runs, and only
    # when at least one component group was reported as changed.
    needs: [check-changes]
    uses: ./.github/workflows/pr-gate.yml
    secrets: inherit
    if: >-
      github.event_name != 'schedule' &&
      inputs.test_parallel_dispatch != true &&
      !inputs.target_stage &&
      (needs.check-changes.outputs.main_package == 'true' ||
      needs.check-changes.outputs.sgl_kernel == 'true' ||
      needs.check-changes.outputs.jit_kernel == 'true' ||
      needs.check-changes.outputs.multimodal_gen == 'true')

  # =============================================== sgl-kernel ====================================================

  sgl-kernel-build-wheels:
    # Builds the x64 sgl-kernel wheel and uploads it as an artifact.
    needs: [check-changes, call-gate]
    # Skip for scheduled runs (they run stages independently) and when target_stage is set
    if: github.event_name != 'schedule' && inputs.test_parallel_dispatch != true && !inputs.target_stage && needs.check-changes.outputs.sgl_kernel == 'true'
    runs-on: x64-kernel-build-node
    timeout-minutes: 240
    strategy:
      matrix:
        include:
          - python-version: "3.10"
            cuda-version: "12.9"
          # Add back when CUDA 13.0 is supported on CI
          # - python-version: "3.10"
          #   cuda-version: "13.0"
    name: Build Wheel
    steps:
      - name: Cleanup
        run: |
          # Same guard as the arm build job: quote the path and check the directory exists, so an
          # empty or unusual GITHUB_WORKSPACE can never widen the privileged rm to another location.
          if [ -d "$GITHUB_WORKSPACE" ]; then
            sudo rm -rf "$GITHUB_WORKSPACE"/* || true
          else
            echo "$GITHUB_WORKSPACE does not exist, nothing to clean"
          fi

      - uses: actions/checkout@v4
        with:
          submodules: "recursive"
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
        run: |
          cd sgl-kernel
          ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
        env:
          USE_CCACHE: 1

      # The second ls fails the step if no .whl file was produced.
      - name: Verify wheel artifacts
        run: |
          ls -alh sgl-kernel/dist
          ls -alh sgl-kernel/dist/*.whl

      # The artifact name is what the sgl-kernel test jobs use as their download pattern.
      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}
          path: sgl-kernel/dist/*
          if-no-files-found: error

  sgl-kernel-build-wheels-arm:
    name: Build Wheel Arm
    # Arm-runner wheel build; the artifact carries an -aarch64 suffix.
    # Not run for scheduled events, parallel-dispatch tests, or when target_stage is set.
    needs: [check-changes, call-gate]
    if: >-
      github.event_name != 'schedule' &&
      inputs.test_parallel_dispatch != true &&
      !inputs.target_stage &&
      needs.check-changes.outputs.sgl_kernel == 'true'
    runs-on: arm-kernel-build-node
    timeout-minutes: 240
    strategy:
      matrix:
        include:
          - cuda-version: "12.9"
            python-version: "3.10"
    steps:
      - name: Cleanup
        run: |
          if [ ! -d "$GITHUB_WORKSPACE" ]; then
            echo "$GITHUB_WORKSPACE does not exist, nothing to clean"
          else
            sudo rm -rf "$GITHUB_WORKSPACE"/* || true
          fi

      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
          submodules: "recursive"

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Build wheel for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
        env:
          USE_CCACHE: 1
        working-directory: sgl-kernel
        run: ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"

      # Listing *.whl fails the step when the build produced no wheel.
      - name: Verify wheel artifacts
        run: |
          ls -alh sgl-kernel/dist
          ls -alh sgl-kernel/dist/*.whl

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: wheel-python${{ matrix.python-version }}-cuda${{ matrix.cuda-version }}-aarch64
          path: sgl-kernel/dist/*
          if-no-files-found: error

  sgl-kernel-unit-test:
    # Downloads the x64 wheel artifact, installs dependencies and runs the sgl-kernel pytest suite.
    # Not run for scheduled events, parallel-dispatch tests, or when target_stage is set.
    needs: [check-changes, call-gate, sgl-kernel-build-wheels]
    if: >-
      github.event_name != 'schedule' &&
      inputs.test_parallel_dispatch != true &&
      !inputs.target_stage &&
      needs.check-changes.outputs.sgl_kernel == 'true'
    runs-on: 1-gpu-runner
    timeout-minutes: 240
    env:
      RUNNER_LABELS: 1-gpu-runner
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # Drop wheels left in the dist directory before downloading the fresh artifact.
      - name: Cleanup
        run: |
          ls -alh sgl-kernel/dist || true
          rm -rf sgl-kernel/dist/* || true

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: wheel-python3.10-cuda12.9
          merge-multiple: true
          path: sgl-kernel/dist/

      - name: Install dependencies
        timeout-minutes: 20
        env:
          CUSTOM_BUILD_SGL_KERNEL: ${{ needs.check-changes.outputs.sgl_kernel }}
        run: bash scripts/ci/cuda/ci_install_dependency.sh diffusion

      - name: Run test
        timeout-minutes: 30
        working-directory: sgl-kernel
        run: pytest tests/

  sgl-kernel-mla-test:
    # Downloads the x64 wheel artifact, installs dependencies and runs the MLA DeepSeek-V3 test script.
    # Not run for scheduled events, parallel-dispatch tests, or when target_stage is set.
    needs: [check-changes, call-gate, sgl-kernel-build-wheels]
    if: >-
      github.event_name != 'schedule' &&
      inputs.test_parallel_dispatch != true &&
      !inputs.target_stage &&
      needs.check-changes.outputs.sgl_kernel == 'true'
    runs-on: 1-gpu-runner
    timeout-minutes: 240
    env:
      RUNNER_LABELS: 1-gpu-runner
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # Drop wheels left in the dist directory before downloading the fresh artifact.
      - name: Cleanup
        run: |
          ls -alh sgl-kernel/dist || true
          rm -rf sgl-kernel/dist/* || true

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: wheel-python3.10-cuda12.9
          merge-multiple: true
          path: sgl-kernel/dist/

      - name: Install dependencies
        timeout-minutes: 20
        env:
          CUSTOM_BUILD_SGL_KERNEL: ${{ needs.check-changes.outputs.sgl_kernel }}
        run: bash scripts/ci/cuda/ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 30
        working-directory: test/registered/mla
        run: python3 test_mla_deepseek_v3.py

  sgl-kernel-benchmark-test:
    # Smoke-runs every sgl-kernel/benchmark/bench_*.py script against the freshly built wheel.
    # Not run for scheduled events, parallel-dispatch tests, or when target_stage is set.
    needs: [check-changes, call-gate, sgl-kernel-build-wheels]
    if: >-
      github.event_name != 'schedule' &&
      inputs.test_parallel_dispatch != true &&
      !inputs.target_stage &&
      needs.check-changes.outputs.sgl_kernel == 'true'
    runs-on: 1-gpu-runner
    timeout-minutes: 240
    env:
      RUNNER_LABELS: 1-gpu-runner
      CI: true
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # Drop wheels left in the dist directory before downloading the fresh artifact.
      - name: Cleanup
        run: |
          ls -alh sgl-kernel/dist || true
          rm -rf sgl-kernel/dist/* || true

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: wheel-python3.10-cuda12.9
          merge-multiple: true
          path: sgl-kernel/dist/

      - name: Install dependencies
        timeout-minutes: 20
        env:
          CUSTOM_BUILD_SGL_KERNEL: ${{ needs.check-changes.outputs.sgl_kernel }}
        run: bash scripts/ci/cuda/ci_install_dependency.sh

      - name: Run benchmark tests
        timeout-minutes: 45
        working-directory: sgl-kernel/benchmark
        run: |
          echo "Running sgl-kernel benchmark tests in CI mode..."

          echo "CI environment variable: $CI"
          echo "GITHUB_ACTIONS environment variable: $GITHUB_ACTIONS"

          # Best effort on purpose: each script gets 60 seconds, and a failure or timeout
          # is reported as a warning without failing the step.
          for bench_file in bench_*.py; do
            echo "Testing $bench_file..."
            if ! timeout 60 python3 "$bench_file"; then
              echo "Warning: $bench_file timed out or failed, continuing..."
            fi
            echo "Completed $bench_file"
            echo "---"
          done

          echo "All benchmark tests completed!"

  sgl-kernel-b200-test:
    # sgl-kernel pytest suite on the B200 runner chosen by check-changes.
    # Not run for scheduled events, parallel-dispatch tests, or when target_stage is set.
    needs: [check-changes, sgl-kernel-build-wheels]
    if: >-
      github.event_name != 'schedule' &&
      inputs.test_parallel_dispatch != true &&
      !inputs.target_stage &&
      needs.check-changes.outputs.sgl_kernel == 'true'
    runs-on: ${{ needs.check-changes.outputs.b200_runner }}
    timeout-minutes: 240
    env:
      RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }}
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # Drop wheels left in the dist directory before downloading the fresh artifact.
      - name: Cleanup
        run: |
          ls -alh sgl-kernel/dist || true
          rm -rf sgl-kernel/dist/* || true

      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: wheel-python3.10-cuda12.9
          merge-multiple: true
          path: sgl-kernel/dist/

      - name: Install dependencies
        timeout-minutes: 20
        env:
          CUSTOM_BUILD_SGL_KERNEL: ${{ needs.check-changes.outputs.sgl_kernel }}
          IS_BLACKWELL: "1"
        run: bash scripts/ci/cuda/ci_install_dependency.sh diffusion

      - name: Run sgl-kernel unit tests on B200
        timeout-minutes: 30
        working-directory: sgl-kernel
        run: pytest tests/

  # Single CUDA 13 smoke test to verify that the kernel builds and runs.
  # The job is currently disabled (commented out) because it does not pass on CI.
  # TODO: Re-enable this job once it passes on CI.
  # cuda13-kernel-smoke-test:
  #   needs: [check-changes, sgl-kernel-build-wheels]
  #   if: needs.check-changes.outputs.sgl_kernel == 'true'
  #   runs-on: x64-cu13-kernel-tests
  #   steps:
  #     - uses: actions/checkout@v4

  #     - name: Cleanup
  #       run: |
  #         ls -alh sgl-kernel/dist || true
  #         rm -rf sgl-kernel/dist/* || true

  #     - name: Download CUDA 13.0 artifacts
  #       uses: actions/download-artifact@v4
  #       with:
  #         path: sgl-kernel/dist/
  #         merge-multiple: true
  #         pattern: wheel-python3.10-cuda13.0

  #     - name: Install dependencies
  #       run: |
  #         CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh

  #     - name: Run kernel unit tests
  #       timeout-minutes: 30
  #       run: |
  #         cd sgl-kernel
  #         pytest tests/

  # =============================================== jit-kernel ====================================================

  # Runs the pytest suite under python/sglang/jit_kernel on 1-gpu-runner.
  jit-kernel-unit-test:
    needs: [check-changes, call-gate]
    # Skip for scheduled runs, for test_parallel_dispatch runs, and when
    # target_stage is set. Otherwise run only when the check-changes output
    # jit_kernel is 'true'.
    if: |
      github.event_name != 'schedule' &&
      inputs.test_parallel_dispatch != true &&
      !inputs.target_stage &&
      needs.check-changes.outputs.jit_kernel == 'true'
    runs-on: 1-gpu-runner
    timeout-minutes: 240
    env:
      # Same value as the runs-on label above.
      RUNNER_LABELS: 1-gpu-runner
    steps:
      # Ref precedence: pr_head_sha input, then the workflow_call ref input,
      # then the SHA that triggered the run.
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # No wheel artifact is downloaded in this job; the install script is
      # invoked without CUSTOM_BUILD_SGL_KERNEL.
      - name: Install dependencies
        timeout-minutes: 20
        run: |
          bash scripts/ci/cuda/ci_install_dependency.sh

      - name: Run test
        timeout-minutes: 30
        run: |
          cd python/sglang/jit_kernel
          pytest tests/

  # =============================================== primary ====================================================

  # Runs the stage-a-test-1 CUDA suite, then the `default` CPU suite, on
  # 1-gpu-runner.
  stage-a-test-1:
    needs: [check-changes, call-gate, sgl-kernel-build-wheels]
    # always() makes this condition get evaluated even when a needed job
    # failed or was skipped. The job runs when:
    #   - target_stage names this job, or
    #   - target_stage is empty, AND (the run is scheduled /
    #     test_parallel_dispatch, OR no earlier job failed or was cancelled),
    #     AND the check-changes output main_package or sgl_kernel is 'true'.
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-a-test-1') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    runs-on: 1-gpu-runner
    timeout-minutes: 240
    env:
      # Same value as the runs-on label above.
      RUNNER_LABELS: 1-gpu-runner
    steps:
      # Ref precedence: pr_head_sha input, then the workflow_call ref input,
      # then the SHA that triggered the run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # The wheel artifact is fetched only when the check-changes output
      # sgl_kernel is 'true'.
      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh

      # --continue-on-error is forwarded to run_suite.py only when the
      # check-changes output continue_on_error is 'true'. The 10-minute limit
      # covers both run_suite.py invocations.
      - name: Run test
        timeout-minutes: 10
        run: |
          cd test/
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 run_suite.py --hw cuda --suite stage-a-test-1 $CONTINUE_ON_ERROR_FLAG
          # temporarily put backend-independent cpu tests here
          python3 run_suite.py --hw cpu --suite default $CONTINUE_ON_ERROR_FLAG

      # Local composite action; runs even when an earlier step failed.
      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()

  # Runs the stage-a-cpu-only CPU suite on a GitHub-hosted ubuntu-latest
  # runner; no GPU runner and no sgl-kernel wheel are involved.
  stage-a-cpu-only:
    needs: [check-changes, call-gate]
    # always() makes this condition get evaluated even when a needed job
    # failed or was skipped. The job runs when:
    #   - target_stage names this job, or
    #   - target_stage is empty, AND (the run is scheduled /
    #     test_parallel_dispatch, OR no earlier job failed or was cancelled),
    #     AND the check-changes output main_package is 'true'.
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-a-cpu-only') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          (needs.check-changes.outputs.main_package == 'true')
        )
      )
    runs-on: ubuntu-latest
    timeout-minutes: 240
    steps:
      # Delete the dotnet, Android and GHC directories from the runner image,
      # then print the resulting disk usage.
      - name: Free disk space
        run: |
          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc
          df -h

      # Ref precedence: pr_head_sha input, then the workflow_call ref input,
      # then the SHA that triggered the run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # The version is quoted so YAML keeps it as the string '3.10' rather
      # than parsing it as the float 3.1.
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      # Editable install of the package in python/ with its `dev` extra.
      - name: Install dependencies
        timeout-minutes: 20
        run: |
          pip install -e "python/[dev]"

      # --continue-on-error is forwarded to run_suite.py only when the
      # check-changes output continue_on_error is 'true'.
      - name: Run test
        timeout-minutes: 10
        run: |
          cd test/
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 run_suite.py --hw cpu --suite stage-a-cpu-only $CONTINUE_ON_ERROR_FLAG

  # Runs on 5090 (32GB, SM120)
  # Executes the stage-b-test-small-1-gpu CUDA suite split across 8 matrix
  # partitions.
  stage-b-test-small-1-gpu:
    needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels]
    # always() makes this condition get evaluated even when a needed job
    # failed or was skipped. The job runs when:
    #   - target_stage names this job, or
    #   - target_stage is empty, AND (the run is scheduled /
    #     test_parallel_dispatch, OR no earlier job failed or was cancelled),
    #     AND the check-changes output main_package or sgl_kernel is 'true'.
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-b-test-small-1-gpu') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    runs-on: 1-gpu-5090
    timeout-minutes: 240
    env:
      # Same value as the runs-on label above.
      RUNNER_LABELS: 1-gpu-5090
      # Set job-wide, so every step sees it; quoted to stay a string.
      IS_BLACKWELL: "1"
    strategy:
      # One failing partition does not cancel the others.
      fail-fast: false
      # Equal to the number of partitions, so all of them may run at once.
      max-parallel: 8
      matrix:
        # Must stay in sync with --auto-partition-size in the "Run test" step.
        partition: [0, 1, 2, 3, 4, 5, 6, 7]
    steps:
      # Ref precedence: pr_head_sha input, then the workflow_call ref input,
      # then the SHA that triggered the run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # The wheel artifact is fetched only when the check-changes output
      # sgl_kernel is 'true'.
      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      # Sources a script that is expected to exist on the runner
      # (NOTE(review): its contents are not part of this repository view),
      # runs the install script, then installs human-eval in editable mode
      # from a fresh clone.
      - name: Install dependencies
        timeout-minutes: 20
        run: |
          source /etc/profile.d/sglang-ci.sh
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh
          git clone https://github.com/merrymercy/human-eval.git
          cd human-eval
          pip install -e . --no-build-isolation

      # Each matrix job passes its own partition id; --continue-on-error is
      # forwarded only when the check-changes output continue_on_error is
      # 'true'.
      - name: Run test
        timeout-minutes: 30
        run: |
          source /etc/profile.d/sglang-ci.sh
          cd test/
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 run_suite.py --hw cuda --suite stage-b-test-small-1-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 8 $CONTINUE_ON_ERROR_FLAG

      # Local composite action; runs even when an earlier step failed. The
      # partition id is passed as the artifact suffix.
      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()
        with:
          artifact-suffix: ${{ matrix.partition }}

  # Runs on H100 (80GB, SM90) - tests that don't pass on 5090 (FA3, FP8, high VRAM, etc.)
  # Executes the stage-b-test-large-1-gpu CUDA suite split across 14 matrix
  # partitions.
  stage-b-test-large-1-gpu:
    needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels]
    # always() makes this condition get evaluated even when a needed job
    # failed or was skipped. The job runs when:
    #   - target_stage names this job, or
    #   - target_stage is empty, AND (the run is scheduled /
    #     test_parallel_dispatch, OR no earlier job failed or was cancelled),
    #     AND the check-changes output main_package or sgl_kernel is 'true'.
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-b-test-large-1-gpu') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    runs-on: 1-gpu-runner
    timeout-minutes: 240
    env:
      # Same value as the runs-on label above.
      RUNNER_LABELS: 1-gpu-runner
    strategy:
      # One failing partition does not cancel the others.
      fail-fast: false
      # The concurrency cap is computed by check-changes; fromJson turns the
      # string output into a JSON value.
      max-parallel: ${{ fromJson(needs.check-changes.outputs.max_parallel) }}
      matrix:
        # Must stay in sync with --auto-partition-size in the "Run test" step.
        partition: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
    steps:
      # Ref precedence: pr_head_sha input, then the workflow_call ref input,
      # then the SHA that triggered the run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # The wheel artifact is fetched only when the check-changes output
      # sgl_kernel is 'true'.
      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh

      # Each matrix job passes its own partition id and --timeout-per-file
      # 1800; --continue-on-error is forwarded only when the check-changes
      # output continue_on_error is 'true'.
      # NOTE(review): the step limit is 30 minutes; if --timeout-per-file is
      # in seconds, a single file could use the whole step budget — confirm.
      - name: Run test
        timeout-minutes: 30
        run: |
          cd test/
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 run_suite.py --hw cuda --suite stage-b-test-large-1-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 14 --timeout-per-file 1800 $CONTINUE_ON_ERROR_FLAG

      # Local composite action; runs even when an earlier step failed. The
      # partition id is passed as the artifact suffix.
      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()
        with:
          artifact-suffix: ${{ matrix.partition }}

  # Executes the stage-b-test-large-2-gpu CUDA suite on 2-gpu-runner, split
  # across 4 matrix partitions.
  stage-b-test-large-2-gpu:
    needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels]
    # always() makes this condition get evaluated even when a needed job
    # failed or was skipped. The job runs when:
    #   - target_stage names this job, or
    #   - target_stage is empty, AND (the run is scheduled /
    #     test_parallel_dispatch, OR no earlier job failed or was cancelled),
    #     AND the check-changes output main_package or sgl_kernel is 'true'.
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-b-test-large-2-gpu') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    runs-on: 2-gpu-runner
    timeout-minutes: 240
    env:
      # Same value as the runs-on label above.
      RUNNER_LABELS: 2-gpu-runner
    strategy:
      # One failing partition does not cancel the others.
      fail-fast: false
      matrix:
        # Must stay in sync with --auto-partition-size in the "Run test" step.
        partition: [0, 1, 2, 3]
    steps:
      # Ref precedence: pr_head_sha input, then the workflow_call ref input,
      # then the SHA that triggered the run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # The wheel artifact is fetched only when the check-changes output
      # sgl_kernel is 'true'.
      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      # Runs the install script, then installs human-eval in editable mode
      # from a fresh clone.
      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh
          git clone https://github.com/merrymercy/human-eval.git
          cd human-eval
          pip install -e . --no-build-isolation

      # Each matrix job passes its own partition id; --continue-on-error is
      # forwarded only when the check-changes output continue_on_error is
      # 'true'.
      - name: Run test
        timeout-minutes: 30
        run: |
          cd test/
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 run_suite.py --hw cuda --suite stage-b-test-large-2-gpu --auto-partition-id ${{ matrix.partition }} --auto-partition-size 4 $CONTINUE_ON_ERROR_FLAG

      # Local composite action; runs even when an earlier step failed. The
      # partition id is passed as the artifact suffix.
      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()
        with:
          artifact-suffix: ${{ matrix.partition }}

  # Runs the stage-b-test-4-gpu-b200 CUDA suite, followed by the
  # test_flash_attention_4.py jit_kernel test, on the runner named by the
  # check-changes output b200_runner. The job is not partitioned.
  stage-b-test-4-gpu-b200:
    needs: [check-changes, call-gate, wait-for-stage-a, sgl-kernel-build-wheels]
    # always() makes this condition get evaluated even when a needed job
    # failed or was skipped. The job runs when:
    #   - target_stage names this job, or
    #   - target_stage is empty, AND (the run is scheduled /
    #     test_parallel_dispatch, OR no earlier job failed or was cancelled),
    #     AND the check-changes output main_package or sgl_kernel is 'true'.
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-b-test-4-gpu-b200') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    runs-on: ${{ needs.check-changes.outputs.b200_runner }}
    timeout-minutes: 240
    env:
      # Same value as the runs-on label above.
      RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }}
    # No `strategy` block: this job has no matrix, so `fail-fast` would have
    # nothing to act on.
    steps:
      # Ref precedence: pr_head_sha input, then the workflow_call ref input,
      # then the SHA that triggered the run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # The wheel artifact is fetched only when the check-changes output
      # sgl_kernel is 'true'. Pinned to the same major version (v4) that
      # sgl-kernel-b200-test uses on this same b200_runner.
      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      # IS_BLACKWELL=1 is set per command here rather than in the job env.
      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/cuda/ci_install_dependency.sh

      # --continue-on-error is forwarded to run_suite.py only when the
      # check-changes output continue_on_error is 'true'.
      - name: Run test
        timeout-minutes: 30
        run: |
          cd test
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-b-test-4-gpu-b200 $CONTINUE_ON_ERROR_FLAG

      # Runs from the repository root (each `run` step starts in the default
      # working directory), and only if the previous step succeeded.
      - name: Run FA4 jit_kernel tests (SM100+)
        timeout-minutes: 10
        run: |
          IS_BLACKWELL=1 python3 -m pytest -q python/sglang/jit_kernel/tests/test_flash_attention_4.py

      # Local composite action; runs even when an earlier step failed.
      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()

  # Runs the multimodal_gen `1-gpu` suite on 1-gpu-runner, split across 2
  # matrix parts.
  multimodal-gen-test-1-gpu:
    needs: [check-changes, call-gate, sgl-kernel-build-wheels]
    # always() makes this condition get evaluated even when a needed job
    # failed or was skipped. The job runs when:
    #   - target_stage names this job, or
    #   - target_stage is empty, AND (the run is scheduled /
    #     test_parallel_dispatch, OR no earlier job failed or was cancelled),
    #     AND the check-changes output multimodal_gen is 'true'.
    if: |
      always() &&
      (
        (inputs.target_stage == 'multimodal-gen-test-1-gpu') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          needs.check-changes.outputs.multimodal_gen == 'true'
        )
      )
    # NOTE(review): unlike the other GPU jobs in this file, no RUNNER_LABELS
    # env is set here — confirm that is intentional.
    runs-on: 1-gpu-runner
    timeout-minutes: 240
    strategy:
      # One failing part does not cancel the other.
      fail-fast: false
      matrix:
        # Must stay in sync with --total-partitions in the test step.
        part: [0, 1]
    steps:
      # Ref precedence: pr_head_sha input, then the workflow_call ref input,
      # then the SHA that triggered the run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # The wheel artifact is fetched only when the check-changes output
      # sgl_kernel is 'true'.
      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      # The install script is invoked with the `diffusion` argument.
      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh diffusion
      # The step limit equals the job limit (240 minutes), so the job timeout
      # is the effective bound. --continue-on-error is forwarded only when the
      # check-changes output continue_on_error is 'true'.
      - name: Run diffusion server tests
        timeout-minutes: 240
        env:
          # NOTE(review): the meaning of 0 is defined by whatever reads this
          # variable; it is not visible in this workflow.
          RUNAI_STREAMER_MEMORY_LIMIT: 0
        run: |
          cd python
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 sglang/multimodal_gen/test/run_suite.py \
            --suite 1-gpu \
            --partition-id ${{ matrix.part }} \
            --total-partitions 2 \
            $CONTINUE_ON_ERROR_FLAG

      # Local composite action; runs even when an earlier step failed. The
      # part id is passed as the artifact suffix.
      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()
        with:
          artifact-suffix: ${{ matrix.part }}

  # Runs the multimodal_gen `2-gpu` suite on 2-gpu-runner, split across 2
  # matrix parts.
  multimodal-gen-test-2-gpu:
    needs: [check-changes, call-gate, sgl-kernel-build-wheels]
    # always() makes this condition get evaluated even when a needed job
    # failed or was skipped. The job runs when:
    #   - target_stage names this job, or
    #   - target_stage is empty, AND (the run is scheduled /
    #     test_parallel_dispatch, OR no earlier job failed or was cancelled),
    #     AND the check-changes output multimodal_gen is 'true'.
    if: |
      always() &&
      (
        (inputs.target_stage == 'multimodal-gen-test-2-gpu') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          needs.check-changes.outputs.multimodal_gen == 'true'
        )
      )
    # NOTE(review): unlike the other GPU jobs in this file, no RUNNER_LABELS
    # env is set here — confirm that is intentional.
    runs-on: 2-gpu-runner
    timeout-minutes: 240
    strategy:
      # One failing part does not cancel the other.
      fail-fast: false
      matrix:
        # Must stay in sync with --total-partitions in the test step.
        part: [0, 1]
    steps:
      # Ref precedence: pr_head_sha input, then the workflow_call ref input,
      # then the SHA that triggered the run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # The wheel artifact is fetched only when the check-changes output
      # sgl_kernel is 'true'.
      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      # The install script is invoked with the `diffusion` argument.
      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh diffusion

      # The step limit equals the job limit (240 minutes), so the job timeout
      # is the effective bound. --continue-on-error is forwarded only when the
      # check-changes output continue_on_error is 'true'.
      - name: Run diffusion server tests
        timeout-minutes: 240
        env:
          # NOTE(review): the meaning of 0 is defined by whatever reads this
          # variable; it is not visible in this workflow.
          RUNAI_STREAMER_MEMORY_LIMIT: 0
        run: |
          cd python
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 sglang/multimodal_gen/test/run_suite.py \
            --suite 2-gpu \
            --partition-id ${{ matrix.part }} \
            --total-partitions 2 \
            $CONTINUE_ON_ERROR_FLAG

      # Local composite action; runs even when an earlier step failed. The
      # part id is passed as the artifact suffix.
      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()
        with:
          artifact-suffix: ${{ matrix.part }}

  # Executes the stage-c-test-4-gpu-h100 CUDA suite on the 4-gpu-h100 runner,
  # split across 3 matrix parts. Gated on wait-for-stage-b.
  stage-c-test-4-gpu-h100:
    needs: [check-changes, call-gate, wait-for-stage-b]
    # always() makes this condition get evaluated even when a needed job
    # failed or was skipped. The job runs when:
    #   - target_stage names this job, or
    #   - target_stage is empty, AND (the run is scheduled /
    #     test_parallel_dispatch, OR no earlier job failed or was cancelled),
    #     AND the check-changes output main_package or sgl_kernel is 'true'.
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-c-test-4-gpu-h100') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    runs-on: 4-gpu-h100
    timeout-minutes: 240
    env:
      # Same value as the runs-on label above.
      RUNNER_LABELS: 4-gpu-h100
    strategy:
      # One failing part does not cancel the others.
      fail-fast: false
      matrix:
        # Must stay in sync with --auto-partition-size in the "Run test" step.
        part: [0, 1, 2]
    steps:
      # Ref precedence: pr_head_sha input, then the workflow_call ref input,
      # then the SHA that triggered the run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # The wheel artifact is fetched only when the check-changes output
      # sgl_kernel is 'true'.
      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh

      # Each matrix job passes its own part id; --continue-on-error is
      # forwarded only when the check-changes output continue_on_error is
      # 'true'.
      - name: Run test
        timeout-minutes: 20
        run: |
          cd test
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 run_suite.py --hw cuda --suite stage-c-test-4-gpu-h100 --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 $CONTINUE_ON_ERROR_FLAG

      # Local composite action; runs even when an earlier step failed. The
      # part id is passed as the artifact suffix.
      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()
        with:
          artifact-suffix: ${{ matrix.part }}

  # Executes the stage-c-test-8-gpu-h200 CUDA suite on the 8-gpu-h200 runner,
  # split across 4 matrix parts, after two warmup steps. Gated on
  # wait-for-stage-b.
  stage-c-test-8-gpu-h200:
    needs: [check-changes, call-gate, wait-for-stage-b]
    # always() makes this condition get evaluated even when a needed job
    # failed or was skipped. The job runs when:
    #   - target_stage names this job, or
    #   - target_stage is empty, AND (the run is scheduled /
    #     test_parallel_dispatch, OR no earlier job failed or was cancelled),
    #     AND the check-changes output main_package or sgl_kernel is 'true'.
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-c-test-8-gpu-h200') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    runs-on: 8-gpu-h200
    timeout-minutes: 240
    env:
      # Same value as the runs-on label above.
      RUNNER_LABELS: 8-gpu-h200
    strategy:
      # One failing part does not cancel the others.
      fail-fast: false
      matrix:
        # Must stay in sync with --auto-partition-size in the "Run test" step.
        part: [0, 1, 2, 3]
    steps:
      # Ref precedence: pr_head_sha input, then the workflow_call ref input,
      # then the SHA that triggered the run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # The wheel artifact is fetched only when the check-changes output
      # sgl_kernel is 'true'.
      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_dependency.sh

      # Arguments are `<model>:<n>` specs; presumably `:8` is the GPU /
      # tensor-parallel count — confirm against the script.
      - name: Warmup DeepGEMM JIT Compilation
        timeout-minutes: 25
        run: |
          python3 scripts/ci/cuda/warmup_deep_gemm.py \
            deepseek-ai/DeepSeek-V3-0324:8 \
            deepseek-ai/DeepSeek-V3.2-Exp:8

      # NOTE(review): the model list here differs from the DeepGEMM warmup
      # above (Ring-2.5-1T instead of DeepSeek-V3.2-Exp) — confirm intended.
      - name: Warmup Server CUDA Graphs
        timeout-minutes: 25
        run: |
          python3 scripts/ci/cuda/warmup_server.py \
            deepseek-ai/DeepSeek-V3-0324:8 \
            inclusionAI/Ring-2.5-1T:8

      # Each matrix job passes its own part id; --continue-on-error is
      # forwarded only when the check-changes output continue_on_error is
      # 'true'.
      - name: Run test
        timeout-minutes: 30
        run: |
          cd test
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 run_suite.py --hw cuda --suite stage-c-test-8-gpu-h200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 4 $CONTINUE_ON_ERROR_FLAG

      # Local composite action; runs even when an earlier step failed. The
      # part id is passed as the artifact suffix.
      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()
        with:
          artifact-suffix: ${{ matrix.part }}

  # Executes the stage-c-test-8-gpu-h20 CUDA suite on the 8-gpu-h20 runner,
  # split across 2 matrix parts. Dependencies are installed with
  # ci_install_deepep.sh. Gated on wait-for-stage-b.
  stage-c-test-8-gpu-h20:
    needs: [check-changes, call-gate, wait-for-stage-b]
    # always() makes this condition get evaluated even when a needed job
    # failed or was skipped. The job runs when:
    #   - target_stage names this job, or
    #   - target_stage is empty, AND (the run is scheduled /
    #     test_parallel_dispatch, OR no earlier job failed or was cancelled),
    #     AND the check-changes output main_package or sgl_kernel is 'true'.
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-c-test-8-gpu-h20') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    runs-on: 8-gpu-h20
    timeout-minutes: 240
    env:
      # Exported to every step; not referenced directly in this job's YAML,
      # so it is presumably read by the install script or the tests.
      SGLANG_CI_RDMA_ALL_DEVICES: "mlx5_1,mlx5_2,mlx5_3,mlx5_4"
      # Same value as the runs-on label above.
      RUNNER_LABELS: 8-gpu-h20
    strategy:
      # One failing part does not cancel the other.
      fail-fast: false
      matrix:
        # Must stay in sync with --auto-partition-size in the "Run test" step.
        part: [0, 1]
    steps:
      # Ref precedence: pr_head_sha input, then the workflow_call ref input,
      # then the SHA that triggered the run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # The wheel artifact is fetched only when the check-changes output
      # sgl_kernel is 'true'.
      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      # Uses ci_install_deepep.sh rather than ci_install_dependency.sh.
      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_deepep.sh

      # Each matrix job passes its own part id; --continue-on-error is
      # forwarded only when the check-changes output continue_on_error is
      # 'true'.
      - name: Run test
        timeout-minutes: 20
        run: |
          cd test
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 run_suite.py --hw cuda --suite stage-c-test-8-gpu-h20 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 $CONTINUE_ON_ERROR_FLAG

      # Local composite action; runs even when an earlier step failed. The
      # part id is passed as the artifact suffix.
      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()
        with:
          artifact-suffix: ${{ matrix.part }}

  # Executes the stage-c-test-deepep-4-gpu CUDA suite on the 4-gpu-h100
  # runner as a single, unpartitioned job, after two warmup steps.
  # Dependencies are installed with ci_install_deepep.sh. Gated on
  # wait-for-stage-b.
  stage-c-test-deepep-4-gpu:
    needs: [check-changes, call-gate, wait-for-stage-b]
    # always() makes this condition get evaluated even when a needed job
    # failed or was skipped. The job runs when:
    #   - target_stage names this job, or
    #   - target_stage is empty, AND (the run is scheduled /
    #     test_parallel_dispatch, OR no earlier job failed or was cancelled),
    #     AND the check-changes output main_package or sgl_kernel is 'true'.
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-c-test-deepep-4-gpu') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    runs-on: 4-gpu-h100
    timeout-minutes: 240
    env:
      # Same value as the runs-on label above.
      RUNNER_LABELS: 4-gpu-h100
    steps:
      # Ref precedence: pr_head_sha input, then the workflow_call ref input,
      # then the SHA that triggered the run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # The wheel artifact is fetched only when the check-changes output
      # sgl_kernel is 'true'.
      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      # Uses ci_install_deepep.sh rather than ci_install_dependency.sh.
      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_deepep.sh

      # Argument is a `<model>:<n>` spec; presumably `:4` is the GPU /
      # tensor-parallel count — confirm against the script.
      - name: Warmup DeepGEMM JIT Compilation
        timeout-minutes: 25
        run: |
          python3 scripts/ci/cuda/warmup_deep_gemm.py \
            lmsys/sglang-ci-dsv3-test:4

      # Same model spec as the DeepGEMM warmup above.
      - name: Warmup Server CUDA Graphs
        timeout-minutes: 25
        run: |
          python3 scripts/ci/cuda/warmup_server.py \
            lmsys/sglang-ci-dsv3-test:4

      # --continue-on-error is forwarded to run_suite.py only when the
      # check-changes output continue_on_error is 'true'.
      - name: Run test
        timeout-minutes: 20
        run: |
          cd test
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 run_suite.py --hw cuda --suite stage-c-test-deepep-4-gpu $CONTINUE_ON_ERROR_FLAG

      # Local composite action; runs even when an earlier step failed.
      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()

  # Executes the stage-c-test-deepep-8-gpu-h200 CUDA suite on the 8-gpu-h200
  # runner as a single, unpartitioned job, after two warmup steps.
  # Dependencies are installed with ci_install_deepep.sh. Gated on
  # wait-for-stage-b.
  stage-c-test-deepep-8-gpu-h200:
    needs: [check-changes, call-gate, wait-for-stage-b]
    # always() makes this condition get evaluated even when a needed job
    # failed or was skipped. The job runs when:
    #   - target_stage names this job, or
    #   - target_stage is empty, AND (the run is scheduled /
    #     test_parallel_dispatch, OR no earlier job failed or was cancelled),
    #     AND the check-changes output main_package or sgl_kernel is 'true'.
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-c-test-deepep-8-gpu-h200') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    runs-on: 8-gpu-h200
    timeout-minutes: 240
    env:
      # Same value as the runs-on label above.
      RUNNER_LABELS: 8-gpu-h200
    steps:
      # Ref precedence: pr_head_sha input, then the workflow_call ref input,
      # then the SHA that triggered the run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # The wheel artifact is fetched only when the check-changes output
      # sgl_kernel is 'true'.
      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      # Uses ci_install_deepep.sh rather than ci_install_dependency.sh.
      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/cuda/ci_install_deepep.sh

      # Arguments are `<model>:<n>` specs; presumably `:8` is the GPU /
      # tensor-parallel count — confirm against the script.
      - name: Warmup DeepGEMM JIT Compilation
        timeout-minutes: 25
        run: |
          python3 scripts/ci/cuda/warmup_deep_gemm.py \
            deepseek-ai/DeepSeek-V3-0324:8 \
            deepseek-ai/DeepSeek-V3.2-Exp:8

      # Only DeepSeek-V3-0324 is warmed up here; DeepSeek-V3.2-Exp from the
      # step above is not listed.
      - name: Warmup Server CUDA Graphs
        timeout-minutes: 25
        run: |
          python3 scripts/ci/cuda/warmup_server.py \
            deepseek-ai/DeepSeek-V3-0324:8

      # --continue-on-error is forwarded to run_suite.py only when the
      # check-changes output continue_on_error is 'true'.
      - name: Run test
        timeout-minutes: 45
        run: |
          cd test
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          python3 run_suite.py --hw cuda --suite stage-c-test-deepep-8-gpu-h200 $CONTINUE_ON_ERROR_FLAG

      # Local composite action; runs even when an earlier step failed.
      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()

  # Executes the stage-c-test-4-gpu-b200 CUDA suite on the runner named by the
  # check-changes output b200_runner, split across 3 matrix parts. Gated on
  # wait-for-stage-b.
  stage-c-test-4-gpu-b200:
    needs: [check-changes, call-gate, wait-for-stage-b]
    # always() makes this condition get evaluated even when a needed job
    # failed or was skipped. The job runs when:
    #   - target_stage names this job, or
    #   - target_stage is empty, AND (the run is scheduled /
    #     test_parallel_dispatch, OR no earlier job failed or was cancelled),
    #     AND the check-changes output main_package or sgl_kernel is 'true'.
    if: |
      always() &&
      (
        (inputs.target_stage == 'stage-c-test-4-gpu-b200') ||
        (
          !inputs.target_stage &&
          ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
        )
      )
    runs-on: ${{ needs.check-changes.outputs.b200_runner }}
    timeout-minutes: 240
    env:
      # Same value as the runs-on label above.
      RUNNER_LABELS: ${{ needs.check-changes.outputs.b200_runner }}
    strategy:
      # One failing part does not cancel the others.
      fail-fast: false
      matrix:
        # Must stay in sync with --auto-partition-size in the "Run test" step.
        part: [0, 1, 2]

    steps:
      # Ref precedence: pr_head_sha input, then the workflow_call ref input,
      # then the SHA that triggered the run.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

      # The wheel artifact is fetched only when the check-changes output
      # sgl_kernel is 'true'. Pinned to the same major version (v4) that
      # sgl-kernel-b200-test uses on this same b200_runner.
      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      # IS_BLACKWELL=1 is set per command here rather than in the job env.
      - name: Install dependencies
        timeout-minutes: 20
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 bash scripts/ci/cuda/ci_install_dependency.sh

      # Each matrix job passes its own part id and --timeout-per-file 1800;
      # --continue-on-error is forwarded only when the check-changes output
      # continue_on_error is 'true'.
      - name: Run test
        timeout-minutes: 30
        run: |
          cd test
          CONTINUE_ON_ERROR_FLAG=""
          if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
            CONTINUE_ON_ERROR_FLAG="--continue-on-error"
          fi
          IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite stage-c-test-4-gpu-b200 --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 1800 $CONTINUE_ON_ERROR_FLAG

      # Local composite action; runs even when an earlier step failed. The
      # part id is passed as the artifact suffix.
      - uses: ./.github/actions/upload-cuda-coredumps
        if: always()
        with:
          artifact-suffix: ${{ matrix.part }}

  # NOTE: GB200 stage temporarily disabled — no company-owned GB200 runner available yet.
  # Re-enable when a 4-gpu-gb200 runner is provisioned.
  # stage-c-test-4-gpu-gb200:
  #   needs: [check-changes, call-gate, wait-for-stage-b, sgl-kernel-build-wheels-arm]
  #   if: |
  #     always() &&
  #     (
  #       (inputs.target_stage == 'stage-c-test-4-gpu-gb200') ||
  #       (
  #         !inputs.target_stage &&
  #         ((github.event_name == 'schedule' || inputs.test_parallel_dispatch == true) || (!failure() && !cancelled())) &&
  #         ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
  #       )
  #     )
  #   runs-on: 4-gpu-gb200
  #   timeout-minutes: 240
  #   env:
  #     RUNNER_LABELS: 4-gpu-gb200
  #   strategy:
  #     fail-fast: false
  #   steps:
  #     - name: Checkout code
  #       uses: actions/checkout@v4
  #       with:
  #         ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
  #
  #     - name: Download artifacts
  #       if: needs.check-changes.outputs.sgl_kernel == 'true'
  #       uses: actions/download-artifact@v4
  #       with:
  #         path: sgl-kernel/dist/
  #         merge-multiple: true
  #         pattern: wheel-python3.10-cuda12.9-aarch64
  #
  #     - name: Install dependencies
  #       timeout-minutes: 20
  #       run: |
  #         CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} IS_BLACKWELL=1 GRACE_BLACKWELL=1 bash scripts/ci/cuda/ci_install_deepep.sh
  #
  #     - name: Run test
  #       timeout-minutes: 45
  #       run: |
  #         cd test
  #         CONTINUE_ON_ERROR_FLAG=""
  #         if [[ "${{ needs.check-changes.outputs.continue_on_error }}" == "true" ]]; then
  #           CONTINUE_ON_ERROR_FLAG="--continue-on-error"
  #         fi
  #         python3 run_suite.py --hw cuda --suite stage-c-test-4-gpu-gb200 --timeout-per-file 3600 $CONTINUE_ON_ERROR_FLAG
  #
  #     - uses: ./.github/actions/upload-cuda-coredumps
  #       if: always()

  # Aggregation gate for the workflow: runs after every job listed in `needs`
  # (regardless of their outcome, because of `if: always()`) and fails when any
  # of them reported `failure` or `cancelled`. Every other result is treated as
  # passing, so jobs that did not run do not fail this gate.
  pr-test-finish:
    needs:
      [
        call-gate,
        check-changes,

        sgl-kernel-build-wheels,
        sgl-kernel-unit-test,
        sgl-kernel-mla-test,
        sgl-kernel-benchmark-test,
        sgl-kernel-b200-test,

        wait-for-stage-a,
        wait-for-stage-b,

        jit-kernel-unit-test,

        multimodal-gen-test-1-gpu,
        multimodal-gen-test-2-gpu,

        stage-a-test-1,
        stage-a-cpu-only,
        stage-b-test-small-1-gpu,
        stage-b-test-large-1-gpu,
        stage-b-test-large-2-gpu,
        stage-b-test-4-gpu-b200,
        stage-c-test-4-gpu-h100,
        stage-c-test-8-gpu-h20,
        stage-c-test-8-gpu-h200,
        stage-c-test-deepep-4-gpu,
        stage-c-test-deepep-8-gpu-h200,
        stage-c-test-4-gpu-b200,
        # stage-c-test-4-gpu-gb200,  # Temporarily disabled — no GB200 runner
      ]
    if: always()
    runs-on: ubuntu-latest
    steps:
      - name: Check all dependent job statuses
        env:
          # Hand the 'needs' context to the script through the environment
          # instead of interpolating it into the shell source: a single quote
          # inside any job output would otherwise terminate the quoted string
          # and break (or inject commands into) the script.
          NEEDS_JSON: ${{ toJson(needs) }}
        run: |
          # Job names, in the order they appear in the 'needs' context
          job_names=$(jq -r 'keys_unsorted[]' <<<"$NEEDS_JSON")

          # Print every job's result before deciding, so a single log shows
          # the full picture instead of stopping at the first bad job.
          failed_jobs=()
          for job in $job_names; do
            result=$(jq -r --arg j "$job" '.[$j].result' <<<"$NEEDS_JSON")
            echo "$job: $result"

            if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
              failed_jobs+=("$job")
            fi
          done

          if (( ${#failed_jobs[@]} > 0 )); then
            echo "The following jobs failed or were cancelled: ${failed_jobs[*]}"
            exit 1
          fi

          # No dependency reported 'failure' or 'cancelled'
          echo "All jobs completed successfully"