File size: 12,212 Bytes
61ba51e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
#!/bin/bash
# Installs the checked-out SGLang, test-time dependencies and (if needed) AITER
# into the running `ci_sglang` container.
#
# Usage: see usage() below. The first non-option argument, if any, is taken as
# OPTIONAL_DEPS (additional python extras to install).
set -euo pipefail

# Prefer the hostname command; fall back to bash's $HOSTNAME so a missing
# binary does not abort the script (the arch detection further down already
# copes with names it cannot parse).
HOSTNAME_VALUE=$(hostname 2>/dev/null || printf '%s' "${HOSTNAME:-}")
GPU_ARCH="mi30x"   # default
SKIP_TT_DEPS=""
SKIP_SGLANG_BUILD=""
SKIP_AITER_BUILD=""
OPTIONAL_DEPS=""

# Print the command-line help to stdout.
usage() {
  echo "Usage: $0 [OPTIONS] [OPTIONAL_DEPS]"
  echo "Options:"
  echo "  --skip-sglang-build         Don't build checkout sglang, use what was shipped with the image"
  echo "  --skip-aiter-build          Don't build aiter, use what was shipped with the image"
  echo "  --skip-test-time-deps       Don't build miscellaneous dependencies"
}

# Parse leading options and the optional positional argument.
# Globals written: SKIP_AITER_BUILD, SKIP_SGLANG_BUILD, SKIP_TT_DEPS, OPTIONAL_DEPS
# Arguments:       the script's command line ("$@")
# Exits:           0 after printing help, 2 on an unrecognised option
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --skip-aiter-build) SKIP_AITER_BUILD="1"; shift ;;
      --skip-sglang-build) SKIP_SGLANG_BUILD="1"; shift ;;
      --skip-test-time-deps) SKIP_TT_DEPS="1"; shift ;;
      -h|--help)
        usage
        exit 0
        ;;
      -*)
        # Reject typos instead of silently treating them as OPTIONAL_DEPS,
        # which would produce an invalid extras list such as "dev_hip,--foo".
        echo "Unknown option: $1" >&2
        usage >&2
        exit 2
        ;;
      *) break ;;
    esac
  done

  # First remaining argument (if any) names extra python extras to install.
  OPTIONAL_DEPS="${1:-}"
}

parse_args "$@"

# Compose the pip extras list: always "dev_hip", plus the caller-supplied
# OPTIONAL_DEPS (comma-separated) when one was given.
EXTRAS="dev_hip${OPTIONAL_DEPS:+,${OPTIONAL_DEPS}}"
echo "Installing python extras: [${EXTRAS}]"

# Runner host names are expected to look like
#   linux-mi35x-gpu-1-xxxxx-runner-zzzzz
# The "miNNx" component becomes GPU_ARCH; anything else keeps the current value.
if ! [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
  echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}"
else
  GPU_ARCH="${BASH_REMATCH[1]}"
  echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
fi

# Prepare pip inside the CI container before installing anything else.
# Resetting ownership of the shared pip cache is best-effort: failures (e.g.
# concurrent access or temp files that vanished) are deliberately ignored.
if ! docker exec ci_sglang chown -R root:root /sgl-data/pip-cache 2>/dev/null; then
  :
fi
docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache --upgrade pip

# Helper function to install with retries and fallback PyPI mirror.
#
# Runs the given command up to 3 times, sleeping 5 seconds between attempts.
# If the command line contains "pip install", the final attempt is made with
# the Aliyun PyPI mirror appended as --index-url/--trusted-host.
#
# Arguments: "$@" - the command and its arguments, executed exactly as passed
#                   (argv form; no shell re-parsing is performed)
# Outputs:   progress messages on stdout
# Returns:   0 as soon as an attempt succeeds, 1 if every attempt fails or no
#            command was given
install_with_retry() {
  local max_attempts=3
  local attempt

  if (( $# == 0 )); then
    echo "install_with_retry: no command given" >&2
    return 1
  fi

  # Keep the command as an array and execute it directly. Flattening it into
  # a string and eval-ing it would re-split and re-glob every argument, so a
  # requirement such as "python[dev_hip]" could be expanded against files in
  # the current directory.
  local -a cmd=("$@")

  for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
    echo "Attempt $attempt/$max_attempts: ${cmd[*]}"
    if "${cmd[@]}"; then
      echo "Success!"
      return 0
    fi

    if (( attempt < max_attempts )); then
      echo "Failed, retrying in 5 seconds..."
      sleep 5
      # After the second failure, switch pip to an alternative index for the
      # last attempt.
      if [[ "${cmd[*]}" == *"pip install"* ]] && (( attempt == 2 )); then
        cmd+=(--index-url https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com)
        echo "Using fallback PyPI mirror: ${cmd[*]}"
      fi
    fi
  done

  echo "Failed after $max_attempts attempts"
  return 1
}

# Helper function to git clone with retries.
#
# Performs a shallow (--depth 1) clone, aborting stalled transfers via git's
# http.lowSpeedLimit/http.lowSpeedTime settings, and retries up to 3 times
# with a 5 second pause in between.
#
# Arguments: $1 - repository URL
#            $2 - destination directory (optional; when omitted git picks
#                 the directory name itself)
#            $3 - extra `git clone` arguments as one whitespace-separated
#                 string, e.g. "--branch v1.0" (optional)
# Outputs:   progress messages on stdout
# Returns:   0 on success, 1 after all attempts failed
git_clone_with_retry() {
  local repo_url="$1"
  local dest_dir="${2:-}"
  local branch_args="${3:-}"
  local max_attempts=3
  local attempt

  # Assemble the clone arguments once. Optional pieces are only added when
  # present so that git never receives an empty "" positional argument.
  local -a clone_args=(--depth 1)
  if [[ -n "$branch_args" ]]; then
    local -a extra_args=()
    read -r -a extra_args <<< "$branch_args"
    clone_args+=("${extra_args[@]}")
  fi
  clone_args+=("$repo_url")
  if [[ -n "$dest_dir" ]]; then
    clone_args+=("$dest_dir")
  fi

  for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
    echo "Git clone attempt $attempt/$max_attempts: $repo_url"

    # Remove any existing destination so a partial clone from a previous
    # attempt cannot block this one.
    if [[ -n "$dest_dir" && -d "$dest_dir" ]]; then
      rm -rf -- "$dest_dir"
    fi

    if git \
      -c http.lowSpeedLimit=1000 \
      -c http.lowSpeedTime=30 \
      clone "${clone_args[@]}"; then
      echo "Git clone succeeded."
      return 0
    fi

    if (( attempt < max_attempts )); then
      echo "Git clone failed, retrying in 5 seconds..."
      sleep 5
    fi
  done

  echo "Git clone failed after $max_attempts attempts: $repo_url"
  return 1
}

# Build and install the checked-out SGLang (sgl-kernel first, then the python
# package) inside the container, unless --skip-sglang-build was requested.
if [[ -z "$SKIP_SGLANG_BUILD" ]]; then
  # Drop whatever the image shipped with; a package that is not installed is fine.
  for pkg in sgl-kernel sglang; do
    docker exec ci_sglang pip uninstall "$pkg" -y || true
  done

  # Purge compiled Python caches (venv first, then the checkout) so the
  # latest code is used.
  for cache_root in /opt/venv /sglang-checkout; do
    docker exec ci_sglang find "$cache_root" -name "*.pyc" -delete || true
    docker exec ci_sglang find "$cache_root" -name "__pycache__" -type d -exec rm -rf {} + || true
  done

  # sgl-kernel: swap in the ROCm pyproject and build via setup_rocm.py.
  docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"

  # sglang python package: swap in pyproject_other.toml, then editable install
  # with the selected extras (paths are relative to the container's default
  # working directory).
  docker exec ci_sglang bash -c 'rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml'
  install_with_retry docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e "python[${EXTRAS}]"
else
  echo "Didn't build checkout SGLang"
fi

# Install miscellaneous test-time dependencies (evaluation harnesses, a dummy
# Grok model config, and a few pip packages) unless --skip-test-time-deps was
# requested.
if [[ -n "${SKIP_TT_DEPS}" ]]; then
  echo "Didn't build lmms_eval, human-eval, and others"
else
  # For lmms_evals evaluating MMMU (cloned directly inside the container at /)
  docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
  install_with_retry docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .

  # human-eval is cloned on the host (with retries) and then copied in.
  git_clone_with_retry https://github.com/akao-amd/human-eval.git human-eval
  docker cp human-eval ci_sglang:/
  install_with_retry docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .

  docker exec -w / ci_sglang mkdir -p /dummy-grok
  # Create dummy grok config inline (bypasses Azure blob storage which may have auth issues)
  mkdir -p dummy-grok
  cat > dummy-grok/config.json << 'EOF'
  {
    "architectures": [
      "Grok1ModelForCausalLM"
    ],
    "embedding_multiplier_scale": 78.38367176906169,
    "output_multiplier_scale": 0.5773502691896257,
    "vocab_size": 131072,
    "hidden_size": 6144,
    "intermediate_size": 32768,
    "max_position_embeddings": 8192,
    "num_experts_per_tok": 2,
    "num_local_experts": 8,
    "num_attention_heads": 48,
    "num_hidden_layers": 64,
    "num_key_value_heads": 8,
    "head_dim": 128,
    "rms_norm_eps": 1e-05,
    "rope_theta": 10000.0,
    "model_type": "mixtral",
    "torch_dtype": "bfloat16"
  }
EOF
  # The config above is written on the host; copy it into the container,
  # otherwise /dummy-grok there stays empty.
  docker cp ./dummy-grok ci_sglang:/
  # Previous download-based variant, kept for reference:
  # mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json

  # Quoted so the host shell cannot glob-expand the [hf_xet] extras marker.
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache 'huggingface_hub[hf_xet]'
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache pytest

  # Install cache-dit for qwen_image_t2i_cache_dit_enabled test (added in PR 16204)
  # Best-effort: a failure is reported but does not abort the script.
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache cache-dit || echo "cache-dit installation failed"

  # Install accelerate for distributed training and inference support (best-effort as well)
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache accelerate || echo "accelerate installation failed"
fi

# Nothing more to do when the caller asked to keep the image's AITER.
if [[ -n "${SKIP_AITER_BUILD}" ]]; then
  exit 0
fi

# Detect AITER version
#############################################
# Detect correct AITER_COMMIT for this runner
# + Check mismatch
# + Rebuild AITER if needed
#############################################

echo "[CI-AITER-CHECK] === AITER VERSION CHECK START ==="

# Path is relative to the directory the script is run from.
DOCKERFILE="docker/rocm.Dockerfile"

# GPU_ARCH
GPU_ARCH="${GPU_ARCH:-mi30x}"
echo "[CI-AITER-CHECK] Runner GPU_ARCH=${GPU_ARCH}"

#############################################
# 1. Extract AITER_COMMIT from correct Dockerfile block
#############################################
# Pick the Dockerfile stage that corresponds to this runner's GPU.
if [[ "${GPU_ARCH}" == "mi35x" ]]; then
    echo "[CI-AITER-CHECK] Using gfx950 block from Dockerfile..."
    AITER_STAGE_MARKER='FROM $BASE_IMAGE_950 AS gfx950'
else
    echo "[CI-AITER-CHECK] Using gfx942 block from Dockerfile..."
    AITER_STAGE_MARKER='FROM $BASE_IMAGE_942 AS gfx942'
fi

# Take the first AITER_COMMIT="..." within 20 lines after the stage marker.
# `|| true`: with `set -e -o pipefail` a non-matching grep (or an unreadable
# Dockerfile) would otherwise terminate the script right here, before the
# explicit error message below could be printed.
REPO_AITER_COMMIT=$(grep -F -A20 -- "${AITER_STAGE_MARKER}" "${DOCKERFILE}" \
                    | grep 'AITER_COMMIT=' \
                    | head -n1 \
                    | sed 's/.*AITER_COMMIT="\([^"]*\)".*/\1/') || true


if [[ -z "${REPO_AITER_COMMIT}" ]]; then
    echo "[CI-AITER-CHECK] ERROR: Failed to extract AITER_COMMIT from Dockerfile."
    exit 1
fi

echo "[CI-AITER-CHECK] Dockerfile expects AITER_COMMIT=${REPO_AITER_COMMIT}"

#############################################
# 2. Check container pre-installed AITER version
#############################################
# Query pip inside the container for amd-aiter's "Version:" field and prefix it
# with "v". If the docker exec itself fails the result is "vnone"; if pip
# reports nothing the result is just "v".
IMAGE_AITER_VERSION="v$(
  docker exec ci_sglang bash -c "pip show amd-aiter 2>/dev/null | grep '^Version:' | awk '{print \$2}'" \
    || echo "none"
)"
echo "[CI-AITER-CHECK] AITER version inside CI image: ${IMAGE_AITER_VERSION}"

#############################################
# 3. Decide rebuild
#############################################
# Outcome is stored in NEED_REBUILD ("true"/"false"). An explicit
# AITER_COMMIT_OVERRIDE always wins and also replaces REPO_AITER_COMMIT;
# otherwise the decision depends on the version string found in the image.
NEED_REBUILD="false"

if [[ -n "${AITER_COMMIT_OVERRIDE:-}" ]]; then
    echo "[CI-AITER-CHECK] AITER_COMMIT_OVERRIDE=${AITER_COMMIT_OVERRIDE} → forcing rebuild"
    REPO_AITER_COMMIT="${AITER_COMMIT_OVERRIDE}"
    NEED_REBUILD="true"
else
    case "${IMAGE_AITER_VERSION}" in
        vnone|v)
            echo "[CI-AITER-CHECK] No AITER found in image → rebuild needed"
            NEED_REBUILD="true"
            ;;
        "${REPO_AITER_COMMIT}")
            echo "[CI-AITER-CHECK] AITER version matches"
            ;;
        *dev*|*+g[0-9a-f]*)
            # Dev/patched version (contains 'dev' or git hash) → preserve it
            echo "[CI-AITER-CHECK] Dev/patched version detected: ${IMAGE_AITER_VERSION} → skipping rebuild"
            ;;
        *)
            echo "[CI-AITER-CHECK] Version mismatch: image=${IMAGE_AITER_VERSION}, repo=${REPO_AITER_COMMIT}"
            NEED_REBUILD="true"
            ;;
    esac
fi


#############################################
# 4. Rebuild AITER if needed
#############################################
# When NEED_REBUILD is "true": uninstall amd-aiter in the container, clone
# ROCm/aiter afresh into /sgl-workspace/aiter, check out REPO_AITER_COMMIT
# (with submodules), optionally hotpatch it for ROCm 7.2, and build it for the
# GPU target that matches GPU_ARCH. All steps run inside `ci_sglang`.
if [[ "${NEED_REBUILD}" == "true" ]]; then
    echo "[CI-AITER-CHECK] === AITER REBUILD START ==="

    # uninstall existing aiter (ignore the error if it is not installed)
    docker exec ci_sglang pip uninstall -y amd-aiter || true

    # delete old aiter directory
    docker exec ci_sglang rm -rf /sgl-workspace/aiter

    # clone a fresh copy to /sgl-workspace/aiter
    docker exec ci_sglang git clone https://github.com/ROCm/aiter.git /sgl-workspace/aiter

    # checkout correct version
    # NOTE: ${REPO_AITER_COMMIT} is expanded by the host shell before the
    # script string is handed to the container's bash.
    docker exec ci_sglang bash -c "
        cd /sgl-workspace/aiter && \
        git fetch --all && \
        git checkout ${REPO_AITER_COMMIT} && \
        git submodule update --init --recursive
    "

    # Map the runner architecture to the GPU target passed to the build:
    # mi35x -> gfx950, everything else -> gfx942.
    if [[ "${GPU_ARCH}" == "mi35x" ]]; then
        GPU_ARCH_LIST="gfx950"
    else
        GPU_ARCH_LIST="gfx942"
    fi
    echo "[CI-AITER-CHECK] GPU_ARCH_LIST=${GPU_ARCH_LIST}"

    # Re-apply Dockerfile hotpatches for ROCm 7.2 (the fresh clone lost them, can be removed after triton fixed this problem)
    # The ROCm version is read from /opt/rocm/.info/version in the container;
    # "unknown" is used when that file cannot be read.
    ROCM_VERSION=$(docker exec ci_sglang bash -c "cat /opt/rocm/.info/version 2>/dev/null || echo unknown")
    if [[ "${ROCM_VERSION}" == 7.2* ]]; then
        echo "[CI-AITER-CHECK] ROCm 7.2 detected (${ROCM_VERSION}), applying AITER hotpatches..."
        # The hotpatch rewrites the `if ...:` on line 459 of pa_mqa_logits.py
        # to `if False:`; it is skipped when the file does not exist.
        # The \" and \$ escapes keep quotes and TARGET_FILE for the inner shell.
        docker exec ci_sglang bash -c "
            cd /sgl-workspace/aiter && \
            TARGET_FILE='aiter/ops/triton/attention/pa_mqa_logits.py' && \
            if [ -f \"\${TARGET_FILE}\" ]; then \
                sed -i '459 s/if.*:/if False:/' \"\${TARGET_FILE}\" && \
                echo '[CI-AITER-CHECK] Hotpatch applied to pa_mqa_logits.py'; \
            else \
                echo '[CI-AITER-CHECK] pa_mqa_logits.py not found, skipping hotpatch'; \
            fi
        "
    else
        echo "[CI-AITER-CHECK] ROCm version=${ROCM_VERSION}, no hotpatch needed"
    fi

    # build AITER (GPU_ARCHS is passed to setup.py through the environment)
    docker exec ci_sglang bash -c "
        cd /sgl-workspace/aiter && \
        GPU_ARCHS=${GPU_ARCH_LIST} python3 setup.py develop
    "

    echo "[CI-AITER-CHECK] === AITER REBUILD COMPLETE ==="
fi

# Printed whether or not a rebuild took place.
echo "[CI-AITER-CHECK] === AITER VERSION CHECK END ==="


# # Clear pre-built AITER kernels from Docker image to avoid segfaults
# # The Docker image may contain pre-compiled kernels incompatible with the current environment
# echo "Clearing pre-built AITER kernels from Docker image..."
# docker exec ci_sglang find /sgl-workspace/aiter/aiter/jit -name "*.so" -delete 2>/dev/null || true
# docker exec ci_sglang ls -la /sgl-workspace/aiter/aiter/jit/ 2>/dev/null || echo "jit dir empty or not found"

# # Pre-build AITER kernels to avoid timeout during tests
# echo "Warming up AITER JIT kernels..."
# docker exec -e SGLANG_USE_AITER=1 ci_sglang python3 /sglang-checkout/scripts/ci/amd/amd_ci_warmup_aiter.py || echo "AITER warmup completed (some kernels may not be available)"