#!/bin/bash
# Strict mode: abort on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Name of the machine this script runs on.
HOSTNAME_VALUE="$(hostname)"
GPU_ARCH="mi30x" # default

# Skip flags: empty means "run the step", "1" means "skip it".
SKIP_TT_DEPS=""
SKIP_SGLANG_BUILD=""
SKIP_AITER_BUILD=""

# Consume leading option flags; the first argument that is not a known flag
# ends option parsing and is left in place as $1.
until (( $# == 0 )); do
  case "$1" in
    --skip-aiter-build)
      SKIP_AITER_BUILD="1"
      ;;
    --skip-sglang-build)
      SKIP_SGLANG_BUILD="1"
      ;;
    --skip-test-time-deps)
      SKIP_TT_DEPS="1"
      ;;
    -h | --help)
      printf '%s\n' \
        "Usage: $0 [OPTIONS] [OPTIONAL_DEPS]" \
        "Options:" \
        " --skip-sglang-build Don't build checkout sglang, use what was shipped with the image" \
        " --skip-aiter-build Don't build aiter, use what was shipped with the image" \
        " --skip-test-time-deps Don't build miscellaneous dependencies"
      exit 0
      ;;
    *)
      break
      ;;
  esac
  shift
done
# First remaining positional argument (may be absent): extra pip "extras" to
# install in addition to the default one.
OPTIONAL_DEPS="${1:-}"

# Build python extras: always dev_hip, followed by ",<OPTIONAL_DEPS>" when given.
EXTRAS="dev_hip${OPTIONAL_DEPS:+,${OPTIONAL_DEPS}}"
printf '%s\n' "Installing python extras: [${EXTRAS}]"
# Host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
# The "miNNx" token after "linux-" becomes GPU_ARCH; if the host name does not
# follow that shape, GPU_ARCH keeps its current value and a warning is printed.
if [[ ! "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
  echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}"
else
  # BASH_REMATCH is populated by the =~ above even though the test is negated.
  GPU_ARCH="${BASH_REMATCH[1]}"
  echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
fi
# Install the required dependencies in CI.
# Hand the shared pip cache to root inside the container. This is best-effort:
# errors (concurrent access, temp files that vanished) are silenced and ignored.
if ! docker exec ci_sglang chown -R root:root /sgl-data/pip-cache 2>/dev/null; then
  :
fi
# Bring pip itself up to date before installing anything else.
docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache --upgrade pip
# Helper function to install with retries and fallback PyPI mirror
#######################################
# Run a command up to three times, pausing 5 seconds between attempts.
# The command is executed exactly as passed (one array element per argument),
# so arguments containing spaces or glob characters such as "python[dev_hip]"
# are never re-split or re-expanded by the shell.
# If the command line contains "pip install", the final attempt is run with an
# alternative PyPI index appended.
# Arguments:
#   $@ - command and its arguments, as separate words
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0 as soon as one attempt succeeds, 1 after all attempts failed,
#   2 when called without a command.
#######################################
install_with_retry() {
  if (( $# == 0 )); then
    echo "install_with_retry: no command given" >&2
    return 2
  fi

  local -r max_attempts=3
  local -a cmd=("$@")
  local attempt

  for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
    echo "Attempt $attempt/$max_attempts: ${cmd[*]}"
    if "${cmd[@]}"; then
      echo "Success!"
      return 0
    fi
    if (( attempt < max_attempts )); then
      echo "Failed, retrying in 5 seconds..."
      sleep 5
      # Try with alternative PyPI index on retry (added once, after the second
      # failure, so only the last attempt uses the mirror).
      if (( attempt == 2 )) && [[ "${cmd[*]}" == *"pip install"* ]]; then
        cmd+=(--index-url https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com)
        echo "Using fallback PyPI mirror: ${cmd[*]}"
      fi
    fi
  done

  echo "Failed after $max_attempts attempts"
  return 1
}
# Helper function to git clone with retries
#######################################
# Shallow-clone (--depth 1) a repository, retrying up to three times with a
# 5 second pause between attempts. Transfers slower than 1000 bytes/s for
# 30 seconds are aborted by git (http.lowSpeedLimit / http.lowSpeedTime).
# Arguments:
#   $1 - repository URL (required)
#   $2 - optional destination directory. If it already exists it is removed
#        before every attempt so a partial clone cannot block a retry. When
#        omitted, no directory argument is passed and git picks the name.
#   $3 - optional extra `git clone` options as one whitespace-separated
#        string, e.g. "--branch v1.0"
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0 on success, 1 after all attempts failed.
#######################################
git_clone_with_retry() {
  local repo_url="${1:?git_clone_with_retry: repository URL required}"
  local dest_dir="${2:-}"
  local branch_args="${3:-}"
  local -r max_attempts=3
  local attempt
  local -a extra_opts=()
  local -a clone_cmd=(
    git
    -c http.lowSpeedLimit=1000
    -c http.lowSpeedTime=30
    clone --depth 1
  )

  # Split the extra options on whitespace without glob-expanding them.
  if [[ -n "$branch_args" ]]; then
    read -r -a extra_opts <<< "$branch_args"
    if (( ${#extra_opts[@]} > 0 )); then
      clone_cmd+=("${extra_opts[@]}")
    fi
  fi
  clone_cmd+=("$repo_url")
  # Only pass a destination when one was given; an empty-string argument
  # would be taken by git as a (invalid) directory name.
  if [[ -n "$dest_dir" ]]; then
    clone_cmd+=("$dest_dir")
  fi

  for (( attempt = 1; attempt <= max_attempts; attempt++ )); do
    echo "Git clone attempt $attempt/$max_attempts: $repo_url"
    # prevent from partial clone
    if [[ -n "$dest_dir" && -d "$dest_dir" ]]; then
      rm -rf -- "$dest_dir"
    fi
    if "${clone_cmd[@]}"; then
      echo "Git clone succeeded."
      return 0
    fi
    if (( attempt < max_attempts )); then
      echo "Git clone failed, retrying in 5 seconds..."
      sleep 5
    fi
  done

  echo "Git clone failed after $max_attempts attempts: $repo_url"
  return 1
}
# Install checkout sglang
# Unless --skip-sglang-build was given: remove the packages shipped with the
# image, purge stale bytecode, build sgl-kernel for ROCm, then install the
# checked-out sglang in editable mode with the selected extras.
if [[ -z "$SKIP_SGLANG_BUILD" ]]; then
  # A failed uninstall is tolerated.
  for stale_pkg in sgl-kernel sglang; do
    docker exec ci_sglang pip uninstall "$stale_pkg" -y || true
  done

  # Clear Python cache to ensure latest code is used: first the virtualenv,
  # then the sglang checkout. Failures here are ignored.
  for cache_root in /opt/venv /sglang-checkout; do
    docker exec ci_sglang find "$cache_root" -name "*.pyc" -delete || true
    docker exec ci_sglang find "$cache_root" -name "__pycache__" -type d -exec rm -rf {} + || true
  done

  # Swap in the ROCm pyproject and build/install sgl-kernel.
  docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
  # Swap in the alternative pyproject for the python package, then install it.
  docker exec ci_sglang bash -c 'rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml'
  install_with_retry docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e "python[${EXTRAS}]"
else
  echo "Didn't build checkout SGLang"
fi
# Test-time dependencies (lmms-eval, human-eval, dummy Grok config, misc pip
# packages). Skipped entirely when --skip-test-time-deps was given.
if [[ -n "${SKIP_TT_DEPS}" ]]; then
  echo "Didn't build lmms_eval, human-eval, and others"
else
  # For lmms_evals evaluating MMMU
  docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
  install_with_retry docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .

  # human-eval is cloned on the host and then copied into the container.
  git_clone_with_retry https://github.com/akao-amd/human-eval.git human-eval
  docker cp human-eval ci_sglang:/
  install_with_retry docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .

  docker exec -w / ci_sglang mkdir -p /dummy-grok
  # Create dummy grok config inline (bypasses Azure blob storage which may have auth issues).
  # It was previously downloaded from
  # https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json
  mkdir -p dummy-grok
  cat > dummy-grok/config.json << 'EOF'
{
"architectures": [
"Grok1ModelForCausalLM"
],
"embedding_multiplier_scale": 78.38367176906169,
"output_multiplier_scale": 0.5773502691896257,
"vocab_size": 131072,
"hidden_size": 6144,
"intermediate_size": 32768,
"max_position_embeddings": 8192,
"num_experts_per_tok": 2,
"num_local_experts": 8,
"num_attention_heads": 48,
"num_hidden_layers": 64,
"num_key_value_heads": 8,
"head_dim": 128,
"rms_norm_eps": 1e-05,
"rope_theta": 10000.0,
"model_type": "mixtral",
"torch_dtype": "bfloat16"
}
EOF
  # Copy the generated config into the container. Without this step the file
  # exists only on the host and /dummy-grok inside the container stays empty.
  docker cp ./dummy-grok ci_sglang:/

  # Quoted so the brackets are never treated as a glob pattern by the shell.
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache "huggingface_hub[hf_xet]"
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache pytest
  # Install cache-dit for qwen_image_t2i_cache_dit_enabled test (added in PR 16204)
  # Best-effort: a failure is reported but does not abort the script.
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache cache-dit || echo "cache-dit installation failed"
  # Install accelerate for distributed training and inference support (best-effort as well)
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache accelerate || echo "accelerate installation failed"
fi
# Nothing AITER-related is done when --skip-aiter-build was given.
if [[ -n "${SKIP_AITER_BUILD}" ]]; then
  exit 0
fi

# Detect AITER version
#############################################
# Detect correct AITER_COMMIT for this runner
# + Check mismatch
# + Rebuild AITER if needed
#############################################
echo "[CI-AITER-CHECK] === AITER VERSION CHECK START ==="
DOCKERFILE="docker/rocm.Dockerfile"

# GPU_ARCH
GPU_ARCH="${GPU_ARCH:-mi30x}"
echo "[CI-AITER-CHECK] Runner GPU_ARCH=${GPU_ARCH}"

#############################################
# 1. Extract AITER_COMMIT from correct Dockerfile block
#############################################
# The stage header is matched as a fixed string, so "$BASE_IMAGE_..." must stay
# literal (hence the single quotes).
if [[ "${GPU_ARCH}" == "mi35x" ]]; then
  echo "[CI-AITER-CHECK] Using gfx950 block from Dockerfile..."
  AITER_STAGE_HEADER='FROM $BASE_IMAGE_950 AS gfx950'
else
  echo "[CI-AITER-CHECK] Using gfx942 block from Dockerfile..."
  AITER_STAGE_HEADER='FROM $BASE_IMAGE_942 AS gfx942'
fi

# Take the first AITER_COMMIT="..." within 20 lines after the stage header.
# With errexit + pipefail active, a grep that matches nothing makes the whole
# assignment fail and would abort the script before the explicit error below
# could be printed; "|| true" keeps whatever was captured and lets the
# emptiness check report the problem.
REPO_AITER_COMMIT=$(
  grep -F -A20 -- "${AITER_STAGE_HEADER}" "${DOCKERFILE}" \
    | grep 'AITER_COMMIT=' \
    | head -n1 \
    | sed 's/.*AITER_COMMIT="\([^"]*\)".*/\1/'
) || true

if [[ -z "${REPO_AITER_COMMIT}" ]]; then
  echo "[CI-AITER-CHECK] ERROR: Failed to extract AITER_COMMIT from Dockerfile."
  exit 1
fi
echo "[CI-AITER-CHECK] Dockerfile expects AITER_COMMIT=${REPO_AITER_COMMIT}"
#############################################
# 2. Check container pre-installed AITER version
#############################################
# Version field of `pip show amd-aiter` inside the container, prefixed with
# "v". "none" is appended to the captured output when the docker exec call
# itself returns non-zero; an empty version yields just "v".
IMAGE_AITER_VERSION="v$(docker exec ci_sglang bash -c "pip show amd-aiter 2>/dev/null | grep '^Version:' | awk '{print \$2}'" || echo "none")"
echo "[CI-AITER-CHECK] AITER version inside CI image: ${IMAGE_AITER_VERSION}"

#############################################
# 3. Decide rebuild
#############################################
# Precedence: explicit override > nothing installed > exact match >
# dev/patched build (kept) > any other mismatch (rebuild).
NEED_REBUILD="false"
if [[ -n "${AITER_COMMIT_OVERRIDE:-}" ]]; then
  echo "[CI-AITER-CHECK] AITER_COMMIT_OVERRIDE=${AITER_COMMIT_OVERRIDE} → forcing rebuild"
  REPO_AITER_COMMIT="${AITER_COMMIT_OVERRIDE}"
  NEED_REBUILD="true"
else
  case "${IMAGE_AITER_VERSION}" in
    vnone | v)
      echo "[CI-AITER-CHECK] No AITER found in image → rebuild needed"
      NEED_REBUILD="true"
      ;;
    "${REPO_AITER_COMMIT}")
      echo "[CI-AITER-CHECK] AITER version matches"
      ;;
    *)
      if [[ "${IMAGE_AITER_VERSION}" =~ (dev|\+g[0-9a-f]+) ]]; then
        # Dev/patched version (contains 'dev' or git hash) → preserve it
        echo "[CI-AITER-CHECK] Dev/patched version detected: ${IMAGE_AITER_VERSION} → skipping rebuild"
      else
        echo "[CI-AITER-CHECK] Version mismatch: image=${IMAGE_AITER_VERSION}, repo=${REPO_AITER_COMMIT}"
        NEED_REBUILD="true"
      fi
      ;;
  esac
fi
#############################################
# 4. Rebuild AITER if needed
#############################################
# Runs only when NEED_REBUILD is "true". Every step is executed inside the
# ci_sglang container: remove the installed amd-aiter, re-clone the repository,
# check out REPO_AITER_COMMIT, optionally hotpatch for ROCm 7.2, then build for
# the GPU target matching GPU_ARCH.
if [[ "${NEED_REBUILD}" == "true" ]]; then
echo "[CI-AITER-CHECK] === AITER REBUILD START ==="
# uninstall existing aiter (a failing uninstall is tolerated)
docker exec ci_sglang pip uninstall -y amd-aiter || true
# delete old aiter directory
docker exec ci_sglang rm -rf /sgl-workspace/aiter
# clone a fresh copy to /sgl-workspace/aiter
docker exec ci_sglang git clone https://github.com/ROCm/aiter.git /sgl-workspace/aiter
# checkout correct version
# ${REPO_AITER_COMMIT} is expanded by this (outer) shell before the command
# string is handed to bash in the container.
docker exec ci_sglang bash -c "
cd /sgl-workspace/aiter && \
git fetch --all && \
git checkout ${REPO_AITER_COMMIT} && \
git submodule update --init --recursive
"
# Map the runner architecture to the build target: mi35x -> gfx950,
# anything else -> gfx942.
if [[ "${GPU_ARCH}" == "mi35x" ]]; then
GPU_ARCH_LIST="gfx950"
else
GPU_ARCH_LIST="gfx942"
fi
echo "[CI-AITER-CHECK] GPU_ARCH_LIST=${GPU_ARCH_LIST}"
# Re-apply Dockerfile hotpatches for ROCm 7.2 (the fresh clone lost them, can be removed after triton fixed this problem)
# The ROCm version is read from /opt/rocm/.info/version in the container;
# "unknown" is used when that file cannot be read.
ROCM_VERSION=$(docker exec ci_sglang bash -c "cat /opt/rocm/.info/version 2>/dev/null || echo unknown")
if [[ "${ROCM_VERSION}" == 7.2* ]]; then
echo "[CI-AITER-CHECK] ROCm 7.2 detected (${ROCM_VERSION}), applying AITER hotpatches..."
# \${TARGET_FILE} is escaped so it is expanded by the container's shell, not
# here. The sed expression rewrites the "if ...:" text on line 459 of the
# target file to "if False:".
# NOTE(review): the patch is tied to a hard-coded line number; confirm it still
# points at the intended statement whenever the AITER commit changes.
docker exec ci_sglang bash -c "
cd /sgl-workspace/aiter && \
TARGET_FILE='aiter/ops/triton/attention/pa_mqa_logits.py' && \
if [ -f \"\${TARGET_FILE}\" ]; then \
sed -i '459 s/if.*:/if False:/' \"\${TARGET_FILE}\" && \
echo '[CI-AITER-CHECK] Hotpatch applied to pa_mqa_logits.py'; \
else \
echo '[CI-AITER-CHECK] pa_mqa_logits.py not found, skipping hotpatch'; \
fi
"
else
echo "[CI-AITER-CHECK] ROCm version=${ROCM_VERSION}, no hotpatch needed"
fi
# build AITER
# ${GPU_ARCH_LIST} is expanded by the outer shell and passed to setup.py
# through the GPU_ARCHS environment variable.
docker exec ci_sglang bash -c "
cd /sgl-workspace/aiter && \
GPU_ARCHS=${GPU_ARCH_LIST} python3 setup.py develop
"
echo "[CI-AITER-CHECK] === AITER REBUILD COMPLETE ==="
fi
echo "[CI-AITER-CHECK] === AITER VERSION CHECK END ==="
# NOTE: The AITER kernel cleanup and JIT warmup steps below are currently
# disabled (commented out); they are kept for reference only.
# # Clear pre-built AITER kernels from Docker image to avoid segfaults
# # The Docker image may contain pre-compiled kernels incompatible with the current environment
# echo "Clearing pre-built AITER kernels from Docker image..."
# docker exec ci_sglang find /sgl-workspace/aiter/aiter/jit -name "*.so" -delete 2>/dev/null || true
# docker exec ci_sglang ls -la /sgl-workspace/aiter/aiter/jit/ 2>/dev/null || echo "jit dir empty or not found"
# # Pre-build AITER kernels to avoid timeout during tests
# echo "Warming up AITER JIT kernels..."
# docker exec -e SGLANG_USE_AITER=1 ci_sglang python3 /sglang-checkout/scripts/ci/amd/amd_ci_warmup_aiter.py || echo "AITER warmup completed (some kernels may not be available)"