#!/bin/bash
#
# Install SGLang, its test-time dependencies and (if needed) a matching AITER
# build into the running CI container named "ci_sglang".
#
# Usage: <script> [OPTIONS] [OPTIONAL_DEPS]
#   OPTIONAL_DEPS           Extra python extras, appended to "dev_hip".
#   --skip-sglang-build     Keep the sglang shipped with the image.
#   --skip-aiter-build      Keep the aiter shipped with the image.
#   --skip-test-time-deps   Skip lmms-eval, human-eval and friends.
#
# Environment:
#   AITER_COMMIT_OVERRIDE   When non-empty, forces an AITER rebuild at that ref.
set -euo pipefail

HOSTNAME_VALUE=$(hostname)
GPU_ARCH="mi30x" # default
SKIP_TT_DEPS=""
SKIP_SGLANG_BUILD=""
SKIP_AITER_BUILD=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    --skip-aiter-build)
      SKIP_AITER_BUILD="1"
      shift
      ;;
    --skip-sglang-build)
      SKIP_SGLANG_BUILD="1"
      shift
      ;;
    --skip-test-time-deps)
      SKIP_TT_DEPS="1"
      shift
      ;;
    -h | --help)
      echo "Usage: $0 [OPTIONS] [OPTIONAL_DEPS]"
      echo "Options:"
      echo " --skip-sglang-build Don't build checkout sglang, use what was shipped with the image"
      echo " --skip-aiter-build Don't build aiter, use what was shipped with the image"
      echo " --skip-test-time-deps Don't build miscellaneous dependencies"
      exit 0
      ;;
    -*)
      # A mistyped flag must not silently become the pip extras name.
      echo "Unknown option: $1" >&2
      exit 2
      ;;
    *)
      break
      ;;
  esac
done

OPTIONAL_DEPS="${1:-}"

# Build python extras
EXTRAS="dev_hip"
if [[ -n "$OPTIONAL_DEPS" ]]; then
  EXTRAS="dev_hip,${OPTIONAL_DEPS}"
fi
echo "Installing python extras: [${EXTRAS}]"

# Host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
if [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
  GPU_ARCH="${BASH_REMATCH[1]}"
  echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
else
  echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}"
fi

# Install the required dependencies in CI.
# Fix permissions on pip cache, ignore errors from concurrent access or missing temp files
docker exec ci_sglang chown -R root:root /sgl-data/pip-cache 2>/dev/null || true
docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache --upgrade pip

#######################################
# Run a command, retrying up to 3 times with a 5 second pause in between.
# If the command line contains "pip install", the third attempt is made
# against a fallback PyPI mirror.
# Arguments: the command and its arguments, one word per argument.
# Returns:   0 on the first successful attempt, 1 if all attempts fail.
#######################################
install_with_retry() {
  local max_attempts=3
  local attempt
  # Keep the command as an array (no eval) so every argument reaches the
  # program exactly as the caller quoted it, e.g. "python[dev_hip,...]".
  local -a cmd=("$@")

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    echo "Attempt $attempt/$max_attempts: ${cmd[*]}"
    if "${cmd[@]}"; then
      echo "Success!"
      return 0
    fi

    if ((attempt < max_attempts)); then
      echo "Failed, retrying in 5 seconds..."
      sleep 5
      # Try with alternative PyPI index on retry
      if [[ "${cmd[*]}" == *"pip install"* ]] && ((attempt == 2)); then
        cmd+=(--index-url https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com)
        echo "Using fallback PyPI mirror: ${cmd[*]}"
      fi
    fi
  done

  echo "Failed after $max_attempts attempts"
  return 1
}

#######################################
# Shallow-clone a git repository, retrying up to 3 times.
# Arguments:
#   $1 - repository URL
#   $2 - destination directory (optional; git picks the name when omitted)
#   $3 - extra clone arguments as one whitespace-separated string (optional),
#        e.g. "--branch v1.0"
# Returns:   0 on success, 1 if all attempts fail.
#######################################
git_clone_with_retry() {
  local repo_url="$1"
  local dest_dir="${2:-}"
  local branch_args="${3:-}"
  local max_attempts=3
  local attempt
  local -a extra_args=()
  local -a clone_cmd=(git -c http.lowSpeedLimit=1000 -c http.lowSpeedTime=30 clone --depth 1)

  if [[ -n "$branch_args" ]]; then
    read -r -a extra_args <<<"$branch_args"
    # The ${arr[@]+...} form stays safe under `set -u` on older bash if the
    # string was whitespace-only and the array ended up empty.
    clone_cmd+=(${extra_args[@]+"${extra_args[@]}"})
  fi
  clone_cmd+=("$repo_url")
  # Only pass a destination when one was given; an empty "" argument would be
  # handed to git as a (bogus) directory name.
  if [[ -n "$dest_dir" ]]; then
    clone_cmd+=("$dest_dir")
  fi

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    echo "Git clone attempt $attempt/$max_attempts: $repo_url"

    # prevent from partial clone
    if [[ -n "$dest_dir" && -d "$dest_dir" ]]; then
      rm -rf -- "$dest_dir"
    fi

    if "${clone_cmd[@]}"; then
      echo "Git clone succeeded."
      return 0
    fi

    if ((attempt < max_attempts)); then
      echo "Git clone failed, retrying in 5 seconds..."
      sleep 5
    fi
  done

  echo "Git clone failed after $max_attempts attempts: $repo_url"
  return 1
}

# Install checkout sglang
if [[ -n "$SKIP_SGLANG_BUILD" ]]; then
  echo "Didn't build checkout SGLang"
else
  docker exec ci_sglang pip uninstall sgl-kernel -y || true
  docker exec ci_sglang pip uninstall sglang -y || true

  # Clear Python cache to ensure latest code is used
  docker exec ci_sglang find /opt/venv -name "*.pyc" -delete || true
  docker exec ci_sglang find /opt/venv -name "__pycache__" -type d -exec rm -rf {} + || true
  # Also clear cache in sglang-checkout
  docker exec ci_sglang find /sglang-checkout -name "*.pyc" -delete || true
  docker exec ci_sglang find /sglang-checkout -name "__pycache__" -type d -exec rm -rf {} + || true

  docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"

  docker exec ci_sglang bash -c 'rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml'
  install_with_retry docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e "python[${EXTRAS}]"
fi

if [[ -n "${SKIP_TT_DEPS}" ]]; then
  echo "Didn't build lmms_eval, human-eval, and others"
else
  # For lmms_evals evaluating MMMU
  docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
  install_with_retry docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .

  git_clone_with_retry https://github.com/akao-amd/human-eval.git human-eval
  docker cp human-eval ci_sglang:/
  install_with_retry docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .

  docker exec -w / ci_sglang mkdir -p /dummy-grok
  # Create dummy grok config inline (bypasses Azure blob storage which may have auth issues)
  mkdir -p dummy-grok
  cat >dummy-grok/config.json <<'EOF'
{
  "architectures": [
    "Grok1ModelForCausalLM"
  ],
  "embedding_multiplier_scale": 78.38367176906169,
  "output_multiplier_scale": 0.5773502691896257,
  "vocab_size": 131072,
  "hidden_size": 6144,
  "intermediate_size": 32768,
  "max_position_embeddings": 8192,
  "num_experts_per_tok": 2,
  "num_local_experts": 8,
  "num_attention_heads": 48,
  "num_hidden_layers": 64,
  "num_key_value_heads": 8,
  "head_dim": 128,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "model_type": "mixtral",
  "torch_dtype": "bfloat16"
}
EOF
  # Copy the generated config into the container; without this step the file
  # only exists on the host and /dummy-grok in the container stays empty.
  # NOTE(review): confirm the tests read /dummy-grok/config.json in-container.
  docker cp ./dummy-grok ci_sglang:/
  # Previous download-based approach, kept for reference:
  # mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json

  # The extras spec is quoted so the brackets are never treated as a glob.
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache "huggingface_hub[hf_xet]"
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache pytest

  # Install cache-dit for qwen_image_t2i_cache_dit_enabled test (added in PR 16204)
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache cache-dit || echo "cache-dit installation failed"

  # Install accelerate for distributed training and inference support
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache accelerate || echo "accelerate installation failed"
fi

if [[ -n "${SKIP_AITER_BUILD}" ]]; then
  exit 0
fi

# Detect AITER version
#############################################
# Detect correct AITER_COMMIT for this runner
# + Check mismatch
# + Rebuild AITER if needed
#############################################

echo "[CI-AITER-CHECK] === AITER VERSION CHECK START ==="

DOCKERFILE="docker/rocm.Dockerfile"

# GPU_ARCH was set at the top of the script (hostname match or mi30x default).
echo "[CI-AITER-CHECK] Runner GPU_ARCH=${GPU_ARCH}"

#############################################
# 1. Extract AITER_COMMIT from correct Dockerfile block
#############################################
# shellcheck disable=SC2016 # the "$BASE_IMAGE_*" text is a literal grep pattern
if [[ "${GPU_ARCH}" == "mi35x" ]]; then
  echo "[CI-AITER-CHECK] Using gfx950 block from Dockerfile..."
  DOCKERFILE_STAGE='FROM $BASE_IMAGE_950 AS gfx950'
else
  echo "[CI-AITER-CHECK] Using gfx942 block from Dockerfile..."
  DOCKERFILE_STAGE='FROM $BASE_IMAGE_942 AS gfx942'
fi

# `|| true`: under `set -e -o pipefail` a non-matching grep would otherwise
# abort the script right here, before the explicit error message below.
REPO_AITER_COMMIT=$(
  grep -F -A20 -- "${DOCKERFILE_STAGE}" "${DOCKERFILE}" \
    | grep 'AITER_COMMIT=' \
    | head -n1 \
    | sed 's/.*AITER_COMMIT="\([^"]*\)".*/\1/'
) || true

if [[ -z "${REPO_AITER_COMMIT}" ]]; then
  echo "[CI-AITER-CHECK] ERROR: Failed to extract AITER_COMMIT from Dockerfile."
  exit 1
fi

echo "[CI-AITER-CHECK] Dockerfile expects AITER_COMMIT=${REPO_AITER_COMMIT}"

#############################################
# 2. Check container pre-installed AITER version
#############################################
IMAGE_AITER_VERSION=$(docker exec ci_sglang bash -c "pip show amd-aiter 2>/dev/null | grep '^Version:' | awk '{print \$2}'" || echo "none")
# Prefix with "v" so the value can be compared against the Dockerfile ref.
IMAGE_AITER_VERSION="v${IMAGE_AITER_VERSION}"
echo "[CI-AITER-CHECK] AITER version inside CI image: ${IMAGE_AITER_VERSION}"

#############################################
# 3. Decide rebuild
#############################################
NEED_REBUILD="false"

if [[ -n "${AITER_COMMIT_OVERRIDE:-}" ]]; then
  echo "[CI-AITER-CHECK] AITER_COMMIT_OVERRIDE=${AITER_COMMIT_OVERRIDE} → forcing rebuild"
  REPO_AITER_COMMIT="${AITER_COMMIT_OVERRIDE}"
  NEED_REBUILD="true"
elif [[ "${IMAGE_AITER_VERSION}" == "vnone" || "${IMAGE_AITER_VERSION}" == "v" ]]; then
  echo "[CI-AITER-CHECK] No AITER found in image → rebuild needed"
  NEED_REBUILD="true"
elif [[ "${IMAGE_AITER_VERSION}" == "${REPO_AITER_COMMIT}" ]]; then
  echo "[CI-AITER-CHECK] AITER version matches"
elif [[ "${IMAGE_AITER_VERSION}" =~ (dev|\+g[0-9a-f]+) ]]; then
  # Dev/patched version (contains 'dev' or git hash) → preserve it
  echo "[CI-AITER-CHECK] Dev/patched version detected: ${IMAGE_AITER_VERSION} → skipping rebuild"
else
  echo "[CI-AITER-CHECK] Version mismatch: image=${IMAGE_AITER_VERSION}, repo=${REPO_AITER_COMMIT}"
  NEED_REBUILD="true"
fi

#############################################
# 4. Rebuild AITER if needed
#############################################
if [[ "${NEED_REBUILD}" == "true" ]]; then
  echo "[CI-AITER-CHECK] === AITER REBUILD START ==="

  # uninstall existing aiter
  docker exec ci_sglang pip uninstall -y amd-aiter || true

  # delete old aiter directory
  docker exec ci_sglang rm -rf /sgl-workspace/aiter

  # clone a fresh copy to /sgl-workspace/aiter
  docker exec ci_sglang git clone https://github.com/ROCm/aiter.git /sgl-workspace/aiter

  # checkout correct version
  # The ref is passed as a positional argument rather than spliced into the
  # script text, so an unusual AITER_COMMIT_OVERRIDE cannot alter the command.
  docker exec ci_sglang bash -c '
    cd /sgl-workspace/aiter && \
    git fetch --all && \
    git checkout "$1" && \
    git submodule update --init --recursive
  ' _ "${REPO_AITER_COMMIT}"

  if [[ "${GPU_ARCH}" == "mi35x" ]]; then
    GPU_ARCH_LIST="gfx950"
  else
    GPU_ARCH_LIST="gfx942"
  fi
  echo "[CI-AITER-CHECK] GPU_ARCH_LIST=${GPU_ARCH_LIST}"

  # Re-apply Dockerfile hotpatches for ROCm 7.2 (the fresh clone lost them, can be removed after triton fixed this problem)
  ROCM_VERSION=$(docker exec ci_sglang bash -c "cat /opt/rocm/.info/version 2>/dev/null || echo unknown")
  if [[ "${ROCM_VERSION}" == 7.2* ]]; then
    echo "[CI-AITER-CHECK] ROCm 7.2 detected (${ROCM_VERSION}), applying AITER hotpatches..."
    docker exec ci_sglang bash -c "
      cd /sgl-workspace/aiter && \
      TARGET_FILE='aiter/ops/triton/attention/pa_mqa_logits.py' && \
      if [ -f \"\${TARGET_FILE}\" ]; then \
        sed -i '459 s/if.*:/if False:/' \"\${TARGET_FILE}\" && \
        echo '[CI-AITER-CHECK] Hotpatch applied to pa_mqa_logits.py'; \
      else \
        echo '[CI-AITER-CHECK] pa_mqa_logits.py not found, skipping hotpatch'; \
      fi
    "
  else
    echo "[CI-AITER-CHECK] ROCm version=${ROCM_VERSION}, no hotpatch needed"
  fi

  # build AITER
  docker exec ci_sglang bash -c '
    cd /sgl-workspace/aiter && \
    GPU_ARCHS="$1" python3 setup.py develop
  ' _ "${GPU_ARCH_LIST}"

  echo "[CI-AITER-CHECK] === AITER REBUILD COMPLETE ==="
fi

echo "[CI-AITER-CHECK] === AITER VERSION CHECK END ==="

# # Clear pre-built AITER kernels from Docker image to avoid segfaults
# # The Docker image may contain pre-compiled kernels incompatible with the current environment
# echo "Clearing pre-built AITER kernels from Docker image..."
# docker exec ci_sglang find /sgl-workspace/aiter/aiter/jit -name "*.so" -delete 2>/dev/null || true
# docker exec ci_sglang ls -la /sgl-workspace/aiter/aiter/jit/ 2>/dev/null || echo "jit dir empty or not found"

# # Pre-build AITER kernels to avoid timeout during tests
# echo "Warming up AITER JIT kernels..."
# docker exec -e SGLANG_USE_AITER=1 ci_sglang python3 /sglang-checkout/scripts/ci/amd/amd_ci_warmup_aiter.py || echo "AITER warmup completed (some kernels may not be available)"