#!/bin/bash
# Prepares the running `ci_sglang` Docker container for CI: optionally
# rebuilds SGLang from the checkout, installs test-time dependencies and
# verifies/rebuilds AITER.
#
# Usage: <script> [OPTIONS] [OPTIONAL_DEPS]
#   OPTIONAL_DEPS  extra pip "extras" appended to the default `dev_hip`.
set -euo pipefail

# Defaults. GPU_ARCH may be overridden later once the hostname is inspected.
HOSTNAME_VALUE=$(hostname)
GPU_ARCH="mi30x"
SKIP_TT_DEPS=""
SKIP_SGLANG_BUILD=""
SKIP_AITER_BUILD=""

# Consume leading options; the first non-option word (if any) is left in "$1"
# for the caller-supplied extras.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --skip-aiter-build)
      SKIP_AITER_BUILD="1"
      shift
      ;;
    --skip-sglang-build)
      SKIP_SGLANG_BUILD="1"
      shift
      ;;
    --skip-test-time-deps)
      SKIP_TT_DEPS="1"
      shift
      ;;
    -h | --help)
      echo "Usage: $0 [OPTIONS] [OPTIONAL_DEPS]"
      echo "Options:"
      echo " --skip-sglang-build Don't build checkout sglang, use what was shipped with the image"
      echo " --skip-aiter-build Don't build aiter, use what was shipped with the image"
      echo " --skip-test-time-deps Don't build miscellaneous dependencies"
      exit 0
      ;;
    --)
      # Explicit end of options.
      shift
      break
      ;;
    -*)
      # A mistyped flag must not be mistaken for OPTIONAL_DEPS: it would only
      # fail much later, inside a retried pip install.
      echo "Error: unknown option '$1' (see --help)" >&2
      exit 2
      ;;
    *)
      break
      ;;
  esac
done
# First remaining positional argument (may be absent): additional pip extras.
OPTIONAL_DEPS="${1:-}"

# `dev_hip` is always requested; caller-supplied extras are appended after a
# comma only when present.
EXTRAS="dev_hip${OPTIONAL_DEPS:+,${OPTIONAL_DEPS}}"
echo "Installing python extras: [${EXTRAS}]"

# Hostnames shaped like linux-<arch>-gpu-<n> carry the GPU architecture in
# their second field; anything else keeps the GPU_ARCH default.
if ! [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
  echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}"
else
  GPU_ARCH="${BASH_REMATCH[1]}"
  echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
fi

# Best effort: hand the shared pip cache to root (errors are silenced and
# ignored), then upgrade pip inside the container.
docker exec ci_sglang chown -R root:root /sgl-data/pip-cache 2>/dev/null || true
docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache --upgrade pip
#######################################
# Run a command, retrying up to three times with a 5-second pause in between.
#
# The command is executed exactly as given: every argument is passed through
# verbatim (no re-parsing by the shell), so arguments containing spaces,
# brackets or other metacharacters are safe. Shell syntax such as pipes or
# `&&` must be wrapped explicitly, e.g. `install_with_retry bash -c '...'`.
#
# If the command line contains "pip install" and the second attempt fails,
# the final attempt is made against a fallback PyPI mirror.
#
# Arguments: the command and its arguments.
# Outputs:   progress messages on stdout.
# Returns:   0 on the first successful attempt, 1 if every attempt fails,
#            2 if no command was supplied.
#######################################
install_with_retry() {
  local -r max_attempts=3
  local -a cmd=("$@")
  local attempt

  if ((${#cmd[@]} == 0)); then
    echo "install_with_retry: no command given" >&2
    return 2
  fi

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    echo "Attempt ${attempt}/${max_attempts}: ${cmd[*]}"
    if "${cmd[@]}"; then
      echo "Success!"
      return 0
    fi

    if ((attempt < max_attempts)); then
      echo "Failed, retrying in 5 seconds..."
      sleep 5

      # Only the last attempt switches to the mirror, and only for pip.
      if [[ "${cmd[*]}" == *"pip install"* ]] && ((attempt == max_attempts - 1)); then
        cmd+=(--index-url https://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com)
        echo "Using fallback PyPI mirror: ${cmd[*]}"
      fi
    fi
  done

  echo "Failed after ${max_attempts} attempts"
  return 1
}
#######################################
# Shallow-clone a git repository, retrying up to three times.
#
# Each attempt aborts if the transfer rate stays below 1000 bytes/s for 30
# seconds (http.lowSpeedLimit / http.lowSpeedTime). A destination directory
# left over from a previous attempt is removed before cloning.
#
# Arguments:
#   $1 - repository URL (required)
#   $2 - destination directory (optional; git picks its default when omitted)
#   $3 - extra `git clone` options as one whitespace-separated string,
#        e.g. "--branch v1.0" (optional)
# Outputs:  progress messages on stdout.
# Returns:  0 once a clone succeeds, 1 if every attempt fails.
#######################################
git_clone_with_retry() {
  local repo_url="${1:?git_clone_with_retry: repository URL required}"
  local dest_dir="${2:-}"
  local branch_args="${3:-}"
  local -r max_attempts=3
  local attempt
  local -a branch_opts=()
  local -a clone_cmd=(
    git
    -c http.lowSpeedLimit=1000
    -c http.lowSpeedTime=30
    clone --depth 1
  )

  # Split the option string into words without exposing it to globbing.
  if [[ -n "$branch_args" ]]; then
    read -r -a branch_opts <<< "$branch_args"
    if ((${#branch_opts[@]} > 0)); then
      clone_cmd+=("${branch_opts[@]}")
    fi
  fi
  clone_cmd+=("$repo_url")
  # Passing an empty string as the destination would make git fail, so the
  # argument is only added when a directory was actually requested.
  if [[ -n "$dest_dir" ]]; then
    clone_cmd+=("$dest_dir")
  fi

  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    echo "Git clone attempt ${attempt}/${max_attempts}: ${repo_url}"

    # git refuses to clone into a non-empty directory; clear leftovers.
    if [[ -n "$dest_dir" && -d "$dest_dir" ]]; then
      rm -rf -- "$dest_dir"
    fi

    if "${clone_cmd[@]}"; then
      echo "Git clone succeeded."
      return 0
    fi

    if ((attempt < max_attempts)); then
      echo "Git clone failed, retrying in 5 seconds..."
      sleep 5
    fi
  done

  echo "Git clone failed after ${max_attempts} attempts: ${repo_url}"
  return 1
}
# Rebuild sgl-kernel and the sglang Python package from /sglang-checkout
# unless --skip-sglang-build was given.
if [[ -z "$SKIP_SGLANG_BUILD" ]]; then
  # Drop whatever is currently installed (failures are ignored).
  for pkg in sgl-kernel sglang; do
    docker exec ci_sglang pip uninstall "$pkg" -y || true
  done

  # Purge stale bytecode from the virtualenv, then from the checkout.
  for tree in /opt/venv /sglang-checkout; do
    docker exec ci_sglang find "$tree" -name "*.pyc" -delete || true
    docker exec ci_sglang find "$tree" -name "__pycache__" -type d -exec rm -rf {} + || true
  done

  # Swap in the ROCm pyproject and build the kernel package.
  docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"

  # Swap in the alternative pyproject for the Python package. The paths are
  # relative to the container's default working directory.
  # NOTE(review): presumably that is the sglang checkout; confirm.
  docker exec ci_sglang bash -c 'rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml'
  install_with_retry docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e "python[${EXTRAS}]"
else
  echo "Didn't build checkout SGLang"
fi
# Install test-time dependencies (lmms-eval, human-eval, a dummy Grok model
# config and a few pip packages) unless --skip-test-time-deps was given.
if [[ -n "${SKIP_TT_DEPS}" ]]; then
  echo "Didn't build lmms_eval, human-eval, and others"
else
  # lmms-eval is cloned directly inside the container at a pinned tag.
  docker exec -w / ci_sglang git clone --branch v0.4.1 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
  install_with_retry docker exec -w /lmms-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .

  # human-eval is cloned on the host (with retries) and copied in.
  git_clone_with_retry https://github.com/akao-amd/human-eval.git human-eval
  docker cp human-eval ci_sglang:/
  install_with_retry docker exec -w /human-eval ci_sglang pip install --cache-dir=/sgl-data/pip-cache -e .

  # Dummy Grok model directory: the config is generated on the host and then
  # copied into the container's /dummy-grok.
  docker exec -w / ci_sglang mkdir -p /dummy-grok
  mkdir -p dummy-grok
  cat > dummy-grok/config.json << 'EOF'
{
  "architectures": [
    "Grok1ModelForCausalLM"
  ],
  "embedding_multiplier_scale": 78.38367176906169,
  "output_multiplier_scale": 0.5773502691896257,
  "vocab_size": 131072,
  "hidden_size": 6144,
  "intermediate_size": 32768,
  "max_position_embeddings": 8192,
  "num_experts_per_tok": 2,
  "num_local_experts": 8,
  "num_attention_heads": 48,
  "num_hidden_layers": 64,
  "num_key_value_heads": 8,
  "head_dim": 128,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "model_type": "mixtral",
  "torch_dtype": "bfloat16"
}
EOF
  # Without this copy the generated config never leaves the host and the
  # container's /dummy-grok stays empty.
  docker cp ./dummy-grok ci_sglang:/

  # Required packages: a failure here aborts the script (set -e). The extras
  # spec is quoted so the brackets are not treated as a glob pattern.
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache "huggingface_hub[hf_xet]"
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache pytest

  # Best-effort packages: a failure is reported but does not abort the script.
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache cache-dit || echo "cache-dit installation failed"
  docker exec ci_sglang pip install --cache-dir=/sgl-data/pip-cache accelerate || echo "accelerate installation failed"
fi
# Nothing below applies when --skip-aiter-build was given.
if [[ -n "${SKIP_AITER_BUILD}" ]]; then
  exit 0
fi

# Determine (a) the AITER commit the repository's Dockerfile pins for this
# runner's GPU architecture and (b) the AITER version installed in the image.
echo "[CI-AITER-CHECK] === AITER VERSION CHECK START ==="

DOCKERFILE="docker/rocm.Dockerfile"

GPU_ARCH="${GPU_ARCH:-mi30x}"
echo "[CI-AITER-CHECK] Runner GPU_ARCH=${GPU_ARCH}"

# mi35x runners read the gfx950 build stage; everything else reads gfx942.
# The stage header is matched as a fixed string, so the literal `$` is kept.
if [[ "${GPU_ARCH}" == "mi35x" ]]; then
  echo "[CI-AITER-CHECK] Using gfx950 block from Dockerfile..."
  AITER_STAGE_HEADER='FROM $BASE_IMAGE_950 AS gfx950'
else
  echo "[CI-AITER-CHECK] Using gfx942 block from Dockerfile..."
  AITER_STAGE_HEADER='FROM $BASE_IMAGE_942 AS gfx942'
fi

# Take the first AITER_COMMIT="..." within 20 lines after the stage header.
# `|| true` is required: under `set -e -o pipefail` a non-matching grep (or a
# missing Dockerfile, or an upstream stage ended early by `head`) would
# otherwise terminate the script here, before the explicit error below.
REPO_AITER_COMMIT=$(
  grep -F -A20 -- "${AITER_STAGE_HEADER}" "${DOCKERFILE}" \
    | grep 'AITER_COMMIT=' \
    | head -n1 \
    | sed 's/.*AITER_COMMIT="\([^"]*\)".*/\1/'
) || true

if [[ -z "${REPO_AITER_COMMIT}" ]]; then
  echo "[CI-AITER-CHECK] ERROR: Failed to extract AITER_COMMIT from Dockerfile."
  exit 1
fi

echo "[CI-AITER-CHECK] Dockerfile expects AITER_COMMIT=${REPO_AITER_COMMIT}"

# Version reported by `pip show amd-aiter` inside the container, prefixed with
# "v". The result is "v" when the package is absent and "vnone" when the
# docker exec call itself fails.
IMAGE_AITER_VERSION=$(docker exec ci_sglang bash -c "pip show amd-aiter 2>/dev/null | grep '^Version:' | awk '{print \$2}'" || echo "none")
IMAGE_AITER_VERSION="v${IMAGE_AITER_VERSION}"
echo "[CI-AITER-CHECK] AITER version inside CI image: ${IMAGE_AITER_VERSION}"
# Decide whether AITER has to be rebuilt. Precedence, highest first:
#   1. AITER_COMMIT_OVERRIDE set  -> rebuild at that commit
#   2. nothing installed          -> rebuild
#   3. installed == pinned commit -> keep
#   4. dev / "+g<hash>" version   -> keep
#   5. anything else              -> rebuild
NEED_REBUILD="false"

if [[ -n "${AITER_COMMIT_OVERRIDE:-}" ]]; then
  echo "[CI-AITER-CHECK] AITER_COMMIT_OVERRIDE=${AITER_COMMIT_OVERRIDE} → forcing rebuild"
  REPO_AITER_COMMIT="${AITER_COMMIT_OVERRIDE}"
  NEED_REBUILD="true"
else
  case "${IMAGE_AITER_VERSION}" in
    vnone | v)
      echo "[CI-AITER-CHECK] No AITER found in image → rebuild needed"
      NEED_REBUILD="true"
      ;;
    "${REPO_AITER_COMMIT}")
      echo "[CI-AITER-CHECK] AITER version matches"
      ;;
    *)
      if [[ "${IMAGE_AITER_VERSION}" =~ (dev|\+g[0-9a-f]+) ]]; then
        echo "[CI-AITER-CHECK] Dev/patched version detected: ${IMAGE_AITER_VERSION} → skipping rebuild"
      else
        echo "[CI-AITER-CHECK] Version mismatch: image=${IMAGE_AITER_VERSION}, repo=${REPO_AITER_COMMIT}"
        NEED_REBUILD="true"
      fi
      ;;
  esac
fi
# Rebuild AITER inside the container at REPO_AITER_COMMIT when requested.
if [[ "${NEED_REBUILD}" == "true" ]]; then
  echo "[CI-AITER-CHECK] === AITER REBUILD START ==="

  # Start clean: drop the installed package (failure ignored) and any
  # previous source tree, then clone afresh.
  docker exec ci_sglang pip uninstall -y amd-aiter || true
  docker exec ci_sglang rm -rf /sgl-workspace/aiter
  docker exec ci_sglang git clone https://github.com/ROCm/aiter.git /sgl-workspace/aiter

  # Check out the wanted commit together with its submodules.
  docker exec ci_sglang bash -c "
    cd /sgl-workspace/aiter && \
    git fetch --all && \
    git checkout ${REPO_AITER_COMMIT} && \
    git submodule update --init --recursive
  "

  # Compile target: gfx950 on mi35x runners, gfx942 everywhere else.
  case "${GPU_ARCH}" in
    mi35x) GPU_ARCH_LIST="gfx950" ;;
    *) GPU_ARCH_LIST="gfx942" ;;
  esac
  echo "[CI-AITER-CHECK] GPU_ARCH_LIST=${GPU_ARCH_LIST}"

  # On ROCm 7.2.x, rewrite the `if ...:` on line 459 of pa_mqa_logits.py to
  # `if False:`; the step is skipped when that file does not exist.
  ROCM_VERSION=$(docker exec ci_sglang bash -c "cat /opt/rocm/.info/version 2>/dev/null || echo unknown")
  case "${ROCM_VERSION}" in
    7.2*)
      echo "[CI-AITER-CHECK] ROCm 7.2 detected (${ROCM_VERSION}), applying AITER hotpatches..."
      docker exec ci_sglang bash -c "
        cd /sgl-workspace/aiter && \
        TARGET_FILE='aiter/ops/triton/attention/pa_mqa_logits.py' && \
        if [ -f \"\${TARGET_FILE}\" ]; then \
          sed -i '459 s/if.*:/if False:/' \"\${TARGET_FILE}\" && \
          echo '[CI-AITER-CHECK] Hotpatch applied to pa_mqa_logits.py'; \
        else \
          echo '[CI-AITER-CHECK] pa_mqa_logits.py not found, skipping hotpatch'; \
        fi
      "
      ;;
    *)
      echo "[CI-AITER-CHECK] ROCm version=${ROCM_VERSION}, no hotpatch needed"
      ;;
  esac

  # Install in development mode for the selected architecture.
  docker exec ci_sglang bash -c "
    cd /sgl-workspace/aiter && \
    GPU_ARCHS=${GPU_ARCH_LIST} python3 setup.py develop
  "

  echo "[CI-AITER-CHECK] === AITER REBUILD COMPLETE ==="
fi

echo "[CI-AITER-CHECK] === AITER VERSION CHECK END ==="