Hanrui / sglang /scripts /ci /amd /amd_ci_start_container.sh
Lekr0's picture
Add files using upload-large-folder tool
61ba51e verified
#!/bin/bash
set -euo pipefail

# Resolve the SGLang version that is used to compose image tags.
# Starts from a built-in default and replaces it with the highest
# version-sorted `v<digit>...` git tag (e.g., v0.5.7) when tags can be
# fetched from origin and at least one such tag exists.
SGLANG_VERSION="v0.5.5"

if ! git fetch --tags origin; then
  # Fetch failed: keep the default.
  echo "Warning: Failed to fetch tags from origin; using default $SGLANG_VERSION" >&2
else
  # Tags are listed in descending version order, so the first line is the newest.
  VERSION_FROM_TAG=$(git tag -l 'v[0-9]*' --sort=-v:refname | head -1)
  if [ -z "$VERSION_FROM_TAG" ]; then
    echo "Warning: No version tags found; using default $SGLANG_VERSION" >&2
  else
    SGLANG_VERSION="$VERSION_FROM_TAG"
    echo "Using SGLang version from git tags: $SGLANG_VERSION"
  fi
fi
# Default base tags (can be overridden by command line arguments)
ROCM_VERSION="rocm700"
DEFAULT_MI30X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi30x"
DEFAULT_MI35X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi35x"

# Option values; populated by parse_args below.
MI30X_BASE_TAG="${DEFAULT_MI30X_BASE_TAG}"
MI35X_BASE_TAG="${DEFAULT_MI35X_BASE_TAG}"
CUSTOM_IMAGE=""
BUILD_FROM_DOCKERFILE=""
GPU_ARCH_BUILD=""

#######################################
# Parse command line arguments.
# Globals:   SGLANG_VERSION (read); ROCM_VERSION, MI30X_BASE_TAG,
#            MI35X_BASE_TAG, CUSTOM_IMAGE, BUILD_FROM_DOCKERFILE,
#            GPU_ARCH_BUILD (written)
# Arguments: the script's command line options
# Outputs:   usage text / ROCm override notice on stdout, errors on stderr
# Returns:   exits 0 after --help, exits 1 on an unknown option or an
#            option that is missing its value
#######################################
parse_args() {
  # Track explicit tag overrides so that --rocm-version never clobbers them,
  # regardless of the order in which the options are given.
  local mi30x_explicit="" mi35x_explicit="" rocm_overridden=""

  while [[ $# -gt 0 ]]; do
    # Options that take a value must actually be followed by one; otherwise
    # "$2" would trip `set -u` with an unhelpful message.
    case "$1" in
      --mi30x-base-tag|--mi35x-base-tag|--custom-image|--gpu-arch|--rocm-version)
        if [[ $# -lt 2 ]]; then
          echo "Error: option $1 requires a value" >&2
          exit 1
        fi
        ;;
    esac

    case "$1" in
      --mi30x-base-tag) MI30X_BASE_TAG="$2"; mi30x_explicit=1; shift 2 ;;
      --mi35x-base-tag) MI35X_BASE_TAG="$2"; mi35x_explicit=1; shift 2 ;;
      --custom-image) CUSTOM_IMAGE="$2"; shift 2 ;;
      --build-from-dockerfile) BUILD_FROM_DOCKERFILE="1"; shift ;;
      --gpu-arch) GPU_ARCH_BUILD="$2"; shift 2 ;;
      --rocm-version)
        ROCM_VERSION="$2"
        rocm_overridden=1
        echo "Using ROCm version override: ${ROCM_VERSION}"
        shift 2
        ;;
      -h|--help)
        echo "Usage: $0 [OPTIONS]"
        echo "Options:"
        echo " --mi30x-base-tag TAG Override MI30x base image tag"
        echo " --mi35x-base-tag TAG Override MI35x base image tag"
        echo " --custom-image IMAGE Use a specific Docker image directly"
        echo " --build-from-dockerfile Build image from docker/rocm.Dockerfile"
        echo " --gpu-arch ARCH GPU architecture for Dockerfile build (e.g., gfx950-rocm720)"
        echo " --rocm-version VERSION Override ROCm version for image lookup (e.g., rocm720)"
        exit 0
        ;;
      *) echo "Unknown option $1" >&2; exit 1 ;;
    esac
  done

  # A ROCm override changes the derived base tags, but only those the caller
  # did not set explicitly.
  if [[ -n "${rocm_overridden}" ]]; then
    if [[ -z "${mi30x_explicit}" ]]; then
      MI30X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi30x"
    fi
    if [[ -z "${mi35x_explicit}" ]]; then
      MI35X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi35x"
    fi
  fi
}

parse_args "$@"
# Work out which GPU family this runner has by inspecting its hostname.
# Kubernetes runner host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
HOSTNAME_VALUE=$(hostname)
GPU_ARCH="mi30x" # used when the hostname does not follow the pattern

if [[ ! "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
  echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}"
else
  GPU_ARCH="${BASH_REMATCH[1]}"
  echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
fi

# Collapse the detected name onto one of the two image families that exist
# (mi35x or mi30x); anything unknown is treated as mi30x.
if [[ "${GPU_ARCH}" == "mi35x" ]]; then
  echo "Runner uses ${GPU_ARCH}; will fetch mi35x image."
elif [[ "${GPU_ARCH}" == "mi30x" || "${GPU_ARCH}" == "mi300" || "${GPU_ARCH}" == "mi325" ]]; then
  echo "Runner uses ${GPU_ARCH}; will fetch mi30x image."
  GPU_ARCH="mi30x"
else
  echo "Runner architecture '${GPU_ARCH}' unrecognised; defaulting to mi30x image." >&2
  GPU_ARCH="mi30x"
fi
# Choose the device option(s) handed to `docker run`: use the contents of the
# pod-info file when the runner provides one, otherwise expose /dev/dri.
if [[ ! -f /etc/podinfo/gha-render-devices ]]; then
  DEVICE_FLAG="--device /dev/dri"
else
  DEVICE_FLAG=$(< /etc/podinfo/gha-render-devices)
fi
#######################################
# Find the latest image for the given GPU architecture.
#
# Search order (first hit wins):
#   1. a locally cached rocm/sgl-dev:<base_tag>-<YYYYMMDD> for any of the
#      last 7 days (newest first);
#   2. the same tags in the remote registry (docker manifest inspect);
#   3. any Docker Hub tag whose name matches
#      <ROCM_VERSION>-<arch>-<YYYYMMDD> for the last 7 days;
#   4. any locally cached rocm/sgl-dev image matching ROCm version and arch;
#   5. a hard-coded image for the ROCm versions listed at the bottom.
#
# Globals:   MI30X_BASE_TAG, MI35X_BASE_TAG, ROCM_VERSION (read)
# Arguments: $1 - GPU architecture, "mi30x" or "mi35x"
# Outputs:   the chosen image reference on stdout; progress and errors on
#            stderr (so callers can capture stdout safely)
# Returns:   0 when an image was chosen, 1 for an unsupported architecture
#            or when no fallback exists for ROCM_VERSION
#######################################
find_latest_image() {
  local gpu_arch=${1:-}
  local base_tag image_tag local_image image_id remote_tag any_local
  local day days_back
  local -a recent_days=()

  case "${gpu_arch}" in
    mi30x) base_tag="${MI30X_BASE_TAG}" ;;
    mi35x) base_tag="${MI35X_BASE_TAG}" ;;
    *) echo "Error: unsupported GPU architecture '${gpu_arch}'" >&2; return 1 ;;
  esac

  # Compute the seven candidate date stamps once; every search below walks
  # the same list, newest first.
  for days_back in {0..6}; do
    recent_days+=("$(date -d "${days_back} days ago" +%Y%m%d)")
  done

  # First, check local cache
  for day in "${recent_days[@]}"; do
    image_tag="${base_tag}-${day}"
    local_image="rocm/sgl-dev:${image_tag}"
    image_id=$(docker images -q "${local_image}")
    if [[ -n "${image_id}" ]]; then
      echo "Found cached image locally: ${local_image}" >&2
      echo "${local_image}"
      return 0
    fi
  done

  # If not found locally, fall back to pulling from public registry
  for day in "${recent_days[@]}"; do
    image_tag="${base_tag}-${day}"
    echo "Checking for image: rocm/sgl-dev:${image_tag}" >&2
    if docker manifest inspect "rocm/sgl-dev:${image_tag}" >/dev/null 2>&1; then
      echo "Found available image: rocm/sgl-dev:${image_tag}" >&2
      echo "rocm/sgl-dev:${image_tag}"
      return 0
    fi
  done

  # If still not found, try finding any image matching ROCm+arch from remote registry
  echo "Exact version not found. Searching remote registry for any ${ROCM_VERSION}-${gpu_arch} image…" >&2
  for day in "${recent_days[@]}"; do
    # The pipeline exits non-zero whenever grep finds no tag, which is the
    # normal "nothing for this day" case, hence the explicit `|| true`.
    # --max-time keeps an unresponsive registry from hanging the CI job.
    remote_tag=$(
      curl -s --max-time 30 "https://registry.hub.docker.com/v2/repositories/rocm/sgl-dev/tags?page_size=100&name=${ROCM_VERSION}-${gpu_arch}-${day}" 2>/dev/null \
        | grep -o '"name":"[^"]*"' \
        | cut -d'"' -f4 \
        | head -n 1
    ) || true
    if [[ -n "${remote_tag}" ]]; then
      echo "Found available image: rocm/sgl-dev:${remote_tag}" >&2
      echo "rocm/sgl-dev:${remote_tag}"
      return 0
    fi
  done

  echo "No recent images found. Searching any cached local images matching ROCm+arch…" >&2
  # Reverse sort so the lexicographically greatest reference is taken.
  any_local=$(docker images --format '{{.Repository}}:{{.Tag}}' --filter "reference=rocm/sgl-dev:*${ROCM_VERSION}*${gpu_arch}*" | sort -r | head -n 1)
  if [[ -n "${any_local}" ]]; then
    echo "Using cached fallback image: ${any_local}" >&2
    echo "${any_local}"
    return 0
  fi

  echo "Error: no ${gpu_arch} image found in the last 7 days for base ${base_tag}" >&2
  echo "Using hard-coded fallback for ${ROCM_VERSION}…" >&2
  case "${ROCM_VERSION}" in
    rocm720)
      if [[ "${gpu_arch}" == "mi35x" ]]; then
        echo "rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260211-preview"
      else
        echo "rocm/sgl-dev:v0.5.8.post1-rocm720-mi30x-20260211-preview"
      fi
      ;;
    rocm700)
      if [[ "${gpu_arch}" == "mi35x" ]]; then
        echo "rocm/sgl-dev:v0.5.8.post1-rocm700-mi35x-20260211"
      else
        echo "rocm/sgl-dev:v0.5.8.post1-rocm700-mi30x-20260211"
      fi
      ;;
    *)
      echo "Error: no hard-coded fallback available for ${ROCM_VERSION}" >&2
      return 1
      ;;
  esac
}
#######################################
# Pull an image, tolerating a failed pull when a local copy already exists.
# This keeps the cached-image fallbacks usable when the registry is
# unreachable instead of aborting the whole script under `set -e`.
# Arguments: $1 - image reference
# Outputs:   docker's own output; a warning or error on stderr
# Returns:   0 when the image is available locally afterwards, 1 otherwise
#######################################
pull_image_or_use_cached() {
  local image=$1
  if docker pull "${image}"; then
    return 0
  fi
  if [[ -n "$(docker images -q "${image}" 2>/dev/null)" ]]; then
    echo "Warning: failed to pull ${image}; using the locally cached copy" >&2
    return 0
  fi
  echo "Error: failed to pull ${image} and no local copy is available" >&2
  return 1
}

# Determine which image to use. Three mutually exclusive modes, in priority
# order: explicit --custom-image, --build-from-dockerfile, or the latest
# pre-built image for the detected GPU architecture.
if [[ -n "${CUSTOM_IMAGE}" ]]; then
  # Use explicitly provided custom image
  IMAGE="${CUSTOM_IMAGE}"
  echo "Using custom image: ${IMAGE}"
  pull_image_or_use_cached "${IMAGE}"
elif [[ -n "${BUILD_FROM_DOCKERFILE}" ]]; then
  # Build image from Dockerfile
  if [[ -z "${GPU_ARCH_BUILD}" ]]; then
    echo "Error: --gpu-arch is required when using --build-from-dockerfile" >&2
    exit 1
  fi
  # The Dockerfile is looked up relative to the checkout root
  # (GITHUB_WORKSPACE when set, otherwise the current directory).
  DOCKERFILE_DIR="${GITHUB_WORKSPACE:-$PWD}/docker"
  DOCKERFILE="${DOCKERFILE_DIR}/rocm.Dockerfile"
  if [[ ! -f "${DOCKERFILE}" ]]; then
    echo "Error: Dockerfile not found at ${DOCKERFILE}" >&2
    exit 1
  fi
  IMAGE="sglang-ci:${GPU_ARCH_BUILD}-$(date +%Y%m%d)"
  echo "Building Docker image from ${DOCKERFILE} with GPU_ARCH=${GPU_ARCH_BUILD}..."
  # Pass full GPU_ARCH (e.g., gfx950-rocm720) - Dockerfile handles stripping suffix
  docker build \
    --build-arg GPU_ARCH="${GPU_ARCH_BUILD}" \
    --build-arg SGL_BRANCH="main" \
    -t "${IMAGE}" \
    -f "${DOCKERFILE}" \
    "${DOCKERFILE_DIR}"
  echo "Successfully built image: ${IMAGE}"
else
  # Find the latest pre-built image
  IMAGE=$(find_latest_image "${GPU_ARCH}")
  echo "Pulling Docker image: ${IMAGE}"
  pull_image_or_use_cached "${IMAGE}"
fi
# Launch the CI container (named ci_sglang) from ${IMAGE} in detached mode,
# with the checkout mounted at /sglang-checkout and used as working directory.

# Host cache directory: bind-mounted at /sgl-data only when it exists on the
# runner.
CACHE_HOST=/home/runner/sgl-data
if [[ -d "$CACHE_HOST" ]]; then
  CACHE_VOLUME="-v $CACHE_HOST:/sgl-data"
else
  CACHE_VOLUME=""
fi

# Export the token (empty when unset) and pass it to docker by name only, so
# the secret travels through the environment instead of appearing in the
# docker command line. The container still receives HF_TOKEN, set or empty,
# exactly as before.
export HF_TOKEN="${HF_TOKEN:-}"

echo "Launching container: ci_sglang"
# DEVICE_FLAG and CACHE_VOLUME each hold zero or more complete options, so
# they are deliberately left unquoted to be word-split.
# shellcheck disable=SC2086
docker run -dt --user root --device=/dev/kfd ${DEVICE_FLAG} \
  --ulimit nofile=65536:65536 \
  -v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \
  $CACHE_VOLUME \
  --group-add video \
  --shm-size 32g \
  --cap-add=SYS_PTRACE \
  -e HF_TOKEN \
  -e HF_HOME=/sgl-data/hf-cache \
  -e HF_HUB_ETAG_TIMEOUT=300 \
  -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
  -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
  -e MIOPEN_CUSTOM_CACHE_DIR=/sgl-data/miopen-cache \
  -e PYTHONPATH="/opt/tilelang:${PYTHONPATH:-}" \
  --security-opt seccomp=unconfined \
  -w /sglang-checkout \
  --name ci_sglang \
  "${IMAGE}"