#!/bin/bash
#
# Launch the `ci_sglang` Docker container on a ROCm GPU runner.
#
# The image to run is chosen in one of three ways:
#   1. --custom-image IMAGE      pull and use exactly that image;
#   2. --build-from-dockerfile   build docker/rocm.Dockerfile locally
#                                (requires --gpu-arch);
#   3. default                   look up a recent rocm/sgl-dev image matching
#                                the runner's GPU architecture
#                                (see find_latest_image).
#
# Optional environment: GITHUB_WORKSPACE (checkout to mount, defaults to $PWD),
# HF_TOKEN and PYTHONPATH (forwarded into the container).
set -euo pipefail

#######################################
# Abort with a readable message when a value-taking option is the last
# argument (otherwise `$2` trips `set -u` with an opaque error).
# Arguments: $1 - option name, $2 - number of remaining arguments ($#)
#######################################
require_value() {
  local option=$1 remaining=$2
  if ((remaining < 2)); then
    echo "Error: option ${option} requires a value" >&2
    exit 1
  fi
}

# Get version from git tags
SGLANG_VERSION="v0.5.5" # Default version, will be overridden if git tags are found

# Fetch tags from origin to ensure we have the latest
if git fetch --tags origin; then
  # Tags sorted by version number, newest first (e.g., v0.5.7). The first line
  # is taken with parameter expansion rather than `| head -1` so that an early
  # reader exit can never turn into a SIGPIPE failure under `pipefail`.
  ALL_VERSION_TAGS=$(git tag -l 'v[0-9]*' --sort=-v:refname)
  VERSION_FROM_TAG=${ALL_VERSION_TAGS%%$'\n'*}
  if [[ -n "$VERSION_FROM_TAG" ]]; then
    SGLANG_VERSION="$VERSION_FROM_TAG"
    echo "Using SGLang version from git tags: $SGLANG_VERSION"
  else
    echo "Warning: No version tags found; using default $SGLANG_VERSION" >&2
  fi
else
  echo "Warning: Failed to fetch tags from origin; using default $SGLANG_VERSION" >&2
fi

# Parse command line arguments. Base tags start empty and are resolved to
# their defaults only after parsing, so an explicit --mi30x-base-tag /
# --mi35x-base-tag always wins regardless of where --rocm-version appears.
ROCM_VERSION="rocm700"
MI30X_BASE_TAG=""
MI35X_BASE_TAG=""
CUSTOM_IMAGE=""
BUILD_FROM_DOCKERFILE=""
GPU_ARCH_BUILD=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    --mi30x-base-tag)
      require_value "$1" "$#"
      MI30X_BASE_TAG="$2"
      shift 2
      ;;
    --mi35x-base-tag)
      require_value "$1" "$#"
      MI35X_BASE_TAG="$2"
      shift 2
      ;;
    --custom-image)
      require_value "$1" "$#"
      CUSTOM_IMAGE="$2"
      shift 2
      ;;
    --build-from-dockerfile)
      BUILD_FROM_DOCKERFILE="1"
      shift
      ;;
    --gpu-arch)
      require_value "$1" "$#"
      GPU_ARCH_BUILD="$2"
      shift 2
      ;;
    --rocm-version)
      require_value "$1" "$#"
      ROCM_VERSION="$2"
      echo "Using ROCm version override: ${ROCM_VERSION}"
      shift 2
      ;;
    -h | --help)
      echo "Usage: $0 [OPTIONS]"
      echo "Options:"
      echo " --mi30x-base-tag TAG Override MI30x base image tag"
      echo " --mi35x-base-tag TAG Override MI35x base image tag"
      echo " --custom-image IMAGE Use a specific Docker image directly"
      echo " --build-from-dockerfile Build image from docker/rocm.Dockerfile"
      echo " --gpu-arch ARCH GPU architecture for Dockerfile build (e.g., gfx950-rocm720)"
      echo " --rocm-version VERSION Override ROCm version for image lookup (e.g., rocm720)"
      exit 0
      ;;
    *)
      echo "Unknown option $1" >&2
      exit 1
      ;;
  esac
done

# Default base tags, derived from the final SGLang and ROCm versions.
DEFAULT_MI30X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi30x"
DEFAULT_MI35X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi35x"
MI30X_BASE_TAG="${MI30X_BASE_TAG:-${DEFAULT_MI30X_BASE_TAG}}"
MI35X_BASE_TAG="${MI35X_BASE_TAG:-${DEFAULT_MI35X_BASE_TAG}}"

# Detect GPU architecture from the Kubernetes runner hostname
HOSTNAME_VALUE=$(hostname)
GPU_ARCH="mi30x" # default

# Host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
if [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
  GPU_ARCH="${BASH_REMATCH[1]}"
  echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
else
  echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}" >&2
fi

# Normalise / collapse architectures we don't yet build specifically for
case "${GPU_ARCH}" in
  mi35x)
    echo "Runner uses ${GPU_ARCH}; will fetch mi35x image."
    ;;
  mi30x | mi300 | mi325)
    echo "Runner uses ${GPU_ARCH}; will fetch mi30x image."
    GPU_ARCH="mi30x"
    ;;
  *)
    echo "Runner architecture '${GPU_ARCH}' unrecognised; defaulting to mi30x image." >&2
    GPU_ARCH="mi30x"
    ;;
esac

# Set up DEVICE_FLAG based on Kubernetes pod info: use the flag string from
# the pod-info file when present, otherwise expose /dev/dri.
if [[ -f /etc/podinfo/gha-render-devices ]]; then
  DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
else
  DEVICE_FLAG="--device /dev/dri"
fi

#######################################
# Find the most suitable rocm/sgl-dev image for a GPU architecture.
#
# Search order (date stamps cover today and the six previous days):
#   1. locally cached image  rocm/sgl-dev:<base_tag>-<YYYYMMDD>
#   2. same tag in the registry (docker manifest inspect)
#   3. any Docker Hub tag whose name contains
#      <ROCM_VERSION>-<gpu_arch>-<YYYYMMDD>
#   4. any local rocm/sgl-dev image matching *<ROCM_VERSION>*<gpu_arch>*
#   5. hard-coded fallback for rocm720 / rocm700
#
# Globals:   MI30X_BASE_TAG, MI35X_BASE_TAG, ROCM_VERSION (read)
# Arguments: $1 - GPU architecture, "mi30x" or "mi35x"
# Outputs:   the chosen image reference on stdout; progress on stderr
# Returns:   0 when an image reference was printed; 1 for an unsupported
#            architecture or when no hard-coded fallback exists
#######################################
find_latest_image() {
  local gpu_arch=$1
  local base_tag days_back stamp image_ref image_id remote_tag any_local
  local -a date_stamps=()

  case "${gpu_arch}" in
    mi30x) base_tag="${MI30X_BASE_TAG}" ;;
    mi35x) base_tag="${MI35X_BASE_TAG}" ;;
    *)
      echo "Error: unsupported GPU architecture '${gpu_arch}'" >&2
      return 1
      ;;
  esac

  # Compute the seven candidate date stamps once; every search step reuses them.
  for days_back in {0..6}; do
    date_stamps+=("$(date -d "${days_back} days ago" +%Y%m%d)")
  done

  # First, check local cache
  for stamp in "${date_stamps[@]}"; do
    image_ref="rocm/sgl-dev:${base_tag}-${stamp}"
    # Best effort: a failing docker call is treated as "not cached".
    image_id=$(docker images -q "${image_ref}") || true
    if [[ -n "$image_id" ]]; then
      echo "Found cached image locally: ${image_ref}" >&2
      echo "${image_ref}"
      return 0
    fi
  done

  # If not found locally, fall back to pulling from public registry
  for stamp in "${date_stamps[@]}"; do
    image_ref="rocm/sgl-dev:${base_tag}-${stamp}"
    echo "Checking for image: ${image_ref}" >&2
    if docker manifest inspect "${image_ref}" >/dev/null 2>&1; then
      echo "Found available image: ${image_ref}" >&2
      echo "${image_ref}"
      return 0
    fi
  done

  # If still not found, try finding any image matching ROCm+arch from remote registry
  echo "Exact version not found. Searching remote registry for any ${ROCM_VERSION}-${gpu_arch} image…" >&2
  for stamp in "${date_stamps[@]}"; do
    # Best effort: -f discards HTTP error bodies and the timeouts bound a
    # stalled registry. `|| true` keeps whatever was captured even when a
    # pipeline stage exits non-zero (e.g. grep finding no match).
    remote_tag=$(
      curl -fs --connect-timeout 10 --max-time 30 \
        "https://registry.hub.docker.com/v2/repositories/rocm/sgl-dev/tags?page_size=100&name=${ROCM_VERSION}-${gpu_arch}-${stamp}" \
        2>/dev/null \
        | grep -o '"name":"[^"]*"' \
        | cut -d'"' -f4 \
        | head -n 1
    ) || true
    if [[ -n "$remote_tag" ]]; then
      echo "Found available image: rocm/sgl-dev:${remote_tag}" >&2
      echo "rocm/sgl-dev:${remote_tag}"
      return 0
    fi
  done

  echo "No recent images found. Searching any cached local images matching ROCm+arch…" >&2
  any_local=$(
    docker images --format '{{.Repository}}:{{.Tag}}' \
      --filter "reference=rocm/sgl-dev:*${ROCM_VERSION}*${gpu_arch}*" \
      | sort -r \
      | head -n 1
  ) || true
  if [[ -n "$any_local" ]]; then
    echo "Using cached fallback image: ${any_local}" >&2
    echo "${any_local}"
    return 0
  fi

  echo "Error: no ${gpu_arch} image found in the last 7 days for base ${base_tag}" >&2
  echo "Using hard-coded fallback for ${ROCM_VERSION}…" >&2
  case "${ROCM_VERSION}" in
    rocm720)
      if [[ "${gpu_arch}" == "mi35x" ]]; then
        echo "rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260211-preview"
      else
        echo "rocm/sgl-dev:v0.5.8.post1-rocm720-mi30x-20260211-preview"
      fi
      ;;
    rocm700)
      if [[ "${gpu_arch}" == "mi35x" ]]; then
        echo "rocm/sgl-dev:v0.5.8.post1-rocm700-mi35x-20260211"
      else
        echo "rocm/sgl-dev:v0.5.8.post1-rocm700-mi30x-20260211"
      fi
      ;;
    *)
      echo "Error: no hard-coded fallback available for ${ROCM_VERSION}" >&2
      return 1
      ;;
  esac
}

# Determine which image to use
if [[ -n "${CUSTOM_IMAGE}" ]]; then
  # Use explicitly provided custom image
  IMAGE="${CUSTOM_IMAGE}"
  echo "Using custom image: ${IMAGE}"
  docker pull "${IMAGE}"
elif [[ -n "${BUILD_FROM_DOCKERFILE}" ]]; then
  # Build image from Dockerfile
  if [[ -z "${GPU_ARCH_BUILD}" ]]; then
    echo "Error: --gpu-arch is required when using --build-from-dockerfile" >&2
    exit 1
  fi

  DOCKERFILE_DIR="${GITHUB_WORKSPACE:-$PWD}/docker"
  DOCKERFILE="${DOCKERFILE_DIR}/rocm.Dockerfile"

  if [[ ! -f "${DOCKERFILE}" ]]; then
    echo "Error: Dockerfile not found at ${DOCKERFILE}" >&2
    exit 1
  fi

  IMAGE="sglang-ci:${GPU_ARCH_BUILD}-$(date +%Y%m%d)"
  echo "Building Docker image from ${DOCKERFILE} with GPU_ARCH=${GPU_ARCH_BUILD}..."

  # Pass full GPU_ARCH (e.g., gfx950-rocm720) - Dockerfile handles stripping suffix
  docker build \
    --build-arg GPU_ARCH="${GPU_ARCH_BUILD}" \
    --build-arg SGL_BRANCH="main" \
    -t "${IMAGE}" \
    -f "${DOCKERFILE}" \
    "${DOCKERFILE_DIR}"
  echo "Successfully built image: ${IMAGE}"
else
  # Find the latest pre-built image; fail loudly if the lookup gives up.
  IMAGE=$(find_latest_image "${GPU_ARCH}") || {
    echo "Error: unable to determine a Docker image for ${GPU_ARCH}" >&2
    exit 1
  }
  echo "Pulling Docker image: ${IMAGE}"
  docker pull "${IMAGE}"
fi

# Mount the host cache directory only when it exists on this runner.
CACHE_HOST=/home/runner/sgl-data
if [[ -d "$CACHE_HOST" ]]; then
  CACHE_VOLUME="-v $CACHE_HOST:/sgl-data"
else
  CACHE_VOLUME=""
fi

# Export the token (empty when unset) and pass it to docker by name only, so
# its value does not appear on the docker command line / process list.
HF_TOKEN="${HF_TOKEN:-}"
export HF_TOKEN

echo "Launching container: ci_sglang"
# DEVICE_FLAG and CACHE_VOLUME are deliberately left unquoted: each holds zero
# or more whitespace-separated docker arguments that must be word-split.
# shellcheck disable=SC2086
docker run -dt --user root --device=/dev/kfd ${DEVICE_FLAG} \
  --ulimit nofile=65536:65536 \
  -v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout" \
  $CACHE_VOLUME \
  --group-add video \
  --shm-size 32g \
  --cap-add=SYS_PTRACE \
  -e HF_TOKEN \
  -e HF_HOME=/sgl-data/hf-cache \
  -e HF_HUB_ETAG_TIMEOUT=300 \
  -e HF_HUB_DOWNLOAD_TIMEOUT=300 \
  -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
  -e MIOPEN_CUSTOM_CACHE_DIR=/sgl-data/miopen-cache \
  -e PYTHONPATH="/opt/tilelang:${PYTHONPATH:-}" \
  --security-opt seccomp=unconfined \
  -w /sglang-checkout \
  --name ci_sglang \
  "${IMAGE}"