File size: 8,360 Bytes
61ba51e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#!/bin/bash
set -euo pipefail

# Resolve the SGLang version that is embedded in the image tags below.
#
# SGLANG_VERSION starts at a pinned default and is replaced by the newest
# 'v<digit>...' git tag (version-sorted) when tags can be fetched from
# 'origin'. Every failure path keeps the default and prints a warning to
# stderr, so this section never aborts the script.
SGLANG_VERSION="v0.5.5"   # Default version, will be overridden if git tags are found

# Fetch tags from origin to ensure we have the latest
if git fetch --tags origin; then
  # Get the latest version tag sorted by version number (e.g., v0.5.7).
  # `|| true`: under errexit + pipefail a failing `git tag` (or a SIGPIPE
  # caused by `head` exiting early) would otherwise terminate the script;
  # an empty result simply falls through to the "no tags" warning.
  VERSION_FROM_TAG=$(git tag -l 'v[0-9]*' --sort=-v:refname | head -n 1) || true
  if [[ -n "${VERSION_FROM_TAG}" ]]; then
    SGLANG_VERSION="${VERSION_FROM_TAG}"
    echo "Using SGLang version from git tags: $SGLANG_VERSION"
  else
    echo "Warning: No version tags found; using default $SGLANG_VERSION" >&2
  fi
else
  echo "Warning: Failed to fetch tags from origin; using default $SGLANG_VERSION" >&2
fi


# Default base tags (can be overridden by command line arguments)
ROCM_VERSION="rocm700"
DEFAULT_MI30X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi30x"
DEFAULT_MI35X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi35x"

# Settings filled in by parse_args; these are the values used when no
# option is given.
MI30X_BASE_TAG="${DEFAULT_MI30X_BASE_TAG}"
MI35X_BASE_TAG="${DEFAULT_MI35X_BASE_TAG}"
CUSTOM_IMAGE=""
BUILD_FROM_DOCKERFILE=""
GPU_ARCH_BUILD=""

#######################################
# Print the command line help text to stdout.
#######################################
print_usage() {
  echo "Usage: $0 [OPTIONS]"
  echo "Options:"
  echo "  --mi30x-base-tag TAG       Override MI30x base image tag"
  echo "  --mi35x-base-tag TAG       Override MI35x base image tag"
  echo "  --custom-image IMAGE       Use a specific Docker image directly"
  echo "  --build-from-dockerfile    Build image from docker/rocm.Dockerfile"
  echo "  --gpu-arch ARCH            GPU architecture for Dockerfile build (e.g., gfx950-rocm720)"
  echo "  --rocm-version VERSION     Override ROCm version for image lookup (e.g., rocm720)"
}

#######################################
# Parse command line options into the global settings.
# Globals:
#   SGLANG_VERSION (read)
#   MI30X_BASE_TAG, MI35X_BASE_TAG, CUSTOM_IMAGE, BUILD_FROM_DOCKERFILE,
#   GPU_ARCH_BUILD, ROCM_VERSION (written)
# Arguments:
#   The script's command line arguments.
# Exits:
#   0 after printing help (-h/--help); 1 on an unknown option or when a
#   value-taking option is missing its value.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    # Reject a value-taking option that is the last word. Without this check
    # "$2" trips `set -u` with an opaque "unbound variable" message.
    case "$1" in
      --mi30x-base-tag|--mi35x-base-tag|--custom-image|--gpu-arch|--rocm-version)
        if [[ $# -lt 2 ]]; then
          echo "Error: option $1 requires a value" >&2
          exit 1
        fi
        ;;
    esac

    case "$1" in
      --mi30x-base-tag) MI30X_BASE_TAG="$2"; shift 2 ;;
      --mi35x-base-tag) MI35X_BASE_TAG="$2"; shift 2 ;;
      --custom-image) CUSTOM_IMAGE="$2"; shift 2 ;;
      --build-from-dockerfile) BUILD_FROM_DOCKERFILE="1"; shift ;;
      --gpu-arch) GPU_ARCH_BUILD="$2"; shift 2 ;;
      --rocm-version)
        # Re-derives BOTH base tags from the new ROCm version.
        # NOTE(review): this overwrites a --mi30x-base-tag/--mi35x-base-tag
        # given earlier on the command line; pass those after --rocm-version.
        ROCM_VERSION="$2"
        MI30X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi30x"
        MI35X_BASE_TAG="${SGLANG_VERSION}-${ROCM_VERSION}-mi35x"
        echo "Using ROCm version override: ${ROCM_VERSION}"
        shift 2
        ;;
      -h|--help)
        print_usage
        exit 0
        ;;
      *)
        echo "Unknown option $1" >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"



# Work out which GPU family this runner has from its Kubernetes hostname.
# Host names look like: linux-mi35x-gpu-1-xxxxx-runner-zzzzz
HOSTNAME_VALUE="$(hostname)"
GPU_ARCH="mi30x"   # kept when the hostname does not match the pattern

if ! [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
  echo "Warning: could not parse GPU architecture from '${HOSTNAME_VALUE}', defaulting to ${GPU_ARCH}"
else
  # First capture group is the "mi…" token between "linux-" and "-gpu-".
  GPU_ARCH="${BASH_REMATCH[1]}"
  echo "Detected GPU architecture from hostname: ${GPU_ARCH}"
fi

# Map the detected token onto one of the two image families we build
# (mi35x or mi30x); anything unknown falls back to mi30x.
if [[ "${GPU_ARCH}" == "mi35x" ]]; then
  echo "Runner uses ${GPU_ARCH}; will fetch mi35x image."
elif [[ "${GPU_ARCH}" == "mi30x" || "${GPU_ARCH}" == "mi300" || "${GPU_ARCH}" == "mi325" ]]; then
  echo "Runner uses ${GPU_ARCH}; will fetch mi30x image."
  GPU_ARCH="mi30x"
else
  echo "Runner architecture '${GPU_ARCH}' unrecognised; defaulting to mi30x image." >&2
  GPU_ARCH="mi30x"
fi


# Choose the render-device option(s) handed to `docker run`.
# Start from exposing all of /dev/dri; when the Kubernetes pod provides
# /etc/podinfo/gha-render-devices, use that file's contents instead.
DEVICE_FLAG="--device /dev/dri"
if [[ -f /etc/podinfo/gha-render-devices ]]; then
  DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
fi


#######################################
# Resolve the rocm/sgl-dev image reference to use for a GPU family.
#
# Lookup order (first hit wins):
#   1. a locally cached image tagged <base_tag>-<YYYYMMDD> for today or any
#      of the previous 6 days
#   2. the same tags in the remote registry (`docker manifest inspect`)
#   3. the first tag returned by the Docker Hub tags API when queried with
#      name=<ROCM_VERSION>-<arch>-<YYYYMMDD> for the same 7 days
#   4. any locally cached rocm/sgl-dev image whose tag matches
#      *<ROCM_VERSION>*<arch>* (reverse-sorted, first entry)
#   5. a hard-coded tag per ROCm version
# Globals:
#   MI30X_BASE_TAG, MI35X_BASE_TAG, ROCM_VERSION (read)
# Arguments:
#   $1 - GPU family: "mi30x" or "mi35x"
# Outputs:
#   stdout: the chosen image reference (one line) - callers capture this
#   stderr: progress and diagnostic messages
# Returns:
#   0 when an image reference was printed; 1 for an unsupported GPU family
#   or when no hard-coded fallback exists for ROCM_VERSION.
#######################################
find_latest_image() {
  local gpu_arch=$1
  local base_tag days_back stamp image_tag local_image image_id remote_tags any_local
  local -a date_stamps=()

  case "${gpu_arch}" in
    mi30x) base_tag="${MI30X_BASE_TAG}" ;;
    mi35x) base_tag="${MI35X_BASE_TAG}" ;;
    *)
      echo "Error: unsupported GPU architecture '${gpu_arch}'" >&2
      return 1
      ;;
  esac

  # Compute the candidate date stamps once (today first, then 1..6 days ago)
  # instead of re-running `date` in every lookup loop.
  for days_back in {0..6}; do
    date_stamps+=("$(date -d "${days_back} days ago" +%Y%m%d)")
  done

  # First, check local cache
  for stamp in "${date_stamps[@]}"; do
    image_tag="${base_tag}-${stamp}"
    local_image="rocm/sgl-dev:${image_tag}"
    image_id=$(docker images -q "${local_image}")
    if [[ -n "${image_id}" ]]; then
      echo "Found cached image locally: ${local_image}" >&2
      echo "${local_image}"
      return 0
    fi
  done

  # If not found locally, fall back to pulling from public registry
  for stamp in "${date_stamps[@]}"; do
    image_tag="${base_tag}-${stamp}"
    echo "Checking for image: rocm/sgl-dev:${image_tag}" >&2
    if docker manifest inspect "rocm/sgl-dev:${image_tag}" >/dev/null 2>&1; then
      echo "Found available image: rocm/sgl-dev:${image_tag}" >&2
      echo "rocm/sgl-dev:${image_tag}"
      return 0
    fi
  done

  # If still not found, try finding any image matching ROCm+arch from remote registry
  echo "Exact version not found. Searching remote registry for any ${ROCM_VERSION}-${gpu_arch} image…" >&2
  for stamp in "${date_stamps[@]}"; do
    # `|| true`: a failed request or a grep that matches nothing (non-zero
    # under pipefail) only means "nothing for this day" - keep whatever was
    # captured and move on rather than failing the function.
    remote_tags=$(
      curl -s "https://registry.hub.docker.com/v2/repositories/rocm/sgl-dev/tags?page_size=100&name=${ROCM_VERSION}-${gpu_arch}-${stamp}" 2>/dev/null \
        | grep -o '"name":"[^"]*"' \
        | cut -d'"' -f4 \
        | head -n 1
    ) || true
    if [[ -n "${remote_tags}" ]]; then
      echo "Found available image: rocm/sgl-dev:${remote_tags}" >&2
      echo "rocm/sgl-dev:${remote_tags}"
      return 0
    fi
  done

  echo "No recent images found. Searching any cached local images matching ROCm+arch…" >&2
  any_local=$(docker images --format '{{.Repository}}:{{.Tag}}' --filter "reference=rocm/sgl-dev:*${ROCM_VERSION}*${gpu_arch}*" | sort -r | head -n 1)
  if [[ -n "${any_local}" ]]; then
    echo "Using cached fallback image: ${any_local}" >&2
    echo "${any_local}"
    return 0
  fi

  echo "Error: no ${gpu_arch} image found in the last 7 days for base ${base_tag}" >&2
  echo "Using hard-coded fallback for ${ROCM_VERSION}…" >&2
  case "${ROCM_VERSION}" in
    rocm720)
      if [[ "${gpu_arch}" == "mi35x" ]]; then
        echo "rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260211-preview"
      else
        echo "rocm/sgl-dev:v0.5.8.post1-rocm720-mi30x-20260211-preview"
      fi
      ;;
    rocm700)
      if [[ "${gpu_arch}" == "mi35x" ]]; then
        echo "rocm/sgl-dev:v0.5.8.post1-rocm700-mi35x-20260211"
      else
        echo "rocm/sgl-dev:v0.5.8.post1-rocm700-mi30x-20260211"
      fi
      ;;
    *)
      echo "Error: no hard-coded fallback available for ${ROCM_VERSION}" >&2
      return 1
      ;;
  esac
}

# Decide which image the container will run and make sure it is available.
# Precedence: explicit --custom-image, then the latest pre-built image
# (default), or a local Dockerfile build when --build-from-dockerfile was given.
if [[ -n "${CUSTOM_IMAGE}" ]]; then
  # Caller named the image explicitly; just pull it.
  IMAGE="${CUSTOM_IMAGE}"
  echo "Using custom image: ${IMAGE}"
  docker pull "${IMAGE}"
elif [[ -z "${BUILD_FROM_DOCKERFILE}" ]]; then
  # Default path: look up the newest pre-built image for this runner's GPU.
  IMAGE=$(find_latest_image "${GPU_ARCH}")
  echo "Pulling Docker image: ${IMAGE}"
  docker pull "${IMAGE}"
else
  # Local build path: requires the target architecture and the Dockerfile.
  [[ -n "${GPU_ARCH_BUILD}" ]] || {
    echo "Error: --gpu-arch is required when using --build-from-dockerfile" >&2
    exit 1
  }

  DOCKERFILE_DIR="${GITHUB_WORKSPACE:-$PWD}/docker"
  DOCKERFILE="${DOCKERFILE_DIR}/rocm.Dockerfile"

  [[ -f "${DOCKERFILE}" ]] || {
    echo "Error: Dockerfile not found at ${DOCKERFILE}" >&2
    exit 1
  }

  # Tag the build with the requested architecture and today's date.
  IMAGE="sglang-ci:${GPU_ARCH_BUILD}-$(date +%Y%m%d)"
  echo "Building Docker image from ${DOCKERFILE} with GPU_ARCH=${GPU_ARCH_BUILD}..."

  # Pass full GPU_ARCH (e.g., gfx950-rocm720) - Dockerfile handles stripping suffix
  docker build --build-arg GPU_ARCH="${GPU_ARCH_BUILD}" --build-arg SGL_BRANCH="main" \
    -t "${IMAGE}" -f "${DOCKERFILE}" "${DOCKERFILE_DIR}"
  echo "Successfully built image: ${IMAGE}"
fi

# Start the detached CI container "ci_sglang" from ${IMAGE}.
CACHE_HOST=/home/runner/sgl-data

# Assemble the `docker run` options in an array so every option/value stays a
# single word, instead of splicing an optional flag in through an unquoted
# string.
#
# DEVICE_FLAG is expanded unquoted on purpose: it carries one or more
# whitespace-separated "--device ..." options that must be split into words.
# shellcheck disable=SC2206
DOCKER_RUN_ARGS=(
  -dt
  --user root
  --device=/dev/kfd
  ${DEVICE_FLAG}
  --ulimit nofile=65536:65536
  -v "${GITHUB_WORKSPACE:-$PWD}:/sglang-checkout"
)

# Mount the host-side cache directory only when it exists on this runner.
if [[ -d "${CACHE_HOST}" ]]; then
  DOCKER_RUN_ARGS+=(-v "${CACHE_HOST}:/sgl-data")
fi

DOCKER_RUN_ARGS+=(
  --group-add video
  --shm-size 32g
  --cap-add=SYS_PTRACE
  -e HF_TOKEN="${HF_TOKEN:-}"
  -e HF_HOME=/sgl-data/hf-cache
  -e HF_HUB_ETAG_TIMEOUT=300
  -e HF_HUB_DOWNLOAD_TIMEOUT=300
  -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache
  -e MIOPEN_CUSTOM_CACHE_DIR=/sgl-data/miopen-cache
  -e PYTHONPATH="/opt/tilelang:${PYTHONPATH:-}"
  --security-opt seccomp=unconfined
  -w /sglang-checkout
  --name ci_sglang
)

echo "Launching container: ci_sglang"
docker run "${DOCKER_RUN_ARGS[@]}" "${IMAGE}"