set -euo pipefail

# Run a command inside the `ci_sglang` container via `docker exec`, with the
# AMD CI environment variables set and one offline-mode retry on failure.
#
# Usage: [-w|--workdir DIR] [-e KEY=VAL]... [--] COMMAND [ARGS...]
#   -w, --workdir DIR  Working directory inside the container
#                      (default: /sglang-checkout/test/srt).
#   -e KEY=VAL         Add or override an environment variable; repeatable.
#                      Everything after the first '=' is the value. A bare
#                      KEY (no '=') is passed as KEY with an empty value.
# Option parsing stops at `--` or at the first unrecognized argument; all
# remaining arguments form the command handed to `docker exec`.
#
# Exit status: 0 on success, 2 on a malformed option, otherwise the exit
# status of the last `docker exec` attempt.

# Print a usage error to stderr and abort with status 2.
usage_error() {
  printf 'Error: %s\n' "$*" >&2
  exit 2
}

# Detect GPU family from hostname (e.g., linux-mi35x-gpu-1-xxxxx-runner-zzzzz)
HOSTNAME_VALUE=$(hostname)
GPU_FAMILY=""
if [[ "${HOSTNAME_VALUE}" =~ ^linux-(mi[0-9]+[a-z]*)-gpu-[0-9]+ ]]; then
  GPU_FAMILY="${BASH_REMATCH[1]}"
  echo "Detected GPU family from hostname: ${GPU_FAMILY}"
else
  # Diagnostics go to stderr; detection failure is not fatal.
  echo "Warning: could not parse GPU family from '${HOSTNAME_VALUE}'" >&2
fi

WORKDIR="/sglang-checkout/test/srt"

# Environment passed to every `docker exec`; `-e` options may override entries.
declare -A ENV_MAP=(
  [SGLANG_IS_IN_CI_AMD]=1
  [SGLANG_IS_IN_CI]=1
  [SGLANG_USE_AITER]=1
)

# Conditionally add GPU_ARCHS only for mi35x
if [[ "${GPU_FAMILY}" == "mi35x" ]]; then
  ENV_MAP[GPU_ARCHS]="gfx950"
fi

# Parse -w/--workdir and -e ENV=VAL
while [[ $# -gt 0 ]]; do
  case "$1" in
    -w|--workdir)
      # Check arity explicitly: under `set -u` a missing "$2" would otherwise
      # abort with an opaque "unbound variable" message.
      [[ $# -ge 2 ]] || usage_error "option $1 requires a directory argument"
      WORKDIR="$2"
      shift 2
      ;;
    -e)
      [[ $# -ge 2 ]] || usage_error "option -e requires a KEY=VAL argument"
      # Split on the FIRST '=' with parameter expansion. `IFS== read` is not
      # used because it strips a single trailing '=' from the value (e.g.
      # base64 padding) and only sees the first line of a multi-line value.
      key="${2%%=*}"
      if [[ "$2" == *=* ]]; then
        val="${2#*=}"
      else
        val=""
      fi
      # An empty name is not a valid associative-array subscript.
      [[ -n "$key" ]] || usage_error "invalid -e argument '$2': empty variable name"
      ENV_MAP["$key"]="$val"
      shift 2
      ;;
    --)
      shift
      break
      ;;
    *)
      # First non-option argument starts the command to execute.
      break
      ;;
  esac
done

# Build final ENV_ARGS
ENV_ARGS=()
for key in "${!ENV_MAP[@]}"; do
  ENV_ARGS+=("-e" "$key=${ENV_MAP[$key]}")
done

# Run docker exec with retry logic for HuggingFace network/download issues
# When HF model downloads fail due to network timeouts or rate limits,
# retrying with HF_HUB_OFFLINE=1 uses cached models from previous downloads.
#
# First attempt: normal mode (allows HF downloads)
# The `|| FIRST_EXIT_CODE=$?` form captures the status without tripping `set -e`.
FIRST_EXIT_CODE=0
docker exec \
  -w "$WORKDIR" \
  "${ENV_ARGS[@]}" \
  ci_sglang "$@" || FIRST_EXIT_CODE=$?

if [[ "$FIRST_EXIT_CODE" -eq 0 ]]; then
  exit 0
fi

echo "First attempt failed with exit code $FIRST_EXIT_CODE"

# Skip retry for test failures that won't be fixed by offline mode:
# - Exit 1: Test assertion failures (accuracy below threshold)
# - Exit 137 (128+9): Process killed by OOM
# - Exit 255: Test suite completed with test errors
# Only retry for other exit codes (e.g., network timeouts, HF download failures)
case "$FIRST_EXIT_CODE" in
  1|137|255)
    echo "Exit code $FIRST_EXIT_CODE indicates test failure (not network issue), not retrying"
    exit "$FIRST_EXIT_CODE"
    ;;
esac

echo "Retrying with HF_HUB_OFFLINE=1 (offline mode to use cached models)..."

# Second attempt: force HF offline mode to avoid network timeouts.
# HF_HUB_OFFLINE is listed after ENV_ARGS on the command line.
# This is the last command, so its exit status becomes the script's status.
docker exec \
  -w "$WORKDIR" \
  "${ENV_ARGS[@]}" \
  -e HF_HUB_OFFLINE=1 \
  ci_sglang "$@"