Hanrui / sglang /scripts /ci /cuda /ci_install_dependency.sh

Add files using upload-large-folder tool

61ba51e verified about 1 month ago

15.7 kB

	#!/bin/bash
	# Install the dependencies needed in CI.
	set -euxo pipefail

	# Configuration. IS_BLACKWELL may be provided by the caller's environment
	# (it defaults to 0); the optional first positional argument names extra
	# python extras to install.
	: "${IS_BLACKWELL:=0}"
	CU_VERSION="cu129"
	FLASHINFER_VERSION=0.6.4
	OPTIONAL_DEPS="${1:-}"

	# Record the machine architecture reported by uname
	ARCH="$(uname -m)"
	echo "Detected architecture: ${ARCH}"

	# The NVRTC package name depends on the CUDA version selected above
	case "$CU_VERSION" in
	    cu130) NVRTC_SPEC="nvidia-cuda-nvrtc" ;;
	    *)     NVRTC_SPEC="nvidia-cuda-nvrtc-cu12" ;;
	esac

	# Resolve the absolute directory of this script, then run the kill script
	# located two directories above it to stop existing processes.
	SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
	SCRIPT_DIR="$(cd "${SCRIPT_DIR}" && pwd)"
	bash "${SCRIPT_DIR}/../../killall_sglang.sh"
	# Log the GPU visibility setting (prints an empty value when it is unset)
	echo "CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-}"

	# Install apt packages (including python3/pip which may be missing on some runners).
	# --no-install-recommends keeps the install minimal. A non-zero exit from apt-get
	# is tolerated because unrelated broken packages on the runner (the NVIDIA driver
	# packages may have broken dependencies) can make it fail even though everything
	# needed here is present. In that case every required package is verified
	# individually and the script aborts only if one is really missing.
	APT_PACKAGES=(
	    python3 python3-pip python3-venv python3-dev git libnuma-dev libssl-dev
	    pkg-config libibverbs-dev libibverbs1 ibverbs-providers ibverbs-utils
	)

	# Refresh the package index first (a stale index causes 404 on security.ubuntu.com)
	apt-get update || true
	if ! apt-get install -y --no-install-recommends "${APT_PACKAGES[@]}"; then
	    echo "Warning: apt-get install failed, checking if required packages are available..."
	    for pkg in "${APT_PACKAGES[@]}"; do
	        # Capture dpkg's listing before matching instead of piping into `grep -q`,
	        # so an early grep exit cannot surface as a pipefail failure.
	        pkg_status=$(dpkg -l "$pkg" 2>/dev/null || true)
	        if ! grep -q "^ii" <<<"$pkg_status"; then
	            echo "ERROR: Required package $pkg is not installed and apt-get failed"
	            exit 1
	        fi
	    done
	    echo "All required packages are already installed, continuing..."
	fi

	# Remove the torch inductor cache directory: $TORCHINDUCTOR_CACHE_DIR when it is
	# set, otherwise <tempdir>/torchinductor_<user>. Errors during removal are ignored.
	python3 -c 'import getpass, os, shutil, tempfile; target = os.environ.get("TORCHINDUCTOR_CACHE_DIR") or os.path.join(tempfile.gettempdir(), "torchinductor_" + getpass.getuser()); shutil.rmtree(target, ignore_errors=True)'

	# Decide whether protoc has to be (re)installed: either no protoc is on PATH, or
	# one is found but cannot be executed (e.g. a binary for another architecture).
	# A value of INSTALL_PROTOC=1 already present in the environment is honored.
	INSTALL_PROTOC=${INSTALL_PROTOC:-0}
	if ! command -v protoc >/dev/null 2>&1; then
	    INSTALL_PROTOC=1
	elif ! protoc --version >/dev/null 2>&1; then
	    echo "protoc found but not runnable, reinstalling..."
	    INSTALL_PROTOC=1
	fi

	# Install protoc for router build (gRPC protobuf compilation)
	if [ "$INSTALL_PROTOC" = "1" ]; then
	    # TODO: move this to a separate script
	    echo "Installing protoc..."
	    if command -v apt-get &> /dev/null; then
	        # Ubuntu/Debian
	        PROTOC_BUILD_PACKAGES=(wget unzip gcc g++ perl make)
	        apt-get update || true # May fail due to unrelated broken packages
	        if ! apt-get install -y --no-install-recommends "${PROTOC_BUILD_PACKAGES[@]}"; then
	            echo "Warning: apt-get install failed, checking if required packages are available..."
	            for pkg in "${PROTOC_BUILD_PACKAGES[@]}"; do
	                # Capture the listing first so `grep -q` exiting early cannot
	                # surface as a pipefail failure.
	                pkg_status=$(dpkg -l "$pkg" 2>/dev/null || true)
	                if ! grep -q "^ii" <<<"$pkg_status"; then
	                    echo "ERROR: Required package $pkg is not installed and apt-get failed"
	                    exit 1
	                fi
	            done
	            echo "All required packages are already installed, continuing..."
	        fi
	    elif command -v yum &> /dev/null; then
	        # RHEL/CentOS
	        yum update -y
	        yum install -y wget unzip gcc gcc-c++ perl-core make
	    fi

	    # Determine protoc architecture
	    case "$ARCH" in
	        aarch64 | arm64) PROTOC_ARCH="aarch_64" ;;
	        *) PROTOC_ARCH="x86_64" ;;
	    esac
	    PROTOC_VERSION="32.0"
	    PROTOC_ZIP="protoc-${PROTOC_VERSION}-linux-${PROTOC_ARCH}.zip"

	    # Download into a private temporary directory inside a subshell: a stale zip
	    # from an earlier run can never be picked up, the working directory of the
	    # main script is untouched, and the EXIT trap removes the directory even
	    # when wget or unzip fails.
	    (
	        PROTOC_TMP_DIR=$(mktemp -d)
	        trap 'rm -rf -- "${PROTOC_TMP_DIR}"' EXIT
	        wget -O "${PROTOC_TMP_DIR}/${PROTOC_ZIP}" \
	            "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/${PROTOC_ZIP}"
	        unzip -o "${PROTOC_TMP_DIR}/${PROTOC_ZIP}" -d /usr/local
	    )
	    # Confirm the freshly installed binary runs
	    protoc --version
	else
	    echo "protoc already installed: $(protoc --version)"
	fi

	# Upgrade pip through `python3 -m pip` for robustness, since some runners only have pip3
	python3 -m pip install --upgrade pip

	# Choose the installer. The variables set here hold a command plus its flags as
	# plain strings; later commands expand them unquoted so they word-split.
	if [ "$IS_BLACKWELL" = "1" ]; then
	    # The blackwell CI runner has some issues with pip and uv,
	    # so we can only use pip with `--break-system-packages`
	    PIP_CMD="pip"
	    PIP_INSTALL_SUFFIX="--break-system-packages"
	    PIP_UNINSTALL_CMD="pip uninstall -y"
	    PIP_UNINSTALL_SUFFIX="--break-system-packages"
	else
	    # In normal cases, we use uv, which is much faster than pip.
	    # uv is installed through `python3 -m pip` as well, so this step does not
	    # depend on a bare `pip` executable being present on PATH.
	    python3 -m pip install uv
	    export UV_SYSTEM_PYTHON=true

	    PIP_CMD="uv pip"
	    PIP_INSTALL_SUFFIX="--index-strategy unsafe-best-match --prerelease allow"
	    PIP_UNINSTALL_CMD="uv pip uninstall"
	    PIP_UNINSTALL_SUFFIX=""
	fi

	# Clean up existing installations.
	# PIP_UNINSTALL_CMD / PIP_UNINSTALL_SUFFIX hold a command plus flags and are left
	# unquoted on purpose so they word-split into separate arguments. Uninstall
	# failures are ignored.
	$PIP_UNINSTALL_CMD sgl-kernel sglang $PIP_UNINSTALL_SUFFIX || true

	# Keep flashinfer packages installed if version matches to avoid re-downloading:
	# - flashinfer-cubin: 150+ MB, plus extra cubins from ci_download_flashinfer_cubin.sh
	# - flashinfer-jit-cache: 1.2+ GB, by far the largest download in CI
	# Each lookup falls back to an empty string when the pin or the package is absent.
	FLASHINFER_CUBIN_REQUIRED=$(grep -Po -m1 '(?<=flashinfer_cubin==)[0-9A-Za-z\.\-]+' python/pyproject.toml || echo "")
	FLASHINFER_CUBIN_INSTALLED=$(pip show flashinfer-cubin 2>/dev/null | awk '/^Version:/ {print $2}' || echo "")
	FLASHINFER_JIT_INSTALLED=$(pip show flashinfer-jit-cache 2>/dev/null | awk '/^Version:/ {print $2}' || echo "")
	# Drop everything from the first "+" (local version suffix) before comparing
	FLASHINFER_JIT_INSTALLED=${FLASHINFER_JIT_INSTALLED%%+*}

	UNINSTALL_CUBIN=true
	UNINSTALL_JIT_CACHE=true

	# flashinfer-cubin is kept only when a required version was found and it matches
	if [ -n "$FLASHINFER_CUBIN_REQUIRED" ] && [ "$FLASHINFER_CUBIN_INSTALLED" = "$FLASHINFER_CUBIN_REQUIRED" ]; then
	    echo "flashinfer-cubin==${FLASHINFER_CUBIN_REQUIRED} already installed, keeping it"
	    UNINSTALL_CUBIN=false
	else
	    echo "flashinfer-cubin version mismatch (installed: ${FLASHINFER_CUBIN_INSTALLED:-none}, required: ${FLASHINFER_CUBIN_REQUIRED}), reinstalling"
	fi

	# flashinfer-jit-cache is compared against FLASHINFER_VERSION
	if [ -n "$FLASHINFER_VERSION" ] && [ "$FLASHINFER_JIT_INSTALLED" = "$FLASHINFER_VERSION" ]; then
	    echo "flashinfer-jit-cache==${FLASHINFER_VERSION} already installed, keeping it"
	    UNINSTALL_JIT_CACHE=false
	else
	    echo "flashinfer-jit-cache version mismatch (installed: ${FLASHINFER_JIT_INSTALLED:-none}, required: ${FLASHINFER_VERSION}), will reinstall"
	fi

	# Build uninstall list based on what needs updating
	# (flashinfer-python is always removed)
	FLASHINFER_UNINSTALL="flashinfer-python"
	if [ "$UNINSTALL_CUBIN" = true ]; then
	    FLASHINFER_UNINSTALL="$FLASHINFER_UNINSTALL flashinfer-cubin"
	fi
	if [ "$UNINSTALL_JIT_CACHE" = true ]; then
	    FLASHINFER_UNINSTALL="$FLASHINFER_UNINSTALL flashinfer-jit-cache"
	fi
	# FLASHINFER_UNINSTALL is a space-separated list and must word-split
	$PIP_UNINSTALL_CMD $FLASHINFER_UNINSTALL $PIP_UNINSTALL_SUFFIX || true
	$PIP_UNINSTALL_CMD opencv-python opencv-python-headless $PIP_UNINSTALL_SUFFIX || true

	# Install the main package in editable mode. The "dev" extra is always
	# requested; OPTIONAL_DEPS, when non-empty, is appended after a comma.
	EXTRAS="dev${OPTIONAL_DEPS:+,${OPTIONAL_DEPS}}"
	echo "Installing python extras: [${EXTRAS}]"

	# PIP_CMD / PIP_INSTALL_SUFFIX are expanded unquoted so they split into words
	$PIP_CMD install -e "python[${EXTRAS}]" --extra-index-url "https://download.pytorch.org/whl/${CU_VERSION}" $PIP_INSTALL_SUFFIX

	# Fix CUDA version mismatch between torch and torchaudio.
	# PyPI's torch 2.9.1 bundles cu128 but torchaudio from pytorch.org/cu129 uses cu129.
	# This mismatch causes torchaudio's C extension to fail loading, producing:
	# "partially initialized module 'torchaudio' has no attribute 'lib'"
	# We cannot replace torch with cu129 (breaks sgl_kernel ABI), so instead we reinstall
	# torchaudio/torchvision from an index matching torch's CUDA version.
	TORCH_CUDA_VER=$(python3 -c "import torch; v=torch.version.cuda; parts=v.split('.'); print(f'cu{parts[0]}{parts[1]}')")
	echo "Detected torch CUDA version: ${TORCH_CUDA_VER}"
	if [ "${TORCH_CUDA_VER}" != "${CU_VERSION}" ]; then
	    # Pin versions to match what was installed by pyproject.toml (strip +cuXYZ suffix)
	    TORCHAUDIO_VER=$(pip show torchaudio 2>/dev/null | awk '/^Version:/ {print $2}' || true)
	    TORCHAUDIO_VER=${TORCHAUDIO_VER%%+*}
	    TORCHVISION_VER=$(pip show torchvision 2>/dev/null | awk '/^Version:/ {print $2}' || true)
	    TORCHVISION_VER=${TORCHVISION_VER%%+*}

	    # Abort with a clear message instead of building an invalid "pkg==" spec
	    # when either package is not installed.
	    if [ -z "$TORCHAUDIO_VER" ] || [ -z "$TORCHVISION_VER" ]; then
	        echo "ERROR: Could not determine installed torchaudio/torchvision versions (torchaudio: ${TORCHAUDIO_VER:-none}, torchvision: ${TORCHVISION_VER:-none})"
	        exit 1
	    fi

	    echo "Reinstalling torchaudio==${TORCHAUDIO_VER} torchvision==${TORCHVISION_VER} from ${TORCH_CUDA_VER} index to match torch..."
	    # --no-deps keeps the already installed torch untouched
	    $PIP_CMD install "torchaudio==${TORCHAUDIO_VER}" "torchvision==${TORCHVISION_VER}" --index-url "https://download.pytorch.org/whl/${TORCH_CUDA_VER}" --force-reinstall --no-deps $PIP_INSTALL_SUFFIX
	fi

	# Install router for pd-disagg test
	${PIP_CMD} install sglang-router ${PIP_INSTALL_SUFFIX}

	# Remove flash_attn folder to avoid conflicts. The folder is looked up in the
	# first directory reported by site.getsitepackages().
	PYTHON_LIB_PATH="$(python3 -c 'import site; print(site.getsitepackages()[0])')"
	FLASH_ATTN_PATH="${PYTHON_LIB_PATH}/flash_attn"

	if [ ! -d "$FLASH_ATTN_PATH" ]; then
	    echo "Directory $FLASH_ATTN_PATH does not exist."
	else
	    echo "Directory $FLASH_ATTN_PATH exists. Removing..."
	    rm -rf "$FLASH_ATTN_PATH"
	fi

	# Install sgl-kernel.
	# Two versions are read: the one declared by the sgl-kernel project itself and
	# the one pinned in python/pyproject.toml.
	SGL_KERNEL_VERSION_FROM_KERNEL=$(grep -Po '(?<=^version = ")[^"]*' sgl-kernel/pyproject.toml)
	SGL_KERNEL_VERSION_FROM_SRT=$(grep -Po -m1 '(?<=sgl-kernel==)[0-9A-Za-z\.\-]+' python/pyproject.toml)
	echo "SGL_KERNEL_VERSION_FROM_KERNEL=${SGL_KERNEL_VERSION_FROM_KERNEL} SGL_KERNEL_VERSION_FROM_SRT=${SGL_KERNEL_VERSION_FROM_SRT}"

	if [ "${CUSTOM_BUILD_SGL_KERNEL:-}" = "true" ]; then
	    if [ ! -d "sgl-kernel/dist" ]; then
	        # CUSTOM_BUILD_SGL_KERNEL was set but artifacts not available (e.g., stage rerun without wheel build)
	        # Fail instead of falling back to PyPI - we need to test the built kernel, not PyPI version
	        echo "ERROR: CUSTOM_BUILD_SGL_KERNEL=true but sgl-kernel/dist not found."
	        echo "This usually happens when rerunning a stage without the sgl-kernel-build-wheels job."
	        echo "Please re-run the full workflow using /tag-and-rerun-ci to rebuild the kernel."
	        exit 1
	    fi

	    ls -alh sgl-kernel/dist
	    # Determine wheel architecture
	    case "$ARCH" in
	        aarch64 | arm64) WHEEL_ARCH="aarch64" ;;
	        *) WHEEL_ARCH="x86_64" ;;
	    esac
	    # Install the locally built wheel, using the version from sgl-kernel/pyproject.toml
	    $PIP_CMD install "sgl-kernel/dist/sgl_kernel-${SGL_KERNEL_VERSION_FROM_KERNEL}-cp310-abi3-manylinux2014_${WHEEL_ARCH}.whl" --force-reinstall $PIP_INSTALL_SUFFIX
	elif [ "$IS_BLACKWELL" = "1" ]; then
	    # On Blackwell machines, skip reinstall if correct version already installed to avoid race conditions
	    INSTALLED_SGL_KERNEL=$(pip show sgl-kernel 2>/dev/null | awk '/^Version:/ {print $2}' || echo "")
	    if [ "$INSTALLED_SGL_KERNEL" = "$SGL_KERNEL_VERSION_FROM_SRT" ]; then
	        echo "sgl-kernel==${SGL_KERNEL_VERSION_FROM_SRT} already installed, skipping reinstall"
	    else
	        echo "Installing sgl-kernel==${SGL_KERNEL_VERSION_FROM_SRT} (current: ${INSTALLED_SGL_KERNEL:-none})"
	        $PIP_CMD install "sgl-kernel==${SGL_KERNEL_VERSION_FROM_SRT}" $PIP_INSTALL_SUFFIX
	    fi
	else
	    # Everywhere else, always force-reinstall the pinned release
	    $PIP_CMD install "sgl-kernel==${SGL_KERNEL_VERSION_FROM_SRT}" --force-reinstall $PIP_INSTALL_SUFFIX
	fi

	# Show current packages
	$PIP_CMD list

	# Install other python dependencies.
	# "huggingface_hub[hf_xet]" is quoted so the shell can never interpret
	# [hf_xet] as a glob character class and expand it against local files.
	$PIP_CMD install mooncake-transfer-engine==0.3.9 "${NVRTC_SPEC}" py-spy scipy "huggingface_hub[hf_xet]" pytest $PIP_INSTALL_SUFFIX

	if [ "$IS_BLACKWELL" != "1" ]; then
	    # For lmms_evals evaluating MMMU
	    # Remove a checkout left behind by an earlier run in the same working
	    # directory; git clone refuses to clone into an existing non-empty directory.
	    rm -rf lmms-eval
	    git clone --branch v0.5 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
	    $PIP_CMD install -e lmms-eval/ $PIP_INSTALL_SUFFIX
	fi

	#######################################
	# Install an exact version of a python package.
	# On Blackwell machines (IS_BLACKWELL=1) the install is skipped when that
	# version is already present, to avoid race conditions; otherwise the package
	# is force-reinstalled.
	# Globals:   IS_BLACKWELL, PIP_CMD, PIP_INSTALL_SUFFIX (read)
	# Arguments: $1 - package name
	#            $2 - required version
	# Outputs:   a notice on stdout when the install is skipped
	# Returns:   status of the install command, 0 when skipped
	#######################################
	install_pinned_package() {
	    local name=$1
	    local version=$2
	    local installed

	    if [ "$IS_BLACKWELL" = "1" ]; then
	        installed=$(pip show "$name" 2>/dev/null | awk '/^Version:/ {print $2}' || echo "")
	        if [ "$installed" = "$version" ]; then
	            echo "${name}==${version} already installed, skipping reinstall"
	        else
	            # PIP_CMD / PIP_INSTALL_SUFFIX are expanded unquoted so they word-split
	            $PIP_CMD install "${name}==${version}" $PIP_INSTALL_SUFFIX
	        fi
	    else
	        $PIP_CMD install "${name}==${version}" --force-reinstall $PIP_INSTALL_SUFFIX
	    fi
	}

	# DeepEP depends on nvshmem 3.4.5
	install_pinned_package nvidia-nvshmem-cu12 3.4.5

	# Cudnn with version less than 9.16.0.29 will cause performance regression on Conv3D kernel
	install_pinned_package nvidia-cudnn-cu12 9.16.0.29

	# Remove xformers with the same uninstall command and suffix used for the other
	# uninstalls, so the pip variant runs non-interactively (-y). Failures are ignored.
	$PIP_UNINSTALL_CMD xformers $PIP_UNINSTALL_SUFFIX || true

	# Install flashinfer-jit-cache with caching and retry logic (flashinfer.ai can have transient DNS issues)
	# The jit-cache wheel is 1.2+ GB, so we skip the download entirely if already installed.
	FLASHINFER_INSTALLED=false
	if [ "$UNINSTALL_JIT_CACHE" = false ]; then
	    FLASHINFER_INSTALLED=true
	    echo "flashinfer-jit-cache already at correct version, skipping download"
	fi

	if [ "$FLASHINFER_INSTALLED" = false ]; then
	    # Cache directory for flashinfer wheels (persists across CI runs on self-hosted runners)
	    FLASHINFER_CACHE_DIR="${HOME}/.cache/flashinfer-wheels"
	    mkdir -p "${FLASHINFER_CACHE_DIR}"

	    # Clean up old versions to avoid cache bloat: delete every cached jit-cache
	    # wheel whose name does not start with the required version.
	    find "${FLASHINFER_CACHE_DIR}" -name "flashinfer_jit_cache-*.whl" ! -name "flashinfer_jit_cache-${FLASHINFER_VERSION}*" -type f -delete 2>/dev/null || true

	    FLASHINFER_WHEEL_PATTERN="flashinfer_jit_cache-${FLASHINFER_VERSION}*.whl"
	    # -print -quit stops at the first match, so no pipe through head is needed
	    CACHED_WHEEL=$(find "${FLASHINFER_CACHE_DIR}" -name "${FLASHINFER_WHEEL_PATTERN}" -type f -print -quit 2>/dev/null)

	    # Try to install from cache first
	    if [ -n "$CACHED_WHEEL" ] && [ -f "$CACHED_WHEEL" ]; then
	        echo "Found cached flashinfer wheel: $CACHED_WHEEL"
	        if $PIP_CMD install "$CACHED_WHEEL" $PIP_INSTALL_SUFFIX; then
	            FLASHINFER_INSTALLED=true
	            echo "Successfully installed flashinfer-jit-cache from cache"
	        else
	            # Drop the unusable wheel so the download below starts clean
	            echo "Failed to install from cache, will try downloading..."
	            rm -f "$CACHED_WHEEL"
	        fi
	    fi

	    # If not installed from cache, download with retry logic
	    if [ "$FLASHINFER_INSTALLED" = false ]; then
	        FLASHINFER_MAX_ATTEMPTS=5
	        for ((attempt = 1; attempt <= FLASHINFER_MAX_ATTEMPTS; attempt++)); do
	            # Download wheel to cache directory (use pip directly as uv pip doesn't support download)
	            # Timeout after 10 minutes — the wheel is ~1.2 GB
	            if timeout 600 pip download "flashinfer-jit-cache==${FLASHINFER_VERSION}" \
	                --index-url "https://flashinfer.ai/whl/${CU_VERSION}" \
	                -d "${FLASHINFER_CACHE_DIR}"; then

	                CACHED_WHEEL=$(find "${FLASHINFER_CACHE_DIR}" -name "${FLASHINFER_WHEEL_PATTERN}" -type f -print -quit 2>/dev/null)
	                if [ -n "$CACHED_WHEEL" ] && [ -f "$CACHED_WHEEL" ]; then
	                    if $PIP_CMD install "$CACHED_WHEEL" $PIP_INSTALL_SUFFIX; then
	                        FLASHINFER_INSTALLED=true
	                        echo "Successfully downloaded and installed flashinfer-jit-cache"
	                        break
	                    fi
	                else
	                    echo "Warning: Download succeeded but wheel file not found"
	                fi
	            fi

	            # Only wait when another attempt will actually follow
	            if ((attempt < FLASHINFER_MAX_ATTEMPTS)); then
	                echo "Attempt $attempt to download flashinfer-jit-cache failed, retrying in 10 seconds..."
	                sleep 10
	            else
	                echo "Attempt $attempt to download flashinfer-jit-cache failed"
	            fi
	        done
	    fi
	fi

	if [ "$FLASHINFER_INSTALLED" = false ]; then
	    echo "ERROR: Failed to install flashinfer-jit-cache after 5 attempts"
	    exit 1
	fi

	# Run the cubin download script (it downloads flashinfer cubins if the local set is incomplete)
	bash "$SCRIPT_DIR/ci_download_flashinfer_cubin.sh"

	# Print the final package list and the CUDA version reported by torch
	${PIP_CMD} list
	python3 -c 'import torch; print(torch.version.cuda)'

	# Hand over to the runner preparation script (cleanup HuggingFace cache, etc.)
	bash "$SCRIPT_DIR/prepare_runner.sh"