# Builder image repository and CUDA toolkit version. They are declared before
# the first FROM so that they can be used to compose the base image reference.
ARG BASE_IMG=pytorch/manylinux2_28-builder
ARG CUDA_VERSION=12.9

# Dependency stage: install system deps, CMake, ccache, Python deps (including torch)
FROM ${BASE_IMG}:cuda${CUDA_VERSION} AS deps
# ---- Build arguments for the deps stage (override with --build-arg) ----

# Target architecture and toolchain versions.
# CUDA_VERSION is declared again here because an ARG placed before FROM is only
# visible to FROM lines, not inside the stage.
ARG ARCH=x86_64
ARG CUDA_VERSION=12.9
ARG CMAKE_VERSION_MAJOR=3.31
ARG CMAKE_VERSION_MINOR=1

# Python selection. PYTHON_TAG is the manylinux python path tag,
# e.g. cp310-cp310 / cp312-cp312.
ARG PYTHON_VERSION=3.10
ARG PYTHON_TAG=cp310-cp310

# ccache is built from source at CCACHE_VERSION for CUDA support
# (yum provides old 3.7.7). USE_CCACHE=1 enables that build.
ARG USE_CCACHE=1
ARG CCACHE_VERSION=4.12.1

# Hosts and package index used for downloads.
ARG GITHUB_ARTIFACTORY=github.com
ARG PYTORCH_MIRROR=download.pytorch.org
ARG PIP_DEFAULT_INDEX=https://pypi.python.org/simple
# Root of the Python installation selected by PYTHON_TAG.
ENV PYTHON_ROOT_PATH=/opt/python/${PYTHON_TAG}
# Put /opt/cmake/bin ahead of anything already on PATH.
ENV PATH=/opt/cmake/bin:${PATH}
# Prepend /lib64 to the loader search path. The ":+" form appends the separator
# only when a previous value exists, so an unset LD_LIBRARY_PATH does not leave
# a trailing ":" (an empty entry, which the dynamic loader reads as the current
# working directory).
ENV LD_LIBRARY_PATH=/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
# Progress prefix printed by Ninja during the build.
ENV NINJA_STATUS="[%f/%t %es] "
# CUDA architecture list exported to the build environment.
# NOTE(review): presumably consumed by the FlashInfer build - confirm.
ENV FLASHINFER_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a 12.0a"
# CUDA headers path: prepend the cccl include directory to the C and C++
# include search paths, keeping any value that is already set.
ENV CPLUS_INCLUDE_PATH=/usr/local/cuda/include/cccl${CPLUS_INCLUDE_PATH:+:${CPLUS_INCLUDE_PATH}}
ENV C_INCLUDE_PATH=/usr/local/cuda/include/cccl${C_INCLUDE_PATH:+:${C_INCLUDE_PATH}}
# Install build dependencies, then expose libibverbs under its unversioned
# name. "ln -f" replaces an existing libibverbs.so instead of failing the
# build when the name is already present. The yum cache is removed in the same
# layer so it does not end up in the image.
# NOTE(review): --nogpgcheck skips RPM signature verification; confirm it is
# still required for this base image.
RUN yum install -y --nogpgcheck \
        gcc \
        gcc-c++ \
        libibverbs \
        make \
        numactl-devel \
        tar \
        wget \
    && ln -sfv /usr/lib64/libibverbs.so.1 /usr/lib64/libibverbs.so \
    && yum clean all \
    && rm -rf /var/cache/yum
# Install CMake from the release tarball into /opt/cmake. The tarball is kept
# in a BuildKit cache mount so later builds can skip the download.
RUN --mount=type=cache,id=sgl-kernel-cmake,target=/cmake-downloads \
    set -eux; \
    CMAKE_RELEASE="${CMAKE_VERSION_MAJOR}.${CMAKE_VERSION_MINOR}"; \
    CMAKE_DIR="cmake-${CMAKE_RELEASE}-linux-${ARCH}"; \
    CMAKE_TARBALL="${CMAKE_DIR}.tar.gz"; \
    CACHED_TARBALL="/cmake-downloads/${CMAKE_TARBALL}"; \
    # Check if CMake is already cached
    if [ -f "${CACHED_TARBALL}" ]; then \
        echo "Using cached CMake from ${CACHED_TARBALL}"; \
    else \
        CMAKE_TARBALL_URL="https://${GITHUB_ARTIFACTORY}/Kitware/CMake/releases/download/v${CMAKE_RELEASE}/${CMAKE_TARBALL}"; \
        echo "Downloading CMake from: ${CMAKE_TARBALL_URL}"; \
        # Download to a temporary name inside the cache and rename it only after
        # wget succeeded. An interrupted or concurrent build can then never leave
        # a truncated tarball under the final name, which would break every later
        # build until the cache is purged.
        PARTIAL_TARBALL="$(mktemp "${CACHED_TARBALL}.XXXXXX")"; \
        wget --progress=dot -O "${PARTIAL_TARBALL}" "${CMAKE_TARBALL_URL}" \
            || { rm -f "${PARTIAL_TARBALL}"; exit 1; }; \
        mv -f "${PARTIAL_TARBALL}" "${CACHED_TARBALL}"; \
    fi; \
    # Extract straight from the cache mount; the tarball itself is never written
    # into the image layer.
    tar -xzf "${CACHED_TARBALL}" -C /opt; \
    mv "/opt/${CMAKE_DIR}" /opt/cmake; \
    cmake --version
# Build ccache from its source release and install it under /usr.
# The whole step is skipped unless USE_CCACHE is exactly "1".
RUN if [ "${USE_CCACHE}" != "1" ]; then \
        echo "Skipping ccache build (USE_CCACHE=${USE_CCACHE})"; \
        exit 0; \
    fi; \
    set -eux; \
    CCACHE_SRC="ccache-${CCACHE_VERSION}"; \
    cd /tmp; \
    wget --progress=dot "https://${GITHUB_ARTIFACTORY}/ccache/ccache/releases/download/v${CCACHE_VERSION}/${CCACHE_SRC}.tar.xz"; \
    tar -xf "${CCACHE_SRC}.tar.xz"; \
    mkdir "${CCACHE_SRC}/build"; \
    cd "${CCACHE_SRC}/build"; \
    cmake \
        -D CMAKE_BUILD_TYPE=Release \
        -D CMAKE_INSTALL_PREFIX=/usr \
        -D ENABLE_TESTING=OFF \
        -D REDIS_STORAGE_BACKEND=OFF \
        -D HTTP_STORAGE_BACKEND=OFF \
        -D ENABLE_DOCUMENTATION=OFF \
        ..; \
    make -j"$(nproc)"; \
    make install; \
    ccache --version; \
    # Remove the source tree and the downloaded archive in the same layer.
    rm -rf /tmp/"${CCACHE_SRC}"*
# Link the CUDA toolkit's stub libcuda.so into /usr/lib/<ARCH>-linux-gnu.
# The toolkit's target directory is named "sbsa" for aarch64 and uses the
# ARCH value unchanged otherwise.
RUN set -eux; \
    case "${ARCH}" in \
        aarch64) CUDA_TARGET=sbsa ;; \
        *)       CUDA_TARGET="${ARCH}" ;; \
    esac; \
    STUB_LINK_DIR="/usr/lib/${ARCH}-linux-gnu"; \
    mkdir -p "${STUB_LINK_DIR}/"; \
    ln -sf "/usr/local/cuda-${CUDA_VERSION}/targets/${CUDA_TARGET}-linux/lib/stubs/libcuda.so" \
        "${STUB_LINK_DIR}/libcuda.so"
# Install Python dependencies (torch + build tools) with the pip from
# PYTHON_ROOT_PATH. pip's download cache lives in a BuildKit cache mount.
RUN --mount=type=cache,id=sgl-kernel-pip,target=/root/.cache/pip \
    set -eux; \
    PIP="${PYTHON_ROOT_PATH}/bin/pip"; \
    # Every supported CUDA version uses the same torch release; only the wheel
    # index tag differs. Unlisted CUDA versions fall back to cu126.
    TORCH_VER=2.9.1; \
    case "${CUDA_VERSION}" in \
        13.0)      CU_TAG=cu130 ;; \
        12.8|12.9) CU_TAG=cu128 ;; \
        *)         CU_TAG=cu126 ;; \
    esac; \
    "${PIP}" install "torch==${TORCH_VER}" --index-url "https://${PYTORCH_MIRROR}/whl/${CU_TAG}"; \
    "${PIP}" install --index-url "${PIP_DEFAULT_INDEX}" \
        ninja \
        setuptools==75.0.0 \
        wheel==0.41.0 \
        numpy \
        uv \
        scikit-build-core
# Build stage: copy source and build wheel
FROM deps AS build
WORKDIR /sgl-kernel

# Copy the build context into the work directory. This happens after the deps
# stage, so source changes only invalidate layers from this point on.
COPY . .

# Declared again for this stage; both are read by the build step.
ARG ARCH=x86_64
ARG USE_CCACHE=1

# Optional: enable CMake/Ninja profiling (pass non-empty via --build-arg ENABLE_*)
ARG ENABLE_CMAKE_PROFILE
ARG ENABLE_BUILD_PROFILE

# Parallelism knobs (override via --build-arg)
#   BUILD_JOBS:   number of parallel compilation units (ninja -j)
#   NVCC_THREADS: per-compilation-unit NVCC --threads (multi-arch PTXAS)
ARG BUILD_JOBS=0
ARG NVCC_THREADS=32
# Build the wheel with uv and rename it with ./rename_wheels.sh.
#   - USE_CCACHE=1 routes the C, C++ and CUDA compilers through ccache, backed
#     by the /ccache cache mount.
#   - Parallelism: fixed at 2 on aarch64; otherwise BUILD_JOBS when it is a
#     positive integer; otherwise two thirds of the CPUs, between 1 and 64.
#   - ENABLE_CMAKE_PROFILE / ENABLE_BUILD_PROFILE (non-empty) write
#     cmake-profile.json / build-trace.json into /sgl-kernel.
RUN --mount=type=cache,id=sgl-kernel-ccache,target=/ccache \
    --mount=type=cache,id=sgl-kernel-pip,target=/root/.cache/pip \
    set -eux; \
    if [ "${USE_CCACHE}" = "1" ]; then \
        export CCACHE_DIR=/ccache; \
        export CCACHE_BASEDIR=/sgl-kernel; \
        export CCACHE_MAXSIZE=10G; \
        export CCACHE_COMPILERCHECK=content; \
        export CCACHE_COMPRESS=true; \
        export CCACHE_SLOPPINESS=file_macro,time_macros,include_file_mtime,include_file_ctime; \
        export CMAKE_C_COMPILER_LAUNCHER=ccache; \
        export CMAKE_CXX_COMPILER_LAUNCHER=ccache; \
        export CMAKE_CUDA_COMPILER_LAUNCHER=ccache; \
        ccache -sV; \
    fi; \
    # Setting these flags to reduce OOM chance only on ARM
    if [ "${ARCH}" = "aarch64" ]; then \
        export CUDA_NVCC_FLAGS="-Xcudafe --threads=2"; \
        export MAKEFLAGS="-j2"; \
        export CMAKE_BUILD_PARALLEL_LEVEL=2; \
        export NINJAFLAGS="-j2"; \
        echo "ARM detected: Using extra conservative settings (2 parallel jobs)"; \
    # A non-numeric BUILD_JOBS makes the test fail (its error is silenced) and
    # falls through to the automatic value.
    elif [ "${BUILD_JOBS}" -gt 0 ] 2>/dev/null; then \
        export CMAKE_BUILD_PARALLEL_LEVEL=${BUILD_JOBS}; \
    else \
        # Automatic value: two thirds of the CPUs, capped at 64. Integer division
        # yields 0 on a single-CPU builder, so the result is raised to at least 1.
        AUTO_JOBS=$(( $(nproc) * 2 / 3 )); \
        if [ "${AUTO_JOBS}" -gt 64 ]; then AUTO_JOBS=64; fi; \
        if [ "${AUTO_JOBS}" -lt 1 ]; then AUTO_JOBS=1; fi; \
        export CMAKE_BUILD_PARALLEL_LEVEL="${AUTO_JOBS}"; \
    fi; \
    export CMAKE_ARGS="${CMAKE_ARGS:-} -DSGL_KERNEL_COMPILE_THREADS=${NVCC_THREADS}"; \
    if [ -n "${ENABLE_CMAKE_PROFILE:-}" ]; then \
        echo "CMake profiling enabled - will save to /sgl-kernel/cmake-profile.json"; \
        export CMAKE_ARGS="${CMAKE_ARGS} --profiling-output=/sgl-kernel/cmake-profile.json --profiling-format=google-trace"; \
    fi; \
    echo "Build parallelism: CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL}, NVCC_THREADS=${NVCC_THREADS}"; \
    echo "CMAKE_ARGS=${CMAKE_ARGS}"; \
    ${PYTHON_ROOT_PATH}/bin/python -m uv build --wheel -Cbuild-dir=build . --color=always --no-build-isolation; \
    ./rename_wheels.sh; \
    # Optional: convert the Ninja log into a trace file for a trace viewer.
    if [ -n "${ENABLE_BUILD_PROFILE:-}" ] && [ -f /sgl-kernel/build/.ninja_log ]; then \
        echo "Ninja build profiling enabled - will save to /sgl-kernel/build-trace.json"; \
        wget --progress=dot https://raw.githubusercontent.com/cradleapps/ninjatracing/084212eaf68f25c70579958a2ed67fb4ec2a9ca4/ninjatracing -O /tmp/ninjatracing; \
        if [ -f /tmp/ninjatracing ]; then \
            ${PYTHON_ROOT_PATH}/bin/python /tmp/ninjatracing /sgl-kernel/build/.ninja_log > /sgl-kernel/build-trace.json; \
        fi; \
        if [ -f /sgl-kernel/build-trace.json ]; then \
            # Compression is best effort; a gzip failure does not fail the build.
            gzip -9 -k /sgl-kernel/build-trace.json 2>/dev/null || true; \
            echo "Build trace saved to: build-trace.json"; \
            if [ -f /sgl-kernel/build-trace.json.gz ]; then \
                ORIGINAL_SIZE=$(stat -c%s /sgl-kernel/build-trace.json); \
                COMPRESSED_SIZE=$(stat -c%s /sgl-kernel/build-trace.json.gz); \
                # Report the ratio only for a non-empty trace: dividing by a zero
                # size is a shell arithmetic error and would abort the build.
                if [ "${ORIGINAL_SIZE}" -gt 0 ]; then \
                    RATIO=$(( (ORIGINAL_SIZE - COMPRESSED_SIZE) * 100 / ORIGINAL_SIZE )); \
                    echo "Compressed to: build-trace.json.gz (${RATIO}% smaller)"; \
                fi; \
            fi; \
            echo ""; \
            echo "View in browser:"; \
            echo " - chrome://tracing (load JSON file)"; \
            echo " - ui.perfetto.dev (recommended, supports .gz files)"; \
            echo ""; \
            echo "Shows:"; \
            echo " - Compilation time per file"; \
            echo " - Parallelism utilization"; \
            echo " - Critical path (longest dependency chain)"; \
            echo " - Where the 2-hour build time went"; \
        fi; \
    fi; \
    if [ "${USE_CCACHE}" = "1" ]; then \
        echo "ccache Statistics"; \
        ccache -s; \
    else \
        echo "ccache disabled (USE_CCACHE=${USE_CCACHE})"; \
    fi
# Artifact stage (for --output to export wheel): an empty image that contains
# only the wheel files taken from the build stage's dist directory.
FROM scratch AS artifact
COPY --from=build /sgl-kernel/dist/*.whl /