File size: 4,908 Bytes
61ba51e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/bin/bash
# -e: abort on the first failing command; -x: trace every command as it runs.
set -ex

# PYTHON_VERSION and CUDA_VERSION are mandatory, ARCH is optional.
[ $# -ge 2 ] || {
  echo "Usage: $0 <PYTHON_VERSION> <CUDA_VERSION> [ARCH]"
  exit 1
}

PYTHON_VERSION="$1"          # e.g. 3.10
CUDA_VERSION="$2"            # e.g. 12.9
# Optional third argument overrides the detected machine architecture.
# Detection uses `uname -m`: `uname -i` is documented as non-portable and
# prints "unknown" on some distributions, which would end up in ARCH.
ARCH="${3:-$(uname -m)}"

# aarch64 has a dedicated builder image; every other value falls back to the
# manylinux2_28 builder.
case "${ARCH}" in
  aarch64)
    BASE_IMG="pytorch/manylinuxaarch64-builder"
    ;;
  *)
    BASE_IMG="pytorch/manylinux2_28-builder"
    ;;
esac

# Persistent build caches are kept under the home directory so that they
# survive workspace cleanups and fresh checkouts.
CACHE_DIR="${HOME}/.cache/sgl-kernel"
BUILDX_CACHE_DIR="${CACHE_DIR}/buildx"   # docker buildx layer cache
CCACHE_HOST_DIR="${CACHE_DIR}/ccache"    # mounted into the build container
cache_dirs=("${BUILDX_CACHE_DIR}" "${CCACHE_HOST_DIR}")
mkdir -p "${cache_dirs[@]}"

# Cache export requires a buildx builder that uses the docker-container
# driver: reuse the named builder when it exists, otherwise create it.
BUILDER_NAME="sgl-kernel-builder"
if docker buildx inspect "${BUILDER_NAME}" &>/dev/null; then
  docker buildx use "${BUILDER_NAME}"
else
  docker buildx create --name "${BUILDER_NAME}" --driver docker-container --use --bootstrap
fi

# Python tag with the dots removed from the version, e.g. 3.10 -> cp310-cp310.
py_nodot="${PYTHON_VERSION//./}"
PY_TAG="cp${py_nodot}-cp${py_nodot}"

# Wheels are collected in this directory (relative to the current directory).
DIST_DIR="dist"
mkdir -p "${DIST_DIR}"

# Print the effective build configuration before doing any work.
# BUILD_JOBS, NVCC_THREADS and USE_CCACHE are shown as "auto", "32" and "1"
# respectively when they are unset or empty.
printf '%s\n' \
  "----------------------------------------" \
  "Build configuration" \
  "PYTHON_VERSION: ${PYTHON_VERSION}" \
  "CUDA_VERSION:   ${CUDA_VERSION}" \
  "ARCH:           ${ARCH}" \
  "BASE_IMG:       ${BASE_IMG}" \
  "PYTHON_TAG:     ${PY_TAG}" \
  "Output:         ${DIST_DIR}/" \
  "Buildx cache:   ${BUILDX_CACHE_DIR}" \
  "ccache dir:     ${CCACHE_HOST_DIR}" \
  "Builder:        ${BUILDER_NAME}" \
  "BUILD_JOBS:     ${BUILD_JOBS:-auto}" \
  "NVCC_THREADS:   ${NVCC_THREADS:-32}" \
  "USE_CCACHE:     ${USE_CCACHE:-1}" \
  "----------------------------------------"

# Optional build-args: each of these environment variables is forwarded to
# the image build as --build-arg NAME=value only when it is set and non-empty.
BUILD_ARGS=()
for opt_name in ENABLE_CMAKE_PROFILE ENABLE_BUILD_PROFILE USE_CCACHE BUILD_JOBS NVCC_THREADS; do
  opt_value="${!opt_name:-}"   # indirect lookup of the variable named by opt_name
  if [ -n "${opt_value}" ]; then
    BUILD_ARGS+=(--build-arg "${opt_name}=${opt_value}")
  fi
done

# ---- Step 1: Build deps image (layer cached, fast on repeat) ----
# Builds the "deps" target of ./Dockerfile with the named buildx builder,
# importing and exporting the layer cache from/to BUILDX_CACHE_DIR, and loads
# the result into the local image store under DEPS_TAG.
DEPS_TAG="sgl-kernel-deps:cuda${CUDA_VERSION}-${PY_TAG}-${ARCH}"

# The cache specs are quoted so that a cache path containing whitespace or
# glob characters is passed to docker as a single argument.
docker buildx build \
  --builder "${BUILDER_NAME}" \
  -f Dockerfile . \
  --build-arg BASE_IMG="${BASE_IMG}" \
  --build-arg CUDA_VERSION="${CUDA_VERSION}" \
  --build-arg ARCH="${ARCH}" \
  --build-arg PYTHON_VERSION="${PYTHON_VERSION}" \
  --build-arg PYTHON_TAG="${PY_TAG}" \
  "${BUILD_ARGS[@]}" \
  --cache-from "type=local,src=${BUILDX_CACHE_DIR}" \
  --cache-to "type=local,dest=${BUILDX_CACHE_DIR},mode=max" \
  --target deps \
  --load \
  -t "${DEPS_TAG}" \
  --network=host

echo "Deps image ready: ${DEPS_TAG}"

# ---- Step 2: Build wheel with host-mounted ccache ----
# The wheel is built inside the deps image with the current directory mounted
# at /sgl-kernel and the host ccache directory mounted at /ccache, which
# allows ccache to persist on the host filesystem across builds.
CCACHE_FLAG="${USE_CCACHE:-1}"
BUILD_JOBS_FLAG="${BUILD_JOBS:-0}"
NVCC_THREADS_FLAG="${NVCC_THREADS:-32}"

# The in-container script is one fully single-quoted string: no host value is
# spliced into its text. Host settings are handed over as positional
# parameters ($1..$3; "build-wheel" becomes $0) and ARCH is read from the
# container environment, where it is set by -e ARCH.
docker run --rm \
  --network=host \
  -v "$(pwd):/sgl-kernel" \
  -v "${CCACHE_HOST_DIR}:/ccache" \
  -w /sgl-kernel \
  -e ARCH="${ARCH}" \
  "${DEPS_TAG}" \
  bash -c '
set -eux

USE_CCACHE="$1"
BUILD_JOBS="$2"
NVCC_THREADS="$3"

if [ "${USE_CCACHE}" = "1" ]; then
  export CCACHE_DIR=/ccache
  export CCACHE_BASEDIR=/sgl-kernel
  export CCACHE_MAXSIZE=10G
  export CCACHE_COMPILERCHECK=content
  export CCACHE_COMPRESS=true
  export CCACHE_SLOPPINESS=file_macro,time_macros,include_file_mtime,include_file_ctime
  export CMAKE_C_COMPILER_LAUNCHER=ccache
  export CMAKE_CXX_COMPILER_LAUNCHER=ccache
  export CMAKE_CUDA_COMPILER_LAUNCHER=ccache
  echo "=== ccache stats (before) ==="
  ccache -sV
fi

if [ "${ARCH}" = "aarch64" ]; then
  export CUDA_NVCC_FLAGS="-Xcudafe --threads=8"
  export MAKEFLAGS="-j8"
  export CMAKE_BUILD_PARALLEL_LEVEL=2
  export NINJAFLAGS="-j4"
  echo "ARM detected: Using extra conservative settings (2 parallel jobs)"
elif [ "${BUILD_JOBS}" -gt 0 ] 2>/dev/null; then
  # An explicit positive BUILD_JOBS wins; a non-numeric value fails the test
  # quietly and falls through to the computed default below.
  export CMAKE_BUILD_PARALLEL_LEVEL="${BUILD_JOBS}"
else
  # Default: two thirds of the available cores, clamped to the range 1..64.
  # The lower bound keeps a single-core host from ending up with 0 jobs.
  default_jobs=$(( $(nproc) * 2 / 3 ))
  if [ "${default_jobs}" -gt 64 ]; then default_jobs=64; fi
  if [ "${default_jobs}" -lt 1 ]; then default_jobs=1; fi
  export CMAKE_BUILD_PARALLEL_LEVEL="${default_jobs}"
fi

export CMAKE_ARGS="${CMAKE_ARGS:-} -DSGL_KERNEL_COMPILE_THREADS=${NVCC_THREADS}"
echo "Build parallelism: CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL}, NVCC_THREADS=${NVCC_THREADS}"

"${PYTHON_ROOT_PATH}/bin/python" -m uv build --wheel -Cbuild-dir=build . --color=always --no-build-isolation
./rename_wheels.sh

if [ "${USE_CCACHE}" = "1" ]; then
  echo "=== ccache stats (after) ==="
  ccache -s
fi
' build-wheel "${CCACHE_FLAG}" "${BUILD_JOBS_FLAG}" "${NVCC_THREADS_FLAG}"

printf '%s\n' "Done. Wheels are in ${DIST_DIR}/"
# Best-effort listing: a missing wheel must not fail the script.
ls -lh "${DIST_DIR}"/*.whl 2>/dev/null || :