# syntax=docker/dockerfile:1
FROM lmsysorg/sglang:v0.5.9-cu130

# NOTE(review): ARCH is declared but never referenced in this file — confirm it
# is consumed elsewhere (e.g. --build-arg plumbing in CI) or remove it.
ARG ARCH=arm64

# Pin transformers/huggingface-hub; --no-deps avoids clobbering the base
# image's dependency pins. The cache mount keeps downloaded wheels on the
# build host instead of baking the pip cache into the layer (DL3042).
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1"
# Build and install flashinfer from source with the fp4 quantization fix.
# Keep this aligned with the v0.5.9 base image's flashinfer_python version.
#
# The clone lives in a cache mount, so rebuilds skip the network fetch; the
# unconditional `git checkout v0.6.3` re-pins the cached tree to the tag
# before the patches are reapplied, keeping re-runs deterministic. `cd` (not
# WORKDIR) is intentional: the path only exists while the mount is attached.
RUN --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/sgl-workspace/flashinfer-build \
    bash -c 'if [ ! -d /sgl-workspace/flashinfer-build/flashinfer ]; then git clone https://github.com/flashinfer-ai/flashinfer.git --recursive /sgl-workspace/flashinfer-build/flashinfer; fi' && \
    cd /sgl-workspace/flashinfer-build/flashinfer && \
    git checkout v0.6.3 && \
    # cherry-pick requires a committer identity; these values are throwaway
    git config user.email "build@example.com" && \
    git config user.name "Build" && \
    # `|| true` keeps re-runs idempotent when the cached clone already has the remote
    git remote add nvjullin https://github.com/nvjullin/flashinfer 2>/dev/null || true && \
    git fetch nvjullin fix-fp4-quant-padding && \
    # fp4 quantization padding fix not yet present in the v0.6.3 tag
    git cherry-pick ce48d4fb 10307340 && \
    cd flashinfer-jit-cache && \
    # NOTE(review): arch list 10.0a/10.3a targets Blackwell-class GPUs only —
    # confirm this matches the deployment fleet before relying on this image.
    MAX_JOBS=32 FLASHINFER_NVCC_THREADS=2 FLASHINFER_CUDA_ARCH_LIST="10.0a 10.3a" python -m build --no-isolation --skip-dependency-check --wheel && \
    python -m pip install dist/*.whl