NeMo
Megatron-LM / docker /Dockerfile.ci.dev
KexuanShi's picture
Upload folder using huggingface_hub
88e6849 verified
Raw
History Blame Contribute Delete
3.83 kB
# syntax=docker/dockerfile:1.3-labs
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# NOTE: the `# syntax=` parser directive has to stay ahead of every other
# comment, blank line, and instruction; otherwise Docker treats it as a plain
# comment and ignores the requested frontend.

# Base image; it has no default, so it must be supplied with
# `--build-arg FROM_IMAGE_NAME=...`.
ARG FROM_IMAGE_NAME
FROM ${FROM_IMAGE_NAME} AS main

# Override any pip constraints file inherited from the base image with an
# empty value (presumably the base image sets one -- TODO confirm).
ENV PIP_CONSTRAINT=""
# Persisted in the image environment, so apt-get stays non-interactive in
# later build steps and in containers started from this image.
ENV DEBIAN_FRONTEND=noninteractive

# Versions of the tools downloaded by the tool-install step of this stage.
ARG UV_VERSION=0.7.2
ARG YQ_VERSION=4.44.1
# Presumably where the uv installer places its binaries -- TODO confirm.
ENV PATH="/root/.local/bin:$PATH"

# Location of the uv-managed virtual environment. It is exposed both as a
# build argument (overridable) and as environment variables, and its bin/
# directory is placed first on PATH.
ARG UV_PROJECT_ENVIRONMENT=/opt/venv
ENV UV_PROJECT_ENVIRONMENT=${UV_PROJECT_ENVIRONMENT}
ENV VIRTUAL_ENV=$UV_PROJECT_ENVIRONMENT
ENV PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH"
ENV UV_LINK_MODE=copy
# Install OS packages, create the /opt/jet virtual environment, and download
# the yq and uv command-line tools.
#
# `-o pipefail` is required for the final `curl ... | sh`: without it the
# pipeline's status is that of `sh` alone, so a failed download of the uv
# installer would let this step succeed with no uv installed.
RUN bash -ex -o pipefail <<"EOF"
apt-get update
apt-get install -y --no-install-recommends \
    gettext \
    psmisc \
    python3-venv \
    uuid-runtime
apt-get clean
# Drop the package index in the same layer that created it.
rm -rf /var/lib/apt/lists/*

python -m venv /opt/jet

# Map the kernel architecture name onto the suffix used by yq release assets.
ARCH=$(uname -m)
case "${ARCH}" in
    "x86_64") YQ_ARCH=amd64 ;;
    "aarch64") YQ_ARCH=arm64 ;;
    "armv7l") YQ_ARCH=arm ;;
    *) echo "Unsupported architecture: ${ARCH}" && exit 1 ;;
esac
wget "https://github.com/mikefarah/yq/releases/download/v${YQ_VERSION}/yq_linux_${YQ_ARCH}" -O /usr/local/bin/yq
chmod a+x /usr/local/bin/yq

# Version-pinned uv installer script.
curl -LsSf "https://astral.sh/uv/${UV_VERSION}/install.sh" | sh
EOF
# Copy only the project metadata, the lock file, and the two megatron.core
# files listed here into /workspace, ahead of the dependency installation.
COPY README.md \
     pyproject.toml \
     uv.lock \
     /workspace/
COPY megatron/core/__init__.py \
     megatron/core/package_info.py \
     /workspace/megatron/core/
# IMAGE_TYPE names the project extra that is installed together with `mlm`.
ARG IMAGE_TYPE=dev

# Create the virtual environment (with access to system site-packages) and
# install the locked dependency set, using a build-cache mount for uv's cache.
RUN --mount=type=cache,target=/root/.cache/uv \
    bash -ex <<"EOF"
export NVTE_CUDA_ARCHS="80;90;100"

uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
uv sync --only-group build

# Packages that uv must not install (presumably already provided through the
# system site-packages of the base image -- TODO confirm).
excluded_packages=(
    torch
    torchvision
    triton
    transformer-engine-cu12
    nvidia-cublas-cu12
    nvidia-cuda-cupti-cu12
    nvidia-cuda-nvrtc-cu12
    nvidia-cuda-runtime-cu12
    nvidia-cudnn-cu12
    nvidia-cufft-cu12
    nvidia-cufile-cu12
    nvidia-curand-cu12
    nvidia-cusolver-cu12
    nvidia-cusparse-cu12
    nvidia-cusparselt-cu12
    nvidia-nccl-cu12
)

# Expand every entry into a `--no-install-package <name>` pair, in list order.
exclude_flags=()
for package in "${excluded_packages[@]}"; do
    exclude_flags+=(--no-install-package "${package}")
done

uv sync --extra ${IMAGE_TYPE} --extra mlm --link-mode copy --locked "${exclude_flags[@]}"
EOF
# Install DeepEP
# Builds DeepEP from the `hybrid-ep` branch at a pinned commit, with the local
# patch applied, after installing the NVSHMEM wheel it is built against.
COPY docker/patches/deepep.patch /workspace/deepep.patch
RUN bash -ex <<"EOF"
cd /workspace

# NOTE(review): this is the cu13 NVSHMEM wheel, while the `uv sync` step of
# this file excludes *-cu12 packages -- confirm it matches the CUDA major
# version of the base image.
uv pip install nvidia-nvshmem-cu13==3.4.5

# Ask the virtual environment's interpreter for its site-packages directory
# rather than hard-coding /opt/venv and the Python minor version, so the step
# keeps working when UV_PROJECT_ENVIRONMENT is overridden or the base image
# ships a different Python.
SITE_PACKAGES="$("${UV_PROJECT_ENVIRONMENT}/bin/python" -c 'import sysconfig; print(sysconfig.get_path("purelib"))')"

# The wheel provides libnvshmem_host.so.3; add the unversioned name next to it
# (presumably needed when linking DeepEP -- TODO confirm).
pushd "${SITE_PACKAGES}/nvidia/nvshmem/lib/"
ln -s libnvshmem_host.so.3 libnvshmem_host.so
popd

git clone --branch hybrid-ep https://github.com/deepseek-ai/DeepEP.git
pushd DeepEP
# Pin to an exact commit so the patch applies to known sources.
git checkout 83e0d156807f31abed4ea55c2fa6eb4b62a11b82
patch -p1 < /workspace/deepep.patch
popd

TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" uv pip install --no-build-isolation -v DeepEP/.

# Remove the source checkout in the same layer that created it.
rm -rf DeepEP
EOF
# Copy the contents of the build context's assets/ directory to /opt/data/.
COPY assets/ /opt/data/

# Make uv use the interpreter of the project virtual environment.
ENV UV_PYTHON="${UV_PROJECT_ENVIRONMENT}/bin/python"
##### For NVIDIANS only #####
# Extends `main` with the JET tooling. The package index options are supplied
# through BuildKit secret mounts, so they are not stored in any image layer.
FROM main AS jet
ARG JET_API_VERSION
ENV PATH="$PATH:/opt/jet/bin"

# Shell tracing (-x) is deliberately off in both steps below: with xtrace on,
# bash would print the values read from /run/secrets into the build log.
RUN --mount=type=secret,id=JET_INDEX_URLS bash -e <<"EOF"
JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS)
python -m venv /opt/jet
# $JET_INDEX_URLS is intentionally unquoted so it can expand to several
# command-line options.
/opt/jet/bin/pip install --no-cache-dir $JET_INDEX_URLS \
    jet-api==$JET_API_VERSION
EOF

RUN --mount=type=secret,id=JET_INDEX_URLS \
    --mount=type=secret,id=LOGGER_INDEX_URL bash -e <<"EOF"
JET_INDEX_URLS=$(cat /run/secrets/JET_INDEX_URLS)
LOGGER_INDEX_URL=$(cat /run/secrets/LOGGER_INDEX_URL)
# The index variables are intentionally unquoted (see above).
uv pip install --no-cache-dir --upgrade $LOGGER_INDEX_URL "one-logger"
uv pip install --no-cache-dir --upgrade "setuptools<80.0.0,>=77.0.0"
uv pip install --no-cache-dir --upgrade $JET_INDEX_URLS "jet-client~=4.0"
EOF
###