File size: 2,680 Bytes

1faccd4

# Base image from NGC TensorRT-LLM, which ships with TensorRT-LLM pre-installed.
# For available images, visit: https://nvidia.github.io/TensorRT-LLM/installation/containers.html
# Override the base image at build time with:
#   --build-arg TRTLLM_BASE_IMAGE=<image>   (default: release:1.3.0rc4)
# The ARG is declared before FROM, so it is only usable in the FROM line below.
ARG TRTLLM_BASE_IMAGE=nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc4
FROM ${TRTLLM_BASE_IMAGE}


# ==============================================================================
# Install Megatron dependencies
# ==============================================================================
# DeepEP is required for IBGDA support.
# Clone and build gdrcopy and deepep-nvshmem dependencies.
# Build workspace for the DeepEP toolchain; later instructions inherit this WORKDIR.
WORKDIR /home/dpsk_a2a
# Everything happens in ONE layer so that source trees removed at the end never
# persist in the image:
#   1. gdrcopy v2.5.1   - install the library under /usr/local, then drop the checkout.
#   2. NVSHMEM (pip)    - pinned wheel; the NVSHMEM_DIR / LD_LIBRARY_PATH / PATH exports
#                         below only live for the duration of this RUN. An unversioned
#                         libnvshmem_host.so symlink is created next to the .so.3 file
#                         (presumably so the DeepEP link step can resolve it - confirm).
#   3. DeepEP v1.2.1    - patched with the Megatron-LM core_v0.15.0 deepep.patch and
#                         built for the CUDA architectures in TORCH_CUDA_ARCH_LIST.
# NOTE: pushd/popd are bash builtins, so this step relies on the build shell being bash.
# NOTE: the clone directory is "DeepEP" (case-sensitive); the cleanup must use that name.
RUN git clone -b v2.5.1 https://github.com/NVIDIA/gdrcopy.git && \
    pushd gdrcopy && \
    make prefix=/usr/local lib_install && \
    popd && rm -rf gdrcopy && \
    pip install --no-cache-dir nvidia-nvshmem-cu13==3.3.20 && \
    export NVSHMEM_DIR=/usr/local/lib/python3.12/dist-packages/nvidia/nvshmem && \
    export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH" && \
    export PATH="${NVSHMEM_DIR}/bin:$PATH" && \
    pushd ${NVSHMEM_DIR}/lib && \
    ln -sf libnvshmem_host.so.3 libnvshmem_host.so && \
    popd && \
    git clone -b v1.2.1 https://github.com/deepseek-ai/DeepEP.git && \
    pushd DeepEP && \
    wget https://raw.githubusercontent.com/NVIDIA/Megatron-LM/refs/tags/core_v0.15.0/docker/patches/deepep.patch && \
    patch -p1 < deepep.patch && \
    TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" python setup.py install && \
    popd && rm -rf DeepEP

# Python packages layered on top of the base image, all without keeping pip's cache:
#   - trl and Megatron-LM (tag core_v0.15.0) are installed with --no-deps, so pip
#     does not pull in or change their dependencies.
#   - the utility packages are installed with normal dependency resolution.
#   - mbridge is installed/upgraded from the head of its git repository.
RUN pip3 install --no-deps --no-cache-dir trl \
 && pip3 install --no-cache-dir \
        cachetools \
        liger_kernel \
        matplotlib \
        nvtx \
 && pip install -U --no-cache-dir git+https://github.com/ISEEKYAN/mbridge.git \
 && pip install --no-cache-dir --no-deps git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0


# ==============================================================================
# Install verl dependencies
# ==============================================================================
# Install the dependency set of verl v0.7.0 (first the base package, then the
# "mcore" extra) and uninstall the verl package itself after each step, leaving
# only its dependencies in the image.
# NOTE(review): presumably verl is supplied separately at runtime (e.g. from a
# source checkout) - confirm.
# Done in a single layer with --no-cache-dir so that neither the temporarily
# installed verl package nor pip's download cache persists in the image.
RUN pip install --no-cache-dir git+https://github.com/volcengine/verl.git@v0.7.0 && \
    pip uninstall -y verl && \
    pip install --no-cache-dir "verl[mcore] @ git+https://github.com/volcengine/verl.git@v0.7.0" && \
    pip uninstall -y verl


# ==============================================================================
# Install a specific TensorRT-LLM on demand
# ==============================================================================
# Note: The NGC image already includes a pre-installed TensorRT-LLM, but you can install a specific version if needed.
# Refer to https://nvidia.github.io/TensorRT-LLM/installation/index.html for more details.