# Base image from NGC TensorRT-LLM, which includes a pre-installed TensorRT-LLM.
# For available images, visit: https://nvidia.github.io/TensorRT-LLM/installation/containers.html
# Use TRTLLM_BASE_IMAGE to specify the base image (default: release:1.3.0rc4)
ARG TRTLLM_BASE_IMAGE=nvcr.io/nvidia/tensorrt-llm/release:1.3.0rc4
FROM ${TRTLLM_BASE_IMAGE}

# ==============================================================================
# Install Megatron dependencies
# ==============================================================================
# DeepEP is required for IBGDA support.
# Clone and build gdrcopy and deepep-nvshmem dependencies.
WORKDIR /home/dpsk_a2a

# Everything below runs in a single layer so that the cloned source trees are
# removed in the same layer that created them. The step is written without
# bash-only builtins (pushd/popd) so it also works when RUN uses /bin/sh.
#   1. gdrcopy: build and install only the library into /usr/local.
#   2. nvshmem: install the pinned wheel and add an unversioned
#      libnvshmem_host.so symlink next to libnvshmem_host.so.3.
#   3. DeepEP: clone v1.2.1, apply the Megatron-LM core_v0.15.0 patch and build
#      for the architectures listed in TORCH_CUDA_ARCH_LIST.
# The exported NVSHMEM_DIR / LD_LIBRARY_PATH / PATH values exist only for the
# duration of this RUN; they are not persisted into the image environment.
# NOTE(review): NVSHMEM_DIR hard-codes the python3.12 dist-packages path; confirm
# it matches the interpreter shipped in the selected base image.
# NOTE(review): deepep.patch is downloaded without checksum verification.
RUN git clone -b v2.5.1 https://github.com/NVIDIA/gdrcopy.git && \
    make -C gdrcopy prefix=/usr/local lib_install && \
    rm -rf gdrcopy && \
    pip install --no-cache-dir nvidia-nvshmem-cu13==3.3.20 && \
    export NVSHMEM_DIR=/usr/local/lib/python3.12/dist-packages/nvidia/nvshmem && \
    export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH" && \
    export PATH="${NVSHMEM_DIR}/bin:$PATH" && \
    ln -sf libnvshmem_host.so.3 "${NVSHMEM_DIR}/lib/libnvshmem_host.so" && \
    git clone -b v1.2.1 https://github.com/deepseek-ai/DeepEP.git && \
    ( cd DeepEP && \
      wget https://raw.githubusercontent.com/NVIDIA/Megatron-LM/refs/tags/core_v0.15.0/docker/patches/deepep.patch && \
      patch -p1 < deepep.patch && \
      TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" python setup.py install ) && \
    rm -rf DeepEP

# Install Python dependencies
# trl and Megatron-LM are installed with --no-deps so that pip does not pull in
# (or replace) any of their declared dependencies.
RUN pip3 install --no-cache-dir --no-deps trl && \
    pip3 install --no-cache-dir nvtx matplotlib liger_kernel cachetools && \
    pip install --no-cache-dir -U git+https://github.com/ISEEKYAN/mbridge.git && \
    pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0

# ==============================================================================
# Install verl dependencies
# ==============================================================================
# verl is installed (first plain, then with the "mcore" extra) and uninstalled
# again each time, so only its dependencies remain in the image.
# NOTE(review): presumably verl itself is provided separately at runtime
# (e.g. mounted or installed from source) - confirm.
# The original command order is kept, but in one layer so the uninstalled
# package does not linger in an intermediate layer.
RUN pip install --no-cache-dir git+https://github.com/volcengine/verl.git@v0.7.0 && \
    pip uninstall -y verl && \
    pip install --no-cache-dir "verl[mcore] @ git+https://github.com/volcengine/verl.git@v0.7.0" && \
    pip uninstall -y verl

# ==============================================================================
# Install a specific TensorRT-LLM on demand
# ==============================================================================
# Note: The NGC image already includes a pre-installed TensorRT-LLM, but you can install a specific version if needed.
# Refer to https://nvidia.github.io/TensorRT-LLM/installation/index.html for more details.