|
|
# Base image, pinned by digest so every build starts from exactly the same layers.
FROM ghcr.io/swivid/f5-tts@sha256:0b80af1e176550ed5216046c590efdb986e0ec00686ab28127ae40eccf027ce4

# Versions of the components downloaded and built further down.
# Each one can be overridden with `--build-arg NAME=value`.
ARG GDRCOPY_VERSION=v2.5.1
ARG EFA_INSTALLER_VERSION=1.46.0
ARG AWS_OFI_NCCL_VERSION=1.17.2
ARG NCCL_VERSION=v2.27.7-1

# Open MPI prefix referenced by the aws-ofi-nccl configure step, the MCA
# parameter file, the mpirun wrapper and OPAL_PREFIX.
ARG OPEN_MPI_PATH=/opt/amazon/openmpi

# Build-time only: keeps apt/dpkg from prompting during RUN steps. Declared as
# ARG (not ENV) so the setting is not baked into the runtime environment of
# containers started from this image.
ARG DEBIAN_FRONTEND=noninteractive
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Upgrade the packages inherited from the base image, then remove the distro
# libibverbs/mlx5 userspace packages and any HPC-X / MPI / UCX trees shipped
# under /opt/hpcx and /usr/local. Presumably this avoids clashes with the
# libfabric/Open MPI stack installed under /opt/amazon later -- TODO confirm.
# NOTE(review): `apt-get upgrade` makes the result depend on the build date even
# though the base image is digest-pinned; kept because it was explicit here.
# The apt package lists are deleted in this same layer; every later apt step in
# this file runs its own `apt-get update`.
RUN apt-get update && apt-get upgrade -y && \
    apt-get remove -y --allow-change-held-packages \
        ibverbs-utils \
        libibverbs-dev \
        libibverbs1 \
        libmlx5-1 && \
    rm -rf /opt/hpcx/ompi /usr/local/mpi /usr/local/ucx && \
    ldconfig && \
    rm -rf /var/lib/apt/lists/*
|
|
|
|
|
# Toolchain and utilities used by the build steps in this file: compilers,
# autotools and cmake, git/curl for fetching sources, OpenSSH client and server,
# hwloc headers, plus gdb/vim/kmod. Packages are listed alphabetically; unused
# dependencies and the apt lists are removed in the same layer.
RUN apt-get update \
 && apt-get install -y \
      apt-utils \
      autoconf \
      automake \
      build-essential \
      cmake \
      curl \
      gcc \
      gdb \
      git \
      kmod \
      libhwloc-dev \
      libtool \
      openssh-client \
      openssh-server \
      vim \
 && apt-get autoremove -y \
 && rm -rf /var/lib/apt/lists/*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Make SSH usable without prompts, grouped by the file being edited:
#   /etc/ssh/ssh_config  - force "StrictHostKeyChecking no" and send known-hosts
#                          entries to /dev/null
#   /etc/ssh/sshd_config - turn the commented-out StrictModes setting into
#                          "StrictModes no"
#   /etc/pam.d/sshd      - downgrade pam_loginuid from required to optional
# Finally create /var/run/sshd.
RUN sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config \
 && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
 && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
 && sed -i 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd \
 && mkdir -p /var/run/sshd
|
|
|
|
|
# Recreate root's ~/.ssh from scratch with a passphrase-less RSA key pair that is
# also its own authorized key, and disable host-key checking for every host.
# NOTE(review): the private key is generated at build time and therefore stored
# in the image; every container from this image shares it -- confirm this is
# acceptable for where the image is distributed.
RUN ssh_dir=/root/.ssh \
 && rm -rf "${ssh_dir}" \
 && mkdir -p "${ssh_dir}" \
 && ssh-keygen -q -t rsa -N '' -f "${ssh_dir}/id_rsa" \
 && cp "${ssh_dir}/id_rsa.pub" "${ssh_dir}/authorized_keys" \
 && printf "Host *\n StrictHostKeyChecking no\n" >> "${ssh_dir}/config"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Put the Open MPI and EFA tools first on PATH, keep the three system
# directories that were listed explicitly before, and then append whatever PATH
# the base image provided. The previous hard-coded value dropped every inherited
# entry (typically the sbin directories and any CUDA/interpreter directories the
# base image had added).
ENV PATH=/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/bin:/usr/bin:/bin:${PATH}

# Same idea for the dynamic loader: prepend the CUPTI / Open MPI / EFA /
# ofi-nccl library directories and keep any inherited LD_LIBRARY_PATH. The
# ${VAR:+...} form adds the ":" separator only when the inherited value is
# non-empty, so no empty path element is introduced.
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build GDRCopy at the pinned tag and install it under /opt/gdrcopy.
# The checkout is deleted in the same layer, so only the tagged commit is
# fetched (--depth 1) instead of the full repository history.
RUN git clone --depth 1 -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy && \
    cd /tmp/gdrcopy && \
    make prefix=/opt/gdrcopy install && \
    rm -rf /tmp/gdrcopy
|
|
|
|
|
# Expose the GDRCopy install (and the CUDA compat directory) to the loader, the
# link-time search path, the header search path and PATH. Each variable only
# refers to its own previous value, so a single ENV instruction is equivalent to
# setting them one by one.
ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:/usr/local/cuda/compat:${LD_LIBRARY_PATH} \
    LIBRARY_PATH=/opt/gdrcopy/lib:/usr/local/cuda/compat \
    CPATH=/opt/gdrcopy/include \
    PATH=/opt/gdrcopy/bin:${PATH}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Install add-apt-repository, enable the "universe" and "multiverse" components
# and refresh the package index. The index is intentionally left in place at the
# end of this layer, exactly as before.
RUN apt-get update \
 && apt-get install -y software-properties-common \
 && add-apt-repository universe \
 && add-apt-repository multiverse \
 && apt-get update
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Download and run the AWS EFA installer at the pinned version.
#  * Work in /tmp and delete by absolute path. Previously the tarball was
#    downloaded into whatever working directory the base image left behind while
#    the cleanup ran in "/", so the archive and extracted tree stayed in the
#    image unless that directory happened to be "/".
#  * `curl -f` makes an HTTP error fail the build at the download step instead
#    of saving an error page for tar to choke on.
#  * `apt-get update` runs in this layer so the installer does not depend on
#    package lists cached by an earlier (possibly stale) layer.
# Installer flags are unchanged from the original invocation.
RUN apt-get update && \
    cd /tmp && \
    curl -fsSLO https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    cd aws-efa-installer && \
    ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \
    cd / && \
    rm -rf /tmp/aws-efa-installer*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build the aws-ofi-nccl plugin from the pinned release tarball against the
# Open MPI, libfabric (EFA) and CUDA installs, and install it to
# /opt/aws-ofi-nccl.
#  * Download/extract in /tmp and clean up by absolute path. The old
#    `cd / && rm -rf aws-ofi-nccl*` only removed the sources when the inherited
#    working directory was "/".
#  * `curl -f` fails the build on an HTTP error instead of saving an error page.
# NOTE(review): the install prefix is /opt/aws-ofi-nccl, but the
# LD_LIBRARY_PATH set in this file lists /opt/amazon/ofi-nccl/lib -- confirm
# which plugin build is meant to be picked up at runtime.
RUN cd /tmp && \
    curl -fsSLO https://github.com/aws/aws-ofi-nccl/releases/download/v${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}.tar.gz && \
    tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}.tar.gz && \
    cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} && \
    ./configure \
        --prefix=/opt/aws-ofi-nccl \
        --with-mpi=${OPEN_MPI_PATH} \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --enable-platform-aws && \
    make -j$(nproc) && make install && \
    cd / && \
    rm -rf /tmp/aws-ofi-nccl*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Build NCCL from source at the pinned tag, generating code for sm_80, sm_86 and
# sm_89 only. The tree stays in /opt/nccl because the loader path below points
# into its build directory; cloning with --depth 1 keeps the repository history
# out of the image.
RUN git clone --depth 1 -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl && \
    cd /opt/nccl && \
    make -j$(nproc) src.build CUDA_HOME=/usr/local/cuda \
        NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 \
                      -gencode=arch=compute_86,code=sm_86 \
                      -gencode=arch=compute_89,code=sm_89"

# Let the dynamic loader find the freshly built NCCL ahead of other entries.
ENV LD_LIBRARY_PATH=/opt/nccl/build/lib:${LD_LIBRARY_PATH}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Append two defaults to Open MPI's system-wide MCA parameter file: no hwloc
# binding policy, and slot-based process mapping.
RUN printf '%s\n' \
      "hwloc_base_binding_policy = none" \
      "rmaps_base_mapping_policy = slot" \
      >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf
|
|
|
|
|
# Replace mpirun with a small bash wrapper that forwards all arguments to the
# original binary (renamed mpirun.real). The wrapper uses `exec` so the real
# mpirun replaces the shell: signals sent to "mpirun" reach it directly and no
# extra bash process lingers for the lifetime of the job.
RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real && \
    printf '#!/bin/bash\nexec %s "$@"\n' "${OPEN_MPI_PATH}/bin/mpirun.real" > ${OPEN_MPI_PATH}/bin/mpirun && \
    chmod +x ${OPEN_MPI_PATH}/bin/mpirun
|
|
|
|
|
# Runtime defaults for Open MPI, PMIx and NCCL:
#   OPAL_PREFIX                  - Open MPI install prefix
#   OMPI_MCA_btl / _if_exclude   - btl list "tcp,self", with lo, docker0 and
#                                  veth named in the TCP interface exclude list
#   OMPI_MCA_pml                 - pml selection string "^cm,ucx"
#   PMIX_MCA_gds                 - gds component "hash"
#   NCCL_SOCKET_IFNAME           - socket interface filter "^docker,lo,veth"
ENV OPAL_PREFIX=${OPEN_MPI_PATH} \
    OMPI_MCA_btl=tcp,self \
    OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth \
    OMPI_MCA_pml=^cm,ucx \
    PMIX_MCA_gds=hash \
    NCCL_SOCKET_IFNAME=^docker,lo,veth
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Install the locally supplied ofi-nccl-fix.deb from the build context.
# apt-get installs the file directly and resolves its dependencies, so the step
# fails if the package cannot be installed. The previous
# `dpkg -i ... || apt-get install -f -y` could exit 0 after apt "fixed" a failed
# install by removing the package again.
#   --allow-downgrades       keeps dpkg -i's ability to replace a newer version
#   --no-install-recommends  matches dpkg -i, which never pulled in Recommends
# NOTE(review): the .deb still occupies space in the COPY layer; the rm below
# only hides it from the final filesystem.
COPY ofi-nccl-fix.deb /tmp/ofi-nccl-fix.deb
RUN apt-get update && \
    apt-get install -y --no-install-recommends --allow-downgrades /tmp/ofi-nccl-fix.deb && \
    rm -rf /tmp/ofi-nccl-fix.deb /var/lib/apt/lists/*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Terminal utilities (htop, screen, tmux), the libavdevice/libavformat
# development packages and cuDNN 9 for CUDA 12. Listed alphabetically; the apt
# lists are removed in the same layer.
RUN apt-get update \
 && apt-get install -y \
      htop \
      libavdevice-dev \
      libavformat-dev \
      libcudnn9-cuda-12 \
      screen \
      tmux \
 && rm -rf /var/lib/apt/lists/*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# System Python 3 with pip and the venv module, needed to create the virtual
# environment below. The apt lists are removed in the same layer.
RUN apt-get update \
 && apt-get install -y python3 python3-pip python3-venv \
 && rm -rf /var/lib/apt/lists/*
|
|
|
|
|
# Create a virtual environment at /opt/venv and activate it for all later build
# steps and for containers by exporting VIRTUAL_ENV and putting its bin
# directory first on PATH.
RUN python3 -m venv /opt/venv
ENV VIRTUAL_ENV=/opt/venv \
    PATH=/opt/venv/bin:${PATH}
|
|
|
|
|
# Bring pip, setuptools and wheel up to date; `pip` resolves to the /opt/venv
# copy because that bin directory is first on PATH. --no-cache-dir keeps pip's
# download cache out of the layer, matching the other pip step in this file.
RUN pip install --no-cache-dir --upgrade pip setuptools wheel
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Install the pinned PyTorch stack into the active virtual environment.
# VIRTUAL_ENV and PATH are already set when /opt/venv is created earlier in this
# file; repeating them here only put /opt/venv/bin on PATH a second time, so the
# duplicate ENV instructions were dropped.
RUN pip install --no-cache-dir \
      torch==2.6.0 \
      torchvision==0.21.0 \
      torchaudio==2.6.0
|
|
|
|
|
# Settings for compiling CUDA extensions against the installed PyTorch:
#   MAX_JOBS / NVCC_THREADS - parallelism limits read by extension builds
#   CUDA_HOME               - CUDA toolkit location
#   TORCH_CUDA_ARCH_LIST    - target compute capabilities. PyTorch expects the
#                             dotted form ("8.0"), not the sm_XX number ("80").
#                             The three values are the same architectures NCCL
#                             is built for above (sm_80, sm_86, sm_89).
ENV MAX_JOBS=16 \
    NVCC_THREADS=8 \
    CUDA_HOME=/usr/local/cuda \
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|