# Adds the AWS EFA / GDRCopy / aws-ofi-nccl / NCCL / Open MPI stack, an SSH
# setup, and a Python virtualenv with PyTorch on top of a digest-pinned
# F5-TTS base image.
FROM ghcr.io/swivid/f5-tts@sha256:0b80af1e176550ed5216046c590efdb986e0ec00686ab28127ae40eccf027ce4

# Component versions; override with --build-arg.
ARG GDRCOPY_VERSION=v2.5.1
ARG EFA_INSTALLER_VERSION=1.46.0
ARG AWS_OFI_NCCL_VERSION=1.17.2
ARG NCCL_VERSION=v2.27.7-1
ARG OPEN_MPI_PATH=/opt/amazon/openmpi

# Build-time only: keeps apt non-interactive during RUN steps without baking
# the setting into the runtime environment of the final image.
ARG DEBIAN_FRONTEND=noninteractive

# ------------------------------------------------------------------
# System cleanup & base deps
# ------------------------------------------------------------------
# Remove the distro ibverbs packages and any bundled MPI/UCX trees before the
# EFA installer runs later in this file.
RUN apt-get update && apt-get upgrade -y && \
    apt-get remove -y --allow-change-held-packages \
        ibverbs-utils \
        libibverbs-dev \
        libibverbs1 \
        libmlx5-1 && \
    rm -rf /opt/hpcx/ompi /usr/local/mpi /usr/local/ucx && \
    ldconfig && \
    rm -rf /var/lib/apt/lists/*

RUN apt-get update && apt-get install -y \
        apt-utils \
        autoconf \
        automake \
        build-essential \
        cmake \
        curl \
        gcc \
        gdb \
        git \
        kmod \
        libhwloc-dev \
        libtool \
        openssh-client \
        openssh-server \
        vim \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/*

# ------------------------------------------------------------------
# SSH setup
# ------------------------------------------------------------------
# Disable host-key checking and strict modes, and make pam_loginuid optional.
RUN mkdir -p /var/run/sshd && \
    sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \
    sed -i 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd

# NOTE(review): this generates a root key pair at build time, so every
# container started from this image shares the same private key. Presumably
# intentional for password-less SSH between nodes -- confirm.
RUN rm -rf /root/.ssh && \
    mkdir -p /root/.ssh && \
    ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \
    cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \
    printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config

# ------------------------------------------------------------------
# Paths
# ------------------------------------------------------------------
# Prepend to the inherited PATH instead of replacing it: dpkg refuses to run
# when ldconfig / start-stop-daemon (in the sbin directories) are not on PATH,
# which would break every later apt-get install / dpkg -i step.
ENV PATH=/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/bin:/usr/bin:/bin:${PATH}

# Keep whatever library path the base image exported; the ":+" form avoids a
# trailing ":" (which would mean "current directory") when it is unset.
# NOTE(review): aws-ofi-nccl below is installed with --prefix=/opt/aws-ofi-nccl,
# but only /opt/amazon/ofi-nccl/lib is listed here -- confirm which plugin
# build is meant to be picked up at runtime.
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

# ------------------------------------------------------------------
# GDRCopy
# ------------------------------------------------------------------
# Shallow clone: only the tagged revision is needed for the build.
RUN git clone --depth 1 -b ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy && \
    cd /tmp/gdrcopy && \
    make prefix=/opt/gdrcopy install && \
    rm -rf /tmp/gdrcopy

ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:/usr/local/cuda/compat:${LD_LIBRARY_PATH}
ENV LIBRARY_PATH=/opt/gdrcopy/lib:/usr/local/cuda/compat
ENV CPATH=/opt/gdrcopy/include
ENV PATH=/opt/gdrcopy/bin:${PATH}

# ------------------------------------------------------------------
# Enable required Ubuntu repositories for EFA + EFA installer
# ------------------------------------------------------------------
# Done in a single layer so the installer always sees a fresh package index
# (an "apt-get update" cached in its own layer goes stale). The tarball is
# downloaded to and removed from /tmp explicitly, so cleanup does not depend
# on the WORKDIR inherited from the base image.
RUN apt-get update && \
    apt-get install -y software-properties-common && \
    add-apt-repository -y universe && \
    add-apt-repository -y multiverse && \
    apt-get update && \
    cd /tmp && \
    curl -fsSLO https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    cd aws-efa-installer && \
    ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \
    cd / && \
    rm -rf /tmp/aws-efa-installer* && \
    rm -rf /var/lib/apt/lists/*

# ------------------------------------------------------------------
# AWS OFI NCCL
# ------------------------------------------------------------------
# Built from the release tarball against the EFA libfabric and Open MPI
# installed above; sources are fetched into and removed from /tmp.
RUN cd /tmp && \
    curl -fsSLO https://github.com/aws/aws-ofi-nccl/releases/download/v${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}.tar.gz && \
    tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}.tar.gz && \
    cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} && \
    ./configure \
        --prefix=/opt/aws-ofi-nccl \
        --with-mpi=${OPEN_MPI_PATH} \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --enable-platform-aws && \
    make -j$(nproc) && make install && \
    cd / && \
    rm -rf /tmp/aws-ofi-nccl*

# ------------------------------------------------------------------
# NCCL build
# ------------------------------------------------------------------
# The tree stays in /opt/nccl because the built libraries under build/lib are
# put on LD_LIBRARY_PATH below. Code is generated for sm_80, sm_86 and sm_89.
RUN git clone --depth 1 -b ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl && \
    cd /opt/nccl && \
    make -j$(nproc) src.build CUDA_HOME=/usr/local/cuda \
        NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89"

ENV LD_LIBRARY_PATH=/opt/nccl/build/lib:${LD_LIBRARY_PATH}

# ------------------------------------------------------------------
# OpenMPI fixes
# ------------------------------------------------------------------
RUN echo "hwloc_base_binding_policy = none" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf && \
    echo "rmaps_base_mapping_policy = slot" >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf

# Replace mpirun with a thin wrapper around the real binary. "exec" makes the
# real mpirun take over the wrapper's process so signals reach it directly.
RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real && \
    printf '#!/bin/bash\nexec %s "$@"\n' "${OPEN_MPI_PATH}/bin/mpirun.real" > ${OPEN_MPI_PATH}/bin/mpirun && \
    chmod +x ${OPEN_MPI_PATH}/bin/mpirun

ENV OMPI_MCA_pml=^cm,ucx \
    OMPI_MCA_btl=tcp,self \
    OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth \
    OPAL_PREFIX=${OPEN_MPI_PATH} \
    NCCL_SOCKET_IFNAME=^docker,lo,veth \
    PMIX_MCA_gds=hash

# ------------------------------------------------------------------
# OFI NCCL hotfix (NON-INTERACTIVE)
# ------------------------------------------------------------------
COPY ofi-nccl-fix.deb /tmp/ofi-nccl-fix.deb

# If dpkg reports unmet dependencies, let apt resolve them. The braces make
# the "(dpkg || apt-get -f) && cleanup" grouping explicit; the package index
# is refreshed first because earlier layers remove /var/lib/apt/lists.
RUN apt-get update && \
    { dpkg -i /tmp/ofi-nccl-fix.deb || apt-get install -f -y; } && \
    rm -f /tmp/ofi-nccl-fix.deb && \
    rm -rf /var/lib/apt/lists/*

# ------------------------------------------------------------------
# Runtime tools
# ------------------------------------------------------------------
RUN apt-get update && apt-get install -y \
        tmux \
        screen \
        htop \
        libavformat-dev \
        libavdevice-dev \
        libcudnn9-cuda-12 && \
    rm -rf /var/lib/apt/lists/*

# ------------------------------------------------------------------
# Python + pip + venv
# ------------------------------------------------------------------
RUN apt-get update && apt-get install -y \
        python3 \
        python3-pip \
        python3-venv \
    && rm -rf /var/lib/apt/lists/*

RUN python3 -m venv /opt/venv

# Activate the virtualenv for all later build steps and at runtime.
ENV VIRTUAL_ENV=/opt/venv
ENV PATH=/opt/venv/bin:${PATH}

RUN pip install --no-cache-dir --upgrade pip setuptools wheel

# ------------------------------------------------------------------
# Python
# ------------------------------------------------------------------
RUN pip install --no-cache-dir \
        torch==2.6.0 \
        torchvision==0.21.0 \
        torchaudio==2.6.0

# Build settings for the (currently commented-out) source-built extensions.
ENV MAX_JOBS=16
ENV NVCC_THREADS=8
ENV CUDA_HOME=/usr/local/cuda
ENV TORCH_CUDA_ARCH_LIST="80;86;89"

# RUN pip install --no-cache-dir \
#     flash_attn==2.8.3 \
#     transformer_engine==2.8.0 \
#     deepspeed==0.18.0

# RUN pip install --no-cache-dir \
#     tensorboard==2.20.0 \
#     nvitop==1.5.3 \
#     psycopg2-binary==2.9.11 \
#     s3prl==0.4.18 \
#     indic_unified_parser==1.0.6 \
#     indo-arabic-transliteration==0.1.5 \
#     indic-numtowords==1.1.0 \
#     jupyter==1.1.1 \
#     click==8.0.1 \
#     jiwer==3.1.0 \
#     transformers==4.41.2 \
#     huggingface-hub==0.24.7 \
#     speechbrain==1.0.3 \
#     indic-nlp-library==0.92 \
#     git+https://github.com/libindic/indic-trans

# ------------------------------------------------------------------
# Project
# ------------------------------------------------------------------
# WORKDIR /workspace
# COPY . /workspace

# RUN pip install -e . --no-cache-dir && pip install -e .[eval]

# # ------------------------------------------------------------------
# # Accelerate config
# # ------------------------------------------------------------------
# RUN mkdir -p /root/.cache/huggingface/accelerate && \
#     cat > /root/.cache/huggingface/accelerate/default_config.yaml <<'EOF'
# compute_environment: LOCAL_MACHINE
# distributed_type: MULTI_GPU
# gpu_ids: 2,3,6,7
# num_processes: 4
# mixed_precision: bf16
# EOF