# Source: sooktam2 / optimized.Dockerfile (Hugging Face Hub)
# Uploaded by: vanshp123
# Commit message: Upload folder using huggingface_hub
# Commit: 2e766c2 (verified)
# Base image, pinned by digest so every rebuild starts from the same layers.
FROM ghcr.io/swivid/f5-tts@sha256:0b80af1e176550ed5216046c590efdb986e0ec00686ab28127ae40eccf027ce4
# Versions of the components that are downloaded and built further down.
# Override with --build-arg to try a different release.
ARG GDRCOPY_VERSION=v2.5.1
ARG EFA_INSTALLER_VERSION=1.46.0
ARG AWS_OFI_NCCL_VERSION=1.17.2
ARG NCCL_VERSION=v2.27.7-1
# Open MPI prefix used by the OFI NCCL configure step and the Open MPI tweaks.
ARG OPEN_MPI_PATH=/opt/amazon/openmpi
# Build-time only: ARG values are visible to every RUN step in this stage, so
# apt/dpkg stay non-interactive during the build without the setting being
# persisted into the environment of containers started from the image.
ARG DEBIAN_FRONTEND=noninteractive
# ------------------------------------------------------------------
# System cleanup & base deps
# ------------------------------------------------------------------
# Upgrade the base packages, then remove the distro verbs/RDMA packages and the
# bundled HPC-X / MPI / UCX trees, and refresh the linker cache.
# NOTE(review): presumably these are removed so they cannot shadow the
# EFA-provided libraries installed later; confirm.
# The apt package lists are deleted in the same layer so they do not add to the
# image size; every later apt step runs its own `apt-get update`.
RUN apt-get update \
 && apt-get upgrade -y \
 && apt-get remove -y --allow-change-held-packages \
        ibverbs-utils \
        libibverbs-dev \
        libibverbs1 \
        libmlx5-1 \
 && rm -rf /opt/hpcx/ompi /usr/local/mpi /usr/local/ucx \
 && ldconfig \
 && rm -rf /var/lib/apt/lists/*
# Toolchain, debugging tools and SSH packages needed by the build steps below.
# Packages are listed alphabetically to keep diffs small; the package lists are
# removed in the same layer.
RUN apt-get update \
 && apt-get install -y \
        apt-utils \
        autoconf \
        automake \
        build-essential \
        cmake \
        curl \
        gcc \
        gdb \
        git \
        kmod \
        libhwloc-dev \
        libtool \
        openssh-client \
        openssh-server \
        vim \
 && apt-get autoremove -y \
 && rm -rf /var/lib/apt/lists/*
# ------------------------------------------------------------------
# SSH setup
# ------------------------------------------------------------------
# Relax SSH for container-to-container use:
#   - create the sshd runtime directory,
#   - force "StrictHostKeyChecking no" in the client config and send known
#     hosts to /dev/null,
#   - turn off StrictModes in the daemon config,
#   - make pam_loginuid optional for sshd sessions.
RUN mkdir -p /var/run/sshd \
 && sed -i -e 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config \
 && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
 && sed -i -e 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config \
 && sed -i -e 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd
# Start from an empty /root/.ssh, generate a passphrase-less RSA key pair,
# authorize its public key for root, and disable host key checking for root.
# NOTE(review): the private key is stored in an image layer, so every container
# started from this image (and anyone who can pull it) shares the same key.
# Presumably this is intended for password-less SSH between nodes; confirm it
# is acceptable.
RUN rm -rf /root/.ssh \
 && mkdir -p /root/.ssh \
 && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
 && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
 && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
# ------------------------------------------------------------------
# Paths
# ------------------------------------------------------------------
# Put the Open MPI and EFA tools first on PATH. PATH is set to a fixed value
# rather than extended, so the system sbin directories are listed explicitly:
# dpkg refuses to run when it cannot find ldconfig and start-stop-daemon on
# PATH, which would break the apt-get / dpkg steps later in this file.
ENV PATH=/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
# Library search path for CUPTI, Open MPI, EFA and the OFI NCCL plugin.
# NOTE(review): this lists /opt/amazon/ofi-nccl/lib, while the plugin built
# from source below is configured with --prefix=/opt/aws-ofi-nccl; confirm
# which plugin location is meant to be loaded at runtime.
ENV LD_LIBRARY_PATH=/usr/local/cuda/extras/CUPTI/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/opt/amazon/ofi-nccl/lib
# ------------------------------------------------------------------
# GDRCopy
# ------------------------------------------------------------------
# Fetch the requested GDRCopy tag, install it under /opt/gdrcopy and drop the
# source tree in the same layer.
RUN git clone --branch ${GDRCOPY_VERSION} https://github.com/NVIDIA/gdrcopy.git /tmp/gdrcopy \
 && cd /tmp/gdrcopy \
 && make prefix=/opt/gdrcopy install \
 && rm -rf /tmp/gdrcopy
# Expose the GDRCopy libraries, headers and binaries, plus the CUDA compat
# directory, to the runtime linker, the link step and the compiler.
ENV LD_LIBRARY_PATH=/opt/gdrcopy/lib:/usr/local/cuda/compat:${LD_LIBRARY_PATH} \
    LIBRARY_PATH=/opt/gdrcopy/lib:/usr/local/cuda/compat \
    CPATH=/opt/gdrcopy/include \
    PATH=/opt/gdrcopy/bin:${PATH}
# ------------------------------------------------------------------
# Enable required Ubuntu repositories for EFA
# ------------------------------------------------------------------
# Enable the universe and multiverse components. `-y` makes add-apt-repository
# explicitly non-interactive instead of relying on it not prompting when the
# build has no terminal.
# The final `apt-get update` deliberately leaves fresh package lists in place.
# NOTE(review): presumably the EFA installer step that follows needs them to
# resolve dependencies; confirm.
RUN apt-get update && \
    apt-get install -y software-properties-common && \
    add-apt-repository -y universe && \
    add-apt-repository -y multiverse && \
    apt-get update
# ------------------------------------------------------------------
# EFA installer
# ------------------------------------------------------------------
# Download, unpack and run the AWS EFA installer, then remove the tarball and
# the unpacked tree in the same layer.
# Everything happens in /tmp: the download used to land in whatever working
# directory the base image sets while the cleanup ran in /, so the artifacts
# were only removed when that directory happened to be /.
# curl -f turns an HTTP error into a failed build step instead of saving the
# error page as the tarball.
# NOTE(review): --no-verify presumably skips the installer's post-install
# checks; confirm that is intended.
RUN cd /tmp && \
    curl -fsSLO https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
    cd aws-efa-installer && \
    ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify && \
    cd /tmp && rm -rf aws-efa-installer*
# ------------------------------------------------------------------
# AWS OFI NCCL
# ------------------------------------------------------------------
# Build the AWS OFI NCCL plugin from its release tarball against the Open MPI
# prefix, the libfabric under /opt/amazon/efa and CUDA, and install it under
# /opt/aws-ofi-nccl.
# The download, build and cleanup all happen in /tmp: the tarball used to be
# fetched into the inherited working directory while the cleanup ran in /, so
# the sources were only removed when that directory happened to be /.
# curl -f makes an HTTP error fail the step immediately.
RUN cd /tmp && \
    curl -fsSLO https://github.com/aws/aws-ofi-nccl/releases/download/v${AWS_OFI_NCCL_VERSION}/aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}.tar.gz && \
    tar -xf aws-ofi-nccl-${AWS_OFI_NCCL_VERSION}.tar.gz && \
    cd aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} && \
    ./configure \
        --prefix=/opt/aws-ofi-nccl \
        --with-mpi=${OPEN_MPI_PATH} \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --enable-platform-aws && \
    make -j$(nproc) && make install && \
    cd /tmp && rm -rf aws-ofi-nccl*
# ------------------------------------------------------------------
# NCCL build
# ------------------------------------------------------------------
# Build NCCL from source for compute capabilities 8.0, 8.6 and 8.9 only.
# The checkout stays in /opt/nccl because the libraries are used straight from
# its build/lib directory (there is no install step).
RUN git clone --branch ${NCCL_VERSION} https://github.com/NVIDIA/nccl.git /opt/nccl \
 && cd /opt/nccl \
 && make -j$(nproc) src.build CUDA_HOME=/usr/local/cuda \
        NVCC_GENCODE="-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86 -gencode=arch=compute_89,code=sm_89"
# Make the freshly built NCCL the first match on the library search path.
ENV LD_LIBRARY_PATH=/opt/nccl/build/lib:${LD_LIBRARY_PATH}
# ------------------------------------------------------------------
# OpenMPI fixes
# ------------------------------------------------------------------
# Append two default MCA parameters to the Open MPI configuration file:
# no hwloc process binding, and slot-based process mapping.
RUN printf '%s\n' \
        "hwloc_base_binding_policy = none" \
        "rmaps_base_mapping_policy = slot" \
    >> ${OPEN_MPI_PATH}/etc/openmpi-mca-params.conf
# Replace mpirun with a small bash wrapper that forwards all arguments to the
# original binary, which is kept as mpirun.real.
# The wrapper uses `exec` so the real mpirun replaces the shell: signals sent
# to the launcher reach mpirun directly and no idle bash process is left
# sitting between the caller and mpirun.
RUN mv ${OPEN_MPI_PATH}/bin/mpirun ${OPEN_MPI_PATH}/bin/mpirun.real && \
    printf '#!/bin/bash\nexec %s "$@"\n' "${OPEN_MPI_PATH}/bin/mpirun.real" > ${OPEN_MPI_PATH}/bin/mpirun && \
    chmod +x ${OPEN_MPI_PATH}/bin/mpirun
# Runtime defaults for Open MPI, PMIx and NCCL.
# The Open MPI settings restrict the PML and BTL component selection and keep
# MPI TCP traffic off the loopback, docker0 and veth interfaces.
# NOTE(review): a leading "^" is taken to mean "exclude the listed items" in
# both the Open MPI and NCCL values; confirm against their documentation.
ENV OPAL_PREFIX=${OPEN_MPI_PATH} \
    OMPI_MCA_pml=^cm,ucx \
    OMPI_MCA_btl=tcp,self \
    OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth \
    PMIX_MCA_gds=hash \
    NCCL_SOCKET_IFNAME=^docker,lo,veth
# ------------------------------------------------------------------
# OFI NCCL hotfix (NON-INTERACTIVE)
# ------------------------------------------------------------------
# Install the hotfix package shipped in the build context. If dpkg reports
# unmet dependencies, `apt-get install -f` is used to resolve them. The braces
# only make the existing left-to-right evaluation of `||` and `&&` explicit:
# the .deb is removed once either install path has succeeded.
# Note: the file still exists in the COPY layer, so removing it here does not
# reduce the image size.
COPY ofi-nccl-fix.deb /tmp/ofi-nccl-fix.deb
RUN { dpkg -i /tmp/ofi-nccl-fix.deb || apt-get install -f -y; } \
 && rm -f /tmp/ofi-nccl-fix.deb
# ------------------------------------------------------------------
# Runtime tools
# ------------------------------------------------------------------
# Interactive tools (htop, screen, tmux), the FFmpeg development libraries and
# cuDNN 9 for CUDA 12. Packages are sorted alphabetically; the package lists
# are removed in the same layer.
RUN apt-get update \
 && apt-get install -y \
        htop \
        libavdevice-dev \
        libavformat-dev \
        libcudnn9-cuda-12 \
        screen \
        tmux \
 && rm -rf /var/lib/apt/lists/*
# ------------------------------------------------------------------
# Python + pip + venv
# ------------------------------------------------------------------
# Install the system Python with pip and venv support, then create a virtual
# environment under /opt/venv.
RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip \
    python3-venv \
    && rm -rf /var/lib/apt/lists/*
RUN python3 -m venv /opt/venv
# "Activate" the venv for every later build step and at runtime by putting its
# bin directory first on PATH.
ENV VIRTUAL_ENV=/opt/venv
ENV PATH=/opt/venv/bin:${PATH}
# Bring the packaging tools up to date. --no-cache-dir keeps pip's download
# cache out of the layer, matching the other pip install step in this file.
RUN pip install --no-cache-dir --upgrade pip setuptools wheel
# ------------------------------------------------------------------
# Python
# ------------------------------------------------------------------
# Install the pinned PyTorch stack into the virtual environment.
# VIRTUAL_ENV and PATH are already set right after the venv is created, so they
# are not repeated here; setting PATH again only prepended /opt/venv/bin a
# second time.
RUN pip install --no-cache-dir \
    torch==2.6.0 \
    torchvision==0.21.0 \
    torchaudio==2.6.0
# Settings for compiling CUDA extensions.
# NOTE(review): MAX_JOBS and NVCC_THREADS are presumably read by the extension
# builds that are currently commented out below; confirm.
ENV MAX_JOBS=16
ENV NVCC_THREADS=8
ENV CUDA_HOME=/usr/local/cuda
# PyTorch expects dotted compute capabilities here ("8.0", not "80"); the
# undotted form is rejected as an unknown CUDA arch when an extension is built.
# The values match the sm_80 / sm_86 / sm_89 targets used for the NCCL build.
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9"
# RUN pip install --no-cache-dir \
# flash_attn==2.8.3 \
# transformer_engine==2.8.0 \
# deepspeed==0.18.0
# RUN pip install --no-cache-dir \
# tensorboard==2.20.0 \
# nvitop==1.5.3 \
# psycopg2-binary==2.9.11 \
# s3prl==0.4.18 \
# indic_unified_parser==1.0.6 \
# indo-arabic-transliteration==0.1.5 \
# indic-numtowords==1.1.0 \
# jupyter==1.1.1 \
# click==8.0.1 \
# jiwer==3.1.0 \
# transformers==4.41.2 \
# huggingface-hub==0.24.7 \
# speechbrain==1.0.3 \
# indic-nlp-library==0.92 \
# git+https://github.com/libindic/indic-trans
# ------------------------------------------------------------------
# Project
# ------------------------------------------------------------------
# WORKDIR /workspace
# COPY . /workspace
# RUN pip install -e . --no-cache-dir && pip install -e .[eval]
# # ------------------------------------------------------------------
# # Accelerate config
# # ------------------------------------------------------------------
# RUN mkdir -p /root/.cache/huggingface/accelerate && \
# cat > /root/.cache/huggingface/accelerate/default_config.yaml <<'EOF'
# compute_environment: LOCAL_MACHINE
# distributed_type: MULTI_GPU
# gpu_ids: 2,3,6,7
# num_processes: 4
# mixed_precision: bf16
# EOF