Spaces:
Build error
Build error
Fix cudnn.h not found error by copying to Conda env include path and setting CUDA_HOME
Browse files- Dockerfile +26 -15
Dockerfile
CHANGED
|
@@ -20,27 +20,23 @@ RUN apt-get update -y && apt-get install -qqy \
|
|
| 20 |
libbz2-dev libreadline-dev libsqlite3-dev curl llvm \
|
| 21 |
libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev \
|
| 22 |
ffmpeg libsm6 libxext6 cmake libmagickwand-dev \
|
| 23 |
-
# Ensure git-lfs is installed and initialized
|
| 24 |
git-lfs \
|
| 25 |
&& rm -rf /var/lib/apt/lists/* \
|
| 26 |
-
&& git lfs install
|
| 27 |
|
| 28 |
# Install Miniconda
|
| 29 |
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
|
| 30 |
/bin/bash miniconda.sh -b -p $CONDA_DIR && \
|
| 31 |
rm miniconda.sh && \
|
| 32 |
-
# Add Conda to PATH for subsequent commands in this RUN layer
|
| 33 |
export PATH=$CONDA_DIR/bin:$PATH && \
|
| 34 |
conda clean --all --yes && \
|
| 35 |
conda config --set auto_activate_base false && \
|
| 36 |
conda config --add channels conda-forge
|
| 37 |
|
| 38 |
-
#
|
| 39 |
-
# This ensures 'conda' command is available for subsequent RUN commands.
|
| 40 |
ENV PATH=$CONDA_DIR/bin:$PATH
|
| 41 |
|
| 42 |
# Accept Conda Terms of Service for default channels.
|
| 43 |
-
# Now 'conda' command should be found.
|
| 44 |
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
| 45 |
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main && \
|
| 46 |
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
|
|
@@ -49,7 +45,6 @@ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
|
| 49 |
COPY . /app
|
| 50 |
|
| 51 |
# Create the Conda environment named 'cosmos-predict1' using the provided YAML file.
|
| 52 |
-
# Now 'conda' command should be found.
|
| 53 |
RUN conda env create -f cosmos-predict1.yaml
|
| 54 |
|
| 55 |
# Set the default Conda environment to be activated and update PATH (for subsequent layers and runtime)
|
|
@@ -65,15 +60,32 @@ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
|
| 65 |
torchaudio==2.3.1 \
|
| 66 |
--index-url https://download.pytorch.org/whl/cu121
|
| 67 |
|
| 68 |
-
#
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
-
# Install Transformer Engine by attempting to compile it
|
|
|
|
| 74 |
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
| 75 |
conda activate cosmos-predict1 && \
|
| 76 |
-
pip install --no-cache-dir --no-build-isolation "transformer-engine[pytorch]==1.12.0"
|
| 77 |
|
| 78 |
# Install Apex for inference.
|
| 79 |
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
|
@@ -89,7 +101,7 @@ RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
|
| 89 |
# Make the start.sh script executable.
|
| 90 |
RUN chmod +x /app/start.sh
|
| 91 |
|
| 92 |
-
#
|
| 93 |
RUN echo "Verifying Python and Conda installations..."
|
| 94 |
RUN python --version
|
| 95 |
RUN conda env list
|
|
@@ -104,7 +116,6 @@ else:
|
|
| 104 |
print('CUDA Device Name: N/A')
|
| 105 |
EOF
|
| 106 |
RUN [ $? -eq 0 ] || echo "PyTorch verification failed. Check dependencies in cosmos-predict1.yaml."
|
| 107 |
-
# --- End Verification Steps ---
|
| 108 |
|
| 109 |
# Set the default command to run when the container starts.
|
| 110 |
CMD ["/app/start.sh"]
|
|
|
|
| 20 |
libbz2-dev libreadline-dev libsqlite3-dev curl llvm \
|
| 21 |
libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev \
|
| 22 |
ffmpeg libsm6 libxext6 cmake libmagickwand-dev \
|
|
|
|
| 23 |
git-lfs \
|
| 24 |
&& rm -rf /var/lib/apt/lists/* \
|
| 25 |
+
&& git lfs install
|
| 26 |
|
| 27 |
# Install Miniconda
|
| 28 |
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh && \
|
| 29 |
/bin/bash miniconda.sh -b -p $CONDA_DIR && \
|
| 30 |
rm miniconda.sh && \
|
|
|
|
| 31 |
export PATH=$CONDA_DIR/bin:$PATH && \
|
| 32 |
conda clean --all --yes && \
|
| 33 |
conda config --set auto_activate_base false && \
|
| 34 |
conda config --add channels conda-forge
|
| 35 |
|
| 36 |
+
# Set the global PATH for Conda's base environment immediately after installation.
|
|
|
|
| 37 |
ENV PATH=$CONDA_DIR/bin:$PATH
|
| 38 |
|
| 39 |
# Accept Conda Terms of Service for default channels.
|
|
|
|
| 40 |
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
| 41 |
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main && \
|
| 42 |
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r
|
|
|
|
| 45 |
COPY . /app
|
| 46 |
|
| 47 |
# Create the Conda environment named 'cosmos-predict1' using the provided YAML file.
|
|
|
|
| 48 |
RUN conda env create -f cosmos-predict1.yaml
|
| 49 |
|
| 50 |
# Set the default Conda environment to be activated and update PATH (for subsequent layers and runtime)
|
|
|
|
| 60 |
torchaudio==2.3.1 \
|
| 61 |
--index-url https://download.pytorch.org/whl/cu121
|
| 62 |
|
| 63 |
+
# NEW: Ensure cudnn.h is available in the Conda environment's include path.
|
| 64 |
+
# The base image has cudnn.h at /usr/local/cuda/include.
|
| 65 |
+
# We explicitly copy it to the Conda environment's include directory if it's not already there.
|
| 66 |
+
# Make cudnn.h available inside the 'cosmos-predict1' Conda environment.
# `conda activate` sets $CONDA_PREFIX to the environment root; the header is
# copied from /usr/local/cuda/include only when the environment lacks it.
# If the copy is needed and the source header is missing, `cp` fails and the
# build stops here.
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
    conda activate cosmos-predict1 && \
    if [ ! -f "$CONDA_PREFIX/include/cudnn.h" ]; then \
        echo "cudnn.h not found in Conda environment, copying from /usr/local/cuda/include"; \
        mkdir -p "$CONDA_PREFIX/include" && \
        cp /usr/local/cuda/include/cudnn.h "$CONDA_PREFIX/include/"; \
    else \
        echo "cudnn.h already present in Conda environment"; \
    fi
|
| 75 |
+
|
| 76 |
+
# IMPORTANT: Symlink fix for Transformer Engine compilation (from INSTALL.md).
|
| 77 |
+
# These symlinks are for other NVIDIA headers that might be in Python site-packages.
|
| 78 |
+
# $CONDA_PREFIX is the current activated Conda environment.
|
| 79 |
+
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
| 80 |
+
conda activate cosmos-predict1 && \
|
| 81 |
+
ln -sf "$CONDA_PREFIX"/lib/python3.10/site-packages/nvidia/*/include/* "$CONDA_PREFIX"/include/ || true && \
|
| 82 |
+
ln -sf "$CONDA_PREFIX"/lib/python3.10/site-packages/nvidia/*/include/* "$CONDA_PREFIX"/include/python3.10 || true
|
| 83 |
|
| 84 |
+
# Install Transformer Engine by attempting to compile it.
|
| 85 |
+
# Set CUDA_HOME to the activated Conda environment prefix ($CONDA_PREFIX) for this pip invocation only.
|
| 86 |
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
| 87 |
conda activate cosmos-predict1 && \
|
| 88 |
+
CUDA_HOME=$CONDA_PREFIX pip install --no-cache-dir --no-build-isolation "transformer-engine[pytorch]==1.12.0"
|
| 89 |
|
| 90 |
# Install Apex for inference.
|
| 91 |
RUN . $CONDA_DIR/etc/profile.d/conda.sh && \
|
|
|
|
| 101 |
# Make the start.sh script executable.
|
| 102 |
RUN chmod +x /app/start.sh
|
| 103 |
|
| 104 |
+
# Verification Steps
|
| 105 |
RUN echo "Verifying Python and Conda installations..."
|
| 106 |
RUN python --version
|
| 107 |
RUN conda env list
|
|
|
|
| 116 |
print('CUDA Device Name: N/A')
|
| 117 |
EOF
|
| 118 |
# NOTE(review): every RUN executes in a fresh shell, so `$?` is 0 when this test runs and the
# echo below never fires; a RUN that exits non-zero earlier already aborts the build by itself.
RUN [ $? -eq 0 ] || echo "PyTorch verification failed. Check dependencies in cosmos-predict1.yaml."
|
|
|
|
| 119 |
|
| 120 |
# Set the default command to run when the container starts.
|
| 121 |
CMD ["/app/start.sh"]
|