# Spaces: Build error
# Base image: CUDA 12.4.1 development image that ships with cuDNN pre-installed.
FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04

# Suppress interactive apt/debconf prompts during the build only. Declared as a
# build ARG rather than ENV so the setting is visible to later RUN steps in this
# stage but is not baked into the runtime environment of the final image.
ARG DEBIAN_FRONTEND=noninteractive

# Installation prefix for Miniconda; referenced by later build steps.
ENV CONDA_DIR=/opt/conda

# All subsequent relative paths resolve against /app.
WORKDIR /app
# System packages (one per line, sorted for easier diffs). The apt package
# index is removed in the same layer, and git-lfs is initialised once the
# package is installed.
RUN apt-get update -y \
 && apt-get install -qqy \
      build-essential \
      cmake \
      curl \
      ffmpeg \
      git \
      git-lfs \
      libbz2-dev \
      libffi-dev \
      libgl1-mesa-glx \
      libglib2.0-0 \
      liblzma-dev \
      libmagickwand-dev \
      libncursesw5-dev \
      libreadline-dev \
      libsm6 \
      libsqlite3-dev \
      libssl-dev \
      libxext6 \
      libxml2-dev \
      libxmlsec1-dev \
      llvm \
      make \
      rsync \
      tk-dev \
      wget \
      xz-utils \
      zlib1g-dev \
 && rm -rf /var/lib/apt/lists/* \
 && git lfs install
# Miniconda: download the installer, run it unattended (-b) into $CONDA_DIR,
# delete the installer, then purge conda caches, disable auto-activation of the
# base environment, and add the conda-forge channel. PATH is exported only for
# the duration of this RUN step.
RUN wget --quiet -O miniconda.sh \
        https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
 && /bin/bash miniconda.sh -b -p "${CONDA_DIR}" \
 && rm miniconda.sh \
 && export PATH="${CONDA_DIR}/bin:${PATH}" \
 && conda clean --all --yes \
 && conda config --set auto_activate_base false \
 && conda config --add channels conda-forge
# Prepend Conda's bin directory to PATH for all later instructions and for the
# running container.
ENV PATH="${CONDA_DIR}/bin:${PATH}"
# Accept the Conda Terms of Service for the two default Anaconda channels
# (pkgs/main and pkgs/r). The build stops at the first channel that fails.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
 && for channel in main r; do \
        conda tos accept --override-channels \
            --channel "https://repo.anaconda.com/pkgs/${channel}" || exit 1; \
    done
# Copy only the environment specification first so that the (slow) environment
# creation layer stays cached when other project files change.
# NOTE(review): assumes cosmos-predict1.yaml does not reference other local
# files (e.g. a requirements file) — confirm before relying on this ordering.
COPY cosmos-predict1.yaml /app/cosmos-predict1.yaml

# Create the Conda environment named 'cosmos-predict1' from the YAML file and
# drop conda's package caches in the same layer so they do not persist in it.
RUN conda env create -f cosmos-predict1.yaml \
 && conda clean --all --yes

# Copy all local project files into the container's working directory (/app).
COPY . /app
# Make 'cosmos-predict1' the default environment: record its name and put its
# bin directory first on PATH for subsequent layers and at container runtime.
ENV CONDA_DEFAULT_ENV=cosmos-predict1 \
    PATH="${CONDA_DIR}/envs/cosmos-predict1/bin:${PATH}"
# Install pinned torch, torchvision and torchaudio into the activated
# environment, resolving packages from the PyTorch cu121 wheel index.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
 && conda activate cosmos-predict1 \
 && pip install --no-cache-dir \
        --index-url https://download.pytorch.org/whl/cu121 \
        torch==2.3.1 \
        torchvision==0.18.1 \
        torchaudio==2.3.1
# Locate cudnn.h under the environment's site-packages/nvidia tree and expose
# it as $CONDA_PREFIX/include/cudnn.h (symlink, falling back to a copy) so the
# Transformer Engine build can find it. The build aborts if no header is found.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
 && conda activate cosmos-predict1 \
 && CUDNN_HEADER_PATH=$(find "$CONDA_PREFIX/lib/python3.10/site-packages/nvidia/" -name "cudnn.h" | head -n 1) \
 && if [ ! -f "$CUDNN_HEADER_PATH" ]; then \
        echo "Error: cudnn.h not found in any expected location within Conda environment. This will likely cause compilation failures."; \
        exit 1; \
    else \
        echo "Found cudnn.h at: $CUDNN_HEADER_PATH"; \
        mkdir -p "$CONDA_PREFIX/include" && \
        ln -sf "$CUDNN_HEADER_PATH" "$CONDA_PREFIX/include/cudnn.h" || \
        cp "$CUDNN_HEADER_PATH" "$CONDA_PREFIX/include/cudnn.h"; \
    fi
# Symlink fix for Transformer Engine compilation (from INSTALL.md): link every
# header shipped under site-packages/nvidia/*/include into the environment's
# include directories.
# The two `ln` calls are deliberately best-effort, but each `|| true` is
# confined by braces to its own `ln`. Without the grouping, the shell's
# left-to-right AND-OR parsing would also swallow a failure of the preceding
# `conda activate`, letting the build continue with an empty $CONDA_PREFIX.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
 && conda activate cosmos-predict1 \
 && { ln -sf "$CONDA_PREFIX"/lib/python3.10/site-packages/nvidia/*/include/* "$CONDA_PREFIX"/include/ || true; } \
 && { ln -sf "$CONDA_PREFIX"/lib/python3.10/site-packages/nvidia/*/include/* "$CONDA_PREFIX"/include/python3.10 || true; }
# Build and install Transformer Engine 1.12.0 (PyTorch extra) without pip build
# isolation. CUDA_HOME is pointed at the active Conda environment prefix for
# this one command.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
 && conda activate cosmos-predict1 \
 && CUDA_HOME="$CONDA_PREFIX" pip install \
        --no-cache-dir \
        --no-build-isolation \
        "transformer-engine[pytorch]==1.12.0"
# Apex (for inference): clone NVIDIA/apex into /app/apex and install it from
# that checkout with the C++ and CUDA extensions enabled via build options.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
 && conda activate cosmos-predict1 \
 && git clone https://github.com/NVIDIA/apex /app/apex \
 && CUDA_HOME="$CONDA_PREFIX" pip install -v \
        --disable-pip-version-check \
        --no-cache-dir \
        --no-build-isolation \
        --config-settings "--build-option=--cpp_ext" \
        --config-settings "--build-option=--cuda_ext" \
        /app/apex
# MoGe (for inference): install straight from the upstream Git repository into
# the activated environment.
RUN . "${CONDA_DIR}/etc/profile.d/conda.sh" \
 && conda activate cosmos-predict1 \
 && pip install --no-cache-dir \
        git+https://github.com/microsoft/MoGe.git
# Mark the container start script as executable (exec form, no shell needed).
RUN ["chmod", "+x", "/app/start.sh"]
# Verification: print the Python version on PATH and list Conda environments.
RUN echo "Verifying Python and Conda installations..." \
 && python --version \
 && conda env list

# Verification: import PyTorch inside the cosmos-predict1 environment and
# report its version and CUDA visibility.
# The failure message is attached to this same RUN step: a separate
# `RUN [ $? -eq 0 ] || ...` starts a fresh shell whose `$?` is always 0, so it
# could never fire. On failure the message is printed and the build stops.
RUN echo "Verifying PyTorch and CUDA availability..."
RUN conda run -n cosmos-predict1 python <<EOF || { echo "PyTorch verification failed. Check dependencies in cosmos-predict1.yaml."; exit 1; }
import torch
print('PyTorch Version: ' + torch.__version__)
print('CUDA Available: ' + str(torch.cuda.is_available()))
if torch.cuda.is_available():
    print('CUDA Device Name: ' + torch.cuda.get_device_name(0))
else:
    print('CUDA Device Name: N/A')
EOF
# Default process for the container: run the project's start script directly
# (exec form, so no intermediate shell is involved).
CMD [ "/app/start.sh" ]