Spaces:

nalin0503
/

Metamorph

Sleeping

App Files Files Community

nalin0503 committed on Mar 19, 2025

Commit

b5e1a79

1 Parent(s): 0ce9aad

try dockerfile, last

Browse files

Files changed (1) hide show

Dockerfile +35 -55

Dockerfile CHANGED Viewed

@@ -1,4 +1,5 @@
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
 # Set environment variables
 ENV DEBIAN_FRONTEND=noninteractive
@@ -8,54 +9,51 @@ ENV TF_FORCE_GPU_ALLOW_GROWTH=true
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    git \
-    wget \
-    curl \
-    ca-certificates \
-    python3 \
-    python3-pip \
-    python3-dev \
-    ffmpeg \
-    libsm6 \
-    libxext6 \
-    libgl1-mesa-glx \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
 # Set working directory
 WORKDIR /app
-# Copy requirements.txt
 COPY requirements.txt /app/
-# Install Python dependencies with specific compatible versions
 RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel
-# Install TensorFlow with GPU support (compatible with CUDA 11.8)
-RUN pip3 install --no-cache-dir tensorflow==2.12.0
-# Install other dependencies but skip tensorflow (already installed)
 RUN pip3 install --no-cache-dir --no-deps -r requirements.txt
 RUN pip3 install --no-cache-dir tensorflow-hub==0.14.0
 RUN pip3 install --no-cache-dir opencv-python-headless opencv-contrib-python-headless
-# Copy application code
 COPY . /app/
-# Create a robust CPU fallback implementation
 RUN echo 'import tensorflow as tf\n\
 import os\n\
-\n\
-# Set TensorFlow logging level\n\
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"\n\
-\n\
-# Function to setup GPU with memory growth or fallback to CPU\n\
-def setup_tensorflow():\n\
     try:\n\
-        # List physical devices\n\
         physical_devices = tf.config.list_physical_devices("GPU")\n\
-        if len(physical_devices) > 0:\n\
             print(f"Found {len(physical_devices)} GPU(s)")\n\
             for device in physical_devices:\n\
-                # Allow memory growth to avoid allocating all GPU memory at once\n\
                 tf.config.experimental.set_memory_growth(device, True)\n\
                 print(f"Enabled memory growth for {device}")\n\
         else:\n\
@@ -63,56 +61,38 @@ def setup_tensorflow():\n\
     except Exception as e:\n\
         print(f"Error setting up TensorFlow: {e}")\n\
         print("Disabling GPU and falling back to CPU")\n\
-        # Force CPU usage if there was an error with GPU setup\n\
         os.environ["CUDA_VISIBLE_DEVICES"] = "-1"\n\
 \n\
-# Call the setup function\n\
-setup_tensorflow()\n\
-' > /app/tf_setup.py
-# Modify FILM.py to properly handle CPU fallback
 RUN if [ -f "/app/FILM.py" ]; then \
-    # Import our setup at the top of the file\
     sed -i '1s/^/import tensorflow as tf\nfrom tf_setup import setup_tensorflow\n/' /app/FILM.py && \
-    # Add GPU check and CPU fallback in __init__\
     sed -i '/def __init__/a\        # Check if GPU is disabled and use CPU if needed\n        if "CUDA_VISIBLE_DEVICES" in os.environ and os.environ["CUDA_VISIBLE_DEVICES"] == "-1":\n            print("GPU is disabled, using CPU for FILM")\n            self._device = "/cpu:0"\n        else:\n            self._device = "/gpu:0"\n        print(f"FILM will use device: {self._device}")' /app/FILM.py && \
-    # Add device context to __call__\
     sed -i '/def __call__/a\        with tf.device(self._device):' /app/FILM.py && \
-    # Fix the model call indentation after adding the with statement\
     sed -i 's/        result = self._model/            try:\n                result = self._model/g' /app/FILM.py && \
-    sed -i '/result = self._model/a\            except Exception as e:\n                print(f"Error during model inference: {e}, trying CPU fallback")\n                with tf.device("/cpu:0"):\n                    result = self._model(inputs, training=False)' /app/FILM.py; \
-    # Make sure os is imported if not already\
     sed -i '1s/^/import os\n/' /app/FILM.py; \
 fi
-# Set environment variables for GPU compatibility
-ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}
-ENV PATH=/usr/local/cuda/bin:${PATH}
-ENV CUDA_VISIBLE_DEVICES=0
-ENV TF_FORCE_GPU_ALLOW_GROWTH=true
-# Create a startup script with proper error handling
 RUN echo '#!/bin/bash\n\
 set -e\n\
 \n\
-# Check CUDA and cuDNN status\n\
 echo "CUDA libraries:"\n\
 ldconfig -p | grep cuda\n\
 echo "cuDNN libraries:"\n\
 ldconfig -p | grep cudnn\n\
 \n\
-# Test TensorFlow GPU\n\
 python3 -c "import tensorflow as tf; print(\\"Num GPUs Available: \\", len(tf.config.list_physical_devices(\\"GPU\\")))" || {\n\
   echo "TensorFlow GPU test failed, falling back to CPU"\n\
   export CUDA_VISIBLE_DEVICES=-1\n\
 }\n\
 \n\
-# Run the app with proper error handling\n\
-exec streamlit run app.py --server.port=8501 --server.address=0.0.0.0\n\
-' > /app/start.sh && chmod +x /app/start.sh
-# Expose port for Streamlit
 EXPOSE 8501
-# Use the startup script
 CMD ["/app/start.sh"]

+# Use a CUDA base image without preinstalled cuDNN to avoid conflicts
+FROM nvidia/cuda:12.3.2-devel-ubuntu22.04
 # Set environment variables
 ENV DEBIAN_FRONTEND=noninteractive
 # Install system dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
+    git wget curl ca-certificates \
+    python3 python3-pip python3-dev \
+    ffmpeg libsm6 libxext6 libgl1-mesa-glx && \
+    apt-get clean && rm -rf /var/lib/apt/lists/*
+# --- Download and install cuDNN 9.3.0 ---
+# Download the archive directly from NVIDIA. The redistributable tarball
+# unpacks to <archive>/include and <archive>/lib (it has no cuda/ or lib64/
+# level inside), so the headers and libraries are copied from those
+# directories into the CUDA toolkit tree. ldconfig is run afterwards so the
+# freshly copied libraries show up in the dynamic linker cache.
+ARG CUDNN_ARCHIVE=cudnn-linux-x86_64-9.3.0.75_cuda12-archive
+RUN wget -q -O /tmp/${CUDNN_ARCHIVE}.tar.xz \
+        https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_ARCHIVE}.tar.xz && \
+    tar -xJf /tmp/${CUDNN_ARCHIVE}.tar.xz -C /tmp && \
+    cp -P /tmp/${CUDNN_ARCHIVE}/include/cudnn*.h /usr/local/cuda/include/ && \
+    cp -P /tmp/${CUDNN_ARCHIVE}/lib/libcudnn* /usr/local/cuda/lib64/ && \
+    chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn* && \
+    ldconfig && \
+    rm -rf /tmp/${CUDNN_ARCHIVE}.tar.xz /tmp/${CUDNN_ARCHIVE}
+# Set environment variables for CUDA/cuDNN libraries
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}
+ENV PATH=/usr/local/cuda/bin:${PATH}
+ENV CUDA_VISIBLE_DEVICES=0
 # Set working directory
 WORKDIR /app
+# Copy requirements and install Python dependencies
 COPY requirements.txt /app/
 RUN pip3 install --no-cache-dir --upgrade pip setuptools wheel
+# Install TensorFlow GPU support (using version 2.15.0 here for compatibility)
+# NOTE(review): TensorFlow's published tested-build table lists 2.15.0 against
+# CUDA 12.2 / cuDNN 8.9, whereas this image installs cuDNN 9.3 above — confirm
+# that TensorFlow actually detects the GPU with this combination.
+RUN pip3 install --no-cache-dir tensorflow==2.15.0
+# Install the remaining packages from requirements.txt (skip dependency resolution)
 RUN pip3 install --no-cache-dir --no-deps -r requirements.txt
 RUN pip3 install --no-cache-dir tensorflow-hub==0.14.0
 RUN pip3 install --no-cache-dir opencv-python-headless opencv-contrib-python-headless
+# Copy the application code
 COPY . /app/
+# Create a CPU fallback setup for TensorFlow
 RUN echo 'import tensorflow as tf\n\
 import os\n\
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"\n\
+\ndef setup_tensorflow():\n\
     try:\n\
         physical_devices = tf.config.list_physical_devices("GPU")\n\
+        if physical_devices:\n\
             print(f"Found {len(physical_devices)} GPU(s)")\n\
             for device in physical_devices:\n\
                 tf.config.experimental.set_memory_growth(device, True)\n\
                 print(f"Enabled memory growth for {device}")\n\
         else:\n\
     except Exception as e:\n\
         print(f"Error setting up TensorFlow: {e}")\n\
         print("Disabling GPU and falling back to CPU")\n\
         os.environ["CUDA_VISIBLE_DEVICES"] = "-1"\n\
 \n\
+setup_tensorflow()\n' > /app/tf_setup.py
+# Patch FILM.py to ensure proper GPU/CPU fallback, if the file exists
 RUN if [ -f "/app/FILM.py" ]; then \
     sed -i '1s/^/import tensorflow as tf\nfrom tf_setup import setup_tensorflow\n/' /app/FILM.py && \
     sed -i '/def __init__/a\        # Check if GPU is disabled and use CPU if needed\n        if "CUDA_VISIBLE_DEVICES" in os.environ and os.environ["CUDA_VISIBLE_DEVICES"] == "-1":\n            print("GPU is disabled, using CPU for FILM")\n            self._device = "/cpu:0"\n        else:\n            self._device = "/gpu:0"\n        print(f"FILM will use device: {self._device}")' /app/FILM.py && \
     sed -i '/def __call__/a\        with tf.device(self._device):' /app/FILM.py && \
     sed -i 's/        result = self._model/            try:\n                result = self._model/g' /app/FILM.py && \
+    sed -i '/result = self._model/a\            except Exception as e:\n                print(f"Error during model inference: {e}, trying CPU fallback")\n                with tf.device("/cpu:0"):\n                    result = self._model(inputs, training=False)' /app/FILM.py && \
     sed -i '1s/^/import os\n/' /app/FILM.py; \
 fi
+# Create a startup script that reports CUDA/cuDNN status and launches Streamlit.
+# The two library listings are diagnostics only: grep exits non-zero when
+# nothing matches, which under `set -e` would abort the script before Streamlit
+# starts, so each listing falls back to printing "(none found)" instead.
 RUN echo '#!/bin/bash\n\
 set -e\n\
 \n\
 echo "CUDA libraries:"\n\
+ldconfig -p | grep cuda || echo "  (none found)"\n\
 echo "cuDNN libraries:"\n\
+ldconfig -p | grep cudnn || echo "  (none found)"\n\
 \n\
 python3 -c "import tensorflow as tf; print(\\"Num GPUs Available: \\", len(tf.config.list_physical_devices(\\"GPU\\")))" || {\n\
   echo "TensorFlow GPU test failed, falling back to CPU"\n\
   export CUDA_VISIBLE_DEVICES=-1\n\
 }\n\
 \n\
+exec streamlit run app.py --server.port=8501 --server.address=0.0.0.0\n' > /app/start.sh && chmod +x /app/start.sh
+# Expose the port for Streamlit
 EXPOSE 8501
+# Use the startup script as the container's entrypoint
 CMD ["/app/start.sh"]