# Image for the Python application served on port 7860.
#
# Layers are ordered from least to most frequently changing: OS packages,
# runtime user and directories, Python dependencies, NLTK corpora, and finally
# the application source, so that editing app code does not reinstall anything.
FROM python:3.9-slim

# Runtime configuration.
# NLTK_DATA points at the system-wide corpus directory populated further down;
# the remaining paths live under the application directory owned by `user`.
ENV PYTHONUNBUFFERED=1 \
    NLTK_DATA=/usr/local/share/nltk_data \
    HUGGINGFACE_HUB_CACHE=/home/user/app/huggingface_cache \
    UPLOAD_FOLDER=/home/user/app/data/uploads \
    PROCESSED_FOLDER=/home/user/app/data/processed

# System packages, installed in a single layer so `apt-get update` is never
# cached separately from the install that relies on it.
# ca-certificates is listed explicitly because --no-install-recommends stops
# curl/git from pulling it in implicitly.
# NOTE(review): python3-dev and python3-pip install Debian's own Python next to
# the interpreter shipped by the base image; they look redundant, but are kept
# until requirements.txt has been checked against an image without them.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        git \
        libffi-dev \
        libssl-dev \
        poppler-utils \
        python3-dev \
        python3-pip \
        tesseract-ocr \
        tesseract-ocr-eng \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# Create the unprivileged runtime user first, then the directories it writes
# to, and hand the whole home directory over to it. Ownership replaces the
# previous world-writable (777) permissions.
# NOTE(review): if the container is ever started with a UID other than 1000,
# these directories will need group-writable permissions instead.
RUN useradd --create-home --uid 1000 user \
    && mkdir -p \
        /home/user/app/data/uploads \
        /home/user/app/data/processed \
        /home/user/app/huggingface_cache \
        /home/user/app/nltk_data \
    && chown -R user:user /home/user

WORKDIR /home/user/app

# Copy only the dependency manifest first so the pip layer stays cached until
# requirements.txt itself changes.
COPY --chown=user:user requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

# Download the NLTK corpora as root into a system-wide directory, make them
# readable (but not writable) by everyone, and fail the build if the punkt
# tokenizer cannot be located afterwards.
RUN mkdir -p /usr/local/share/nltk_data \
    && python -c "import nltk; [nltk.download(pkg, download_dir='/usr/local/share/nltk_data') for pkg in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger')]" \
    && chmod -R a+rX /usr/local/share/nltk_data \
    && python -c "import nltk; nltk.data.path.append('/usr/local/share/nltk_data'); nltk.data.find('tokenizers/punkt')"

# Application source, owned by the runtime user.
COPY --chown=user:user . .

# Make the entry scripts executable regardless of the mode bits in the build
# context.
RUN chmod +x /home/user/app/start.sh /home/user/app/download_nltk_data.py

# Everything from here on, including the running container, is unprivileged.
USER user

# Port the application listens on (documentation only; publish with -p).
EXPOSE 7860

# Mark the container unhealthy when the HTTP root stops answering.
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:7860/ || exit 1

# start.sh lives in WORKDIR (/home/user/app); nothing in this image creates
# /app, so the absolute path must point into the application directory.
CMD ["/home/user/app/start.sh"]