# bluewhale2025 - commit 29ad632
# Remove punkt_tab references and update NLTK data handling
FROM python:3.9-slim

# Create the application tree: upload and processed-data folders plus a
# directory for NLTK data.
# The directories keep their default root-owned 0755 mode instead of being
# made world-writable (chmod -R 777); write access for the runtime account
# comes from the recursive chown of /home/user performed when that account
# is created further down.
RUN mkdir -p \
        /home/user/app/data/uploads \
        /home/user/app/data/processed \
        /home/user/app/nltk_data

WORKDIR /home/user/app
# Install system dependencies: compiler toolchain, OpenSSL/libffi headers,
# curl, git, PDF utilities (poppler) and Tesseract OCR with English data.
# --no-install-recommends keeps optional "Recommends" packages out of the
# image; the package list is sorted alphabetically to keep diffs small.
# poppler-data is listed explicitly because it is presumably only a
# Recommends of the poppler library (encoding data for CJK PDFs) and would
# otherwise no longer be installed - drop it if such PDFs are never processed.
# NOTE(review): python3-dev and python3-pip install Debian's system Python
# next to the interpreter shipped by the base image; they look removable -
# confirm nothing relies on /usr/bin/python3 before dropping them.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        git \
        libffi-dev \
        libssl-dev \
        poppler-data \
        poppler-utils \
        python3-dev \
        python3-pip \
        tesseract-ocr \
        tesseract-ocr-eng \
    && rm -rf /var/lib/apt/lists/*
# Environment baked into the image: unbuffered Python output plus the paths
# exported for NLTK data, the Hugging Face hub cache, uploads and processed
# files.
# NLTK_DATA is assigned a second time later in this file; that later value is
# the one the final image ends up with.
ENV PYTHONUNBUFFERED=1 \
    NLTK_DATA=/home/user/app/nltk_data \
    HUGGINGFACE_HUB_CACHE=/home/user/app/huggingface_cache \
    UPLOAD_FOLDER=/home/user/app/data/uploads \
    PROCESSED_FOLDER=/home/user/app/data/processed
# Add an unprivileged account named "user" with UID 1000 and hand it
# ownership of everything under /home/user
RUN useradd --create-home --uid 1000 user \
    && chown -R user:user /home/user

# Bring in only the dependency manifest at this point, so the layers built
# from it stay cached while the application sources change
COPY --chown=user:user requirements.txt ./
# The package installation and the writes below /usr/local/share need root
USER root

# Install unzip ahead of the Python packages so that editing requirements.txt
# does not invalidate this apt layer
RUN apt-get update && apt-get install -y --no-install-recommends \
        unzip \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Download the NLTK resources (punkt, stopwords, wordnet,
# averaged_perceptron_tagger) into a shared location.
# nltk.download() reports a failed download through its return value instead
# of raising, so the exit status is derived from those return values: any
# failed download now aborts the build instead of passing silently.
# The directory is never made world-writable; the final chmod only guarantees
# that every user can read the data and traverse the directories.
RUN mkdir -p /usr/local/share/nltk_data \
    && python -c "import sys, nltk; names = ['punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger']; results = [nltk.download(n, download_dir='/usr/local/share/nltk_data') for n in names]; sys.exit(0 if all(results) else 1)" \
    && chmod -R a+rX /usr/local/share/nltk_data

# Point NLTK at the shared data directory
ENV NLTK_DATA=/usr/local/share/nltk_data

# Verify that the punkt tokenizer resolves through NLTK's search path;
# nltk.data.find() raises LookupError (failing the build) when it does not
RUN python -c "import nltk; nltk.data.find('tokenizers/punkt')"
# Drop privileges: the remaining build steps and the running container use
# the unprivileged account
USER user

# Copy application files into the working directory, owned by that account
COPY --chown=user:user . .

# Make scripts executable
RUN chmod +x /home/user/app/start.sh /home/user/app/download_nltk_data.py

# Document the port the app runs on
EXPOSE 7860

# Health check: the container counts as unhealthy after 3 consecutive failed
# HTTP probes of the root URL
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:7860/ || exit 1

# Command to run the application. The script is copied to /home/user/app
# (the path the chmod above uses); nothing in this file creates /app, so the
# previous /app/start.sh target could not be started.
CMD ["/home/user/app/start.sh"]