File size: 2,721 Bytes
3022fd1
 
3c83f33
 
 
 
 
 
 
 
 
3022fd1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3c83f33
 
 
 
 
 
 
 
3022fd1
 
3c83f33
3022fd1
83a76fb
544d677
3022fd1
83a76fb
 
 
544d677
83a76fb
 
 
23e4091
 
 
 
 
 
83a76fb
 
 
 
 
5daea2d
 
83a76fb
544d677
23e4091
 
 
83a76fb
544d677
5daea2d
3022fd1
3c83f33
3022fd1
 
3c83f33
3022fd1
 
 
 
3c83f33
3022fd1
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
FROM python:3.9-slim

# Create the application tree up front: the upload and processed data folders
# plus an app-local NLTK data directory (mkdir -p also creates the parents).
# No world-writable (777) modes are applied; the whole /home/user tree is
# chown'ed to the unprivileged runtime user when that account is created
# further down, and that is the only account that needs write access.
RUN mkdir -p \
        /home/user/app/data/uploads \
        /home/user/app/data/processed \
        /home/user/app/nltk_data

WORKDIR /home/user/app

# OS-level packages: compiler toolchain and SSL/FFI/Python headers, curl
# (used by the HEALTHCHECK), git, the poppler PDF utilities and the tesseract
# OCR engine with its English language data. Packages are listed
# alphabetically; the apt lists are removed in the same layer.
RUN apt-get update \
    && apt-get install -y \
        build-essential \
        curl \
        git \
        libffi-dev \
        libssl-dev \
        poppler-utils \
        python3-dev \
        python3-pip \
        tesseract-ocr \
        tesseract-ocr-eng \
    && rm -rf /var/lib/apt/lists/*

# Environment variables baked into the image.
#   PYTHONUNBUFFERED      - disable Python's stdout/stderr buffering
#   NLTK_DATA             - system-wide directory that the NLTK resources are
#                           downloaded into later in this build
#   HUGGINGFACE_HUB_CACHE - cache directory for Hugging Face Hub downloads
#   UPLOAD_FOLDER         - upload directory under the app's data folder
#   PROCESSED_FOLDER      - processed-output directory under the data folder
ENV PYTHONUNBUFFERED=1 \
    NLTK_DATA=/usr/local/share/nltk_data \
    HUGGINGFACE_HUB_CACHE=/home/user/app/huggingface_cache \
    UPLOAD_FOLDER=/home/user/app/data/uploads \
    PROCESSED_FOLDER=/home/user/app/data/processed

# Add the unprivileged account (UID 1000) and hand it ownership of everything
# under its home directory, which includes the application tree.
RUN useradd --create-home --uid 1000 user \
    && chown -R user:user /home/user

# Bring in the dependency manifest on its own so the dependency-install layer
# stays cached until requirements.txt itself changes.
COPY --chown=user:user requirements.txt ./

# The following build steps write to system locations; make root explicit.
USER root

# Pre-create the system-wide NLTK data directory. Default root-owned modes are
# enough: only root writes here during the build, so no 777 is needed.
RUN mkdir -p /usr/local/share/nltk_data

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Add the unzip utility; recommended extras are skipped and the apt lists are
# purged within the same layer.
RUN apt-get update \
    && apt-get install --yes --no-install-recommends unzip \
    && rm -rf /var/lib/apt/lists/*

# Download the NLTK resources as root into the system-wide data directory.
# nltk.download() signals failure by returning False unless raise_on_error is
# set, which would let a failed download pass silently; raise_on_error=True
# turns any download error into an exception so the build stops instead.
# Afterwards the tree is made readable (not writable) for non-owners.
RUN python -c "import nltk; [nltk.download(name, download_dir='/usr/local/share/nltk_data', raise_on_error=True) for name in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger')]" \
    && chmod -R 755 /usr/local/share/nltk_data

# Point NLTK at the system-wide data directory
ENV NLTK_DATA=/usr/local/share/nltk_data

# Sanity check: nltk.data.find raises if the punkt tokenizer cannot be
# located, which fails the build
RUN python -c "import nltk; nltk.data.path.append('/usr/local/share/nltk_data'); nltk.data.find('tokenizers/punkt')"

# Everything from here on, including the running container, uses the
# unprivileged account.
USER user

# Copy the build context into the working directory (/home/user/app), owned by
# the runtime user.
COPY --chown=user:user . ./

# Mark the launch script and the NLTK helper script as executable. The paths
# are relative to the working directory, which is still /home/user/app, so no
# further USER or WORKDIR instruction is required afterwards.
RUN chmod +x ./start.sh ./download_nltk_data.py

# Document the port the app runs on (EXPOSE does not publish it)
EXPOSE 7860

# Health check: request the root URL with curl; -f makes curl exit non-zero on
# an HTTP error status, and a failed connection also fails the probe
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:7860/ || exit 1

# Command to run the application. The app is copied into /home/user/app (the
# working directory) and start.sh is made executable there, so the script is
# referenced by that absolute path; /app/start.sh does not exist in the image.
CMD ["/home/user/app/start.sh"]