# AB_Testing_RAG / Dockerfile
# Author: kamkol
# Last commit: Fix PDFLoader to process pages individually for correct page numbering
# Commit: 6b569cb
# Base image: Python 3.13 on Debian bookworm-slim with uv preinstalled
FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim

# Create an unprivileged account (UID 1000, with a home directory) and switch
# to it, so that every following instruction and the app itself do not run as root
RUN useradd --create-home --uid 1000 user
USER user

# Environment for the unprivileged user:
#   HOME                - the user's home directory
#   PATH                - user-local bin directory takes precedence
#   UVICORN_WS_PROTOCOL - WebSocket implementation selected for uvicorn
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    UVICORN_WS_PROTOCOL=websockets

# All remaining instructions run relative to the application directory
WORKDIR ${HOME}/app
# Bring the whole build context into the app directory, owned by the app user
COPY --chown=user . ${HOME}/app

# Install the project dependencies.
# (The stricter `uv sync --frozen` variant is intentionally left disabled.)
RUN uv sync

# Make sure the data directory exists; -p makes this a no-op if it already does
RUN mkdir -p "${HOME}/app/data"

# Extra packages required on top of the project dependencies
RUN uv pip install \
    huggingface_hub \
    datasets \
    python-dotenv \
    pypdf2
# Download PDFs from the Hugging Face dataset.
# The access token is supplied as a build arg (--build-arg HF_TOKEN=...).
# NOTE(review): build args are visible in the image history; a BuildKit secret
# mount (RUN --mount=type=secret,id=HF_TOKEN ...) would avoid that - consider
# migrating once the build pipeline can provide the secret that way.
ARG HF_TOKEN

# Report whether the token is present and, if so, log in to the Hugging Face Hub.
# - Only POSIX shell syntax is used: RUN executes under /bin/sh (dash on Debian),
#   which rejects bash-only substring expansions such as ${VAR:0:1} with
#   "Bad substitution". No part of the token is printed.
# - The login runs through `uv run` so it uses the environment in which
#   huggingface_hub was installed, not the bare system interpreter.
# - The token is read from the environment inside Python instead of being
#   spliced into the source string, so quotes or other special characters in the
#   token cannot break the command.
# - A failed login only warns, matching the best-effort download step below.
# NOTE(review): login() presumably stores the token in the user's Hugging Face
# cache directory, which would persist it in this image layer - confirm that is
# acceptable for this image.
RUN echo "=================== CHECKING HF_TOKEN ===================" && \
    if [ -n "${HF_TOKEN}" ]; then \
        echo "HF_TOKEN is available"; \
        HF_TOKEN="${HF_TOKEN}" uv run python -c "import os; from huggingface_hub import login; login(token=os.environ['HF_TOKEN'])" \
            || echo "WARNING: Hugging Face login failed, but continuing build"; \
    else \
        echo "ERROR: HF_TOKEN is empty or not set!"; \
    fi

# Use uv run to ensure we use the environment where datasets is installed.
# A failed download is reported but does not abort the build.
RUN uv run python download_pdfs.py || echo "WARNING: download_pdfs.py failed, but continuing build"
# Run preprocessing to generate the embeddings.
# Note: this requires OPENAI_API_KEY to be provided at build time
# (--build-arg OPENAI_API_KEY=...). For Hugging Face, you'll need to use their
# build secrets feature.
# NOTE(review): build args are visible in the image history; a BuildKit secret
# mount would keep the key out of it.
ARG OPENAI_API_KEY

# Report whether the key is present and, if so, run preprocessing.
# - Only POSIX shell syntax is used: RUN executes under /bin/sh (dash on Debian),
#   which rejects bash-only substring expansions such as ${VAR:0:1} with
#   "Bad substitution". No part of the key is printed.
# - The key is passed to the script through a quoted assignment so that
#   whitespace or shell metacharacters in the value cannot split the command.
# - A preprocessing failure is reported but does not abort the build.
RUN echo "=================== CHECKING OPENAI_API_KEY ===================" && \
    if [ -n "${OPENAI_API_KEY}" ]; then \
        echo "OPENAI_API_KEY is available"; \
        OPENAI_API_KEY="${OPENAI_API_KEY}" uv run python preprocess.py \
            || echo "WARNING: preprocessing failed, app will not work properly"; \
    else \
        echo "ERROR: OPENAI_API_KEY is empty or not set!"; \
        echo "WARNING: App will not work without preprocessed data!"; \
    fi
# Document the TCP port the app listens on (EXPOSE does not publish it by itself)
EXPOSE 7860/tcp

# Default command: start the Chainlit app through uv, listening on all
# interfaces on port 7860
CMD ["uv", "run", "chainlit", "run", "app.py", "--host", "0.0.0.0", "--port", "7860"]