# AB_Testing_RAG_Agent / Dockerfile
# kamkol's picture
# Better handling large preprocessed data file to Huggingface
# 2585f8a
FROM python:3.11-slim
WORKDIR /app
# Install the compiler toolchain needed to build Python packages that ship
# native extensions. `--no-install-recommends` keeps apt from pulling in
# optional packages, and the package index is removed in the same layer so it
# never ends up in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        gcc \
    && rm -rf /var/lib/apt/lists/*
# Install Python dependencies from the manifest alone, before the source is
# copied, so this layer is reused until requirements.txt itself changes.
COPY requirements.txt ./
RUN pip install --no-cache-dir --requirement requirements.txt
# Bring in the remainder of the build context (the application code)
COPY . ./
# Make sure the PDF directory (data) and the pre-processed data directory
# (processed_data) exist under the working directory; `mkdir -p` is a no-op
# for any directory that is already present.
RUN mkdir -p data processed_data
# Configure Streamlit to run in headless mode (no welcome screen).
# - config.toml: `showWarningOnDirectExecution` is a `[global]` option and
#   `headless` is a `[server]` option; a `[general]` section is not part of
#   config.toml, so settings placed there are not applied.
# - credentials.toml: the onboarding e-mail lives here under `[general]`; an
#   empty value records that the e-mail prompt has already been answered.
RUN mkdir -p /root/.streamlit && \
    printf '%s\n' \
      '[global]' \
      'showWarningOnDirectExecution = false' \
      '' \
      '[server]' \
      'headless = true' \
      > /root/.streamlit/config.toml && \
    printf '%s\n' \
      '[general]' \
      'email = ""' \
      > /root/.streamlit/credentials.toml
# Extra Python packages installed on top of requirements.txt.
# NOTE(review): these are unpinned; consider pinning them, or moving them into
# requirements.txt, so rebuilds are reproducible.
RUN pip install --no-cache-dir \
        huggingface_hub \
        datasets
# Runtime defaults: the address and port handed to Streamlit at start-up
ENV HOST=0.0.0.0 \
    PORT=8000
# Document the listening port (EXPOSE does not publish it by itself)
EXPOSE ${PORT}
# Create the entrypoint script.
# At container start it:
#   1. checks for processed_data/document_chunks.pkl and the
#      processed_data/qdrant_vectorstore directory;
#   2. if either is missing, requires HF_TOKEN, then runs download_pdfs.py
#      followed by scripts/preprocess_data.py (exits 1 when HF_TOKEN is unset);
#   3. starts Streamlit on $HOST:$PORT.
# The final command uses `exec` so Streamlit replaces the shell as PID 1 and
# receives the SIGTERM sent by `docker stop`; $HOST and $PORT are quoted so
# each is always passed as exactly one argument.
# The script text is single-quoted, so ${HF_TOKEN}, $HOST and $PORT are
# expanded when the container runs, not while the image is built.
# NOTE(review): the script keeps going if download_pdfs.py or
# preprocess_data.py fails; add `set -e` if such a failure should abort
# start-up instead.
RUN printf '%s\n' \
      '#!/bin/bash' \
      'echo "Starting AB Testing RAG Agent"' \
      'echo "Checking for pre-processed data..."' \
      'if [ ! -f "processed_data/document_chunks.pkl" ] || [ ! -d "processed_data/qdrant_vectorstore" ]; then' \
      '  echo "Pre-processed data not found. Downloading PDFs..."' \
      '  if [ -n "${HF_TOKEN}" ]; then' \
      '    python download_pdfs.py' \
      '    echo "Running preprocessing..."' \
      '    python scripts/preprocess_data.py' \
      '  else' \
      '    echo "Error: HF_TOKEN environment variable is not set. Cannot download PDFs."' \
      '    echo "Please set the HF_TOKEN environment variable in your Hugging Face Space settings."' \
      '    exit 1' \
      '  fi' \
      'else' \
      '  echo "Using existing pre-processed data"' \
      'fi' \
      'exec streamlit run streamlit_app.py --server.address "$HOST" --server.port "$PORT"' \
      > /app/entrypoint.sh && \
    chmod +x /app/entrypoint.sh
# Run the application (exec form, so the script is started directly)
ENTRYPOINT ["/app/entrypoint.sh"]