# scraper / Dockerfile
# samuelolubukun's picture
# Upload 7 files
# 0f4206c verified
# --- Build stage ---
# Fetches the crawl4ai sources; the final stage copies the server files out of
# /build/crawl4ai/deploy/docker with COPY --from=build.
FROM python:3.12-slim-bookworm AS build

# Git ref of crawl4ai to check out. `git clone --branch` accepts a branch or a
# tag name (not a bare commit SHA). Defaults to the moving "main" branch;
# pass --build-arg CRAWL4AI_REF=<tag> for a reproducible build.
ARG CRAWL4AI_REF=main

ENV PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    DEBIAN_FRONTEND=noninteractive \
    HF_PORT=7860

# Package lists are removed in the same layer that downloaded them.
RUN apt-get update && apt-get install -y --no-install-recommends curl gnupg git \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /build

# Shallow clone (history depth 1) of the crawl4ai repo to get deploy/docker files
RUN git clone --depth 1 --branch "${CRAWL4AI_REF}" https://github.com/unclecode/crawl4ai.git /build/crawl4ai
# --- Final image ---
FROM python:3.12-slim-bookworm

# Build-time only: visible to the RUN steps of this stage (so apt/debconf do
# not prompt) but not persisted into the running container's environment.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime environment.
#   PYTHONUNBUFFERED / PYTHONDONTWRITEBYTECODE: unbuffered output, no .pyc files
#   PIP_NO_CACHE_DIR: pip keeps no download cache
#   HF_PORT: port referenced by EXPOSE and HEALTHCHECK later in this stage
ENV PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    HF_PORT=7860
# OS-level packages for the final image (one per line, sorted, to keep diffs small).
# The apt package lists are deleted in the same layer that fetched them.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        cmake \
        curl \
        git \
        gnupg \
        libasound2 \
        libatk-bridge2.0-0 \
        libatk1.0-0 \
        libatspi2.0-0 \
        libcairo2 \
        libcups2 \
        libdbus-1-3 \
        libdrm2 \
        libgbm1 \
        libglib2.0-0 \
        libjpeg-dev \
        libnspr4 \
        libnss3 \
        libpango-1.0-0 \
        libx11-6 \
        libxcb1 \
        libxcomposite1 \
        libxdamage1 \
        libxext6 \
        libxfixes3 \
        libxkbcommon0 \
        libxrandr2 \
        pkg-config \
        python3-dev \
        redis-server \
        supervisor \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Bring every installed package up to the latest version available in the repos.
RUN apt-get update \
    && apt-get dist-upgrade -y \
    && rm -rf /var/lib/apt/lists/*
# Unprivileged system account "appuser" (with a matching group) and its home directory
RUN groupadd --system appuser \
    && useradd --no-log-init --system --gid appuser appuser \
    && mkdir --parents /home/appuser \
    && chown --recursive appuser:appuser /home/appuser

# All following relative paths resolve against /app
WORKDIR /app
# The crawl4ai library itself, from the package index
RUN pip install --no-cache-dir crawl4ai

# Python packages required by the API server (sorted by name)
RUN pip install --no-cache-dir \
        "anyio==4.9.0" \
        "dnspython>=2.7.0" \
        "email-validator==2.2.0" \
        "fastapi>=0.115.12" \
        "gunicorn>=23.0.0" \
        "httpx[http2]>=0.27.2" \
        "jwt>=1.3.1" \
        "mcp>=1.18.0" \
        "prometheus-fastapi-instrumentator>=7.1.0" \
        "psutil>=5.9.0" \
        "pydantic>=2.11" \
        "rank-bm25==0.2.2" \
        "redis>=5.2.1" \
        "slowapi==0.1.9" \
        "sse-starlette==2.2.1" \
        "uvicorn>=0.34.2" \
        "websockets>=15.0.1"
# Setup Playwright + Chromium.
# `playwright install --with-deps` pulls in OS packages through the system
# package manager, so the apt package lists it downloads are removed in the
# same layer to keep them out of the image.
RUN crawl4ai-setup \
    && playwright install --with-deps chromium \
    && rm -rf /var/lib/apt/lists/*

# The steps above run as root, so the browsers land under /root/.cache.
# Copy the Chromium build into appuser's cache directory and hand it over,
# because the container later runs as appuser.
RUN mkdir -p /home/appuser/.cache/ms-playwright \
    && cp -r /root/.cache/ms-playwright/chromium-* /home/appuser/.cache/ms-playwright/ \
    && chown -R appuser:appuser /home/appuser/.cache
# Server modules from the crawl4ai checkout made in the build stage.
# Several sources in one COPY: the destination must be a directory ending in "/".
COPY --from=build \
    /build/crawl4ai/deploy/docker/server.py \
    /build/crawl4ai/deploy/docker/api.py \
    /build/crawl4ai/deploy/docker/auth.py \
    /build/crawl4ai/deploy/docker/crawler_pool.py \
    /build/crawl4ai/deploy/docker/hook_manager.py \
    /build/crawl4ai/deploy/docker/job.py \
    /build/crawl4ai/deploy/docker/monitor.py \
    /build/crawl4ai/deploy/docker/monitor_routes.py \
    /build/crawl4ai/deploy/docker/mcp_bridge.py \
    /build/crawl4ai/deploy/docker/schemas.py \
    /build/crawl4ai/deploy/docker/utils.py \
    /build/crawl4ai/deploy/docker/webhook.py \
    /app/

# The static directory, copied as /app/static
COPY --from=build /build/crawl4ai/deploy/docker/static /app/static
# Project-specific server config and supervisor config, both placed in /app
COPY config.yml supervisord.conf /app/

# Make sure the Redis data/log directories and the cache directory exist, then
# give them and /app to appuser so the unprivileged process can write there.
RUN mkdir -p \
        /var/lib/redis \
        /var/log/redis \
        /home/appuser/.cache \
    && chown -R appuser:appuser \
        /var/lib/redis \
        /var/log/redis \
        /home/appuser/.cache \
        /app
# Document the service port (HF_PORT is set by ENV earlier in this stage).
EXPOSE ${HF_PORT}

# Run the container as the unprivileged account.
USER appuser

# Probe the /health endpoint with curl. Shell form, so ${HF_PORT} is expanded
# by the shell when the check runs.
HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
    CMD curl -f http://localhost:${HF_PORT}/health || exit 1

# Absolute config path (the file is copied to /app/supervisord.conf) so that
# start-up does not depend on the working directory still being /app.
CMD ["supervisord", "-c", "/app/supervisord.conf"]