# WebScraper.pro / Dockerfile
# Last change: "Update Dockerfile" by LovnishVerma (commit 71bcb0e, verified)
# ==============================================================================
# 🐳 WebScraper.pro — Dockerfile for Hugging Face Free Spaces
# ==============================================================================
# This Dockerfile is highly optimized for Hugging Face Spaces:
# 1. Uses the official Playwright base image matching playwright==1.44.0
# 2. Runs as a secure, non-root user (UID 1000) as required by HF Spaces
# 3. Ensures write access for SQLite database, logs, and scraped exports
# 4. Serves the Flask app via Gunicorn on the standard HF Space port (7860)
# ==============================================================================
# Base image: the official Playwright for Python image, Ubuntu Jammy variant.
# The tag is pinned to v1.44.0 so the browsers bundled in the image line up with the
# Playwright Python package installed later from requirements.txt.
# NOTE(review): requirements.txt is not visible here — confirm it pins playwright==1.44.0
# and bump this tag together with that pin.
FROM mcr.microsoft.com/playwright/python:v1.44.0-jammy
# Run every subsequent shell-form RUN through bash with `pipefail` enabled, so a failure
# in any stage of a pipeline fails the build step instead of being masked by the last command.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
# --- Runtime environment -------------------------------------------------------
# Python: skip writing .pyc files and keep stdout/stderr unbuffered so log lines
# show up immediately in the container logs.
ENV PYTHONDONTWRITEBYTECODE="1" \
    PYTHONUNBUFFERED="1"

# Flask: application entry module and environment name.
ENV FLASK_APP="run.py" \
    FLASK_ENV="production"

# Playwright: directory that holds the browser builds.
# PORT: port number advertised to the application; note the CMD at the end of this
# file passes 7860 to Gunicorn literally rather than reading this variable.
ENV PLAYWRIGHT_BROWSERS_PATH="/ms-playwright" \
    PORT="7860"
# Extra OS packages: only the sqlite3 package is added on top of the base image.
# Index refresh, install, and cleanup live in a single layer so neither a stale
# package index nor the apt cache ends up in the image.
RUN apt-get update \
    && apt-get install --yes --no-install-recommends sqlite3 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# All following relative paths (COPY destinations, RUN commands, the final CMD) resolve
# against /app; WORKDIR creates the directory if it does not exist yet.
WORKDIR /app
# Python dependencies.
# Only requirements.txt is copied at this point, so this (slow) layer is rebuilt
# solely when the dependency list changes, not on every source edit.
COPY requirements.txt ./
RUN pip install --no-cache-dir --upgrade pip \
    && pip install --no-cache-dir --requirement requirements.txt
# Writable application directories for the non-root runtime user.
# No user is created here: the numeric UID/GID 1000 is used directly (the original
# author notes this is the 'pwuser' account shipped in the Playwright image and the
# UID Hugging Face Spaces expects), which avoids a useradd UID collision.
RUN mkdir --parents /app/database /app/logs /app/exports \
    && chown --recursive 1000:1000 /app
# Browser permissions: intentionally NO `RUN chmod -R ... /ms-playwright` step here.
# A recursive chmod touches the metadata of every browser file, which makes the layered
# filesystem copy the entire browser tree into a new layer and roughly doubles the space
# the browsers occupy in the final image.
# The official Playwright image is expected to ship /ms-playwright already readable and
# executable by every user (its own Dockerfile finishes with `chmod -R 777 /ms-playwright`),
# so the non-root user (UID 1000) can launch the bundled browsers without further changes.
# NOTE(review): if the base image tag changes, re-check with
#   docker run --rm <base-image> ls -ld /ms-playwright
# Application source: copy the whole build context into the working directory (/app),
# assigning ownership to UID/GID 1000 during the copy so no separate chown layer is needed.
COPY --chown=1000:1000 . ./
# Drop root: everything from here on, including the container's main process, runs as UID 1000.
# This comes after all steps that need root (apt, pip, chown).
USER 1000
# Document the port the server listens on (7860, the port Hugging Face Spaces expects
# according to this file's header). EXPOSE is informational only; it does not publish the port.
EXPOSE 7860
# Container entry command: serve the `app` object from the `run` module with Gunicorn.
#   -b 0.0.0.0:7860   bind on all interfaces, port 7860
#   -w 2              two worker processes (sized by the author for a 2-vCPU Space)
#   -t 120            120-second worker timeout, leaving room for slow scraping requests
# Exec (JSON) form is used so Gunicorn runs as PID 1 and receives stop signals directly.
CMD ["gunicorn", "-b", "0.0.0.0:7860", "-w", "2", "-t", "120", "run:app"]