File size: 2,836 Bytes
eb20dff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71bcb0e
 
 
 
eb20dff
71bcb0e
eb20dff
 
 
71bcb0e
 
eb20dff
71bcb0e
 
eb20dff
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# ==============================================================================
# 🐳 WebScraper.pro — Dockerfile for Hugging Face Free Spaces
# ==============================================================================
# This Dockerfile is highly optimized for Hugging Face Spaces:
# 1. Uses the official Playwright base image matching playwright==1.44.0
# 2. Runs as a secure, non-root user (UID 1000) as required by HF Spaces
# 3. Ensures write access for SQLite database, logs, and scraped exports
# 4. Serves the Flask app via Gunicorn on the standard HF Space port (7860)
# ==============================================================================

# Base image: Microsoft's Playwright-for-Python image on Ubuntu Jammy.
# The v1.44.0 tag is intended to line up with the playwright pin in
# requirements.txt; bump both together when upgrading.
FROM mcr.microsoft.com/playwright/python:v1.44.0-jammy

# Run every subsequent RUN step under bash with pipefail, so a failure in any
# stage of a pipeline fails the build instead of being masked.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Runtime environment, grouped by concern:
#   Python     - skip .pyc files and keep stdout/stderr unbuffered for live logs
#   Flask      - application entry module and environment name
#   Playwright - location of the browser binaries inside the image
#   Network    - port the service is expected to listen on
# NOTE(review): FLASK_ENV was deprecated in Flask 2.2 and removed in 2.3;
# confirm whether the application itself still reads it.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    FLASK_APP=run.py \
    FLASK_ENV=production \
    PLAYWRIGHT_BROWSERS_PATH=/ms-playwright \
    PORT=7860

# Extra OS-level tooling: the sqlite3 command-line client.
# The package index is refreshed, used, and purged within this single layer so
# no apt metadata is left behind in the image.
RUN apt-get update \
 && apt-get install --yes --no-install-recommends sqlite3 \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# All application files live under /app; relative paths below resolve there.
WORKDIR /app

# Dependency layer.
# Only requirements.txt is copied at this point, so this (slow) layer is reused
# from cache until the dependency list itself changes.
COPY requirements.txt ./
RUN pip install --no-cache-dir --upgrade pip \
 && pip install --no-cache-dir --requirement requirements.txt

# Writable runtime directories for the non-root user.
# The official Playwright image already ships a 'pwuser' account, so no useradd
# is needed here (avoiding a UID collision); ownership is assigned numerically
# as 1000:1000 to satisfy the Hugging Face Spaces UID 1000 requirement.
# /app/database, /app/logs and /app/exports must be writable at runtime for the
# SQLite database, log files and scraped exports.
RUN mkdir -p /app/database /app/logs /app/exports && \
    chown -R 1000:1000 /app

# No recursive chmod of /ms-playwright is performed on purpose:
# - The official Playwright image documents running as its bundled non-root
#   user, so the browser files are already usable by UID 1000.
# - A `chmod -R` over that tree touches the metadata of every browser file,
#   which copies the whole browser install into a new layer and adds its full
#   size to the image for no functional gain.
# NOTE(review): if a future base image tightens these permissions, fix it with
# a narrowly targeted chmod rather than -R over the whole directory.

# Copy the application source last (it changes most often) and hand ownership
# to UID/GID 1000 in the same step, avoiding a follow-up chown layer.
COPY --chown=1000:1000 . /app

# Document the listening port: 7860 is the port Hugging Face Spaces expects.
EXPOSE 7860/tcp

# Drop root privileges; everything from here on runs as UID 1000.
USER 1000

# Container entry command: serve the `app` object from run.py with Gunicorn.
#   --bind=0.0.0.0:7860  accept connections on every interface, port 7860
#   --workers=2          two worker processes (sized by the original author for
#                        the 2-vCPU free tier)
#   --timeout=120        generous worker timeout, since scraping requests can
#                        take a long time to load and process
CMD ["gunicorn", "--bind=0.0.0.0:7860", "--workers=2", "--timeout=120", "run:app"]