LovnishVerma committed on
Commit
eb20dff
·
verified ·
1 Parent(s): ac4c54d

Upload 3 files

Browse files
Files changed (1) hide show
  1. Dockerfile +64 -0
Dockerfile ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ==============================================================================
2
+ # 🐳 WebScraper.pro — Dockerfile for Hugging Face Free Spaces
3
+ # ==============================================================================
4
+ # This Dockerfile is highly optimized for Hugging Face Spaces:
5
+ # 1. Uses the official Playwright base image matching playwright==1.44.0
6
+ # 2. Runs as a secure, non-root user (UID 1000) as required by HF Spaces
7
+ # 3. Ensures write access for SQLite database, logs, and scraped exports
8
+ # 4. Serves the Flask app via Gunicorn on the standard HF Space port (7860)
9
+ # ==============================================================================
10
+
11
+ # Base image: the official Playwright Python image (Ubuntu Jammy variant), pinned to tag v1.44.0-jammy.
12
+ # NOTE(review): the tag is meant to match the playwright pin in requirements.txt (not visible here) — confirm.
13
+ FROM mcr.microsoft.com/playwright/python:v1.44.0-jammy
14
+
15
+ # Run subsequent shell-form instructions under bash with pipefail, so a failing stage in a pipeline fails the build
16
+ SHELL ["/bin/bash", "-o", "pipefail", "-c"]
17
+
18
+ # Runtime environment: Flask entry point and mode, browser location, listening port, then Python interpreter flags
19
+ ENV FLASK_APP=run.py \
20
+     FLASK_ENV=production \
21
+     PLAYWRIGHT_BROWSERS_PATH=/ms-playwright \
22
+     PORT=7860 \
23
+     PYTHONDONTWRITEBYTECODE=1 \
24
+     PYTHONUNBUFFERED=1
25
+
26
+ # Add the sqlite3 package; the apt cache and package lists are removed in the same layer
27
+ RUN apt-get update \
28
+     && apt-get install --yes --no-install-recommends sqlite3 \
29
+     && apt-get clean \
30
+     && rm -rf /var/lib/apt/lists/*
31
+
32
+ # Working directory for the remaining instructions; relative paths below resolve against /app
33
+ WORKDIR /app
34
+
35
+ # Python dependencies
36
+ # requirements.txt is copied on its own so this layer stays cached until the dependency list changes
37
+ COPY requirements.txt ./
38
+ RUN pip install --no-cache-dir --upgrade pip \
39
+     && pip install --no-cache-dir --requirement requirements.txt
40
+
41
+ # Create non-root 'user' (UID 1000, the UID Hugging Face Spaces runs as), pre-create the writable data dirs, and hand /app to it
42
+ # --non-unique keeps useradd from aborting the build if the base image already has an account at UID 1000 (NOTE(review): likely 'pwuser' — confirm)
43
+ RUN useradd --create-home --non-unique --uid 1000 user && \
44
+     mkdir -p /app/database /app/logs /app/exports && \
45
+     chown -R user:user /app
46
+
47
+ # Set mode 755 recursively on the Playwright browser directory so the non-root user can read and execute the browsers
48
+ # (per the original author, the official Playwright image keeps its browsers under /ms-playwright)
49
+ RUN chmod --recursive 755 /ms-playwright
50
+
51
+ # Bring in the remaining build-context files, owned by 'user' from the moment they are copied
52
+ COPY --chown=user:user . /app/
53
+
54
+ # Later instructions and the running container execute as 'user' rather than root
55
+ USER user
56
+
57
+ # Document port 7860, the port Hugging Face Spaces expects the app to listen on
58
+ EXPOSE 7860/tcp
59
+
60
+ # Start the application under Gunicorn, loading the `app` object from the `run` module
61
+ # - --bind=0.0.0.0:7860: accept connections on every interface, on the Hugging Face port
62
+ # - --workers=2: two worker processes (sized by the original author for HF's 2-vCPU free tier)
63
+ # - --timeout=120: generous worker timeout, because scraping requests can take a long time
64
+ CMD ["gunicorn", "--bind=0.0.0.0:7860", "--workers=2", "--timeout=120", "run:app"]