NitinBot001 committed on
Commit
2e94159
·
verified ·
1 Parent(s): af7e677

Update Dockerfile

Browse files
Files changed (1) hide show
  1. Dockerfile +162 -105
Dockerfile CHANGED
@@ -1,120 +1,177 @@
1
- # Use Ubuntu as base image for better Docker-in-Docker support
2
- FROM ubuntu:22.04
3
 
4
- # Avoid prompts from apt
5
- ENV DEBIAN_FRONTEND=noninteractive
6
 
7
- # Install system dependencies
8
  RUN apt-get update && apt-get install -y \
9
- curl \
10
  git \
11
- sudo \
 
 
12
  ca-certificates \
13
- gnupg \
14
- lsb-release \
15
- && rm -rf /var/lib/apt/lists/*
16
-
17
- # Install Docker
18
- RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
19
- && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null \
20
- && apt-get update \
21
- && apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin \
22
  && rm -rf /var/lib/apt/lists/*
23
 
24
- # Create non-root user
25
- RUN useradd -m -s /bin/bash -u 1001 firecrawl \
26
- && usermod -aG docker firecrawl \
27
- && echo 'firecrawl ALL=(ALL) NOPASSWD: /usr/bin/dockerd' >> /etc/sudoers
28
-
29
- # Set working directory
30
- WORKDIR /home/firecrawl
31
 
32
- # Switch to non-root user for cloning
33
  USER firecrawl
 
34
 
35
  # Clone Firecrawl repository
36
- RUN git clone https://github.com/mendableai/firecrawl.git /home/firecrawl/firecrawl
37
-
38
- # Set working directory to firecrawl
39
- WORKDIR /home/firecrawl/firecrawl
40
-
41
- # Copy .env.example to .env in the API directory
42
- RUN if [ -f apps/api/.env.example ]; then \
43
- cp apps/api/.env.example apps/api/.env; \
44
- fi
45
-
46
- # Create startup script
47
- USER root
48
- RUN echo '#!/bin/bash' > /start-firecrawl.sh && \
49
- echo 'set -e' >> /start-firecrawl.sh && \
50
- echo '' >> /start-firecrawl.sh && \
51
- echo 'echo "🔥 Starting Firecrawl Self-Host Setup 🔥"' >> /start-firecrawl.sh && \
52
- echo 'echo "==========================================="' >> /start-firecrawl.sh && \
53
- echo '' >> /start-firecrawl.sh && \
54
- echo '# Start Docker daemon in background if not running' >> /start-firecrawl.sh && \
55
- echo 'if ! pgrep dockerd > /dev/null; then' >> /start-firecrawl.sh && \
56
- echo ' echo "Starting Docker daemon..."' >> /start-firecrawl.sh && \
57
- echo ' sudo dockerd --host=unix:///var/run/docker.sock --host=tcp://0.0.0.0:2376 &' >> /start-firecrawl.sh && \
58
- echo ' sleep 5' >> /start-firecrawl.sh && \
59
- echo 'fi' >> /start-firecrawl.sh && \
60
- echo '' >> /start-firecrawl.sh && \
61
- echo '# Wait for Docker to be ready' >> /start-firecrawl.sh && \
62
- echo 'echo "Waiting for Docker to be ready..."' >> /start-firecrawl.sh && \
63
- echo 'timeout=30' >> /start-firecrawl.sh && \
64
- echo 'while [ $timeout -gt 0 ]; do' >> /start-firecrawl.sh && \
65
- echo ' if docker info > /dev/null 2>&1; then' >> /start-firecrawl.sh && \
66
- echo ' echo "Docker is ready!"' >> /start-firecrawl.sh && \
67
- echo ' break' >> /start-firecrawl.sh && \
68
- echo ' fi' >> /start-firecrawl.sh && \
69
- echo ' sleep 1' >> /start-firecrawl.sh && \
70
- echo ' timeout=$((timeout-1))' >> /start-firecrawl.sh && \
71
- echo 'done' >> /start-firecrawl.sh && \
72
- echo '' >> /start-firecrawl.sh && \
73
- echo 'if [ $timeout -eq 0 ]; then' >> /start-firecrawl.sh && \
74
- echo ' echo "Error: Docker failed to start within 30 seconds"' >> /start-firecrawl.sh && \
75
- echo ' exit 1' >> /start-firecrawl.sh && \
76
- echo 'fi' >> /start-firecrawl.sh && \
77
- echo '' >> /start-firecrawl.sh && \
78
- echo 'cd /home/firecrawl/firecrawl' >> /start-firecrawl.sh && \
79
- echo '' >> /start-firecrawl.sh && \
80
- echo 'echo "Prerequisites check:"' >> /start-firecrawl.sh && \
81
- echo 'echo "✓ Docker and Docker Compose are installed"' >> /start-firecrawl.sh && \
82
- echo 'echo "✓ Repository cloned to /home/firecrawl/firecrawl"' >> /start-firecrawl.sh && \
83
- echo '' >> /start-firecrawl.sh && \
84
- echo 'if [ -f apps/api/.env ]; then' >> /start-firecrawl.sh && \
85
- echo ' echo "✓ Environment file exists at apps/api/.env"' >> /start-firecrawl.sh && \
86
- echo 'else' >> /start-firecrawl.sh && \
87
- echo ' echo "⚠ No .env file found, using defaults"' >> /start-firecrawl.sh && \
88
- echo 'fi' >> /start-firecrawl.sh && \
89
- echo '' >> /start-firecrawl.sh && \
90
- echo 'echo ""' >> /start-firecrawl.sh && \
91
- echo 'echo "Building Firecrawl services..."' >> /start-firecrawl.sh && \
92
- echo 'echo "Running: docker compose build"' >> /start-firecrawl.sh && \
93
- echo 'docker compose build' >> /start-firecrawl.sh && \
94
- echo '' >> /start-firecrawl.sh && \
95
- echo 'echo ""' >> /start-firecrawl.sh && \
96
- echo 'echo "Starting Firecrawl services..."' >> /start-firecrawl.sh && \
97
- echo 'echo "Running: docker compose up"' >> /start-firecrawl.sh && \
98
- echo 'echo ""' >> /start-firecrawl.sh && \
99
- echo '' >> /start-firecrawl.sh && \
100
- echo '# Run docker compose' >> /start-firecrawl.sh && \
101
- echo 'exec docker compose up "$@"' >> /start-firecrawl.sh
102
-
103
- RUN chmod +x /start-firecrawl.sh \
104
- && chown firecrawl:firecrawl /start-firecrawl.sh
105
-
106
- # Switch back to non-root user
107
- USER firecrawl
108
-
109
- # Expose common ports used by Firecrawl
110
- EXPOSE 3002 8080 6379 5432
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  # Health check
113
  HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
114
- CMD docker compose ps -q | xargs docker inspect --format='{{.State.Health.Status}}' | grep -v healthy && exit 1 || exit 0
115
-
116
- # Set entrypoint
117
- ENTRYPOINT ["/start-firecrawl.sh"]
118
 
119
- # Default command (can be overridden)
120
- CMD ["./start-firecrawl.sh"]
 
1
+ # Dockerfile for Hugging Face Spaces - Non-root setup
2
+ FROM node:18-slim
3
 
4
+ # Non-root user: rename the base image's UID/GID 1000 `node` account to `firecrawl` (HF Spaces runs the container as UID 1000, so owned files stay writable at runtime)
5
+ RUN groupmod -n firecrawl node && usermod -l firecrawl -d /home/firecrawl -m node
6
 
7
+ # Install system dependencies as root (recommended-only extras are skipped to keep the layer small)
8
  RUN apt-get update && apt-get install -y --no-install-recommends \
 
9
  git \
10
+ curl \
11
+ wget \
12
+ bash \
13
  ca-certificates \
14
+ redis-server \
15
+ postgresql \
16
+ postgresql-contrib \
17
+ sudo \
 
 
 
 
 
18
  && rm -rf /var/lib/apt/lists/*
19
 
20
+ # Create app directory and set permissions
21
+ RUN mkdir -p /home/firecrawl/app && \
22
+ chown -R firecrawl:firecrawl /home/firecrawl
 
 
 
 
23
 
24
+ # Switch to non-root user
25
  USER firecrawl
26
+ WORKDIR /home/firecrawl/app
27
 
28
  # Clone Firecrawl repository
29
+ RUN git clone https://github.com/mendableai/firecrawl.git .
30
+
31
+ # Install Node.js dependencies for API, devDependencies included: `npm run build` below presumably needs the build tooling (TODO confirm). NOTE(review): `npm ci` requires a package-lock.json in this directory.
32
+ WORKDIR /home/firecrawl/app/apps/api
33
+ RUN npm ci
34
+
35
+ # Install Node.js dependencies for Playwright service (devDependencies included for the same reason)
36
+ WORKDIR /home/firecrawl/app/apps/playwright-service-ts
37
+ RUN npm ci
38
+
39
+ # Build the applications (runs each package's `build` script)
40
+ WORKDIR /home/firecrawl/app/apps/api
41
+ RUN npm run build
42
+
43
+ WORKDIR /home/firecrawl/app/apps/playwright-service-ts
44
+ RUN npm run build
45
+
46
+ # Create directories for runtime
47
+ WORKDIR /home/firecrawl/app
48
+ RUN mkdir -p logs tmp data redis-data postgres-data
49
+
50
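+ # Copy environment template to .env (NOTE: this copy is overwritten by the later `cat > /home/firecrawl/app/.env` step, so it has no lasting effect)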
51
+ RUN cp apps/api/.env.example .env
52
+
53
+ # Create a simple startup script for HF Spaces
54
+ RUN cat > start.sh << 'EOF' && chmod +x start.sh
55
+ #!/bin/bash
56
+
57
+ # Set environment variables for HF Spaces
58
+ export PORT=${PORT:-7860}
59
+ export HOST=0.0.0.0
60
+ export REDIS_URL=redis://localhost:6379
61
+ export PLAYWRIGHT_MICROSERVICE_URL=http://localhost:3000/scrape
62
+ export USE_DB_AUTHENTICATION=false
63
+ export ENV=production
64
+ export LOGGING_LEVEL=info
65
+
66
+ # Start Redis in background (if we can)
67
+ redis-server --daemonize yes --port 6379 --bind 127.0.0.1 --dir /home/firecrawl/app/redis-data || echo "Redis start failed, continuing..."
68
+
69
+ # Start Playwright service in background
70
+ cd /home/firecrawl/app/apps/playwright-service-ts
71
+ npm start &
72
+ PLAYWRIGHT_PID=$!
73
+
74
+ # Wait a bit for Playwright to start
75
+ sleep 5
76
+
77
+ # Start API service
78
+ cd /home/firecrawl/app/apps/api
79
+ exec node dist/src/harness.js --start-docker
80
+ EOF
81
+
82
+ # Create a simple health check script
83
+ RUN cat > health.sh << 'EOF' && chmod +x health.sh
84
+ #!/bin/bash
85
+ curl -f http://localhost:${PORT:-7860}/health 2>/dev/null || exit 1
86
+ EOF
87
+
88
+ # Create minimal configuration files
89
+ RUN cat > /home/firecrawl/app/.env << 'EOF'
90
+ # HF Spaces Configuration
91
+ PORT=7860
92
+ HOST=0.0.0.0
93
+ ENV=production
94
+ LOGGING_LEVEL=info
95
+
96
+ # Disable database authentication for simplicity
97
+ USE_DB_AUTHENTICATION=false
98
+
99
+ # Redis (will try to use local instance)
100
+ REDIS_URL=redis://localhost:6379
101
+ REDIS_RATE_LIMIT_URL=redis://localhost:6379
102
+
103
+ # Playwright service
104
+ PLAYWRIGHT_MICROSERVICE_URL=http://localhost:3000/scrape
105
+ BLOCK_MEDIA=true
106
+
107
+ # Placeholder demo keys (not real secrets) — replace before exposing the service publicly
108
+ BULL_AUTH_KEY=fc-demo
109
+ TEST_API_KEY=fc-demo-key
110
+
111
+ # Model configuration (can be overridden with HF Spaces secrets)
112
+ MODEL_NAME=gpt-3.5-turbo
113
+ MODEL_EMBEDDING_NAME=text-embedding-ada-002
114
+ EOF
115
+
116
+ # Create a simple API-only version for HF Spaces constraints
117
+ RUN cat > start-api-only.sh << 'EOF' && chmod +x start-api-only.sh
118
+ #!/bin/bash
119
+
120
+ # Simplified startup for HF Spaces - API only
121
+ export PORT=${PORT:-7860}
122
+ export HOST=0.0.0.0
123
+ export ENV=production
124
+ export LOGGING_LEVEL=info
125
+ export USE_DB_AUTHENTICATION=false
126
+
127
+ # Service endpoints expected on localhost (this script does not start Redis or Playwright itself)
128
+ export PLAYWRIGHT_MICROSERVICE_URL=http://localhost:3000/scrape
129
+ export REDIS_URL=redis://localhost:6379
130
+
131
+ echo "🔥 Starting Firecrawl API on port $PORT"
132
+ echo "⚠️ Note: This is a simplified version for Hugging Face Spaces"
133
+ echo "📝 Some features may be limited due to platform constraints"
134
+
135
+ cd /home/firecrawl/app/apps/api
136
+
137
+ # Create a simple in-memory fallback if Redis isn't available
138
+ cat > dist/src/lib/redis-fallback.js << 'JSEOF'
139
+ // Simple in-memory fallback for Redis when not available
140
+ class MemoryStore {
141
+ constructor() {
142
+ this.store = new Map();
143
+ }
144
+
145
+ async get(key) {
146
+ return this.store.get(key);
147
+ }
148
+
149
+ async set(key, value, options = {}) {
150
+ this.store.set(key, value);
151
+ if (options.EX) {
152
+ setTimeout(() => this.store.delete(key), options.EX * 1000);
153
+ }
154
+ return 'OK';
155
+ }
156
+
157
+ async del(key) {
158
+ return this.store.delete(key);
159
+ }
160
+ }
161
+
162
+ module.exports = { MemoryStore };
163
+ JSEOF
164
+
165
+ # Start the API service
166
+ exec node dist/src/harness.js --start-docker
167
+ EOF
168
+
169
+ # Expose the port that HF Spaces expects
170
+ EXPOSE 7860
171
 
172
  # Health check
173
  HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
174
+ CMD ./health.sh
 
 
 
175
 
176
+ # Default command - use the API-only version for HF Spaces
177
+ CMD ["./start-api-only.sh"]