NitinBot001 committed on
Commit
9105f1d
·
verified ·
1 Parent(s): c125555

Create setup-firecrawl.sh

Browse files
Files changed (1) hide show
  1. setup-firecrawl.sh +412 -0
setup-firecrawl.sh ADDED
@@ -0,0 +1,412 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
#
# Firecrawl bootstrap for Hugging Face Spaces.
# Clones the Firecrawl repository, installs and builds its services, and
# then runs them.

# Stop at the first unguarded command that returns a non-zero status.
set -o errexit
# ANSI color sequences for log prefixes. They are stored as literal
# backslash sequences and only turned into escape bytes by printf's %b.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
NC='\033[0m' # No Color (reset)

# Logging helpers.
# Arguments: $1 - message text (optional; an empty message prints the tag only)
# Outputs:   "<colored [TAG]> <message>" on stdout, followed by a newline
#
# The message is printed with %s so backslashes in it (paths, regexes,
# captured tool output) appear verbatim instead of being interpreted as
# escape sequences; only the color variables go through %b.
log_info() { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "${1-}"; }
log_success() { printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "${1-}"; }
log_warning() { printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "${1-}"; }
log_error() { printf '%b[ERROR]%b %s\n' "$RED" "$NC" "${1-}"; }
log_step() { printf '%b[STEP]%b %s\n' "$PURPLE" "$NC" "${1-}"; }
# Runtime configuration.
# A value already present (and non-empty) in the environment is kept;
# otherwise the default shown here is assigned. Everything is exported so
# that child processes inherit it.
: "${PORT:=7860}"
: "${HOST:=0.0.0.0}"
: "${ENV:=production}"
: "${LOGGING_LEVEL:=info}"
: "${USE_DB_AUTHENTICATION:=false}"
: "${REDIS_URL:=redis://localhost:6379}"
: "${PLAYWRIGHT_MICROSERVICE_URL:=http://localhost:3000/scrape}"
export PORT HOST ENV LOGGING_LEVEL USE_DB_AUTHENTICATION
export REDIS_URL PLAYWRIGHT_MICROSERVICE_URL
# Script-wide paths (not exported).
#   FIRECRAWL_DIR   root of the Firecrawl checkout
#   API_DIR         API service inside the checkout
#   PLAYWRIGHT_DIR  Playwright service inside the checkout
#   PID_FILE        one PID per line for every background process started
FIRECRAWL_DIR=/home/firecrawl/app
API_DIR=${FIRECRAWL_DIR}/apps/api
PLAYWRIGHT_DIR=${FIRECRAWL_DIR}/apps/playwright-service-ts
PID_FILE=/tmp/firecrawl.pid
#######################################
# Stop the processes recorded in the PID file, sweep leftover node/npm
# processes, and terminate the script.
# Globals:   PID_FILE (read, then deleted)
# Arguments: none
# Outputs:   progress messages through the log_* helpers
# Returns:   does not return; exits the script with status 0
#######################################
cleanup() {
  local pid

  log_warning "Shutting down Firecrawl services..."

  # Kill background processes
  if [ -f "$PID_FILE" ]; then
    # -r keeps backslashes literal; the `|| [ -n ... ]` part still handles a
    # final line that has no trailing newline.
    while IFS= read -r pid || [ -n "$pid" ]; do
      # Only plain positive integers may reach kill. Blank or corrupt lines
      # are skipped, and values such as 0 or -1 (which kill interprets as a
      # process group / every process) are rejected.
      case "$pid" in
        '' | *[!0-9]*) continue ;;
      esac
      [ "$pid" -gt 0 ] 2>/dev/null || continue

      if kill -0 "$pid" 2>/dev/null; then
        log_info "Stopping process: $pid"
        # Best effort: the process may exit between the check and the kill.
        kill "$pid" 2>/dev/null || true
      fi
    done < "$PID_FILE"
    rm -f "$PID_FILE"
  fi

  # Kill any remaining node processes. Best effort; pkill returns non-zero
  # when nothing matched, which must not abort the shutdown.
  # NOTE(review): `pkill -f` matches the whole command line, so this also
  # hits unrelated processes whose arguments merely contain "node"/"npm".
  pkill -f "node" 2>/dev/null || true
  pkill -f "npm" 2>/dev/null || true

  log_success "Cleanup completed"
  exit 0
}
# Run cleanup when the script is interrupted (INT) or asked to stop (TERM).
trap cleanup INT TERM
#######################################
# Make sure a Firecrawl checkout exists and switch into it.
# An existing checkout (detected by its .git directory) is updated with
# `git pull origin main`; a failed pull only produces a warning. Otherwise
# the repository is cloned, and a failed clone ends the script.
# Globals:   FIRECRAWL_DIR (read)
# Arguments: none
# Returns:   0 with the working directory set to FIRECRAWL_DIR;
#            exits 1 if cloning fails
#######################################
setup_repository() {
  log_step "Setting up Firecrawl repository..."

  if [ -d "$FIRECRAWL_DIR/.git" ]; then
    log_info "Repository already exists, pulling latest changes..."
    cd "$FIRECRAWL_DIR"
    if ! git pull origin main; then
      log_warning "Failed to pull latest changes, continuing with existing code"
    fi
  else
    log_info "Cloning Firecrawl repository..."
    if ! git clone https://github.com/mendableai/firecrawl.git "$FIRECRAWL_DIR"; then
      log_error "Failed to clone repository"
      exit 1
    fi
    log_success "Repository cloned successfully"
  fi

  cd "$FIRECRAWL_DIR"
}
#######################################
# Write the .env file at the root of the Firecrawl checkout.
# All values are resolved now, from the current environment, so the file
# contains plain KEY=value lines and no shell syntax. Optional API settings
# and the demo credentials take their value from the environment when it is
# set and fall back to the defaults below otherwise.
# Globals:   FIRECRAWL_DIR, PORT, HOST, ENV, LOGGING_LEVEL,
#            USE_DB_AUTHENTICATION, REDIS_URL, PLAYWRIGHT_MICROSERVICE_URL
#            (read); OPENAI_API_KEY, OPENAI_BASE_URL, MODEL_NAME,
#            BULL_AUTH_KEY, TEST_API_KEY (read, optional)
# Arguments: none
# Outputs:   overwrites $FIRECRAWL_DIR/.env
#######################################
create_environment() {
  local env_file="$FIRECRAWL_DIR/.env"

  log_step "Creating environment configuration..."

  cat > "$env_file" << EOF
# HF Spaces Configuration
PORT=$PORT
HOST=$HOST
ENV=$ENV
LOGGING_LEVEL=$LOGGING_LEVEL

# Authentication
USE_DB_AUTHENTICATION=$USE_DB_AUTHENTICATION

# Services
REDIS_URL=$REDIS_URL
PLAYWRIGHT_MICROSERVICE_URL=$PLAYWRIGHT_MICROSERVICE_URL

# Performance settings for HF Spaces
NUM_WORKERS_PER_QUEUE=2
BLOCK_MEDIA=true
HEADLESS=true

# Optional API keys (can be set via HF Spaces secrets)
OPENAI_API_KEY=${OPENAI_API_KEY:-}
OPENAI_BASE_URL=${OPENAI_BASE_URL:-}
MODEL_NAME=${MODEL_NAME:-gpt-3.5-turbo}

# Test credentials
BULL_AUTH_KEY=${BULL_AUTH_KEY:-fc-demo}
TEST_API_KEY=${TEST_API_KEY:-fc-demo-key}

# Disable features that might not work in HF Spaces
SUPABASE_ANON_TOKEN=
SUPABASE_URL=
SUPABASE_SERVICE_TOKEN=
POSTHOG_API_KEY=
POSTHOG_HOST=
EOF

  # The file can now hold a real API key, so keep it private to the owner.
  chmod 600 "$env_file"

  log_success "Environment configuration created"
}
#######################################
# Install the API service's npm packages (production set only).
# Nothing is installed when both node_modules/ and package-lock.json are
# already present. Otherwise `npm ci` is tried first, with `npm install`
# as the fallback.
# NOTE(review): build_applications later runs `npm run build` here; if that
# build needs devDependencies, a production-only install will not be
# enough - confirm against the project's package.json.
# Globals:   API_DIR (read)
# Arguments: none
# Returns:   0; the working directory is left at API_DIR
#######################################
install_api_dependencies() {
  log_step "Installing API dependencies..."

  cd "$API_DIR"

  if [ -d "node_modules" ] && [ -f "package-lock.json" ]; then
    log_info "Dependencies already installed"
  else
    log_info "Installing fresh dependencies..."
    if ! npm ci --only=production; then
      log_warning "npm ci failed, trying npm install..."
      npm install --only=production
    fi
  fi

  log_success "API dependencies installed"
}
#######################################
# Install the Playwright service's npm packages (production set only).
# Nothing is installed when both node_modules/ and package-lock.json are
# already present. Otherwise `npm ci` is tried first, with `npm install`
# as the fallback.
# NOTE(review): build_applications later runs `npm run build` here; if that
# build needs devDependencies, a production-only install will not be
# enough - confirm against the project's package.json.
# Globals:   PLAYWRIGHT_DIR (read)
# Arguments: none
# Returns:   0; the working directory is left at PLAYWRIGHT_DIR
#######################################
install_playwright_dependencies() {
  log_step "Installing Playwright service dependencies..."

  cd "$PLAYWRIGHT_DIR"

  if [ -d "node_modules" ] && [ -f "package-lock.json" ]; then
    log_info "Playwright dependencies already installed"
  else
    log_info "Installing Playwright dependencies..."
    if ! npm ci --only=production; then
      log_warning "npm ci failed, trying npm install..."
      npm install --only=production
    fi
  fi

  log_success "Playwright dependencies installed"
}
#######################################
# Run `npm run build` for the API service and then for the Playwright
# service. A failed API build ends the script; a failed Playwright build is
# tolerated, but the final message then reports the partial result instead
# of claiming overall success.
# Globals:   API_DIR, PLAYWRIGHT_DIR (read)
# Arguments: none
# Returns:   0 (also when only the Playwright build failed);
#            exits 1 if the API build fails
#######################################
build_applications() {
  local playwright_built=true

  log_step "Building applications..."

  # Build API
  log_info "Building API service..."
  cd "$API_DIR"
  if ! npm run build; then
    log_error "Failed to build API service"
    exit 1
  fi

  # Build Playwright service
  log_info "Building Playwright service..."
  cd "$PLAYWRIGHT_DIR"
  if ! npm run build; then
    log_warning "Failed to build Playwright service, continuing..."
    playwright_built=false
  fi

  if [ "$playwright_built" = true ]; then
    log_success "Applications built successfully"
  else
    log_warning "API service built, but the Playwright service build failed"
  fi
}
#######################################
# Write two standalone CommonJS helper modules into the built API tree:
#   dist/src/lib/redis-fallback.js  - in-memory key/value store with TTLs
#   dist/src/routes/health.js       - Express router for "/" and "/health"
# The target directories are created first so the function also works when
# the build output does not contain them.
# NOTE(review): nothing in this script loads these modules into the
# application - confirm that the API actually requires them.
# Globals:   API_DIR (read)
# Arguments: none
# Outputs:   overwrites the two files listed above
#######################################
create_fallbacks() {
  local lib_dir="$API_DIR/dist/src/lib"
  local routes_dir="$API_DIR/dist/src/routes"

  log_step "Creating fallback configurations..."

  mkdir -p "$lib_dir" "$routes_dir"

  # Create Redis fallback
  cat > "$lib_dir/redis-fallback.js" << 'EOF'
// Simple in-memory fallback for Redis when not available
class MemoryStore {
  constructor() {
    this.store = new Map();
    this.expiry = new Map();
  }

  // Evict the key if its TTL has elapsed; true when the key is still present.
  _isLive(key) {
    if (this.expiry.has(key) && Date.now() > this.expiry.get(key)) {
      this.store.delete(key);
      this.expiry.delete(key);
      return false;
    }
    return this.store.has(key);
  }

  async get(key) {
    if (!this._isLive(key)) {
      return null;
    }
    // Falsy values such as 0 or '' are real values and must be returned.
    const value = this.store.get(key);
    return value === undefined ? null : value;
  }

  async set(key, value, options = {}) {
    this.store.set(key, value);
    if (options.EX) {
      this.expiry.set(key, Date.now() + (options.EX * 1000));
    } else {
      // Overwriting without a TTL must not inherit the previous expiry.
      this.expiry.delete(key);
    }
    return 'OK';
  }

  async del(key) {
    const existed = this._isLive(key);
    this.store.delete(key);
    this.expiry.delete(key);
    return existed ? 1 : 0;
  }

  async exists(key) {
    return this._isLive(key) ? 1 : 0;
  }

  async flushall() {
    this.store.clear();
    this.expiry.clear();
    return 'OK';
  }
}

module.exports = { MemoryStore };
EOF

  # Create health check endpoint
  cat > "$routes_dir/health.js" << 'EOF'
const express = require('express');
const router = express.Router();

router.get('/health', (req, res) => {
  res.json({
    status: 'healthy',
    timestamp: new Date().toISOString(),
    version: '1.0.0-hf-spaces',
    environment: process.env.ENV || 'production'
  });
});

router.get('/', (req, res) => {
  res.json({
    message: '🔥 Firecrawl API on Hugging Face Spaces',
    version: '1.0.0-hf-spaces',
    documentation: 'https://docs.firecrawl.dev',
    endpoints: {
      health: '/health',
      scrape: '/v0/scrape',
      crawl: '/v0/crawl'
    }
  });
});

module.exports = router;
EOF

  log_success "Fallback configurations created"
}
#######################################
# Launch the Playwright service with `npm start` as a background job and
# poll its /health URL until it answers or the attempts run out.
# Port 3000 is handed to the service through a per-command assignment, so
# the script's own PORT (the public API port) is left untouched.
# Globals:   PLAYWRIGHT_DIR, PID_FILE (read); PLAYWRIGHT_PID (written);
#            PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD, PLAYWRIGHT_BROWSERS_PATH
#            (exported)
# Arguments: none
# Outputs:   service output goes to /tmp/playwright.log; the job's PID is
#            appended to PID_FILE
# Returns:   0, whether or not the service became ready in time
#######################################
start_playwright_service() {
  local attempt
  local -r max_attempts=30

  log_step "Starting Playwright service..."

  cd "$PLAYWRIGHT_DIR"

  # Set Playwright-specific environment
  export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
  export PLAYWRIGHT_BROWSERS_PATH=/ms-playwright

  # Start Playwright service in background. PORT=3000 applies to this one
  # command only; exporting it here would make everything started later
  # (including the API service) listen on 3000 as well.
  PORT=3000 npm start > /tmp/playwright.log 2>&1 &
  PLAYWRIGHT_PID=$!
  echo "$PLAYWRIGHT_PID" >> "$PID_FILE"

  log_info "Playwright service started with PID: $PLAYWRIGHT_PID"

  # Wait for Playwright service to be ready. Any HTTP answer counts as
  # "ready"; --max-time keeps a stalled connection from blocking the loop.
  log_info "Waiting for Playwright service to be ready..."
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    if curl -s --max-time 2 http://localhost:3000/health > /dev/null 2>&1; then
      log_success "Playwright service is ready"
      return 0
    fi
    log_info "Waiting... ($attempt/$max_attempts)"
    sleep 2
  done

  log_warning "Playwright service may not be fully ready, continuing..."
}
#######################################
# Start the Firecrawl API and block until it exits.
# node is started as a background job and awaited with `wait`. Bash runs a
# trap handler only after a foreground command has finished, whereas `wait`
# is interrupted by a trapped signal, so the INT/TERM handler can run while
# the API is still up. The PID is appended to PID_FILE.
# NOTE(review): the API listens on whatever PORT holds at this point;
# verify that nothing run earlier (see start_playwright_service) re-exports
# PORT with a different value.
# Globals:   API_DIR, PID_FILE, ENV, LOGGING_LEVEL (read);
#            PORT, HOST, WORKER_PORT (exported)
# Arguments: none
# Returns:   0 when node exits successfully; exits 1 when node fails
#######################################
start_api_service() {
  local api_pid
  local status=0

  log_step "Starting API service..."

  cd "$API_DIR"

  # Set API-specific environment
  export PORT HOST
  export WORKER_PORT=3005

  log_info "Starting Firecrawl API on $HOST:$PORT"
  log_info "Environment: $ENV"
  log_info "Logging level: $LOGGING_LEVEL"

  # Start API service; the script stays blocked in `wait` below for as long
  # as the service runs.
  node dist/src/harness.js --start-docker &
  api_pid=$!
  echo "$api_pid" >> "$PID_FILE"

  wait "$api_pid" || status=$?
  if [ "$status" -ne 0 ]; then
    log_error "Failed to start API service"
    log_info "Checking logs..."
    if [ -f /tmp/api.log ]; then
      tail -50 /tmp/api.log
    fi
    exit 1
  fi
}
#######################################
# Probe the /health URL of the API (on $PORT) and of the Playwright service
# (on port 3000) and log one result line for each.
# --fail makes curl return non-zero for HTTP error responses (4xx/5xx), so
# an error answer is no longer reported as healthy; --max-time bounds each
# probe so a stalled connection cannot block the caller.
# Globals:   PORT (read)
# Arguments: none
# Outputs:   one success or warning line per service via the log_* helpers
# Returns:   0
#######################################
check_health() {
  local -a probe=(curl --silent --fail --max-time 5)

  log_step "Performing health checks..."

  # Check API health
  if "${probe[@]}" "http://localhost:${PORT}/health" > /dev/null 2>&1; then
    log_success "API service is healthy"
  else
    log_warning "API service health check failed"
  fi

  # Check Playwright health (if running)
  if "${probe[@]}" "http://localhost:3000/health" > /dev/null 2>&1; then
    log_success "Playwright service is healthy"
  else
    log_warning "Playwright service health check failed"
  fi
}
#######################################
# Print the startup banner: current settings, the endpoint URLs and a
# sample curl request.
# Globals:   PORT, HOST, ENV, LOGGING_LEVEL (read)
# Arguments: none
# Outputs:   banner text on stdout
#######################################
show_startup_info() {
  local base="http://localhost:$PORT"

  log_success "🔥 Firecrawl is starting on Hugging Face Spaces!"
  printf '\n'

  log_info "Configuration:"
  printf ' • %s\n' \
    "Port: $PORT" \
    "Host: $HOST" \
    "Environment: $ENV" \
    "Logging: $LOGGING_LEVEL"
  printf '\n'

  log_info "Available endpoints:"
  printf ' • %s\n' \
    "Health: $base/health" \
    "API Docs: $base/" \
    "Scrape: $base/v0/scrape" \
    "Crawl: $base/v0/crawl"
  printf '\n'

  log_info "Test command:"
  printf '%s\n' \
    " curl -X POST $base/v0/scrape \\" \
    " -H 'Content-Type: application/json' \\" \
    " -d '{\"url\": \"https://example.com\", \"formats\": [\"markdown\"]}'"
  printf '\n'
}
#######################################
# Entry point: run every setup step in order, print the startup banner,
# start the Playwright service, pause five seconds, then start the API
# service. The call to start_api_service is the last step and only returns
# when that function does.
# Arguments: none are used
#######################################
main() {
  local step

  printf '%s\n' \
    "🔥 Firecrawl Setup for Hugging Face Spaces" \
    "==========================================" \
    ""

  log_info "Starting setup process..."

  # Setup steps, in the order they must run.
  for step in \
    setup_repository \
    create_environment \
    install_api_dependencies \
    install_playwright_dependencies \
    build_applications \
    create_fallbacks; do
    "$step"
  done

  show_startup_info

  # Bring up Playwright first and pause briefly before the API.
  start_playwright_service
  sleep 5

  # Last step; the container lives as long as this call does.
  start_api_service
}
#######################################
# Endless monitoring loop: wait 60 seconds, run check_health, repeat.
# Never returns by itself, so it is meant to be run as a background job.
# Arguments: none
#######################################
monitor_health() {
  for (( ; ; )); do
    sleep 60
    check_health
  done
}
# Begin with an empty PID file. Every later write appends to it, so entries
# left behind by an earlier run would otherwise still be in the file when
# cleanup reads it, and could by then belong to unrelated processes.
: > "$PID_FILE"

# Start health monitoring in background and register it for cleanup.
# The monitor is started before setup begins, so its checks are expected to
# report failures until both services are actually running.
monitor_health &
HEALTH_PID=$!
echo "$HEALTH_PID" >> "$PID_FILE"

# Run main function
main "$@"