File size: 10,891 Bytes
9105f1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
#!/bin/bash

# Complete Firecrawl Setup Script for HF Spaces
# This script handles cloning, building, and running Firecrawl
#
# Pipeline (see main): clone or update the Firecrawl checkout, write a .env
# file, install npm dependencies for the API and the Playwright service,
# build both, generate helper JS files, start the Playwright service in the
# background and finally run the API service until it exits.
#
# Environment overrides (default used when unset or empty):
#   PORT                         7860
#   HOST                         0.0.0.0
#   ENV                          production
#   LOGGING_LEVEL                info
#   USE_DB_AUTHENTICATION        false
#   REDIS_URL                    redis://localhost:6379
#   PLAYWRIGHT_MICROSERVICE_URL  http://localhost:3000/scrape

# Abort the script as soon as a command fails without being handled.
set -e

# ANSI color sequences for log prefixes. They are kept in backslash-escaped
# form; the logging helpers turn them into real escape bytes when printing.
_csi='\033['
RED="${_csi}0;31m"
GREEN="${_csi}0;32m"
YELLOW="${_csi}1;33m"
BLUE="${_csi}0;34m"
PURPLE="${_csi}0;35m"
NC="${_csi}0m" # No Color (resets attributes)
unset _csi

# Logging functions
# Each helper prints its first argument to stdout behind a colored severity
# tag. Backslash escapes in the tag and in the message are interpreted.
log_info() {
    printf '%b\n' "${BLUE}[INFO]${NC} $1"
}

log_success() {
    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}

log_warning() {
    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
}

log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
}

log_step() {
    printf '%b\n' "${PURPLE}[STEP]${NC} $1"
}

# Configuration
# A value already present (and non-empty) in the environment wins; otherwise
# the default below is assigned. All settings are exported to child processes.
: "${PORT:=7860}"
: "${HOST:=0.0.0.0}"
: "${ENV:=production}"
: "${LOGGING_LEVEL:=info}"
: "${USE_DB_AUTHENTICATION:=false}"
: "${REDIS_URL:=redis://localhost:6379}"
: "${PLAYWRIGHT_MICROSERVICE_URL:=http://localhost:3000/scrape}"
export PORT HOST ENV LOGGING_LEVEL USE_DB_AUTHENTICATION REDIS_URL \
    PLAYWRIGHT_MICROSERVICE_URL

# Global variables
# Checkout location, the two service directories inside it, and the file that
# collects the PIDs of background processes.
FIRECRAWL_DIR=/home/firecrawl/app
API_DIR="${FIRECRAWL_DIR}/apps/api"
PLAYWRIGHT_DIR="${FIRECRAWL_DIR}/apps/playwright-service-ts"
PID_FILE=/tmp/firecrawl.pid

# Cleanup function
#
# Stops every process recorded in $PID_FILE, removes the file, sweeps any
# remaining node/npm processes and terminates the shell.
# Globals:  PID_FILE (read, then deleted)
# Outputs:  progress messages through the log_* helpers
# Returns:  does not return; always ends with `exit 0`
cleanup() {
    local pid

    log_warning "Shutting down Firecrawl services..."

    # Kill background processes
    if [ -f "$PID_FILE" ]; then
        # `read -r` takes the entry verbatim; the `|| [ -n "$pid" ]` clause makes
        # sure a last entry without a trailing newline is still processed.
        while read -r pid || [ -n "$pid" ]; do
            # Only positive integers are valid targets; this also skips blank
            # lines and keeps values such as 0 or -1 away from `kill`.
            [[ "$pid" =~ ^[1-9][0-9]*$ ]] || continue
            if kill -0 "$pid" 2>/dev/null; then
                log_info "Stopping process: $pid"
                # The process may exit between the probe and the signal.
                kill "$pid" 2>/dev/null || true
            fi
        done < "$PID_FILE"
        rm -f "$PID_FILE"
    fi

    # Kill any remaining node processes (best effort: pkill returns non-zero
    # when nothing matches, which must not abort the shutdown)
    pkill -f "node" 2>/dev/null || true
    pkill -f "npm" 2>/dev/null || true

    log_success "Cleanup completed"
    exit 0
}

# Set up signal handlers: run cleanup when the script is interrupted (INT)
# or asked to terminate (TERM).
trap 'cleanup' INT TERM

# Make sure a Firecrawl checkout exists at $FIRECRAWL_DIR and enter it.
# An existing checkout is refreshed with `git pull origin main` (a failed pull
# only produces a warning); otherwise the repository is cloned, and a failed
# clone terminates the script with status 1.
setup_repository() {
    log_step "Setting up Firecrawl repository..."

    if [ -d "$FIRECRAWL_DIR/.git" ]; then
        log_info "Repository already exists, pulling latest changes..."
        cd "$FIRECRAWL_DIR"
        if ! git pull origin main; then
            log_warning "Failed to pull latest changes, continuing with existing code"
        fi
    else
        log_info "Cloning Firecrawl repository..."
        if ! git clone https://github.com/mendableai/firecrawl.git "$FIRECRAWL_DIR"; then
            log_error "Failed to clone repository"
            exit 1
        fi
        log_success "Repository cloned successfully"
    fi

    # Leave the caller inside the checkout in both cases.
    cd "$FIRECRAWL_DIR"
}

# Create environment configuration
#
# Writes $FIRECRAWL_DIR/.env from the current shell configuration.
# Globals (read): FIRECRAWL_DIR, PORT, HOST, ENV, LOGGING_LEVEL,
#                 USE_DB_AUTHENTICATION, REDIS_URL, PLAYWRIGHT_MICROSERVICE_URL,
#                 and the optional OPENAI_API_KEY, OPENAI_BASE_URL, MODEL_NAME
# Outputs: overwrites the .env file; progress via log_* helpers
create_environment() {
    local env_file="$FIRECRAWL_DIR/.env"

    log_step "Creating environment configuration..."

    # The optional keys are resolved here, at generation time. Writing the
    # literal text "${VAR:-default}" into the file would hand that placeholder
    # string to any loader that does not perform shell-style expansion.
    cat > "$env_file" << EOF
# HF Spaces Configuration
PORT=$PORT
HOST=$HOST
ENV=$ENV
LOGGING_LEVEL=$LOGGING_LEVEL

# Authentication
USE_DB_AUTHENTICATION=$USE_DB_AUTHENTICATION

# Services
REDIS_URL=$REDIS_URL
PLAYWRIGHT_MICROSERVICE_URL=$PLAYWRIGHT_MICROSERVICE_URL

# Performance settings for HF Spaces
NUM_WORKERS_PER_QUEUE=2
BLOCK_MEDIA=true
HEADLESS=true

# Optional API keys (can be set via HF Spaces secrets)
OPENAI_API_KEY=${OPENAI_API_KEY:-}
OPENAI_BASE_URL=${OPENAI_BASE_URL:-}
MODEL_NAME=${MODEL_NAME:-gpt-3.5-turbo}

# Test credentials
BULL_AUTH_KEY=fc-demo
TEST_API_KEY=fc-demo-key

# Disable features that might not work in HF Spaces
SUPABASE_ANON_TOKEN=
SUPABASE_URL=
SUPABASE_SERVICE_TOKEN=
POSTHOG_API_KEY=
POSTHOG_HOST=
EOF

    # The file may now contain an API key, so keep it private to the owner.
    chmod 600 "$env_file" || log_warning "Could not restrict permissions on $env_file"

    log_success "Environment configuration created"
}

# Install the API service's npm packages inside $API_DIR.
# Installation is skipped only when both node_modules and package-lock.json
# are present; otherwise `npm ci` is tried first with `npm install` as the
# fallback.
install_api_dependencies() {
    log_step "Installing API dependencies..."

    cd "$API_DIR"

    if [ -d "node_modules" ] && [ -f "package-lock.json" ]; then
        log_info "Dependencies already installed"
    else
        log_info "Installing fresh dependencies..."
        if ! npm ci --only=production; then
            log_warning "npm ci failed, trying npm install..."
            npm install --only=production
        fi
    fi

    log_success "API dependencies installed"
}

# Install the Playwright service's npm packages inside $PLAYWRIGHT_DIR.
# Installation is skipped only when both node_modules and package-lock.json
# are present; otherwise `npm ci` is tried first with `npm install` as the
# fallback.
install_playwright_dependencies() {
    log_step "Installing Playwright service dependencies..."

    cd "$PLAYWRIGHT_DIR"

    if [ -d "node_modules" ] && [ -f "package-lock.json" ]; then
        log_info "Playwright dependencies already installed"
    else
        log_info "Installing Playwright dependencies..."
        if ! npm ci --only=production; then
            log_warning "npm ci failed, trying npm install..."
            npm install --only=production
        fi
    fi

    log_success "Playwright dependencies installed"
}

# Run `npm run build` for both services.
# A failed API build is fatal (exit 1); a failed Playwright build is only
# reported as a warning and the script carries on.
build_applications() {
    log_step "Building applications..."

    # API service: mandatory
    log_info "Building API service..."
    cd "$API_DIR"
    if ! npm run build; then
        log_error "Failed to build API service"
        exit 1
    fi

    # Playwright service: optional
    log_info "Building Playwright service..."
    cd "$PLAYWRIGHT_DIR"
    npm run build || log_warning "Failed to build Playwright service, continuing..."

    log_success "Applications built successfully"
}

# Create in-memory fallbacks for services
#
# Generates two JavaScript helper files below $API_DIR/dist/src:
#   lib/redis-fallback.js  - a Map-backed store exposing get/set/del/exists/flushall
#   routes/health.js       - an express router serving /health and /
# Globals: API_DIR (read)
# Outputs: overwrites both files; progress via log_* helpers
create_fallbacks() {
    local lib_dir="$API_DIR/dist/src/lib"
    local routes_dir="$API_DIR/dist/src/routes"

    log_step "Creating fallback configurations..."

    # The redirections below fail if the build did not produce these
    # directories, so create them up front.
    mkdir -p "$lib_dir" "$routes_dir"

    # Create Redis fallback
    cat > "$lib_dir/redis-fallback.js" << 'EOF'
// Simple in-memory fallback for Redis when not available
class MemoryStore {
    constructor() {
        this.store = new Map();
        this.expiry = new Map();
    }

    // Drop the key if its TTL has elapsed. Every read path goes through here
    // so get/exists/del agree on whether a key is still alive.
    _evictIfExpired(key) {
        if (this.expiry.has(key) && Date.now() > this.expiry.get(key)) {
            this.store.delete(key);
            this.expiry.delete(key);
        }
    }

    async get(key) {
        this._evictIfExpired(key);
        // Use has() so falsy stored values (e.g. '') are returned as stored.
        return this.store.has(key) ? this.store.get(key) : null;
    }

    async set(key, value, options = {}) {
        this.store.set(key, value);
        if (options.EX) {
            this.expiry.set(key, Date.now() + (options.EX * 1000));
        } else {
            // A write without EX must not inherit the TTL of an earlier write.
            this.expiry.delete(key);
        }
        return 'OK';
    }

    async del(key) {
        this._evictIfExpired(key);
        const existed = this.store.has(key);
        this.store.delete(key);
        this.expiry.delete(key);
        return existed ? 1 : 0;
    }

    async exists(key) {
        this._evictIfExpired(key);
        return this.store.has(key) ? 1 : 0;
    }

    async flushall() {
        this.store.clear();
        this.expiry.clear();
        return 'OK';
    }
}

module.exports = { MemoryStore };
EOF

    # Create health check endpoint
    cat > "$routes_dir/health.js" << 'EOF'
const express = require('express');
const router = express.Router();

router.get('/health', (req, res) => {
    res.json({
        status: 'healthy',
        timestamp: new Date().toISOString(),
        version: '1.0.0-hf-spaces',
        environment: process.env.ENV || 'production'
    });
});

router.get('/', (req, res) => {
    res.json({
        message: '🔥 Firecrawl API on Hugging Face Spaces',
        version: '1.0.0-hf-spaces',
        documentation: 'https://docs.firecrawl.dev',
        endpoints: {
            health: '/health',
            scrape: '/v0/scrape',
            crawl: '/v0/crawl'
        }
    });
});

module.exports = router;
EOF

    log_success "Fallback configurations created"
}

# Start Playwright service in background
#
# Launches `npm start` in $PLAYWRIGHT_DIR on port 3000, records its PID in
# $PID_FILE and polls http://localhost:3000/health for up to 30 attempts.
# Globals:  PLAYWRIGHT_DIR, PID_FILE (read); PLAYWRIGHT_PID (written);
#           PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD, PLAYWRIGHT_BROWSERS_PATH (exported)
# Outputs:  service output goes to /tmp/playwright.log
# Returns:  0 whether or not the service answered in time (a timeout is only
#           reported as a warning)
start_playwright_service() {
    local attempt

    log_step "Starting Playwright service..."

    cd "$PLAYWRIGHT_DIR"

    # Set Playwright-specific environment
    export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
    export PLAYWRIGHT_BROWSERS_PATH=/ms-playwright

    # Start Playwright service in background.
    # PORT is passed to this one process only: exporting it here would
    # overwrite the script-wide PORT that the API service is started with later.
    PORT=3000 npm start > /tmp/playwright.log 2>&1 &
    PLAYWRIGHT_PID=$!
    echo "$PLAYWRIGHT_PID" >> "$PID_FILE"

    log_info "Playwright service started with PID: $PLAYWRIGHT_PID"

    # Wait for Playwright service to be ready
    log_info "Waiting for Playwright service to be ready..."
    for attempt in {1..30}; do
        if curl -s http://localhost:3000/health >/dev/null 2>&1; then
            log_success "Playwright service is ready"
            return 0
        fi
        log_info "Waiting... ($attempt/30)"
        sleep 2
    done

    log_warning "Playwright service may not be fully ready, continuing..."
}

# Start API service
#
# Runs `node dist/src/harness.js --start-docker` from $API_DIR and blocks
# until it exits.
# Globals:  API_DIR, PID_FILE, PORT, HOST, ENV, LOGGING_LEVEL (read);
#           WORKER_PORT (exported as 3005)
# Returns:  0 when node exits successfully; terminates the script with
#           status 1 when node exits with a failure status
start_api_service() {
    local api_pid

    log_step "Starting API service..."

    cd "$API_DIR"

    # Set API-specific environment
    export PORT="$PORT"
    export HOST="$HOST"
    export WORKER_PORT=3005

    log_info "Starting Firecrawl API on $HOST:$PORT"
    log_info "Environment: $ENV"
    log_info "Logging level: $LOGGING_LEVEL"

    # Start the API as a background job and block in `wait`. Bash postpones
    # trap handlers while a foreground command is running, so with node in the
    # foreground a SIGTERM/SIGINT sent to this script would not reach the
    # cleanup handler until node exited on its own. `wait` is interrupted by a
    # trapped signal, which lets the handler run immediately.
    node dist/src/harness.js --start-docker &
    api_pid=$!
    # Record the PID so the cleanup handler stops this process explicitly.
    echo "$api_pid" >> "$PID_FILE"

    wait "$api_pid" || {
        log_error "Failed to start API service"
        log_info "Checking logs..."
        # NOTE(review): nothing in this script writes /tmp/api.log; the tail
        # only prints something if another component creates that file.
        [ -f /tmp/api.log ] && tail -50 /tmp/api.log
        exit 1
    }
}

# Probe the /health URL of the API (on $PORT) and of the Playwright service
# (on port 3000) with curl and log one success or warning line per service.
check_health() {
    local target label url

    log_step "Performing health checks..."

    # Each entry is "<label>=<url>"; API first, then Playwright (if running).
    for target in \
        "API=http://localhost:$PORT/health" \
        "Playwright=http://localhost:3000/health"; do
        label=${target%%=*}
        url=${target#*=}
        if curl -s "$url" >/dev/null 2>&1; then
            log_success "$label service is healthy"
        else
            log_warning "$label service health check failed"
        fi
    done
}

# Print the startup banner: effective configuration, the endpoint URLs on
# $PORT and a sample curl command for the scrape endpoint.
show_startup_info() {
    local base="http://localhost:$PORT"

    log_success "🔥 Firecrawl is starting on Hugging Face Spaces!"
    printf '\n'

    log_info "Configuration:"
    printf '%s\n' \
        "  • Port: $PORT" \
        "  • Host: $HOST" \
        "  • Environment: $ENV" \
        "  • Logging: $LOGGING_LEVEL" \
        ""

    log_info "Available endpoints:"
    printf '%s\n' \
        "  • Health: $base/health" \
        "  • API Docs: $base/" \
        "  • Scrape: $base/v0/scrape" \
        "  • Crawl: $base/v0/crawl" \
        ""

    log_info "Test command:"
    # Trailing backslashes are printed literally so the sample can be pasted
    # into a shell as one continued command.
    printf '%s\n' \
        "  curl -X POST $base/v0/scrape \\" \
        "    -H 'Content-Type: application/json' \\" \
        "    -d '{\"url\": \"https://example.com\", \"formats\": [\"markdown\"]}'" \
        ""
}

# Entry point: prints the title, runs the preparation steps in order, shows
# the startup banner, starts the Playwright service and, after a short pause,
# hands over to the API service.
main() {
    local step

    printf '%s\n' \
        "🔥 Firecrawl Setup for Hugging Face Spaces" \
        "==========================================" \
        ""

    log_info "Starting setup process..."

    # Preparation steps, executed strictly in this order
    for step in \
        setup_repository \
        create_environment \
        install_api_dependencies \
        install_playwright_dependencies \
        build_applications \
        create_fallbacks; do
        "$step"
    done

    show_startup_info

    start_playwright_service

    # Give the Playwright service a moment before the API comes up
    sleep 5

    # Blocks for the lifetime of the API service and so keeps the container alive
    start_api_service
}

# Endless monitoring loop: sleep 60 seconds, run check_health, repeat.
# Never returns on its own; intended to be run as a background job.
monitor_health() {
    for (( ; ; )); do
        sleep 60
        check_health
    done
}

# Start health monitoring in background
monitor_health &
HEALTH_PID=$!
# This is the first write to the PID file in a run, so truncate instead of
# appending: entries left behind by an earlier run would otherwise be
# signalled by cleanup even though those PIDs may now belong to other processes.
echo "$HEALTH_PID" > "$PID_FILE"

# Run main function
main "$@"