#!/bin/bash
# Complete Firecrawl setup script for Hugging Face (HF) Spaces.
#
# Clones (or updates) the Firecrawl repository, installs dependencies, builds
# the API and Playwright services, then runs them. The Playwright service and
# a periodic health monitor run in the background; the API runs in the
# foreground and keeps the container alive.
#
# Environment overrides (defaults in parentheses):
#   PORT (7860), HOST (0.0.0.0), ENV (production), LOGGING_LEVEL (info),
#   USE_DB_AUTHENTICATION (false), REDIS_URL (redis://localhost:6379),
#   PLAYWRIGHT_MICROSERVICE_URL (http://localhost:3000/scrape),
#   OPENAI_API_KEY (empty), OPENAI_BASE_URL (empty),
#   MODEL_NAME (gpt-3.5-turbo).

set -euo pipefail

# Colors for output (backslash escapes are interpreted by printf %b).
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly PURPLE='\033[0;35m'
readonly NC='\033[0m' # No Color

# Logging functions. Each takes the message as $1 and prints it to stdout
# with a colored level tag.
log_info() { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$1"; }
log_success() { printf '%b[SUCCESS]%b %s\n' "$GREEN" "$NC" "$1"; }
log_warning() { printf '%b[WARNING]%b %s\n' "$YELLOW" "$NC" "$1"; }
log_error() { printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1"; }
log_step() { printf '%b[STEP]%b %s\n' "$PURPLE" "$NC" "$1"; }

# Configuration (each value may be overridden from the environment).
export PORT=${PORT:-7860}
export HOST=${HOST:-0.0.0.0}
export ENV=${ENV:-production}
export LOGGING_LEVEL=${LOGGING_LEVEL:-info}
export USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION:-false}
export REDIS_URL=${REDIS_URL:-redis://localhost:6379}
export PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://localhost:3000/scrape}

# Global constants.
readonly FIRECRAWL_DIR="/home/firecrawl/app"
readonly API_DIR="$FIRECRAWL_DIR/apps/api"
readonly PLAYWRIGHT_DIR="$FIRECRAWL_DIR/apps/playwright-service-ts"
readonly PID_FILE="/tmp/firecrawl.pid"
# Port the Playwright service listens on; matches the default
# PLAYWRIGHT_MICROSERVICE_URL above.
readonly PLAYWRIGHT_PORT=3000

#######################################
# Signal handler: stop every background process recorded in PID_FILE, then
# any remaining node/npm processes, and exit with status 0.
# Globals:   PID_FILE (read, removed)
#######################################
cleanup() {
  log_warning "Shutting down Firecrawl services..."

  # Kill background processes recorded by this run.
  if [[ -f "$PID_FILE" ]]; then
    local pid
    while IFS= read -r pid || [[ -n "$pid" ]]; do
      # Skip blank or malformed entries instead of passing them to kill.
      [[ "$pid" =~ ^[0-9]+$ ]] || continue
      if kill -0 "$pid" 2>/dev/null; then
        log_info "Stopping process: $pid"
        kill "$pid" 2>/dev/null || true
      fi
    done < "$PID_FILE"
    rm -f -- "$PID_FILE"
  fi

  # Kill any remaining node processes. Best effort: PID_FILE holds the npm
  # wrapper PID, so the node child it spawned may still be alive.
  # NOTE(review): -f matches any command line containing "node"/"npm"; this
  # is only appropriate inside a dedicated container.
  pkill -f "node" 2>/dev/null || true
  pkill -f "npm" 2>/dev/null || true

  log_success "Cleanup completed"
  exit 0
}

# Set up signal handlers.
trap cleanup SIGINT SIGTERM

#######################################
# Clone the Firecrawl repository if FIRECRAWL_DIR has no .git directory,
# otherwise pull origin/main (a failed pull is only a warning). Leaves the
# working directory at FIRECRAWL_DIR. Exits 1 if the clone fails.
#######################################
setup_repository() {
  log_step "Setting up Firecrawl repository..."

  if [[ ! -d "$FIRECRAWL_DIR/.git" ]]; then
    log_info "Cloning Firecrawl repository..."
    git clone https://github.com/mendableai/firecrawl.git "$FIRECRAWL_DIR" || {
      log_error "Failed to clone repository"
      exit 1
    }
    log_success "Repository cloned successfully"
  else
    log_info "Repository already exists, pulling latest changes..."
    cd "$FIRECRAWL_DIR"
    git pull origin main \
      || log_warning "Failed to pull latest changes, continuing with existing code"
  fi

  cd "$FIRECRAWL_DIR"
}

#######################################
# Write FIRECRAWL_DIR/.env from the current configuration.
# Optional values (OPENAI_API_KEY, OPENAI_BASE_URL, MODEL_NAME) are resolved
# here, at generation time, so the file holds concrete values rather than
# literal "${VAR:-default}" text: plain KEY=VALUE .env parsers do not
# evaluate shell parameter expansion.
#######################################
create_environment() {
  log_step "Creating environment configuration..."

  local env_file="$FIRECRAWL_DIR/.env"

  cat > "$env_file" << EOF
# HF Spaces Configuration
PORT=$PORT
HOST=$HOST
ENV=$ENV
LOGGING_LEVEL=$LOGGING_LEVEL

# Authentication
USE_DB_AUTHENTICATION=$USE_DB_AUTHENTICATION

# Services
REDIS_URL=$REDIS_URL
PLAYWRIGHT_MICROSERVICE_URL=$PLAYWRIGHT_MICROSERVICE_URL

# Performance settings for HF Spaces
NUM_WORKERS_PER_QUEUE=2
BLOCK_MEDIA=true
HEADLESS=true

# Optional API keys (can be set via HF Spaces secrets)
OPENAI_API_KEY=${OPENAI_API_KEY:-}
OPENAI_BASE_URL=${OPENAI_BASE_URL:-}
MODEL_NAME=${MODEL_NAME:-gpt-3.5-turbo}

# Test credentials
BULL_AUTH_KEY=fc-demo
TEST_API_KEY=fc-demo-key

# Disable features that might not work in HF Spaces
SUPABASE_ANON_TOKEN=
SUPABASE_URL=
SUPABASE_SERVICE_TOKEN=
POSTHOG_API_KEY=
POSTHOG_HOST=
EOF

  # The file may now contain an API key; keep it readable by the owner only.
  chmod 600 "$env_file"

  log_success "Environment configuration created"
}

#######################################
# Install production npm dependencies in a service directory unless both
# node_modules and package-lock.json are already present. Tries `npm ci`
# first and falls back to `npm install`.
# Arguments: $1 - service directory
#            $2 - message logged when installing
#            $3 - message logged when installation is skipped
#######################################
_install_node_dependencies() {
  local service_dir=$1
  local installing_msg=$2
  local present_msg=$3

  cd "$service_dir"

  if [[ ! -d "node_modules" || ! -f "package-lock.json" ]]; then
    log_info "$installing_msg"
    npm ci --only=production || {
      log_warning "npm ci failed, trying npm install..."
      npm install --only=production
    }
  else
    log_info "$present_msg"
  fi
}

# Install dependencies for the API service.
install_api_dependencies() {
  log_step "Installing API dependencies..."
  _install_node_dependencies "$API_DIR" \
    "Installing fresh dependencies..." \
    "Dependencies already installed"
  log_success "API dependencies installed"
}

# Install dependencies for the Playwright service.
install_playwright_dependencies() {
  log_step "Installing Playwright service dependencies..."
  _install_node_dependencies "$PLAYWRIGHT_DIR" \
    "Installing Playwright dependencies..." \
    "Playwright dependencies already installed"
  log_success "Playwright dependencies installed"
}

#######################################
# Build both services with `npm run build`. An API build failure is fatal
# (exit 1); a Playwright build failure is only a warning.
#######################################
build_applications() {
  log_step "Building applications..."

  # Build API
  log_info "Building API service..."
  cd "$API_DIR"
  npm run build || {
    log_error "Failed to build API service"
    exit 1
  }

  # Build Playwright service
  log_info "Building Playwright service..."
  cd "$PLAYWRIGHT_DIR"
  npm run build || {
    log_warning "Failed to build Playwright service, continuing..."
  }

  log_success "Applications built successfully"
}

#######################################
# Write two helper modules into the built API tree: an in-memory Redis
# stand-in (redis-fallback.js) and an Express health router (health.js).
# This script only writes the files; it does not register them with the API.
#######################################
create_fallbacks() {
  log_step "Creating fallback configurations..."

  local lib_dir="$API_DIR/dist/src/lib"
  local routes_dir="$API_DIR/dist/src/routes"

  # The build output layout is not guaranteed; create the target directories
  # so the redirections below cannot fail on a missing path.
  mkdir -p -- "$lib_dir" "$routes_dir"

  # Create Redis fallback
  cat > "$lib_dir/redis-fallback.js" << 'EOF'
// Simple in-memory fallback for Redis when not available
class MemoryStore {
  constructor() {
    this.store = new Map();
    this.expiry = new Map();
  }

  async get(key) {
    // Check expiry
    if (this.expiry.has(key) && Date.now() > this.expiry.get(key)) {
      this.store.delete(key);
      this.expiry.delete(key);
      return null;
    }
    return this.store.get(key) || null;
  }

  async set(key, value, options = {}) {
    this.store.set(key, value);
    if (options.EX) {
      this.expiry.set(key, Date.now() + (options.EX * 1000));
    }
    return 'OK';
  }

  async del(key) {
    const existed = this.store.has(key);
    this.store.delete(key);
    this.expiry.delete(key);
    return existed ? 1 : 0;
  }

  async exists(key) {
    return this.store.has(key) ? 1 : 0;
  }

  async flushall() {
    this.store.clear();
    this.expiry.clear();
    return 'OK';
  }
}

module.exports = { MemoryStore };
EOF

  # Create health check endpoint
  cat > "$routes_dir/health.js" << 'EOF'
const express = require('express');
const router = express.Router();

router.get('/health', (req, res) => {
  res.json({
    status: 'healthy',
    timestamp: new Date().toISOString(),
    version: '1.0.0-hf-spaces',
    environment: process.env.ENV || 'production'
  });
});

router.get('/', (req, res) => {
  res.json({
    message: '🔥 Firecrawl API on Hugging Face Spaces',
    version: '1.0.0-hf-spaces',
    documentation: 'https://docs.firecrawl.dev',
    endpoints: {
      health: '/health',
      scrape: '/v0/scrape',
      crawl: '/v0/crawl'
    }
  });
});

module.exports = router;
EOF

  log_success "Fallback configurations created"
}

#######################################
# Start the Playwright service in the background, record its PID in
# PID_FILE, and poll its /health endpoint up to 30 times, 2 seconds apart.
# Returns 0 whether or not the service became ready (timeout is a warning).
#######################################
start_playwright_service() {
  log_step "Starting Playwright service..."

  cd "$PLAYWRIGHT_DIR"

  # Set Playwright-specific environment
  export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
  export PLAYWRIGHT_BROWSERS_PATH=/ms-playwright

  # Start Playwright service in background. PORT is set for this command
  # only: exporting it here would overwrite the API's PORT and make the API
  # bind to the Playwright port later on.
  PORT="$PLAYWRIGHT_PORT" npm start > /tmp/playwright.log 2>&1 &
  local playwright_pid=$!
  echo "$playwright_pid" >> "$PID_FILE"

  log_info "Playwright service started with PID: $playwright_pid"

  # Wait for Playwright service to be ready
  log_info "Waiting for Playwright service to be ready..."
  local -r max_attempts=30
  local attempt
  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
    # --max-time keeps one stalled connection from blocking the loop.
    if curl -s --max-time 5 "http://localhost:${PLAYWRIGHT_PORT}/health" \
      > /dev/null 2>&1; then
      log_success "Playwright service is ready"
      return 0
    fi
    log_info "Waiting... ($attempt/$max_attempts)"
    sleep 2
  done

  log_warning "Playwright service may not be fully ready, continuing..."
}

#######################################
# Run the Firecrawl API in the foreground on HOST:PORT. Exits 1 if the node
# process exits non-zero, after printing the tail of /tmp/api.log if that
# file exists.
#######################################
start_api_service() {
  log_step "Starting API service..."

  cd "$API_DIR"

  # Set API-specific environment
  export PORT HOST
  export WORKER_PORT=3005

  log_info "Starting Firecrawl API on $HOST:$PORT"
  log_info "Environment: $ENV"
  log_info "Logging level: $LOGGING_LEVEL"

  # Start API service (this runs in foreground)
  node dist/src/harness.js --start-docker || {
    log_error "Failed to start API service"
    log_info "Checking logs..."
    if [[ -f /tmp/api.log ]]; then
      tail -50 /tmp/api.log
    fi
    exit 1
  }
}

#######################################
# Probe the /health endpoints of the API and the Playwright service and log
# the result of each. Never fails.
#######################################
check_health() {
  log_step "Performing health checks..."

  # Check API health
  if curl -s --max-time 5 "http://localhost:${PORT}/health" \
    > /dev/null 2>&1; then
    log_success "API service is healthy"
  else
    log_warning "API service health check failed"
  fi

  # Check Playwright health (if running)
  if curl -s --max-time 5 "http://localhost:${PLAYWRIGHT_PORT}/health" \
    > /dev/null 2>&1; then
    log_success "Playwright service is healthy"
  else
    log_warning "Playwright service health check failed"
  fi
}

# Print the effective configuration, endpoint URLs and a sample request.
show_startup_info() {
  log_success "🔥 Firecrawl is starting on Hugging Face Spaces!"
  echo
  log_info "Configuration:"
  echo " • Port: $PORT"
  echo " • Host: $HOST"
  echo " • Environment: $ENV"
  echo " • Logging: $LOGGING_LEVEL"
  echo
  log_info "Available endpoints:"
  echo " • Health: http://localhost:$PORT/health"
  echo " • API Docs: http://localhost:$PORT/"
  echo " • Scrape: http://localhost:$PORT/v0/scrape"
  echo " • Crawl: http://localhost:$PORT/v0/crawl"
  echo
  log_info "Test command:"
  echo " curl -X POST http://localhost:$PORT/v0/scrape \\"
  echo " -H 'Content-Type: application/json' \\"
  echo " -d '{\"url\": \"https://example.com\", \"formats\": [\"markdown\"]}'"
  echo
}

# Health monitoring loop: run check_health every 60 seconds, forever.
monitor_health() {
  while true; do
    sleep 60
    check_health
  done
}

#######################################
# Main execution: set up, build, then start the services.
# Arguments: accepted for compatibility; not used.
#######################################
main() {
  echo "🔥 Firecrawl Setup for Hugging Face Spaces"
  echo "=========================================="
  echo

  log_info "Starting setup process..."

  # Start from an empty PID file so cleanup never signals PIDs left over
  # from an earlier run.
  : > "$PID_FILE"

  # Setup steps
  setup_repository
  create_environment
  install_api_dependencies
  install_playwright_dependencies
  build_applications
  create_fallbacks

  show_startup_info

  # Start services
  start_playwright_service

  # Start health monitoring in background. It is launched only now, after
  # setup, so it does not log failures while nothing is running yet.
  monitor_health &
  local health_pid=$!
  echo "$health_pid" >> "$PID_FILE"

  # Wait a bit before starting API
  sleep 5

  # Start API service (runs in foreground and keeps the container alive)
  start_api_service
}

# Run main function
main "$@"