#!/usr/bin/env bash
# Complete Firecrawl Setup Script for HF Spaces
# This script handles cloning, building, and running Firecrawl.
#
# NOTE: this copy was captured from a Hugging Face Space page whose status
# banner read "Runtime error".
#
# Bash is required: the script uses brace ranges ({1..30}) and `echo -e`.

# Abort on the first unhandled command failure.
set -e
# Colors for output.
# Stored as literal backslash sequences; they only become terminal escape
# codes when a printing command interprets them.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
NC='\033[0m' # No Color
# Logging functions.
# Each helper writes "<colored tag> <message>" to stdout; $1 is the message.
# Only the color variables are escape-expanded (%b). The message is printed
# verbatim (%s), so backslashes inside it (paths, "\n" in text) are not
# reinterpreted the way `echo -e` would.
log_info() { printf '%b[INFO]%b %s\n' "${BLUE}" "${NC}" "$1"; }
log_success() { printf '%b[SUCCESS]%b %s\n' "${GREEN}" "${NC}" "$1"; }
log_warning() { printf '%b[WARNING]%b %s\n' "${YELLOW}" "${NC}" "$1"; }
log_error() { printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1"; }
log_step() { printf '%b[STEP]%b %s\n' "${PURPLE}" "${NC}" "$1"; }
# Configuration.
# Every setting can be overridden from the environment; the value after ":-"
# is the default used when the variable is unset or empty.
export PORT="${PORT:-7860}"
export HOST="${HOST:-0.0.0.0}"
export ENV="${ENV:-production}"
export LOGGING_LEVEL="${LOGGING_LEVEL:-info}"
export USE_DB_AUTHENTICATION="${USE_DB_AUTHENTICATION:-false}"
export REDIS_URL="${REDIS_URL:-redis://localhost:6379}"
export PLAYWRIGHT_MICROSERVICE_URL="${PLAYWRIGHT_MICROSERVICE_URL:-http://localhost:3000/scrape}"
# Global variables.
# FIRECRAWL_DIR  - checkout location of the Firecrawl repository
# API_DIR        - API application inside the checkout
# PLAYWRIGHT_DIR - Playwright microservice inside the checkout
# PID_FILE       - one PID per line for every background process to stop
FIRECRAWL_DIR="/home/firecrawl/app"
API_DIR="$FIRECRAWL_DIR/apps/api"
PLAYWRIGHT_DIR="$FIRECRAWL_DIR/apps/playwright-service-ts"
PID_FILE="/tmp/firecrawl.pid"
# Cleanup function.
# Stops every process recorded in $PID_FILE, removes the file, then kills any
# remaining node/npm processes and exits the script with status 0.
# Globals: PID_FILE (read, file removed)
cleanup() {
  local pid
  log_warning "Shutting down Firecrawl services..."
  # Kill background processes
  if [ -f "$PID_FILE" ]; then
    # `|| [ -n "$pid" ]` keeps the last entry when the file has no trailing
    # newline; -r stops read from interpreting backslashes.
    while IFS= read -r pid || [ -n "$pid" ]; do
      # Skip blank or non-numeric lines instead of handing them to kill.
      case "$pid" in
        '' | *[!0-9]*) continue ;;
      esac
      if kill -0 "$pid" 2>/dev/null; then
        log_info "Stopping process: $pid"
        # The process may exit between the check and the kill; that is fine.
        kill "$pid" 2>/dev/null || true
      fi
    done < "$PID_FILE"
    rm -f -- "$PID_FILE"
  fi
  # Kill any remaining node processes.
  # NOTE(review): -f matches the pattern anywhere in the full command line,
  # so this also hits unrelated processes whose arguments contain "node" or
  # "npm". Left as-is because the script appears to own its container.
  pkill -f "node" 2>/dev/null || true
  pkill -f "npm" 2>/dev/null || true
  log_success "Cleanup completed"
  exit 0
}
# Set up signal handlers: run cleanup when the script receives SIGINT
# (Ctrl-C) or SIGTERM.
trap cleanup SIGINT SIGTERM
# Check if repository exists and clone if needed.
# Clones Firecrawl into $FIRECRAWL_DIR when no .git directory is present,
# otherwise pulls origin/main (a failed pull is only a warning). On return
# the working directory is $FIRECRAWL_DIR.
# Globals: FIRECRAWL_DIR (read)
# Exits:   1 when the clone fails or the checkout cannot be entered
setup_repository() {
  log_step "Setting up Firecrawl repository..."
  if [ ! -d "$FIRECRAWL_DIR/.git" ]; then
    log_info "Cloning Firecrawl repository..."
    if ! git clone https://github.com/mendableai/firecrawl.git "$FIRECRAWL_DIR"; then
      log_error "Failed to clone repository"
      exit 1
    fi
    log_success "Repository cloned successfully"
  else
    log_info "Repository already exists, pulling latest changes..."
    cd "$FIRECRAWL_DIR" || {
      log_error "Cannot enter $FIRECRAWL_DIR"
      exit 1
    }
    git pull origin main || log_warning "Failed to pull latest changes, continuing with existing code"
  fi
  cd "$FIRECRAWL_DIR" || {
    log_error "Cannot enter $FIRECRAWL_DIR"
    exit 1
  }
}
# Create environment configuration.
# Writes $FIRECRAWL_DIR/.env from the current shell settings.
# Globals: FIRECRAWL_DIR, PORT, HOST, ENV, LOGGING_LEVEL,
#          USE_DB_AUTHENTICATION, REDIS_URL, PLAYWRIGHT_MICROSERVICE_URL,
#          OPENAI_API_KEY, OPENAI_BASE_URL, MODEL_NAME (all read)
create_environment() {
  log_step "Creating environment configuration..."
  # The optional API settings are expanded here, at generation time. A
  # literal "${VAR:-default}" placeholder in the file would be taken as the
  # value itself by .env loaders that do not perform shell-style expansion.
  cat > "$FIRECRAWL_DIR/.env" << EOF
# HF Spaces Configuration
PORT=$PORT
HOST=$HOST
ENV=$ENV
LOGGING_LEVEL=$LOGGING_LEVEL
# Authentication
USE_DB_AUTHENTICATION=$USE_DB_AUTHENTICATION
# Services
REDIS_URL=$REDIS_URL
PLAYWRIGHT_MICROSERVICE_URL=$PLAYWRIGHT_MICROSERVICE_URL
# Performance settings for HF Spaces
NUM_WORKERS_PER_QUEUE=2
BLOCK_MEDIA=true
HEADLESS=true
# Optional API keys (can be set via HF Spaces secrets)
OPENAI_API_KEY=${OPENAI_API_KEY:-}
OPENAI_BASE_URL=${OPENAI_BASE_URL:-}
MODEL_NAME=${MODEL_NAME:-gpt-3.5-turbo}
# Test credentials
BULL_AUTH_KEY=fc-demo
TEST_API_KEY=fc-demo-key
# Disable features that might not work in HF Spaces
SUPABASE_ANON_TOKEN=
SUPABASE_URL=
SUPABASE_SERVICE_TOKEN=
POSTHOG_API_KEY=
POSTHOG_HOST=
EOF
  # The file can now hold a real API key, so keep it private to the owner.
  chmod 600 "$FIRECRAWL_DIR/.env"
  log_success "Environment configuration created"
}
# Install dependencies for API service.
# Runs inside $API_DIR and leaves the working directory there.
#   - node_modules and package-lock.json both present: nothing to do
#   - lockfile present: `npm ci`, falling back to `npm install`
#   - no lockfile: `npm install` directly, because `npm ci` needs a lockfile
# Globals: API_DIR (read)
# Exits:   1 when the directory is missing or the install fails
# NOTE(review): --only=production skips devDependencies; confirm that the
# later `npm run build` step does not need them.
install_api_dependencies() {
  log_step "Installing API dependencies..."
  cd "$API_DIR" || {
    log_error "Cannot enter $API_DIR"
    exit 1
  }
  if [ -d "node_modules" ] && [ -f "package-lock.json" ]; then
    log_info "Dependencies already installed"
  else
    log_info "Installing fresh dependencies..."
    if [ -f "package-lock.json" ]; then
      npm ci --only=production || {
        log_warning "npm ci failed, trying npm install..."
        npm install --only=production || {
          log_error "Failed to install API dependencies"
          exit 1
        }
      }
    else
      npm install --only=production || {
        log_error "Failed to install API dependencies"
        exit 1
      }
    fi
  fi
  log_success "API dependencies installed"
}
# Install dependencies for Playwright service.
# Runs inside $PLAYWRIGHT_DIR and leaves the working directory there.
#   - node_modules and package-lock.json both present: nothing to do
#   - lockfile present: `npm ci`, falling back to `npm install`
#   - no lockfile: `npm install` directly, because `npm ci` needs a lockfile
# Globals: PLAYWRIGHT_DIR (read)
# Exits:   1 when the directory is missing or the install fails
# NOTE(review): --only=production skips devDependencies; confirm that the
# later `npm run build` step does not need them.
install_playwright_dependencies() {
  log_step "Installing Playwright service dependencies..."
  cd "$PLAYWRIGHT_DIR" || {
    log_error "Cannot enter $PLAYWRIGHT_DIR"
    exit 1
  }
  if [ -d "node_modules" ] && [ -f "package-lock.json" ]; then
    log_info "Playwright dependencies already installed"
  else
    log_info "Installing Playwright dependencies..."
    if [ -f "package-lock.json" ]; then
      npm ci --only=production || {
        log_warning "npm ci failed, trying npm install..."
        npm install --only=production || {
          log_error "Failed to install Playwright dependencies"
          exit 1
        }
      }
    else
      npm install --only=production || {
        log_error "Failed to install Playwright dependencies"
        exit 1
      }
    fi
  fi
  log_success "Playwright dependencies installed"
}
# Build the applications.
# Runs `npm run build` in $API_DIR (failure is fatal) and then in
# $PLAYWRIGHT_DIR (failure is tolerated). The closing message says which of
# the two actually built, rather than always claiming success.
# Globals: API_DIR, PLAYWRIGHT_DIR (read)
# Exits:   1 when the API build fails or a directory cannot be entered
build_applications() {
  local playwright_built=true
  log_step "Building applications..."
  # Build API
  log_info "Building API service..."
  cd "$API_DIR" || {
    log_error "Cannot enter $API_DIR"
    exit 1
  }
  if ! npm run build; then
    log_error "Failed to build API service"
    exit 1
  fi
  # Build Playwright service
  log_info "Building Playwright service..."
  cd "$PLAYWRIGHT_DIR" || {
    log_error "Cannot enter $PLAYWRIGHT_DIR"
    exit 1
  }
  if ! npm run build; then
    log_warning "Failed to build Playwright service, continuing..."
    playwright_built=false
  fi
  if [ "$playwright_built" = true ]; then
    log_success "Applications built successfully"
  else
    log_warning "API service built; Playwright service was not built"
  fi
}
# Create in-memory fallbacks for services.
# Writes two JavaScript files into the built API tree:
#   $API_DIR/dist/src/lib/redis-fallback.js - in-memory key/value store
#   $API_DIR/dist/src/routes/health.js      - Express router for /health and /
# The target directories are created first so the step does not depend on
# the build having produced them.
# Globals: API_DIR (read)
create_fallbacks() {
  log_step "Creating fallback configurations..."
  mkdir -p "$API_DIR/dist/src/lib" "$API_DIR/dist/src/routes"
  # Create Redis fallback
  cat > "$API_DIR/dist/src/lib/redis-fallback.js" << 'EOF'
// Simple in-memory fallback for Redis when not available
class MemoryStore {
  constructor() {
    this.store = new Map();  // key -> value
    this.expiry = new Map(); // key -> absolute expiry time in ms
  }

  // Evicts the key if its TTL has passed; returns true when the key is live.
  _isLive(key) {
    if (this.expiry.has(key) && Date.now() > this.expiry.get(key)) {
      this.store.delete(key);
      this.expiry.delete(key);
      return false;
    }
    return this.store.has(key);
  }

  async get(key) {
    if (!this._isLive(key)) {
      return null;
    }
    // Falsy values such as 0 or '' are returned as stored, not as null.
    const value = this.store.get(key);
    return value === undefined ? null : value;
  }

  async set(key, value, options = {}) {
    this.store.set(key, value);
    if (options.EX) {
      this.expiry.set(key, Date.now() + (options.EX * 1000));
    } else {
      // A plain set discards any TTL left over from an earlier set with EX.
      this.expiry.delete(key);
    }
    return 'OK';
  }

  async del(key) {
    const existed = this._isLive(key);
    this.store.delete(key);
    this.expiry.delete(key);
    return existed ? 1 : 0;
  }

  async exists(key) {
    // Expired keys count as absent.
    return this._isLive(key) ? 1 : 0;
  }

  async flushall() {
    this.store.clear();
    this.expiry.clear();
    return 'OK';
  }
}
module.exports = { MemoryStore };
EOF
  # Create health check endpoint
  cat > "$API_DIR/dist/src/routes/health.js" << 'EOF'
const express = require('express');
const router = express.Router();
router.get('/health', (req, res) => {
  res.json({
    status: 'healthy',
    timestamp: new Date().toISOString(),
    version: '1.0.0-hf-spaces',
    environment: process.env.ENV || 'production'
  });
});
router.get('/', (req, res) => {
  res.json({
    message: '🔥 Firecrawl API on Hugging Face Spaces',
    version: '1.0.0-hf-spaces',
    documentation: 'https://docs.firecrawl.dev',
    endpoints: {
      health: '/health',
      scrape: '/v0/scrape',
      crawl: '/v0/crawl'
    }
  });
});
module.exports = router;
EOF
  log_success "Fallback configurations created"
}
# Start Playwright service in background.
# Launches `npm start` in $PLAYWRIGHT_DIR with PORT=3000, appends its PID to
# $PID_FILE, and polls http://localhost:3000/health for up to 30 attempts
# (2 s apart). Returns 0 whether or not the service became ready.
# Globals: PLAYWRIGHT_DIR, PID_FILE (read); PLAYWRIGHT_PID (written);
#          PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD, PLAYWRIGHT_BROWSERS_PATH (exported)
start_playwright_service() {
  local attempt
  log_step "Starting Playwright service..."
  cd "$PLAYWRIGHT_DIR" || {
    log_error "Cannot enter $PLAYWRIGHT_DIR"
    exit 1
  }
  # Set Playwright-specific environment
  export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
  export PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
  # Start Playwright service in background.
  # PORT=3000 is scoped to this one command. Exporting it here would replace
  # the script-wide PORT, and the API started afterwards would bind to 3000
  # too instead of its configured port.
  PORT=3000 npm start > /tmp/playwright.log 2>&1 &
  PLAYWRIGHT_PID=$!
  echo "$PLAYWRIGHT_PID" >> "$PID_FILE"
  log_info "Playwright service started with PID: $PLAYWRIGHT_PID"
  # Wait for Playwright service to be ready
  log_info "Waiting for Playwright service to be ready..."
  for attempt in {1..30}; do
    if curl -s http://localhost:3000/health >/dev/null 2>&1; then
      log_success "Playwright service is ready"
      return 0
    fi
    log_info "Waiting... ($attempt/30)"
    sleep 2
  done
  log_warning "Playwright service may not be fully ready, continuing..."
}
# Start API service.
# Runs `node dist/src/harness.js --start-docker` from $API_DIR and blocks
# until it exits. Node is started as a background job and awaited with
# `wait`: bash defers trap handlers while a foreground command is running,
# but a trapped signal interrupts `wait` at once, so SIGTERM/SIGINT reach the
# script's handler promptly. Node's PID is appended to $PID_FILE.
# Globals: API_DIR, PID_FILE, ENV, LOGGING_LEVEL (read);
#          PORT, HOST, WORKER_PORT (exported)
# Exits:   with node's status when node exits non-zero
start_api_service() {
  local api_pid
  local status=0
  log_step "Starting API service..."
  cd "$API_DIR" || {
    log_error "Cannot enter $API_DIR"
    exit 1
  }
  # Set API-specific environment
  export PORT="$PORT"
  export HOST="$HOST"
  export WORKER_PORT=3005
  log_info "Starting Firecrawl API on $HOST:$PORT"
  log_info "Environment: $ENV"
  log_info "Logging level: $LOGGING_LEVEL"
  # Start API service; waiting on it keeps the container alive.
  node dist/src/harness.js --start-docker &
  api_pid=$!
  echo "$api_pid" >> "$PID_FILE"
  wait "$api_pid" || status=$?
  if [ "$status" -ne 0 ]; then
    log_error "Failed to start API service"
    log_info "Checking logs..."
    if [ -f /tmp/api.log ]; then
      tail -50 /tmp/api.log
    fi
    exit "$status"
  fi
}
# Health check function.
# Probes the API on http://localhost:$PORT/health and the Playwright service
# on http://localhost:3000/health, logging one line per service. Always
# returns 0; the result is only reported through the log.
# curl flags: -f makes HTTP error responses (>= 400) count as failures
# instead of "healthy"; --max-time keeps a hung service from stalling the
# caller.
# Globals: PORT (read)
check_health() {
  log_step "Performing health checks..."
  # Check API health
  if curl -fsS --max-time 5 "http://localhost:${PORT}/health" >/dev/null 2>&1; then
    log_success "API service is healthy"
  else
    log_warning "API service health check failed"
  fi
  # Check Playwright health (if running)
  if curl -fsS --max-time 5 "http://localhost:3000/health" >/dev/null 2>&1; then
    log_success "Playwright service is healthy"
  else
    log_warning "Playwright service health check failed"
  fi
}
# Create startup info.
# Prints the effective configuration, the endpoint URLs, and a sample curl
# command to stdout. Purely informational; nothing is started here.
# Globals: PORT, HOST, ENV, LOGGING_LEVEL (read)
show_startup_info() {
  log_success "🔥 Firecrawl is starting on Hugging Face Spaces!"
  echo
  log_info "Configuration:"
  echo " • Port: $PORT"
  echo " • Host: $HOST"
  echo " • Environment: $ENV"
  echo " • Logging: $LOGGING_LEVEL"
  echo
  log_info "Available endpoints:"
  echo " • Health: http://localhost:$PORT/health"
  echo " • API Docs: http://localhost:$PORT/"
  echo " • Scrape: http://localhost:$PORT/v0/scrape"
  echo " • Crawl: http://localhost:$PORT/v0/crawl"
  echo
  log_info "Test command:"
  # The trailing "\\" prints a single backslash so the sample can be pasted
  # as one multi-line shell command.
  echo " curl -X POST http://localhost:$PORT/v0/scrape \\"
  echo " -H 'Content-Type: application/json' \\"
  echo " -d '{\"url\": \"https://example.com\", \"formats\": [\"markdown\"]}'"
  echo
}
# Main execution.
# Runs the setup steps in order, starts the Playwright service, pauses five
# seconds, then starts the API service, which blocks and keeps the container
# alive. Arguments are accepted but not used.
main() {
  echo "🔥 Firecrawl Setup for Hugging Face Spaces"
  echo "=========================================="
  echo
  log_info "Starting setup process..."
  # Setup steps
  setup_repository
  create_environment
  install_api_dependencies
  install_playwright_dependencies
  build_applications
  create_fallbacks
  show_startup_info
  # Start services
  start_playwright_service
  # Wait a bit before starting API
  sleep 5
  # Start API service (this blocks and keeps the container alive)
  start_api_service
}
# Health monitoring in background.
# Loops forever: sleep 60 seconds, then run check_health. Meant to be started
# as a background job; it ends only when the process is killed.
monitor_health() {
  while :; do
    sleep 60
    check_health
  done
}
# Start from an empty PID file so a later cleanup cannot signal stale PIDs
# left behind by an earlier run.
: > "$PID_FILE"
# Start health monitoring in background and record its PID for cleanup.
monitor_health &
HEALTH_PID=$!
echo "$HEALTH_PID" >> "$PID_FILE"
# Run main function
main "$@"