FireCrawl / setup-firecrawl.sh
NitinBot001's picture
Create setup-firecrawl.sh
9105f1d verified
#!/bin/bash
# Complete Firecrawl Setup Script for HF Spaces
# This script handles cloning, building, and running Firecrawl
set -e
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
NC='\033[0m' # No Color
# Logging functions
log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
log_warning() { echo -e "${YELLOW}[WARNING]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
log_step() { echo -e "${PURPLE}[STEP]${NC} $1"; }
# Configuration
export PORT=${PORT:-7860}
export HOST=${HOST:-0.0.0.0}
export ENV=${ENV:-production}
export LOGGING_LEVEL=${LOGGING_LEVEL:-info}
export USE_DB_AUTHENTICATION=${USE_DB_AUTHENTICATION:-false}
export REDIS_URL=${REDIS_URL:-redis://localhost:6379}
export PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://localhost:3000/scrape}
# Global variables
FIRECRAWL_DIR="/home/firecrawl/app"
API_DIR="$FIRECRAWL_DIR/apps/api"
PLAYWRIGHT_DIR="$FIRECRAWL_DIR/apps/playwright-service-ts"
PID_FILE="/tmp/firecrawl.pid"
# Cleanup function
cleanup() {
log_warning "Shutting down Firecrawl services..."
# Kill background processes
if [ -f "$PID_FILE" ]; then
while read pid; do
if kill -0 "$pid" 2>/dev/null; then
log_info "Stopping process: $pid"
kill "$pid" 2>/dev/null || true
fi
done < "$PID_FILE"
rm -f "$PID_FILE"
fi
# Kill any remaining node processes
pkill -f "node" 2>/dev/null || true
pkill -f "npm" 2>/dev/null || true
log_success "Cleanup completed"
exit 0
}
# Set up signal handlers
trap cleanup SIGINT SIGTERM
# Check if repository exists and clone if needed
setup_repository() {
log_step "Setting up Firecrawl repository..."
if [ ! -d "$FIRECRAWL_DIR/.git" ]; then
log_info "Cloning Firecrawl repository..."
git clone https://github.com/mendableai/firecrawl.git "$FIRECRAWL_DIR" || {
log_error "Failed to clone repository"
exit 1
}
log_success "Repository cloned successfully"
else
log_info "Repository already exists, pulling latest changes..."
cd "$FIRECRAWL_DIR"
git pull origin main || log_warning "Failed to pull latest changes, continuing with existing code"
fi
cd "$FIRECRAWL_DIR"
}
# Create environment configuration
create_environment() {
log_step "Creating environment configuration..."
cat > "$FIRECRAWL_DIR/.env" << EOF
# HF Spaces Configuration
PORT=$PORT
HOST=$HOST
ENV=$ENV
LOGGING_LEVEL=$LOGGING_LEVEL
# Authentication
USE_DB_AUTHENTICATION=$USE_DB_AUTHENTICATION
# Services
REDIS_URL=$REDIS_URL
PLAYWRIGHT_MICROSERVICE_URL=$PLAYWRIGHT_MICROSERVICE_URL
# Performance settings for HF Spaces
NUM_WORKERS_PER_QUEUE=2
BLOCK_MEDIA=true
HEADLESS=true
# Optional API keys (can be set via HF Spaces secrets)
OPENAI_API_KEY=\${OPENAI_API_KEY:-}
OPENAI_BASE_URL=\${OPENAI_BASE_URL:-}
MODEL_NAME=\${MODEL_NAME:-gpt-3.5-turbo}
# Test credentials
BULL_AUTH_KEY=fc-demo
TEST_API_KEY=fc-demo-key
# Disable features that might not work in HF Spaces
SUPABASE_ANON_TOKEN=
SUPABASE_URL=
SUPABASE_SERVICE_TOKEN=
POSTHOG_API_KEY=
POSTHOG_HOST=
EOF
log_success "Environment configuration created"
}
# Install dependencies for API service
install_api_dependencies() {
log_step "Installing API dependencies..."
cd "$API_DIR"
if [ ! -d "node_modules" ] || [ ! -f "package-lock.json" ]; then
log_info "Installing fresh dependencies..."
npm ci --only=production || {
log_warning "npm ci failed, trying npm install..."
npm install --only=production
}
else
log_info "Dependencies already installed"
fi
log_success "API dependencies installed"
}
# Install dependencies for Playwright service
install_playwright_dependencies() {
log_step "Installing Playwright service dependencies..."
cd "$PLAYWRIGHT_DIR"
if [ ! -d "node_modules" ] || [ ! -f "package-lock.json" ]; then
log_info "Installing Playwright dependencies..."
npm ci --only=production || {
log_warning "npm ci failed, trying npm install..."
npm install --only=production
}
else
log_info "Playwright dependencies already installed"
fi
log_success "Playwright dependencies installed"
}
# Build the applications
build_applications() {
log_step "Building applications..."
# Build API
log_info "Building API service..."
cd "$API_DIR"
npm run build || {
log_error "Failed to build API service"
exit 1
}
# Build Playwright service
log_info "Building Playwright service..."
cd "$PLAYWRIGHT_DIR"
npm run build || {
log_warning "Failed to build Playwright service, continuing..."
}
log_success "Applications built successfully"
}
# Create in-memory fallbacks for services
create_fallbacks() {
log_step "Creating fallback configurations..."
# Create Redis fallback
cat > "$API_DIR/dist/src/lib/redis-fallback.js" << 'EOF'
// Simple in-memory fallback for Redis when not available
class MemoryStore {
constructor() {
this.store = new Map();
this.expiry = new Map();
}
async get(key) {
// Check expiry
if (this.expiry.has(key) && Date.now() > this.expiry.get(key)) {
this.store.delete(key);
this.expiry.delete(key);
return null;
}
return this.store.get(key) || null;
}
async set(key, value, options = {}) {
this.store.set(key, value);
if (options.EX) {
this.expiry.set(key, Date.now() + (options.EX * 1000));
}
return 'OK';
}
async del(key) {
const existed = this.store.has(key);
this.store.delete(key);
this.expiry.delete(key);
return existed ? 1 : 0;
}
async exists(key) {
return this.store.has(key) ? 1 : 0;
}
async flushall() {
this.store.clear();
this.expiry.clear();
return 'OK';
}
}
module.exports = { MemoryStore };
EOF
# Create health check endpoint
cat > "$API_DIR/dist/src/routes/health.js" << 'EOF'
const express = require('express');
const router = express.Router();
router.get('/health', (req, res) => {
res.json({
status: 'healthy',
timestamp: new Date().toISOString(),
version: '1.0.0-hf-spaces',
environment: process.env.ENV || 'production'
});
});
router.get('/', (req, res) => {
res.json({
message: '🔥 Firecrawl API on Hugging Face Spaces',
version: '1.0.0-hf-spaces',
documentation: 'https://docs.firecrawl.dev',
endpoints: {
health: '/health',
scrape: '/v0/scrape',
crawl: '/v0/crawl'
}
});
});
module.exports = router;
EOF
log_success "Fallback configurations created"
}
# Start Playwright service in background
start_playwright_service() {
log_step "Starting Playwright service..."
cd "$PLAYWRIGHT_DIR"
# Set Playwright-specific environment
export PORT=3000
export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
export PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
# Start Playwright service in background
npm start > /tmp/playwright.log 2>&1 &
PLAYWRIGHT_PID=$!
echo "$PLAYWRIGHT_PID" >> "$PID_FILE"
log_info "Playwright service started with PID: $PLAYWRIGHT_PID"
# Wait for Playwright service to be ready
log_info "Waiting for Playwright service to be ready..."
for i in {1..30}; do
if curl -s http://localhost:3000/health >/dev/null 2>&1; then
log_success "Playwright service is ready"
return 0
fi
log_info "Waiting... ($i/30)"
sleep 2
done
log_warning "Playwright service may not be fully ready, continuing..."
}
# Start API service
start_api_service() {
log_step "Starting API service..."
cd "$API_DIR"
# Set API-specific environment
export PORT=$PORT
export HOST=$HOST
export WORKER_PORT=3005
log_info "Starting Firecrawl API on $HOST:$PORT"
log_info "Environment: $ENV"
log_info "Logging level: $LOGGING_LEVEL"
# Start API service (this runs in foreground)
node dist/src/harness.js --start-docker || {
log_error "Failed to start API service"
log_info "Checking logs..."
[ -f /tmp/api.log ] && tail -50 /tmp/api.log
exit 1
}
}
# Health check function
check_health() {
log_step "Performing health checks..."
# Check API health
if curl -s http://localhost:$PORT/health >/dev/null 2>&1; then
log_success "API service is healthy"
else
log_warning "API service health check failed"
fi
# Check Playwright health (if running)
if curl -s http://localhost:3000/health >/dev/null 2>&1; then
log_success "Playwright service is healthy"
else
log_warning "Playwright service health check failed"
fi
}
# Create startup info
show_startup_info() {
log_success "🔥 Firecrawl is starting on Hugging Face Spaces!"
echo
log_info "Configuration:"
echo " • Port: $PORT"
echo " • Host: $HOST"
echo " • Environment: $ENV"
echo " • Logging: $LOGGING_LEVEL"
echo
log_info "Available endpoints:"
echo " • Health: http://localhost:$PORT/health"
echo " • API Docs: http://localhost:$PORT/"
echo " • Scrape: http://localhost:$PORT/v0/scrape"
echo " • Crawl: http://localhost:$PORT/v0/crawl"
echo
log_info "Test command:"
echo " curl -X POST http://localhost:$PORT/v0/scrape \\"
echo " -H 'Content-Type: application/json' \\"
echo " -d '{\"url\": \"https://example.com\", \"formats\": [\"markdown\"]}'"
echo
}
# Main execution
main() {
echo "🔥 Firecrawl Setup for Hugging Face Spaces"
echo "=========================================="
echo
log_info "Starting setup process..."
# Setup steps
setup_repository
create_environment
install_api_dependencies
install_playwright_dependencies
build_applications
create_fallbacks
show_startup_info
# Start services
start_playwright_service
# Wait a bit before starting API
sleep 5
# Start API service (this runs in foreground and keeps the container alive)
start_api_service
}
# Health monitoring in background
monitor_health() {
while true; do
sleep 60
check_health
done
}
# Start health monitoring in background
monitor_health &
HEALTH_PID=$!
echo "$HEALTH_PID" >> "$PID_FILE"
# Run main function
main "$@"