Spaces:
Sleeping
Sleeping
| import { NextRequest, NextResponse } from 'next/server'; | |
/**
 * Converts typographic punctuation into plain ASCII equivalents.
 *
 * Curly/low/reversed single quotes and single guillemets become `'`,
 * curly/low/reversed double quotes and double guillemets become `"`,
 * en and em dashes become `-`, the ellipsis character becomes `...`,
 * and the non-breaking space becomes a regular space.
 *
 * @param text - Text that may contain typographic punctuation.
 * @returns The text with each listed character swapped for its ASCII form.
 */
function sanitizeQuotes(text: string): string {
  // Each entry maps a run of source characters to the ASCII text that replaces them.
  const groups: ReadonlyArray<readonly [string, string]> = [
    ['\u2018\u2019\u201A\u201B', "'"], // smart single quotes
    ['\u201C\u201D\u201E\u201F', '"'], // smart double quotes
    ['\u00AB\u00BB', '"'], // guillemets
    ['\u2039\u203A', "'"], // single guillemets
    ['\u2013\u2014', '-'], // en dash and em dash
    ['\u2026', '...'], // ellipsis
    ['\u00A0', ' '], // non-breaking space
  ];

  const asciiByChar: Record<string, string> = {};
  for (const [sourceChars, ascii] of groups) {
    for (const ch of sourceChars) {
      asciiByChar[ch] = ascii;
    }
  }

  // One pass is equivalent to replacing group by group: the groups are disjoint
  // and none of the ASCII outputs is itself a character that gets replaced.
  return text.replace(
    /[\u2018\u2019\u201A\u201B\u201C\u201D\u201E\u201F\u00AB\u00BB\u2039\u203A\u2013\u2014\u2026\u00A0]/g,
    (ch) => asciiByChar[ch] ?? ch
  );
}
/**
 * POST route handler: scrapes a URL through the Firecrawl v1 scrape API and
 * returns the page as sanitized markdown plus an optional screenshot.
 *
 * Request body: JSON object with a string `url` property.
 *
 * Responses:
 * - 200 `{ success: true, url, content, screenshot, structured, metadata, message }`
 * - 400 `{ success: false, error }` when the body is not valid JSON or `url`
 *   is missing, blank, or not a string.
 * - 500 `{ success: false, error }` when `FIRECRAWL_API_KEY` is not set, the
 *   Firecrawl request fails, or the Firecrawl payload reports no data.
 *
 * @param request - Incoming request carrying the JSON body.
 */
export async function POST(request: NextRequest) {
  try {
    // A body that cannot be parsed is a client error, not a server failure.
    let payload: unknown;
    try {
      payload = await request.json();
    } catch {
      return NextResponse.json({
        success: false,
        error: 'Invalid JSON body'
      }, { status: 400 });
    }

    // Only read `url` once we know the body is a non-null object; a `null`
    // or primitive body must not be destructured.
    const url =
      typeof payload === 'object' && payload !== null
        ? (payload as Record<string, unknown>).url
        : undefined;

    if (typeof url !== 'string' || url.trim() === '') {
      return NextResponse.json({
        success: false,
        error: 'URL is required'
      }, { status: 400 });
    }

    console.log('[scrape-url-enhanced] Scraping with Firecrawl:', url);

    const FIRECRAWL_API_KEY = process.env.FIRECRAWL_API_KEY;
    if (!FIRECRAWL_API_KEY) {
      throw new Error('FIRECRAWL_API_KEY environment variable is not set');
    }

    const firecrawlResponse = await fetch('https://api.firecrawl.dev/v1/scrape', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${FIRECRAWL_API_KEY}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        url,
        formats: ['markdown', 'html', 'screenshot'],
        waitFor: 3000,
        timeout: 30000,
        blockAds: true,
        maxAge: 3600000, // Accept a cached result if it is less than 1 hour old
        actions: [
          {
            type: 'wait',
            milliseconds: 2000
          },
          {
            type: 'screenshot',
            fullPage: false // Just the visible viewport, for performance
          }
        ]
      })
    });

    if (!firecrawlResponse.ok) {
      const error = await firecrawlResponse.text();
      throw new Error(`Firecrawl API error: ${error}`);
    }

    const data = await firecrawlResponse.json();
    if (!data.success || !data.data) {
      throw new Error('Failed to scrape content');
    }

    // `html` is requested above but not used by this handler.
    const { markdown, metadata, screenshot, actions } = data.data;

    // The screenshot may arrive on the direct field or in the actions result.
    const screenshotUrl = screenshot || actions?.screenshots?.[0] || null;

    // sanitizeQuotes calls String.prototype.replace, so anything that is not
    // a string is treated as empty instead of crashing the handler.
    const asText = (value: unknown): string => (typeof value === 'string' ? value : '');

    const sanitizedMarkdown = sanitizeQuotes(asText(markdown));
    const title = sanitizeQuotes(asText(metadata?.title));
    const description = sanitizeQuotes(asText(metadata?.description));

    // Plain-text rendering of the page for downstream AI consumption.
    const formattedContent = `
Title: ${title}
Description: ${description}
URL: ${url}
Main Content:
${sanitizedMarkdown}
`.trim();

    return NextResponse.json({
      success: true,
      url,
      content: formattedContent,
      screenshot: screenshotUrl,
      structured: {
        title,
        description,
        content: sanitizedMarkdown,
        url,
        screenshot: screenshotUrl
      },
      metadata: {
        scraper: 'firecrawl-enhanced',
        timestamp: new Date().toISOString(),
        contentLength: formattedContent.length,
        cached: data.data.cached || false, // Whether the upstream payload flagged a cached result
        // Spread last: any same-named key in the upstream metadata overrides
        // the fields above. NOTE(review): confirm this precedence is intended.
        ...metadata
      },
      message: 'URL scraped successfully with Firecrawl (with caching for 500% faster performance)'
    });
  } catch (error: unknown) {
    console.error('[scrape-url-enhanced] Error:', error);
    // Thrown values are not guaranteed to be Error instances; narrow before
    // reading `.message` so the response always carries a string.
    const message = error instanceof Error ? error.message : 'Unknown error';
    return NextResponse.json({
      success: false,
      error: message
    }, { status: 500 });
  }
}