AUXteam's picture
Upload folder using huggingface_hub
d530f14 verified
import { NextRequest, NextResponse } from 'next/server';
/**
 * Normalizes typographic punctuation to plain ASCII equivalents.
 *
 * Curly/low/reversed quotes and guillemets become straight quotes, en and em
 * dashes become a hyphen, the ellipsis character becomes three dots, and the
 * non-breaking space becomes a regular space.
 *
 * @param text - Input string to normalize.
 * @returns A copy of `text` with the characters above replaced.
 */
function sanitizeQuotes(text: string): string {
  // Each entry pairs a character class with its ASCII stand-in; the entries
  // are applied in order, one global replace per entry.
  const substitutions: ReadonlyArray<readonly [RegExp, string]> = [
    [/[\u2018\u2019\u201A\u201B]/g, "'"], // smart single quotes
    [/[\u201C\u201D\u201E\u201F]/g, '"'], // smart double quotes
    [/[\u00AB\u00BB]/g, '"'], // guillemets
    [/[\u2039\u203A]/g, "'"], // single guillemets
    [/[\u2013\u2014]/g, '-'], // en dash and em dash
    [/[\u2026]/g, '...'], // ellipsis
    [/[\u00A0]/g, ' '], // non-breaking space
  ];

  let result = text;
  for (const [pattern, ascii] of substitutions) {
    result = result.replace(pattern, ascii);
  }
  return result;
}
/**
 * POST handler: scrapes a URL through the Firecrawl v1 `/scrape` endpoint and
 * returns the page content formatted for AI consumption.
 *
 * Request body: a JSON object with a string `url` field.
 *
 * Responses:
 * - 400 when the body is not valid JSON, or `url` is missing or not a
 *   non-empty string.
 * - 500 when `FIRECRAWL_API_KEY` is unset, Firecrawl answers with a non-OK
 *   status or an unsuccessful payload, or any other error is thrown.
 * - 200 with `{ success, url, content, screenshot, structured, metadata, message }`.
 *
 * @param request - Incoming Next.js request carrying the JSON body.
 * @returns A JSON `NextResponse` as described above.
 */
export async function POST(request: NextRequest) {
  try {
    // A malformed body is a client error, so report it as 400 rather than
    // letting it fall through to the generic 500 handler below.
    let body: unknown;
    try {
      body = await request.json();
    } catch {
      return NextResponse.json({
        success: false,
        error: 'Request body must be valid JSON'
      }, { status: 400 });
    }

    // The body may legally be `null`, a number, an array, etc.; only read
    // `url` from a real object and only accept a non-empty string.
    const url = typeof body === 'object' && body !== null
      ? (body as { url?: unknown }).url
      : undefined;

    if (typeof url !== 'string' || url.trim() === '') {
      return NextResponse.json({
        success: false,
        error: 'URL is required'
      }, { status: 400 });
    }

    console.log('[scrape-url-enhanced] Scraping with Firecrawl:', url);

    const FIRECRAWL_API_KEY = process.env.FIRECRAWL_API_KEY;
    if (!FIRECRAWL_API_KEY) {
      throw new Error('FIRECRAWL_API_KEY environment variable is not set');
    }

    // Ask Firecrawl for markdown, html and a screenshot. `maxAge` allows a
    // cached copy to be used if it is less than 1 hour old.
    const firecrawlResponse = await fetch('https://api.firecrawl.dev/v1/scrape', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${FIRECRAWL_API_KEY}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify({
        url,
        formats: ['markdown', 'html', 'screenshot'],
        waitFor: 3000,
        timeout: 30000,
        blockAds: true,
        maxAge: 3600000, // 1 hour, in milliseconds
        actions: [
          {
            type: 'wait',
            milliseconds: 2000
          },
          {
            type: 'screenshot',
            fullPage: false // Just the visible viewport, for performance
          }
        ]
      })
    });

    if (!firecrawlResponse.ok) {
      const errorText = await firecrawlResponse.text();
      throw new Error(`Firecrawl API error: ${errorText}`);
    }

    const data = await firecrawlResponse.json();
    if (!data.success || !data.data) {
      throw new Error('Failed to scrape content');
    }

    // `html` is requested above but not used by the current implementation.
    const { markdown, metadata, screenshot, actions } = data.data;

    // The screenshot may arrive as a direct field or inside the actions result.
    const screenshotUrl = screenshot || actions?.screenshots?.[0] || null;

    // Upstream values are untyped; treat anything that is not a string as
    // empty so that sanitizing can never throw on an unexpected shape.
    const asText = (value: unknown): string => (typeof value === 'string' ? value : '');

    // Sanitize each piece once and reuse the result everywhere below.
    const sanitizedMarkdown = sanitizeQuotes(asText(markdown));
    const title = sanitizeQuotes(asText(metadata?.title));
    const description = sanitizeQuotes(asText(metadata?.description));

    // Format content for AI
    const formattedContent = [
      `Title: ${title}`,
      `Description: ${description}`,
      `URL: ${url}`,
      'Main Content:',
      sanitizedMarkdown
    ].join('\n').trim();

    return NextResponse.json({
      success: true,
      url,
      content: formattedContent,
      screenshot: screenshotUrl,
      structured: {
        title,
        description,
        content: sanitizedMarkdown,
        url,
        screenshot: screenshotUrl
      },
      metadata: {
        // Spread the scraped metadata first so page-supplied keys cannot
        // overwrite the fields this handler sets itself.
        ...metadata,
        scraper: 'firecrawl-enhanced',
        timestamp: new Date().toISOString(),
        contentLength: formattedContent.length,
        cached: data.data.cached || false // Indicates if data came from cache
      },
      message: 'URL scraped successfully with Firecrawl (with caching for 500% faster performance)'
    });
  } catch (error) {
    console.error('[scrape-url-enhanced] Error:', error);
    // Anything can be thrown in JavaScript; only Error instances have `.message`.
    const message = error instanceof Error ? error.message : String(error);
    return NextResponse.json({
      success: false,
      error: message
    }, { status: 500 });
  }
}