Spaces:

AUXteam
/

Paper2Code

Sleeping

App Files Files Community

Paper2Code / app /api /scrape-url-enhanced /route.ts

AUXteam

Upload folder using huggingface_hub

d530f14 verified about 1 month ago

raw

history blame contribute delete

3.97 kB

	import { NextRequest, NextResponse } from 'next/server';

	/**
	 * Normalizes typographic punctuation to plain ASCII equivalents.
	 *
	 * - Curly, low-9 and reversed single quotes, plus single guillemets, become `'`.
	 * - Curly, low-9 and reversed double quotes, plus double guillemets, become `"`.
	 * - En dash and em dash become `-`.
	 * - Horizontal ellipsis becomes `...`.
	 * - Non-breaking space becomes a regular space.
	 *
	 * Every other character is passed through untouched.
	 *
	 * @param text - Text to normalize.
	 * @returns A copy of `text` with the characters above replaced.
	 */
	function sanitizeQuotes(text: string): string {
	  // Lookup table: typographic character -> ASCII stand-in
	  const asciiFor = new Map<string, string>([
	    ['\u2018', "'"],
	    ['\u2019', "'"],
	    ['\u201A', "'"],
	    ['\u201B', "'"],
	    ['\u2039', "'"],
	    ['\u203A', "'"],
	    ['\u201C', '"'],
	    ['\u201D', '"'],
	    ['\u201E', '"'],
	    ['\u201F', '"'],
	    ['\u00AB', '"'],
	    ['\u00BB', '"'],
	    ['\u2013', '-'],
	    ['\u2014', '-'],
	    ['\u2026', '...'],
	    ['\u00A0', ' '],
	  ]);

	  // One pass over the input; none of the replacements is itself a matched
	  // character, so this is equivalent to replacing each group in sequence.
	  const typographic =
	    /[\u2018\u2019\u201A\u201B\u2039\u203A\u201C\u201D\u201E\u201F\u00AB\u00BB\u2013\u2014\u2026\u00A0]/g;

	  return text.replace(typographic, (ch) => asciiFor.get(ch) ?? ch);
	}

	/**
	 * POST handler: scrapes one URL through the Firecrawl v1 `/scrape` endpoint and
	 * returns the page as AI-ready text plus structured fields.
	 *
	 * Request body: a JSON object with a non-empty string `url`.
	 *
	 * Responses:
	 * - 200: `{ success: true, url, content, screenshot, structured, metadata, message }`.
	 * - 400: the body is not valid JSON, or `url` is missing / not a non-empty string.
	 * - 500: `FIRECRAWL_API_KEY` is unset, Firecrawl answered with a non-OK status or
	 *   an unsuccessful payload, or any other failure; `error` carries the message.
	 */
	export async function POST(request: NextRequest) {
	  // Parse the body separately so malformed JSON is reported as a client
	  // error (400) instead of falling into the generic 500 handler below.
	  let body: unknown;
	  try {
	    body = await request.json();
	  } catch {
	    return NextResponse.json({
	      success: false,
	      error: 'Invalid JSON body'
	    }, { status: 400 });
	  }

	  // The body may legally be `null` or a primitive; only read `url` off an object.
	  const url = typeof body === 'object' && body !== null
	    ? (body as { url?: unknown }).url
	    : undefined;

	  if (typeof url !== 'string' || !url) {
	    return NextResponse.json({
	      success: false,
	      error: 'URL is required'
	    }, { status: 400 });
	  }

	  try {
	    console.log('[scrape-url-enhanced] Scraping with Firecrawl:', url);

	    const FIRECRAWL_API_KEY = process.env.FIRECRAWL_API_KEY;
	    if (!FIRECRAWL_API_KEY) {
	      throw new Error('FIRECRAWL_API_KEY environment variable is not set');
	    }

	    const firecrawlResponse = await fetch('https://api.firecrawl.dev/v1/scrape', {
	      method: 'POST',
	      headers: {
	        'Authorization': `Bearer ${FIRECRAWL_API_KEY}`,
	        'Content-Type': 'application/json'
	      },
	      body: JSON.stringify({
	        url,
	        formats: ['markdown', 'html', 'screenshot'],
	        waitFor: 3000,
	        timeout: 30000,
	        blockAds: true,
	        maxAge: 3600000, // Accept a cached copy up to 1 hour old
	        actions: [
	          {
	            type: 'wait',
	            milliseconds: 2000
	          },
	          {
	            type: 'screenshot',
	            fullPage: false // Just the visible viewport, for performance
	          }
	        ]
	      })
	    });

	    if (!firecrawlResponse.ok) {
	      const errorText = await firecrawlResponse.text();
	      throw new Error(`Firecrawl API error: ${errorText}`);
	    }

	    const data = await firecrawlResponse.json();

	    if (!data?.success || !data.data) {
	      throw new Error('Failed to scrape content');
	    }

	    // `html` is also returned by Firecrawl but is not used here.
	    const { markdown, metadata, screenshot, actions } = data.data;

	    // Prefer the top-level screenshot; fall back to the one produced by the
	    // explicit screenshot action.
	    const screenshotUrl = screenshot || actions?.screenshots?.[0] || null;

	    // sanitizeQuotes needs a string; treat any missing or non-string field as
	    // empty rather than failing the whole request.
	    const sanitizedMarkdown = sanitizeQuotes(typeof markdown === 'string' ? markdown : '');
	    const title = sanitizeQuotes(typeof metadata?.title === 'string' ? metadata.title : '');
	    const description = sanitizeQuotes(
	      typeof metadata?.description === 'string' ? metadata.description : ''
	    );

	    // Format content for AI. Built by joining lines so the output does not
	    // depend on how this source file happens to be indented.
	    const formattedContent = [
	      `Title: ${title}`,
	      `Description: ${description}`,
	      `URL: ${url}`,
	      '',
	      'Main Content:',
	      sanitizedMarkdown
	    ].join('\n').trim();

	    return NextResponse.json({
	      success: true,
	      url,
	      content: formattedContent,
	      screenshot: screenshotUrl,
	      structured: {
	        title,
	        description,
	        content: sanitizedMarkdown,
	        url,
	        screenshot: screenshotUrl
	      },
	      metadata: {
	        // Spread the upstream metadata first so it cannot overwrite the
	        // fields this route sets itself.
	        ...metadata,
	        scraper: 'firecrawl-enhanced',
	        timestamp: new Date().toISOString(),
	        contentLength: formattedContent.length,
	        cached: data.data.cached || false // Indicates if data came from cache
	      },
	      message: 'URL scraped successfully with Firecrawl (with caching for 500% faster performance)'
	    });
	  } catch (error) {
	    console.error('[scrape-url-enhanced] Error:', error);
	    return NextResponse.json({
	      success: false,
	      // Thrown values are not guaranteed to be Error instances.
	      error: error instanceof Error ? error.message : String(error)
	    }, { status: 500 });
	  }
	}