Spaces:

AIgoose
/

carouselforge

Sleeping

carouselforge / src /lib /parser /pdf.ts

CarouselForge Developer

fix: resolve HF build - ESLint rule and img tags

ef4c2aa about 1 month ago

2.84 kB

	/**
	* PDF Parser
	* Extracts text from PDF files
	*/

	import pdfParse from 'pdf-parse';

	/**
	 * Outcome of a PDF text-extraction attempt.
	 *
	 * Only `success` is always present; every other field is optional.
	 */
	export interface PDFContent {
		/** `true` when text was extracted, `false` when extraction failed. */
		success: boolean;

		/** Human-readable failure description. */
		error?: string;

		/** Extracted text content. */
		text?: string;
		/** Title associated with the document. */
		title?: string;
		/** Number of pages in the document. */
		pageCount?: number;
	}

	/**
	 * Extract normalized text from an in-memory PDF.
	 *
	 * Each line returned by `pdfParse` is trimmed, blank lines are dropped and the
	 * remaining lines are joined with a blank line between them. The returned text
	 * is capped at 10000 characters; the title is derived from the untruncated
	 * text.
	 *
	 * @param buffer - Raw bytes of the PDF document.
	 * @returns On success, `success: true` with `text`, `pageCount` and `title`.
	 *   When parsing throws, or no text remains after cleanup, `success: false`
	 *   with an `error` message. Parsing errors are logged and reported through
	 *   the result instead of being rethrown.
	 */
	export async function extractPDFText(buffer: Buffer): Promise<PDFContent> {
		const MAX_TEXT_LENGTH = 10000;

		try {
			const data = await pdfParse(buffer);

			// Clean up text: trim every line, drop blank ones, separate the rest
			// with an empty line.
			const cleanText = (data.text ?? '')
				.split('\n')
				.map((line: string) => line.trim())
				.filter((line: string) => line.length > 0)
				.join('\n\n');

			// Test for emptiness AFTER cleanup. Raw text made only of whitespace or
			// newlines is truthy, so checking the raw string would report success
			// with an empty `text` and a placeholder title.
			if (cleanText.length === 0) {
				return {
					success: false,
					error: 'No text content found in PDF',
				};
			}

			// If text is too long, keep only the first MAX_TEXT_LENGTH characters.
			const finalText =
				cleanText.length > MAX_TEXT_LENGTH
					? cleanText.substring(0, MAX_TEXT_LENGTH)
					: cleanText;

			return {
				success: true,
				text: finalText,
				pageCount: data.numpages,
				title: extractTitle(cleanText),
			};
		} catch (error) {
			console.error('[pdf parser] extraction failed:', error);
			return {
				success: false,
				error: error instanceof Error ? error.message : 'Failed to parse PDF',
			};
		}
	}

	/**
	 * Read a PDF from disk and extract its text (Node.js only).
	 *
	 * @param filePath - Path of the PDF file to read.
	 * @returns The result of `extractPDFText` for the file's bytes, or a
	 *   `success: false` result carrying the error message when loading `fs` or
	 *   reading the file throws.
	 */
	export async function extractPDFFromFile(filePath: string): Promise<PDFContent> {
		try {
			// `fs` is imported dynamically, so it is only loaded when this function runs.
			const { promises: fsPromises } = await import('fs');
			const pdfBytes = await fsPromises.readFile(filePath);
			return extractPDFText(pdfBytes);
		} catch (err) {
			console.error('[pdf parser] file read failed:', err);
			const message = err instanceof Error ? err.message : 'Failed to read PDF file';
			return { success: false, error: message };
		}
	}

	/**
	 * Choose a title from PDF text.
	 *
	 * Returns the first line that, once trimmed, is longer than 5 and shorter
	 * than 200 characters and does not start with `http`.
	 *
	 * @param text - Text to scan, one candidate per `\n`-separated line.
	 * @returns The first qualifying trimmed line, or `'PDF Document'` when no
	 *   line qualifies.
	 */
	function extractTitle(text: string): string {
		const candidate = text
			.split('\n')
			.map((rawLine) => rawLine.trim())
			.find(
				(entry) =>
					entry.length > 5 && entry.length < 200 && !entry.startsWith('http')
			);

		// A qualifying line is never empty, so `undefined` is the only "not found" case.
		return candidate ?? 'PDF Document';
	}

	/**
	 * Split text into chunks of at most `maxWords` whitespace-separated words.
	 *
	 * Words inside a chunk are joined with a single space, so the original
	 * whitespace (including line breaks) is not preserved.
	 *
	 * @param text - Text to split. Empty or whitespace-only text yields no chunks.
	 * @param maxWords - Maximum number of words per chunk (default 500).
	 *   Fractional values are rounded down, values below 1 are treated as 1, and
	 *   `NaN` falls back to the default.
	 * @returns The chunks in document order, plus `count` (the number of chunks).
	 */
	export function chunkPDFText(
		text: string,
		maxWords: number = 500
	): { chunks: string[]; count: number } {
		if (!text || text.trim().length === 0) {
			return { chunks: [], count: 0 };
		}

		// Normalize the limit to an integer >= 1 so no chunk exceeds `maxWords`
		// and the slicing loop below always advances.
		const limit = Number.isNaN(maxWords) ? 500 : Math.max(1, Math.floor(maxWords));

		const words = text.split(/\s+/).filter((w) => w.length > 0);

		const chunks: string[] = [];
		for (let start = 0; start < words.length; start += limit) {
			chunks.push(words.slice(start, start + limit).join(' '));
		}

		return { chunks, count: chunks.length };
	}