/**
 * PDF Parser
 * Extracts text from PDF files
 */

import pdfParse from 'pdf-parse';

/** Upper bound, in UTF-16 code units, on the extracted text returned to callers. */
const MAX_TEXT_LENGTH = 10000;

/** Error message reported when a PDF yields no usable text. */
const NO_TEXT_ERROR = 'No text content found in PDF';

/**
 * Outcome of a PDF extraction attempt.
 *
 * On success `text`, `title` and `pageCount` are populated; on failure only
 * `error` is populated.
 */
export interface PDFContent {
  success: boolean;
  text?: string;
  title?: string;
  pageCount?: number;
  error?: string;
}

/**
 * Extract text from PDF buffer
 *
 * The raw text is split into lines, each line is trimmed, blank lines are
 * dropped and the survivors are joined with a blank line between them. The
 * result is capped at `MAX_TEXT_LENGTH` code units.
 *
 * @param buffer - Raw bytes of the PDF document.
 * @returns A `PDFContent`; parse errors are logged and reported through
 *   `success: false` / `error` rather than thrown.
 */
export async function extractPDFText(buffer: Buffer): Promise<PDFContent> {
  try {
    const data = await pdfParse(buffer);

    const cleanText = data.text ? cleanExtractedText(data.text) : '';

    // Covers both a missing text layer and one that is whitespace only;
    // neither gives the caller anything to work with.
    if (cleanText.length === 0) {
      return {
        success: false,
        error: NO_TEXT_ERROR,
      };
    }

    return {
      success: true,
      text: truncateText(cleanText, MAX_TEXT_LENGTH),
      pageCount: data.numpages,
      // The title is taken from the full cleaned text, not the truncated one.
      title: extractTitle(cleanText),
    };
  } catch (error) {
    console.error('[pdf parser] extraction failed:', error);
    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to parse PDF',
    };
  }
}

/**
 * Extract PDF text from file path (Node.js only)
 *
 * @param filePath - Path of the PDF file to read.
 * @returns A `PDFContent`; read errors are logged and reported through
 *   `success: false` / `error` rather than thrown.
 */
export async function extractPDFFromFile(filePath: string): Promise<PDFContent> {
  try {
    // `fs` is imported on demand, so it is only loaded when this function runs.
    const fs = await import('fs');
    const buffer = await fs.promises.readFile(filePath);
    return extractPDFText(buffer);
  } catch (error) {
    console.error('[pdf parser] file read failed:', error);
    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to read PDF file',
    };
  }
}

/**
 * Normalize raw extracted text: trim every line, drop empty lines and join
 * the remaining lines with a blank line between them.
 */
function cleanExtractedText(rawText: string): string {
  return rawText
    .split('\n')
    .map((line: string) => line.trim())
    .filter((line: string) => line.length > 0)
    .join('\n\n');
}

/**
 * Cap `text` at `maxLength` UTF-16 code units without leaving a dangling
 * high surrogate (half of an astral character) at the end.
 */
function truncateText(text: string, maxLength: number): string {
  if (text.length <= maxLength) {
    return text;
  }

  let end = maxLength;
  const lastUnit = text.charCodeAt(end - 1);
  // 0xD800–0xDBFF is the high-surrogate range: cutting here would separate
  // it from its low surrogate, so drop it as well.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return text.substring(0, end);
}

/**
 * Extract title from PDF text (first line or heading)
 *
 * @param text - Text to scan, one candidate per line.
 * @returns The first line that, once trimmed, is longer than 5 and shorter
 *   than 200 characters and does not start with `http`; `'PDF Document'`
 *   when no line qualifies.
 */
function extractTitle(text: string): string {
  const lines = text.split('\n');

  // Look for a reasonable title (non-empty line, not too long)
  for (const line of lines) {
    const trimmed = line.trim();
    if (trimmed.length > 5 && trimmed.length < 200 && !trimmed.startsWith('http')) {
      return trimmed;
    }
  }

  return 'PDF Document';
}

/**
 * Chunk PDF text by word count
 *
 * Words are runs of non-whitespace characters; each chunk is its words
 * re-joined with single spaces, so original line breaks are not preserved.
 *
 * @param text - Text to split.
 * @param maxWords - Word count at which a chunk is closed (default 500).
 *   A chunk is closed as soon as it holds at least this many words, so
 *   values below 1 produce one-word chunks.
 * @returns The chunks in order, plus their count. Empty or whitespace-only
 *   input yields no chunks.
 */
export function chunkPDFText(
  text: string,
  maxWords: number = 500
): { chunks: string[]; count: number } {
  if (!text || text.trim().length === 0) {
    return { chunks: [], count: 0 };
  }

  const words = text.split(/\s+/).filter((w) => w.length > 0);

  const chunks: string[] = [];
  let currentChunk: string[] = [];

  for (const word of words) {
    currentChunk.push(word);

    if (currentChunk.length >= maxWords) {
      chunks.push(currentChunk.join(' '));
      currentChunk = [];
    }
  }

  // Add remaining words
  if (currentChunk.length > 0) {
    chunks.push(currentChunk.join(' '));
  }

  return { chunks, count: chunks.length };
}