Spaces:
Sleeping
Sleeping
| /** | |
| * PDF Parser | |
| * Extracts text from PDF files | |
| */ | |
| import pdfParse from 'pdf-parse'; | |
/**
 * Outcome of a PDF text-extraction attempt.
 *
 * The extraction helpers in this module fill in `text`, `title` and
 * `pageCount` when they succeed and `error` when they fail.
 */
export interface PDFContent {
  /** `true` when text was extracted, `false` when extraction failed. */
  success: boolean;

  /** Cleaned-up text of the document. */
  text?: string;

  /** Title derived from the extracted text. */
  title?: string;

  /** Number of pages reported by the PDF parser. */
  pageCount?: number;

  /** Human-readable reason for a failed extraction. */
  error?: string;
}
/**
 * Extract text from a PDF buffer.
 *
 * The buffer is parsed with `pdf-parse`. Every line of the parsed text is
 * trimmed, blank lines are dropped, and the remaining lines are re-joined with
 * an empty line between them. The returned text is capped at 10000 UTF-16 code
 * units; the title is derived from the full (untruncated) cleaned text.
 *
 * @param buffer - Raw bytes of the PDF document.
 * @returns `success: true` with `text`, `pageCount` and `title` when the PDF
 *   contains text; otherwise `success: false` with an `error` message. Errors
 *   thrown while parsing are caught, logged and reported through `error`.
 */
export async function extractPDFText(buffer: Buffer): Promise<PDFContent> {
  // Upper bound on the length of the returned text, in UTF-16 code units.
  const maxTextLength = 10000;

  try {
    const data = await pdfParse(buffer);

    // Clean up text: trim each line, drop blank lines, and separate the
    // surviving lines with an empty line.
    const cleanText = (data.text ?? '')
      .split('\n')
      .map((line: string) => line.trim())
      .filter((line: string) => line.length > 0)
      .join('\n\n');

    // Test the cleaned text rather than the raw parser output: raw text that
    // consists only of whitespace/newlines is truthy but carries no content,
    // and must not be reported as a successful extraction.
    if (cleanText.length === 0) {
      return {
        success: false,
        error: 'No text content found in PDF',
      };
    }

    let finalText = cleanText;
    if (cleanText.length > maxTextLength) {
      // Do not cut a surrogate pair in half: when the last kept code unit is a
      // high surrogate, its low surrogate would be cut off, so drop it as well.
      const lastKept = cleanText.charCodeAt(maxTextLength - 1);
      const endsOnHighSurrogate = lastKept >= 0xd800 && lastKept <= 0xdbff;
      finalText = cleanText.slice(0, endsOnHighSurrogate ? maxTextLength - 1 : maxTextLength);
    }

    return {
      success: true,
      text: finalText,
      pageCount: data.numpages,
      title: extractTitle(cleanText),
    };
  } catch (error) {
    console.error('[pdf parser] extraction failed:', error);
    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to parse PDF',
    };
  }
}
/**
 * Read a PDF from disk and extract its text (Node.js only).
 *
 * @param filePath - Path of the PDF file to read.
 * @returns Whatever `extractPDFText` produces for the file's bytes, or a
 *   failed `PDFContent` carrying the error message when loading `fs` or
 *   reading the file throws.
 */
export async function extractPDFFromFile(filePath: string): Promise<PDFContent> {
  try {
    // `fs` is loaded at call time instead of through a top-level import.
    const { promises: fsPromises } = await import('fs');
    const pdfBytes = await fsPromises.readFile(filePath);
    return extractPDFText(pdfBytes);
  } catch (readError) {
    console.error('[pdf parser] file read failed:', readError);

    let message = 'Failed to read PDF file';
    if (readError instanceof Error) {
      message = readError.message;
    }
    return { success: false, error: message };
  }
}
/**
 * Pick a title for a PDF out of its extracted text.
 *
 * @param text - Extracted text, one candidate per `\n`-separated line.
 * @returns The first line that, once trimmed, is longer than 5 and shorter
 *   than 200 characters and does not start with `http`; `'PDF Document'` when
 *   no line qualifies.
 */
function extractTitle(text: string): string {
  const candidate = text
    .split('\n')
    .map((rawLine) => rawLine.trim())
    .find(
      (line) => line.length > 5 && line.length < 200 && !line.startsWith('http')
    );

  // Nothing title-like was found: fall back to a generic placeholder.
  return candidate ?? 'PDF Document';
}
/**
 * Split text into chunks of whitespace-separated words.
 *
 * @param text - Text to split; a word is any run of non-whitespace characters.
 * @param maxWords - Word count at which the current chunk is closed
 *   (default 500).
 * @returns The chunks, each with its words re-joined by single spaces, and the
 *   number of chunks. Empty or whitespace-only text yields no chunks.
 */
export function chunkPDFText(
  text: string,
  maxWords: number = 500
): { chunks: string[]; count: number } {
  if (!text || text.trim().length === 0) {
    return { chunks: [], count: 0 };
  }

  const tokens: string[] = text.match(/\S+/g) ?? [];
  const chunks: string[] = [];

  // `chunkStart` marks the first word of the chunk being built; a chunk is
  // closed as soon as the number of words collected reaches `maxWords`.
  let chunkStart = 0;
  for (let end = 1; end <= tokens.length; end += 1) {
    if (end - chunkStart >= maxWords) {
      chunks.push(tokens.slice(chunkStart, end).join(' '));
      chunkStart = end;
    }
  }

  // Whatever is left over forms a final, shorter chunk.
  if (chunkStart < tokens.length) {
    chunks.push(tokens.slice(chunkStart).join(' '));
  }

  return { chunks, count: chunks.length };
}