/**
* PDF Parser
* Extracts text from PDF files
*/
import pdfParse from 'pdf-parse';
/**
* Outcome of a PDF text extraction attempt.
* Only `success` is always present; the remaining fields are filled in
* depending on whether the attempt succeeded or failed.
*/
export interface PDFContent {
/** True when text was extracted, false when extraction failed. */
success: boolean;
/** Extracted text; provided on success. */
text?: string;
/** Title derived from the extracted text; provided on success. */
title?: string;
/** Page count reported by the PDF parser; provided on success. */
pageCount?: number;
/** Description of what went wrong; provided when `success` is false. */
error?: string;
}
/**
 * Extract readable text from an in-memory PDF.
 *
 * The raw text reported by `pdf-parse` is normalised line by line: every line
 * is trimmed, blank lines are dropped and the remaining lines are joined with
 * a blank line between them. The result is capped at 10000 characters.
 * Failures are reported through the returned object instead of being thrown.
 *
 * @param buffer - Raw bytes of the PDF document.
 * @returns On success, the cleaned (possibly truncated) text together with the
 *   page count and a title taken from the text; otherwise `success: false`
 *   with an error message.
 */
export async function extractPDFText(buffer: Buffer): Promise<PDFContent> {
  const maxTextLength = 10000;
  const noTextError = 'No text content found in PDF';

  try {
    const data = await pdfParse(buffer);
    if (!data.text) {
      return {
        success: false,
        error: noTextError,
      };
    }

    // Clean up text: trim each line, drop blank ones, separate the rest with a blank line
    const cleanText = data.text
      .split('\n')
      .map((line: string) => line.trim())
      .filter((line: string) => line.length > 0)
      .join('\n\n');

    // Whitespace-only output (a PDF without a text layer may yield nothing but
    // newlines) is truthy, so it passes the guard above yet cleans down to an
    // empty string. Report it as "no text" rather than as an empty success.
    if (cleanText.length === 0) {
      return {
        success: false,
        error: noTextError,
      };
    }

    // If text is too long, truncate to the first 10000 characters
    let finalText = cleanText;
    if (cleanText.length > maxTextLength) {
      // Avoid cutting a UTF-16 surrogate pair in half: when the last kept code
      // unit is a high surrogate, drop it so the text does not end on a lone half.
      const lastKeptUnit = cleanText.charCodeAt(maxTextLength - 1);
      const endsOnHighSurrogate = lastKeptUnit >= 0xd800 && lastKeptUnit <= 0xdbff;
      finalText = cleanText.substring(0, endsOnHighSurrogate ? maxTextLength - 1 : maxTextLength);
    }

    return {
      success: true,
      text: finalText,
      pageCount: data.numpages,
      // The title is searched in the full cleaned text, not the truncated one
      title: extractTitle(cleanText),
    };
  } catch (error) {
    console.error('[pdf parser] extraction failed:', error);
    return {
      success: false,
      error: error instanceof Error ? error.message : 'Failed to parse PDF',
    };
  }
}
/**
 * Read a PDF from disk and extract its text (Node.js only).
 *
 * The `fs` module is loaded lazily inside the function. A failure to load it
 * or to read the file is logged and reported through the returned object.
 *
 * @param filePath - Location of the PDF file to read.
 * @returns The result of `extractPDFText` for the file's bytes, or
 *   `success: false` with an error message when the file cannot be read.
 */
export async function extractPDFFromFile(filePath: string): Promise<PDFContent> {
  try {
    const { promises: fsPromises } = await import('fs');
    const fileBytes = await fsPromises.readFile(filePath);
    return extractPDFText(fileBytes);
  } catch (readError) {
    console.error('[pdf parser] file read failed:', readError);
    const message = readError instanceof Error ? readError.message : 'Failed to read PDF file';
    return { success: false, error: message };
  }
}
/**
 * Pick a title out of extracted PDF text.
 *
 * The first line that, once trimmed, is longer than 5 and shorter than 200
 * characters and does not start with `http` is used as the title.
 *
 * @param text - Extracted PDF text, one candidate per line.
 * @returns The trimmed title line, or `'PDF Document'` when no line qualifies.
 */
function extractTitle(text: string): string {
  const fallbackTitle = 'PDF Document';

  // A plausible title is neither a tiny fragment, nor a paragraph, nor a URL.
  const isPlausibleTitle = (candidate: string): boolean =>
    candidate.length > 5 && candidate.length < 200 && !candidate.startsWith('http');

  const firstMatch = text.split('\n').find((rawLine) => isPlausibleTitle(rawLine.trim()));
  return firstMatch === undefined ? fallbackTitle : firstMatch.trim();
}
/**
 * Split text into chunks of whitespace-separated words.
 *
 * Words are re-joined with single spaces, so the original whitespace
 * (newlines, tabs, repeated spaces) is not preserved inside a chunk.
 *
 * @param text - Text to split; empty or whitespace-only text yields no chunks.
 * @param maxWords - Word count at which a chunk is closed (default 500).
 * @returns The chunks in order together with how many there are.
 */
export function chunkPDFText(
  text: string,
  maxWords: number = 500
): { chunks: string[]; count: number } {
  // Every maximal run of non-whitespace characters is one word.
  const tokens = text ? text.match(/\S+/g) : null;
  if (tokens === null) {
    return { chunks: [], count: 0 };
  }

  const chunks: string[] = [];
  let chunkStart = 0;

  // `consumed` is the number of words taken so far; the open chunk covers
  // tokens[chunkStart .. consumed) and is closed once it reaches maxWords.
  for (let consumed = 1; consumed <= tokens.length; consumed++) {
    if (consumed - chunkStart >= maxWords) {
      chunks.push(tokens.slice(chunkStart, consumed).join(' '));
      chunkStart = consumed;
    }
  }

  // Whatever is left over forms a final, shorter chunk.
  if (chunkStart < tokens.length) {
    chunks.push(tokens.slice(chunkStart).join(' '));
  }

  return { chunks, count: chunks.length };
}
|