/**
 * Zero-dependency PDF text extraction.
 *
 * Handles PDF 1.x text streams: decompress FlateDecode streams with the
 * Node.js built-in zlib, then extract text from the text-showing operators
 * (Tj, TJ, ', ") of each content stream.
 * Not a full PDF parser — designed for text-layer PDFs (reports, docs).
 * Hex strings (<...>) are not decoded, and scanned (image-only) PDFs return
 * empty text.
 */

import { inflateSync } from 'zlib';
import { log } from './config.js';

const MAX_STREAMS = 200;
const MAX_STREAM_DECODED = 5 * 1024 * 1024;
const MAX_TOTAL_DECODED = 25 * 1024 * 1024;

/** How far back from the `stream` keyword to look for the stream dictionary. */
const DICT_LOOKBACK = 500;

/** Operands kept while waiting for an operator; text operators need at most 3. */
const MAX_PENDING_OPERANDS = 16;

/** PDF white-space characters: NUL, TAB, LF, FF, CR, SPACE. */
const PDF_WHITESPACE = '\0\t\n\f\r ';

/** PDF delimiter characters; they terminate numbers, names and operators. */
const PDF_DELIMITERS = '()<>[]{}/%';

/** PDF numeric token. Written without overlapping quantifiers so it stays linear. */
const NUMBER_RE = /^[+-]?(?:\d+(?:\.\d*)?|\.\d+)$/;

/**
 * Extract text from a PDF buffer.
 * @param {Buffer} buf - Raw PDF bytes
 * @returns {string} Extracted text, or empty string if no text layer
 * @throws {Error} When the stream count or decoded size exceeds the safety limits
 */
export function extractPdfText(buf) {
  const pages = [];
  let streamCount = 0;
  let totalDecoded = 0;

  // Walk every stream...endstream block in file order.
  let pos = 0;
  while (pos < buf.length) {
    const found = findStreamKeyword(buf, pos);
    if (found === null) break;

    // Search from the header's own newline so an empty stream
    // ("stream\nendstream") pairs with its own terminator.
    const endStream = buf.indexOf('\nendstream', found.dataStart - 1);
    if (endStream === -1) break;
    const streamData = buf.subarray(found.dataStart, Math.max(found.dataStart, endStream));
    const nextPos = endStream + 10; // skip "\nendstream"

    streamCount++;
    if (streamCount > MAX_STREAMS) throw new Error('PDF stream count exceeds safety limit');

    // `pos` still marks the end of the previous stream here, which keeps the
    // dictionary lookback from reaching into an earlier object.
    const isFlate = streamDictionaryText(buf, pos, found.keywordStart).includes('FlateDecode');
    pos = nextPos;

    let decoded;
    if (isFlate) {
      let inflated;
      try {
        inflated = inflateSync(streamData, { maxOutputLength: MAX_STREAM_DECODED });
      } catch (e) {
        // Size-limit failures abort the whole extraction; any other inflate
        // error means this stream is corrupt or not really zlib data: skip it.
        if (/limit|exceed|maxOutputLength|Buffer larger/i.test(e.message) || e.code === 'ERR_BUFFER_TOO_LARGE') throw e;
        continue;
      }
      if (inflated.length > MAX_STREAM_DECODED) throw new Error('PDF decoded content exceeds safety limit');
      totalDecoded += inflated.length;
      if (totalDecoded > MAX_TOTAL_DECODED) throw new Error('PDF decoded content exceeds safety limit');
      decoded = inflated.toString('latin1');
    } else {
      totalDecoded += streamData.length;
      if (streamData.length > MAX_STREAM_DECODED || totalDecoded > MAX_TOTAL_DECODED) {
        throw new Error('PDF decoded content exceeds safety limit');
      }
      decoded = streamData.toString('latin1');
    }

    // Extract text from PDF operators
    const text = extractTextOps(decoded).trim();
    if (text) pages.push(text);
  }

  return pages.join('\n\n');
}

/**
 * Locate the next `stream` keyword that opens a stream body.
 * The keyword must be followed by LF or CR LF, and must not be the tail of
 * an `endstream` keyword.
 * @param {Buffer} buf - Raw PDF bytes
 * @param {number} from - Offset to start searching at
 * @returns {{ keywordStart: number, dataStart: number } | null}
 */
function findStreamKeyword(buf, from) {
  let at = buf.indexOf('stream', from);
  while (at !== -1) {
    const after = at + 6;
    const isEndKeyword =
      at >= 3 && buf[at - 3] === 0x65 && buf[at - 2] === 0x6e && buf[at - 1] === 0x64; // "end"
    if (!isEndKeyword) {
      if (buf[after] === 0x0a) return { keywordStart: at, dataStart: after + 1 };
      if (buf[after] === 0x0d && buf[after + 1] === 0x0a) return { keywordStart: at, dataStart: after + 2 };
    }
    at = buf.indexOf('stream', after);
  }
  return null;
}

/**
 * Return the text just before a `stream` keyword that can belong to that
 * stream's own dictionary: at most DICT_LOOKBACK bytes, never reaching before
 * `floor` (the end of the previous stream) nor before the last `endobj`.
 * @param {Buffer} buf - Raw PDF bytes
 * @param {number} floor - Lowest offset the lookback may reach
 * @param {number} keywordStart - Offset of the `stream` keyword
 * @returns {string}
 */
function streamDictionaryText(buf, floor, keywordStart) {
  const from = Math.max(floor, keywordStart - DICT_LOOKBACK);
  const text = buf.subarray(from, keywordStart).toString('latin1');
  const previousObjectEnd = text.lastIndexOf('endobj');
  return previousObjectEnd === -1 ? text : text.slice(previousObjectEnd + 6);
}

/**
 * Extract text from PDF content stream operators.
 *
 * Scans the stream once, left to right, so operators are applied in document
 * order and literal strings are consumed as single tokens (text such as
 * "MEET" can never be mistaken for the ET operator).
 *
 * Handles inside BT...ET text objects:
 *   (text) Tj, (text) ', (text) ", [(text) kern ...] TJ — show text
 *   Td / TD with a non-zero vertical offset, T*, ', "  — start a new line
 * A text object that contained any Td/TD also ends the current line at ET.
 * In TJ arrays a kerning value below -100 is rendered as a space.
 * Tm is not interpreted and hex strings are skipped.
 *
 * @param {string} stream - Decoded stream content (latin1)
 * @returns {string} Extracted lines joined with "\n" ('' when there is no text)
 */
function extractTextOps(stream) {
  const lines = [];
  let currentLine = '';
  let inTextObject = false;
  let sawMoveInBlock = false;

  // Operands collected since the last operator. `target` is the list that
  // currently receives tokens: `operands` itself, or the innermost open [...].
  let operands = [];
  let target = operands;
  let enclosing = [];

  const endLine = () => {
    const finished = currentLine.trim();
    if (finished) lines.push(finished);
    currentLine = '';
  };

  const showString = (operand) => {
    if (typeof operand === 'string') currentLine += decodePdfString(operand);
  };

  const pushOperand = (value) => {
    target.push(value);
    // Bound memory on operator-free garbage (binary streams are scanned too).
    if (target === operands && operands.length > MAX_PENDING_OPERANDS) operands.shift();
  };

  const len = stream.length;
  let i = 0;
  while (i < len) {
    const ch = stream[i];

    if (PDF_WHITESPACE.includes(ch)) {
      i++;
      continue;
    }

    if (ch === '%') {
      // Comment: runs to the end of the line.
      while (i < len && stream[i] !== '\n' && stream[i] !== '\r') i++;
      continue;
    }

    if (ch === '(') {
      const close = findLiteralStringEnd(stream, i);
      // Unterminated string: stop instead of rescanning from every later "(",
      // which would make the scan quadratic on hostile input.
      if (close === -1) break;
      pushOperand(stream.slice(i + 1, close));
      i = close + 1;
      continue;
    }

    if (ch === '<') {
      if (stream[i + 1] === '<') {
        i += 2; // dictionary opener
        continue;
      }
      // Hex string: skipped, not decoded.
      const close = stream.indexOf('>', i + 1);
      if (close === -1) break;
      i = close + 1;
      continue;
    }

    if (ch === '[') {
      const items = [];
      pushOperand(items);
      enclosing.push(target);
      target = items;
      i++;
      continue;
    }

    if (ch === ']') {
      if (enclosing.length > 0) target = enclosing.pop();
      i++;
      continue;
    }

    if (ch === '/') {
      // Name object: irrelevant for text extraction.
      i++;
      while (i < len && isRegularChar(stream[i])) i++;
      continue;
    }

    if (PDF_DELIMITERS.includes(ch)) {
      i++; // stray ")", ">", "{" or "}"
      continue;
    }

    // A run of regular characters is either a number or an operator keyword.
    let end = i + 1;
    while (end < len && isRegularChar(stream[end])) end++;
    const word = stream.slice(i, end);
    i = end;

    if (NUMBER_RE.test(word)) {
      pushOperand(Number.parseFloat(word));
      continue;
    }

    const last = operands[operands.length - 1];
    switch (word) {
      case 'BT':
        inTextObject = true;
        sawMoveInBlock = false;
        break;
      case 'ET':
        if (inTextObject && sawMoveInBlock) endLine();
        inTextObject = false;
        break;
      case 'ID':
        // Inline image data is binary; jump past it to the EI operator.
        i = skipInlineImageData(stream, i);
        break;
      case 'Tj':
        if (inTextObject) showString(last);
        break;
      case "'":
      case '"':
        // Both operators move to the next line before showing the string.
        if (inTextObject) {
          endLine();
          showString(last);
        }
        break;
      case 'TJ':
        if (inTextObject && Array.isArray(last)) {
          for (const item of last) {
            if (typeof item === 'string') showString(item);
            else if (typeof item === 'number' && item < -100) currentLine += ' ';
          }
        }
        break;
      case 'Td':
      case 'TD': {
        const tx = operands[operands.length - 2];
        if (inTextObject && typeof tx === 'number' && typeof last === 'number') {
          sawMoveInBlock = true;
          if (last !== 0) endLine(); // vertical move: following text is a new line
        }
        break;
      }
      case 'T*':
        if (inTextObject) endLine();
        break;
      default:
        break;
    }

    // Every operator consumes the operands that preceded it.
    operands = [];
    target = operands;
    enclosing = [];
  }

  endLine();
  return lines.join('\n');
}

/**
 * Tell whether a character is a PDF "regular" character, i.e. neither
 * white-space nor a delimiter.
 * @param {string} ch - Single character
 * @returns {boolean}
 */
function isRegularChar(ch) {
  return !PDF_WHITESPACE.includes(ch) && !PDF_DELIMITERS.includes(ch);
}

/**
 * Find the ")" that closes the literal string opening at `open`, honouring
 * backslash escapes and balanced nested parentheses.
 * @param {string} stream - Decoded stream content
 * @param {number} open - Index of the opening "("
 * @returns {number} Index of the closing ")", or -1 when unterminated
 */
function findLiteralStringEnd(stream, open) {
  let depth = 0;
  for (let i = open; i < stream.length; i++) {
    const ch = stream[i];
    if (ch === '\\') {
      i++; // the escaped character can never open or close the string
    } else if (ch === '(') {
      depth++;
    } else if (ch === ')') {
      depth--;
      if (depth === 0) return i;
    }
  }
  return -1;
}

/**
 * Skip the binary payload of an inline image (BI ... ID <data> EI).
 * @param {string} stream - Decoded stream content
 * @param {number} from - Index just after the ID operator
 * @returns {number} Index just after the EI operator, or stream.length if absent
 */
function skipInlineImageData(stream, from) {
  const endMarker = /\sEI(?=\s|$)/g;
  endMarker.lastIndex = from;
  const hit = endMarker.exec(stream);
  return hit === null ? stream.length : hit.index + hit[0].length;
}

/**
 * Decode PDF literal-string escapes: \n, \r, \t, \b, \f, \\, \(, \),
 * 1–3 digit octal codes (high-order overflow ignored) and backslash-newline
 * line continuations. For any other escaped character the backslash is
 * dropped and the character kept.
 * @param {string} s - Raw string content between the outer parentheses
 * @returns {string} Decoded text
 */
function decodePdfString(s) {
  return s.replace(/\\([0-7]{1,3}|\r\n|[\s\S])/g, (_, c) => {
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'b':
        return '\b';
      case 'f':
        return '\f';
      case '\r\n':
      case '\r':
      case '\n':
        return ''; // line continuation
      default:
        break;
    }
    if (c >= '0' && c <= '7') return String.fromCharCode(Number.parseInt(c, 8) & 0xff);
    return c; // "(", ")", "\" and unknown escapes: keep the character itself
  });
}

/**
 * Try to extract text from base64-encoded PDF.
 *
 * Returns null when the data is not a PDF or extraction fails for a reason
 * other than a safety limit; when a safety limit is hit, returns a
 * placeholder text with pageCount 0. Failures are reported via log.warn.
 *
 * @param {string} base64Data - Base64 encoded PDF
 * @returns {{ text: string, pageCount: number } | null}
 */
export function tryExtractPdf(base64Data) {
  try {
    const buf = Buffer.from(base64Data, 'base64');
    if (buf.length < 5 || buf.subarray(0, 5).toString() !== '%PDF-') return null;

    const text = extractPdfText(buf);
    if (!text.trim()) {
      log.warn('PDF has no extractable text layer (scanned/image-only PDF)');
      return { text: '', pageCount: 0 };
    }

    // Count "/Type /Page" entries; \b keeps "/Pages" tree nodes out of the count.
    const pageCount = (buf.toString('latin1').match(/\/Type\s*\/Page\b/g) || []).length;
    return { text, pageCount };
  } catch (e) {
    log.warn(`PDF extraction failed: ${e.message}`);
    if (/exceeds safety limit|maxOutputLength|too large|Buffer larger/i.test(e.message) || e.code === 'ERR_BUFFER_TOO_LARGE') {
      return { text: 'PDF 内容无法提取', pageCount: 0 };
    }
    return null;
  }
}