| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import { inflateSync } from 'zlib'; |
| import { log } from './config.js'; |
|
|
/** Maximum number of `stream … endstream` sections processed per document. */
const MAX_STREAMS = 200;
/** Upper bound, in bytes, on the decoded size of any single stream (5 MiB). */
const MAX_STREAM_DECODED = 5 * 1024 * 1024;
/** Upper bound, in bytes, on the combined decoded size of all streams (25 MiB). */
const MAX_TOTAL_DECODED = 25 * 1024 * 1024;
|
|
| |
| |
| |
| |
| |
/**
 * Returns the text that precedes a `stream` keyword and belongs to the same
 * object, looking back at most 500 bytes.
 *
 * The window is cut at the last `endobj` / `endstream` marker so that a short
 * dictionary does not inherit entries (such as `/FlateDecode`) from the
 * previous object.
 *
 * @param {Buffer} buf - Raw PDF bytes.
 * @param {number} streamStart - Offset of the `stream` keyword.
 * @returns {string} Latin-1 text of the look-back window.
 */
function streamDictionaryText(buf, streamStart) {
  const windowStart = Math.max(0, streamStart - 500);
  const text = buf.subarray(windowStart, streamStart).toString('latin1');
  const previousEnd = Math.max(text.lastIndexOf('endobj'), text.lastIndexOf('endstream'));
  return previousEnd === -1 ? text : text.slice(previousEnd);
}

/**
 * Extracts text from a PDF by scanning for `stream … endstream` sections,
 * inflating those whose dictionary mentions `FlateDecode`, and passing each
 * decoded stream to `extractTextOps`.
 *
 * Streams that fail to decode are skipped. Non-empty results are joined with
 * a blank line between streams.
 *
 * @param {Buffer} buf - Raw PDF bytes.
 * @returns {string} Extracted text, or `''` when no stream yields text.
 * @throws {Error} When more than MAX_STREAMS streams are found, or when the
 *   decoded size exceeds MAX_STREAM_DECODED (per stream) or MAX_TOTAL_DECODED
 *   (whole document). Size errors raised by `inflateSync` are rethrown as-is.
 */
export function extractPdfText(buf) {
  const pages = [];
  let streamCount = 0;
  let totalDecoded = 0;

  let pos = 0;
  while (pos < buf.length) {
    const keyword = buf.indexOf('stream', pos);
    if (keyword === -1) break;

    // The keyword must be followed by LF or CRLF; anything else is not the
    // start of stream data.
    let dataStart = -1;
    if (buf[keyword + 6] === 0x0a) {
      dataStart = keyword + 7;
    } else if (buf[keyword + 6] === 0x0d && buf[keyword + 7] === 0x0a) {
      dataStart = keyword + 8;
    }

    // Never treat the tail of an `endstream` keyword as a stream start.
    const isEndKeyword =
      keyword >= 3 && buf.subarray(keyword - 3, keyword).toString('latin1') === 'end';

    if (dataStart === -1 || isEndKeyword) {
      pos = keyword + 6;
      continue;
    }

    // Start one byte early so the EOL that follows `stream` can double as the
    // EOL before `endstream` when the stream is empty.
    const endStream = buf.indexOf('\nendstream', dataStart - 1);
    if (endStream === -1) break;

    const streamData = buf.subarray(dataStart, Math.max(dataStart, endStream));
    streamCount++;
    if (streamCount > MAX_STREAMS) throw new Error('PDF stream count exceeds safety limit');

    const isFlate = streamDictionaryText(buf, keyword).includes('FlateDecode');

    let decoded;
    try {
      if (isFlate) {
        const inflated = inflateSync(streamData, { maxOutputLength: MAX_STREAM_DECODED });
        if (inflated.length > MAX_STREAM_DECODED) throw new Error('PDF decoded content exceeds safety limit');
        totalDecoded += inflated.length;
        if (totalDecoded > MAX_TOTAL_DECODED) throw new Error('PDF decoded content exceeds safety limit');
        decoded = inflated.toString('latin1');
      } else {
        totalDecoded += streamData.length;
        if (streamData.length > MAX_STREAM_DECODED || totalDecoded > MAX_TOTAL_DECODED) throw new Error('PDF decoded content exceeds safety limit');
        decoded = streamData.toString('latin1');
      }
    } catch (e) {
      // Safety-limit violations abort the whole extraction; any other decode
      // failure only skips the current stream.
      if (/limit|exceed|maxOutputLength|Buffer larger/i.test(e.message) || e.code === 'ERR_BUFFER_TOO_LARGE') throw e;
      pos = endStream + 10;
      continue;
    }

    const text = extractTextOps(decoded).trim();
    if (text) pages.push(text);

    // '\nendstream' is 10 bytes long.
    pos = endStream + 10;
  }

  return pages.join('\n\n');
}
|
|
| |
| |
| |
| |
// One character of a PDF literal string: an escape pair or any character that
// is not a backslash or parenthesis.
const PDF_STRING_ATOM = String.raw`\\[\s\S]|[^\\()]`;
// Body of a literal string, allowing one level of balanced unescaped
// parentheses. The alternatives start with disjoint characters, so matching
// cannot backtrack exponentially.
const PDF_STRING_BODY = String.raw`(?:${PDF_STRING_ATOM}|\((?:${PDF_STRING_ATOM})*\))*`;
// `(string) Tj` (group 1) or `[ … ] TJ` (group 2). The array operator is
// matched case-insensitively, as the previous implementation did.
const SHOW_TEXT_OPS = new RegExp(
  String.raw`\((${PDF_STRING_BODY})\)\s*Tj|\[((?:[^[\]()\\]|\(${PDF_STRING_BODY}\))*)\]\s*[Tt][Jj]`,
  'g',
);
// Elements of a TJ array: a literal string (group 1) or a number (group 2).
const TJ_ARRAY_PARTS = new RegExp(String.raw`\((${PDF_STRING_BODY})\)|(-?\d+(?:\.\d+)?)`, 'g');
// Two numeric operands followed by Td / TD.
const MOVES_TEXT_POSITION = /\d+\s+-?\d+(?:\.\d+)?\s+T[dD]/;

/**
 * Pulls the shown text out of a decoded content stream.
 *
 * Each `BT … ET` block is scanned for `Tj` and `TJ` operators in the order
 * they appear. In a `TJ` array, a number below -100 is emitted as a space.
 * When a block contains a `Td` / `TD` operator, the text accumulated so far is
 * flushed as one output line after that block.
 *
 * @param {string} stream - Decoded content stream (Latin-1 text).
 * @returns {string} Extracted lines joined with `\n`, or `''` when the stream
 *   has no `BT … ET` block.
 */
function extractTextOps(stream) {
  const btBlocks = stream.match(/BT[\s\S]*?ET/g);
  if (!btBlocks) return '';

  const lines = [];
  let currentLine = '';

  for (const block of btBlocks) {
    for (const op of block.matchAll(SHOW_TEXT_OPS)) {
      if (op[1] !== undefined) {
        currentLine += decodePdfString(op[1]);
        continue;
      }
      for (const part of op[2].matchAll(TJ_ARRAY_PARTS)) {
        if (part[1] !== undefined) {
          currentLine += decodePdfString(part[1]);
        } else if (Number.parseFloat(part[2]) < -100) {
          // A large negative adjustment is treated as a word gap.
          currentLine += ' ';
        }
      }
    }

    if (MOVES_TEXT_POSITION.test(block) && currentLine.trim()) {
      lines.push(currentLine.trim());
      currentLine = '';
    }
  }

  if (currentLine.trim()) lines.push(currentLine.trim());
  return lines.join('\n');
}
|
|
| |
| |
| |
| function decodePdfString(s) { |
| return s.replace(/\\([nrtbf()\\]|\d{1,3})/g, (_, c) => { |
| if (c === 'n') return '\n'; |
| if (c === 'r') return '\r'; |
| if (c === 't') return '\t'; |
| if (c === 'b') return '\b'; |
| if (c === 'f') return '\f'; |
| if (c === '(' || c === ')' || c === '\\') return c; |
| return String.fromCharCode(parseInt(c, 8)); |
| }); |
| } |
|
|
| |
| |
| |
| |
| |
/**
 * Decodes a base64 payload and, when it starts with the `%PDF-` signature,
 * extracts its text with `extractPdfText`.
 *
 * @param {string} base64Data - Base64-encoded file contents.
 * @returns {{ text: string, pageCount: number } | null}
 *   - `null` when the payload is not a PDF or extraction fails for a reason
 *     other than a safety limit;
 *   - `{ text: '', pageCount: 0 }` when the PDF yields no text;
 *   - a placeholder text with `pageCount: 0` when a safety limit was hit;
 *   - otherwise the extracted text and the number of `/Type /Page` markers
 *     found in the raw bytes.
 */
export function tryExtractPdf(base64Data) {
  try {
    const buf = Buffer.from(base64Data, 'base64');
    const hasPdfSignature = buf.length >= 5 && buf.subarray(0, 5).toString() === '%PDF-';
    if (!hasPdfSignature) return null;

    const text = extractPdfText(buf);
    if (!text.trim()) {
      log.warn('PDF has no extractable text layer (scanned/image-only PDF)');
      return { text: '', pageCount: 0 };
    }

    // `\b` keeps `/Type /Pages` (the page-tree node) out of the count.
    const pageMarkers = buf.toString('latin1').match(/\/Type\s*\/Page\b/g);
    return { text, pageCount: pageMarkers ? pageMarkers.length : 0 };
  } catch (e) {
    log.warn(`PDF extraction failed: ${e.message}`);
    const hitSafetyLimit =
      /exceeds safety limit|maxOutputLength|too large|Buffer larger/i.test(e.message) ||
      e.code === 'ERR_BUFFER_TOO_LARGE';
    if (hitSafetyLimit) {
      return { text: 'PDF 内容无法提取', pageCount: 0 };
    }
    return null;
  }
}
|
|