/**
 * Zero-dependency PDF text extraction.
 *
 * Handles PDF 1.x text streams: decompresses FlateDecode streams with
 * Node.js built-in zlib, then extracts text from Tj/TJ operators.
 * Not a full PDF parser — designed for text-layer PDFs (reports, docs).
 * Scanned (image-only) PDFs return empty text.
 */
import { inflateSync } from 'zlib';
import { log } from './config.js';
// Safety limits that bound how much work a single PDF may trigger.
/** Maximum number of stream sections processed per document. */
const MAX_STREAMS = 200;
/** Maximum decoded size of one stream, in bytes (5 MiB). */
const MAX_STREAM_DECODED = 5 * 2 ** 20;
/** Maximum combined decoded size of all streams, in bytes (25 MiB). */
const MAX_TOTAL_DECODED = 25 * 2 ** 20;
/**
 * Extract text from a PDF buffer.
 *
 * Scans the raw bytes for `stream … endstream` sections, inflates the ones
 * whose dictionary mentions FlateDecode, and passes every decoded stream to
 * extractTextOps. Streams that fail to decode are skipped (best effort).
 *
 * @param {Buffer} buf - Raw PDF bytes
 * @returns {string} Extracted text (text-bearing streams joined by a blank
 *   line), or empty string if no text layer
 * @throws {Error} When the stream count or the decoded size exceeds the
 *   module's safety limits
 */
export function extractPdfText(buf) {
  const KEYWORD = 'stream';
  const END_MARKER = '\nendstream';
  // How far back from the "stream" keyword to look for the stream dictionary.
  const DICT_LOOKBACK = 500;

  const pages = [];
  let streamCount = 0;
  let totalDecoded = 0;
  // Offset just past the previous stream; the current dictionary starts after it.
  let prevStreamEnd = 0;
  let pos = 0;

  while (pos < buf.length) {
    const streamStart = buf.indexOf(KEYWORD, pos);
    if (streamStart === -1) break;
    const afterKeyword = streamStart + KEYWORD.length;

    // The keyword must be followed by LF or CRLF; anything else is not a stream start.
    let dataStart = -1;
    if (buf[afterKeyword] === 0x0a) {
      dataStart = afterKeyword + 1;
    } else if (buf[afterKeyword] === 0x0d && buf[afterKeyword + 1] === 0x0a) {
      dataStart = afterKeyword + 2;
    }
    // Never treat the tail of an "endstream" keyword as the start of a new stream.
    const isEndKeyword =
      streamStart >= 3 && buf.toString('latin1', streamStart - 3, streamStart) === 'end';
    if (dataStart === -1 || isEndKeyword) {
      pos = afterKeyword;
      continue;
    }

    // Start one byte early so the EOL after "stream" can double as the EOL
    // before "endstream" (empty stream) instead of running into the next stream.
    const endStream = buf.indexOf(END_MARKER, dataStart - 1);
    if (endStream === -1) break;
    const streamData = buf.subarray(dataStart, Math.max(dataStart, endStream));

    streamCount++;
    if (streamCount > MAX_STREAMS) throw new Error('PDF stream count exceeds safety limit');

    // Look back for FlateDecode in this stream's dictionary, but never past the
    // end of the previous stream, whose dictionary belongs to another object.
    const dictStart = Math.max(prevStreamEnd, streamStart - DICT_LOOKBACK);
    const isFlate = buf.toString('latin1', dictStart, streamStart).includes('FlateDecode');

    pos = endStream + END_MARKER.length;
    prevStreamEnd = pos;

    let decoded;
    try {
      if (isFlate) {
        const inflated = inflateSync(streamData, { maxOutputLength: MAX_STREAM_DECODED });
        if (inflated.length > MAX_STREAM_DECODED) throw new Error('PDF decoded content exceeds safety limit');
        totalDecoded += inflated.length;
        if (totalDecoded > MAX_TOTAL_DECODED) throw new Error('PDF decoded content exceeds safety limit');
        decoded = inflated.toString('latin1');
      } else {
        totalDecoded += streamData.length;
        if (streamData.length > MAX_STREAM_DECODED || totalDecoded > MAX_TOTAL_DECODED) throw new Error('PDF decoded content exceeds safety limit');
        decoded = streamData.toString('latin1');
      }
    } catch (e) {
      // Safety-limit violations abort the whole extraction; any other decode
      // failure (corrupt data, unsupported filter) only skips this stream.
      if (/limit|exceed|maxOutputLength|Buffer larger/i.test(e.message) || e.code === 'ERR_BUFFER_TOO_LARGE') throw e;
      continue;
    }

    // Extract text from PDF operators
    const text = extractTextOps(decoded).trim();
    if (text) pages.push(text);
  }
  return pages.join('\n\n');
}
/** Sticky matcher for one numeric element of a TJ array. */
const TJ_NUMBER = /-?\d+(?:\.\d+)?/y;

/**
 * Extract text from PDF content stream operators.
 *
 * Handles `(text) Tj` and `[(text) kern …] TJ` inside BT…ET text objects, in
 * the order they appear. A TJ kerning value below -100 is rendered as a space.
 * New-line heuristic: a text object that contains a Td/TD operator ends the
 * current output line; text objects without one are appended to it.
 *
 * The stream is scanned by hand rather than with nested-quantifier regexes so
 * that hostile input cannot trigger catastrophic backtracking.
 *
 * @param {string} stream - Decoded content stream (latin1 text)
 * @returns {string} Extracted lines joined by "\n", or '' when the stream has
 *   no BT…ET text object
 */
function extractTextOps(stream) {
  const lines = [];
  let currentLine = '';
  let cursor = 0;
  while (cursor < stream.length) {
    // Same pairing as a lazy /BT[\s\S]*?ET/ match, found in linear time.
    const begin = stream.indexOf('BT', cursor);
    if (begin === -1) break;
    const end = stream.indexOf('ET', begin + 2);
    if (end === -1) break;
    cursor = end + 2;
    const block = stream.slice(begin, cursor);

    currentLine += collectShownText(block);

    // Td/TD — text positioning (new line heuristic)
    if (/\d\s+-?\d+(?:\.\d+)?\s+T[dD]/.test(block) && currentLine.trim()) {
      lines.push(currentLine.trim());
      currentLine = '';
    }
  }
  if (currentLine.trim()) lines.push(currentLine.trim());
  return lines.join('\n');
}

/** Concatenate the text shown by Tj/TJ operators in one BT…ET block, in source order. */
function collectShownText(block) {
  let text = '';
  let i = 0;
  while (i < block.length) {
    const ch = block[i];
    if (ch === '(') {
      const close = findPdfStringEnd(block, i);
      // Unterminated string: everything after it is string content, so stop.
      if (close === -1) break;
      const op = skipPdfWhitespace(block, close + 1);
      if (block.startsWith('Tj', op)) {
        text += decodePdfString(block.slice(i + 1, close));
        i = op + 2;
      } else {
        i = close + 1;
      }
    } else if (ch === '[') {
      const arr = readTjArray(block, i);
      if (arr === null) {
        i++;
        continue;
      }
      const op = skipPdfWhitespace(block, arr.end);
      if (block.startsWith('TJ', op)) {
        text += arr.text;
        i = op + 2;
      } else {
        i = arr.end;
      }
    } else {
      i++;
    }
  }
  return text;
}

/** Return the index of the ")" closing the literal string opened at `open`, or -1. */
function findPdfStringEnd(src, open) {
  let depth = 0;
  for (let k = open; k < src.length; k++) {
    const ch = src[k];
    if (ch === '\\') {
      k++; // the escaped character never opens or closes a string
    } else if (ch === '(') {
      depth++; // balanced parentheses may appear unescaped inside a string
    } else if (ch === ')' && --depth === 0) {
      return k;
    }
  }
  return -1;
}

/** Return the first index at or after `from` that is not whitespace. */
function skipPdfWhitespace(src, from) {
  let i = from;
  while (i < src.length && /\s/.test(src[i])) i++;
  return i;
}

/**
 * Read the array opened at `open`: strings are decoded, numbers below -100
 * become a space. Returns { text, end } with `end` just past "]", or null if
 * the array is nested, unterminated, or contains an unterminated string.
 */
function readTjArray(src, open) {
  let text = '';
  let i = open + 1;
  while (i < src.length) {
    const ch = src[i];
    if (ch === ']') return { text, end: i + 1 };
    if (ch === '[') return null;
    if (ch === '(') {
      const close = findPdfStringEnd(src, i);
      if (close === -1) return null;
      text += decodePdfString(src.slice(i + 1, close));
      i = close + 1;
      continue;
    }
    TJ_NUMBER.lastIndex = i;
    const num = TJ_NUMBER.exec(src);
    if (num) {
      if (parseFloat(num[0]) < -100) text += ' ';
      i += num[0].length;
    } else {
      i++;
    }
  }
  return null;
}
/**
 * Decode the escape sequences of a PDF literal string.
 *
 * Handles \n, \r, \t, \b, \f, \\, \(, \) and one-to-three digit octal codes
 * (digits 0-7 only, reduced to a single byte). A backslash followed by an
 * end-of-line is a line continuation and yields nothing; a backslash before
 * any other character is dropped and the character is kept.
 *
 * @param {string} s - Raw string content, without the enclosing parentheses
 * @returns {string} Decoded text
 */
function decodePdfString(s) {
  return s.replace(/\\([0-7]{1,3}|\r\n|[\s\S])/g, (_, esc) => {
    switch (esc) {
      case 'n': return '\n';
      case 'r': return '\r';
      case 't': return '\t';
      case 'b': return '\b';
      case 'f': return '\f';
      case '\r\n':
      case '\r':
      case '\n':
        return ''; // line continuation
      default:
        if (/^[0-7]/.test(esc)) {
          // High-order overflow of a three-digit octal code is ignored.
          return String.fromCharCode(Number.parseInt(esc, 8) & 0xff);
        }
        // "(", ")", "\" and any unrecognised escape: keep the character only.
        return esc;
    }
  });
}
/**
 * Decode a base64 payload and, if it is a PDF, pull out its text.
 *
 * Failures are logged as warnings rather than thrown.
 *
 * @param {string} base64Data - Base64 encoded PDF
 * @returns {{ text: string, pageCount: number } | null} The extracted text and
 *   the number of `/Type /Page` markers found; `{ text: '', pageCount: 0 }`
 *   when there is no text layer; a placeholder text with pageCount 0 when a
 *   size/safety limit was hit; `null` when the payload is not a PDF or
 *   extraction failed for another reason
 */
export function tryExtractPdf(base64Data) {
  const limitErrorPattern = /exceeds safety limit|maxOutputLength|too large|Buffer larger/i;
  try {
    const pdfBytes = Buffer.from(base64Data, 'base64');
    const hasPdfHeader = pdfBytes.length >= 5 && pdfBytes.subarray(0, 5).toString() === '%PDF-';
    if (!hasPdfHeader) return null;

    const text = extractPdfText(pdfBytes);
    if (text.trim()) {
      // Count page objects; \b keeps "/Pages" tree nodes out of the tally.
      const pageMarkers = pdfBytes.toString('latin1').match(/\/Type\s*\/Page\b/g);
      return { text, pageCount: pageMarkers ? pageMarkers.length : 0 };
    }

    log.warn('PDF has no extractable text layer (scanned/image-only PDF)');
    return { text: '', pageCount: 0 };
  } catch (err) {
    log.warn(`PDF extraction failed: ${err.message}`);
    const hitSafetyLimit = limitErrorPattern.test(err.message) || err.code === 'ERR_BUFFER_TOO_LARGE';
    return hitSafetyLimit ? { text: 'PDF 内容无法提取', pageCount: 0 } : null;
  }
}
|