Spaces:

Ac66
/

W

Sleeping

App Files Files Community

W / src /pdf.js

Ac66's picture

Upload folder using huggingface_hub

2b64d42 verified 10 days ago

history blame contribute delete

5.49 kB

	/**
	* Zero-dependency PDF text extraction.
	*
	* Handles PDF 1.x text streams: decompress FlateDecode streams with
	* Node.js built-in zlib, then extract text from Tj/TJ operators.
	* Not a full PDF parser — designed for text-layer PDFs (reports, docs).
	* Scanned PDFs (image-only) return empty text.
	*/

	import { inflateSync } from 'zlib';
	import { log } from './config.js';

	// Safety limits that bound how much work extractPdfText does on one PDF.
	// Exceeding any of them makes extractPdfText throw instead of continuing.

	// Maximum number of stream...endstream sections that may be processed.
	const MAX_STREAMS = 200;
	// Maximum size in bytes of a single stream after decoding (5 MiB).
	const MAX_STREAM_DECODED = 5 * 1024 * 1024;
	// Maximum combined size in bytes of all decoded streams (25 MiB).
	const MAX_TOTAL_DECODED = 25 * 1024 * 1024;

	/**
	 * Extract text from a PDF buffer.
	 *
	 * Scans the raw bytes for `stream ... endstream` sections, inflates the ones
	 * whose dictionary mentions FlateDecode, and passes every decoded stream to
	 * extractTextOps. Streams that fail to inflate are skipped silently; streams
	 * without FlateDecode are scanned as-is.
	 *
	 * @param {Buffer} buf - Raw PDF bytes
	 * @returns {string} Text of every text-bearing stream, joined by blank lines,
	 *   or an empty string if no text layer was found
	 * @throws {Error} When the stream count or the decoded size exceeds the
	 *   module's safety limits (message contains "exceeds safety limit"), or when
	 *   zlib refuses to inflate past the per-stream output cap
	 */
	export function extractPdfText(buf) {
		const KEYWORD = 'stream';
		const END_MARKER = '\nendstream';
		const DICT_LOOKBACK = 500;

		const chunks = [];
		let streamCount = 0;
		let totalDecoded = 0;
		let lastStreamEnd = 0; // offset just past the previous accepted "endstream"
		let pos = 0;

		while (pos < buf.length) {
			const keywordAt = buf.indexOf(KEYWORD, pos);
			if (keywordAt === -1) break;

			// The keyword must be followed by LF or CRLF; the data starts right after.
			let dataStart = keywordAt + KEYWORD.length;
			if (buf[dataStart] === 0x0d && buf[dataStart + 1] === 0x0a) {
				dataStart += 2;
			} else if (buf[dataStart] === 0x0a) {
				dataStart += 1;
			} else {
				pos = keywordAt + KEYWORD.length;
				continue;
			}

			// "endstream" also ends in "stream" + EOL; never treat it as a start marker.
			if (keywordAt >= 3 && buf.subarray(keywordAt - 3, keywordAt).toString('latin1') === 'end') {
				pos = keywordAt + KEYWORD.length;
				continue;
			}

			// Start one byte early so an empty stream ("stream\nendstream") still
			// finds its own terminator instead of a later one.
			const endAt = buf.indexOf(END_MARKER, dataStart - 1);
			if (endAt === -1) break;

			const streamData = buf.subarray(dataStart, Math.max(dataStart, endAt));
			const streamEnd = endAt + END_MARKER.length;

			streamCount++;
			if (streamCount > MAX_STREAMS) throw new Error('PDF stream count exceeds safety limit');

			// Look back at the stream dictionary for FlateDecode, but never past the
			// end of the previous stream, whose dictionary belongs to another object.
			const dictStart = Math.max(lastStreamEnd, keywordAt - DICT_LOOKBACK);
			const isFlate = buf.subarray(dictStart, keywordAt).toString('latin1').includes('FlateDecode');

			lastStreamEnd = streamEnd;
			pos = streamEnd;

			let bytes = streamData;
			if (isFlate) {
				try {
					bytes = inflateSync(streamData, { maxOutputLength: MAX_STREAM_DECODED });
				} catch (e) {
					// Size-limit violations abort the whole extraction; any other zlib
					// failure (corrupt or mislabelled data) just skips this stream.
					if (e.code === 'ERR_BUFFER_TOO_LARGE' || /limit|exceed|maxOutputLength|Buffer larger/i.test(e.message)) throw e;
					continue;
				}
			}

			if (bytes.length > MAX_STREAM_DECODED) throw new Error('PDF decoded content exceeds safety limit');
			totalDecoded += bytes.length;
			if (totalDecoded > MAX_TOTAL_DECODED) throw new Error('PDF decoded content exceeds safety limit');

			// latin1 maps every byte to one char, so operator scanning sees raw bytes.
			const text = extractTextOps(bytes.toString('latin1')).trim();
			if (text) chunks.push(text);
		}

		return chunks.join('\n\n');
	}

	/**
	 * Extract text from the operators of a decoded PDF content stream.
	 *
	 * Handles `(string) Tj` and `[(string) kern ...] TJ` inside BT...ET text
	 * objects, in the order they appear. A kerning value below -100 inside a TJ
	 * array is emitted as a space. A text object that contains a Td/TD
	 * positioning operator ends the current output line; text from objects
	 * without one keeps accumulating on the same line.
	 *
	 * @param {string} stream - Decoded content stream, one char per byte
	 * @returns {string} Extracted lines joined with "\n", or "" if nothing was found
	 */
	function extractTextOps(stream) {
		// Word boundaries keep an "ET" inside a string such as "(MARKET)" from
		// being mistaken for the end of the text object.
		const btBlocks = stream.match(/\bBT\b[\s\S]*?\bET\b/g);
		if (!btBlocks) return '';

		// A literal string is "(" ... ")" in which a backslash escapes the next
		// character, so "\)" does not terminate it.
		//   group 1: the string operand of Tj
		//   group 2: the array operand of TJ
		// Every alternative starts with a different character, which keeps matching
		// free of the exponential backtracking that nested "*" groups allow.
		const showOpRe = /\(((?:\\[\s\S]|[^\\)])*)\)\s*Tj|\[((?:\((?:\\[\s\S]|[^\\)])*\)|[^[\]()])*)\]\s*TJ/g;
		// Elements of a TJ array: a literal string (group 1) or a number (group 2).
		const arrayItemRe = /\(((?:\\[\s\S]|[^\\)])*)\)|(-?\d+(?:\.\d+)?)/g;
		// Two numeric operands followed by Td or TD.
		const moveOpRe = /\d+\s+(?:-?\d+(?:\.\d+)?)\s+T[dD]/;

		const lines = [];
		let currentLine = '';

		for (const block of btBlocks) {
			for (const op of block.matchAll(showOpRe)) {
				if (op[1] !== undefined) {
					currentLine += decodePdfString(op[1]);
					continue;
				}
				for (const item of op[2].matchAll(arrayItemRe)) {
					if (item[1] !== undefined) {
						currentLine += decodePdfString(item[1]);
					} else if (parseFloat(item[2]) < -100) {
						// A large negative adjustment is treated as a word gap.
						currentLine += ' ';
					}
				}
			}

			// New-line heuristic: a positioning operator anywhere in this text
			// object closes the line built so far.
			if (moveOpRe.test(block) && currentLine.trim()) {
				lines.push(currentLine.trim());
				currentLine = '';
			}
		}

		if (currentLine.trim()) lines.push(currentLine.trim());
		return lines.join('\n');
	}

	/**
	 * Decode the backslash escapes of a PDF literal string.
	 *
	 * Supported: \n, \r, \t, \b, \f, \(, \), \\, one to three octal digits
	 * (only the low 8 bits are kept), and a backslash directly before a line
	 * break, which is a line continuation and yields nothing. For any other
	 * character the backslash is dropped and the character kept.
	 *
	 * @param {string} s - String contents without the enclosing parentheses
	 * @returns {string} The string with escapes resolved
	 */
	function decodePdfString(s) {
		return s.replace(/\\([0-7]{1,3}|\r\n|[\s\S])/g, (_, esc) => {
			switch (esc) {
				case 'n': return '\n';
				case 'r': return '\r';
				case 't': return '\t';
				case 'b': return '\b';
				case 'f': return '\f';
				case '\n':
				case '\r':
				case '\r\n':
					return ''; // line continuation
				default:
					break;
			}
			// Octal character code; 8 and 9 are not octal digits and fall through.
			if (/^[0-7]+$/.test(esc)) return String.fromCharCode(Number.parseInt(esc, 8) & 0xff);
			// \( \) \\ and unrecognised escapes: keep the character, drop the backslash.
			return esc;
		});
	}

	/**
	 * Try to extract text from a base64-encoded PDF.
	 *
	 * Never throws: every failure is logged through log.warn and turned into a
	 * return value.
	 *
	 * @param {string} base64Data - Base64 encoded PDF
	 * @returns {{ text: string, pageCount: number } | null}
	 *   - null when the data does not start with "%PDF-" or extraction failed
	 *     for a reason other than a safety limit
	 *   - { text: '', pageCount: 0 } when the PDF has no extractable text
	 *   - a placeholder text with pageCount 0 when a safety limit was exceeded
	 *   - otherwise the extracted text and the number of "/Type /Page" entries
	 *     found in the raw bytes
	 */
	export function tryExtractPdf(base64Data) {
		try {
			const buf = Buffer.from(base64Data, 'base64');
			const hasPdfHeader = buf.length >= 5 && buf.subarray(0, 5).toString() === '%PDF-';
			if (!hasPdfHeader) return null;

			const text = extractPdfText(buf);
			if (!text.trim()) {
				log.warn('PDF has no extractable text layer (scanned/image-only PDF)');
				return { text: '', pageCount: 0 };
			}

			// \b keeps "/Type /Pages" (the page tree node) out of the count.
			const pageCount = buf.toString('latin1').match(/\/Type\s*\/Page\b/g)?.length ?? 0;
			return { text, pageCount };
		} catch (e) {
			log.warn(`PDF extraction failed: ${e.message}`);
			// Safety-limit failures get a visible placeholder rather than null.
			// The placeholder reads "PDF content cannot be extracted".
			if (/exceeds safety limit|maxOutputLength|too large|Buffer larger/i.test(e.message) || e.code === 'ERR_BUFFER_TOO_LARGE') {
				return { text: 'PDF 内容无法提取', pageCount: 0 };
			}
			return null;
		}
	}