Spaces:

Ac66
/

W

Sleeping

File size: 5,486 Bytes

2b64d42

/**
 * Zero-dependency PDF text extraction.
 *
 * Handles PDF 1.x text streams: decompress FlateDecode streams with
 * Node.js built-in zlib, then extract text from Tj/TJ operators.
 * Not a full PDF parser — designed for text-layer PDFs (reports, docs).
 * Scanned PDFs (image-only) return empty text.
 */

import { inflateSync } from 'zlib';
import { log } from './config.js';

// Safety limits applied while scanning a PDF, so a hostile or corrupt file
// cannot exhaust memory or CPU.

// Maximum number of stream...endstream blocks processed per PDF.
const MAX_STREAMS = 200;
// Maximum decoded size of a single stream, in bytes (5 MiB).
const MAX_STREAM_DECODED = 5 * 1024 * 1024;
// Maximum cumulative decoded size across all streams of one PDF, in bytes (25 MiB).
const MAX_TOTAL_DECODED = 25 * 1024 * 1024;

/**
 * Extract text from a PDF buffer.
 *
 * Walks every `stream ... endstream` block, inflates it when its own
 * dictionary mentions FlateDecode, and collects the text found in each
 * decoded stream. The text of different streams is joined with a blank line.
 *
 * Both `stream\n` and `stream\r\n` keyword endings are accepted. Streams
 * that fail to inflate are skipped; only safety-limit violations abort the
 * whole extraction.
 *
 * @param {Buffer} buf - Raw PDF bytes
 * @returns {string} Extracted text, or empty string if no text layer
 * @throws {Error} If the stream count, a single decoded stream, or the total
 *   decoded size exceeds the module's safety limits
 */
export function extractPdfText(buf) {
  const pages = [];
  let streamCount = 0;
  let totalDecoded = 0;

  let pos = 0;
  while (pos < buf.length) {
    const keyword = buf.indexOf('stream', pos);
    if (keyword === -1) break;
    const afterKeyword = keyword + 6;

    // The keyword must be followed by LF or CRLF; anything else is just the
    // word "stream" occurring inside other data.
    let dataStart;
    if (buf[afterKeyword] === 0x0a) {
      dataStart = afterKeyword + 1;
    } else if (buf[afterKeyword] === 0x0d && buf[afterKeyword + 1] === 0x0a) {
      dataStart = afterKeyword + 2;
    } else {
      pos = afterKeyword;
      continue;
    }

    // "endstream" closes a stream; it never opens one.
    if (keyword >= 3 && buf.toString('latin1', keyword - 3, keyword) === 'end') {
      pos = afterKeyword;
      continue;
    }

    // Start one byte early so the keyword's own line break can serve as the
    // terminator's "\n" when the stream is empty ("stream\nendstream").
    const endStream = buf.indexOf('\nendstream', dataStart - 1);
    if (endStream === -1) break;

    const streamData = buf.subarray(dataStart, Math.max(dataStart, endStream));
    const nextPos = endStream + 10; // skip "\nendstream"
    streamCount++;
    if (streamCount > MAX_STREAMS) throw new Error('PDF stream count exceeds safety limit');

    // Look back for this stream's dictionary, but never past the end of the
    // previous object: a preceding stream's /FlateDecode must not leak into
    // the decision for this one.
    const lookback = buf.subarray(Math.max(0, keyword - 500), keyword).toString('latin1');
    const previousEnd = Math.max(lookback.lastIndexOf('endobj'), lookback.lastIndexOf('endstream'));
    const dictText = previousEnd === -1 ? lookback : lookback.slice(previousEnd);
    const isFlate = dictText.includes('FlateDecode');

    let decoded;
    try {
      if (isFlate) {
        const inflated = inflateSync(streamData, { maxOutputLength: MAX_STREAM_DECODED });
        if (inflated.length > MAX_STREAM_DECODED) throw new Error('PDF decoded content exceeds safety limit');
        totalDecoded += inflated.length;
        if (totalDecoded > MAX_TOTAL_DECODED) throw new Error('PDF decoded content exceeds safety limit');
        decoded = inflated.toString('latin1');
      } else {
        totalDecoded += streamData.length;
        if (streamData.length > MAX_STREAM_DECODED || totalDecoded > MAX_TOTAL_DECODED) throw new Error('PDF decoded content exceeds safety limit');
        decoded = streamData.toString('latin1');
      }
    } catch (e) {
      // Limit violations must reach the caller; any other failure (corrupt
      // or non-zlib data) only costs us this one stream.
      if (/limit|exceed|maxOutputLength|Buffer larger/i.test(e.message) || e.code === 'ERR_BUFFER_TOO_LARGE') throw e;
      pos = nextPos;
      continue;
    }

    // Extract text from PDF operators
    const text = extractTextOps(decoded).trim();
    if (text) pages.push(text);

    pos = nextPos;
  }

  return pages.join('\n\n');
}

/**
 * Find the index of the ")" that closes the literal string opened at `open`.
 * Honors backslash escapes and balanced nested parentheses; returns
 * `src.length` when the string is never closed.
 */
function findLiteralStringEnd(src, open) {
  let depth = 0;
  for (let i = open; i < src.length; i++) {
    const ch = src[i];
    if (ch === '\\') {
      i++; // the escaped character can never open or close the string
    } else if (ch === '(') {
      depth++;
    } else if (ch === ')' && --depth === 0) {
      return i;
    }
  }
  return src.length;
}

/**
 * Extract text from PDF content stream operators.
 *
 * The stream is scanned once, token by token, so operators are processed in
 * document order and string contents (which may contain "ET", brackets or
 * escaped parentheses) are never mistaken for operators.
 *
 * Handled inside BT...ET text objects:
 *  - `(text) Tj`, `(text) '`, `(text) "` — show a string
 *  - `[(a) -250 (b)] TJ` — show strings; a kern below -100 becomes a space
 *  - `Td` / `TD` with a non-zero vertical offset, `T*`, `'`, `"` — start a new line
 * Hex strings (`<...>`) are skipped. Text objects that contain no line-moving
 * operator keep accumulating onto the same output line.
 *
 * @param {string} stream - Decoded content stream (latin1 text)
 * @returns {string} Extracted lines joined with "\n"; "" when there is no text
 */
function extractTextOps(stream) {
  const lines = [];
  let currentLine = '';

  const flushLine = () => {
    const trimmed = currentLine.trim();
    if (trimmed) lines.push(trimmed);
    currentLine = '';
  };
  const showString = (operand) => {
    if (operand !== null && typeof operand === 'object' && typeof operand.str === 'string') {
      currentLine += decodePdfString(operand.str);
    }
  };

  // A run of "regular" characters: everything up to PDF whitespace or a delimiter.
  const tokenRe = /[^\0\t\n\f\r ()<>\[\]{}\/%]+/y;
  const numberRe = /^[+-]?(?:\d+\.?\d*|\.\d+)$/;

  let inText = false; // between BT and ET
  let sawMove = false; // current text object contained a line-moving operator
  let operands = []; // operands collected since the previous operator
  let array = null; // items of the array currently being read, if any
  let i = 0;

  while (i < stream.length) {
    const ch = stream[i];

    if (ch === '/') {
      // Name object (e.g. a font called /ET): consume it so it is never
      // mistaken for an operator.
      tokenRe.lastIndex = i + 1;
      i = tokenRe.exec(stream) === null ? i + 1 : tokenRe.lastIndex;
      continue;
    }

    if (inText) {
      if (ch === '(') {
        const end = findLiteralStringEnd(stream, i);
        (array || operands).push({ str: stream.slice(i + 1, end) });
        i = end + 1;
        continue;
      }
      if (ch === '[') {
        array = [];
        i++;
        continue;
      }
      if (ch === ']') {
        if (array) operands.push(array);
        array = null;
        i++;
        continue;
      }
      if (ch === '<') {
        if (stream[i + 1] === '<') {
          i += 2; // dictionary opener, not a hex string
        } else {
          const close = stream.indexOf('>', i + 1);
          i = close === -1 ? stream.length : close + 1;
        }
        continue;
      }
    }

    tokenRe.lastIndex = i;
    const match = tokenRe.exec(stream);
    if (match === null) {
      i++; // whitespace or a delimiter we do not interpret
      continue;
    }
    i = tokenRe.lastIndex;
    const token = match[0];

    if (!inText) {
      if (token === 'BT') {
        inText = true;
        sawMove = false;
        operands = [];
        array = null;
      }
      continue;
    }

    if (numberRe.test(token)) {
      (array || operands).push(parseFloat(token));
      continue;
    }

    // Any other token is an operator. An array still open at this point was
    // never closed, so drop it instead of letting it swallow the operator.
    array = null;
    const last = operands[operands.length - 1];
    switch (token) {
      case 'ET':
        inText = false;
        if (sawMove) flushLine();
        break;
      case 'Tj':
        showString(last);
        break;
      case "'":
      case '"':
        sawMove = true;
        flushLine();
        showString(last);
        break;
      case 'TJ':
        if (Array.isArray(last)) {
          for (const item of last) {
            if (typeof item === 'number') {
              // Large negative kerning is how PDFs usually encode a word gap.
              if (item < -100) currentLine += ' ';
            } else {
              showString(item);
            }
          }
        }
        break;
      case 'Td':
      case 'TD':
        sawMove = true;
        // The last operand is the vertical offset; zero means same line.
        if (typeof last === 'number' && last !== 0) flushLine();
        break;
      case 'T*':
        sawMove = true;
        flushLine();
        break;
      default:
        break;
    }
    operands = [];
  }

  flushLine();
  return lines.join('\n');
}

/**
 * Decode the escape sequences of a PDF literal string.
 *
 * - `\n`, `\r`, `\t`, `\b`, `\f` become the matching control character
 * - `\(`, `\)`, `\\` become the literal character
 * - `\ddd` (one to three octal digits 0-7) becomes that byte; overflow above
 *   0o377 is truncated to the low 8 bits
 * - a backslash followed by a line break is a line continuation and is removed
 * - for any other character the backslash is dropped and the character kept
 *
 * @param {string} s - Raw string content, without the enclosing parentheses
 * @returns {string} Decoded string
 */
function decodePdfString(s) {
  return s.replace(/\\([0-7]{1,3}|\r\n|[\s\S])/g, (_, c) => {
    switch (c) {
      case 'n': return '\n';
      case 'r': return '\r';
      case 't': return '\t';
      case 'b': return '\b';
      case 'f': return '\f';
      case '\r\n':
      case '\r':
      case '\n':
        return ''; // line continuation
      default:
        break;
    }
    if (/^[0-7]+$/.test(c)) return String.fromCharCode(parseInt(c, 8) & 0xff);
    // Covers \( \) \\ and unknown escapes: keep the character, drop the backslash.
    return c;
  });
}

/**
 * Decode a base64 payload and, if it is a PDF, pull out its text layer.
 *
 * Never throws: every failure is logged through `log.warn` and mapped to a
 * return value.
 *
 * @param {string} base64Data - Base64 encoded PDF
 * @returns {{ text: string, pageCount: number } | null}
 *   - `null` when the payload does not start with `%PDF-` or extraction
 *     fails for a reason other than a safety limit
 *   - `{ text: '', pageCount: 0 }` when no text could be extracted
 *   - a placeholder text with `pageCount: 0` when a safety limit was hit
 *   - otherwise the extracted text and the number of `/Type /Page` entries
 */
export function tryExtractPdf(base64Data) {
  try {
    const pdfBytes = Buffer.from(base64Data, 'base64');

    const hasPdfHeader = pdfBytes.length >= 5 && pdfBytes.subarray(0, 5).toString() === '%PDF-';
    if (!hasPdfHeader) {
      return null;
    }

    const text = extractPdfText(pdfBytes);
    if (text.trim()) {
      // \b keeps "/Type /Pages" (the page-tree node) out of the count.
      const pageMarkers = pdfBytes.toString('latin1').match(/\/Type\s*\/Page\b/g);
      const pageCount = pageMarkers === null ? 0 : pageMarkers.length;
      return { text, pageCount };
    }

    log.warn('PDF has no extractable text layer (scanned/image-only PDF)');
    return { text: '', pageCount: 0 };
  } catch (e) {
    log.warn(`PDF extraction failed: ${e.message}`);

    const hitSafetyLimit =
      /exceeds safety limit|maxOutputLength|too large|Buffer larger/i.test(e.message) ||
      e.code === 'ERR_BUFFER_TOO_LARGE';
    return hitSafetyLimit ? { text: 'PDF 内容无法提取', pageCount: 0 } : null;
  }
}