File size: 2,128 Bytes
dca8ede
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import * as pdfjsLib from 'pdfjs-dist';
import { join } from 'path';
import { existsSync } from 'fs';
import { mkdir } from 'fs/promises';

// Worker setup for PDF.js: when no global `window` exists (i.e. not in a
// browser), point the library at the worker script shipped inside the
// locally installed pdfjs-dist package, resolved from the current working
// directory.
const WORKER_SRC = join(
  process.cwd(),
  'node_modules',
  'pdfjs-dist',
  'build',
  'pdf.worker.js',
);
const hasWindowGlobal = typeof window !== 'undefined';
if (!hasWindowGlobal) {
  pdfjsLib.GlobalWorkerOptions.workerSrc = WORKER_SRC;
}

/**
 * Extracts the text of every page of a PDF using PDF.js.
 *
 * Pages are read in order. The text items of one page are joined with a
 * single space, and each page's text is followed by a newline.
 *
 * @param dataBuffer - Raw bytes of the PDF file. The bytes are copied before
 *   being handed to PDF.js, so the caller's buffer is left untouched.
 * @returns An object shaped like the result of `pdf-parse`: the extracted
 *   `text`, a `metadata` object (the PDF.js metadata plus the page count),
 *   `numpages`, `numrender` and a fixed `version` string.
 * @throws Rethrows, after logging, any error raised while loading or reading
 *   the document.
 */
export default async function customPdfParse(dataBuffer: Buffer) {
  try {
    console.log("Loading PDF document with PDF.js");

    // Make sure a `temp` directory exists under the working directory.
    // NOTE(review): nothing in this function reads or writes it; it is kept
    // in case other code relies on the directory being created here.
    const tempDir = join(process.cwd(), 'temp');
    if (!existsSync(tempDir)) {
      await mkdir(tempDir, { recursive: true });
    }

    // Hand PDF.js a plain Uint8Array copy rather than the Node Buffer itself:
    // some pdfjs-dist releases refuse Buffer input, and PDF.js may transfer
    // the underlying storage to its worker, which would detach the caller's
    // buffer.
    const loadingTask = pdfjsLib.getDocument({ data: new Uint8Array(dataBuffer) });

    try {
      const pdfDocument = await loadingTask.promise;
      const pageCount = pdfDocument.numPages;

      console.log(`PDF loaded with ${pageCount} pages`);

      // Extract text from all pages (PDF.js page numbers are 1-based)
      let fullText = '';

      for (let pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
        console.log(`Processing page ${pageNumber}/${pageCount}`);
        const page = await pdfDocument.getPage(pageNumber);
        const textContent = await page.getTextContent();

        // Only items that carry a `str` property contribute text; any other
        // item contributes an empty string.
        const pageText = textContent.items
          .map(item => 'str' in item ? item.str : '')
          .join(' ');

        fullText += pageText + '\n';
      }

      console.log(`Extracted ${fullText.length} characters of text`);

      // Metadata must be fetched before the document is destroyed below.
      const info = await pdfDocument.getMetadata();

      // Return data in a format compatible with the original pdf-parse
      return {
        text: fullText,
        metadata: {
          info,
          pageInfo: {
            pageCount
          }
        },
        numpages: pageCount,
        numrender: pageCount,
        version: '1.0.0'
      };
    } finally {
      // Release the document and its worker resources on success and on
      // failure alike. A cleanup failure is logged but must not turn a
      // successful parse into an error or mask the original exception.
      try {
        await loadingTask.destroy();
      } catch (cleanupError) {
        console.error('Error releasing PDF resources:', cleanupError);
      }
    }
  } catch (error) {
    console.error('Error parsing PDF:', error);
    throw error;
  }
}