File size: 2,609 Bytes
dca8ede
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import * as pdfjs from 'pdfjs-dist';
import { join } from 'path';
import { readFile } from 'fs/promises';
import { existsSync } from 'fs';

// PDF.js worker setup.
// The worker script is looked up inside the pdfjs-dist package under the
// current working directory's node_modules folder.
const WORKER_PATH = join(
  process.cwd(),
  ...['node_modules', 'pdfjs-dist', 'build', 'pdf.worker.js'],
);

// Only register the worker path when no global `window` is defined.
if (typeof window === 'undefined') {
  const { GlobalWorkerOptions } = pdfjs;
  GlobalWorkerOptions.workerSrc = WORKER_PATH;
}

/**
 * Reads the raw bytes of the PDF at `pdfPath`.
 *
 * The file is read directly instead of being probed with `existsSync` first:
 * a separate existence check blocks the event loop and can race with the file
 * being removed between the check and the read.
 *
 * @param pdfPath - Path of the file to read.
 * @returns The file contents.
 * @throws Error "PDF file does not exist: <path>" when the read fails with
 *   `ENOENT` or `ENOTDIR`; any other read error is rethrown unchanged.
 */
async function readPdfBytes(pdfPath: string): Promise<Buffer> {
  try {
    return await readFile(pdfPath);
  } catch (error) {
    const isMissingPath =
      error instanceof Error &&
      'code' in error &&
      (error.code === 'ENOENT' || error.code === 'ENOTDIR');
    if (isMissingPath) {
      console.error("PDF file does not exist:", pdfPath);
      throw new Error(`PDF file does not exist: ${pdfPath}`);
    }
    throw error;
  }
}

/**
 * Extracts text from a PDF file on disk.
 *
 * Loads the whole file into memory and hands the bytes to
 * `extractTextFromPdfBuffer`. Progress is logged to the console.
 *
 * @param pdfPath - Path of the PDF file to read.
 * @returns Whatever `extractTextFromPdfBuffer` resolves to for the file's bytes.
 * @throws Error whose message starts with "Failed to extract text from PDF file"
 *   when the file is missing, cannot be read, or text extraction fails.
 */
export async function extractTextFromPdfFile(pdfPath: string): Promise<string> {
  console.log("=== Starting simplified PDF text extraction ===");

  try {
    console.log("PDF path:", pdfPath);

    console.log("Reading file...");
    const data = await readPdfBytes(pdfPath);
    console.log(`Read ${data.length} bytes`);

    return await extractTextFromPdfBuffer(data);
  } catch (error) {
    console.error("Error extracting text from PDF file:", error);
    if (error instanceof Error) {
      // Keep the underlying message so callers can see the root cause.
      throw new Error(`Failed to extract text from PDF file: ${error.message}`);
    }
    throw new Error("Failed to extract text from PDF file");
  }
}

/**
 * Extracts text from an in-memory PDF using PDF.js.
 *
 * Pages are processed in order. For each page, the `str` of every text item is
 * joined with a single space (items without a `str` field contribute an empty
 * string), and every page's text is followed by a blank line (`"\n\n"`).
 *
 * @param data - Raw bytes of the PDF document.
 * @returns The concatenated text of all pages.
 * @throws Error whose message starts with "Failed to extract text from PDF buffer"
 *   when loading the document or reading any page fails.
 */
export async function extractTextFromPdfBuffer(data: Buffer): Promise<string> {
  try {
    console.log(`Processing PDF buffer of size ${data.length} bytes`);

    // Hand PDF.js a plain Uint8Array copy rather than the Node Buffer itself.
    // NOTE(review): some pdfjs-dist releases reject Buffer instances outright,
    // and a Buffer may be a view onto Node's shared allocation pool; a private
    // copy avoids both concerns. Confirm against the installed pdfjs-dist version.
    const loadingTask = pdfjs.getDocument({ data: new Uint8Array(data) });

    try {
      const pdfDocument = await loadingTask.promise;
      const pageCount = pdfDocument.numPages;
      console.log(`PDF loaded successfully with ${pageCount} pages`);

      let fullText = '';

      // PDF.js page numbers are 1-based.
      for (let pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
        console.log(`Processing page ${pageNumber}/${pageCount}`);

        const page = await pdfDocument.getPage(pageNumber);
        const textContent = await page.getTextContent();

        // Only items carrying a `str` field contribute text.
        const pageText = textContent.items
          .map(item => ('str' in item ? item.str : ''))
          .join(' ');

        fullText += pageText + '\n\n';
      }

      console.log(`Extracted ${fullText.length} characters of text`);
      return fullText;
    } finally {
      // Always tear down the loading task so the document and its worker-side
      // resources are released, whether extraction succeeded or failed.
      await loadingTask.destroy();
    }
  } catch (error) {
    console.error("Error extracting text from PDF buffer:", error);
    if (error instanceof Error) {
      // Keep the underlying message so callers can see the root cause.
      throw new Error(`Failed to extract text from PDF buffer: ${error.message}`);
    }
    throw new Error("Failed to extract text from PDF buffer");
  }
}