File size: 1,318 Bytes
dca8ede
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import { readFile } from 'fs/promises';
import { existsSync } from 'fs';
import pdfParseFix from './pdf-parse-fix.js';

/**
 * Extract the text content of a PDF file on disk.
 *
 * The whole file is read into memory and passed to the project-local
 * `pdfParseFix` helper. Progress and failures are logged to the console.
 *
 * @param pdfPath - Path of the PDF file to read.
 * @returns The extracted text, or an empty string when the parser produced no text.
 * @throws Error whose message starts with "Failed to extract text from PDF file"
 *   when the file does not exist, cannot be read, or cannot be parsed. The value
 *   that was originally thrown is attached to the error as `cause`.
 */
export async function extractTextFromPdfFile(pdfPath: string): Promise<string> {
  console.log("=== Starting PDF text extraction ===");

  try {
    console.log("PDF path:", pdfPath);

    // Checked up front so a missing file is reported with an explicit message
    // instead of whatever lower-level error the read would produce.
    if (!existsSync(pdfPath)) {
      console.error("PDF file does not exist:", pdfPath);
      throw new Error(`PDF file does not exist: ${pdfPath}`);
    }

    console.log("Reading file...");
    const dataBuffer = await readFile(pdfPath);
    console.log(`Read ${dataBuffer.length} bytes`);

    console.log("Parsing PDF...");
    const data = await pdfParseFix(dataBuffer);
    console.log("PDF parsed successfully");

    // A parse that yields no text is not treated as an error; callers get "".
    if (!data.text || data.text.length === 0) {
      console.warn("No text extracted from PDF");
      return "";
    }

    console.log(`Extracted ${data.text.length} characters of text`);
    return data.text;
  } catch (error) {
    console.error("Error extracting text from PDF file:", error);

    const message =
      error instanceof Error
        ? `Failed to extract text from PDF file: ${error.message}`
        : "Failed to extract text from PDF file";

    // Keep the original failure (stack, error code, or non-Error value)
    // reachable for callers. Object.assign is used instead of the
    // `new Error(message, { cause })` form so this compiles against pre-ES2022 libs.
    throw Object.assign(new Error(message), { cause: error });
  }
}