File size: 5,472 Bytes
dca8ede
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import { readFile } from 'fs/promises';
import { existsSync } from 'fs';
import { basename, extname } from 'path';
import extractPdfData from './pdf-parse-fix.js';

/**
 * Reads a file from disk and returns its textual content.
 *
 * PDFs are routed to the PDF parser; every other extension (including
 * `.doc` / `.docx`) goes through the basic buffer-to-text extraction.
 *
 * This function never rejects: any failure (missing file, read error, parser
 * error) is logged and reported as an `Error extracting text: ...` string.
 *
 * @param filePath Path of the file to read
 * @returns The extracted text, or a human-readable error message
 */
export async function extractTextFromFile(filePath: string): Promise<string> {
  try {
    // Thrown on purpose so the missing-file case shares the catch handler below.
    if (!existsSync(filePath)) {
      throw new Error(`File does not exist: ${filePath}`);
    }

    const extension = extname(filePath).toLowerCase();
    const displayName = basename(filePath);

    console.log(`Extracting text from ${extension} file: ${displayName}`);

    const contents = await readFile(filePath);

    switch (extension) {
      case '.pdf':
        return await extractTextFromPdf(contents, filePath);

      case '.docx':
      case '.doc':
        // DOC/DOCX should have been converted to PDF before reaching this
        // point; a basic extraction is attempted anyway just in case.
        console.log(`Attempting to extract text directly from ${extension} file: ${displayName}`);
        return await extractBasicText(contents, filePath);

      default:
        // Anything else gets the naive text extraction.
        console.log(`Using basic text extraction for ${extension} file: ${displayName}`);
        return await extractBasicText(contents, filePath);
    }
  } catch (error) {
    console.error(`Error extracting text from file: ${filePath}`, error);

    const reason = error instanceof Error ? error.message : 'Unknown error';
    return `Error extracting text: ${reason}`;
  }
}

/**
 * Extracts text from PDF data using the project's `extractPdfData` wrapper.
 *
 * Never rejects: parser failures are logged and turned into a descriptive
 * message. A result shorter than 50 characters is reported as a limited
 * extraction rather than returned as-is.
 *
 * @param buffer Buffer containing the PDF data
 * @param filePath The original file path (passed to the parser and used in messages)
 * @returns Extracted text, or a human-readable status/error message
 */
async function extractTextFromPdf(buffer: Buffer, filePath: string): Promise<string> {
  try {
    console.log(`Parsing PDF: ${filePath}`);

    // Use the fixed PDF parser that doesn't try to access test files
    const parsed = await extractPdfData(buffer, filePath);

    if (!(parsed && parsed.text)) {
      console.warn('PDF extraction returned empty or invalid result');
      return 'PDF extraction failed, no text content found.';
    }

    const trimmed = parsed.text.trim();

    // Very short output is treated as a probable failed extraction.
    if (trimmed.length < 50) {
      console.warn('PDF extraction returned very little text, might be a failed extraction');
      return `PDF extraction returned limited text: "${trimmed}". This might indicate a scanned or image-based PDF.`;
    }

    console.log(`Successfully extracted ${trimmed.length} characters from PDF`);
    return trimmed;
  } catch (error) {
    console.error('Error parsing PDF:', error);

    if (!(error instanceof Error)) {
      return 'Error parsing PDF: Unknown error';
    }

    // Map recognisable parser failures onto more helpful messages.
    const { message } = error;
    if (message.includes('file does not exist') || message.includes('ENOENT')) {
      return `Error: PDF file not found or inaccessible at ${filePath}`;
    }
    if (message.includes('encrypted')) {
      return 'Error: PDF is password protected or encrypted';
    }
    return `Error parsing PDF: ${message}`;
  }
}

/**
 * Best-effort text extraction from a raw file buffer.
 *
 * Good for text files or as a fallback for formats without a dedicated
 * parser. Never rejects: failures are reported in the returned string.
 *
 * Strategy:
 * 1. Decode as UTF-8, replace every character outside printable ASCII /
 *    tab / CR / LF with a space, collapse whitespace and trim.
 * 2. Return that text when it is at least 50 characters long, or when the
 *    input was pure printable ASCII to begin with (a short but genuine text
 *    file must come back verbatim, not be second-guessed).
 * 3. Otherwise assume binary content and salvage runs of 10-100 word-like
 *    characters from the raw bytes.
 *
 * @param buffer The file buffer
 * @param filePath The original file path (only its extension is used, in the failure message)
 * @returns Extracted text, or a message explaining that nothing could be extracted
 */
async function extractBasicText(buffer: Buffer, filePath: string): Promise<string> {
  // Below this length, cleaned output from a non-text input is considered suspect.
  const minMeaningfulLength = 50;

  try {
    const decoded = buffer.toString('utf8');

    const cleaned = decoded
      .replace(/[^\x20-\x7E\r\n\t]/g, ' ')
      .replace(/\s+/g, ' ')
      .trim();

    // True when nothing had to be filtered out, i.e. the file really is plain text.
    const isPlainText = cleaned.length > 0 && !/[^\x20-\x7E\r\n\t]/.test(decoded);

    if (isPlainText || cleaned.length >= minMeaningfulLength) {
      console.log(`Extracted ${cleaned.length} characters using basic text extraction`);
      return cleaned;
    }

    // No Latin-1 retry here: ASCII bytes decode identically under UTF-8 and
    // Latin-1, so after the same filtering and whitespace collapsing a
    // Latin-1 pass always yields exactly the text computed above.
    console.log('Basic extraction failed, trying binary extraction');

    // Look for text patterns in the raw bytes (one character per byte).
    const runs = buffer.toString('latin1').match(/[A-Za-z0-9\s.,;:'"!?()-]{10,100}/g);

    if (runs) {
      const salvaged = runs.join(' ');
      console.log(`Extracted ${salvaged.length} characters using binary extraction`);
      return salvaged;
    }

    console.warn('Could not extract meaningful text from file');
    return `Could not extract meaningful text from this file format: ${extname(filePath)}`;
  } catch (error) {
    console.error('Error in basic text extraction:', error);
    return `Error in text extraction: ${error instanceof Error ? error.message : 'Unknown error'}`;
  }
}