Spaces:

I2E
/

resume-parser

Build error

File size: 5,472 Bytes

dca8ede

import { readFile } from 'fs/promises';
import { existsSync } from 'fs';
import { basename, extname } from 'path';
import extractPdfData from './pdf-parse-fix.js';

/**

 * Text extractor that handles both PDFs and text files

 * Always returns some text, never throws exceptions

 */
export async function extractTextFromFile(filePath: string): Promise<string> {
  try {
    // Check if file exists
    if (!existsSync(filePath)) {
      throw new Error(`File does not exist: ${filePath}`);
    }

    // Get file extension
    const fileExt = extname(filePath).toLowerCase();
    const fileName = basename(filePath);
    
    console.log(`Extracting text from ${fileExt} file: ${fileName}`);
    
    // Read file as buffer
    const buffer = await readFile(filePath);
    
    // Handle based on extension
    if (fileExt === '.pdf') {
      return await extractTextFromPdf(buffer, filePath);
    } else if (fileExt === '.docx' || fileExt === '.doc') {
      // For DOC/DOCX, we should have already converted to PDF by this point
      // but we'll try a basic extraction just in case
      console.log(`Attempting to extract text directly from ${fileExt} file: ${fileName}`);
      return await extractBasicText(buffer, filePath);
    } else {
      // For other files, do a naive text extraction
      console.log(`Using basic text extraction for ${fileExt} file: ${fileName}`);
      return await extractBasicText(buffer, filePath);
    }
  } catch (error) {
    console.error(`Error extracting text from file: ${filePath}`, error);
    
    if (error instanceof Error) {
      return `Error extracting text: ${error.message}`;
    } else {
      return 'Error extracting text: Unknown error';
    }
  }
}

/**

 * Extracts text from a PDF file

 * @param buffer Buffer containing the PDF data

 * @param filePath The original file path (for logging)

 * @returns Extracted text

 */
async function extractTextFromPdf(buffer: Buffer, filePath: string): Promise<string> {
  try {
    console.log(`Parsing PDF: ${filePath}`);
    
    // Use the fixed PDF parser that doesn't try to access test files
    const data = await extractPdfData(buffer, filePath);
    
    if (!data || !data.text) {
      console.warn('PDF extraction returned empty or invalid result');
      return 'PDF extraction failed, no text content found.';
    }
    
    // Clean up the text
    let text = data.text.trim();
    
    // If text is too short, it might be a failed extraction
    if (text.length < 50) {
      console.warn('PDF extraction returned very little text, might be a failed extraction');
      return `PDF extraction returned limited text: "${text}". This might indicate a scanned or image-based PDF.`;
    }
    
    console.log(`Successfully extracted ${text.length} characters from PDF`);
    return text;
  } catch (error) {
    console.error('Error parsing PDF:', error);
    
    // Provide a helpful error message based on the error
    if (error instanceof Error) {
      if (error.message.includes('file does not exist') || error.message.includes('ENOENT')) {
        return `Error: PDF file not found or inaccessible at ${filePath}`;
      } else if (error.message.includes('encrypted')) {
        return 'Error: PDF is password protected or encrypted';
      } else {
        return `Error parsing PDF: ${error.message}`;
      }
    } else {
      return 'Error parsing PDF: Unknown error';
    }
  }
}

/**

 * Attempts to extract text from a buffer in a basic way

 * Good for text files or as a fallback for other formats

 * @param buffer The file buffer

 * @param filePath The original file path (for logging)

 * @returns Extracted text

 */
async function extractBasicText(buffer: Buffer, filePath: string): Promise<string> {
  try {
    // Try UTF-8 first
    let text = buffer.toString('utf8');
    
    // Clean up the text - remove non-printable characters
    text = text.replace(/[^\x20-\x7E\r\n\t]/g, ' ');
    
    // Remove excessive whitespace
    text = text.replace(/\s+/g, ' ').trim();
    
    // If we got meaningful text, return it
    if (text.length > 50) {
      console.log(`Extracted ${text.length} characters using basic text extraction`);
      return text;
    }
    
    // If UTF-8 didn't work well, try Latin1
    text = buffer.toString('latin1');
    text = text.replace(/[^\x20-\x7E\r\n\t]/g, ' ');
    text = text.replace(/\s+/g, ' ').trim();
    
    // If we still don't have meaningful text, try binary extraction
    if (text.length < 50) {
      console.log('Basic extraction failed, trying binary extraction');
      
      // Look for text patterns in binary data
      const fileContent = buffer.toString('binary');
      const textMatches = fileContent.match(/[A-Za-z0-9\s.,;:'"!?()-]{10,100}/g);
      
      if (textMatches && textMatches.length > 0) {
        text = textMatches.join(' ');
        console.log(`Extracted ${text.length} characters using binary extraction`);
      } else {
        console.warn('Could not extract meaningful text from file');
        text = `Could not extract meaningful text from this file format: ${extname(filePath)}`;
      }
    }
    
    return text;
  } catch (error) {
    console.error('Error in basic text extraction:', error);
    return `Error in text extraction: ${error instanceof Error ? error.message : 'Unknown error'}`;
  }
}