Spaces:

I2E
/

resume-parser

Build error

App Files Files Community

resume-parser / lib /simple-text-parser.ts

PPSA

Upload 235 files

dca8ede verified 10 months ago

raw

history blame contribute delete

5.47 kB

	import { readFile } from 'fs/promises';
	import { existsSync } from 'fs';
	import { basename, extname } from 'path';
	import extractPdfData from './pdf-parse-fix.js';

	/**
	 * Reads a file from disk and returns its text content, picking the
	 * extraction strategy from the (lower-cased) file extension:
	 *
	 * - `.pdf` is handed to `extractTextFromPdf`.
	 * - `.doc` / `.docx` and every other extension are handed to
	 *   `extractBasicText`; only the log message differs between the two.
	 *
	 * The returned promise never rejects. A missing file, a read failure or any
	 * other exception is logged with `console.error` and reported to the caller
	 * as a string starting with `Error extracting text: `.
	 *
	 * @param filePath Path of the file to read
	 * @returns The extracted text, or a diagnostic message when extraction fails
	 */
	export async function extractTextFromFile(filePath: string): Promise<string> {
	  try {
	    // Fail early with a readable message; the throw is handled by the catch below.
	    if (!existsSync(filePath)) {
	      throw new Error(`File does not exist: ${filePath}`);
	    }

	    // Extension is normalised to lower case so ".PDF" and ".pdf" are treated alike.
	    const fileExt = extname(filePath).toLowerCase();
	    const fileName = basename(filePath);

	    console.log(`Extracting text from ${fileExt} file: ${fileName}`);

	    // Read the whole file into memory once; every extractor works on the buffer.
	    const buffer = await readFile(filePath);

	    if (fileExt === '.pdf') {
	      return await extractTextFromPdf(buffer, filePath);
	    } else if (fileExt === '.docx' || fileExt === '.doc') {
	      // For DOC/DOCX, we should have already converted to PDF by this point
	      // but we'll try a basic extraction just in case
	      console.log(`Attempting to extract text directly from ${fileExt} file: ${fileName}`);
	      return await extractBasicText(buffer, filePath);
	    } else {
	      // For other files, do a naive text extraction
	      console.log(`Using basic text extraction for ${fileExt} file: ${fileName}`);
	      return await extractBasicText(buffer, filePath);
	    }
	  } catch (error) {
	    console.error(`Error extracting text from file: ${filePath}`, error);

	    // Convert the failure into text so callers always receive a string.
	    if (error instanceof Error) {
	      return `Error extracting text: ${error.message}`;
	    } else {
	      return 'Error extracting text: Unknown error';
	    }
	  }
	}

	/**
	 * Extracts text from PDF data that has already been read into memory.
	 *
	 * Parsing is delegated to `extractPdfData`. The resulting text is trimmed,
	 * and a result shorter than 50 characters is reported as a likely failed
	 * extraction instead of being returned as-is.
	 *
	 * The returned promise never rejects: parser errors are logged and turned
	 * into a descriptive message string.
	 *
	 * @param buffer Buffer containing the PDF data
	 * @param filePath The original file path (passed to the parser and used in log/error messages)
	 * @returns The trimmed text, or a diagnostic message when extraction fails or yields too little text
	 */
	async function extractTextFromPdf(buffer: Buffer, filePath: string): Promise<string> {
	  try {
	    console.log(`Parsing PDF: ${filePath}`);

	    // Use the fixed PDF parser that doesn't try to access test files
	    const data = await extractPdfData(buffer, filePath);

	    // A missing result object or an empty/absent text field both count as failure.
	    if (!data || !data.text) {
	      console.warn('PDF extraction returned empty or invalid result');
	      return 'PDF extraction failed, no text content found.';
	    }

	    // Clean up the text
	    const text = data.text.trim();

	    // If text is too short, it might be a failed extraction
	    if (text.length < 50) {
	      console.warn('PDF extraction returned very little text, might be a failed extraction');
	      return `PDF extraction returned limited text: "${text}". This might indicate a scanned or image-based PDF.`;
	    }

	    console.log(`Successfully extracted ${text.length} characters from PDF`);
	    return text;
	  } catch (error) {
	    console.error('Error parsing PDF:', error);

	    // Provide a helpful error message based on the error
	    if (error instanceof Error) {
	      if (error.message.includes('file does not exist') || error.message.includes('ENOENT')) {
	        return `Error: PDF file not found or inaccessible at ${filePath}`;
	      } else if (error.message.includes('encrypted')) {
	        return 'Error: PDF is password protected or encrypted';
	      } else {
	        return `Error parsing PDF: ${error.message}`;
	      }
	    } else {
	      return 'Error parsing PDF: Unknown error';
	    }
	  }
	}

	/**
	 * Best-effort plain-text recovery from an arbitrary buffer.
	 * Suitable for text files and as a fallback for other formats.
	 *
	 * Strategy, in order:
	 *  1. Decode as UTF-8 and clean; accept when more than 50 characters remain.
	 *  2. Decode as Latin1 and clean; accept when at least 50 characters remain.
	 *  3. Scan the raw bytes for runs of 10-100 letters, digits, whitespace and
	 *     common punctuation, and join the runs with single spaces.
	 *  4. If nothing is found, return a message naming the file extension.
	 *
	 * "Clean" means: every character outside printable ASCII, CR, LF and TAB is
	 * replaced by a space, whitespace runs are collapsed to one space, and the
	 * result is trimmed.
	 *
	 * @param buffer The file buffer
	 * @param filePath The original file path (only its extension is used, in the fallback message)
	 * @returns Extracted text, or a diagnostic message when nothing usable was found
	 */
	async function extractBasicText(buffer: Buffer, filePath: string): Promise<string> {
	  // Decode with the requested encoding and normalise the result as described above.
	  const decodeAndClean = (encoding: 'utf8' | 'latin1'): string =>
	    buffer
	      .toString(encoding)
	      .replace(/[^\x20-\x7E\r\n\t]/g, ' ')
	      .replace(/\s+/g, ' ')
	      .trim();

	  try {
	    // First choice: UTF-8.
	    const utf8Text = decodeAndClean('utf8');
	    if (utf8Text.length > 50) {
	      console.log(`Extracted ${utf8Text.length} characters using basic text extraction`);
	      return utf8Text;
	    }

	    // Second choice: Latin1. Long enough output is returned without further logging.
	    const latin1Text = decodeAndClean('latin1');
	    if (latin1Text.length >= 50) {
	      return latin1Text;
	    }

	    // Last resort: pick readable fragments straight out of the raw bytes.
	    console.log('Basic extraction failed, trying binary extraction');
	    const fragments = buffer.toString('binary').match(/[A-Za-z0-9\s.,;:'"!?()-]{10,100}/g);

	    if (fragments === null || fragments.length === 0) {
	      console.warn('Could not extract meaningful text from file');
	      return `Could not extract meaningful text from this file format: ${extname(filePath)}`;
	    }

	    const joined = fragments.join(' ');
	    console.log(`Extracted ${joined.length} characters using binary extraction`);
	    return joined;
	  } catch (error) {
	    console.error('Error in basic text extraction:', error);
	    const reason = error instanceof Error ? error.message : 'Unknown error';
	    return `Error in text extraction: ${reason}`;
	  }
	}