Spaces:
Build error
Build error
| import { readFile } from 'fs/promises'; | |
| import { existsSync } from 'fs'; | |
| import { basename, extname } from 'path'; | |
| import extractPdfData from './pdf-parse-fix.js'; | |
| /** | |
| * Text extractor that handles both PDFs and text files | |
| * Always returns some text, never throws exceptions | |
| */ | |
| export async function extractTextFromFile(filePath: string): Promise<string> { | |
| try { | |
| // Check if file exists | |
| if (!existsSync(filePath)) { | |
| throw new Error(`File does not exist: ${filePath}`); | |
| } | |
| // Get file extension | |
| const fileExt = extname(filePath).toLowerCase(); | |
| const fileName = basename(filePath); | |
| console.log(`Extracting text from ${fileExt} file: ${fileName}`); | |
| // Read file as buffer | |
| const buffer = await readFile(filePath); | |
| // Handle based on extension | |
| if (fileExt === '.pdf') { | |
| return await extractTextFromPdf(buffer, filePath); | |
| } else if (fileExt === '.docx' || fileExt === '.doc') { | |
| // For DOC/DOCX, we should have already converted to PDF by this point | |
| // but we'll try a basic extraction just in case | |
| console.log(`Attempting to extract text directly from ${fileExt} file: ${fileName}`); | |
| return await extractBasicText(buffer, filePath); | |
| } else { | |
| // For other files, do a naive text extraction | |
| console.log(`Using basic text extraction for ${fileExt} file: ${fileName}`); | |
| return await extractBasicText(buffer, filePath); | |
| } | |
| } catch (error) { | |
| console.error(`Error extracting text from file: ${filePath}`, error); | |
| if (error instanceof Error) { | |
| return `Error extracting text: ${error.message}`; | |
| } else { | |
| return 'Error extracting text: Unknown error'; | |
| } | |
| } | |
| } | |
| /** | |
| * Extracts text from a PDF file | |
| * @param buffer Buffer containing the PDF data | |
| * @param filePath The original file path (for logging) | |
| * @returns Extracted text | |
| */ | |
| async function extractTextFromPdf(buffer: Buffer, filePath: string): Promise<string> { | |
| try { | |
| console.log(`Parsing PDF: ${filePath}`); | |
| // Use the fixed PDF parser that doesn't try to access test files | |
| const data = await extractPdfData(buffer, filePath); | |
| if (!data || !data.text) { | |
| console.warn('PDF extraction returned empty or invalid result'); | |
| return 'PDF extraction failed, no text content found.'; | |
| } | |
| // Clean up the text | |
| let text = data.text.trim(); | |
| // If text is too short, it might be a failed extraction | |
| if (text.length < 50) { | |
| console.warn('PDF extraction returned very little text, might be a failed extraction'); | |
| return `PDF extraction returned limited text: "${text}". This might indicate a scanned or image-based PDF.`; | |
| } | |
| console.log(`Successfully extracted ${text.length} characters from PDF`); | |
| return text; | |
| } catch (error) { | |
| console.error('Error parsing PDF:', error); | |
| // Provide a helpful error message based on the error | |
| if (error instanceof Error) { | |
| if (error.message.includes('file does not exist') || error.message.includes('ENOENT')) { | |
| return `Error: PDF file not found or inaccessible at ${filePath}`; | |
| } else if (error.message.includes('encrypted')) { | |
| return 'Error: PDF is password protected or encrypted'; | |
| } else { | |
| return `Error parsing PDF: ${error.message}`; | |
| } | |
| } else { | |
| return 'Error parsing PDF: Unknown error'; | |
| } | |
| } | |
| } | |
| /** | |
| * Attempts to extract text from a buffer in a basic way | |
| * Good for text files or as a fallback for other formats | |
| * @param buffer The file buffer | |
| * @param filePath The original file path (for logging) | |
| * @returns Extracted text | |
| */ | |
| async function extractBasicText(buffer: Buffer, filePath: string): Promise<string> { | |
| try { | |
| // Try UTF-8 first | |
| let text = buffer.toString('utf8'); | |
| // Clean up the text - remove non-printable characters | |
| text = text.replace(/[^\x20-\x7E\r\n\t]/g, ' '); | |
| // Remove excessive whitespace | |
| text = text.replace(/\s+/g, ' ').trim(); | |
| // If we got meaningful text, return it | |
| if (text.length > 50) { | |
| console.log(`Extracted ${text.length} characters using basic text extraction`); | |
| return text; | |
| } | |
| // If UTF-8 didn't work well, try Latin1 | |
| text = buffer.toString('latin1'); | |
| text = text.replace(/[^\x20-\x7E\r\n\t]/g, ' '); | |
| text = text.replace(/\s+/g, ' ').trim(); | |
| // If we still don't have meaningful text, try binary extraction | |
| if (text.length < 50) { | |
| console.log('Basic extraction failed, trying binary extraction'); | |
| // Look for text patterns in binary data | |
| const fileContent = buffer.toString('binary'); | |
| const textMatches = fileContent.match(/[A-Za-z0-9\s.,;:'"!?()-]{10,100}/g); | |
| if (textMatches && textMatches.length > 0) { | |
| text = textMatches.join(' '); | |
| console.log(`Extracted ${text.length} characters using binary extraction`); | |
| } else { | |
| console.warn('Could not extract meaningful text from file'); | |
| text = `Could not extract meaningful text from this file format: ${extname(filePath)}`; | |
| } | |
| } | |
| return text; | |
| } catch (error) { | |
| console.error('Error in basic text extraction:', error); | |
| return `Error in text extraction: ${error instanceof Error ? error.message : 'Unknown error'}`; | |
| } | |
| } |