Spaces:
Build error
Build error
File size: 5,472 Bytes
dca8ede |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
import { readFile } from 'fs/promises';
import { existsSync } from 'fs';
import { basename, extname } from 'path';
import extractPdfData from './pdf-parse-fix.js';
/**
 * Entry point for text extraction: reads the file at `filePath` and routes
 * it to the PDF parser or to the generic text extractor based on its
 * (case-insensitive) extension.
 *
 * Failures are never propagated. A missing file, a read error, or any other
 * exception is logged and turned into a string that starts with
 * "Error extracting text:".
 *
 * @param filePath Path of the file to read.
 * @returns The extracted text, or a descriptive error string.
 */
export async function extractTextFromFile(filePath: string): Promise<string> {
  try {
    // Raised inside the try block on purpose so a missing file is reported
    // through the same logging/return path as every other failure.
    if (!existsSync(filePath)) {
      throw new Error(`File does not exist: ${filePath}`);
    }

    const name = basename(filePath);
    const extension = extname(filePath).toLowerCase();
    console.log(`Extracting text from ${extension} file: ${name}`);

    const contents = await readFile(filePath);

    if (extension === '.pdf') {
      return await extractTextFromPdf(contents, filePath);
    }

    // DOC/DOCX are expected to have been converted to PDF before reaching this
    // point; they share the generic extractor with every other non-PDF type
    // and only the log line differs.
    const isWordDocument = extension === '.docx' || extension === '.doc';
    console.log(
      isWordDocument
        ? `Attempting to extract text directly from ${extension} file: ${name}`
        : `Using basic text extraction for ${extension} file: ${name}`
    );
    return await extractBasicText(contents, filePath);
  } catch (error) {
    console.error(`Error extracting text from file: ${filePath}`, error);
    const reason = error instanceof Error ? error.message : 'Unknown error';
    return `Error extracting text: ${reason}`;
  }
}
/**
 * Runs the project's PDF parser over `buffer` and returns the trimmed text.
 *
 * Never throws: an empty parse result, a suspiciously short result (fewer
 * than 50 characters after trimming) and parser exceptions are all reported
 * as explanatory strings instead.
 *
 * @param buffer Raw bytes of the PDF document.
 * @param filePath Original path of the document; passed to the parser and used in messages.
 * @returns The extracted text, or a message describing why extraction failed.
 */
async function extractTextFromPdf(buffer: Buffer, filePath: string): Promise<string> {
  try {
    console.log(`Parsing PDF: ${filePath}`);

    // Use the fixed PDF parser that doesn't try to access test files
    const parsed = await extractPdfData(buffer, filePath);
    if (!(parsed && parsed.text)) {
      console.warn('PDF extraction returned empty or invalid result');
      return 'PDF extraction failed, no text content found.';
    }

    const trimmed = parsed.text.trim();
    if (trimmed.length >= 50) {
      console.log(`Successfully extracted ${trimmed.length} characters from PDF`);
      return trimmed;
    }

    // Very little text usually means the parse did not really succeed.
    console.warn('PDF extraction returned very little text, might be a failed extraction');
    return `PDF extraction returned limited text: "${trimmed}". This might indicate a scanned or image-based PDF.`;
  } catch (error) {
    console.error('Error parsing PDF:', error);

    if (!(error instanceof Error)) {
      return 'Error parsing PDF: Unknown error';
    }

    // Map well-known failure messages onto friendlier explanations.
    const { message } = error;
    if (message.includes('file does not exist') || message.includes('ENOENT')) {
      return `Error: PDF file not found or inaccessible at ${filePath}`;
    }
    if (message.includes('encrypted')) {
      return 'Error: PDF is password protected or encrypted';
    }
    return `Error parsing PDF: ${message}`;
  }
}
/**
 * Best-effort plain-text extraction from a raw file buffer.
 *
 * Suitable for text files and as a fallback for formats without a dedicated
 * parser. The buffer is examined in three passes:
 *
 * 1. UTF-8 text. If the bytes decode as UTF-8 without producing a
 *    replacement character (U+FFFD), the decoded text is kept, including
 *    non-ASCII characters. Control characters are blanked and whitespace is
 *    collapsed. The result is returned when it is at least 50 characters
 *    long, or when it is non-empty and nothing had to be blanked (a short
 *    but genuine text file).
 * 2. ASCII only. Every byte outside printable ASCII is blanked. The result
 *    is returned when it is at least 50 characters long.
 * 3. Binary scan. Runs of 10-100 "sentence-like" characters are collected
 *    from the raw bytes and joined with spaces.
 *
 * Never throws: when nothing usable is found, or an unexpected error occurs,
 * an explanatory string is returned instead.
 *
 * @param buffer The file contents.
 * @param filePath The original file path; only its extension is used, in the failure message.
 * @returns The extracted text with whitespace collapsed to single spaces, or an explanatory message.
 */
async function extractBasicText(buffer: Buffer, filePath: string): Promise<string> {
  // Cleaned text of at least this many characters is accepted as meaningful.
  const minMeaningfulLength = 50;
  const collapseWhitespace = (value: string): string => value.replace(/\s+/g, ' ').trim();

  try {
    // Pass 1: UTF-8. Node substitutes U+FFFD for every invalid byte sequence,
    // so a decode without that character means the input is well-formed
    // UTF-8 and its non-ASCII characters are real text worth keeping.
    const decoded = buffer.toString('utf8');
    if (!decoded.includes('\uFFFD')) {
      // Blank C0/C1 control characters; tab, CR and LF are left for the
      // whitespace collapse below.
      const withoutControls = decoded.replace(
        /[\u0000-\u0008\u000B\u000C\u000E-\u001F\u007F-\u009F]/g,
        ' '
      );
      const unicodeText = collapseWhitespace(withoutControls);
      // Nothing blanked means the file is plain text; trust it even when it
      // is short instead of running it through the lossy binary scan.
      const isPureText = withoutControls === decoded;
      if (
        unicodeText.length >= minMeaningfulLength ||
        (isPureText && unicodeText.length > 0)
      ) {
        console.log(`Extracted ${unicodeText.length} characters using basic text extraction`);
        return unicodeText;
      }
    }

    // Pass 2: ASCII only. Latin-1 maps each byte to exactly one character,
    // so every non-printable byte becomes a single blank.
    const asciiText = collapseWhitespace(
      buffer.toString('latin1').replace(/[^\x20-\x7E\r\n\t]/g, ' ')
    );
    if (asciiText.length >= minMeaningfulLength) {
      console.log(`Extracted ${asciiText.length} characters using basic text extraction`);
      return asciiText;
    }

    // Pass 3: look for text patterns in the binary data.
    console.log('Basic extraction failed, trying binary extraction');
    const runs = buffer.toString('binary').match(/[A-Za-z0-9\s.,;:'"!?()-]{10,100}/g);
    // The character class admits whitespace, so normalise the joined runs
    // and treat an all-blank result as "nothing found".
    const scannedText = runs ? collapseWhitespace(runs.join(' ')) : '';
    if (scannedText.length > 0) {
      console.log(`Extracted ${scannedText.length} characters using binary extraction`);
      return scannedText;
    }

    console.warn('Could not extract meaningful text from file');
    return `Could not extract meaningful text from this file format: ${extname(filePath)}`;
  } catch (error) {
    console.error('Error in basic text extraction:', error);
    return `Error in text extraction: ${error instanceof Error ? error.message : 'Unknown error'}`;
  }
}