const axios = require('axios'); const fs = require('fs'); const path = require('path'); const { exec } = require('child_process'); const util = require('util'); const execPromise = util.promisify(exec); const Tesseract = require('tesseract.js'); exports.processFile = async (filePath) => { const ext = path.extname(filePath).toLowerCase(); const fileName = path.basename(filePath); const uploadsDir = path.dirname(filePath); let content = ''; try { if (['.png', '.jpg', '.jpeg', '.webp', '.heic'].includes(ext)) { console.log(`[OCR] Processing image with ImageMagick + Tesseract.js: ${fileName}`); let processingPath = filePath; let tempFiles = []; // 1. Convert HEIC to PNG using heif-convert if necessary if (ext === '.heic') { const heicOutputPath = path.join(uploadsDir, `${Date.now()}_converted.png`); console.log('[OCR] Converting HEIC to PNG...'); await execPromise(`heif-convert "${filePath}" "${heicOutputPath}"`); processingPath = heicOutputPath; tempFiles.push(heicOutputPath); } // 2. Pre-process with ImageMagick for better OCR accuracy // (Convert to grayscale, enhance contrast, and sharpen) const processedPath = path.join(uploadsDir, `${Date.now()}_processed.png`); console.log('[OCR] Enhancing image with ImageMagick...'); await execPromise(`convert "${processingPath}" -colorspace gray -type grayscale -contrast-stretch 2x50% -sharpen 0x1 "${processedPath}"`); tempFiles.push(processedPath); // 3. Perform OCR using Tesseract.js (WASM version) // This avoids all system binary and language data path issues console.log('[OCR] Extracting text with Tesseract.js...'); const { data: { text } } = await Tesseract.recognize(processedPath, 'eng', { logger: m => console.log(`[Tesseract.js] ${m.status}: ${Math.round(m.progress * 100)}%`) }); // Cleanup temp files tempFiles.forEach(file => { if (fs.existsSync(file)) fs.unlinkSync(file); }); content = `[Image Analysis of ${fileName}]: ${text.trim() || 'No text detected'}`; } else if (ext === '.pdf') { const pdf = require('pdf-parse'); const dataBuffer = fs.readFileSync(filePath); const data = await pdf(dataBuffer); content = `[PDF Content from ${fileName}]: ${data.text}`; } else if (['.txt', '.js', '.py', '.html', '.css', '.json', '.md'].includes(ext)) { content = `[File Content from ${fileName}]: ${fs.readFileSync(filePath, 'utf8')}`; } else { content = `[File ${fileName} attached, but extension ${ext} is not supported]`; } return content; } catch (err) { console.error('[OCR_ERROR]:', err.message); return `[Error processing image ${fileName}: ${err.message}]`; } };