File size: 2,831 Bytes
1dc8372 1418755 1dc8372 1418755 1dc8372 ed58206 1418755 1dc8372 1418755 2690ab5 1418755 6644e5d 2690ab5 1418755 1dc8372 1418755 1dc8372 1418755 1dc8372 1418755 1dc8372 1418755 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
const axios = require('axios');
const fs = require('fs');
const path = require('path');
const { exec } = require('child_process');
const util = require('util');
const execPromise = util.promisify(exec);
const Tesseract = require('tesseract.js');
exports.processFile = async (filePath) => {
const ext = path.extname(filePath).toLowerCase();
const fileName = path.basename(filePath);
const uploadsDir = path.dirname(filePath);
let content = '';
try {
if (['.png', '.jpg', '.jpeg', '.webp', '.heic'].includes(ext)) {
console.log(`[OCR] Processing image with ImageMagick + Tesseract.js: ${fileName}`);
let processingPath = filePath;
let tempFiles = [];
// 1. Convert HEIC to PNG using heif-convert if necessary
if (ext === '.heic') {
const heicOutputPath = path.join(uploadsDir, `${Date.now()}_converted.png`);
console.log('[OCR] Converting HEIC to PNG...');
await execPromise(`heif-convert "${filePath}" "${heicOutputPath}"`);
processingPath = heicOutputPath;
tempFiles.push(heicOutputPath);
}
// 2. Pre-process with ImageMagick for better OCR accuracy
// (Convert to grayscale, enhance contrast, and sharpen)
const processedPath = path.join(uploadsDir, `${Date.now()}_processed.png`);
console.log('[OCR] Enhancing image with ImageMagick...');
await execPromise(`convert "${processingPath}" -colorspace gray -type grayscale -contrast-stretch 2x50% -sharpen 0x1 "${processedPath}"`);
tempFiles.push(processedPath);
// 3. Perform OCR using Tesseract.js (WASM version)
// This avoids all system binary and language data path issues
console.log('[OCR] Extracting text with Tesseract.js...');
const { data: { text } } = await Tesseract.recognize(processedPath, 'eng', {
logger: m => console.log(`[Tesseract.js] ${m.status}: ${Math.round(m.progress * 100)}%`)
});
// Cleanup temp files
tempFiles.forEach(file => {
if (fs.existsSync(file)) fs.unlinkSync(file);
});
content = `[Image Analysis of ${fileName}]: ${text.trim() || 'No text detected'}`;
} else if (ext === '.pdf') {
const pdf = require('pdf-parse');
const dataBuffer = fs.readFileSync(filePath);
const data = await pdf(dataBuffer);
content = `[PDF Content from ${fileName}]: ${data.text}`;
} else if (['.txt', '.js', '.py', '.html', '.css', '.json', '.md'].includes(ext)) {
content = `[File Content from ${fileName}]: ${fs.readFileSync(filePath, 'utf8')}`;
} else {
content = `[File ${fileName} attached, but extension ${ext} is not supported]`;
}
return content;
} catch (err) {
console.error('[OCR_ERROR]:', err.message);
return `[Error processing image ${fileName}: ${err.message}]`;
}
};
|