|
|
const axios = require('axios'); |
|
|
const fs = require('fs'); |
|
|
const path = require('path'); |
|
|
const { exec } = require('child_process'); |
|
|
const util = require('util'); |
|
|
const execPromise = util.promisify(exec); |
|
|
const Tesseract = require('tesseract.js'); |
|
|
|
|
|
exports.processFile = async (filePath) => { |
|
|
const ext = path.extname(filePath).toLowerCase(); |
|
|
const fileName = path.basename(filePath); |
|
|
const uploadsDir = path.dirname(filePath); |
|
|
|
|
|
let content = ''; |
|
|
|
|
|
try { |
|
|
if (['.png', '.jpg', '.jpeg', '.webp', '.heic'].includes(ext)) { |
|
|
console.log(`[OCR] Processing image with ImageMagick + Tesseract.js: ${fileName}`); |
|
|
|
|
|
let processingPath = filePath; |
|
|
let tempFiles = []; |
|
|
|
|
|
|
|
|
if (ext === '.heic') { |
|
|
const heicOutputPath = path.join(uploadsDir, `${Date.now()}_converted.png`); |
|
|
console.log('[OCR] Converting HEIC to PNG...'); |
|
|
await execPromise(`heif-convert "${filePath}" "${heicOutputPath}"`); |
|
|
processingPath = heicOutputPath; |
|
|
tempFiles.push(heicOutputPath); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
const processedPath = path.join(uploadsDir, `${Date.now()}_processed.png`); |
|
|
console.log('[OCR] Enhancing image with ImageMagick...'); |
|
|
await execPromise(`convert "${processingPath}" -colorspace gray -type grayscale -contrast-stretch 2x50% -sharpen 0x1 "${processedPath}"`); |
|
|
tempFiles.push(processedPath); |
|
|
|
|
|
|
|
|
|
|
|
console.log('[OCR] Extracting text with Tesseract.js...'); |
|
|
const { data: { text } } = await Tesseract.recognize(processedPath, 'eng', { |
|
|
logger: m => console.log(`[Tesseract.js] ${m.status}: ${Math.round(m.progress * 100)}%`) |
|
|
}); |
|
|
|
|
|
|
|
|
tempFiles.forEach(file => { |
|
|
if (fs.existsSync(file)) fs.unlinkSync(file); |
|
|
}); |
|
|
|
|
|
content = `[Image Analysis of ${fileName}]: ${text.trim() || 'No text detected'}`; |
|
|
} else if (ext === '.pdf') { |
|
|
const pdf = require('pdf-parse'); |
|
|
const dataBuffer = fs.readFileSync(filePath); |
|
|
const data = await pdf(dataBuffer); |
|
|
content = `[PDF Content from ${fileName}]: ${data.text}`; |
|
|
} else if (['.txt', '.js', '.py', '.html', '.css', '.json', '.md'].includes(ext)) { |
|
|
content = `[File Content from ${fileName}]: ${fs.readFileSync(filePath, 'utf8')}`; |
|
|
} else { |
|
|
content = `[File ${fileName} attached, but extension ${ext} is not supported]`; |
|
|
} |
|
|
|
|
|
return content; |
|
|
} catch (err) { |
|
|
console.error('[OCR_ERROR]:', err.message); |
|
|
return `[Error processing image ${fileName}: ${err.message}]`; |
|
|
} |
|
|
}; |
|
|
|