Spaces:
Sleeping
Sleeping
| const axios = require('axios'); | |
| const fs = require('fs'); | |
| const path = require('path'); | |
| const { exec } = require('child_process'); | |
| const util = require('util'); | |
| const execPromise = util.promisify(exec); | |
| const Tesseract = require('tesseract.js'); | |
| exports.processFile = async (filePath) => { | |
| const ext = path.extname(filePath).toLowerCase(); | |
| const fileName = path.basename(filePath); | |
| const uploadsDir = path.dirname(filePath); | |
| let content = ''; | |
| try { | |
| if (['.png', '.jpg', '.jpeg', '.webp', '.heic'].includes(ext)) { | |
| console.log(`[OCR] Processing image with ImageMagick + Tesseract.js: ${fileName}`); | |
| let processingPath = filePath; | |
| let tempFiles = []; | |
| // 1. Convert HEIC to PNG using heif-convert if necessary | |
| if (ext === '.heic') { | |
| const heicOutputPath = path.join(uploadsDir, `${Date.now()}_converted.png`); | |
| console.log('[OCR] Converting HEIC to PNG...'); | |
| await execPromise(`heif-convert "${filePath}" "${heicOutputPath}"`); | |
| processingPath = heicOutputPath; | |
| tempFiles.push(heicOutputPath); | |
| } | |
| // 2. Pre-process with ImageMagick for better OCR accuracy | |
| // (Convert to grayscale, enhance contrast, and sharpen) | |
| const processedPath = path.join(uploadsDir, `${Date.now()}_processed.png`); | |
| console.log('[OCR] Enhancing image with ImageMagick...'); | |
| await execPromise(`convert "${processingPath}" -colorspace gray -type grayscale -contrast-stretch 2x50% -sharpen 0x1 "${processedPath}"`); | |
| tempFiles.push(processedPath); | |
| // 3. Perform OCR using Tesseract.js (WASM version) | |
| // This avoids all system binary and language data path issues | |
| console.log('[OCR] Extracting text with Tesseract.js...'); | |
| const { data: { text } } = await Tesseract.recognize(processedPath, 'eng', { | |
| logger: m => console.log(`[Tesseract.js] ${m.status}: ${Math.round(m.progress * 100)}%`) | |
| }); | |
| // Cleanup temp files | |
| tempFiles.forEach(file => { | |
| if (fs.existsSync(file)) fs.unlinkSync(file); | |
| }); | |
| content = `[Image Analysis of ${fileName}]: ${text.trim() || 'No text detected'}`; | |
| } else if (ext === '.pdf') { | |
| const pdf = require('pdf-parse'); | |
| const dataBuffer = fs.readFileSync(filePath); | |
| const data = await pdf(dataBuffer); | |
| content = `[PDF Content from ${fileName}]: ${data.text}`; | |
| } else if (['.txt', '.js', '.py', '.html', '.css', '.json', '.md'].includes(ext)) { | |
| content = `[File Content from ${fileName}]: ${fs.readFileSync(filePath, 'utf8')}`; | |
| } else { | |
| content = `[File ${fileName} attached, but extension ${ext} is not supported]`; | |
| } | |
| return content; | |
| } catch (err) { | |
| console.error('[OCR_ERROR]:', err.message); | |
| return `[Error processing image ${fileName}: ${err.message}]`; | |
| } | |
| }; | |