File size: 2,831 Bytes
1dc8372
 
 
1418755
 
 
 
1dc8372
 
 
 
1418755
1dc8372
 
 
 
ed58206
1418755
1dc8372
1418755
 
2690ab5
1418755
 
 
 
 
 
 
6644e5d
2690ab5
1418755
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1dc8372
1418755
1dc8372
 
 
 
 
 
1418755
1dc8372
 
 
 
1418755
 
1dc8372
1418755
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
const axios = require('axios');
const fs = require('fs');
const path = require('path');
const { exec } = require('child_process');
const util = require('util');
const execPromise = util.promisify(exec);
const Tesseract = require('tesseract.js');

exports.processFile = async (filePath) => {
  const ext = path.extname(filePath).toLowerCase();
  const fileName = path.basename(filePath);
  const uploadsDir = path.dirname(filePath);
  
  let content = '';
  
  try {
    if (['.png', '.jpg', '.jpeg', '.webp', '.heic'].includes(ext)) {
      console.log(`[OCR] Processing image with ImageMagick + Tesseract.js: ${fileName}`);
      
      let processingPath = filePath;
      let tempFiles = [];

      // 1. Convert HEIC to PNG using heif-convert if necessary
      if (ext === '.heic') {
        const heicOutputPath = path.join(uploadsDir, `${Date.now()}_converted.png`);
        console.log('[OCR] Converting HEIC to PNG...');
        await execPromise(`heif-convert "${filePath}" "${heicOutputPath}"`);
        processingPath = heicOutputPath;
        tempFiles.push(heicOutputPath);
      }

      // 2. Pre-process with ImageMagick for better OCR accuracy
      // (Convert to grayscale, enhance contrast, and sharpen)
      const processedPath = path.join(uploadsDir, `${Date.now()}_processed.png`);
      console.log('[OCR] Enhancing image with ImageMagick...');
      await execPromise(`convert "${processingPath}" -colorspace gray -type grayscale -contrast-stretch 2x50% -sharpen 0x1 "${processedPath}"`);
      tempFiles.push(processedPath);

      // 3. Perform OCR using Tesseract.js (WASM version)
      // This avoids all system binary and language data path issues
      console.log('[OCR] Extracting text with Tesseract.js...');
      const { data: { text } } = await Tesseract.recognize(processedPath, 'eng', {
        logger: m => console.log(`[Tesseract.js] ${m.status}: ${Math.round(m.progress * 100)}%`)
      });

      // Cleanup temp files
      tempFiles.forEach(file => {
        if (fs.existsSync(file)) fs.unlinkSync(file);
      });

      content = `[Image Analysis of ${fileName}]: ${text.trim() || 'No text detected'}`;
    } else if (ext === '.pdf') {
      const pdf = require('pdf-parse');
      const dataBuffer = fs.readFileSync(filePath);
      const data = await pdf(dataBuffer);
      content = `[PDF Content from ${fileName}]: ${data.text}`;
    } else if (['.txt', '.js', '.py', '.html', '.css', '.json', '.md'].includes(ext)) {
      content = `[File Content from ${fileName}]: ${fs.readFileSync(filePath, 'utf8')}`;
    } else {
      content = `[File ${fileName} attached, but extension ${ext} is not supported]`;
    }
    
    return content;
  } catch (err) {
    console.error('[OCR_ERROR]:', err.message);
    return `[Error processing image ${fileName}: ${err.message}]`;
  }
};