'use strict';

/**
 * ocr.js — Advanced Tesseract OCR pipeline
 *
 * BACKWARDS COMPATIBLE: original shape { text, confidence } is always present.
 * All new extraction passes always run and enrich the output alongside it.
 *
 * Usage:
 *   node ocr.js <image_path> [options]
 *
 * Options (all optional — nothing breaks without them):
 *   --lang=<code>         Tesseract language code(s), e.g. eng, eng+fra (default: eng)
 *   --psm=<0-13>          Page segmentation mode (default: multi-pass best-wins)
 *   --oem=<0-3>           OCR engine mode (default: 3 = engine default, based on what is available)
 *   --no-words            Omit per-word detail from output
 *   --no-lines            Omit per-line detail from output
 *   --no-paragraphs       Omit per-paragraph detail from output
 *   --hocr                Include raw hOCR string in output
 *   --tsv                 Include raw TSV string in output
 *   --threshold=<0-100>   Minimum confidence to include a word (default: 0)
 *   --pretty              Pretty-print JSON output
 *
 * Numeric options that cannot be parsed (or, for --psm / --oem, fall outside
 * the documented range) are ignored and the previous/default value is kept.
 *
 * Output (always includes legacy fields, new fields added alongside):
 *   {
 *     text:        string   — full extracted text            (LEGACY)
 *     confidence:  number   — overall confidence 0-100       (LEGACY)
 *     version:     string   — pipeline version for consumers
 *     lang:        string   — language(s) used
 *     psm:         number   — winning PSM mode used
 *     psmLabel:    string   — human-readable description of the winning PSM
 *     oem:         number   — OEM used
 *     multiPass:   Pass[]   — all PSM attempts with their scores
 *     words:       Word[]   — per-word detail      (unless --no-words)
 *     lines:       Line[]   — per-line detail      (unless --no-lines)
 *     paragraphs:  Para[]   — per-paragraph detail (unless --no-paragraphs)
 *     blocks:      Block[]  — per-block detail
 *     hocr:        string   — raw hOCR markup      (if --hocr)
 *     tsv:         string   — raw TSV data         (if --tsv)
 *   }
 *
 * Errors are written to stderr as JSON ({ error: string }) and the process
 * exits with status 1.
 */

const Tesseract = require('tesseract.js');

// ─── Pipeline version ────────────────────────────────────────────────────────
const PIPELINE_VERSION = '2.0.0';

// ─── PSM descriptions (for metadata) ─────────────────────────────────────────
const PSM_LABELS = {
  0: 'Orientation and script detection only',
  1: 'Automatic page segmentation with OSD',
  2: 'Automatic page segmentation, no OSD or OCR',
  3: 'Fully automatic page segmentation, no OSD (default)',
  4: 'Assume a single column of text of variable sizes',
  5: 'Assume a single uniform block of vertically aligned text',
  6: 'Assume a single uniform block of text',
  7: 'Treat the image as a single text line',
  8: 'Treat the image as a single word',
  9: 'Treat the image as a single word in a circle',
  10: 'Treat the image as a single character',
  11: 'Sparse text - find as much text as possible',
  12: 'Sparse text with OSD',
  13: 'Raw line - treat the image as a single text line, bypassing Tesseract hacks',
};

// ─── Argument parser ──────────────────────────────────────────────────────────

/**
 * Return everything after the first '=' of a `--name=value` argument.
 * Unlike `split('=')[1]`, this keeps values that themselves contain '='.
 *
 * @param {string} arg - A CLI argument known to contain '='.
 * @returns {string} The raw option value (may be empty).
 */
function optionValue(arg) {
  return arg.slice(arg.indexOf('=') + 1);
}

/**
 * Parse a base-10 integer option and accept it only inside [min, max].
 *
 * @param {string} raw - Raw option text.
 * @param {number} min - Smallest accepted value (inclusive).
 * @param {number} max - Largest accepted value (inclusive).
 * @param {number|null} fallback - Value returned when `raw` is rejected.
 * @returns {number|null} The parsed integer, or `fallback`.
 */
function parseIntInRange(raw, min, max, fallback) {
  const value = Number.parseInt(raw, 10);
  return Number.isInteger(value) && value >= min && value <= max ? value : fallback;
}

/**
 * Parse command-line arguments into an options object.
 *
 * Unknown `--flags` are ignored. The last bare (non `--`) argument wins as
 * the image path. Invalid numeric values never produce NaN: they leave the
 * previously held value (initially the default) untouched.
 *
 * @param {string[]} argv - Full argv array (first two entries are skipped,
 *   as with `process.argv`).
 * @returns {{
 *   imagePath: string|null, lang: string, psm: number|null, oem: number,
 *   noWords: boolean, noLines: boolean, noParagraphs: boolean,
 *   hocr: boolean, tsv: boolean, threshold: number, pretty: boolean
 * }} Parsed options; `psm === null` selects multi-pass mode.
 */
function parseArgs(argv) {
  const opts = {
    imagePath: null,
    lang: 'eng',
    psm: null, // null = multi-pass
    oem: 3,
    noWords: false,
    noLines: false,
    noParagraphs: false,
    hocr: false,
    tsv: false,
    threshold: 0,
    pretty: false,
  };

  for (const arg of argv.slice(2)) {
    if (arg.startsWith('--lang=')) {
      // An empty value would make Tesseract load no language at all.
      opts.lang = optionValue(arg) || opts.lang;
    } else if (arg.startsWith('--psm=')) {
      opts.psm = parseIntInRange(optionValue(arg), 0, 13, opts.psm);
    } else if (arg.startsWith('--oem=')) {
      opts.oem = parseIntInRange(optionValue(arg), 0, 3, opts.oem);
    } else if (arg === '--no-words') {
      opts.noWords = true;
    } else if (arg === '--no-lines') {
      opts.noLines = true;
    } else if (arg === '--no-paragraphs') {
      opts.noParagraphs = true;
    } else if (arg === '--hocr') {
      opts.hocr = true;
    } else if (arg === '--tsv') {
      opts.tsv = true;
    } else if (arg.startsWith('--threshold=')) {
      // A NaN threshold would make every `confidence >= threshold` test
      // false and silently drop all words, so reject non-finite values.
      const threshold = Number.parseFloat(optionValue(arg));
      if (Number.isFinite(threshold)) {
        opts.threshold = threshold;
      }
    } else if (arg === '--pretty') {
      opts.pretty = true;
    } else if (!arg.startsWith('--')) {
      opts.imagePath = arg;
    }
  }

  return opts;
}

// ─── Run a single Tesseract pass ──────────────────────────────────────────────

/**
 * Run one recognition pass over the image.
 *
 * NOTE(review): whether `Tesseract.recognize` forwards these engine
 * parameters from its third argument depends on the installed tesseract.js
 * version — confirm against the version in package.json (some versions
 * expect them via a worker's `setParameters`).
 *
 * @param {string} imagePath - Path of the image to recognise.
 * @param {string} lang - Tesseract language code(s).
 * @param {number} psm - Page segmentation mode.
 * @param {number} oem - OCR engine mode.
 * @returns {Promise<object>} The value resolved by `Tesseract.recognize`.
 */
async function runPass(imagePath, lang, psm, oem) {
  const result = await Tesseract.recognize(imagePath, lang, {
    tessedit_pageseg_mode: psm,
    tessedit_ocr_engine_mode: oem,
    // Always request rich data so we can extract words/lines/blocks
    tessjs_create_hocr: '1',
    tessjs_create_tsv: '1',
  });
  return result;
}

// ─── Extract structured data from a Tesseract result ─────────────────────────

/**
 * Copy the four corner coordinates of a bounding box.
 *
 * @param {{x0: number, y0: number, x1: number, y1: number}|null|undefined} bbox
 * @returns {{x0: number, y0: number, x1: number, y1: number}|null}
 *   A fresh object with only the coordinate fields, or null when absent.
 */
function copyBbox(bbox) {
  return bbox ? { x0: bbox.x0, y0: bbox.y0, x1: bbox.x1, y1: bbox.y1 } : null;
}

/**
 * Flatten a recognition result into plain word/line/paragraph/block lists.
 *
 * Words whose confidence (missing counts as 0) is below `opts.threshold` are
 * dropped; lines, paragraphs and blocks are never filtered. A missing result
 * or missing `data` yields four empty arrays.
 *
 * @param {object|null|undefined} result - Recognition result; only
 *   `result.data` is read.
 * @param {{threshold: number}} opts - Parsed options.
 * @returns {{words: object[], lines: object[], paragraphs: object[], blocks: object[]}}
 */
function extractStructured(result, opts) {
  const data = result?.data || {};

  const words = (data.words || [])
    .filter((w) => (w.confidence ?? 0) >= opts.threshold)
    .map((w) => ({
      text: w.text,
      confidence: w.confidence ?? null,
      bbox: copyBbox(w.bbox),
      fontName: w.font_name ?? null,
      fontSize: w.font_size ?? null,
      bold: w.is_bold ?? null,
      italic: w.is_italic ?? null,
      underlined: w.is_underlined ?? null,
      inDict: w.in_dict ?? null,
    }));

  const lines = (data.lines || []).map((l) => ({
    text: l.text,
    confidence: l.confidence ?? null,
    bbox: copyBbox(l.bbox),
    wordCount: (l.words || []).length,
  }));

  const paragraphs = (data.paragraphs || []).map((p) => ({
    text: p.text,
    confidence: p.confidence ?? null,
    bbox: copyBbox(p.bbox),
    lineCount: (p.lines || []).length,
  }));

  const blocks = (data.blocks || []).map((b) => ({
    text: b.text,
    confidence: b.confidence ?? null,
    bbox: copyBbox(b.bbox),
    paragraphCount: (b.paragraphs || []).length,
  }));

  return { words, lines, paragraphs, blocks };
}

// ─── Multi-pass strategy: try several PSMs, pick best confidence ──────────────
const MULTI_PASS_PSMS = [3, 6, 11, 4]; // ordered by general utility

/**
 * Run one pass per entry of MULTI_PASS_PSMS (sequentially) and pick a winner.
 *
 * A pass that throws is recorded as `{ psm, psmLabel, error }` instead of
 * aborting the remaining passes. The winner is the error-free pass with
 * non-empty text and the highest confidence; failing that, the first
 * error-free pass; failing that, the first pass (which then carries `error`
 * and no `result` — callers must check for this).
 *
 * @param {string} imagePath - Path of the image to recognise.
 * @param {string} lang - Tesseract language code(s).
 * @param {number} oem - OCR engine mode.
 * @param {object} opts - Parsed options (currently unused; kept for callers).
 * @returns {Promise<{passes: object[], winner: object}>}
 */
async function multiPass(imagePath, lang, oem, opts) {
  const passes = [];

  for (const psm of MULTI_PASS_PSMS) {
    try {
      const result = await runPass(imagePath, lang, psm, oem);
      const data = result.data || {};
      const text = (data.text || '').trim();
      const confidence = data.confidence ?? 0;
      passes.push({
        psm,
        psmLabel: PSM_LABELS[psm] ?? 'unknown',
        text,
        confidence,
        result, // keep full result for winner extraction
      });
    } catch (err) {
      passes.push({ psm, psmLabel: PSM_LABELS[psm] ?? 'unknown', error: String(err) });
    }
  }

  // Pick the pass with the highest confidence that also produced text
  const ranked = passes
    .filter((p) => p.text && !p.error)
    .sort((a, b) => b.confidence - a.confidence);

  const winner = ranked[0] || passes.find((p) => !p.error) || passes[0];

  return { passes, winner };
}

// ─── Main ─────────────────────────────────────────────────────────────────────

/**
 * CLI entry point: parse argv, run OCR (single- or multi-pass), and print the
 * result as one JSON document on stdout. Any failure is reported as
 * `{ error }` JSON on stderr with exit status 1.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const opts = parseArgs(process.argv);

  if (!opts.imagePath) {
    // LEGACY: exact same error format as original
    console.error(JSON.stringify({ error: 'missing image path' }));
    process.exit(1);
  }

  try {
    let winnerResult;
    let winnerPsm;
    let allPasses;

    if (opts.psm !== null) {
      // Single-pass mode (explicit --psm)
      const result = await runPass(opts.imagePath, opts.lang, opts.psm, opts.oem);
      winnerResult = result;
      winnerPsm = opts.psm;
      allPasses = [{
        psm: opts.psm,
        psmLabel: PSM_LABELS[opts.psm] ?? 'unknown',
        text: (result.data?.text || '').trim(),
        confidence: result.data?.confidence ?? null,
      }];
    } else {
      // Multi-pass mode: run all strategies, pick winner
      const { passes, winner } = await multiPass(opts.imagePath, opts.lang, opts.oem, opts);

      // When every pass failed the fallback winner has no result; surface the
      // underlying failures instead of an opaque TypeError further down.
      if (!winner || winner.error) {
        const reasons = passes.map((p) => `psm ${p.psm}: ${p.error}`).join('; ');
        throw new Error(`all OCR passes failed (${reasons})`);
      }

      winnerResult = winner.result;
      winnerPsm = winner.psm;
      allPasses = passes.map((p) => ({
        psm: p.psm,
        psmLabel: p.psmLabel,
        text: p.text ?? null,
        confidence: p.confidence ?? null,
        error: p.error ?? undefined, // undefined keys are dropped by JSON.stringify
      }));
    }

    const data = winnerResult?.data || {};

    // ── Extract all structured layers (always runs) ──────────────────────────
    const structured = extractStructured(winnerResult, opts);

    // ── Build output — LEGACY fields first, new fields appended ─────────────
    const output = {
      // ── LEGACY (always present, never moved or renamed) ───────────────────
      text: (data.text || '').trim(),
      confidence: data.confidence ?? null,

      // ── New metadata ──────────────────────────────────────────────────────
      version: PIPELINE_VERSION,
      lang: opts.lang,
      psm: winnerPsm,
      psmLabel: PSM_LABELS[winnerPsm] ?? 'unknown',
      oem: opts.oem,

      // ── Multi-pass summary ────────────────────────────────────────────────
      multiPass: allPasses,

      // ── Structured layers (conditionally omitted by flags) ────────────────
      ...(!opts.noWords && { words: structured.words }),
      ...(!opts.noLines && { lines: structured.lines }),
      ...(!opts.noParagraphs && { paragraphs: structured.paragraphs }),
      blocks: structured.blocks,

      // ── Optional raw formats ──────────────────────────────────────────────
      ...(opts.hocr && { hocr: data.hocr ?? null }),
      ...(opts.tsv && { tsv: data.tsv ?? null }),
    };

    // LEGACY: same stdout channel, same JSON shape (with additions)
    console.log(opts.pretty ? JSON.stringify(output, null, 2) : JSON.stringify(output));
  } catch (err) {
    // LEGACY: exact same error format as original
    console.error(JSON.stringify({ error: String(err) }));
    process.exit(1);
  }
}

main();