// Phillnet-2 / Tools / browser_ocr.js
// ayjays132's picture
// Upload 478 files
// 101858b verified
'use strict';
/**
* ocr.js — Advanced Tesseract OCR pipeline
*
* BACKWARDS COMPATIBLE: original shape { text, confidence } is always present.
* All new extraction passes always run and enrich the output alongside it.
*
* Usage:
* node ocr.js <imagePath> [options]
*
* Options (all optional — nothing breaks without them):
* --lang=<lang> Tesseract language code(s), e.g. eng, eng+fra (default: eng)
* --psm=<0-13> Page segmentation mode (default: multi-pass best-wins)
* --oem=<0-3> OCR engine mode (default: 3 = engine default, based on what is available; 2 = legacy+LSTM)
* --no-words Omit per-word detail from output
* --no-lines Omit per-line detail from output
* --no-paragraphs Omit per-paragraph detail from output
* --hocr Include raw hOCR string in output
* --tsv Include raw TSV string in output
* --threshold=<0-100> Minimum confidence to include a word (default: 0)
* --pretty Pretty-print JSON output
*
* Output (always includes legacy fields, new fields added alongside):
* {
* text: string — full extracted text (LEGACY)
* confidence: number — overall confidence 0-100 (LEGACY)
* lang: string — language(s) used
* psm: number — winning PSM mode used
* psmLabel: string — human-readable description of the winning PSM
* oem: number — OEM used
* words: Word[] — per-word detail (unless --no-words)
* lines: Line[] — per-line detail (unless --no-lines)
* paragraphs: Para[] — per-paragraph detail (unless --no-paragraphs)
* blocks: Block[] — per-block detail
* multiPass: Pass[] — all PSM attempts with their scores
* hocr: string — raw hOCR markup (if --hocr)
* tsv: string — raw TSV data (if --tsv)
* version: string — pipeline version for consumers
* }
*/
const Tesseract = require('tesseract.js');
// ─── Pipeline version ────────────────────────────────────────────────────────
/** Version string of this pipeline; reported to consumers in the `version` output field. */
const PIPELINE_VERSION = '2.0.0';

// ─── PSM descriptions (for metadata) ─────────────────────────────────────────
/**
 * Human-readable description for each page segmentation mode, keyed by the
 * PSM number (0-13). The list below is ordered so that an entry's position
 * in the array is its PSM number.
 */
const PSM_LABELS = Object.fromEntries(
  [
    'Orientation and script detection only',
    'Automatic page segmentation with OSD',
    'Automatic page segmentation, no OSD or OCR',
    'Fully automatic page segmentation, no OSD (default)',
    'Assume a single column of text of variable sizes',
    'Assume a single uniform block of vertically aligned text',
    'Assume a single uniform block of text',
    'Treat the image as a single text line',
    'Treat the image as a single word',
    'Treat the image as a single word in a circle',
    'Treat the image as a single character',
    'Sparse text - find as much text as possible',
    'Sparse text with OSD',
    'Raw line - treat the image as a single text line, bypassing Tesseract hacks',
  ].map((label, psm) => [psm, label]),
);
// ─── Argument parser ──────────────────────────────────────────────────────────
/**
 * Parse the command line into an options object.
 *
 * The first two entries of `argv` (runtime and script path) are skipped.
 * Unrecognised `--` flags are ignored, and the last bare (non `--`) argument
 * wins as the image path.
 *
 * Numeric options are validated against the ranges given in the usage header
 * (`--psm` 0-13, `--oem` 0-3, `--threshold` 0-100). A value that does not
 * parse, or that falls outside its range, is ignored and the previous value
 * (normally the default) is kept. This stops `NaN` leaking into the pipeline:
 * a `NaN` threshold would drop every word, and a `NaN` PSM is `!== null`, so
 * it would switch off multi-pass mode without selecting a real mode.
 *
 * @param {string[]} argv - Full argument vector, e.g. `process.argv`.
 * @returns {{imagePath: (string|null), lang: string, psm: (number|null),
 *   oem: number, noWords: boolean, noLines: boolean, noParagraphs: boolean,
 *   hocr: boolean, tsv: boolean, threshold: number, pretty: boolean}}
 *   Parsed options; `psm` is `null` when multi-pass mode should be used.
 */
function parseArgs(argv) {
  const opts = {
    imagePath: null,
    lang: 'eng',
    psm: null, // null = multi-pass
    oem: 3,
    noWords: false,
    noLines: false,
    noParagraphs: false,
    hocr: false,
    tsv: false,
    threshold: 0,
    pretty: false,
  };

  // Everything after the FIRST '=', so a value that itself contains '=' is not truncated.
  const valueOf = (arg) => arg.slice(arg.indexOf('=') + 1);

  // `parsed` when it is a finite number within [min, max], otherwise `fallback`.
  const inRange = (parsed, min, max, fallback) =>
    (Number.isFinite(parsed) && parsed >= min && parsed <= max ? parsed : fallback);

  for (const arg of argv.slice(2)) {
    if (arg.startsWith('--lang=')) {
      // An empty `--lang=` keeps the current language instead of passing '' on.
      opts.lang = valueOf(arg) || opts.lang;
    } else if (arg.startsWith('--psm=')) {
      opts.psm = inRange(Number.parseInt(valueOf(arg), 10), 0, 13, opts.psm);
    } else if (arg.startsWith('--oem=')) {
      opts.oem = inRange(Number.parseInt(valueOf(arg), 10), 0, 3, opts.oem);
    } else if (arg.startsWith('--threshold=')) {
      opts.threshold = inRange(Number.parseFloat(valueOf(arg)), 0, 100, opts.threshold);
    } else if (arg === '--no-words') {
      opts.noWords = true;
    } else if (arg === '--no-lines') {
      opts.noLines = true;
    } else if (arg === '--no-paragraphs') {
      opts.noParagraphs = true;
    } else if (arg === '--hocr') {
      opts.hocr = true;
    } else if (arg === '--tsv') {
      opts.tsv = true;
    } else if (arg === '--pretty') {
      opts.pretty = true;
    } else if (!arg.startsWith('--')) {
      opts.imagePath = arg;
    }
  }
  return opts;
}
// ─── Run a single Tesseract pass ──────────────────────────────────────────────
/**
 * Run one recognition pass over an image.
 *
 * @param {*} imagePath - Image to recognise; handed to `Tesseract.recognize` unchanged.
 * @param {string} lang - Language code(s), handed to `Tesseract.recognize` unchanged.
 * @param {number} psm - Page segmentation mode, sent as `tessedit_pageseg_mode`.
 * @param {number} oem - OCR engine mode, sent as `tessedit_ocr_engine_mode`.
 * @returns {Promise<object>} Whatever `Tesseract.recognize` resolves to.
 */
async function runPass(imagePath, lang, psm, oem) {
  // NOTE(review): in the tesseract.js releases I am aware of, the third argument
  // of the top-level `recognize()` is forwarded to the worker as *worker options*
  // (logger, paths, ...) rather than as engine parameters. If that holds for the
  // installed version, the PSM/OEM values below have no effect and every pass
  // produces the same output. Confirm against the installed version; the usual
  // alternative is `createWorker` + `worker.setParameters`.
  const settings = {
    tessedit_pageseg_mode: psm,
    tessedit_ocr_engine_mode: oem,
    // hOCR and TSV are requested on every pass so the rich output is
    // available regardless of which pass ends up being used.
    tessjs_create_hocr: '1',
    tessjs_create_tsv: '1',
  };
  return Tesseract.recognize(imagePath, lang, settings);
}
// ─── Extract structured data from a Tesseract result ─────────────────────────
/**
 * Flatten a recognition result into plain word / line / paragraph / block records.
 *
 * Only `result.data` is read; a missing `data`, or a missing layer inside it,
 * yields an empty array for that layer. Fields absent on a source entry are
 * reported as `null`.
 *
 * @param {object} result - Recognition result whose `data` holds the layers.
 * @param {{threshold: number}} opts - Words whose confidence (a missing
 *   confidence counts as 0) is below `opts.threshold` are left out. The
 *   threshold applies to words only.
 * @returns {{words: object[], lines: object[], paragraphs: object[], blocks: object[]}}
 */
function extractStructured(result, opts) {
  const page = result.data || {};

  // Copy just the four corner coordinates; a missing box becomes null.
  const toBox = (box) => (box ? { x0: box.x0, y0: box.y0, x1: box.x1, y1: box.y1 } : null);

  // Lines, paragraphs and blocks share one shape: text, confidence, bbox and
  // a count of their direct children.
  const summarise = (entries, countKey, childKey) =>
    (entries || []).map((entry) => ({
      text: entry.text,
      confidence: entry.confidence ?? null,
      bbox: toBox(entry.bbox),
      [countKey]: (entry[childKey] || []).length,
    }));

  const words = [];
  for (const word of page.words || []) {
    if ((word.confidence ?? 0) >= opts.threshold) {
      words.push({
        text: word.text,
        confidence: word.confidence ?? null,
        bbox: toBox(word.bbox),
        fontName: word.font_name ?? null,
        fontSize: word.font_size ?? null,
        bold: word.is_bold ?? null,
        italic: word.is_italic ?? null,
        underlined: word.is_underlined ?? null,
        inDict: word.in_dict ?? null,
      });
    }
  }

  const lines = summarise(page.lines, 'wordCount', 'words');
  const paragraphs = summarise(page.paragraphs, 'lineCount', 'lines');
  const blocks = summarise(page.blocks, 'paragraphCount', 'paragraphs');

  return { words, lines, paragraphs, blocks };
}
// ─── Multi-pass strategy: try several PSMs, pick best confidence ──────────────
/** PSMs tried in multi-pass mode, in the order they are attempted (most generally useful first). */
const MULTI_PASS_PSMS = [3, 6, 11, 4];

/**
 * Run one pass per entry of `MULTI_PASS_PSMS`, sequentially, and choose a winner.
 *
 * A pass that throws is recorded as `{ psm, psmLabel, error }` and does not
 * stop the remaining passes. The winner is the error-free pass with non-empty
 * text and the highest confidence (the earliest such pass on a tie). Failing
 * that it is the first error-free pass, and failing that the first pass.
 *
 * @param {*} imagePath - Image handed to each pass.
 * @param {string} lang - Language code(s) handed to each pass.
 * @param {number} oem - OCR engine mode handed to each pass.
 * @param {object} opts - Not read by this function; kept in the signature for callers.
 * @returns {Promise<{passes: object[], winner: object}>} Every attempt, plus
 *   the chosen one. Successful entries carry `text`, `confidence` and the
 *   full `result`.
 */
async function multiPass(imagePath, lang, oem, opts) {
  const passes = [];

  for (const psm of MULTI_PASS_PSMS) {
    const psmLabel = PSM_LABELS[psm] ?? 'unknown';
    try {
      const result = await runPass(imagePath, lang, psm, oem);
      const payload = result.data || {};
      passes.push({
        psm,
        psmLabel,
        text: (payload.text || '').trim(),
        confidence: payload.confidence ?? 0,
        result, // full result is kept so the winner's layers can be extracted later
      });
    } catch (err) {
      passes.push({ psm, psmLabel, error: String(err) });
    }
  }

  // Single scan for the best text-producing pass. Requiring a strictly
  // positive difference keeps the earliest pass when confidences tie.
  let best;
  for (const candidate of passes) {
    const usable = candidate.text && !candidate.error;
    if (usable && (best === undefined || candidate.confidence - best.confidence > 0)) {
      best = candidate;
    }
  }

  const winner = best || passes.find((p) => !p.error) || passes[0];
  return { passes, winner };
}
// ─── Main ─────────────────────────────────────────────────────────────────────
/**
 * CLI entry point.
 *
 * Parses `process.argv`, runs either a single pass (explicit `--psm`) or the
 * multi-pass strategy, and prints one JSON document to stdout. The legacy
 * `text` and `confidence` fields always come first; newer fields follow.
 *
 * On failure a JSON object `{ error: string }` is written to stderr and the
 * process exits with code 1. This covers a missing image path, a failed
 * single pass, and multi-pass mode when every pass failed. In the last case
 * the message lists each pass's own error, where the run previously died
 * with an unrelated TypeError.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const opts = parseArgs(process.argv);
  if (!opts.imagePath) {
    // LEGACY: exact same error format as original
    console.error(JSON.stringify({ error: 'missing image path' }));
    process.exit(1);
  }

  try {
    let winnerResult;
    let winnerPsm;
    let allPasses;

    if (opts.psm !== null) {
      // Single-pass mode (explicit --psm)
      const result = await runPass(opts.imagePath, opts.lang, opts.psm, opts.oem);
      winnerResult = result;
      winnerPsm = opts.psm;
      allPasses = [{
        psm: opts.psm,
        psmLabel: PSM_LABELS[opts.psm] ?? 'unknown',
        text: (result.data?.text || '').trim(),
        confidence: result.data?.confidence ?? null,
      }];
    } else {
      // Multi-pass mode: run all strategies, pick winner
      const { passes, winner } = await multiPass(opts.imagePath, opts.lang, opts.oem, opts);

      // A winner without a result means no pass succeeded. Report the real
      // per-pass errors instead of failing later on a missing `result`.
      if (!winner?.result) {
        const reasons = passes
          .map((p) => `psm ${p.psm}: ${p.error ?? 'no result'}`)
          .join('; ');
        throw new Error(`all OCR passes failed (${reasons})`);
      }

      winnerResult = winner.result;
      winnerPsm = winner.psm;
      allPasses = passes.map((p) => ({
        psm: p.psm,
        psmLabel: p.psmLabel,
        text: p.text ?? null,
        confidence: p.confidence ?? null,
        // undefined for successful passes, so JSON.stringify omits the key
        error: p.error ?? undefined,
      }));
    }

    const data = winnerResult?.data || {};

    // Extract all structured layers (always runs)
    const structured = extractStructured(winnerResult, opts);

    // Build output: LEGACY fields first, new fields appended
    const output = {
      // LEGACY (always present, never moved or renamed)
      text: (data.text || '').trim(),
      confidence: data.confidence ?? null,
      // New metadata
      version: PIPELINE_VERSION,
      lang: opts.lang,
      psm: winnerPsm,
      psmLabel: PSM_LABELS[winnerPsm] ?? 'unknown',
      oem: opts.oem,
      // Multi-pass summary
      multiPass: allPasses,
      // Structured layers (conditionally omitted by flags)
      ...(!opts.noWords && { words: structured.words }),
      ...(!opts.noLines && { lines: structured.lines }),
      ...(!opts.noParagraphs && { paragraphs: structured.paragraphs }),
      blocks: structured.blocks,
      // Optional raw formats
      ...(opts.hocr && { hocr: data.hocr ?? null }),
      ...(opts.tsv && { tsv: data.tsv ?? null }),
    };

    // LEGACY: same stdout channel, same JSON shape (with additions)
    console.log(opts.pretty
      ? JSON.stringify(output, null, 2)
      : JSON.stringify(output));
  } catch (err) {
    // LEGACY: exact same error format as original
    console.error(JSON.stringify({ error: String(err) }));
    process.exit(1);
  }
}

// Argument parsing runs outside main's try/catch, so guard the returned promise
// too: any unexpected rejection is still reported in the legacy error format.
main().catch((err) => {
  console.error(JSON.stringify({ error: String(err) }));
  process.exit(1);
});