'use strict';

/**
 * ocr.js — Advanced Tesseract OCR pipeline
 *
 * BACKWARDS COMPATIBLE: original shape { text, confidence } is always present.
 * All new extraction passes always run and enrich the output alongside it.
 *
 * Usage:
 *   node ocr.js <imagePath> [options]
 *
 * Options (all optional — nothing breaks without them):
 *   --lang=<lang>          Tesseract language code(s), e.g. eng, eng+fra  (default: eng)
 *   --psm=<0-13>           Page segmentation mode (default: multi-pass best-wins)
 *   --oem=<0-3>            OCR engine mode: 0 = legacy, 1 = LSTM, 2 = legacy+LSTM, 3 = engine default (default: 3)
 *   --no-words             Omit per-word detail from output
 *   --no-lines             Omit per-line detail from output
 *   --no-paragraphs        Omit per-paragraph detail from output
 *   --hocr                 Include raw hOCR string in output
 *   --tsv                  Include raw TSV string in output
 *   --threshold=<0-100>    Minimum confidence to include a word (default: 0)
 *   --pretty               Pretty-print JSON output
 *
 * Output (always includes legacy fields, new fields added alongside):
 * {
 *   text:         string   — full extracted text (LEGACY)
 *   confidence:   number   — overall confidence 0-100 (LEGACY)
 *   lang:         string   — language(s) used
 *   psm:          number   — winning PSM mode used
 *   psmLabel:     string   — human-readable description of the winning PSM mode
 *   oem:          number   — OEM used
 *   words:        Word[]   — per-word detail (unless --no-words)
 *   lines:        Line[]   — per-line detail (unless --no-lines)
 *   paragraphs:   Para[]   — per-paragraph detail (unless --no-paragraphs)
 *   blocks:       Block[]  — per-block detail
 *   multiPass:    Pass[]   — all PSM attempts with their scores
 *   hocr:         string   — raw hOCR markup (if --hocr)
 *   tsv:          string   — raw TSV data (if --tsv)
 *   version:      string   — pipeline version for consumers
 * }
 */

const Tesseract = require('tesseract.js');

// ─── Pipeline version ────────────────────────────────────────────────────────
// Emitted verbatim as the `version` field of the JSON output.
const PIPELINE_VERSION = '2.0.0';

// ─── PSM descriptions (for metadata) ─────────────────────────────────────────
/**
 * Human-readable description for each page segmentation mode, keyed by the
 * numeric PSM value. Used only to label output (`psmLabel`); lookups for
 * unknown modes yield `undefined` and callers substitute 'unknown'.
 *
 * Frozen because it is a shared module-level lookup table that must never be
 * mutated at runtime.
 */
const PSM_LABELS = Object.freeze({
  0:  'Orientation and script detection only',
  1:  'Automatic page segmentation with OSD',
  2:  'Automatic page segmentation, no OSD or OCR',
  3:  'Fully automatic page segmentation, no OSD (default)',
  4:  'Assume a single column of text of variable sizes',
  5:  'Assume a single uniform block of vertically aligned text',
  6:  'Assume a single uniform block of text',
  7:  'Treat the image as a single text line',
  8:  'Treat the image as a single word',
  9:  'Treat the image as a single word in a circle',
  10: 'Treat the image as a single character',
  11: 'Sparse text - find as much text as possible',
  12: 'Sparse text with OSD',
  13: 'Raw line - treat the image as a single text line, bypassing Tesseract hacks',
});

// ─── Argument parser ──────────────────────────────────────────────────────────
/**
 * Parse the process argument vector into an options object.
 *
 * The first two entries of `argv` (runtime and script path) are skipped.
 * Any argument that does not start with `--` is taken as the image path
 * (the last one wins). Unknown `--` flags are ignored.
 *
 * Malformed option values never poison the result: a non-numeric `--psm`,
 * `--oem` or `--threshold`, or an empty `--lang=`, leaves the default in
 * place instead of storing NaN / an empty string.
 *
 * @param {string[]} argv - Full argument vector, e.g. `process.argv`.
 * @returns {{
 *   imagePath: (string|null), lang: string, psm: (number|null), oem: number,
 *   noWords: boolean, noLines: boolean, noParagraphs: boolean,
 *   hocr: boolean, tsv: boolean, threshold: number, pretty: boolean
 * }} Parsed options; `psm === null` selects multi-pass mode.
 */
function parseArgs(argv) {
  const opts = {
    imagePath:    null,
    lang:         'eng',
    psm:          null,   // null = multi-pass
    oem:          3,
    noWords:      false,
    noLines:      false,
    noParagraphs: false,
    hocr:         false,
    tsv:          false,
    threshold:    0,
    pretty:       false,
  };

  // Everything after the FIRST '=' is the value, so a value that itself
  // contains '=' is not truncated (split('=')[1] would cut it short).
  const valueOf = (arg) => arg.slice(arg.indexOf('=') + 1);

  // Assign a numeric option only when the text actually parsed to a number.
  const setNumber = (key, parsed) => {
    if (!Number.isNaN(parsed)) {
      opts[key] = parsed;
    }
  };

  for (const arg of argv.slice(2)) {
    if (arg.startsWith('--lang=')) {
      const lang = valueOf(arg);
      if (lang !== '') {
        opts.lang = lang;
      }
    } else if (arg.startsWith('--psm=')) {
      setNumber('psm', Number.parseInt(valueOf(arg), 10));
    } else if (arg.startsWith('--oem=')) {
      setNumber('oem', Number.parseInt(valueOf(arg), 10));
    } else if (arg === '--no-words') {
      opts.noWords = true;
    } else if (arg === '--no-lines') {
      opts.noLines = true;
    } else if (arg === '--no-paragraphs') {
      opts.noParagraphs = true;
    } else if (arg === '--hocr') {
      opts.hocr = true;
    } else if (arg === '--tsv') {
      opts.tsv = true;
    } else if (arg.startsWith('--threshold=')) {
      setNumber('threshold', Number.parseFloat(valueOf(arg)));
    } else if (arg === '--pretty') {
      opts.pretty = true;
    } else if (!arg.startsWith('--')) {
      opts.imagePath = arg;
    }
  }
  return opts;
}

// ─── Run a single Tesseract pass ──────────────────────────────────────────────
/**
 * Perform one OCR recognition over the image with a given segmentation and
 * engine mode.
 *
 * NOTE(review): whether the third argument of `Tesseract.recognize` is read
 * as engine parameters depends on the installed tesseract.js version (some
 * releases treat it as worker options and expect parameters to be set on a
 * worker instead) — confirm that `psm` / `oem` actually take effect.
 *
 * @param {string} imagePath - Image handed to Tesseract as-is.
 * @param {string} lang - Tesseract language code(s).
 * @param {number} psm - Page segmentation mode.
 * @param {number} oem - OCR engine mode.
 * @returns {Promise<object>} Whatever `Tesseract.recognize` resolves with.
 */
async function runPass(imagePath, lang, psm, oem) {
  const recognizeOptions = {
    tessedit_pageseg_mode: psm,
    tessedit_ocr_engine_mode: oem,
    // Rich output is always requested so word/line/block data can be extracted.
    tessjs_create_hocr: '1',
    tessjs_create_tsv: '1',
  };
  return Tesseract.recognize(imagePath, lang, recognizeOptions);
}

// ─── Extract structured data from a Tesseract result ─────────────────────────
/**
 * Flatten a recognition result into plain word / line / paragraph / block
 * summaries suitable for JSON output.
 *
 * Words whose confidence (missing counts as 0) is below `opts.threshold` are
 * dropped; lines, paragraphs and blocks are never filtered. Missing
 * collections are treated as empty, missing scalar fields become `null`.
 *
 * @param {object} result - Recognition result; only `result.data` is read.
 * @param {{threshold: number}} opts - Options; only `threshold` is read.
 * @returns {{words: object[], lines: object[], paragraphs: object[], blocks: object[]}}
 */
function extractStructured(result, opts) {
  const data = result.data || {};

  const orNull = (value) => value ?? null;

  // Copy just the four corner coordinates; anything else on the bbox is dropped.
  const copyBbox = (bbox) => {
    if (!bbox) {
      return null;
    }
    const { x0, y0, x1, y1 } = bbox;
    return { x0, y0, x1, y1 };
  };

  // Lines, paragraphs and blocks share one shape: text, confidence, bbox and
  // a count of their direct children.
  const summarize = (items, childKey, countKey) =>
    (items || []).map((item) => ({
      text:       item.text,
      confidence: orNull(item.confidence),
      bbox:       copyBbox(item.bbox),
      [countKey]: (item[childKey] || []).length,
    }));

  const meetsThreshold = (word) => (word.confidence ?? 0) >= opts.threshold;

  const describeWord = (word) => ({
    text:       word.text,
    confidence: orNull(word.confidence),
    bbox:       copyBbox(word.bbox),
    fontName:   orNull(word.font_name),
    fontSize:   orNull(word.font_size),
    bold:       orNull(word.is_bold),
    italic:     orNull(word.is_italic),
    underlined: orNull(word.is_underlined),
    inDict:     orNull(word.in_dict),
  });

  return {
    words:      (data.words || []).filter(meetsThreshold).map(describeWord),
    lines:      summarize(data.lines, 'words', 'wordCount'),
    paragraphs: summarize(data.paragraphs, 'lines', 'lineCount'),
    blocks:     summarize(data.blocks, 'paragraphs', 'paragraphCount'),
  };
}

// ─── Multi-pass strategy: try several PSMs, pick best confidence ──────────────
// Page segmentation modes tried, in this order, when no explicit --psm is
// given. Frozen: this is a shared constant and must not be mutated.
const MULTI_PASS_PSMS = Object.freeze([3, 6, 11, 4]); // ordered by general utility

/**
 * Run one recognition pass per entry of MULTI_PASS_PSMS, sequentially, and
 * choose a winner.
 *
 * A pass that throws is recorded as `{ psm, psmLabel, error }` and does not
 * stop the remaining passes. The winner is the highest-confidence pass that
 * produced non-empty text; failing that, the first pass that did not error;
 * failing that, the first recorded pass (which then carries `error` and no
 * `result`).
 *
 * @param {string} imagePath - Image to recognise.
 * @param {string} lang - Tesseract language code(s).
 * @param {number} oem - OCR engine mode.
 * @param {object} opts - Accepted for signature compatibility; not read here.
 * @returns {Promise<{passes: object[], winner: object}>}
 */
async function multiPass(imagePath, lang, oem, opts) {
  const passes = [];

  for (const psm of MULTI_PASS_PSMS) {
    const psmLabel = PSM_LABELS[psm] ?? 'unknown';
    let entry;
    try {
      const result = await runPass(imagePath, lang, psm, oem);
      const data = result.data || {};
      entry = {
        psm,
        psmLabel,
        text:       (data.text || '').trim(),
        confidence: data.confidence ?? 0,
        result,   // full result retained so the winner can be mined for detail
      };
    } catch (err) {
      entry = { psm, psmLabel, error: String(err) };
    }
    passes.push(entry);
  }

  // Preference order: best-confidence pass with text, then any successful
  // pass, then whatever was recorded first.
  const succeeded = passes.filter((pass) => !pass.error);
  const byConfidence = succeeded
    .filter((pass) => pass.text)
    .sort((left, right) => right.confidence - left.confidence);

  const winner = byConfidence[0] || succeeded[0] || passes[0];

  return { passes, winner };
}

// ─── Main ─────────────────────────────────────────────────────────────────────
/**
 * CLI entry point: parse arguments, run OCR (one explicit pass, or the
 * multi-pass strategy when no --psm is given), and print a single JSON
 * document on stdout.
 *
 * Failures are reported as `{ "error": "<message>" }` on stderr with exit
 * code 1. When every multi-pass attempt fails, the error message carries the
 * individual pass failures rather than an unrelated internal TypeError.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const opts = parseArgs(process.argv);

  if (!opts.imagePath) {
    // LEGACY: exact same error format as original
    console.error(JSON.stringify({ error: 'missing image path' }));
    process.exit(1);
  }

  try {
    let winnerResult;
    let winnerPsm;
    let allPasses;

    if (opts.psm !== null) {
      // Single-pass mode (explicit --psm)
      const result = await runPass(opts.imagePath, opts.lang, opts.psm, opts.oem);
      winnerResult = result;
      winnerPsm    = opts.psm;
      allPasses    = [{
        psm:        opts.psm,
        psmLabel:   PSM_LABELS[opts.psm] ?? 'unknown',
        text:       (result.data?.text || '').trim(),
        confidence: result.data?.confidence ?? null,
      }];
    } else {
      // Multi-pass mode: run all strategies, pick winner
      const { passes, winner } = await multiPass(opts.imagePath, opts.lang, opts.oem, opts);

      // multiPass falls back to an errored pass (no `result`) when nothing
      // succeeded. Surface the real failures instead of letting the
      // extraction step crash on an undefined result.
      if (!winner?.result) {
        const reasons = passes
          .filter((p) => p.error)
          .map((p) => `psm ${p.psm}: ${p.error}`)
          .join('; ');
        throw new Error(`all OCR passes failed${reasons ? ` (${reasons})` : ''}`);
      }

      winnerResult = winner.result;
      winnerPsm    = winner.psm;
      allPasses    = passes.map(p => ({
        psm:        p.psm,
        psmLabel:   p.psmLabel,
        text:       p.text        ?? null,
        confidence: p.confidence  ?? null,
        error:      p.error       ?? undefined, // undefined keys are dropped by JSON.stringify
      }));
    }

    const data = winnerResult?.data || {};

    // ── Extract all structured layers (always runs) ──────────────────────────
    const structured = extractStructured(winnerResult, opts);

    // ── Build output — LEGACY fields first, new fields appended ─────────────
    const output = {
      // ── LEGACY (always present, never moved or renamed) ───────────────────
      text:       (data.text || '').trim(),
      confidence: data.confidence ?? null,

      // ── New metadata ──────────────────────────────────────────────────────
      version:    PIPELINE_VERSION,
      lang:       opts.lang,
      psm:        winnerPsm,
      psmLabel:   PSM_LABELS[winnerPsm] ?? 'unknown',
      oem:        opts.oem,

      // ── Multi-pass summary ────────────────────────────────────────────────
      multiPass: allPasses,

      // ── Structured layers (conditionally omitted by flags) ────────────────
      // Spreading `false` adds nothing, so a set --no-* flag omits the key.
      ...(!opts.noWords       && { words:      structured.words      }),
      ...(!opts.noLines       && { lines:      structured.lines      }),
      ...(!opts.noParagraphs  && { paragraphs: structured.paragraphs }),
      blocks: structured.blocks,

      // ── Optional raw formats ──────────────────────────────────────────────
      ...(opts.hocr && { hocr: data.hocr ?? null }),
      ...(opts.tsv  && { tsv:  data.tsv  ?? null }),
    };

    // LEGACY: same stdout channel, same JSON shape (with additions)
    console.log(opts.pretty
      ? JSON.stringify(output, null, 2)
      : JSON.stringify(output));

  } catch (err) {
    // LEGACY: exact same error format as original
    console.error(JSON.stringify({ error: String(err) }));
    process.exit(1);
  }
}

// Start the CLI. Errors thrown inside main's try block are reported by main
// itself (JSON on stderr, exit code 1), so the returned promise is not awaited.
main();