import Tesseract from 'tesseract.js';
import { PDFDocument, StandardFonts, rgb, PDFFont } from 'pdf-lib';
import fontkit from '@pdf-lib/fontkit';
import * as pdfjsLib from 'pdfjs-dist';
import { getFontForLanguage } from './font-loader.js';
import { OcrPage, OcrLine } from '@/types';
import {
  parseHocrDocument,
  calculateWordTransform,
  calculateSpaceTransform,
} from './hocr-transform.js';
import { getPDFDocument } from './helpers.js';

/** Options accepted by {@link performOcr}. */
export interface OcrOptions {
  /** Tesseract language code(s); several codes are joined with `+`. */
  language: string;
  /** Scale factor handed to the pdf.js viewport when rasterizing each page. */
  resolution: number;
  /** When true, each rendered page is thresholded to black/white before OCR. */
  binarize: boolean;
  /** Value for `tessedit_char_whitelist`; an empty string leaves it unset. */
  whitelist: string;
  /** Optional callback receiving a status message and a progress value. */
  onProgress?: (status: string, progress: number) => void;
}

/** Result returned by {@link performOcr}. */
export interface OcrResult {
  /** The serialized output PDF. */
  pdfBytes: Uint8Array;
  /** The pdf-lib document the bytes were saved from. */
  pdfDoc: PDFDocument;
  /** Recognized text of every page, each page followed by a blank line. */
  fullText: string;
}

/** Control and bidi-formatting characters removed from a word before drawing. */
const STRIPPED_CHARS =
  /[\u0000-\u001F\u007F-\u009F\u200E\u200F\u202A-\u202E\uFEFF]/g;

/** Matches any character outside the 7-bit ASCII range. */
const NON_ASCII = /[^\u0000-\u007F]/;

const CJK_LANGS = ['jpn', 'chi_sim', 'chi_tra', 'kor'];
const INDIC_LANGS = [
  'hin',
  'ben',
  'guj',
  'kan',
  'mal',
  'ori',
  'pan',
  'tam',
  'tel',
  'sin',
];
/** Languages whose font is preferred when several languages are selected. */
const PRIORITY_LANGS = [...CJK_LANGS, ...INDIC_LANGS, 'ara', 'rus', 'ukr'];

/**
 * Thresholds the canvas in place: every pixel whose weighted RGB brightness
 * exceeds 128 becomes white, every other pixel becomes black. The alpha
 * channel is left untouched.
 */
function binarizeCanvas(ctx: CanvasRenderingContext2D) {
  const { width, height } = ctx.canvas;
  const imageData = ctx.getImageData(0, 0, width, height);
  const data = imageData.data;
  for (let i = 0; i < data.length; i += 4) {
    const brightness =
      0.299 * data[i] + 0.587 * data[i + 1] + 0.114 * data[i + 2];
    const color = brightness > 128 ? 255 : 0;
    data[i] = data[i + 1] = data[i + 2] = color;
  }
  ctx.putImageData(imageData, 0, 0);
}

/**
 * Draws the recognized words of one page onto `page` with opacity 0, so the
 * text is present in the PDF without being painted over the page image.
 *
 * @param page - pdf-lib page to draw on.
 * @param ocrPage - Parsed hOCR lines and words for this page.
 * @param pageHeight - Page height passed through to the transform helpers.
 * @param primaryFont - Font used for words containing non-ASCII characters.
 * @param latinFont - Font used for pure-ASCII words.
 */
function drawOcrTextLayer(
  page: ReturnType<PDFDocument['addPage']>,
  ocrPage: OcrPage,
  pageHeight: number,
  primaryFont: PDFFont,
  latinFont: PDFFont
): void {
  ocrPage.lines.forEach((line: OcrLine) => {
    const words = line.words;
    for (let i = 0; i < words.length; i++) {
      const word = words[i];
      const text = word.text.replace(STRIPPED_CHARS, '');
      if (!text.trim()) continue;

      const font = NON_ASCII.test(text) ? primaryFont : latinFont;
      if (!font) {
        console.warn('Font not available for text: "' + text + '"');
        continue;
      }

      // Measuring can throw for glyphs the font cannot encode; report a
      // zero width instead of aborting the whole page.
      const measure = (txt: string, size: number): number => {
        try {
          return font.widthOfTextAtSize(txt, size);
        } catch {
          return 0;
        }
      };

      const transform = calculateWordTransform(word, line, pageHeight, measure);
      if (transform.fontSize <= 0) continue;

      try {
        page.drawText(text, {
          x: transform.x,
          y: transform.y,
          font,
          size: transform.fontSize,
          color: rgb(0, 0, 0),
          opacity: 0,
        });
      } catch (error) {
        console.warn(`Could not draw text "${text}":`, error);
      }

      // Optionally emit an explicit space between this word and the next.
      if (line.injectWordBreaks && i < words.length - 1) {
        const nextWord = words[i + 1];
        const spaceTransform = calculateSpaceTransform(
          word,
          nextWord,
          line,
          pageHeight,
          (size: number) => measure(' ', size)
        );
        if (spaceTransform && spaceTransform.horizontalScale > 0.1) {
          try {
            page.drawText(' ', {
              x: spaceTransform.x,
              y: spaceTransform.y,
              font,
              size: spaceTransform.fontSize,
              color: rgb(0, 0, 0),
              opacity: 0,
            });
          } catch {
            console.warn('Could not draw space between words');
          }
        }
      }
    }
  });
}

/**
 * Chooses and embeds the fonts for the text layer. When an Indic language is
 * combined with a non-priority (Latin) language and no CJK language, a
 * separate Latin font is embedded; otherwise one font serves both roles.
 * Any failure falls back to standard Helvetica for both.
 */
async function embedOcrFonts(
  pdfDoc: PDFDocument,
  language: string
): Promise<{ primaryFont: PDFFont; latinFont: PDFFont }> {
  const selectedLangs = language.split('+');
  const primaryLang =
    selectedLangs.find((l) => PRIORITY_LANGS.includes(l)) ||
    selectedLangs[0] ||
    'eng';

  const hasCJK = selectedLangs.some((l) => CJK_LANGS.includes(l));
  const hasIndic = selectedLangs.some((l) => INDIC_LANGS.includes(l));
  // 'eng' is not a priority language, so this also covers an explicit 'eng'.
  const hasLatin = selectedLangs.some((l) => !PRIORITY_LANGS.includes(l));
  const isIndicPlusLatin = hasIndic && hasLatin && !hasCJK;

  try {
    if (isIndicPlusLatin) {
      const [scriptFontBytes, latinFontBytes] = await Promise.all([
        getFontForLanguage(primaryLang),
        getFontForLanguage('eng'),
      ]);
      const primaryFont = await pdfDoc.embedFont(scriptFontBytes, {
        subset: false,
      });
      const latinFont = await pdfDoc.embedFont(latinFontBytes, {
        subset: false,
      });
      return { primaryFont, latinFont };
    }

    const fontBytes = await getFontForLanguage(primaryLang);
    const font = await pdfDoc.embedFont(fontBytes, { subset: false });
    return { primaryFont: font, latinFont: font };
  } catch (e) {
    console.error('Font loading failed, falling back to Helvetica', e);
    const fallback = await pdfDoc.embedFont(StandardFonts.Helvetica);
    return { primaryFont: fallback, latinFont: fallback };
  }
}

/** Encodes the canvas as PNG and resolves with the raw bytes. */
function canvasToPngBytes(canvas: HTMLCanvasElement): Promise<Uint8Array> {
  return new Promise<Uint8Array>((resolve, reject) => {
    canvas.toBlob((blob) => {
      if (!blob) {
        reject(new Error('Failed to create image blob'));
        return;
      }
      const reader = new FileReader();
      reader.onload = () => {
        resolve(new Uint8Array(reader.result as ArrayBuffer));
      };
      reader.onerror = () => {
        reject(new Error('Failed to read image data'));
      };
      reader.readAsArrayBuffer(blob);
    }, 'image/png');
  });
}

/**
 * Rasterizes every page of a PDF, runs Tesseract OCR on it and builds a new
 * PDF in which each page is the rendered image with the recognized text drawn
 * on top at opacity 0.
 *
 * Output pages are sized to the rendered viewport, i.e. the source page
 * dimensions multiplied by `options.resolution`. When `options.binarize` is
 * set, the thresholded image is what gets both recognized and embedded.
 *
 * The Tesseract worker is terminated whether the run succeeds or fails,
 * including failures during parameter setup, PDF loading and font embedding.
 *
 * @param pdfBytes - Source PDF data.
 * @param options - OCR settings and optional progress callback.
 * @returns The saved PDF bytes, the pdf-lib document and the concatenated text.
 * @throws Error if a 2D canvas context or the page PNG cannot be created;
 *   errors from the underlying libraries propagate unchanged.
 */
export async function performOcr(
  pdfBytes: Uint8Array | ArrayBuffer,
  options: OcrOptions
): Promise<OcrResult> {
  const { language, resolution, binarize, whitelist, onProgress } = options;
  const progress = onProgress ?? (() => {});

  // Created before the worker so nothing that can fail sits between worker
  // creation and the try/finally that terminates it.
  const newPdfDoc = await PDFDocument.create();
  newPdfDoc.registerFontkit(fontkit);

  const worker = await Tesseract.createWorker(language, 1, {
    logger: (m: { status: string; progress: number }) => {
      progress(m.status, m.progress || 0);
    },
  });

  let fullText = '';

  try {
    await worker.setParameters({
      tessjs_create_hocr: '1',
      tessedit_pageseg_mode: Tesseract.PSM.AUTO,
    });

    if (whitelist) {
      await worker.setParameters({
        tessedit_char_whitelist: whitelist,
      });
    }

    const pdf = await getPDFDocument({ data: pdfBytes }).promise;

    progress('Loading fonts...', 0);
    const { primaryFont, latinFont } = await embedOcrFonts(newPdfDoc, language);

    for (let i = 1; i <= pdf.numPages; i++) {
      progress(
        `Processing page ${i} of ${pdf.numPages}`,
        (i - 1) / pdf.numPages
      );

      const page = await pdf.getPage(i);
      const viewport = page.getViewport({ scale: resolution });

      const canvas = document.createElement('canvas');
      canvas.width = viewport.width;
      canvas.height = viewport.height;
      const context = canvas.getContext('2d');
      if (!context) throw new Error('Failed to create canvas context');

      await page.render({ canvasContext: context, viewport, canvas }).promise;

      if (binarize) {
        binarizeCanvas(context);
      }

      const result = await worker.recognize(
        canvas,
        {},
        { text: true, hocr: true }
      );
      const data = result.data;

      const newPage = newPdfDoc.addPage([viewport.width, viewport.height]);

      const pngImageBytes = await canvasToPngBytes(canvas);

      // Release canvas memory
      canvas.width = 0;
      canvas.height = 0;

      const pngImage = await newPdfDoc.embedPng(pngImageBytes);
      newPage.drawImage(pngImage, {
        x: 0,
        y: 0,
        width: viewport.width,
        height: viewport.height,
      });

      if (data.hocr) {
        const ocrPage = parseHocrDocument(data.hocr);
        drawOcrTextLayer(
          newPage,
          ocrPage,
          viewport.height,
          primaryFont,
          latinFont
        );
      }

      fullText += data.text + '\n\n';
    }
  } finally {
    await worker.terminate();
  }

  const savedBytes = await newPdfDoc.save();

  return {
    pdfBytes: new Uint8Array(savedBytes),
    pdfDoc: newPdfDoc,
    fullText,
  };
}