Bentopdf / src /js /utils /ocr.ts
AUXteam's picture
Upload folder using huggingface_hub
1b756c8 verified
import Tesseract from 'tesseract.js';
import { PDFDocument, StandardFonts, rgb, PDFFont } from 'pdf-lib';
import fontkit from '@pdf-lib/fontkit';
import * as pdfjsLib from 'pdfjs-dist';
import { getFontForLanguage } from './font-loader.js';
import { OcrPage, OcrLine } from '@/types';
import {
parseHocrDocument,
calculateWordTransform,
calculateSpaceTransform,
} from './hocr-transform.js';
import { getPDFDocument } from './helpers.js';
/**
 * Settings accepted by `performOcr`.
 */
export interface OcrOptions {
/**
 * Language spec handed to the Tesseract worker. `performOcr` also splits it on
 * `+` (e.g. `eng+hin`) to decide which font(s) to embed for the text layer.
 */
language: string;
/** Scale factor passed to the pdf.js viewport when each page is rasterized. */
resolution: number;
/** When true, each rendered page is thresholded to pure black/white before recognition. */
binarize: boolean;
/** When non-empty, applied as Tesseract's `tessedit_char_whitelist` parameter. */
whitelist: string;
/**
 * Optional progress callback. Receives a status message and a numeric progress
 * value, both from the Tesseract logger and from the per-page loop.
 */
onProgress?: (status: string, progress: number) => void;
}
/**
 * Value resolved by `performOcr`.
 */
export interface OcrResult {
/** Serialized bytes of the newly built PDF (a copy of what `pdfDoc.save()` returned). */
pdfBytes: Uint8Array;
/** The pdf-lib document that was built, holding the page images and the text layer. */
pdfDoc: PDFDocument;
/** Recognized text of every page, in page order, each page followed by `\n\n`. */
fullText: string;
}
/**
 * Thresholds the whole canvas to pure black and white, in place.
 *
 * For every pixel a weighted brightness is computed from the red, green and
 * blue channels; pixels brighter than 128 become white (255) and all others
 * black (0). The alpha channel is left as it was.
 *
 * @param ctx - 2D context whose backing canvas is read and overwritten.
 */
function binarizeCanvas(ctx: CanvasRenderingContext2D) {
  const { width, height } = ctx.canvas;
  const frame = ctx.getImageData(0, 0, width, height);
  const pixels = frame.data;

  // Pixels are laid out as consecutive R, G, B, A quadruples.
  for (let offset = 0; offset < pixels.length; offset += 4) {
    const luma =
      0.299 * pixels[offset] +
      0.587 * pixels[offset + 1] +
      0.114 * pixels[offset + 2];
    const level = luma > 128 ? 255 : 0;
    // Overwrite R, G and B only; the end index is exclusive so alpha is skipped.
    pixels.fill(level, offset, offset + 3);
  }

  ctx.putImageData(frame, 0, 0);
}
/**
 * Writes the recognized words of one OCR page onto a PDF page as fully
 * transparent text.
 *
 * Each word is stripped of control and bidirectional-formatting characters,
 * positioned via `calculateWordTransform`, and drawn with opacity 0. When a
 * line has `injectWordBreaks` set, a transparent space is also drawn between
 * consecutive words, positioned via `calculateSpaceTransform`.
 *
 * @param page - PDF page that receives the text.
 * @param ocrPage - Parsed OCR result whose lines and words are drawn.
 * @param pageHeight - Page height forwarded to the transform helpers.
 * @param primaryFont - Font used for words containing any non-ASCII character.
 * @param latinFont - Font used for words made only of ASCII characters.
 */
function drawOcrTextLayer(
  page: ReturnType<typeof PDFDocument.prototype.addPage>,
  ocrPage: OcrPage,
  pageHeight: number,
  primaryFont: PDFFont,
  latinFont: PDFFont
): void {
  // Control characters, bidi marks/embeddings and the BOM are removed from each word.
  const unwantedChars =
    /[\u0000-\u001F\u007F-\u009F\u200E\u200F\u202A-\u202E\uFEFF]/g;
  const beyondAscii = /[^\u0000-\u007F]/;

  for (const line of ocrPage.lines) {
    const lineWords = line.words;
    const lastIndex = lineWords.length - 1;

    for (let index = 0; index <= lastIndex; index++) {
      const current = lineWords[index];
      const cleaned = current.text.replace(unwantedChars, '');
      if (!cleaned.trim()) {
        continue;
      }

      const font = beyondAscii.test(cleaned) ? primaryFont : latinFont;
      if (!font) {
        console.warn(`Font not available for text: "${cleaned}"`);
        continue;
      }

      // A font that cannot measure the given text reports a width of 0.
      const measure = (txt: string, size: number): number => {
        try {
          return font.widthOfTextAtSize(txt, size);
        } catch {
          return 0;
        }
      };

      const placement = calculateWordTransform(
        current,
        line,
        pageHeight,
        measure
      );
      if (placement.fontSize <= 0) {
        continue;
      }

      try {
        page.drawText(cleaned, {
          x: placement.x,
          y: placement.y,
          font,
          size: placement.fontSize,
          color: rgb(0, 0, 0),
          opacity: 0,
        });
      } catch (error) {
        console.warn(`Could not draw text "${cleaned}":`, error);
      }

      // A separator is only needed between words, and only when the line asks for it.
      if (!line.injectWordBreaks || index >= lastIndex) {
        continue;
      }

      const gap = calculateSpaceTransform(
        current,
        lineWords[index + 1],
        line,
        pageHeight,
        (size: number) => measure(' ', size)
      );
      if (!gap || !(gap.horizontalScale > 0.1)) {
        continue;
      }

      try {
        page.drawText(' ', {
          x: gap.x,
          y: gap.y,
          font,
          size: gap.fontSize,
          color: rgb(0, 0, 0),
          opacity: 0,
        });
      } catch {
        console.warn(`Could not draw space between words`);
      }
    }
  }
}
/**
 * Decides which language drives font selection and whether a separate Latin
 * font has to be embedded next to the script font.
 *
 * @param language - Tesseract language spec; several codes are joined with `+`.
 * @returns The language whose font becomes the primary font, and a flag that
 *   is true only for an Indic + Latin combination without any CJK language.
 */
function planOcrFonts(language: string): {
  primaryLang: string;
  needsLatinFont: boolean;
} {
  const selectedLangs = language.split('+');
  const cjkLangs = ['jpn', 'chi_sim', 'chi_tra', 'kor'];
  const indicLangs = [
    'hin',
    'ben',
    'guj',
    'kan',
    'mal',
    'ori',
    'pan',
    'tam',
    'tel',
    'sin',
  ];
  const priorityLangs = [...cjkLangs, ...indicLangs, 'ara', 'rus', 'ukr'];

  // `||` (not `??`) on purpose: an empty language code must fall through to 'eng'.
  const primaryLang =
    selectedLangs.find((l) => priorityLangs.includes(l)) ||
    selectedLangs[0] ||
    'eng';

  const hasCJK = selectedLangs.some((l) => cjkLangs.includes(l));
  const hasIndic = selectedLangs.some((l) => indicLangs.includes(l));
  // Every language outside the priority list (which includes 'eng') counts as Latin.
  const hasLatin = selectedLangs.some((l) => !priorityLangs.includes(l));

  return { primaryLang, needsLatinFont: hasIndic && hasLatin && !hasCJK };
}

/**
 * Embeds the font(s) used for the hidden text layer into `doc`.
 *
 * If loading or embedding fails, the error is logged and the standard
 * Helvetica font is used for both roles instead.
 *
 * @param doc - Document that receives the embedded fonts.
 * @param language - Tesseract language spec used to pick the fonts.
 * @returns The font for non-ASCII words and the font for ASCII-only words
 *   (the same object unless a separate Latin font was needed).
 */
async function embedOcrFonts(
  doc: PDFDocument,
  language: string
): Promise<{ primaryFont: PDFFont; latinFont: PDFFont }> {
  const { primaryLang, needsLatinFont } = planOcrFonts(language);

  try {
    if (needsLatinFont) {
      const [scriptFontBytes, latinFontBytes] = await Promise.all([
        getFontForLanguage(primaryLang),
        getFontForLanguage('eng'),
      ]);
      const primaryFont = await doc.embedFont(scriptFontBytes, {
        subset: false,
      });
      const latinFont = await doc.embedFont(latinFontBytes, {
        subset: false,
      });
      return { primaryFont, latinFont };
    }

    const fontBytes = await getFontForLanguage(primaryLang);
    const sharedFont = await doc.embedFont(fontBytes, { subset: false });
    return { primaryFont: sharedFont, latinFont: sharedFont };
  } catch (e) {
    console.error('Font loading failed, falling back to Helvetica', e);
    const fallbackFont = await doc.embedFont(StandardFonts.Helvetica);
    return { primaryFont: fallbackFont, latinFont: fallbackFont };
  }
}

/**
 * Encodes the current canvas contents as PNG and returns the raw bytes.
 *
 * @param canvas - Canvas to encode.
 * @throws Error if the browser produces no blob or the blob cannot be read.
 */
function canvasToPngBytes(canvas: HTMLCanvasElement): Promise<Uint8Array> {
  return new Promise<Uint8Array>((resolve, reject) => {
    canvas.toBlob((blob) => {
      if (!blob) {
        reject(new Error('Failed to create image blob'));
        return;
      }
      const reader = new FileReader();
      reader.onload = () => {
        resolve(new Uint8Array(reader.result as ArrayBuffer));
      };
      reader.onerror = () => {
        reject(new Error('Failed to read image data'));
      };
      reader.readAsArrayBuffer(blob);
    }, 'image/png');
  });
}

/**
 * Runs OCR over every page of a PDF and builds a new PDF in which each page
 * is the rendered page image with a transparent text layer drawn on top.
 *
 * Each source page is rasterized at `options.resolution`, optionally
 * binarized, recognized with Tesseract, embedded as a PNG and overlaid with
 * the recognized words.
 *
 * The Tesseract worker is terminated on every exit path once it has been
 * created, including failures while configuring it, opening the PDF or
 * loading fonts.
 *
 * NOTE(review): the pdf.js document obtained from `getPDFDocument` is not
 * explicitly destroyed here — confirm whether the helper or its callers are
 * expected to release it.
 *
 * @param pdfBytes - Bytes of the source PDF.
 * @param options - Language, rasterization and progress settings.
 * @returns The saved PDF bytes, the pdf-lib document, and the concatenated
 *   recognized text.
 * @throws Error when a 2D canvas context cannot be created or a page image
 *   cannot be encoded; errors from Tesseract, pdf.js and pdf-lib propagate.
 */
export async function performOcr(
  pdfBytes: Uint8Array | ArrayBuffer,
  options: OcrOptions
): Promise<OcrResult> {
  const { language, resolution, binarize, whitelist, onProgress } = options;
  const progress = onProgress ?? (() => {});

  // Created before the worker so nothing that can fail sits between worker
  // creation and the try/finally that guarantees its termination.
  const newPdfDoc = await PDFDocument.create();
  newPdfDoc.registerFontkit(fontkit);

  const worker = await Tesseract.createWorker(language, 1, {
    logger: (m: { status: string; progress: number }) => {
      progress(m.status, m.progress || 0);
    },
  });

  let fullText = '';
  try {
    await worker.setParameters({
      tessjs_create_hocr: '1',
      tessedit_pageseg_mode: Tesseract.PSM.AUTO,
    });
    if (whitelist) {
      await worker.setParameters({
        tessedit_char_whitelist: whitelist,
      });
    }

    const pdf = await getPDFDocument({ data: pdfBytes }).promise;

    progress('Loading fonts...', 0);
    const { primaryFont, latinFont } = await embedOcrFonts(
      newPdfDoc,
      language
    );

    for (let i = 1; i <= pdf.numPages; i++) {
      progress(
        `Processing page ${i} of ${pdf.numPages}`,
        (i - 1) / pdf.numPages
      );

      const page = await pdf.getPage(i);
      const viewport = page.getViewport({ scale: resolution });

      const canvas = document.createElement('canvas');
      canvas.width = viewport.width;
      canvas.height = viewport.height;
      const context = canvas.getContext('2d');
      if (!context) throw new Error('Failed to create canvas context');

      await page.render({ canvasContext: context, viewport, canvas }).promise;
      if (binarize) {
        binarizeCanvas(context);
      }

      const result = await worker.recognize(
        canvas,
        {},
        { text: true, hocr: true }
      );
      const data = result.data;

      const newPage = newPdfDoc.addPage([viewport.width, viewport.height]);
      const pngImageBytes = await canvasToPngBytes(canvas);

      // Release canvas memory
      canvas.width = 0;
      canvas.height = 0;

      const pngImage = await newPdfDoc.embedPng(pngImageBytes);
      newPage.drawImage(pngImage, {
        x: 0,
        y: 0,
        width: viewport.width,
        height: viewport.height,
      });

      if (data.hocr) {
        const ocrPage = parseHocrDocument(data.hocr);
        drawOcrTextLayer(
          newPage,
          ocrPage,
          viewport.height,
          primaryFont,
          latinFont
        );
      }

      fullText += data.text + '\n\n';
    }
  } finally {
    await worker.terminate();
  }

  const savedBytes = await newPdfDoc.save();
  return {
    pdfBytes: new Uint8Array(savedBytes),
    pdfDoc: newPdfDoc,
    fullText,
  };
}