| import { |
| BBox, |
| OcrLine, |
| OcrPage, |
| OcrWord, |
| WordTransform, |
| Baseline, |
| } from '@/types'; |
|
|
/**
 * Matches the `bbox x0 y0 x1 y1` property inside an hOCR `title` attribute.
 * The four capture groups are the unsigned integer coordinates, in that order.
 */
const BBOX_PATTERN = /bbox\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/;

/**
 * Matches the `baseline <slope> <intercept>` property inside an hOCR `title`
 * attribute. Both coefficients may be signed decimals; the intercept group
 * accepts a fractional part so values such as `-18.36` are captured whole
 * instead of being cut off at the decimal point.
 */
const BASELINE_PATTERN = /baseline\s+([-+]?\d*\.?\d*)\s+([-+]?\d*\.?\d+)/;

/**
 * Matches the `textangle <angle>` property inside an hOCR `title` attribute.
 * The single capture group is the (possibly signed, possibly decimal) angle.
 */
const TEXTANGLE_PATTERN = /textangle\s+([-+]?\d*\.?\d*)/;

/**
 * Extracts the bounding box from an hOCR `title` attribute value.
 *
 * @param title - Raw `title` attribute text (may contain other properties).
 * @returns The parsed box, or `null` when no `bbox` property is present.
 */
export function parseBBox(title: string): BBox | null {
  const match = BBOX_PATTERN.exec(title);
  if (match === null) return null;

  const [, x0, y0, x1, y1] = match;
  return {
    x0: parseInt(x0, 10),
    y0: parseInt(y0, 10),
    x1: parseInt(x1, 10),
    y1: parseInt(y1, 10),
  };
}

/**
 * Extracts the baseline coefficients from an hOCR `title` attribute value.
 *
 * Both coefficients are parsed as floating-point numbers so a fractional
 * intercept is preserved rather than truncated to an integer.
 *
 * @param title - Raw `title` attribute text (may contain other properties).
 * @returns The slope and intercept; both are `0` when the property is missing
 *   or a coefficient cannot be parsed.
 */
export function parseBaseline(title: string): Baseline {
  const match = BASELINE_PATTERN.exec(title);
  if (match === null) {
    return { slope: 0, intercept: 0 };
  }

  const [, slopeText, interceptText] = match;
  return {
    // `|| 0` is deliberate: it maps NaN (e.g. an empty or "." capture) to 0.
    slope: parseFloat(slopeText) || 0,
    intercept: parseFloat(interceptText) || 0,
  };
}

/**
 * Extracts the `textangle` value from an hOCR `title` attribute value.
 *
 * @param title - Raw `title` attribute text (may contain other properties).
 * @returns The angle as written in the title, or `0` when the property is
 *   missing or unparsable.
 */
export function parseTextangle(title: string): number {
  const match = TEXTANGLE_PATTERN.exec(title);
  if (match === null) return 0;

  // `|| 0` maps NaN (empty or "." capture) to 0.
  return parseFloat(match[1]) || 0;
}
|
|
/**
 * Reads the writing direction from an element's `dir` attribute.
 *
 * The comparison ignores case and surrounding whitespace, so `RTL` or ` rtl `
 * are recognised as right-to-left.
 *
 * @param element - Element whose `dir` attribute is inspected.
 * @returns `'rtl'` when the attribute says right-to-left; `'ltr'` for every
 *   other value, including a missing attribute.
 */
export function getTextDirection(element: Element): 'ltr' | 'rtl' {
  const dir = (element.getAttribute('dir') || '').trim().toLowerCase();
  return dir === 'rtl' ? 'rtl' : 'ltr';
}
|
|
/**
 * Decides whether word breaks should be injected between words, based on the
 * element's `lang` attribute.
 *
 * Word breaks are suppressed for the listed Chinese/Japanese/Korean codes. A
 * tag counts as one of them when it equals a listed code or extends it with a
 * `-` or `_` suffix (for example `zh-CN`, `ja-JP`, `chi_sim_vert`); matching
 * ignores case and surrounding whitespace.
 *
 * @param element - Element whose `lang` attribute is inspected.
 * @returns `false` for a recognised CJK language tag, `true` otherwise
 *   (including when the attribute is missing or empty).
 */
export function shouldInjectWordBreaks(element: Element): boolean {
  const cjkLangs = ['chi_sim', 'chi_tra', 'jpn', 'kor', 'zh', 'ja', 'ko'];
  const lang = (element.getAttribute('lang') || '').trim().toLowerCase();

  const isCjk = cjkLangs.some(
    (code) =>
      lang === code ||
      lang.startsWith(`${code}-`) ||
      lang.startsWith(`${code}_`),
  );
  return !isCjk;
}
|
|
/**
 * Applies Unicode NFKC normalization (compatibility decomposition followed by
 * canonical composition) to the given text.
 *
 * @param text - Text to normalize.
 * @returns The NFKC-normalized string.
 */
export function normalizeText(text: string): string {
  const normalized = text.normalize('NFKC');
  return normalized;
}
|
|
/**
 * Parses an hOCR document string into an {@link OcrPage}.
 *
 * Page size comes from the `bbox` of the first `.ocr_page` element (both
 * dimensions are `0` when the element or its bbox is missing). Lines are read
 * from elements carrying one of the line-level classes; lines without any
 * usable word are dropped. When the document has no line-level elements at
 * all, every `.ocrx_word` is collected into a single synthetic left-to-right
 * line whose bbox is the union of the word boxes.
 *
 * @param hocrText - hOCR markup; it is parsed as `text/html`.
 * @returns The page description. `dpi` is always reported as `72`.
 */
export function parseHocrDocument(hocrText: string): OcrPage {
  const doc = new DOMParser().parseFromString(hocrText, 'text/html');

  // Page dimensions from the first .ocr_page bbox, if any.
  const pageEl = doc.querySelector('.ocr_page');
  const pageBox = pageEl ? parseBBox(pageEl.getAttribute('title') || '') : null;
  const width = pageBox ? pageBox.x1 - pageBox.x0 : 0;
  const height = pageBox ? pageBox.y1 - pageBox.y0 : 0;

  const lineSelector = '.ocr_line, .ocr_textfloat, .ocr_header, .ocr_caption';
  const lineNodes = Array.from(doc.querySelectorAll(lineSelector));
  const lines: OcrLine[] = [];

  if (lineNodes.length > 0) {
    for (const node of lineNodes) {
      const parsed = parseHocrLine(node);
      if (parsed && parsed.words.length > 0) {
        lines.push(parsed);
      }
    }
  } else {
    // No line structure: fall back to one line holding every word on the page.
    const words = parseWordsFromElements(doc.querySelectorAll('.ocrx_word'));
    if (words.length > 0) {
      lines.push({
        bbox: calculateBoundingBox(words.map((word) => word.bbox)),
        baseline: { slope: 0, intercept: 0 },
        textangle: 0,
        words,
        direction: 'ltr',
        injectWordBreaks: true,
      });
    }
  }

  return { width, height, dpi: 72, lines };
}
|
|
/**
 * Builds an {@link OcrLine} from a line-level hOCR element.
 *
 * Geometry (bbox, baseline, textangle) is read from the element's own `title`
 * attribute. Direction and word-break policy are taken from the nearest
 * `.ocr_par` (as found by `closest`) or, failing that, the parent element;
 * with neither available the line defaults to `'ltr'` with word breaks
 * enabled.
 *
 * @param lineElement - The line-level element to parse.
 * @returns The parsed line, or `null` when the element has no `bbox`.
 */
function parseHocrLine(lineElement: Element): OcrLine | null {
  const title = lineElement.getAttribute('title') || '';

  const bbox = parseBBox(title);
  if (!bbox) return null;

  const context = lineElement.closest('.ocr_par') || lineElement.parentElement;

  return {
    bbox,
    baseline: parseBaseline(title),
    textangle: parseTextangle(title),
    words: parseWordsFromElements(lineElement.querySelectorAll('.ocrx_word')),
    direction: context ? getTextDirection(context) : 'ltr',
    injectWordBreaks: context ? shouldInjectWordBreaks(context) : true,
  };
}
|
|
/**
 * Converts word elements into {@link OcrWord} records, in document order.
 *
 * An element is skipped when its trimmed, normalized text is empty or when
 * its `title` carries no `bbox`. Confidence is the integer following
 * `x_wconf` in the title, or `0` when that property is absent.
 *
 * @param wordElements - Candidate word elements.
 * @returns The words that have both text and a bounding box.
 */
function parseWordsFromElements(wordElements: NodeListOf<Element>): OcrWord[] {
  const confidencePattern = /x_wconf\s+(\d+)/;
  const collected: OcrWord[] = [];

  for (const el of Array.from(wordElements)) {
    const text = normalizeText((el.textContent || '').trim());
    if (text.length === 0) continue;

    const title = el.getAttribute('title') || '';
    const bbox = parseBBox(title);
    if (!bbox) continue;

    const confHit = confidencePattern.exec(title);
    collected.push({
      text,
      bbox,
      confidence: confHit ? parseInt(confHit[1], 10) : 0,
    });
  }

  return collected;
}
|
|
/**
 * Computes the smallest box that encloses every box in `bboxes`.
 *
 * The union is accumulated in a single loop rather than by spreading the
 * coordinates into `Math.min` / `Math.max`, so the input length is not
 * limited by the engine's maximum argument count.
 *
 * @param bboxes - Boxes to merge.
 * @returns The enclosing box, or an all-zero box when `bboxes` is empty.
 */
function calculateBoundingBox(bboxes: BBox[]): BBox {
  if (bboxes.length === 0) {
    return { x0: 0, y0: 0, x1: 0, y1: 0 };
  }

  let x0 = Infinity;
  let y0 = Infinity;
  let x1 = -Infinity;
  let y1 = -Infinity;

  for (const box of bboxes) {
    x0 = Math.min(x0, box.x0);
    y0 = Math.min(y0, box.y0);
    x1 = Math.max(x1, box.x1);
    y1 = Math.max(y1, box.y1);
  }

  return { x0, y0, x1, y1 };
}
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
/**
 * Computes the placement of a word so that its rendered text fills the word's
 * bounding box.
 *
 * The font size starts at the box height and is rescaled by
 * `boxWidth / measuredWidth` for up to ten passes, stopping early once a pass
 * changes it by less than 1% or the measurement is not positive. The result
 * is clamped to `[1, 2 * boxHeight]`; any remaining width mismatch is
 * expressed as `horizontalScale`.
 *
 * @param word - Word to place; its `bbox` and `text` are used.
 * @param line - Line containing the word; supplies `textangle` and the
 *   baseline slope.
 * @param pageHeight - Page height used to flip the y coordinate
 *   (`y = pageHeight - bbox.y1`).
 * @param fontWidthFn - Measures the width of `text` at `fontSize`.
 * @returns Position (`x` is the box's left edge), font size, horizontal scale
 *   (`1` when the measured width is not positive) and rotation. Rotation is
 *   the baseline slope converted to degrees minus `line.textangle` —
 *   presumably `textangle` is also in degrees; confirm against the producer.
 */
export function calculateWordTransform(
  word: OcrWord,
  line: OcrLine,
  pageHeight: number,
  fontWidthFn: (text: string, fontSize: number) => number
): WordTransform {
  const { x0, x1, y0, y1 } = word.bbox;
  const targetWidth = x1 - x0;
  const targetHeight = y1 - y0;

  // Fit the font size to the box width, starting from the box height.
  let fontSize = targetHeight;
  let passesLeft = 10;
  while (passesLeft-- > 0) {
    const measured = fontWidthFn(word.text, fontSize);
    if (measured <= 0) break;

    const previous = fontSize;
    fontSize = previous * (targetWidth / measured);

    // Converged once the relative change drops below 1%.
    if (Math.abs(fontSize - previous) / previous < 0.01) break;
  }

  // Keep the size within [1, twice the box height].
  fontSize = Math.max(1, Math.min(fontSize, targetHeight * 2));

  // Absorb any remaining width mismatch with a horizontal stretch.
  const renderedWidth = fontWidthFn(word.text, fontSize);
  const horizontalScale = renderedWidth > 0 ? targetWidth / renderedWidth : 1;

  const slopeAngle = Math.atan(line.baseline.slope) * (180 / Math.PI);

  return {
    x: x0,
    // Distance of the box's y1 edge from the pageHeight edge.
    y: pageHeight - y1,
    fontSize,
    horizontalScale,
    rotation: -line.textangle + slopeAngle,
  };
}
|
|
/**
 * Computes the placement of a space glyph that fills the horizontal gap
 * between two consecutive words.
 *
 * The font size is the line's bbox height plus the baseline intercept, with a
 * floor of 1. The space starts at the previous word's `x1` and is stretched
 * horizontally to reach the next word's `x0`; its `y` is
 * `pageHeight - line.bbox.y1 - line.baseline.intercept`.
 *
 * @param prevWord - Word before the gap.
 * @param nextWord - Word after the gap.
 * @param line - Line containing both words.
 * @param pageHeight - Page height used to flip the y coordinate.
 * @param spaceWidthFn - Measures the width of a space at a given font size.
 *   It is not called when the words touch or overlap.
 * @returns The space placement, or `null` when the gap or the measured space
 *   width is not positive.
 */
export function calculateSpaceTransform(
  prevWord: OcrWord,
  nextWord: OcrWord,
  line: OcrLine,
  pageHeight: number,
  spaceWidthFn: (fontSize: number) => number
): { x: number; y: number; horizontalScale: number; fontSize: number } | null {
  const { bbox: lineBox, baseline } = line;
  const fontSize = Math.max(lineBox.y1 - lineBox.y0 + baseline.intercept, 1);

  const x = prevWord.bbox.x1;
  const gap = nextWord.bbox.x0 - x;
  if (gap <= 0) {
    return null;
  }

  const naturalWidth = spaceWidthFn(fontSize);
  if (naturalWidth <= 0) {
    return null;
  }

  return {
    x,
    y: pageHeight - lineBox.y1 - baseline.intercept,
    horizontalScale: gap / naturalWidth,
    fontSize,
  };
}
|
|