Spaces:
Running
Running
| /** | |
| * Tesseract Output Format Adapter | |
| * | |
| * Format options: | |
| * 1. .gt.txt files (ground truth text) | |
| * 2. .box files (character-level bounding boxes) | |
| * 3. LSTM training format | |
| */ | |
| import { mkdir, writeFile } from 'fs/promises'; | |
| import { join } from 'path'; | |
/**
 * Configuration accepted by the Tesseract adapter.
 *
 * Every field is optional; the adapter's constructor substitutes
 * `'gt'`, `256` and `64` respectively for missing (or falsy) values.
 */
export interface TesseractOptions {
  /**
   * Which artifacts are written per sample:
   * `'gt'` -> `.gt.txt` only, `'box'` -> `.box` only,
   * `'lstm'` -> `.gt.txt` plus a `.box` with an end-of-line marker.
   */
  format?: 'gt' | 'box' | 'lstm';
  /**
   * Total width that the synthetic boxes are spread across
   * (used by the `'box'` and `'lstm'` formats; presumably pixels).
   */
  boxWidth?: number;
  /**
   * Height given to every synthetic box
   * (used by the `'box'` and `'lstm'` formats; presumably pixels).
   */
  boxHeight?: number;
}
/**
 * Summary returned by the adapter's `finalize()` once all samples are written.
 */
export interface TesseractOutput {
  /** Directory the artifacts were written to (the adapter's `tesseract` sub-directory). */
  outputDir: string;
  /** Number of samples added since the last `init()`. */
  count: number;
  /** The output format that was in effect (`'gt'`, `'box'` or `'lstm'`). */
  format: string;
}
/**
 * Writes Tesseract training artifacts (ground-truth text and/or box files)
 * for a stream of samples into `<outputDir>/tesseract`.
 *
 * Typical use: call `init()` once, `addSample()` for every sample, then
 * `finalize()` to obtain a summary.
 */
export class TesseractAdapter {
  /** Matches a grapheme that consists solely of a line terminator. */
  private static readonly LINE_BREAK = /^(?:\r\n|\r|\n)$/;

  private readonly outputDir: string;
  /** Options with every default already applied, so no field is undefined. */
  private readonly options: Required<TesseractOptions>;
  private count = 0;

  /**
   * @param outputDir Parent directory; artifacts go into its `tesseract` sub-directory.
   * @param options   Optional settings; missing or falsy values fall back to
   *                  `'gt'` / `256` / `64`.
   */
  constructor(outputDir: string, options: TesseractOptions = {}) {
    this.outputDir = join(outputDir, 'tesseract');
    // `||` (not `??`) is kept on purpose: a width or height of 0 would make
    // every box degenerate, so falsy values fall back to the defaults.
    this.options = {
      format: options.format || 'gt',
      boxWidth: options.boxWidth || 256,
      boxHeight: options.boxHeight || 64,
    };
  }

  /**
   * Create the output directory (including parents) and reset the sample counter.
   */
  async init(): Promise<void> {
    await mkdir(this.outputDir, { recursive: true });
    this.count = 0;
  }

  /**
   * Write the artifacts for one sample according to the configured format
   * and increment the sample counter.
   *
   * @param imageFilename Name of the sample's image; its last extension is
   *                      stripped to form the artifact base name
   *                      (presumably a bare file name, not a path).
   * @param text          Ground-truth transcription of the image.
   * @param _imagePath    Unused; kept for interface compatibility.
   * @throws Error if the configured format is not `'gt'`, `'box'` or `'lstm'`
   *               (only reachable from untyped callers).
   */
  async addSample(imageFilename: string, text: string, _imagePath: string): Promise<void> {
    const baseName = imageFilename.replace(/\.[^.]+$/, '');
    const format = this.options.format;

    switch (format) {
      case 'gt':
        await this.writeGtFile(baseName, text);
        break;
      case 'box':
        await this.writeBoxFile(baseName, text);
        break;
      case 'lstm':
        await this.writeLstmFiles(baseName, text);
        break;
      default:
        // Fail loudly instead of counting a sample for which nothing was written.
        throw new Error(`Unsupported Tesseract format: ${String(format)}`);
    }

    this.count++;
  }

  /**
   * Write `<baseName>.gt.txt` containing the ground-truth text verbatim.
   */
  private async writeGtFile(baseName: string, text: string): Promise<void> {
    const gtPath = join(this.outputDir, `${baseName}.gt.txt`);
    await writeFile(gtPath, text, 'utf-8');
  }

  /**
   * Build one box-file line per grapheme of `text`.
   *
   * The configured `boxWidth` is divided evenly between the graphemes and
   * every box spans the full `boxHeight`. Each line has the form
   * `char left bottom right top page` with `bottom` and `page` fixed at 0.
   *
   * Graphemes that are pure line terminators (`\n`, `\r`, `\r\n`) are
   * dropped: the box file is line-oriented, so emitting them would split an
   * entry across two physical lines and shrink every other box.
   */
  private buildBoxLines(text: string): string[] {
    const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
    const graphemes = Array.from(segmenter.segment(text), (part) => part.segment).filter(
      (grapheme) => !TesseractAdapter.LINE_BREAK.test(grapheme),
    );

    const { boxWidth, boxHeight } = this.options;
    // Math.max avoids a division by zero for empty text.
    const cellWidth = boxWidth / Math.max(graphemes.length, 1);

    return graphemes.map((grapheme, index) => {
      const left = Math.round(index * cellWidth);
      const right = Math.round((index + 1) * cellWidth);
      return `${grapheme} ${left} 0 ${right} ${boxHeight} 0`;
    });
  }

  /**
   * Write `<baseName>.box` with one evenly spaced box per grapheme.
   * Format: char left bottom right top page
   */
  private async writeBoxFile(baseName: string, text: string): Promise<void> {
    const boxPath = join(this.outputDir, `${baseName}.box`);
    await writeFile(boxPath, this.buildBoxLines(text).join('\n'), 'utf-8');
  }

  /**
   * Write the LSTM training pair: `<baseName>.gt.txt` plus a `<baseName>.box`
   * whose last entry is a tab-character box marking the end of the line.
   */
  private async writeLstmFiles(baseName: string, text: string): Promise<void> {
    await this.writeGtFile(baseName, text);

    const { boxWidth, boxHeight } = this.options;
    const lines = this.buildBoxLines(text);
    // End-of-line marker: a tab "character" occupying the last unit of width.
    lines.push(`\t ${boxWidth - 1} 0 ${boxWidth} ${boxHeight} 0`);

    const boxPath = join(this.outputDir, `${baseName}.box`);
    await writeFile(boxPath, lines.join('\n'), 'utf-8');
  }

  /**
   * Build the path of the image for sample `index` inside the output
   * directory, e.g. `img_000007.png` for index 7.
   *
   * @param index  Sample index; zero-padded to six digits.
   * @param prefix File name prefix (default `'img'`).
   */
  getImagePath(index: number, prefix: string = 'img'): string {
    const filename = `${prefix}_${String(index).padStart(6, '0')}.png`;
    return join(this.outputDir, filename);
  }

  /**
   * Return a summary of what was written. Performs no I/O.
   */
  async finalize(): Promise<TesseractOutput> {
    return {
      outputDir: this.outputDir,
      count: this.count,
      format: this.options.format,
    };
  }

  /**
   * Return the number of samples added since the last `init()`.
   */
  getStats(): { count: number } {
    return { count: this.count };
  }
}
/**
 * Factory helper: builds a `TesseractAdapter` for the given output directory.
 *
 * @param outputDir Parent directory passed straight to the adapter's constructor.
 * @param options   Optional adapter settings, forwarded unchanged.
 * @returns A new adapter instance; `init()` has not been called on it.
 */
export function createTesseractAdapter(outputDir: string, options?: TesseractOptions): TesseractAdapter {
  const adapter = new TesseractAdapter(outputDir, options);
  return adapter;
}