OCR_DATASET_MAKER / src /adapters /tesseract.ts
Omarrran's picture
OCR Dataset Generator for HF Spaces
24a732c
/**
* Tesseract Output Format Adapter
*
* Format options:
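* 1. .gt.txt files (ground truth text, one file per sample)
* 2. .box files (one evenly spaced bounding box per grapheme)
* 3. LSTM training format (.gt.txt plus a .box file that ends with a tab end-of-line marker)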
*/
import { mkdir, writeFile } from 'fs/promises';
import { join } from 'path';
/**
* Settings accepted by TesseractAdapter.
*
* Every field is optional; the adapter's constructor substitutes 'gt', 256 and
* 64 respectively when a field is missing or falsy.
*/
export interface TesseractOptions {
// Selects what addSample writes: 'gt' -> .gt.txt, 'box' -> .box, 'lstm' -> both files.
format?: 'gt' | 'box' | 'lstm';
// Total horizontal extent that is divided evenly among the graphemes of a sample.
boxWidth?: number; // Read by both the 'box' and the 'lstm' writers
// Value written as the "top" coordinate of every generated box (bottom is always 0).
boxHeight?: number;
}
/**
* Summary returned by TesseractAdapter.finalize().
*/
export interface TesseractOutput {
// Directory the adapter wrote into (the 'tesseract' folder under the caller's output dir).
outputDir: string;
// Number of samples added since the last init().
count: number;
// The format the adapter was configured with ('gt', 'box' or 'lstm').
format: string;
}
/**
 * Writes OCR samples to disk in one of the Tesseract training layouts.
 *
 * All files are placed in a `tesseract` sub-directory of the directory handed
 * to the constructor. Depending on the configured format, each sample yields:
 *   - `gt`:   `<name>.gt.txt` containing the ground-truth text;
 *   - `box`:  `<name>.box` with one box record per grapheme;
 *   - `lstm`: both files, with the box file closed by a tab end-of-line marker.
 *
 * Box geometry is synthetic: the configured width is split evenly between the
 * graphemes of the text and every box spans the full configured height. The
 * image itself is never inspected.
 */
export class TesseractAdapter {
  private readonly outputDir: string;
  private readonly options: Required<TesseractOptions>;
  private count = 0;

  /**
   * @param outputDir Parent directory; files go into `<outputDir>/tesseract`.
   * @param options   Format and box geometry; missing values get defaults.
   */
  constructor(outputDir: string, options: TesseractOptions = {}) {
    this.outputDir = join(outputDir, 'tesseract');
    // `||` rather than `??` is deliberate: a zero dimension is not a usable
    // box size, so falsy values fall back to the defaults as well.
    this.options = {
      format: options.format || 'gt',
      boxWidth: options.boxWidth || 256,
      boxHeight: options.boxHeight || 64,
    };
  }

  /**
   * Creates the output directory (including parents) and resets the sample
   * counter.
   */
  async init(): Promise<void> {
    await mkdir(this.outputDir, { recursive: true });
    this.count = 0;
  }

  /**
   * Writes the annotation file(s) for one sample and counts it.
   *
   * @param imageFilename Image file name; its last extension is stripped to
   *                      form the base name of the annotation files.
   * @param text          Ground-truth transcription of the image.
   * @param _imagePath    Accepted for interface compatibility; not used.
   * @throws Error if the adapter was configured with an unknown format. The
   *         sample is not counted in that case.
   */
  async addSample(imageFilename: string, text: string, _imagePath: string): Promise<void> {
    const baseName = imageFilename.replace(/\.[^.]+$/, '');
    const { format } = this.options;

    switch (format) {
      case 'gt':
        await this.writeGtFile(baseName, text);
        break;
      case 'box':
        await this.writeBoxFile(baseName, text);
        break;
      case 'lstm':
        await this.writeLstmFiles(baseName, text);
        break;
      default:
        // Only reachable when the type system was bypassed (e.g. a format
        // string read from config). Fail loudly instead of silently writing
        // nothing while still counting the sample.
        throw new Error(`Unsupported Tesseract format: ${String(format)}`);
    }

    this.count++;
  }

  /**
   * Writes `<baseName>.gt.txt` containing the ground-truth text verbatim.
   */
  private async writeGtFile(baseName: string, text: string): Promise<void> {
    const gtPath = join(this.outputDir, `${baseName}.gt.txt`);
    await writeFile(gtPath, text, 'utf-8');
  }

  /**
   * Builds one box record per grapheme of `text`.
   *
   * Record layout: `char left bottom right top page`, with bottom fixed at 0,
   * top at the configured box height and page at 0. Horizontal edges come from
   * slicing the configured box width into equal cells and rounding each edge.
   *
   * NOTE(review): a line-break grapheme in `text` is emitted verbatim as the
   * symbol, which would split a record across lines — confirm that callers
   * only pass single-line text.
   */
  private buildBoxLines(text: string): string[] {
    const { boxWidth, boxHeight } = this.options;
    // Segment by grapheme so combining marks stay attached to their base.
    const graphemes = [...new Intl.Segmenter('ar', { granularity: 'grapheme' }).segment(text)];
    // Guard against division by zero for empty text.
    const cellWidth = boxWidth / Math.max(graphemes.length, 1);

    return graphemes.map(({ segment }, index) => {
      const left = Math.round(index * cellWidth);
      const right = Math.round((index + 1) * cellWidth);
      return `${segment} ${left} 0 ${right} ${boxHeight} 0`;
    });
  }

  /**
   * Writes `<baseName>.box` with one record per grapheme.
   * Format: char left bottom right top page
   */
  private async writeBoxFile(baseName: string, text: string): Promise<void> {
    const boxPath = join(this.outputDir, `${baseName}.box`);
    await writeFile(boxPath, this.buildBoxLines(text).join('\n'), 'utf-8');
  }

  /**
   * Writes the LSTM training pair: `<baseName>.gt.txt` plus a `<baseName>.box`
   * whose final record is a tab symbol marking the end of the text line.
   */
  private async writeLstmFiles(baseName: string, text: string): Promise<void> {
    await this.writeGtFile(baseName, text);

    const { boxWidth, boxHeight } = this.options;
    const boxPath = join(this.outputDir, `${baseName}.box`);
    const lines = this.buildBoxLines(text);
    // End-of-line marker: a tab occupying the last unit of the box width.
    lines.push(`\t ${boxWidth - 1} 0 ${boxWidth} ${boxHeight} 0`);
    await writeFile(boxPath, lines.join('\n'), 'utf-8');
  }

  /**
   * Returns the path inside the output directory where the image for sample
   * `index` belongs: `<prefix>_<index zero-padded to 6 digits>.png`.
   */
  getImagePath(index: number, prefix: string = 'img'): string {
    const filename = `${prefix}_${String(index).padStart(6, '0')}.png`;
    return join(this.outputDir, filename);
  }

  /**
   * Reports where the samples were written, how many were added and in which
   * format. Performs no I/O.
   */
  async finalize(): Promise<TesseractOutput> {
    return {
      outputDir: this.outputDir,
      count: this.count,
      format: this.options.format,
    };
  }

  /**
   * Returns the number of samples added since the last `init()`.
   */
  getStats(): { count: number } {
    return { count: this.count };
  }
}
/**
 * Factory shorthand for constructing a TesseractAdapter.
 *
 * @param outputDir Parent directory under which the adapter writes its files.
 * @param options   Optional format and box settings; when omitted the
 *                  adapter's own defaults apply.
 * @returns A new adapter instance; `init()` has not been called on it.
 */
export function createTesseractAdapter(outputDir: string, options?: TesseractOptions): TesseractAdapter {
  const adapter = new TesseractAdapter(outputDir, options);
  return adapter;
}