// OCR_DATASET_MAKER — web/lib/generator.ts
// Source revision: "Update OCR Dataset Generator" (commit 004fbbb)
import JSZip from 'jszip'
import { saveAs } from 'file-saver'
import { getWorkerPool, isWorkerRenderingAvailable, WorkerTask, WorkerResult } from './worker-pool'
import { isWebGLAvailable, applyGPUAugmentation, GPUAugmentOptions } from './gpu-augmentation'
import { StorageManager, StorageMode, getStorageManager, StoredSample } from './storage-manager'
// ============================================
// Canvas Pool for Performance
// ============================================

// A reusable canvas plus its 2D context, tracked by the pool below so that
// repeated renders of same-sized samples avoid allocating new canvases.
interface PooledCanvas {
  canvas: HTMLCanvasElement
  ctx: CanvasRenderingContext2D
  inUse: boolean // true while handed out by acquireCanvas()
  width: number // dimensions the canvas was created with
  height: number
}

// Shared pool; bounded so no more than MAX_POOL_SIZE canvases are kept alive.
const canvasPool: PooledCanvas[] = []
const MAX_POOL_SIZE = 8
/**
 * Hand out a canvas of the requested size, reusing pooled canvases when possible.
 *
 * Strategy: (1) reuse an idle pooled canvas with matching dimensions,
 * (2) grow the pool while under MAX_POOL_SIZE, (3) resize an idle pooled
 * canvas of a different size (setting width/height clears the canvas and
 * resets its context state), and only as a last resort (4) create an
 * unpooled throwaway canvas that is garbage-collected after use.
 */
function acquireCanvas(width: number, height: number): { canvas: HTMLCanvasElement; ctx: CanvasRenderingContext2D } {
  // (1) Find available canvas with matching dimensions
  for (const item of canvasPool) {
    if (!item.inUse && item.width === width && item.height === height) {
      item.inUse = true
      item.ctx.clearRect(0, 0, width, height)
      item.ctx.setTransform(1, 0, 0, 1, 0, 0) // Reset any leftover transform
      return { canvas: item.canvas, ctx: item.ctx }
    }
  }
  // (2) Create new canvas if pool not full
  if (canvasPool.length < MAX_POOL_SIZE) {
    const canvas = document.createElement('canvas')
    canvas.width = width
    canvas.height = height
    const ctx = canvas.getContext('2d', { willReadFrequently: true })!
    const pooled: PooledCanvas = { canvas, ctx, inUse: true, width, height }
    canvasPool.push(pooled)
    return { canvas, ctx }
  }
  // (3) Pool full: repurpose an idle canvas of a different size instead of
  // allocating garbage. (Fix: previously a throwaway canvas was created even
  // when idle pooled canvases existed, defeating pooling for mixed sizes.)
  for (const item of canvasPool) {
    if (!item.inUse) {
      item.inUse = true
      item.canvas.width = width // resizing implicitly clears the canvas
      item.canvas.height = height // and resets the 2D context state
      item.width = width
      item.height = height
      return { canvas: item.canvas, ctx: item.ctx }
    }
  }
  // (4) Every pooled canvas is busy: create a temporary canvas (will be GC'd)
  const canvas = document.createElement('canvas')
  canvas.width = width
  canvas.height = height
  const ctx = canvas.getContext('2d', { willReadFrequently: true })!
  return { canvas, ctx }
}
/**
 * Return a canvas to the pool so a later acquireCanvas() can reuse it.
 * Unpooled (temporary overflow) canvases are simply ignored.
 */
function releaseCanvas(canvas: HTMLCanvasElement): void {
  const pooled = canvasPool.find((item) => item.canvas === canvas)
  if (pooled) {
    pooled.inUse = false
  }
}
/** Drop every pooled canvas so the browser can reclaim their memory. */
export function clearCanvasPool(): void {
  canvasPool.splice(0, canvasPool.length)
}
// A font choice plus how often it should be used across the dataset.
export interface FontData {
  name: string // display name; key for loadedFonts and usage stats
  family: string // CSS font-family used when rendering
  percentage: number // selection weight in percent (see selectFont)
  dataUrl?: string // optional embedded font file (data: URL) loaded via FontFace
}
// Full configuration for one dataset generation run.
export interface GeneratorConfig {
  dataset: {
    size: number // requested sample count (clamped to available text)
    seed: number // base seed for all deterministic randomness
  }
  input: {
    segmentation: string // segmentation mode (see parseTextFile)
    textData?: string[] // optional pre-parsed samples
  }
  image: {
    width: number
    height: number
    background: string // fallback CSS background color
    backgroundStyle?: string // key into backgroundColors (single mode)
    backgroundMode?: 'single' | 'mix'
    backgroundPercentages?: Record<string, number> // styleId -> weight % (mix mode)
    customBackgrounds?: { name: string; dataUrl: string; percentage: number }[]
    textColor: string
    direction: string // 'ltr' | 'rtl'; controls text anchor and ctx.direction
  }
  fonts: {
    distribution: FontData[] // weighted font choices
  }
  augmentation: {
    enabled: boolean
    applyPercentage: number // % of samples that receive augmentation
    preset: string
    customMode?: boolean // when true, `values` overrides the built-in defaults
    values?: Record<string, number>
  }
  output: {
    // label formats: crnn/paddleocr/trocr/jsonl/csv/json/huggingface
    formats: string[]
  }
}
// Statistics and packaged output of a completed generation run.
export interface GenerationResult {
  stats: {
    total_samples: number
    duration_seconds: number
    samples_per_second: number
    font_distribution: { family: string; count: number; percentage: number }[]
    clean_samples: number // samples rendered without augmentation
    augmented_samples: number
    augmentation_stats: { name: string; count: number; percentage: number }[]
    avg_transforms_per_sample: number
    unique_tokens: number // distinct text strings rendered
    avg_chars_per_sample: number
    unicode_valid: number // currently always set to total_samples
    script_pure: number // currently always set to total_samples
    rejected_samples: number // currently always 0 (no rejection implemented)
    background_distribution?: { name: string; count: number; percentage: number }[]
  }
  zipBlob: Blob // the complete dataset archive
}
// Generation state for pause/resume functionality.
// The array/map fields are shared by reference with generateDataset's locals,
// so they stay live while generation proceeds; scalar counters are refreshed
// after each batch.
export interface GenerationState {
  currentIndex: number // next sample index to generate
  labels: string[] // tab-separated label lines accumulated so far
  fontUsageCounts: Record<string, number>
  augmentationCounts: Record<string, number>
  cleanCount: number
  augmentedCount: number
  totalChars: number
  uniqueTexts: Set<string>
  backgroundCounts: Record<string, number>
  startTime: number // epoch ms when generation began
  zip: JSZip // in-progress archive (images already added)
}

// Abort controller for cancellation.
// NOTE(review): abortController is never assigned or read anywhere in this
// file — cancellation flows through the abortSignal parameter of
// generateDataset instead. Candidate for removal.
let abortController: AbortController | null = null
let currentState: GenerationState | null = null

// Get current generation state for external access (null when idle).
export function getGenerationState(): GenerationState | null {
  return currentState
}
// Result from rendering a single sample (for parallel processing).
interface SampleResult {
  index: number
  filename: string // e.g. "image_000042.png"
  blob: Blob // encoded PNG
  label: string // tab-separated label line consumed by buildLabelFiles
  fontName: string
  augmentations: string[] // names of augmentations actually applied
  backgroundStyle: string
  isAugmented: boolean
}
// Simple seeded random number generator
function seededRandom(seed: number) {
let s = seed
return function () {
s = Math.sin(s) * 10000
return s - Math.floor(s)
}
}
/**
 * Build an independent deterministic RNG for one sample index so samples can
 * be rendered in parallel or out of order while staying reproducible.
 */
function seededRandomForIndex(baseSeed: number, index: number) {
  const derivedSeed = baseSeed + index * 1000
  return seededRandom(derivedSeed)
}
/**
 * Load a font for rendering. When a dataUrl is provided the font is
 * registered via the CSS Font Loading API; otherwise the declared system
 * family is used. Falls back to 'Arial' on load failure or missing family.
 */
async function loadFont(font: FontData): Promise<string> {
  if (!font.dataUrl) {
    return font.family || 'Arial'
  }
  try {
    const face = new FontFace(font.family, `url(${font.dataUrl})`)
    await face.load()
    document.fonts.add(face)
    return font.family
  } catch (err) {
    console.warn(`Failed to load font ${font.name}:`, err)
    return 'Arial'
  }
}
/**
 * Pick a font from the weighted distribution using a single random draw.
 * Weights are cumulative percentages; the final font absorbs any remainder.
 * Returns a default Arial entry when the distribution is empty.
 */
function selectFont(fonts: FontData[], random: () => number): FontData {
  if (!fonts?.length) {
    return { name: 'Default', family: 'Arial', percentage: 100 }
  }
  const roll = random() * 100
  let acc = 0
  const hit = fonts.find((font) => {
    acc += font.percentage
    return roll < acc
  })
  return hit ?? fonts[fonts.length - 1]
}
/**
 * Apply geometric augmentations (rotation, skew) to the canvas transform.
 * Each augmentation fires with 50% probability when its strength is non-zero.
 * The order and count of `random` draws is part of the reproducibility
 * contract: rotation consumes up to two draws, then skew up to two.
 * @returns the names of the augmentations actually applied.
 */
function applyAugmentation(
  ctx: CanvasRenderingContext2D,
  canvas: HTMLCanvasElement,
  augValues: Record<string, number>,
  random: () => number
): string[] {
  const applied: string[] = []
  const halfW = canvas.width / 2
  const halfH = canvas.height / 2
  // Rotation: up to ±augValues.rotation degrees about the canvas center
  if (augValues.rotation && random() > 0.5) {
    const degrees = (random() - 0.5) * 2 * augValues.rotation
    ctx.translate(halfW, halfH)
    ctx.rotate(degrees * Math.PI / 180)
    ctx.translate(-halfW, -halfH)
    applied.push('rotation')
  }
  // Skew: small vertical shear proportional to augValues.skew
  if (augValues.skew && random() > 0.5) {
    const shear = (random() - 0.5) * augValues.skew * 0.01
    ctx.transform(1, shear, 0, 1, 0, 0)
    applied.push('skew')
  }
  return applied
}
// Background style colors
const backgroundColors: Record<string, string> = {
clean_white: '#FFFFFF',
aged_paper: '#F5E6D3',
book_page: '#FAF0E6',
newspaper: '#E8E8E8',
notebook: '#FFFEF0',
parchment: '#F0E68C',
weathered: '#D4C4A8',
coffee_stain: '#E6D5C3',
old_book: '#E8DCC4',
recycled: '#D9D4C5',
cream: '#FFFDD0',
ivory: '#FFFFF0',
}
// Get background color based on style (supports mix mode with percentages)
function getBackgroundColor(config: GeneratorConfig, random?: () => number): string {
// Check if mix mode is enabled
if (config.image.backgroundMode === 'mix' && config.image.backgroundPercentages && random) {
const percentages = config.image.backgroundPercentages as Record<string, number>
const roll = random() * 100
let cumulative = 0
for (const [styleId, percentage] of Object.entries(percentages)) {
cumulative += percentage
if (roll < cumulative && backgroundColors[styleId]) {
return backgroundColors[styleId]
}
}
}
// Single mode or fallback
if (config.image.backgroundStyle && backgroundColors[config.image.backgroundStyle]) {
return backgroundColors[config.image.backgroundStyle]
}
return config.image.background || '#FFFFFF'
}
/** Decode a data URL into an HTMLImageElement; rejects on decode failure. */
async function loadImage(dataUrl: string): Promise<HTMLImageElement> {
  return new Promise((resolve, reject) => {
    const img = new Image()
    img.addEventListener('load', () => resolve(img), { once: true })
    img.addEventListener('error', reject, { once: true })
    img.src = dataUrl
  })
}
/**
 * Choose a background for one sample from the combined pool of named styles
 * and user-supplied custom images, weighted by their percentages.
 * Returns either a solid color or an image dataUrl, plus the style name used
 * for statistics. Falls back to the single configured style (or white) when
 * no weights are set or the weighted roll misses every entry.
 */
function selectBackground(config: GeneratorConfig, random: () => number): { type: 'color', value: string, styleName: string } | { type: 'image', value: string, styleName: string } {
  const customBgs = config.image.customBackgrounds?.filter((bg) => bg.percentage > 0) || []
  const stylePercentages = config.image.backgroundPercentages || {}
  const styleTotal = Object.values(stylePercentages).reduce((sum, p) => sum + p, 0)
  const customTotal = customBgs.reduce((sum, bg) => sum + bg.percentage, 0)
  const grandTotal = styleTotal + customTotal

  // Single-style (or default) fallback used when weighting is absent/missed
  const fallback = (): { type: 'color', value: string, styleName: string } => {
    const style = config.image.backgroundStyle || 'clean_white'
    return { type: 'color', value: backgroundColors[style] || config.image.background || '#FFFFFF', styleName: style }
  }
  if (grandTotal === 0) {
    return fallback()
  }

  // Weighted roll across styles first, then custom images
  const roll = random() * grandTotal
  let acc = 0
  for (const [styleId, pct] of Object.entries(stylePercentages)) {
    acc += pct
    if (roll < acc && backgroundColors[styleId]) {
      return { type: 'color', value: backgroundColors[styleId], styleName: styleId }
    }
  }
  for (const bg of customBgs) {
    acc += bg.percentage
    if (roll < acc) {
      return { type: 'image', value: bg.dataUrl, styleName: bg.name }
    }
  }
  return fallback()
}
/**
 * Render one text sample onto a pooled canvas and encode it as a PNG blob.
 *
 * Pipeline: background (solid color or custom image) → optional geometric
 * augmentations (rotation/skew) → text draw → optional pixel augmentations
 * (brightness/noise; single WebGL pass when available, CPU loops otherwise).
 *
 * The pooled canvas is returned to the pool inside the toBlob callback on
 * success, or in the catch block on failure. The order of `random` draws is
 * part of the reproducibility contract — do not reorder the calls below.
 *
 * @returns the encoded PNG, the names of augmentations actually applied,
 *          and the background style that was selected.
 */
async function renderTextToCanvas(
  text: string,
  config: GeneratorConfig,
  fontFamily: string,
  shouldAugment: boolean,
  augValues: Record<string, number>,
  random: () => number
): Promise<{ blob: Blob; augmentations: string[]; backgroundStyle: string }> {
  // Use canvas pool for better performance
  const { canvas, ctx } = acquireCanvas(config.image.width, config.image.height)
  // Declare variables outside try so the toBlob closure below can read them
  let appliedAugmentations: string[] = []
  let bg: ReturnType<typeof selectBackground>
  try {
    // Select and apply background
    bg = selectBackground(config, random)
    if (bg.type === 'color') {
      ctx.fillStyle = bg.value
      ctx.fillRect(0, 0, canvas.width, canvas.height)
    } else {
      // Draw custom background image
      try {
        const img = await loadImage(bg.value)
        ctx.drawImage(img, 0, 0, canvas.width, canvas.height)
      } catch {
        // Fallback to white if image fails
        ctx.fillStyle = '#FFFFFF'
        ctx.fillRect(0, 0, canvas.width, canvas.height)
      }
    }
    // Apply augmentation transforms if enabled (state saved so the transform
    // can be undone after the text is drawn)
    if (shouldAugment && config.augmentation.enabled) {
      ctx.save()
      appliedAugmentations = applyAugmentation(ctx, canvas, augValues, random)
    }
    // Set text properties with the selected font (capped at 48px / 60% height)
    const fontSize = Math.min(canvas.height * 0.6, 48)
    ctx.font = `${fontSize}px "${fontFamily}", Arial, sans-serif`
    ctx.fillStyle = config.image.textColor
    ctx.textAlign = config.image.direction === 'rtl' ? 'right' : 'left'
    ctx.textBaseline = 'middle'
    // Draw text anchored 10px from the leading edge of the writing direction
    const x = config.image.direction === 'rtl'
      ? canvas.width - 10
      : 10
    const y = canvas.height / 2
    ctx.direction = config.image.direction as CanvasDirection
    ctx.fillText(text, x, y)
    if (shouldAugment && config.augmentation.enabled) {
      ctx.restore()
    }
    // Apply post-processing augmentations (GPU-accelerated when available)
    if (shouldAugment && config.augmentation.enabled) {
      const applyBrightness = augValues.brightness && random() > 0.5
      const applyNoise = augValues.gaussian_noise && random() > 0.6
      if (applyBrightness || applyNoise) {
        // Try GPU-accelerated augmentation first
        const useGPU = isWebGLAvailable()
        if (useGPU) {
          // GPU path - apply all augmentations in a single GPU pass
          const gpuOptions: GPUAugmentOptions = {
            brightness: applyBrightness ? (random() - 0.5) * augValues.brightness / 50 : 0,
            contrast: 1, // Could add contrast augmentation here
            noiseAmount: applyNoise ? augValues.gaussian_noise / 200 : 0,
            seed: random() * 1000
          }
          const gpuResult = applyGPUAugmentation(canvas, gpuOptions)
          if (gpuResult && gpuResult !== canvas) {
            // Copy GPU result back to main canvas
            const gpuCtx = gpuResult.getContext('webgl')
            if (gpuCtx) {
              const pixels = new Uint8Array(canvas.width * canvas.height * 4)
              gpuCtx.readPixels(0, 0, canvas.width, canvas.height, gpuCtx.RGBA, gpuCtx.UNSIGNED_BYTE, pixels)
              // Flip Y while copying: WebGL's origin is bottom-left whereas
              // the 2D canvas origin is top-left
              const imageData = ctx.createImageData(canvas.width, canvas.height)
              const rowSize = canvas.width * 4
              for (let y = 0; y < canvas.height; y++) {
                const srcRow = (canvas.height - 1 - y) * rowSize
                const dstRow = y * rowSize
                imageData.data.set(pixels.subarray(srcRow, srcRow + rowSize), dstRow)
              }
              ctx.putImageData(imageData, 0, 0)
              if (applyBrightness) appliedAugmentations.push('brightness_gpu')
              if (applyNoise) appliedAugmentations.push('noise_gpu')
            }
          }
        } else {
          // CPU fallback path
          if (applyBrightness) {
            // Multiplicative brightness within ±(brightness/100) around 1.0
            const adjustment = 1 + (random() - 0.5) * augValues.brightness / 50
            const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height)
            for (let i = 0; i < imageData.data.length; i += 4) {
              imageData.data[i] = Math.min(255, imageData.data[i] * adjustment)
              imageData.data[i + 1] = Math.min(255, imageData.data[i + 1] * adjustment)
              imageData.data[i + 2] = Math.min(255, imageData.data[i + 2] * adjustment)
            }
            ctx.putImageData(imageData, 0, 0)
            appliedAugmentations.push('brightness')
          }
          if (applyNoise) {
            // Uniform (not strictly Gaussian) additive noise on RGB channels
            const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height)
            const noiseLevel = augValues.gaussian_noise / 2
            for (let i = 0; i < imageData.data.length; i += 4) {
              const noise = (random() - 0.5) * noiseLevel
              imageData.data[i] = Math.max(0, Math.min(255, imageData.data[i] + noise))
              imageData.data[i + 1] = Math.max(0, Math.min(255, imageData.data[i + 1] + noise))
              imageData.data[i + 2] = Math.max(0, Math.min(255, imageData.data[i + 2] + noise))
            }
            ctx.putImageData(imageData, 0, 0)
            appliedAugmentations.push('noise')
          }
        }
      }
    }
    // Convert to blob and release canvas when done
    return new Promise((resolve, reject) => {
      canvas.toBlob(
        (blob) => {
          releaseCanvas(canvas) // Return canvas to pool
          if (blob) {
            resolve({ blob, augmentations: appliedAugmentations, backgroundStyle: bg.styleName })
          } else {
            reject(new Error('Failed to convert canvas to blob'))
          }
        },
        'image/png'
      )
    })
  } catch (err) {
    releaseCanvas(canvas) // Ensure canvas is released on error
    throw err
  }
}
/**
 * Render a single dataset sample (for parallel processing).
 * Uses an index-derived RNG so samples are reproducible regardless of the
 * order in which they complete. The returned label line is
 * "<filename>\t<text>", the format buildLabelFiles expects.
 */
async function renderSample(
  index: number,
  text: string,
  config: GeneratorConfig,
  loadedFonts: Map<string, string>,
  augValues: Record<string, number>,
  baseSeed: number
): Promise<SampleResult> {
  // Use index-specific random generator for reproducibility
  const random = seededRandomForIndex(baseSeed, index)
  // Select font
  const selectedFont = selectFont(config.fonts.distribution, random)
  const fontFamily = loadedFonts.get(selectedFont.name) || selectedFont.family || 'Arial'
  // Determine augmentation
  const shouldAugment = config.augmentation.enabled &&
    (random() * 100) < config.augmentation.applyPercentage
  // Render
  const { blob, augmentations, backgroundStyle } = await renderTextToCanvas(
    text,
    config,
    fontFamily,
    shouldAugment,
    augValues,
    random
  )
  const filename = `image_${String(index).padStart(6, '0')}.png`
  return {
    index,
    filename,
    blob,
    // FIX: label must embed the actual filename — it previously emitted the
    // literal placeholder "$(unknown)", corrupting every label file format.
    label: `${filename}\t${text}`,
    fontName: selectedFont.name,
    augmentations,
    backgroundStyle,
    isAugmented: shouldAugment
  }
}
/**
 * Sanity-check the configuration against the supplied text data.
 * Returns { valid: false, error } for fatal problems; may include an
 * adjustedSize when the requested dataset exceeds the available samples.
 * Mismatched background percentages only produce a console warning.
 */
function validateInputs(config: GeneratorConfig, textData: string[]): { valid: boolean; error?: string; adjustedSize?: number } {
  const available = textData?.length ?? 0
  if (available === 0) {
    return { valid: false, error: 'No text data provided. Please upload a text file.' }
  }
  if (config.dataset.size <= 0) {
    return { valid: false, error: 'Dataset size must be greater than 0.' }
  }
  // In mix mode, warn (non-fatal) when percentages don't sum to ~100%
  if (config.image.backgroundMode === 'mix' && config.image.backgroundPercentages) {
    const styleTotal = Object.values(config.image.backgroundPercentages).reduce((a, b) => a + b, 0)
    const customTotal = config.image.customBackgrounds?.reduce((a, b) => a + b.percentage, 0) || 0
    const combined = styleTotal + customTotal
    if (Math.abs(combined - 100) > 1 && combined > 0) {
      console.warn(`Background percentages total ${combined}%, expected 100%`)
    }
  }
  // Clamp the dataset size to the number of available samples
  if (config.dataset.size > available) {
    console.warn(`Dataset size (${config.dataset.size}) exceeds available samples (${available}). Adjusting to ${available}.`)
    return { valid: true, adjustedSize: available }
  }
  return { valid: true }
}
/**
 * Write label files for the requested output formats into the ZIP.
 * Each entry in `labels` is "<filename>\t<text>"; the text may itself contain
 * tabs, so everything after the first tab is treated as label text.
 * Large datasets are joined in chunks to avoid one huge intermediate string.
 *
 * FIX: the jsonl/csv/json/huggingface writers previously emitted the literal
 * placeholder path "images/$(unknown)" and ignored the parsed filename.
 */
function buildLabelFiles(labels: string[], formats: string[], zip: JSZip) {
  console.log(`Building label files for ${labels.length} samples, formats: ${formats.join(', ')}`)
  const CHUNK_SIZE = 10000
  // Split "<filename>\t<text>" into its two parts (text may contain tabs)
  const splitLabel = (label: string): { filename: string; text: string } => {
    const parts = label.split('\t')
    return { filename: parts[0], text: parts.slice(1).join('\t') }
  }
  // CRNN/PaddleOCR format: labels.txt (raw tab-separated lines)
  if (formats.includes('crnn') || formats.includes('paddleocr')) {
    // Process in chunks to avoid string length limits
    const chunks: string[] = []
    for (let i = 0; i < labels.length; i += CHUNK_SIZE) {
      chunks.push(labels.slice(i, i + CHUNK_SIZE).join('\n'))
    }
    zip.file('labels.txt', chunks.join('\n'))
    console.log(`Created labels.txt with ${labels.length} entries`)
  }
  // TrOCR/JSONL format: data.jsonl (one JSON object per line)
  if (formats.includes('trocr') || formats.includes('jsonl')) {
    const chunks: string[] = []
    for (let i = 0; i < labels.length; i += CHUNK_SIZE) {
      const chunk = labels.slice(i, i + CHUNK_SIZE).map((label) => {
        const { filename, text } = splitLabel(label)
        return JSON.stringify({ image: `images/${filename}`, text: text })
      })
      chunks.push(chunk.join('\n'))
    }
    zip.file('data.jsonl', chunks.join('\n'))
    console.log(`Created data.jsonl with ${labels.length} entries`)
  }
  // CSV format: data.csv (quotes in text escaped by doubling)
  if (formats.includes('csv')) {
    const chunks: string[] = ['image,text']
    for (let i = 0; i < labels.length; i += CHUNK_SIZE) {
      const chunk = labels.slice(i, i + CHUNK_SIZE).map(label => {
        const { filename, text } = splitLabel(label)
        return `"images/${filename}","${text.replace(/"/g, '""')}"`
      })
      chunks.push(chunk.join('\n'))
    }
    zip.file('data.csv', chunks.join('\n'))
    console.log(`Created data.csv with ${labels.length} entries`)
  }
  // JSON format: data.json (pretty-printed array)
  if (formats.includes('json')) {
    const jsonData = labels.map((label) => {
      const { filename, text } = splitLabel(label)
      return { image: `images/${filename}`, text: text }
    })
    zip.file('data.json', JSON.stringify(jsonData, null, 2))
    console.log(`Created data.json with ${labels.length} entries`)
  }
  // HuggingFace format: metadata.csv
  if (formats.includes('huggingface')) {
    const chunks: string[] = ['file_name,text']
    for (let i = 0; i < labels.length; i += CHUNK_SIZE) {
      const chunk = labels.slice(i, i + CHUNK_SIZE).map((label) => {
        const { filename, text } = splitLabel(label)
        return `"images/${filename}","${text.replace(/"/g, '""')}"`
      })
      chunks.push(chunk.join('\n'))
    }
    zip.file('metadata.csv', chunks.join('\n'))
    console.log(`Created metadata.csv for HuggingFace with ${labels.length} entries`)
  }
}
/**
 * Generate the full OCR dataset and package it as a ZIP.
 *
 * Flow: validate → load fonts → render every sample (Web Workers with
 * OffscreenCanvas when available, otherwise batched on the main thread) →
 * build label files → write metadata.json → compress (level chosen by size).
 *
 * @param config      full generator configuration (dataset.size may be clamped)
 * @param textData    pre-segmented text samples (see parseTextFile)
 * @param onProgress  callback receiving (percent 0-100, status message)
 * @param abortSignal optional signal, checked between main-thread batches only
 * @throws Error when validation fails or the ZIP cannot be created
 */
export async function generateDataset(
  config: GeneratorConfig,
  textData: string[],
  onProgress: (progress: number, message: string) => void,
  abortSignal?: AbortSignal
): Promise<GenerationResult> {
  const startTime = Date.now()
  // Validate inputs
  const validation = validateInputs(config, textData)
  if (!validation.valid) {
    throw new Error(validation.error)
  }
  if (validation.adjustedSize) {
    config.dataset.size = validation.adjustedSize
  }
  const zip = new JSZip()
  const imagesFolder = zip.folder('images')!
  // Get augmentation values (custom values when provided, else these defaults)
  const augValues = config.augmentation.customMode && config.augmentation.values
    ? config.augmentation.values
    : {
        rotation: 5,
        skew: 3,
        gaussian_blur: 1,
        motion_blur: 3,
        gaussian_noise: 10,
        brightness: 15,
        contrast: 20,
        jpeg_quality: 70,
      }
  // Determine number of samples
  const numSamples = Math.min(config.dataset.size, textData.length)
  // Determine batch size based on available resources and dataset size
  // Smaller batches for very large datasets to manage memory
  const cpuCores = typeof navigator !== 'undefined' ? (navigator.hardwareConcurrency || 4) : 4
  let BATCH_SIZE = Math.min(cpuCores, 8)
  // For very large datasets, use smaller batches to manage memory
  if (numSamples > 50000) {
    BATCH_SIZE = Math.min(BATCH_SIZE, 4)
    console.log(`Large dataset detected (${numSamples}), reducing batch size for memory management`)
  }
  console.log(`Starting generation: ${numSamples} samples, batch size: ${BATCH_SIZE}`)
  // Load all fonts before starting
  onProgress(0, 'Loading fonts...')
  const loadedFonts: Map<string, string> = new Map()
  if (config.fonts.distribution && config.fonts.distribution.length > 0) {
    for (const font of config.fonts.distribution) {
      try {
        const loadedFamily = await loadFont(font)
        loadedFonts.set(font.name, loadedFamily)
        onProgress(0, `Loaded font: ${font.name}`)
      } catch (err) {
        console.warn(`Failed to load font ${font.name}, using fallback`)
        loadedFonts.set(font.name, 'Arial')
      }
    }
  }
  // Initialize tracking
  let cleanCount = 0
  let augmentedCount = 0
  const augmentationCounts: Record<string, number> = {}
  const fontUsageCounts: Record<string, number> = {}
  const backgroundCounts: Record<string, number> = {}
  const labels: string[] = []
  let totalChars = 0
  const uniqueTexts = new Set<string>()
  // Store current state for pause/resume.
  // Arrays/maps are shared by reference so currentState stays live; the
  // scalar counters are refreshed explicitly after each batch below.
  currentState = {
    currentIndex: 0,
    labels,
    fontUsageCounts,
    augmentationCounts,
    cleanCount,
    augmentedCount,
    totalChars,
    uniqueTexts,
    backgroundCounts,
    startTime,
    zip
  }
  // Check if Web Workers with OffscreenCanvas are available
  const useWorkers = isWorkerRenderingAvailable()
  const workerPool = useWorkers ? getWorkerPool() : null
  const renderMode = useWorkers ? `Web Workers (${workerPool?.getWorkerCount()} threads)` : `Main Thread (${BATCH_SIZE} parallel)`
  console.log(`Starting generation: ${numSamples} samples, mode: ${renderMode}`)
  // Generate samples
  onProgress(1, `Starting generation (${renderMode})...`)
  if (useWorkers && workerPool) {
    // === WEB WORKER RENDERING ===
    // Prepare all tasks upfront for worker distribution
    const tasks: WorkerTask[] = []
    const random = seededRandom(config.dataset.seed)
    for (let i = 0; i < numSamples; i++) {
      const text = textData[i % textData.length]
      // Select font and background for this sample.
      // NOTE(review): these lambdas re-seed on every invocation, so repeated
      // calls would return the same value — acceptable here because
      // selectFont/selectBackground each draw only once.
      const selectedFont = selectFont(config.fonts.distribution, () => seededRandom(config.dataset.seed + i * 1000)())
      const fontFamily = loadedFonts.get(selectedFont.name) || selectedFont.family || 'Arial'
      const bg = selectBackground(config, () => seededRandom(config.dataset.seed + i * 2000)())
      const shouldAugment = config.augmentation.enabled &&
        (seededRandom(config.dataset.seed + i * 3000)() * 100) < config.augmentation.applyPercentage
      tasks.push({
        id: i,
        index: i,
        text,
        config: {
          width: config.image.width,
          height: config.image.height,
          textColor: config.image.textColor,
          direction: config.image.direction,
          backgroundStyle: bg.styleName,
          backgroundColor: bg.type === 'color' ? bg.value : '#FFFFFF'
        },
        fontFamily,
        shouldAugment,
        augValues,
        seed: config.dataset.seed
      })
      // Track font usage
      fontUsageCounts[selectedFont.name] = (fontUsageCounts[selectedFont.name] || 0) + 1
    }
    onProgress(5, `Prepared ${numSamples} tasks, dispatching to ${workerPool.getWorkerCount()} workers...`)
    try {
      // Process all tasks with workers
      let processedCount = 0
      const results = await workerPool.processTasks(tasks, (result: WorkerResult) => {
        processedCount++
        // Track progress (throttled to every 100 completions)
        if (processedCount % 100 === 0 || processedCount >= numSamples) {
          const progress = (processedCount / numSamples) * 85 + 5
          onProgress(progress, `Generated ${processedCount}/${numSamples} samples (Workers: ${workerPool.getWorkerCount()} threads)`)
        }
      })
      // Process results (failed samples are skipped, not retried)
      for (const result of results) {
        if (result.error) {
          console.warn(`Sample ${result.index} failed:`, result.error)
          continue
        }
        uniqueTexts.add(textData[result.index % textData.length])
        totalChars += textData[result.index % textData.length].length
        backgroundCounts[result.backgroundStyle] = (backgroundCounts[result.backgroundStyle] || 0) + 1
        if (result.isAugmented) {
          augmentedCount++
          result.augmentations.forEach(aug => {
            augmentationCounts[aug] = (augmentationCounts[aug] || 0) + 1
          })
        } else {
          cleanCount++
        }
        imagesFolder.file(result.filename, result.blob)
        labels.push(result.label)
      }
      currentState.currentIndex = numSamples
      currentState.cleanCount = cleanCount
      currentState.augmentedCount = augmentedCount
      currentState.totalChars = totalChars
    } catch (err) {
      console.error('Worker pool error, falling back to main thread:', err)
      // Fallback happens below
    }
  }
  // === MAIN THREAD FALLBACK or PRIMARY (if workers not available) ===
  // Only run if labels were not populated by workers
  if (labels.length === 0) {
    for (let batchStart = 0; batchStart < numSamples; batchStart += BATCH_SIZE) {
      // Check for abort
      if (abortSignal?.aborted) {
        console.log('Generation aborted')
        break
      }
      const batchEnd = Math.min(batchStart + BATCH_SIZE, numSamples)
      const batchPromises: Promise<SampleResult>[] = []
      // Create batch of render tasks
      for (let i = batchStart; i < batchEnd; i++) {
        const text = textData[i % textData.length]
        batchPromises.push(
          renderSample(i, text, config, loadedFonts, augValues, config.dataset.seed)
        )
      }
      // Wait for batch to complete
      try {
        const batchResults = await Promise.all(batchPromises)
        // Process results
        for (const result of batchResults) {
          // Track stats
          uniqueTexts.add(textData[result.index % textData.length])
          totalChars += textData[result.index % textData.length].length
          // Track font usage
          fontUsageCounts[result.fontName] = (fontUsageCounts[result.fontName] || 0) + 1
          // Track background usage
          backgroundCounts[result.backgroundStyle] = (backgroundCounts[result.backgroundStyle] || 0) + 1
          // Track augmentation
          if (result.isAugmented) {
            augmentedCount++
            result.augmentations.forEach(aug => {
              augmentationCounts[aug] = (augmentationCounts[aug] || 0) + 1
            })
          } else {
            cleanCount++
          }
          // Add to zip and labels
          imagesFolder.file(result.filename, result.blob)
          labels.push(result.label)
        }
        // Update state for pause/resume
        currentState.currentIndex = batchEnd
        currentState.cleanCount = cleanCount
        currentState.augmentedCount = augmentedCount
        currentState.totalChars = totalChars
      } catch (err) {
        console.error(`Batch error at ${batchStart}-${batchEnd}:`, err)
        // Continue with next batch instead of failing completely
      }
      // Report progress
      const progress = ((batchEnd) / numSamples) * 90 + 5 // Reserve 5% for init, 5% for compression
      if (batchEnd % (BATCH_SIZE * 5) === 0 || batchEnd >= numSamples) {
        onProgress(progress, `Generated ${batchEnd.toLocaleString()}/${numSamples.toLocaleString()} samples`)
      }
      // Memory cleanup hint for garbage collector (helps with large datasets)
      if (batchEnd % 1000 === 0) {
        // Allow event loop to process and GC to run
        await new Promise(resolve => setTimeout(resolve, 0))
      }
    }
  } // End of main thread fallback
  // Build label files with chunked processing
  console.log('Building label files...')
  onProgress(92, 'Building label files...')
  buildLabelFiles(labels, config.output.formats, zip)
  // Add metadata
  const metadata = {
    generated_at: new Date().toISOString(),
    config: {
      image_size: `${config.image.width}x${config.image.height}`,
      background: getBackgroundColor(config),
      background_style: config.image.backgroundStyle,
      background_mode: config.image.backgroundMode,
      text_color: config.image.textColor,
      direction: config.image.direction,
      augmentation_enabled: config.augmentation.enabled,
      augmentation_percentage: config.augmentation.applyPercentage,
      fonts_used: config.fonts.distribution?.map(f => f.name) || [],
      output_formats: config.output.formats,
    },
    samples: labels.length,
    clean_samples: cleanCount,
    augmented_samples: augmentedCount,
    font_usage: fontUsageCounts,
    background_usage: backgroundCounts,
  }
  zip.file('metadata.json', JSON.stringify(metadata, null, 2))
  // Generate zip blob with dynamic compression based on dataset size
  onProgress(95, 'Compressing dataset...')
  // Use lower/no compression for large datasets (faster, less memory)
  // STORE = no compression, uses less memory
  let compressionType: 'DEFLATE' | 'STORE' = 'DEFLATE'
  let compressionLevel = 6
  if (labels.length > 20000) {
    // Very large: no compression (STORE) to prevent memory issues
    compressionType = 'STORE'
    compressionLevel = 0
    console.log(`Large dataset (${labels.length}): Using STORE (no compression) to prevent memory issues`)
  } else if (labels.length > 10000) {
    compressionLevel = 1 // Minimal compression
    console.log(`Medium-large dataset (${labels.length}): Using minimal compression level 1`)
  } else if (labels.length > 5000) {
    compressionLevel = 3
    console.log(`Medium dataset (${labels.length}): Using compression level 3`)
  }
  console.log(`Compressing ${labels.length} files with ${compressionType}, level ${compressionLevel}`)
  let zipBlob: Blob
  try {
    zipBlob = await zip.generateAsync({
      type: 'blob',
      compression: compressionType,
      compressionOptions: compressionType === 'DEFLATE' ? { level: compressionLevel } : undefined,
      streamFiles: true // Stream files to reduce memory usage
    })
  } catch (memoryError: any) {
    console.error('ZIP generation failed, trying without compression:', memoryError)
    onProgress(96, 'Retrying with no compression...')
    // Retry with no compression at all
    try {
      zipBlob = await zip.generateAsync({
        type: 'blob',
        compression: 'STORE',
        streamFiles: true
      })
    } catch (finalError) {
      throw new Error(`Failed to create ZIP: Memory limit exceeded. Try using Local Folder storage mode instead of Memory for large datasets.`)
    }
  }
  const endTime = Date.now()
  const durationSeconds = (endTime - startTime) / 1000
  // Build font distribution stats
  const fontStats = Object.entries(fontUsageCounts).map(([family, count]) => ({
    family,
    count,
    percentage: Math.round((count / labels.length) * 100)
  }))
  // Build augmentation stats (percentages relative to augmented samples)
  const augStats = Object.entries(augmentationCounts).map(([name, count]) => ({
    name,
    count,
    percentage: augmentedCount > 0 ? Math.round((count / augmentedCount) * 100) : 0
  }))
  // Build background distribution stats
  const backgroundStats = Object.entries(backgroundCounts).map(([name, count]) => ({
    name,
    count,
    percentage: Math.round((count / labels.length) * 100)
  }))
  // Build result
  const result: GenerationResult = {
    stats: {
      total_samples: labels.length,
      duration_seconds: durationSeconds,
      samples_per_second: labels.length / durationSeconds,
      font_distribution: fontStats.length > 0
        ? fontStats
        : [{ family: 'Default (Arial)', count: labels.length, percentage: 100 }],
      clean_samples: cleanCount,
      augmented_samples: augmentedCount,
      augmentation_stats: augStats,
      avg_transforms_per_sample: augmentedCount > 0
        ? Object.values(augmentationCounts).reduce((a, b) => a + b, 0) / augmentedCount
        : 0,
      unique_tokens: uniqueTexts.size,
      avg_chars_per_sample: labels.length > 0 ? totalChars / labels.length : 0,
      unicode_valid: labels.length,
      script_pure: labels.length,
      rejected_samples: 0,
      background_distribution: backgroundStats.length > 0 ? backgroundStats : undefined
    },
    zipBlob
  }
  // Clear state
  currentState = null
  onProgress(100, 'Generation complete!')
  return result
}
/**
 * Build a downloadable ZIP from a paused/in-progress generation state.
 * Label files cover only the samples completed so far, and metadata.json is
 * marked status PARTIAL so the archive is distinguishable from a full run.
 */
export async function buildPartialZip(state: GenerationState, formats: string[]): Promise<Blob> {
  const zip = state.zip
  buildLabelFiles(state.labels, formats, zip)
  const partialMetadata = {
    status: 'PARTIAL',
    completed_samples: state.currentIndex,
    timestamp: new Date().toISOString(),
    font_usage: state.fontUsageCounts,
    background_usage: state.backgroundCounts,
    clean_samples: state.cleanCount,
    augmented_samples: state.augmentedCount,
  }
  zip.file('metadata.json', JSON.stringify(partialMetadata, null, 2))
  return zip.generateAsync({
    type: 'blob',
    compression: 'DEFLATE',
    compressionOptions: { level: 6 }
  })
}
/**
 * Trigger a browser download of the generated dataset archive.
 * @param zipBlob  archive produced by generateDataset() or buildPartialZip()
 * @param filename suggested name for the downloaded file
 */
export function downloadDataset(zipBlob: Blob, filename: string = 'ocr_dataset.zip') {
  saveAs(zipBlob, filename)
}
/**
 * Split raw text into OCR sample units according to the segmentation mode.
 * Modes: 'character' | 'word' | 'line' | 'sentence' | 'ngram' (all 2-4 word
 * windows); any unknown mode falls back to word segmentation. Sentence
 * splitting also recognizes Arabic/Urdu terminators (؟ ۔).
 */
export function parseTextFile(
  text: string,
  segmentation: string
): string[] {
  // Whitespace-delimited tokens, shared by 'word', 'ngram' and the default
  const words = () => text.split(/\s+/).filter((w) => w.trim().length > 0)
  switch (segmentation) {
    case 'character':
      return Array.from(text).filter((c) => c.trim().length > 0)
    case 'line':
      return text.split('\n').filter((l) => l.trim().length > 0)
    case 'sentence':
      return text
        .split(/[.!?؟۔]+/)
        .filter((s) => s.trim().length > 0)
        .map((s) => s.trim())
    case 'ngram': {
      const tokens = words()
      const ngrams: string[] = []
      for (let start = 0; start < tokens.length; start++) {
        for (let len = 2; len <= 4 && start + len <= tokens.length; len++) {
          ngrams.push(tokens.slice(start, start + len).join(' '))
        }
      }
      return ngrams
    }
    case 'word':
    default:
      return words()
  }
}