// OCR_DATASET_MAKER — web/lib/generator.ts
// Source revision: "Update OCR Dataset Generator" (commit 004fbbb)
import JSZip from 'jszip'
import { saveAs } from 'file-saver'
import { getWorkerPool, isWorkerRenderingAvailable, WorkerTask, WorkerResult } from './worker-pool'
import { isWebGLAvailable, applyGPUAugmentation, GPUAugmentOptions } from './gpu-augmentation'
import { StorageManager, StorageMode, getStorageManager, StoredSample } from './storage-manager'
// ============================================
// Canvas Pool for Performance
// ============================================

// A reusable canvas plus its 2D context, tracked by the pool below so that
// repeated renders of same-sized samples avoid allocating new canvases.
interface PooledCanvas {
  canvas: HTMLCanvasElement
  ctx: CanvasRenderingContext2D
  inUse: boolean // true while handed out by acquireCanvas()
  width: number // dimensions the canvas was created with
  height: number
}

// Shared pool; bounded so no more than MAX_POOL_SIZE canvases are kept alive.
const canvasPool: PooledCanvas[] = []
const MAX_POOL_SIZE = 8
/**
 * Hand out a canvas of the requested size, reusing pooled canvases when possible.
 *
 * Strategy: (1) reuse an idle pooled canvas with matching dimensions,
 * (2) grow the pool while under MAX_POOL_SIZE, (3) resize an idle pooled
 * canvas of a different size (setting width/height clears the canvas and
 * resets its context state), and only as a last resort (4) create an
 * unpooled throwaway canvas that is garbage-collected after use.
 */
function acquireCanvas(width: number, height: number): { canvas: HTMLCanvasElement; ctx: CanvasRenderingContext2D } {
  // (1) Find available canvas with matching dimensions
  for (const item of canvasPool) {
    if (!item.inUse && item.width === width && item.height === height) {
      item.inUse = true
      item.ctx.clearRect(0, 0, width, height)
      item.ctx.setTransform(1, 0, 0, 1, 0, 0) // Reset any leftover transform
      return { canvas: item.canvas, ctx: item.ctx }
    }
  }
  // (2) Create new canvas if pool not full
  if (canvasPool.length < MAX_POOL_SIZE) {
    const canvas = document.createElement('canvas')
    canvas.width = width
    canvas.height = height
    const ctx = canvas.getContext('2d', { willReadFrequently: true })!
    const pooled: PooledCanvas = { canvas, ctx, inUse: true, width, height }
    canvasPool.push(pooled)
    return { canvas, ctx }
  }
  // (3) Pool full: repurpose an idle canvas of a different size instead of
  // allocating garbage. (Fix: previously a throwaway canvas was created even
  // when idle pooled canvases existed, defeating pooling for mixed sizes.)
  for (const item of canvasPool) {
    if (!item.inUse) {
      item.inUse = true
      item.canvas.width = width // resizing implicitly clears the canvas
      item.canvas.height = height // and resets the 2D context state
      item.width = width
      item.height = height
      return { canvas: item.canvas, ctx: item.ctx }
    }
  }
  // (4) Every pooled canvas is busy: create a temporary canvas (will be GC'd)
  const canvas = document.createElement('canvas')
  canvas.width = width
  canvas.height = height
  const ctx = canvas.getContext('2d', { willReadFrequently: true })!
  return { canvas, ctx }
}
/**
 * Return a canvas to the pool so a later acquireCanvas() can reuse it.
 * Unpooled (temporary overflow) canvases are simply ignored.
 */
function releaseCanvas(canvas: HTMLCanvasElement): void {
  const pooled = canvasPool.find((item) => item.canvas === canvas)
  if (pooled) {
    pooled.inUse = false
  }
}
/** Drop every pooled canvas so the browser can reclaim their memory. */
export function clearCanvasPool(): void {
  canvasPool.splice(0, canvasPool.length)
}
// A font choice plus how often it should be used across the dataset.
export interface FontData {
  name: string // display name; key for loadedFonts and usage stats
  family: string // CSS font-family used when rendering
  percentage: number // selection weight in percent (see selectFont)
  dataUrl?: string // optional embedded font file (data: URL) loaded via FontFace
}
// Full configuration for one dataset generation run.
export interface GeneratorConfig {
  dataset: {
    size: number // requested sample count (clamped to available text)
    seed: number // base seed for all deterministic randomness
  }
  input: {
    segmentation: string // segmentation mode (see parseTextFile)
    textData?: string[] // optional pre-parsed samples
  }
  image: {
    width: number
    height: number
    background: string // fallback CSS background color
    backgroundStyle?: string // key into backgroundColors (single mode)
    backgroundMode?: 'single' | 'mix'
    backgroundPercentages?: Record<string, number> // styleId -> weight % (mix mode)
    customBackgrounds?: { name: string; dataUrl: string; percentage: number }[]
    textColor: string
    direction: string // 'ltr' | 'rtl'; controls text anchor and ctx.direction
  }
  fonts: {
    distribution: FontData[] // weighted font choices
  }
  augmentation: {
    enabled: boolean
    applyPercentage: number // % of samples that receive augmentation
    preset: string
    customMode?: boolean // when true, `values` overrides the built-in defaults
    values?: Record<string, number>
  }
  output: {
    // label formats: crnn/paddleocr/trocr/jsonl/csv/json/huggingface
    formats: string[]
  }
}
// Statistics and packaged output of a completed generation run.
export interface GenerationResult {
  stats: {
    total_samples: number
    duration_seconds: number
    samples_per_second: number
    font_distribution: { family: string; count: number; percentage: number }[]
    clean_samples: number // samples rendered without augmentation
    augmented_samples: number
    augmentation_stats: { name: string; count: number; percentage: number }[]
    avg_transforms_per_sample: number
    unique_tokens: number // distinct text strings rendered
    avg_chars_per_sample: number
    unicode_valid: number // currently always set to total_samples
    script_pure: number // currently always set to total_samples
    rejected_samples: number // currently always 0 (no rejection implemented)
    background_distribution?: { name: string; count: number; percentage: number }[]
  }
  zipBlob: Blob // the complete dataset archive
}
// Generation state for pause/resume functionality.
// The array/map fields are shared by reference with generateDataset's locals,
// so they stay live while generation proceeds; scalar counters are refreshed
// after each batch.
export interface GenerationState {
  currentIndex: number // next sample index to generate
  labels: string[] // tab-separated label lines accumulated so far
  fontUsageCounts: Record<string, number>
  augmentationCounts: Record<string, number>
  cleanCount: number
  augmentedCount: number
  totalChars: number
  uniqueTexts: Set<string>
  backgroundCounts: Record<string, number>
  startTime: number // epoch ms when generation began
  zip: JSZip // in-progress archive (images already added)
}

// Abort controller for cancellation.
// NOTE(review): abortController is never assigned or read anywhere in this
// file — cancellation flows through the abortSignal parameter of
// generateDataset instead. Candidate for removal.
let abortController: AbortController | null = null
let currentState: GenerationState | null = null

// Get current generation state for external access (null when idle).
export function getGenerationState(): GenerationState | null {
  return currentState
}
// Result from rendering a single sample (for parallel processing).
interface SampleResult {
  index: number
  filename: string // e.g. "image_000042.png"
  blob: Blob // encoded PNG
  label: string // tab-separated label line consumed by buildLabelFiles
  fontName: string
  augmentations: string[] // names of augmentations actually applied
  backgroundStyle: string
  isAugmented: boolean
}
// Simple seeded random number generator
function seededRandom(seed: number) {
let s = seed
return function () {
s = Math.sin(s) * 10000
return s - Math.floor(s)
}
}
/**
 * Build an independent deterministic RNG for one sample index so samples can
 * be rendered in parallel or out of order while staying reproducible.
 */
function seededRandomForIndex(baseSeed: number, index: number) {
  const derivedSeed = baseSeed + index * 1000
  return seededRandom(derivedSeed)
}
/**
 * Load a font for rendering. When a dataUrl is provided the font is
 * registered via the CSS Font Loading API; otherwise the declared system
 * family is used. Falls back to 'Arial' on load failure or missing family.
 */
async function loadFont(font: FontData): Promise<string> {
  if (!font.dataUrl) {
    return font.family || 'Arial'
  }
  try {
    const face = new FontFace(font.family, `url(${font.dataUrl})`)
    await face.load()
    document.fonts.add(face)
    return font.family
  } catch (err) {
    console.warn(`Failed to load font ${font.name}:`, err)
    return 'Arial'
  }
}
/**
 * Pick a font from the weighted distribution using a single random draw.
 * Weights are cumulative percentages; the final font absorbs any remainder.
 * Returns a default Arial entry when the distribution is empty.
 */
function selectFont(fonts: FontData[], random: () => number): FontData {
  if (!fonts?.length) {
    return { name: 'Default', family: 'Arial', percentage: 100 }
  }
  const roll = random() * 100
  let acc = 0
  const hit = fonts.find((font) => {
    acc += font.percentage
    return roll < acc
  })
  return hit ?? fonts[fonts.length - 1]
}
/**
 * Apply geometric augmentations (rotation, skew) to the canvas transform.
 * Each augmentation fires with 50% probability when its strength is non-zero.
 * The order and count of `random` draws is part of the reproducibility
 * contract: rotation consumes up to two draws, then skew up to two.
 * @returns the names of the augmentations actually applied.
 */
function applyAugmentation(
  ctx: CanvasRenderingContext2D,
  canvas: HTMLCanvasElement,
  augValues: Record<string, number>,
  random: () => number
): string[] {
  const applied: string[] = []
  const halfW = canvas.width / 2
  const halfH = canvas.height / 2
  // Rotation: up to ±augValues.rotation degrees about the canvas center
  if (augValues.rotation && random() > 0.5) {
    const degrees = (random() - 0.5) * 2 * augValues.rotation
    ctx.translate(halfW, halfH)
    ctx.rotate(degrees * Math.PI / 180)
    ctx.translate(-halfW, -halfH)
    applied.push('rotation')
  }
  // Skew: small vertical shear proportional to augValues.skew
  if (augValues.skew && random() > 0.5) {
    const shear = (random() - 0.5) * augValues.skew * 0.01
    ctx.transform(1, shear, 0, 1, 0, 0)
    applied.push('skew')
  }
  return applied
}
// Background style colors
const backgroundColors: Record<string, string> = {
clean_white: '#FFFFFF',
aged_paper: '#F5E6D3',
book_page: '#FAF0E6',
newspaper: '#E8E8E8',
notebook: '#FFFEF0',
parchment: '#F0E68C',
weathered: '#D4C4A8',
coffee_stain: '#E6D5C3',
old_book: '#E8DCC4',
recycled: '#D9D4C5',
cream: '#FFFDD0',
ivory: '#FFFFF0',
}
// Get background color based on style (supports mix mode with percentages)
function getBackgroundColor(config: GeneratorConfig, random?: () => number): string {
// Check if mix mode is enabled
if (config.image.backgroundMode === 'mix' && config.image.backgroundPercentages && random) {
const percentages = config.image.backgroundPercentages as Record<string, number>
const roll = random() * 100
let cumulative = 0
for (const [styleId, percentage] of Object.entries(percentages)) {
cumulative += percentage
if (roll < cumulative && backgroundColors[styleId]) {
return backgroundColors[styleId]
}
}
}
// Single mode or fallback
if (config.image.backgroundStyle && backgroundColors[config.image.backgroundStyle]) {
return backgroundColors[config.image.backgroundStyle]
}
return config.image.background || '#FFFFFF'
}
/** Decode a data URL into an HTMLImageElement; rejects on decode failure. */
async function loadImage(dataUrl: string): Promise<HTMLImageElement> {
  return new Promise((resolve, reject) => {
    const img = new Image()
    img.addEventListener('load', () => resolve(img), { once: true })
    img.addEventListener('error', reject, { once: true })
    img.src = dataUrl
  })
}
/**
 * Choose a background for one sample from the combined pool of named styles
 * and user-supplied custom images, weighted by their percentages.
 * Returns either a solid color or an image dataUrl, plus the style name used
 * for statistics. Falls back to the single configured style (or white) when
 * no weights are set or the weighted roll misses every entry.
 */
function selectBackground(config: GeneratorConfig, random: () => number): { type: 'color', value: string, styleName: string } | { type: 'image', value: string, styleName: string } {
  const customBgs = config.image.customBackgrounds?.filter((bg) => bg.percentage > 0) || []
  const stylePercentages = config.image.backgroundPercentages || {}
  const styleTotal = Object.values(stylePercentages).reduce((sum, p) => sum + p, 0)
  const customTotal = customBgs.reduce((sum, bg) => sum + bg.percentage, 0)
  const grandTotal = styleTotal + customTotal

  // Single-style (or default) fallback used when weighting is absent/missed
  const fallback = (): { type: 'color', value: string, styleName: string } => {
    const style = config.image.backgroundStyle || 'clean_white'
    return { type: 'color', value: backgroundColors[style] || config.image.background || '#FFFFFF', styleName: style }
  }
  if (grandTotal === 0) {
    return fallback()
  }

  // Weighted roll across styles first, then custom images
  const roll = random() * grandTotal
  let acc = 0
  for (const [styleId, pct] of Object.entries(stylePercentages)) {
    acc += pct
    if (roll < acc && backgroundColors[styleId]) {
      return { type: 'color', value: backgroundColors[styleId], styleName: styleId }
    }
  }
  for (const bg of customBgs) {
    acc += bg.percentage
    if (roll < acc) {
      return { type: 'image', value: bg.dataUrl, styleName: bg.name }
    }
  }
  return fallback()
}
/**
 * Render one text sample onto a pooled canvas and encode it as a PNG blob.
 *
 * Pipeline: background (solid color or custom image) → optional geometric
 * augmentations (rotation/skew) → text draw → optional pixel augmentations
 * (brightness/noise; single WebGL pass when available, CPU loops otherwise).
 *
 * The pooled canvas is returned to the pool inside the toBlob callback on
 * success, or in the catch block on failure. The order of `random` draws is
 * part of the reproducibility contract — do not reorder the calls below.
 *
 * @returns the encoded PNG, the names of augmentations actually applied,
 *          and the background style that was selected.
 */
async function renderTextToCanvas(
  text: string,
  config: GeneratorConfig,
  fontFamily: string,
  shouldAugment: boolean,
  augValues: Record<string, number>,
  random: () => number
): Promise<{ blob: Blob; augmentations: string[]; backgroundStyle: string }> {
  // Use canvas pool for better performance
  const { canvas, ctx } = acquireCanvas(config.image.width, config.image.height)
  // Declare variables outside try so the toBlob closure below can read them
  let appliedAugmentations: string[] = []
  let bg: ReturnType<typeof selectBackground>
  try {
    // Select and apply background
    bg = selectBackground(config, random)
    if (bg.type === 'color') {
      ctx.fillStyle = bg.value
      ctx.fillRect(0, 0, canvas.width, canvas.height)
    } else {
      // Draw custom background image
      try {
        const img = await loadImage(bg.value)
        ctx.drawImage(img, 0, 0, canvas.width, canvas.height)
      } catch {
        // Fallback to white if image fails
        ctx.fillStyle = '#FFFFFF'
        ctx.fillRect(0, 0, canvas.width, canvas.height)
      }
    }
    // Apply augmentation transforms if enabled (state saved so the transform
    // can be undone after the text is drawn)
    if (shouldAugment && config.augmentation.enabled) {
      ctx.save()
      appliedAugmentations = applyAugmentation(ctx, canvas, augValues, random)
    }
    // Set text properties with the selected font (capped at 48px / 60% height)
    const fontSize = Math.min(canvas.height * 0.6, 48)
    ctx.font = `${fontSize}px "${fontFamily}", Arial, sans-serif`
    ctx.fillStyle = config.image.textColor
    ctx.textAlign = config.image.direction === 'rtl' ? 'right' : 'left'
    ctx.textBaseline = 'middle'
    // Draw text anchored 10px from the leading edge of the writing direction
    const x = config.image.direction === 'rtl'
      ? canvas.width - 10
      : 10
    const y = canvas.height / 2
    ctx.direction = config.image.direction as CanvasDirection
    ctx.fillText(text, x, y)
    if (shouldAugment && config.augmentation.enabled) {
      ctx.restore()
    }
    // Apply post-processing augmentations (GPU-accelerated when available)
    if (shouldAugment && config.augmentation.enabled) {
      const applyBrightness = augValues.brightness && random() > 0.5
      const applyNoise = augValues.gaussian_noise && random() > 0.6
      if (applyBrightness || applyNoise) {
        // Try GPU-accelerated augmentation first
        const useGPU = isWebGLAvailable()
        if (useGPU) {
          // GPU path - apply all augmentations in a single GPU pass
          const gpuOptions: GPUAugmentOptions = {
            brightness: applyBrightness ? (random() - 0.5) * augValues.brightness / 50 : 0,
            contrast: 1, // Could add contrast augmentation here
            noiseAmount: applyNoise ? augValues.gaussian_noise / 200 : 0,
            seed: random() * 1000
          }
          const gpuResult = applyGPUAugmentation(canvas, gpuOptions)
          if (gpuResult && gpuResult !== canvas) {
            // Copy GPU result back to main canvas
            const gpuCtx = gpuResult.getContext('webgl')
            if (gpuCtx) {
              const pixels = new Uint8Array(canvas.width * canvas.height * 4)
              gpuCtx.readPixels(0, 0, canvas.width, canvas.height, gpuCtx.RGBA, gpuCtx.UNSIGNED_BYTE, pixels)
              // Flip Y while copying: WebGL's origin is bottom-left whereas
              // the 2D canvas origin is top-left
              const imageData = ctx.createImageData(canvas.width, canvas.height)
              const rowSize = canvas.width * 4
              for (let y = 0; y < canvas.height; y++) {
                const srcRow = (canvas.height - 1 - y) * rowSize
                const dstRow = y * rowSize
                imageData.data.set(pixels.subarray(srcRow, srcRow + rowSize), dstRow)
              }
              ctx.putImageData(imageData, 0, 0)
              if (applyBrightness) appliedAugmentations.push('brightness_gpu')
              if (applyNoise) appliedAugmentations.push('noise_gpu')
            }
          }
        } else {
          // CPU fallback path
          if (applyBrightness) {
            // Multiplicative brightness within ±(brightness/100) around 1.0
            const adjustment = 1 + (random() - 0.5) * augValues.brightness / 50
            const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height)
            for (let i = 0; i < imageData.data.length; i += 4) {
              imageData.data[i] = Math.min(255, imageData.data[i] * adjustment)
              imageData.data[i + 1] = Math.min(255, imageData.data[i + 1] * adjustment)
              imageData.data[i + 2] = Math.min(255, imageData.data[i + 2] * adjustment)
            }
            ctx.putImageData(imageData, 0, 0)
            appliedAugmentations.push('brightness')
          }
          if (applyNoise) {
            // Uniform (not strictly Gaussian) additive noise on RGB channels
            const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height)
            const noiseLevel = augValues.gaussian_noise / 2
            for (let i = 0; i < imageData.data.length; i += 4) {
              const noise = (random() - 0.5) * noiseLevel
              imageData.data[i] = Math.max(0, Math.min(255, imageData.data[i] + noise))
              imageData.data[i + 1] = Math.max(0, Math.min(255, imageData.data[i + 1] + noise))
              imageData.data[i + 2] = Math.max(0, Math.min(255, imageData.data[i + 2] + noise))
            }
            ctx.putImageData(imageData, 0, 0)
            appliedAugmentations.push('noise')
          }
        }
      }
    }
    // Convert to blob and release canvas when done
    return new Promise((resolve, reject) => {
      canvas.toBlob(
        (blob) => {
          releaseCanvas(canvas) // Return canvas to pool
          if (blob) {
            resolve({ blob, augmentations: appliedAugmentations, backgroundStyle: bg.styleName })
          } else {
            reject(new Error('Failed to convert canvas to blob'))
          }
        },
        'image/png'
      )
    })
  } catch (err) {
    releaseCanvas(canvas) // Ensure canvas is released on error
    throw err
  }
}
/**
 * Render a single dataset sample (for parallel processing).
 * Uses an index-derived RNG so samples are reproducible regardless of the
 * order in which they complete. The returned label line is
 * "<filename>\t<text>", the format buildLabelFiles expects.
 */
async function renderSample(
  index: number,
  text: string,
  config: GeneratorConfig,
  loadedFonts: Map<string, string>,
  augValues: Record<string, number>,
  baseSeed: number
): Promise<SampleResult> {
  // Use index-specific random generator for reproducibility
  const random = seededRandomForIndex(baseSeed, index)
  // Select font
  const selectedFont = selectFont(config.fonts.distribution, random)
  const fontFamily = loadedFonts.get(selectedFont.name) || selectedFont.family || 'Arial'
  // Determine augmentation
  const shouldAugment = config.augmentation.enabled &&
    (random() * 100) < config.augmentation.applyPercentage
  // Render
  const { blob, augmentations, backgroundStyle } = await renderTextToCanvas(
    text,
    config,
    fontFamily,
    shouldAugment,
    augValues,
    random
  )
  const filename = `image_${String(index).padStart(6, '0')}.png`
  return {
    index,
    filename,
    blob,
    // FIX: label must embed the actual filename — it previously emitted the
    // literal placeholder "$(unknown)", corrupting every label file format.
    label: `${filename}\t${text}`,
    fontName: selectedFont.name,
    augmentations,
    backgroundStyle,
    isAugmented: shouldAugment
  }
}
/**
 * Sanity-check the configuration against the supplied text data.
 * Returns { valid: false, error } for fatal problems; may include an
 * adjustedSize when the requested dataset exceeds the available samples.
 * Mismatched background percentages only produce a console warning.
 */
function validateInputs(config: GeneratorConfig, textData: string[]): { valid: boolean; error?: string; adjustedSize?: number } {
  const available = textData?.length ?? 0
  if (available === 0) {
    return { valid: false, error: 'No text data provided. Please upload a text file.' }
  }
  if (config.dataset.size <= 0) {
    return { valid: false, error: 'Dataset size must be greater than 0.' }
  }
  // In mix mode, warn (non-fatal) when percentages don't sum to ~100%
  if (config.image.backgroundMode === 'mix' && config.image.backgroundPercentages) {
    const styleTotal = Object.values(config.image.backgroundPercentages).reduce((a, b) => a + b, 0)
    const customTotal = config.image.customBackgrounds?.reduce((a, b) => a + b.percentage, 0) || 0
    const combined = styleTotal + customTotal
    if (Math.abs(combined - 100) > 1 && combined > 0) {
      console.warn(`Background percentages total ${combined}%, expected 100%`)
    }
  }
  // Clamp the dataset size to the number of available samples
  if (config.dataset.size > available) {
    console.warn(`Dataset size (${config.dataset.size}) exceeds available samples (${available}). Adjusting to ${available}.`)
    return { valid: true, adjustedSize: available }
  }
  return { valid: true }
}
/**
 * Write label files for the requested output formats into the ZIP.
 * Each entry in `labels` is "<filename>\t<text>"; the text may itself contain
 * tabs, so everything after the first tab is treated as label text.
 * Large datasets are joined in chunks to avoid one huge intermediate string.
 *
 * FIX: the jsonl/csv/json/huggingface writers previously emitted the literal
 * placeholder path "images/$(unknown)" and ignored the parsed filename.
 */
function buildLabelFiles(labels: string[], formats: string[], zip: JSZip) {
  console.log(`Building label files for ${labels.length} samples, formats: ${formats.join(', ')}`)
  const CHUNK_SIZE = 10000
  // Split "<filename>\t<text>" into its two parts (text may contain tabs)
  const splitLabel = (label: string): { filename: string; text: string } => {
    const parts = label.split('\t')
    return { filename: parts[0], text: parts.slice(1).join('\t') }
  }
  // CRNN/PaddleOCR format: labels.txt (raw tab-separated lines)
  if (formats.includes('crnn') || formats.includes('paddleocr')) {
    // Process in chunks to avoid string length limits
    const chunks: string[] = []
    for (let i = 0; i < labels.length; i += CHUNK_SIZE) {
      chunks.push(labels.slice(i, i + CHUNK_SIZE).join('\n'))
    }
    zip.file('labels.txt', chunks.join('\n'))
    console.log(`Created labels.txt with ${labels.length} entries`)
  }
  // TrOCR/JSONL format: data.jsonl (one JSON object per line)
  if (formats.includes('trocr') || formats.includes('jsonl')) {
    const chunks: string[] = []
    for (let i = 0; i < labels.length; i += CHUNK_SIZE) {
      const chunk = labels.slice(i, i + CHUNK_SIZE).map((label) => {
        const { filename, text } = splitLabel(label)
        return JSON.stringify({ image: `images/${filename}`, text: text })
      })
      chunks.push(chunk.join('\n'))
    }
    zip.file('data.jsonl', chunks.join('\n'))
    console.log(`Created data.jsonl with ${labels.length} entries`)
  }
  // CSV format: data.csv (quotes in text escaped by doubling)
  if (formats.includes('csv')) {
    const chunks: string[] = ['image,text']
    for (let i = 0; i < labels.length; i += CHUNK_SIZE) {
      const chunk = labels.slice(i, i + CHUNK_SIZE).map(label => {
        const { filename, text } = splitLabel(label)
        return `"images/${filename}","${text.replace(/"/g, '""')}"`
      })
      chunks.push(chunk.join('\n'))
    }
    zip.file('data.csv', chunks.join('\n'))
    console.log(`Created data.csv with ${labels.length} entries`)
  }
  // JSON format: data.json (pretty-printed array)
  if (formats.includes('json')) {
    const jsonData = labels.map((label) => {
      const { filename, text } = splitLabel(label)
      return { image: `images/${filename}`, text: text }
    })
    zip.file('data.json', JSON.stringify(jsonData, null, 2))
    console.log(`Created data.json with ${labels.length} entries`)
  }
  // HuggingFace format: metadata.csv
  if (formats.includes('huggingface')) {
    const chunks: string[] = ['file_name,text']
    for (let i = 0; i < labels.length; i += CHUNK_SIZE) {
      const chunk = labels.slice(i, i + CHUNK_SIZE).map((label) => {
        const { filename, text } = splitLabel(label)
        return `"images/${filename}","${text.replace(/"/g, '""')}"`
      })
      chunks.push(chunk.join('\n'))
    }
    zip.file('metadata.csv', chunks.join('\n'))
    console.log(`Created metadata.csv for HuggingFace with ${labels.length} entries`)
  }
}
/**
 * Generate the full OCR dataset and package it as a ZIP.
 *
 * Flow: validate → load fonts → render every sample (Web Workers with
 * OffscreenCanvas when available, otherwise batched on the main thread) →
 * build label files → write metadata.json → compress (level chosen by size).
 *
 * @param config      full generator configuration (dataset.size may be clamped)
 * @param textData    pre-segmented text samples (see parseTextFile)
 * @param onProgress  callback receiving (percent 0-100, status message)
 * @param abortSignal optional signal, checked between main-thread batches only
 * @throws Error when validation fails or the ZIP cannot be created
 */
export async function generateDataset(
  config: GeneratorConfig,
  textData: string[],
  onProgress: (progress: number, message: string) => void,
  abortSignal?: AbortSignal
): Promise<GenerationResult> {
  const startTime = Date.now()
  // Validate inputs
  const validation = validateInputs(config, textData)
  if (!validation.valid) {
    throw new Error(validation.error)
  }
  if (validation.adjustedSize) {
    config.dataset.size = validation.adjustedSize
  }
  const zip = new JSZip()
  const imagesFolder = zip.folder('images')!
  // Get augmentation values (custom values when provided, else these defaults)
  const augValues = config.augmentation.customMode && config.augmentation.values
    ? config.augmentation.values
    : {
        rotation: 5,
        skew: 3,
        gaussian_blur: 1,
        motion_blur: 3,
        gaussian_noise: 10,
        brightness: 15,
        contrast: 20,
        jpeg_quality: 70,
      }
  // Determine number of samples
  const numSamples = Math.min(config.dataset.size, textData.length)
  // Determine batch size based on available resources and dataset size
  // Smaller batches for very large datasets to manage memory
  const cpuCores = typeof navigator !== 'undefined' ? (navigator.hardwareConcurrency || 4) : 4
  let BATCH_SIZE = Math.min(cpuCores, 8)
  // For very large datasets, use smaller batches to manage memory
  if (numSamples > 50000) {
    BATCH_SIZE = Math.min(BATCH_SIZE, 4)
    console.log(`Large dataset detected (${numSamples}), reducing batch size for memory management`)
  }
  console.log(`Starting generation: ${numSamples} samples, batch size: ${BATCH_SIZE}`)
  // Load all fonts before starting
  onProgress(0, 'Loading fonts...')
  const loadedFonts: Map<string, string> = new Map()
  if (config.fonts.distribution && config.fonts.distribution.length > 0) {
    for (const font of config.fonts.distribution) {
      try {
        const loadedFamily = await loadFont(font)
        loadedFonts.set(font.name, loadedFamily)
        onProgress(0, `Loaded font: ${font.name}`)
      } catch (err) {
        console.warn(`Failed to load font ${font.name}, using fallback`)
        loadedFonts.set(font.name, 'Arial')
      }
    }
  }
  // Initialize tracking
  let cleanCount = 0
  let augmentedCount = 0
  const augmentationCounts: Record<string, number> = {}
  const fontUsageCounts: Record<string, number> = {}
  const backgroundCounts: Record<string, number> = {}
  const labels: string[] = []
  let totalChars = 0
  const uniqueTexts = new Set<string>()
  // Store current state for pause/resume.
  // Arrays/maps are shared by reference so currentState stays live; the
  // scalar counters are refreshed explicitly after each batch below.
  currentState = {
    currentIndex: 0,
    labels,
    fontUsageCounts,
    augmentationCounts,
    cleanCount,
    augmentedCount,
    totalChars,
    uniqueTexts,
    backgroundCounts,
    startTime,
    zip
  }
  // Check if Web Workers with OffscreenCanvas are available
  const useWorkers = isWorkerRenderingAvailable()
  const workerPool = useWorkers ? getWorkerPool() : null
  const renderMode = useWorkers ? `Web Workers (${workerPool?.getWorkerCount()} threads)` : `Main Thread (${BATCH_SIZE} parallel)`
  console.log(`Starting generation: ${numSamples} samples, mode: ${renderMode}`)
  // Generate samples
  onProgress(1, `Starting generation (${renderMode})...`)
  if (useWorkers && workerPool) {
    // === WEB WORKER RENDERING ===
    // Prepare all tasks upfront for worker distribution
    const tasks: WorkerTask[] = []
    const random = seededRandom(config.dataset.seed)
    for (let i = 0; i < numSamples; i++) {
      const text = textData[i % textData.length]
      // Select font and background for this sample.
      // NOTE(review): these lambdas re-seed on every invocation, so repeated
      // calls would return the same value — acceptable here because
      // selectFont/selectBackground each draw only once.
      const selectedFont = selectFont(config.fonts.distribution, () => seededRandom(config.dataset.seed + i * 1000)())
      const fontFamily = loadedFonts.get(selectedFont.name) || selectedFont.family || 'Arial'
      const bg = selectBackground(config, () => seededRandom(config.dataset.seed + i * 2000)())
      const shouldAugment = config.augmentation.enabled &&
        (seededRandom(config.dataset.seed + i * 3000)() * 100) < config.augmentation.applyPercentage
      tasks.push({
        id: i,
        index: i,
        text,
        config: {
          width: config.image.width,
          height: config.image.height,
          textColor: config.image.textColor,
          direction: config.image.direction,
          backgroundStyle: bg.styleName,
          backgroundColor: bg.type === 'color' ? bg.value : '#FFFFFF'
        },
        fontFamily,
        shouldAugment,
        augValues,
        seed: config.dataset.seed
      })
      // Track font usage
      fontUsageCounts[selectedFont.name] = (fontUsageCounts[selectedFont.name] || 0) + 1
    }
    onProgress(5, `Prepared ${numSamples} tasks, dispatching to ${workerPool.getWorkerCount()} workers...`)
    try {
      // Process all tasks with workers
      let processedCount = 0
      const results = await workerPool.processTasks(tasks, (result: WorkerResult) => {
        processedCount++
        // Track progress (throttled to every 100 completions)
        if (processedCount % 100 === 0 || processedCount >= numSamples) {
          const progress = (processedCount / numSamples) * 85 + 5
          onProgress(progress, `Generated ${processedCount}/${numSamples} samples (Workers: ${workerPool.getWorkerCount()} threads)`)
        }
      })
      // Process results (failed samples are skipped, not retried)
      for (const result of results) {
        if (result.error) {
          console.warn(`Sample ${result.index} failed:`, result.error)
          continue
        }
        uniqueTexts.add(textData[result.index % textData.length])
        totalChars += textData[result.index % textData.length].length
        backgroundCounts[result.backgroundStyle] = (backgroundCounts[result.backgroundStyle] || 0) + 1
        if (result.isAugmented) {
          augmentedCount++
          result.augmentations.forEach(aug => {
            augmentationCounts[aug] = (augmentationCounts[aug] || 0) + 1
          })
        } else {
          cleanCount++
        }
        imagesFolder.file(result.filename, result.blob)
        labels.push(result.label)
      }
      currentState.currentIndex = numSamples
      currentState.cleanCount = cleanCount
      currentState.augmentedCount = augmentedCount
      currentState.totalChars = totalChars
    } catch (err) {
      console.error('Worker pool error, falling back to main thread:', err)
      // Fallback happens below
    }
  }
  // === MAIN THREAD FALLBACK or PRIMARY (if workers not available) ===
  // Only run if labels were not populated by workers
  if (labels.length === 0) {
    for (let batchStart = 0; batchStart < numSamples; batchStart += BATCH_SIZE) {
      // Check for abort
      if (abortSignal?.aborted) {
        console.log('Generation aborted')
        break
      }
      const batchEnd = Math.min(batchStart + BATCH_SIZE, numSamples)
      const batchPromises: Promise<SampleResult>[] = []
      // Create batch of render tasks
      for (let i = batchStart; i < batchEnd; i++) {
        const text = textData[i % textData.length]
        batchPromises.push(
          renderSample(i, text, config, loadedFonts, augValues, config.dataset.seed)
        )
      }
      // Wait for batch to complete
      try {
        const batchResults = await Promise.all(batchPromises)
        // Process results
        for (const result of batchResults) {
          // Track stats
          uniqueTexts.add(textData[result.index % textData.length])
          totalChars += textData[result.index % textData.length].length
          // Track font usage
          fontUsageCounts[result.fontName] = (fontUsageCounts[result.fontName] || 0) + 1
          // Track background usage
          backgroundCounts[result.backgroundStyle] = (backgroundCounts[result.backgroundStyle] || 0) + 1
          // Track augmentation
          if (result.isAugmented) {
            augmentedCount++
            result.augmentations.forEach(aug => {
              augmentationCounts[aug] = (augmentationCounts[aug] || 0) + 1
            })
          } else {
            cleanCount++
          }
          // Add to zip and labels
          imagesFolder.file(result.filename, result.blob)
          labels.push(result.label)
        }
        // Update state for pause/resume
        currentState.currentIndex = batchEnd
        currentState.cleanCount = cleanCount
        currentState.augmentedCount = augmentedCount
        currentState.totalChars = totalChars
      } catch (err) {
        console.error(`Batch error at ${batchStart}-${batchEnd}:`, err)
        // Continue with next batch instead of failing completely
      }
      // Report progress
      const progress = ((batchEnd) / numSamples) * 90 + 5 // Reserve 5% for init, 5% for compression
      if (batchEnd % (BATCH_SIZE * 5) === 0 || batchEnd >= numSamples) {
        onProgress(progress, `Generated ${batchEnd.toLocaleString()}/${numSamples.toLocaleString()} samples`)
      }
      // Memory cleanup hint for garbage collector (helps with large datasets)
      if (batchEnd % 1000 === 0) {
        // Allow event loop to process and GC to run
        await new Promise(resolve => setTimeout(resolve, 0))
      }
    }
  } // End of main thread fallback
  // Build label files with chunked processing
  console.log('Building label files...')
  onProgress(92, 'Building label files...')
  buildLabelFiles(labels, config.output.formats, zip)
  // Add metadata
  const metadata = {
    generated_at: new Date().toISOString(),
    config: {
      image_size: `${config.image.width}x${config.image.height}`,
      background: getBackgroundColor(config),
      background_style: config.image.backgroundStyle,
      background_mode: config.image.backgroundMode,
      text_color: config.image.textColor,
      direction: config.image.direction,
      augmentation_enabled: config.augmentation.enabled,
      augmentation_percentage: config.augmentation.applyPercentage,
      fonts_used: config.fonts.distribution?.map(f => f.name) || [],
      output_formats: config.output.formats,
    },
    samples: labels.length,
    clean_samples: cleanCount,
    augmented_samples: augmentedCount,
    font_usage: fontUsageCounts,
    background_usage: backgroundCounts,
  }
  zip.file('metadata.json', JSON.stringify(metadata, null, 2))
  // Generate zip blob with dynamic compression based on dataset size
  onProgress(95, 'Compressing dataset...')
  // Use lower/no compression for large datasets (faster, less memory)
  // STORE = no compression, uses less memory
  let compressionType: 'DEFLATE' | 'STORE' = 'DEFLATE'
  let compressionLevel = 6
  if (labels.length > 20000) {
    // Very large: no compression (STORE) to prevent memory issues
    compressionType = 'STORE'
    compressionLevel = 0
    console.log(`Large dataset (${labels.length}): Using STORE (no compression) to prevent memory issues`)
  } else if (labels.length > 10000) {
    compressionLevel = 1 // Minimal compression
    console.log(`Medium-large dataset (${labels.length}): Using minimal compression level 1`)
  } else if (labels.length > 5000) {
    compressionLevel = 3
    console.log(`Medium dataset (${labels.length}): Using compression level 3`)
  }
  console.log(`Compressing ${labels.length} files with ${compressionType}, level ${compressionLevel}`)
  let zipBlob: Blob
  try {
    zipBlob = await zip.generateAsync({
      type: 'blob',
      compression: compressionType,
      compressionOptions: compressionType === 'DEFLATE' ? { level: compressionLevel } : undefined,
      streamFiles: true // Stream files to reduce memory usage
    })
  } catch (memoryError: any) {
    console.error('ZIP generation failed, trying without compression:', memoryError)
    onProgress(96, 'Retrying with no compression...')
    // Retry with no compression at all
    try {
      zipBlob = await zip.generateAsync({
        type: 'blob',
        compression: 'STORE',
        streamFiles: true
      })
    } catch (finalError) {
      throw new Error(`Failed to create ZIP: Memory limit exceeded. Try using Local Folder storage mode instead of Memory for large datasets.`)
    }
  }
  const endTime = Date.now()
  const durationSeconds = (endTime - startTime) / 1000
  // Build font distribution stats
  const fontStats = Object.entries(fontUsageCounts).map(([family, count]) => ({
    family,
    count,
    percentage: Math.round((count / labels.length) * 100)
  }))
  // Build augmentation stats (percentages relative to augmented samples)
  const augStats = Object.entries(augmentationCounts).map(([name, count]) => ({
    name,
    count,
    percentage: augmentedCount > 0 ? Math.round((count / augmentedCount) * 100) : 0
  }))
  // Build background distribution stats
  const backgroundStats = Object.entries(backgroundCounts).map(([name, count]) => ({
    name,
    count,
    percentage: Math.round((count / labels.length) * 100)
  }))
  // Build result
  const result: GenerationResult = {
    stats: {
      total_samples: labels.length,
      duration_seconds: durationSeconds,
      samples_per_second: labels.length / durationSeconds,
      font_distribution: fontStats.length > 0
        ? fontStats
        : [{ family: 'Default (Arial)', count: labels.length, percentage: 100 }],
      clean_samples: cleanCount,
      augmented_samples: augmentedCount,
      augmentation_stats: augStats,
      avg_transforms_per_sample: augmentedCount > 0
        ? Object.values(augmentationCounts).reduce((a, b) => a + b, 0) / augmentedCount
        : 0,
      unique_tokens: uniqueTexts.size,
      avg_chars_per_sample: labels.length > 0 ? totalChars / labels.length : 0,
      unicode_valid: labels.length,
      script_pure: labels.length,
      rejected_samples: 0,
      background_distribution: backgroundStats.length > 0 ? backgroundStats : undefined
    },
    zipBlob
  }
  // Clear state
  currentState = null
  onProgress(100, 'Generation complete!')
  return result
}
/**
 * Build a downloadable ZIP from a paused/in-progress generation state.
 * Label files cover only the samples completed so far, and metadata.json is
 * marked status PARTIAL so the archive is distinguishable from a full run.
 */
export async function buildPartialZip(state: GenerationState, formats: string[]): Promise<Blob> {
  const zip = state.zip
  buildLabelFiles(state.labels, formats, zip)
  const partialMetadata = {
    status: 'PARTIAL',
    completed_samples: state.currentIndex,
    timestamp: new Date().toISOString(),
    font_usage: state.fontUsageCounts,
    background_usage: state.backgroundCounts,
    clean_samples: state.cleanCount,
    augmented_samples: state.augmentedCount,
  }
  zip.file('metadata.json', JSON.stringify(partialMetadata, null, 2))
  return zip.generateAsync({
    type: 'blob',
    compression: 'DEFLATE',
    compressionOptions: { level: 6 }
  })
}
/**
 * Trigger a browser download of the generated dataset archive.
 * @param zipBlob  archive produced by generateDataset() or buildPartialZip()
 * @param filename suggested name for the downloaded file
 */
export function downloadDataset(zipBlob: Blob, filename: string = 'ocr_dataset.zip') {
  saveAs(zipBlob, filename)
}
/**
 * Split raw text into OCR sample units according to the segmentation mode.
 * Modes: 'character' | 'word' | 'line' | 'sentence' | 'ngram' (all 2-4 word
 * windows); any unknown mode falls back to word segmentation. Sentence
 * splitting also recognizes Arabic/Urdu terminators (؟ ۔).
 */
export function parseTextFile(
  text: string,
  segmentation: string
): string[] {
  // Whitespace-delimited tokens, shared by 'word', 'ngram' and the default
  const words = () => text.split(/\s+/).filter((w) => w.trim().length > 0)
  switch (segmentation) {
    case 'character':
      return Array.from(text).filter((c) => c.trim().length > 0)
    case 'line':
      return text.split('\n').filter((l) => l.trim().length > 0)
    case 'sentence':
      return text
        .split(/[.!?؟۔]+/)
        .filter((s) => s.trim().length > 0)
        .map((s) => s.trim())
    case 'ngram': {
      const tokens = words()
      const ngrams: string[] = []
      for (let start = 0; start < tokens.length; start++) {
        for (let len = 2; len <= 4 && start + len <= tokens.length; len++) {
          ngrams.push(tokens.slice(start, start + len).join(' '))
        }
      }
      return ngrams
    }
    case 'word':
    default:
      return words()
  }
}