Spaces:
Running
Running
| import JSZip from 'jszip' | |
| import { saveAs } from 'file-saver' | |
| import { getWorkerPool, isWorkerRenderingAvailable, WorkerTask, WorkerResult } from './worker-pool' | |
| import { isWebGLAvailable, applyGPUAugmentation, GPUAugmentOptions } from './gpu-augmentation' | |
| import { StorageManager, StorageMode, getStorageManager, StoredSample } from './storage-manager' | |
| // ============================================ | |
| // Canvas Pool for Performance | |
| // ============================================ | |
/**
 * One slot of the module-level canvas pool: a canvas, its 2D context and the
 * bookkeeping used to decide whether the slot can be handed out again.
 */
interface PooledCanvas {
  /** The pooled canvas element. */
  canvas: HTMLCanvasElement
  /** 2D context obtained from `canvas` when the slot was created. */
  ctx: CanvasRenderingContext2D
  /** True while a caller holds the slot; releaseCanvas sets it back to false. */
  inUse: boolean
  /** Width the canvas was created with; acquireCanvas matches requests against it. */
  width: number
  /** Height the canvas was created with; acquireCanvas matches requests against it. */
  height: number
}
// Shared list of reusable canvases; acquireCanvas appends slots to it.
const canvasPool: Array<PooledCanvas> = []
// Upper bound on the number of canvases kept in the pool.
const MAX_POOL_SIZE = 8
/**
 * Hand out a canvas (and its 2D context) of exactly `width` x `height`.
 *
 * Lookup order:
 *  1. an idle pooled canvas with matching dimensions, reset before reuse;
 *  2. a new canvas that is added to the pool while the pool holds fewer than
 *     MAX_POOL_SIZE entries;
 *  3. a throwaway canvas that is never pooled (releaseCanvas ignores it).
 *
 * @param width - Required canvas width in pixels.
 * @param height - Required canvas height in pixels.
 * @returns The canvas and its 2D context, marked as in use when pooled.
 * @throws Error if no 2D rendering context can be obtained for a new canvas.
 */
function acquireCanvas(width: number, height: number): { canvas: HTMLCanvasElement; ctx: CanvasRenderingContext2D } {
  const idle = canvasPool.find(item => !item.inUse && item.width === width && item.height === height)
  if (idle) {
    idle.inUse = true
    // Reset the transform BEFORE clearing: clearRect is subject to the current
    // transformation matrix, so clearing first could leave stale pixels behind
    // if a previous user left the context rotated or skewed.
    idle.ctx.setTransform(1, 0, 0, 1, 0, 0)
    idle.ctx.clearRect(0, 0, width, height)
    return { canvas: idle.canvas, ctx: idle.ctx }
  }

  const canvas = document.createElement('canvas')
  canvas.width = width
  canvas.height = height
  const ctx = canvas.getContext('2d', { willReadFrequently: true })
  if (!ctx) {
    // Fail here with a clear message instead of pooling a slot with a null context.
    throw new Error('Failed to acquire 2D canvas context')
  }

  // Pool the canvas while there is room; otherwise it is a temporary canvas
  // that is simply garbage-collected once the caller drops it.
  if (canvasPool.length < MAX_POOL_SIZE) {
    canvasPool.push({ canvas, ctx, inUse: true, width, height })
  }
  return { canvas, ctx }
}
/**
 * Mark a pooled canvas as idle so acquireCanvas can hand it out again.
 * Canvases that are not in the pool (temporary overflow canvases) are ignored.
 *
 * @param canvas - The canvas previously returned by acquireCanvas.
 */
function releaseCanvas(canvas: HTMLCanvasElement): void {
  const slot = canvasPool.find(entry => entry.canvas === canvas)
  if (slot) {
    slot.inUse = false
  }
}
/**
 * Empty the canvas pool in place. Canvases that are still held by a caller
 * are no longer tracked, so a later releaseCanvas for them is a no-op.
 */
export function clearCanvasPool(): void {
  canvasPool.splice(0, canvasPool.length)
}
/** One entry of the font distribution used when rendering samples. */
export interface FontData {
  /** Display name; used as the key for loaded fonts and for usage statistics. */
  name: string
  /** CSS font-family name; registered through FontFace when `dataUrl` is set. */
  family: string
  /** Selection weight; selectFont compares a running total of these against a random roll. */
  percentage: number
  /** Optional font source handed to FontFace as `url(...)`; when absent the family is used as-is. */
  dataUrl?: string
}
/** Full configuration for one dataset generation run. */
export interface GeneratorConfig {
  dataset: {
    /** Requested number of samples; validated to be positive and capped at the number of text lines. */
    size: number
    /** Base seed for the seeded random generators. */
    seed: number
  }
  input: {
    /** NOTE(review): not read in the visible part of this file. */
    segmentation: string
    /** NOTE(review): not read in the visible part of this file; text is passed to generateDataset separately. */
    textData?: Array<string>
  }
  image: {
    /** Canvas width in pixels. */
    width: number
    /** Canvas height in pixels. */
    height: number
    /** Fallback CSS colour used when no known background style applies. */
    background: string
    /** Key into the `backgroundColors` table for single-style rendering and fallbacks. */
    backgroundStyle?: string
    /** 'mix' enables the percentage checks in validateInputs and getBackgroundColor. */
    backgroundMode?: 'single' | 'mix'
    /** Selection weight per background style id. */
    backgroundPercentages?: Record<string, number>
    /** User-supplied background images; entries with a percentage of 0 or less are ignored by selectBackground. */
    customBackgrounds?: Array<{ name: string; dataUrl: string; percentage: number }>
    /** Fill colour used for the rendered text. */
    textColor: string
    /** Text direction; 'rtl' right-aligns the text, and the value is assigned to the canvas context's `direction`. */
    direction: string
  }
  fonts: {
    /** Fonts to choose from, with their selection weights. */
    distribution: Array<FontData>
  }
  augmentation: {
    /** Master switch; nothing is augmented when false. */
    enabled: boolean
    /** Share of samples to augment, compared against `random() * 100`. */
    applyPercentage: number
    /** NOTE(review): not read in the visible part of this file. */
    preset: string
    /** When true and `values` is set, `values` replaces the built-in augmentation strengths. */
    customMode?: boolean
    /** Augmentation strengths keyed by name (e.g. rotation, skew, brightness, gaussian_noise). */
    values?: Record<string, number>
  }
  output: {
    /** Label formats to emit; buildLabelFiles recognises crnn, paddleocr, trocr, jsonl, csv, json and huggingface. */
    formats: Array<string>
  }
}
/** Outcome of a generation run: aggregate statistics plus the zipped dataset. */
export interface GenerationResult {
  /** Summary figures for the run (counts, timings and per-category distributions). */
  stats: {
    total_samples: number
    duration_seconds: number
    samples_per_second: number
    font_distribution: Array<{ family: string; count: number; percentage: number }>
    clean_samples: number
    augmented_samples: number
    augmentation_stats: Array<{ name: string; count: number; percentage: number }>
    avg_transforms_per_sample: number
    unique_tokens: number
    avg_chars_per_sample: number
    unicode_valid: number
    script_pure: number
    rejected_samples: number
    background_distribution?: Array<{ name: string; count: number; percentage: number }>
  }
  /** The generated archive as a Blob. */
  zipBlob: Blob
}
/**
 * Bookkeeping for an in-progress generation run, kept so progress can be
 * inspected (pause/resume support) through getGenerationState.
 */
export interface GenerationState {
  /** Index one past the last sample processed so far. */
  currentIndex: number
  /** Label lines (`filename<TAB>text`) collected so far. */
  labels: Array<string>
  /** Number of samples rendered per font name. */
  fontUsageCounts: Record<string, number>
  /** Number of times each augmentation was applied. */
  augmentationCounts: Record<string, number>
  /** Samples rendered without augmentation. */
  cleanCount: number
  /** Samples rendered with augmentation. */
  augmentedCount: number
  /** Total characters across processed texts. */
  totalChars: number
  /** Distinct texts seen so far. */
  uniqueTexts: Set<string>
  /** Number of samples rendered per background style name. */
  backgroundCounts: Record<string, number>
  /** Start time of the run, in milliseconds as returned by Date.now(). */
  startTime: number
  /** Archive being filled with images and label files. */
  zip: JSZip
}
// Controller intended for cancelling a run; nothing in the visible part of
// this file assigns or reads it.
let abortController: AbortController | null = null
// Bookkeeping of the most recent generation run; null until a run starts.
let currentState: GenerationState | null = null
/**
 * Expose the bookkeeping of the current (or most recent) generation run.
 *
 * @returns The live state object (not a copy), or null if no run has started.
 */
export function getGenerationState(): GenerationState | null {
  const snapshot = currentState
  return snapshot
}
/** Everything produced by rendering a single sample. */
interface SampleResult {
  /** Position of the sample in the dataset. */
  index: number
  /** Image file name, e.g. `image_000007.png`. */
  filename: string
  /** Encoded PNG image. */
  blob: Blob
  /** Label line in the form `filename<TAB>text`. */
  label: string
  /** Name of the font chosen for this sample. */
  fontName: string
  /** Names of the augmentations that were actually applied. */
  augmentations: Array<string>
  /** Style name of the background that was selected. */
  backgroundStyle: string
  /** Whether the sample was selected for augmentation. */
  isAugmented: boolean
}
/**
 * Create a simple deterministic pseudo-random generator.
 *
 * Each call advances the state with `sin(state) * 10000` and returns its
 * fractional part, so the same seed always yields the same sequence.
 *
 * A seed of 0 is a fixed point of that recurrence (sin(0) is 0) and would
 * return 0 forever; a non-finite seed would return NaN forever. Both are
 * replaced by a fixed non-zero seed. Every other seed produces exactly the
 * sequence it always did.
 *
 * @param seed - Starting state of the generator.
 * @returns A function that returns the next value on each call.
 */
function seededRandom(seed: number) {
  const fallbackSeed = 1
  let state = Number.isFinite(seed) && seed !== 0 ? seed : fallbackSeed
  return function () {
    state = Math.sin(state) * 10000
    return state - Math.floor(state)
  }
}
/**
 * Create a separate seeded generator for one sample index, so samples can be
 * rendered in any order (or in parallel) and still be reproducible.
 *
 * @param baseSeed - Dataset-level seed.
 * @param index - Sample index; the seed is offset by `index * 1000`.
 * @returns A generator created by seededRandom for the derived seed.
 */
function seededRandomForIndex(baseSeed: number, index: number) {
  const derivedSeed = baseSeed + index * 1000
  return seededRandom(derivedSeed)
}
/**
 * Make a font available for canvas rendering.
 *
 * With a `dataUrl` the font is loaded through FontFace and added to
 * `document.fonts`; without one the family name is assumed to be usable as-is.
 *
 * @param font - Font entry to load.
 * @returns The family name to use when drawing; 'Arial' if loading fails or
 *          no family is given.
 */
async function loadFont(font: FontData): Promise<string> {
  if (!font.dataUrl) {
    return font.family || 'Arial'
  }
  try {
    const face = new FontFace(font.family, `url(${font.dataUrl})`)
    await face.load()
    document.fonts.add(face)
    return font.family
  } catch (err) {
    // Best effort: a broken font must not abort the whole run.
    console.warn(`Failed to load font ${font.name}:`, err)
    return 'Arial'
  }
}
/**
 * Pick a font at random according to the distribution weights.
 *
 * The roll is scaled to the sum of all weights, so the weights act as
 * relative shares even when they do not add up to exactly 100 (the same
 * normalisation selectBackground applies). For weights that sum to 100 the
 * result is unchanged.
 *
 * @param fonts - Candidate fonts with their `percentage` weights.
 * @param random - Generator returning values in [0, 1); called once, and not
 *                 at all when `fonts` is empty.
 * @returns The chosen font; a default Arial entry when `fonts` is empty, and
 *          the last font when no weight range contains the roll (for example
 *          when all weights are zero).
 */
function selectFont(fonts: FontData[], random: () => number): FontData {
  if (!fonts || fonts.length === 0) {
    return { name: 'Default', family: 'Arial', percentage: 100 }
  }

  const totalWeight = fonts.reduce((sum, font) => sum + font.percentage, 0)
  const roll = random() * totalWeight

  let cumulative = 0
  for (const font of fonts) {
    cumulative += font.percentage
    if (roll < cumulative) {
      return font
    }
  }
  return fonts[fonts.length - 1]
}
/**
 * Apply geometric augmentations to the context's transform before text is drawn.
 *
 * Each augmentation is only considered when its strength in `augValues` is
 * truthy, and is then applied with a 50% chance. `random` is consumed only
 * for augmentations that are considered, so the call order here is part of
 * the reproducible output.
 *
 * @param ctx - Context whose transform is modified.
 * @param canvas - Canvas the context belongs to; its centre is the rotation pivot.
 * @param augValues - Strengths: `rotation` (maximum angle in degrees) and `skew`.
 * @param random - Generator returning values in [0, 1).
 * @returns Names of the augmentations that were applied, in application order.
 */
function applyAugmentation(
  ctx: CanvasRenderingContext2D,
  canvas: HTMLCanvasElement,
  augValues: Record<string, number>,
  random: () => number
): string[] {
  const names: string[] = []

  // Rotation about the canvas centre
  if (augValues.rotation && random() > 0.5) {
    const degrees = (random() - 0.5) * 2 * augValues.rotation
    const radians = degrees * Math.PI / 180
    const centerX = canvas.width / 2
    const centerY = canvas.height / 2
    ctx.translate(centerX, centerY)
    ctx.rotate(radians)
    ctx.translate(-centerX, -centerY)
    names.push('rotation')
  }

  // Vertical shear as an approximation of skew
  if (augValues.skew && random() > 0.5) {
    const shear = (random() - 0.5) * augValues.skew * 0.01
    ctx.transform(1, shear, 0, 1, 0, 0)
    names.push('skew')
  }

  return names
}
/** Fill colour (hex) for each built-in background style id. */
const backgroundColors: Record<string, string> = {
  // Whites and near-whites
  clean_white: '#FFFFFF',
  aged_paper: '#F5E6D3',
  book_page: '#FAF0E6',
  newspaper: '#E8E8E8',
  notebook: '#FFFEF0',
  // Yellowed and worn paper tones
  parchment: '#F0E68C',
  weathered: '#D4C4A8',
  coffee_stain: '#E6D5C3',
  old_book: '#E8DCC4',
  recycled: '#D9D4C5',
  // Light warm tones
  cream: '#FFFDD0',
  ivory: '#FFFFF0',
}
/**
 * Resolve a background colour for the given configuration.
 *
 * In 'mix' mode (and only when a `random` generator is supplied) a style is
 * drawn from `backgroundPercentages` against a roll in [0, 100); style ids
 * without a colour entry are skipped. Otherwise, or when nothing matched, the
 * configured `backgroundStyle` is looked up, falling back to
 * `config.image.background` and finally to white.
 *
 * @param config - Generator configuration.
 * @param random - Optional generator returning values in [0, 1).
 * @returns A CSS colour string.
 */
function getBackgroundColor(config: GeneratorConfig, random?: () => number): string {
  const { backgroundMode, backgroundPercentages, backgroundStyle, background } = config.image

  if (backgroundMode === 'mix' && backgroundPercentages && random) {
    const roll = random() * 100
    let runningTotal = 0
    for (const styleId of Object.keys(backgroundPercentages)) {
      runningTotal += backgroundPercentages[styleId]
      const mixedColor = backgroundColors[styleId]
      if (roll < runningTotal && mixedColor) {
        return mixedColor
      }
    }
  }

  // Single mode, or nothing matched in mix mode
  const singleColor = backgroundStyle ? backgroundColors[backgroundStyle] : undefined
  if (singleColor) {
    return singleColor
  }
  return background || '#FFFFFF'
}
/**
 * Load an image element from a data URL.
 *
 * @param dataUrl - Image source assigned to the element's `src`.
 * @returns A promise that resolves with the loaded image and rejects with the
 *          `onerror` payload if loading fails.
 */
async function loadImage(dataUrl: string): Promise<HTMLImageElement> {
  const img = new Image()
  const loaded = new Promise<HTMLImageElement>((resolve, reject) => {
    img.onload = () => resolve(img)
    img.onerror = reject
  })
  // Handlers are attached before the source is set so no event can be missed.
  img.src = dataUrl
  return loaded
}
/**
 * Pick the background for one sample: either a built-in style colour or a
 * custom background image.
 *
 * Candidates are the entries of `backgroundPercentages` whose style id has a
 * colour in `backgroundColors`, followed by the custom backgrounds; in both
 * cases only entries with a positive percentage count. The roll is scaled to
 * the combined weight of those candidates. Style ids without a colour are
 * excluded up front so their weight is not silently handed to whichever
 * entry happens to follow them.
 *
 * When there are no candidates, `random` is not called and the configured
 * single style (default `clean_white`) is returned.
 *
 * @param config - Generator configuration.
 * @param random - Generator returning values in [0, 1).
 * @returns `{ type: 'color' }` with a CSS colour, or `{ type: 'image' }` with
 *          the image data URL; `styleName` identifies the choice for statistics.
 */
function selectBackground(config: GeneratorConfig, random: () => number): { type: 'color', value: string, styleName: string } | { type: 'image', value: string, styleName: string } {
  const customBgs = config.image.customBackgrounds?.filter(bg => bg.percentage > 0) || []
  const styleEntries = Object.entries(config.image.backgroundPercentages || {})
    .filter(([styleId, percentage]) => percentage > 0 && Boolean(backgroundColors[styleId]))

  const totalStylePct = styleEntries.reduce((acc, [, percentage]) => acc + percentage, 0)
  const totalCustomPct = customBgs.reduce((acc, bg) => acc + bg.percentage, 0)
  const totalPct = totalStylePct + totalCustomPct

  if (totalPct > 0) {
    // Random selection based on combined weights: styles first, then custom images
    const roll = random() * totalPct
    let cumulative = 0

    for (const [styleId, percentage] of styleEntries) {
      cumulative += percentage
      if (roll < cumulative) {
        return { type: 'color', value: backgroundColors[styleId], styleName: styleId }
      }
    }

    for (const bg of customBgs) {
      cumulative += bg.percentage
      if (roll < cumulative) {
        return { type: 'image', value: bg.dataUrl, styleName: bg.name }
      }
    }
  }

  // No weighted candidates (or the roll landed exactly on the upper bound):
  // fall back to the single configured style.
  const style = config.image.backgroundStyle || 'clean_white'
  return { type: 'color', value: backgroundColors[style] || config.image.background || '#FFFFFF', styleName: style }
}
/**
 * Render one text string onto a pooled canvas and encode it as a PNG blob.
 *
 * Steps: pick and paint a background, optionally apply geometric
 * augmentations (see applyAugmentation) around the text draw, optionally
 * apply pixel augmentations (brightness / noise, through WebGL when
 * isWebGLAvailable() reports true, otherwise per pixel on the CPU), then
 * encode the canvas.
 *
 * The order in which `random` is called determines the image produced for a
 * given seed, so statements that draw from `random` must not be reordered.
 *
 * The canvas is always returned to the pool and the context's save/restore
 * stack is always rebalanced, including when encoding throws.
 *
 * @param text - String to draw.
 * @param config - Generator configuration (size, colours, direction, augmentation switch).
 * @param fontFamily - CSS font family to draw with; Arial and sans-serif are appended as fallbacks.
 * @param shouldAugment - Whether this sample was selected for augmentation.
 * @param augValues - Augmentation strengths keyed by name; `brightness` and `gaussian_noise` are read here.
 * @param random - Generator returning values in [0, 1).
 * @returns The PNG blob, the names of the augmentations applied and the style name of the chosen background.
 * @throws Error when the canvas cannot be converted to a blob.
 */
async function renderTextToCanvas(
  text: string,
  config: GeneratorConfig,
  fontFamily: string,
  shouldAugment: boolean,
  augValues: Record<string, number>,
  random: () => number
): Promise<{ blob: Blob; augmentations: string[]; backgroundStyle: string }> {
  // Use the canvas pool for better performance
  const { canvas, ctx } = acquireCanvas(config.image.width, config.image.height)
  const augmentActive = shouldAugment && config.augmentation.enabled
  let appliedAugmentations: string[] = []
  // Tracks an outstanding ctx.save() so the finally block can undo it on error
  let stateSaved = false

  try {
    // Select and paint the background
    const bg = selectBackground(config, random)
    if (bg.type === 'color') {
      ctx.fillStyle = bg.value
      ctx.fillRect(0, 0, canvas.width, canvas.height)
    } else {
      try {
        const img = await loadImage(bg.value)
        ctx.drawImage(img, 0, 0, canvas.width, canvas.height)
      } catch {
        // Fall back to white if the custom image cannot be loaded or drawn
        ctx.fillStyle = '#FFFFFF'
        ctx.fillRect(0, 0, canvas.width, canvas.height)
      }
    }

    // Geometric augmentations only affect the text, so they are scoped by save/restore
    if (augmentActive) {
      ctx.save()
      stateSaved = true
      appliedAugmentations = applyAugmentation(ctx, canvas, augValues, random)
    }

    // Text properties with the selected font
    const isRtl = config.image.direction === 'rtl'
    const fontSize = Math.min(canvas.height * 0.6, 48)
    ctx.font = `${fontSize}px "${fontFamily}", Arial, sans-serif`
    ctx.fillStyle = config.image.textColor
    ctx.textAlign = isRtl ? 'right' : 'left'
    ctx.textBaseline = 'middle'
    ctx.direction = config.image.direction as CanvasDirection

    // Draw the text with a 10px margin from the starting edge, vertically centred
    const textX = isRtl ? canvas.width - 10 : 10
    const textY = canvas.height / 2
    ctx.fillText(text, textX, textY)

    if (stateSaved) {
      ctx.restore()
      stateSaved = false
    }

    // Pixel-level augmentations (GPU-accelerated when available)
    if (augmentActive) {
      const applyBrightness = augValues.brightness && random() > 0.5
      const applyNoise = augValues.gaussian_noise && random() > 0.6

      if (applyBrightness || applyNoise) {
        if (isWebGLAvailable()) {
          // GPU path: all pixel augmentations in a single pass
          const gpuOptions: GPUAugmentOptions = {
            brightness: applyBrightness ? (random() - 0.5) * augValues.brightness / 50 : 0,
            contrast: 1, // Contrast augmentation is not applied
            noiseAmount: applyNoise ? augValues.gaussian_noise / 200 : 0,
            seed: random() * 1000
          }
          const gpuResult = applyGPUAugmentation(canvas, gpuOptions)

          // NOTE(review): when the GPU call yields nothing usable, no pixel
          // augmentation is applied and there is no CPU fallback.
          if (gpuResult && gpuResult !== canvas) {
            const gpuCtx = gpuResult.getContext('webgl')
            if (gpuCtx) {
              const pixels = new Uint8Array(canvas.width * canvas.height * 4)
              gpuCtx.readPixels(0, 0, canvas.width, canvas.height, gpuCtx.RGBA, gpuCtx.UNSIGNED_BYTE, pixels)

              // readPixels returns rows bottom-up, so flip vertically while copying
              const imageData = ctx.createImageData(canvas.width, canvas.height)
              const rowSize = canvas.width * 4
              for (let row = 0; row < canvas.height; row++) {
                const srcRow = (canvas.height - 1 - row) * rowSize
                const dstRow = row * rowSize
                imageData.data.set(pixels.subarray(srcRow, srcRow + rowSize), dstRow)
              }
              ctx.putImageData(imageData, 0, 0)

              if (applyBrightness) appliedAugmentations.push('brightness_gpu')
              if (applyNoise) appliedAugmentations.push('noise_gpu')
            }
          }
        } else {
          // CPU fallback path
          if (applyBrightness) {
            const adjustment = 1 + (random() - 0.5) * augValues.brightness / 50
            const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height)
            for (let i = 0; i < imageData.data.length; i += 4) {
              imageData.data[i] = Math.min(255, imageData.data[i] * adjustment)
              imageData.data[i + 1] = Math.min(255, imageData.data[i + 1] * adjustment)
              imageData.data[i + 2] = Math.min(255, imageData.data[i + 2] * adjustment)
            }
            ctx.putImageData(imageData, 0, 0)
            appliedAugmentations.push('brightness')
          }

          if (applyNoise) {
            const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height)
            const noiseLevel = augValues.gaussian_noise / 2
            for (let i = 0; i < imageData.data.length; i += 4) {
              // One noise value per pixel, added equally to R, G and B
              const noise = (random() - 0.5) * noiseLevel
              imageData.data[i] = Math.max(0, Math.min(255, imageData.data[i] + noise))
              imageData.data[i + 1] = Math.max(0, Math.min(255, imageData.data[i + 1] + noise))
              imageData.data[i + 2] = Math.max(0, Math.min(255, imageData.data[i + 2] + noise))
            }
            ctx.putImageData(imageData, 0, 0)
            appliedAugmentations.push('noise')
          }
        }
      }
    }

    // Await the encode inside the try so that a toBlob failure also reaches
    // the finally block and the canvas is returned to the pool.
    const blob = await new Promise<Blob | null>((resolve) => {
      canvas.toBlob(resolve, 'image/png')
    })
    if (!blob) {
      throw new Error('Failed to convert canvas to blob')
    }
    return { blob, augmentations: appliedAugmentations, backgroundStyle: bg.styleName }
  } finally {
    if (stateSaved) {
      ctx.restore()
    }
    releaseCanvas(canvas)
  }
}
/**
 * Render one dataset sample end to end: derive its random stream, choose a
 * font, decide on augmentation, render the image and build its label.
 *
 * @param index - Sample index; also determines the file name and random stream.
 * @param text - Text to render.
 * @param config - Generator configuration.
 * @param loadedFonts - Map from font name to the family name that was actually loaded.
 * @param augValues - Augmentation strengths keyed by name.
 * @param baseSeed - Dataset-level seed.
 * @returns The rendered image together with its label and bookkeeping data.
 */
async function renderSample(
  index: number,
  text: string,
  config: GeneratorConfig,
  loadedFonts: Map<string, string>,
  augValues: Record<string, number>,
  baseSeed: number
): Promise<SampleResult> {
  // Per-index generator keeps each sample reproducible regardless of batch order
  const random = seededRandomForIndex(baseSeed, index)

  const selectedFont = selectFont(config.fonts.distribution, random)
  const fontFamily = loadedFonts.get(selectedFont.name) || selectedFont.family || 'Arial'

  // The roll is only drawn when augmentation is enabled
  const shouldAugment = config.augmentation.enabled &&
    (random() * 100) < config.augmentation.applyPercentage

  const rendered = await renderTextToCanvas(text, config, fontFamily, shouldAugment, augValues, random)

  const paddedIndex = String(index).padStart(6, '0')
  const filename = `image_${paddedIndex}.png`
  const label = `${filename}\t${text}`

  return {
    index,
    filename,
    blob: rendered.blob,
    label,
    fontName: selectedFont.name,
    augmentations: rendered.augmentations,
    backgroundStyle: rendered.backgroundStyle,
    isAugmented: shouldAugment
  }
}
/**
 * Check that a generation run can start with the given configuration and text.
 *
 * @param config - Generator configuration.
 * @param textData - Text lines to render.
 * @returns `valid: false` with an `error` message when there is no text or
 *          the requested size is not a positive number (NaN included);
 *          otherwise `valid: true`, with `adjustedSize` set to the number of
 *          text lines when the requested size exceeds it. Background
 *          percentages that do not total roughly 100 in mix mode only produce
 *          a console warning.
 */
function validateInputs(config: GeneratorConfig, textData: string[]): { valid: boolean; error?: string; adjustedSize?: number } {
  if (!textData || textData.length === 0) {
    return { valid: false, error: 'No text data provided. Please upload a text file.' }
  }

  // Written as a negated `>` so that NaN is rejected too (`NaN <= 0` is false)
  if (!(config.dataset.size > 0)) {
    return { valid: false, error: 'Dataset size must be greater than 0.' }
  }

  // Check background percentages in mix mode
  if (config.image.backgroundMode === 'mix' && config.image.backgroundPercentages) {
    const total = Object.values(config.image.backgroundPercentages).reduce((a, b) => a + b, 0)
    const customTotal = config.image.customBackgrounds?.reduce((a, b) => a + b.percentage, 0) || 0
    const combined = total + customTotal
    if (Math.abs(combined - 100) > 1 && combined > 0) {
      console.warn(`Background percentages total ${combined}%, expected 100%`)
    }
  }

  // Cap the dataset size at the amount of available text
  if (config.dataset.size > textData.length) {
    console.warn(`Dataset size (${config.dataset.size}) exceeds available samples (${textData.length}). Adjusting to ${textData.length}.`)
    return { valid: true, adjustedSize: textData.length }
  }

  return { valid: true }
}
/**
 * Add the requested label/annotation files to the archive.
 *
 * Each label is a `filename<TAB>text` line. Depending on `formats` this writes:
 *  - `labels.txt`   (crnn / paddleocr): the raw label lines;
 *  - `data.jsonl`   (trocr / jsonl): one `{ image, text }` JSON object per line;
 *  - `data.csv`     (csv): `image,text` header plus quoted rows;
 *  - `data.json`    (json): pretty-printed array of `{ image, text }`;
 *  - `metadata.csv` (huggingface): `file_name,text` header plus quoted rows.
 * Image paths are prefixed with `images/` in every format except `labels.txt`.
 *
 * @param labels - Label lines collected during generation.
 * @param formats - Output format identifiers.
 * @param zip - Archive that receives the files.
 */
function buildLabelFiles(labels: string[], formats: string[], zip: JSZip) {
  console.log(`Building label files for ${labels.length} samples, formats: ${formats.join(', ')}`)

  const wantsAny = (...names: string[]): boolean => names.some(name => formats.includes(name))

  // Split at the first tab only: everything after it is text, tabs included
  const toRecord = (label: string): { image: string; text: string } => {
    const tabAt = label.indexOf('\t')
    const filename = tabAt === -1 ? label : label.slice(0, tabAt)
    const text = tabAt === -1 ? '' : label.slice(tabAt + 1)
    return { image: `images/${filename}`, text }
  }

  // Both fields quoted; embedded double quotes are doubled
  const toCsvRow = (label: string): string => {
    const { image, text } = toRecord(label)
    return `"${image}","${text.replace(/"/g, '""')}"`
  }

  const toCsv = (header: string): string => [header, ...labels.map(toCsvRow)].join('\n')

  // CRNN/PaddleOCR format: labels.txt
  if (wantsAny('crnn', 'paddleocr')) {
    zip.file('labels.txt', labels.join('\n'))
    console.log(`Created labels.txt with ${labels.length} entries`)
  }

  // TrOCR/JSONL format: data.jsonl
  if (wantsAny('trocr', 'jsonl')) {
    const lines = labels.map(label => JSON.stringify(toRecord(label)))
    zip.file('data.jsonl', lines.join('\n'))
    console.log(`Created data.jsonl with ${labels.length} entries`)
  }

  // CSV format: data.csv
  if (wantsAny('csv')) {
    zip.file('data.csv', toCsv('image,text'))
    console.log(`Created data.csv with ${labels.length} entries`)
  }

  // JSON format: data.json (array format)
  if (wantsAny('json')) {
    zip.file('data.json', JSON.stringify(labels.map(toRecord), null, 2))
    console.log(`Created data.json with ${labels.length} entries`)
  }

  // HuggingFace format: metadata.csv
  if (wantsAny('huggingface')) {
    zip.file('metadata.csv', toCsv('file_name,text'))
    console.log(`Created metadata.csv for HuggingFace with ${labels.length} entries`)
  }
}
| // Main generation function with parallel processing | |
| export async function generateDataset( | |
| config: GeneratorConfig, | |
| textData: string[], | |
| onProgress: (progress: number, message: string) => void, | |
| abortSignal?: AbortSignal | |
| ): Promise<GenerationResult> { | |
| const startTime = Date.now() | |
| // Validate inputs | |
| const validation = validateInputs(config, textData) | |
| if (!validation.valid) { | |
| throw new Error(validation.error) | |
| } | |
| if (validation.adjustedSize) { | |
| config.dataset.size = validation.adjustedSize | |
| } | |
| const zip = new JSZip() | |
| const imagesFolder = zip.folder('images')! | |
| // Get augmentation values | |
| const augValues = config.augmentation.customMode && config.augmentation.values | |
| ? config.augmentation.values | |
| : { | |
| rotation: 5, | |
| skew: 3, | |
| gaussian_blur: 1, | |
| motion_blur: 3, | |
| gaussian_noise: 10, | |
| brightness: 15, | |
| contrast: 20, | |
| jpeg_quality: 70, | |
| } | |
| // Determine number of samples | |
| const numSamples = Math.min(config.dataset.size, textData.length) | |
| // Determine batch size based on available resources and dataset size | |
| // Smaller batches for very large datasets to manage memory | |
| const cpuCores = typeof navigator !== 'undefined' ? (navigator.hardwareConcurrency || 4) : 4 | |
| let BATCH_SIZE = Math.min(cpuCores, 8) | |
| // For very large datasets, use smaller batches to manage memory | |
| if (numSamples > 50000) { | |
| BATCH_SIZE = Math.min(BATCH_SIZE, 4) | |
| console.log(`Large dataset detected (${numSamples}), reducing batch size for memory management`) | |
| } | |
| console.log(`Starting generation: ${numSamples} samples, batch size: ${BATCH_SIZE}`) | |
| // Load all fonts before starting | |
| onProgress(0, 'Loading fonts...') | |
| const loadedFonts: Map<string, string> = new Map() | |
| if (config.fonts.distribution && config.fonts.distribution.length > 0) { | |
| for (const font of config.fonts.distribution) { | |
| try { | |
| const loadedFamily = await loadFont(font) | |
| loadedFonts.set(font.name, loadedFamily) | |
| onProgress(0, `Loaded font: ${font.name}`) | |
| } catch (err) { | |
| console.warn(`Failed to load font ${font.name}, using fallback`) | |
| loadedFonts.set(font.name, 'Arial') | |
| } | |
| } | |
| } | |
| // Initialize tracking | |
| let cleanCount = 0 | |
| let augmentedCount = 0 | |
| const augmentationCounts: Record<string, number> = {} | |
| const fontUsageCounts: Record<string, number> = {} | |
| const backgroundCounts: Record<string, number> = {} | |
| const labels: string[] = [] | |
| let totalChars = 0 | |
| const uniqueTexts = new Set<string>() | |
| // Store current state for pause/resume | |
| currentState = { | |
| currentIndex: 0, | |
| labels, | |
| fontUsageCounts, | |
| augmentationCounts, | |
| cleanCount, | |
| augmentedCount, | |
| totalChars, | |
| uniqueTexts, | |
| backgroundCounts, | |
| startTime, | |
| zip | |
| } | |
| // Check if Web Workers with OffscreenCanvas are available | |
| const useWorkers = isWorkerRenderingAvailable() | |
| const workerPool = useWorkers ? getWorkerPool() : null | |
| const renderMode = useWorkers ? `Web Workers (${workerPool?.getWorkerCount()} threads)` : `Main Thread (${BATCH_SIZE} parallel)` | |
| console.log(`Starting generation: ${numSamples} samples, mode: ${renderMode}`) | |
| // Generate samples | |
| onProgress(1, `Starting generation (${renderMode})...`) | |
| if (useWorkers && workerPool) { | |
| // === WEB WORKER RENDERING === | |
| // Prepare all tasks upfront for worker distribution | |
| const tasks: WorkerTask[] = [] | |
| const random = seededRandom(config.dataset.seed) | |
| for (let i = 0; i < numSamples; i++) { | |
| const text = textData[i % textData.length] | |
| // Select font and background for this sample | |
| const selectedFont = selectFont(config.fonts.distribution, () => seededRandom(config.dataset.seed + i * 1000)()) | |
| const fontFamily = loadedFonts.get(selectedFont.name) || selectedFont.family || 'Arial' | |
| const bg = selectBackground(config, () => seededRandom(config.dataset.seed + i * 2000)()) | |
| const shouldAugment = config.augmentation.enabled && | |
| (seededRandom(config.dataset.seed + i * 3000)() * 100) < config.augmentation.applyPercentage | |
| tasks.push({ | |
| id: i, | |
| index: i, | |
| text, | |
| config: { | |
| width: config.image.width, | |
| height: config.image.height, | |
| textColor: config.image.textColor, | |
| direction: config.image.direction, | |
| backgroundStyle: bg.styleName, | |
| backgroundColor: bg.type === 'color' ? bg.value : '#FFFFFF' | |
| }, | |
| fontFamily, | |
| shouldAugment, | |
| augValues, | |
| seed: config.dataset.seed | |
| }) | |
| // Track font usage | |
| fontUsageCounts[selectedFont.name] = (fontUsageCounts[selectedFont.name] || 0) + 1 | |
| } | |
| onProgress(5, `Prepared ${numSamples} tasks, dispatching to ${workerPool.getWorkerCount()} workers...`) | |
| try { | |
| // Process all tasks with workers | |
| let processedCount = 0 | |
| const results = await workerPool.processTasks(tasks, (result: WorkerResult) => { | |
| processedCount++ | |
| // Track progress | |
| if (processedCount % 100 === 0 || processedCount >= numSamples) { | |
| const progress = (processedCount / numSamples) * 85 + 5 | |
| onProgress(progress, `Generated ${processedCount}/${numSamples} samples (Workers: ${workerPool.getWorkerCount()} threads)`) | |
| } | |
| }) | |
| // Process results | |
| for (const result of results) { | |
| if (result.error) { | |
| console.warn(`Sample ${result.index} failed:`, result.error) | |
| continue | |
| } | |
| uniqueTexts.add(textData[result.index % textData.length]) | |
| totalChars += textData[result.index % textData.length].length | |
| backgroundCounts[result.backgroundStyle] = (backgroundCounts[result.backgroundStyle] || 0) + 1 | |
| if (result.isAugmented) { | |
| augmentedCount++ | |
| result.augmentations.forEach(aug => { | |
| augmentationCounts[aug] = (augmentationCounts[aug] || 0) + 1 | |
| }) | |
| } else { | |
| cleanCount++ | |
| } | |
| imagesFolder.file(result.filename, result.blob) | |
| labels.push(result.label) | |
| } | |
| currentState.currentIndex = numSamples | |
| currentState.cleanCount = cleanCount | |
| currentState.augmentedCount = augmentedCount | |
| currentState.totalChars = totalChars | |
| } catch (err) { | |
| console.error('Worker pool error, falling back to main thread:', err) | |
| // Fallback happens below | |
| } | |
| } | |
| // === MAIN THREAD FALLBACK or PRIMARY (if workers not available) === | |
| // Only run if labels were not populated by workers | |
| if (labels.length === 0) { | |
| for (let batchStart = 0; batchStart < numSamples; batchStart += BATCH_SIZE) { | |
| // Check for abort | |
| if (abortSignal?.aborted) { | |
| console.log('Generation aborted') | |
| break | |
| } | |
| const batchEnd = Math.min(batchStart + BATCH_SIZE, numSamples) | |
| const batchPromises: Promise<SampleResult>[] = [] | |
| // Create batch of render tasks | |
| for (let i = batchStart; i < batchEnd; i++) { | |
| const text = textData[i % textData.length] | |
| batchPromises.push( | |
| renderSample(i, text, config, loadedFonts, augValues, config.dataset.seed) | |
| ) | |
| } | |
| // Wait for batch to complete | |
| try { | |
| const batchResults = await Promise.all(batchPromises) | |
| // Process results | |
| for (const result of batchResults) { | |
| // Track stats | |
| uniqueTexts.add(textData[result.index % textData.length]) | |
| totalChars += textData[result.index % textData.length].length | |
| // Track font usage | |
| fontUsageCounts[result.fontName] = (fontUsageCounts[result.fontName] || 0) + 1 | |
| // Track background usage | |
| backgroundCounts[result.backgroundStyle] = (backgroundCounts[result.backgroundStyle] || 0) + 1 | |
| // Track augmentation | |
| if (result.isAugmented) { | |
| augmentedCount++ | |
| result.augmentations.forEach(aug => { | |
| augmentationCounts[aug] = (augmentationCounts[aug] || 0) + 1 | |
| }) | |
| } else { | |
| cleanCount++ | |
| } | |
| // Add to zip and labels | |
| imagesFolder.file(result.filename, result.blob) | |
| labels.push(result.label) | |
| } | |
| // Update state for pause/resume | |
| currentState.currentIndex = batchEnd | |
| currentState.cleanCount = cleanCount | |
| currentState.augmentedCount = augmentedCount | |
| currentState.totalChars = totalChars | |
| } catch (err) { | |
| console.error(`Batch error at ${batchStart}-${batchEnd}:`, err) | |
| // Continue with next batch instead of failing completely | |
| } | |
| // Report progress | |
| const progress = ((batchEnd) / numSamples) * 90 + 5 // Reserve 5% for init, 5% for compression | |
| if (batchEnd % (BATCH_SIZE * 5) === 0 || batchEnd >= numSamples) { | |
| onProgress(progress, `Generated ${batchEnd.toLocaleString()}/${numSamples.toLocaleString()} samples`) | |
| } | |
| // Memory cleanup hint for garbage collector (helps with large datasets) | |
| if (batchEnd % 1000 === 0) { | |
| // Allow event loop to process and GC to run | |
| await new Promise(resolve => setTimeout(resolve, 0)) | |
| } | |
| } | |
| } // End of main thread fallback | |
| // Build label files with chunked processing | |
| console.log('Building label files...') | |
| onProgress(92, 'Building label files...') | |
| buildLabelFiles(labels, config.output.formats, zip) | |
| // Add metadata | |
| const metadata = { | |
| generated_at: new Date().toISOString(), | |
| config: { | |
| image_size: `${config.image.width}x${config.image.height}`, | |
| background: getBackgroundColor(config), | |
| background_style: config.image.backgroundStyle, | |
| background_mode: config.image.backgroundMode, | |
| text_color: config.image.textColor, | |
| direction: config.image.direction, | |
| augmentation_enabled: config.augmentation.enabled, | |
| augmentation_percentage: config.augmentation.applyPercentage, | |
| fonts_used: config.fonts.distribution?.map(f => f.name) || [], | |
| output_formats: config.output.formats, | |
| }, | |
| samples: labels.length, | |
| clean_samples: cleanCount, | |
| augmented_samples: augmentedCount, | |
| font_usage: fontUsageCounts, | |
| background_usage: backgroundCounts, | |
| } | |
| zip.file('metadata.json', JSON.stringify(metadata, null, 2)) | |
| // Generate zip blob with dynamic compression based on dataset size | |
| onProgress(95, 'Compressing dataset...') | |
| // Use lower/no compression for large datasets (faster, less memory) | |
| // STORE = no compression, uses less memory | |
| let compressionType: 'DEFLATE' | 'STORE' = 'DEFLATE' | |
| let compressionLevel = 6 | |
| if (labels.length > 20000) { | |
| // Very large: no compression (STORE) to prevent memory issues | |
| compressionType = 'STORE' | |
| compressionLevel = 0 | |
| console.log(`Large dataset (${labels.length}): Using STORE (no compression) to prevent memory issues`) | |
| } else if (labels.length > 10000) { | |
| compressionLevel = 1 // Minimal compression | |
| console.log(`Medium-large dataset (${labels.length}): Using minimal compression level 1`) | |
| } else if (labels.length > 5000) { | |
| compressionLevel = 3 | |
| console.log(`Medium dataset (${labels.length}): Using compression level 3`) | |
| } | |
| console.log(`Compressing ${labels.length} files with ${compressionType}, level ${compressionLevel}`) | |
| let zipBlob: Blob | |
| try { | |
| zipBlob = await zip.generateAsync({ | |
| type: 'blob', | |
| compression: compressionType, | |
| compressionOptions: compressionType === 'DEFLATE' ? { level: compressionLevel } : undefined, | |
| streamFiles: true // Stream files to reduce memory usage | |
| }) | |
| } catch (memoryError: any) { | |
| console.error('ZIP generation failed, trying without compression:', memoryError) | |
| onProgress(96, 'Retrying with no compression...') | |
| // Retry with no compression at all | |
| try { | |
| zipBlob = await zip.generateAsync({ | |
| type: 'blob', | |
| compression: 'STORE', | |
| streamFiles: true | |
| }) | |
| } catch (finalError) { | |
| throw new Error(`Failed to create ZIP: Memory limit exceeded. Try using Local Folder storage mode instead of Memory for large datasets.`) | |
| } | |
| } | |
| const endTime = Date.now() | |
| const durationSeconds = (endTime - startTime) / 1000 | |
| // Build font distribution stats | |
| const fontStats = Object.entries(fontUsageCounts).map(([family, count]) => ({ | |
| family, | |
| count, | |
| percentage: Math.round((count / labels.length) * 100) | |
| })) | |
| // Build augmentation stats | |
| const augStats = Object.entries(augmentationCounts).map(([name, count]) => ({ | |
| name, | |
| count, | |
| percentage: augmentedCount > 0 ? Math.round((count / augmentedCount) * 100) : 0 | |
| })) | |
| // Build background distribution stats | |
| const backgroundStats = Object.entries(backgroundCounts).map(([name, count]) => ({ | |
| name, | |
| count, | |
| percentage: Math.round((count / labels.length) * 100) | |
| })) | |
| // Build result | |
| const result: GenerationResult = { | |
| stats: { | |
| total_samples: labels.length, | |
| duration_seconds: durationSeconds, | |
| samples_per_second: labels.length / durationSeconds, | |
| font_distribution: fontStats.length > 0 | |
| ? fontStats | |
| : [{ family: 'Default (Arial)', count: labels.length, percentage: 100 }], | |
| clean_samples: cleanCount, | |
| augmented_samples: augmentedCount, | |
| augmentation_stats: augStats, | |
| avg_transforms_per_sample: augmentedCount > 0 | |
| ? Object.values(augmentationCounts).reduce((a, b) => a + b, 0) / augmentedCount | |
| : 0, | |
| unique_tokens: uniqueTexts.size, | |
| avg_chars_per_sample: labels.length > 0 ? totalChars / labels.length : 0, | |
| unicode_valid: labels.length, | |
| script_pure: labels.length, | |
| rejected_samples: 0, | |
| background_distribution: backgroundStats.length > 0 ? backgroundStats : undefined | |
| }, | |
| zipBlob | |
| } | |
| // Clear state | |
| currentState = null | |
| onProgress(100, 'Generation complete!') | |
| return result | |
| } | |
/**
 * Build a downloadable ZIP from an in-progress generation (pause/download).
 *
 * Writes the label files for the samples collected so far and a
 * `metadata.json` marked `PARTIAL` into `state.zip`, then serialises that
 * archive. Note that `state.zip` itself is modified: the label files and
 * metadata entry are added to the same archive object held by the state.
 *
 * @param state   Generation state holding the archive, labels and counters.
 * @param formats Label output formats, forwarded to `buildLabelFiles`.
 * @returns The archive serialised as a Blob.
 */
export async function buildPartialZip(state: GenerationState, formats: string[]): Promise<Blob> {
  const zip = state.zip

  // Build label files for completed samples
  buildLabelFiles(state.labels, formats, zip)

  // Add partial metadata
  zip.file('metadata.json', JSON.stringify({
    status: 'PARTIAL',
    completed_samples: state.currentIndex,
    timestamp: new Date().toISOString(),
    font_usage: state.fontUsageCounts,
    background_usage: state.backgroundCounts,
    clean_samples: state.cleanCount,
    augmented_samples: state.augmentedCount,
  }, null, 2))

  return zip.generateAsync({
    type: 'blob',
    compression: 'DEFLATE',
    compressionOptions: { level: 6 },
    // Stream entries, as the full-dataset export does, to reduce memory usage
    // while the archive is being assembled.
    streamFiles: true
  })
}
/**
 * Hand the generated dataset archive to file-saver's `saveAs` so it is
 * offered to the user as a download.
 *
 * @param zipBlob  The ZIP archive to save.
 * @param filename Name to save under; defaults to `ocr_dataset.zip`.
 */
export function downloadDataset(zipBlob: Blob, filename: string = 'ocr_dataset.zip') {
  saveAs(zipBlob, filename)
}
/**
 * Split raw text into segments according to the requested segmentation mode.
 *
 * Modes:
 * - `character`: one segment per Unicode code point, whitespace dropped.
 * - `word`: whitespace-separated tokens.
 * - `line`: non-blank lines; `\n`, `\r\n` and `\r` are all treated as line
 *   breaks so Windows-style input does not leave a trailing `\r` on segments.
 * - `sentence`: text between runs of `. ! ?` and the Arabic-script marks
 *   `؟ ۔`, trimmed.
 * - `ngram`: every run of 2, 3 and 4 consecutive words, joined by one space.
 *
 * Any other value falls back to `word` behaviour.
 *
 * @param text         The raw text to split.
 * @param segmentation Segmentation mode name (see above).
 * @returns The non-blank segments in source order.
 */
export function parseTextFile(
  text: string,
  segmentation: string
): string[] {
  const nonBlank = (s: string): boolean => s.trim().length > 0
  const splitWords = (): string[] => text.split(/\s+/).filter(nonBlank)

  switch (segmentation) {
    case 'character':
      // Array.from iterates by code point, so surrogate pairs stay intact.
      return Array.from(text).filter(nonBlank)
    case 'line':
      return text.split(/\r\n|\r|\n/).filter(nonBlank)
    case 'sentence':
      return text.split(/[.!?؟۔]+/).filter(nonBlank).map(s => s.trim())
    case 'ngram': {
      // Braces keep these declarations scoped to this case only.
      const words = splitWords()
      const ngrams: string[] = []
      for (let i = 0; i < words.length; i++) {
        for (let len = 2; len <= 4 && i + len <= words.length; len++) {
          ngrams.push(words.slice(i, i + len).join(' '))
        }
      }
      return ngrams
    }
    case 'word':
    default:
      return splitWords()
  }
}