Spaces:
Sleeping
Sleeping
Update OCR Dataset Generator
Browse files- web/app/api/storage/route.ts +186 -0
- web/components/generation-panel.tsx +253 -4
- web/lib/cleanup.ts +136 -0
- web/lib/direct-generator.ts +307 -0
- web/lib/generator.ts +226 -130
- web/lib/storage-manager.ts +468 -0
web/app/api/storage/route.ts
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { NextRequest, NextResponse } from 'next/server'
|
| 2 |
+
import { writeFile, mkdir, readdir, rm, stat } from 'fs/promises'
|
| 3 |
+
import { existsSync } from 'fs'
|
| 4 |
+
import path from 'path'
|
| 5 |
+
|
| 6 |
+
// Storage directories
|
| 7 |
+
const OUTPUT_DIR = process.env.OUTPUT_DIR || './output/datasets'
|
| 8 |
+
const DATA_DIR = process.env.DATA_DIR || '/data/datasets'
|
| 9 |
+
const CLEANUP_HOURS = parseInt(process.env.CLEANUP_HOURS || '24')
|
| 10 |
+
|
| 11 |
+
/**
 * Pick the storage root for a request.
 *
 * @param mode - `'huggingface'` selects `DATA_DIR`; any other value selects `OUTPUT_DIR`.
 * @returns The base directory under which per-session folders live.
 */
function getBaseDir(mode: 'local' | 'huggingface'): string {
  if (mode === 'huggingface') {
    return DATA_DIR
  }
  return OUTPUT_DIR
}
|
| 15 |
+
|
| 16 |
+
/**
 * Create `dirPath` (including any missing parent directories) if it is not already present.
 *
 * `mkdir` with `recursive: true` resolves without error when the directory already exists,
 * so no separate existence check is needed. Dropping the synchronous `existsSync` probe
 * avoids blocking the event loop and removes the check-then-create race between
 * concurrent requests.
 *
 * @param dirPath - Directory to create.
 * @throws If the directory cannot be created, e.g. when `dirPath` already exists as a
 *   regular file or permissions are insufficient.
 */
async function ensureDir(dirPath: string): Promise<void> {
  await mkdir(dirPath, { recursive: true })
}
|
| 22 |
+
|
| 23 |
+
/**
 * True when `value` is a non-empty string that is safe to use as ONE path segment:
 * it contains no path separators or NUL bytes and is not `.` or `..`.
 * Used to stop client-supplied names from escaping the storage directory.
 */
function isSafePathSegment(value: unknown): value is string {
  return (
    typeof value === 'string' &&
    value.length > 0 &&
    value !== '.' &&
    value !== '..' &&
    !/[\\/\0]/.test(value)
  )
}

/**
 * POST - Write a file or finalize a session.
 *
 * Two request shapes are accepted:
 * - `multipart/form-data` with `file`, `sessionId` and optional `mode`: the file is written to
 *   `<baseDir>/<sessionId>/images/<file name>`.
 * - JSON `{ sessionId, mode, type, content }` where `type` is `'labels'`, `'metadata'`
 *   or `'finalize'`: writes `labels.txt`, `metadata.json` or the `.complete` marker.
 *
 * `sessionId` and the uploaded file name come from the client and end up in filesystem
 * paths, so both are validated as single path segments before use.
 *
 * @returns 200 with `{ success: true, ... }`, 400 for missing or invalid input, 500 on unexpected errors.
 */
export async function POST(request: NextRequest) {
  try {
    const contentType = request.headers.get('content-type') || ''

    if (contentType.includes('multipart/form-data')) {
      // Handle file upload
      const formData = await request.formData()
      const file = formData.get('file')
      const sessionId = formData.get('sessionId')
      const mode = (formData.get('mode') as 'local' | 'huggingface') || 'local'

      // A form field named "file" may be a plain string; only real file entries are accepted.
      if (!file || typeof file === 'string' || !sessionId) {
        return NextResponse.json({ error: 'Missing file or sessionId' }, { status: 400 })
      }
      if (!isSafePathSegment(sessionId)) {
        return NextResponse.json({ error: 'Invalid sessionId' }, { status: 400 })
      }

      // Keep only the final name component so "../" sequences cannot leave imagesDir.
      const fileName = path.basename(file.name.replace(/\\/g, '/'))
      if (!isSafePathSegment(fileName)) {
        return NextResponse.json({ error: 'Invalid file name' }, { status: 400 })
      }

      const baseDir = getBaseDir(mode)
      const sessionDir = path.join(baseDir, sessionId)
      const imagesDir = path.join(sessionDir, 'images')
      await ensureDir(imagesDir)

      const buffer = Buffer.from(await file.arrayBuffer())
      const filePath = path.join(imagesDir, fileName)
      await writeFile(filePath, buffer)

      return NextResponse.json({ success: true, path: filePath })
    } else {
      // Handle JSON requests
      const body = await request.json()
      const { sessionId, mode, type, content } = body

      if (!sessionId || !mode || !type) {
        return NextResponse.json({ error: 'Missing required fields' }, { status: 400 })
      }
      if (!isSafePathSegment(sessionId)) {
        return NextResponse.json({ error: 'Invalid sessionId' }, { status: 400 })
      }

      const baseDir = getBaseDir(mode)
      const sessionDir = path.join(baseDir, sessionId)
      await ensureDir(sessionDir)

      switch (type) {
        case 'labels':
        case 'metadata': {
          // writeFile rejects non-string/non-buffer data; report that as a client error.
          if (typeof content !== 'string') {
            return NextResponse.json({ error: 'Missing content' }, { status: 400 })
          }
          const targetName = type === 'labels' ? 'labels.txt' : 'metadata.json'
          await writeFile(path.join(sessionDir, targetName), content)
          return NextResponse.json({ success: true })
        }

        case 'finalize':
          // Mark as complete by writing a timestamp file
          await writeFile(
            path.join(sessionDir, '.complete'),
            new Date().toISOString()
          )
          return NextResponse.json({
            success: true,
            path: sessionDir,
            // Encode the values so they cannot inject extra query parameters.
            downloadUrl: `/api/storage/download?session=${encodeURIComponent(sessionId)}&mode=${encodeURIComponent(String(mode))}`
          })

        default:
          return NextResponse.json({ error: 'Unknown type' }, { status: 400 })
      }
    }
  } catch (error) {
    console.error('Storage API error:', error)
    return NextResponse.json({ error: 'Internal server error' }, { status: 500 })
  }
}
|
| 92 |
+
|
| 93 |
+
/**
 * GET - List datasets or get config.
 *
 * Query parameters:
 * - `action=list`: returns `{ datasets }`, one entry per sub-directory of the base directory
 *   chosen by `mode` (an empty list when that directory does not exist).
 * - `action=config`: returns the configured directories, the cleanup age, and whether `/data` exists.
 *
 * Any other `action` yields a 400 response; unexpected failures yield a 500.
 */
export async function GET(request: NextRequest) {
  try {
    const query = new URL(request.url).searchParams
    const action = query.get('action')
    const mode = (query.get('mode') as 'local' | 'huggingface') || 'local'

    if (action === 'config') {
      return NextResponse.json({
        outputDir: OUTPUT_DIR,
        dataDir: DATA_DIR,
        cleanupHours: CLEANUP_HOURS,
        isHuggingFace: existsSync('/data')
      })
    }

    if (action !== 'list') {
      return NextResponse.json({ error: 'Unknown action' }, { status: 400 })
    }

    const baseDir = getBaseDir(mode)
    if (!existsSync(baseDir)) {
      return NextResponse.json({ datasets: [] })
    }

    const dirents = await readdir(baseDir, { withFileTypes: true })
    const directories = dirents.filter((dirent) => dirent.isDirectory())

    // Describe every dataset directory; the stat calls run concurrently.
    const datasets = await Promise.all(
      directories.map(async (dirent) => {
        const datasetPath = path.join(baseDir, dirent.name)
        const info = await stat(datasetPath)
        // A ".complete" marker file flags a finalized dataset.
        const isComplete = existsSync(path.join(datasetPath, '.complete'))
        const ageMs = Date.now() - info.birthtime.getTime()

        return {
          name: dirent.name,
          path: datasetPath,
          createdAt: info.birthtime.toISOString(),
          isComplete,
          ageHours: ageMs / (1000 * 60 * 60)
        }
      })
    )

    return NextResponse.json({ datasets })
  } catch (error) {
    console.error('Storage GET error:', error)
    return NextResponse.json({ error: 'Internal server error' }, { status: 500 })
  }
}
|
| 145 |
+
|
| 146 |
+
/**
 * DELETE - Cleanup old datasets.
 *
 * Removes every dataset directory under the base directory selected by `mode` whose age
 * exceeds `maxAge` hours (query parameter; defaults to `CLEANUP_HOURS`).
 *
 * @returns `{ cleaned, cleanedPaths, message }` on success, 400 when `maxAge` is not a
 *   non-negative number, 500 on unexpected errors.
 */
export async function DELETE(request: NextRequest) {
  try {
    const { searchParams } = new URL(request.url)
    const mode = (searchParams.get('mode') as 'local' | 'huggingface') || 'local'
    const maxAgeHours = Number.parseInt(searchParams.get('maxAge') || String(CLEANUP_HOURS), 10)

    // NaN would silently match nothing and a negative age would match every dataset,
    // including ones still being written, so reject both up front.
    if (!Number.isFinite(maxAgeHours) || maxAgeHours < 0) {
      return NextResponse.json(
        { error: 'maxAge must be a non-negative number of hours' },
        { status: 400 }
      )
    }

    const baseDir = getBaseDir(mode)
    if (!existsSync(baseDir)) {
      return NextResponse.json({ cleaned: 0, message: 'No datasets directory' })
    }

    const entries = await readdir(baseDir, { withFileTypes: true })
    let cleaned = 0
    const cleanedPaths: string[] = []

    for (const entry of entries) {
      if (!entry.isDirectory()) continue

      const datasetPath = path.join(baseDir, entry.name)
      const stats = await stat(datasetPath)

      // On filesystems without creation times, birthtime can be the Unix epoch, which
      // would make a fresh dataset look decades old. Fall back to mtime in that case.
      const bornMs = stats.birthtime.getTime()
      const createdMs = bornMs > 0 ? bornMs : stats.mtime.getTime()
      const ageHours = (Date.now() - createdMs) / (1000 * 60 * 60)

      if (ageHours > maxAgeHours) {
        await rm(datasetPath, { recursive: true, force: true })
        cleaned++
        cleanedPaths.push(entry.name)
        console.log(`Cleaned up old dataset: ${entry.name} (${ageHours.toFixed(1)} hours old)`)
      }
    }

    return NextResponse.json({
      cleaned,
      cleanedPaths,
      message: `Cleaned ${cleaned} datasets older than ${maxAgeHours} hours`
    })
  } catch (error) {
    console.error('Cleanup error:', error)
    return NextResponse.json({ error: 'Internal server error' }, { status: 500 })
  }
}
|
web/components/generation-panel.tsx
CHANGED
|
@@ -1,10 +1,12 @@
|
|
| 1 |
'use client'
|
| 2 |
|
| 3 |
import { useState, useEffect, useRef, Dispatch, SetStateAction } from 'react'
|
| 4 |
-
import { Play, Pause, StopCircle, Download, CheckCircle2, FileArchive, AlertCircle, DownloadCloud, Cpu, Zap, Layers, HelpCircle } from 'lucide-react'
|
| 5 |
import { GenerationStats } from './stats-panel'
|
| 6 |
-
import { generateDataset, downloadDataset, GeneratorConfig, getGenerationState, buildPartialZip } from '@/lib/generator'
|
| 7 |
import { isWebGLAvailable } from '@/lib/gpu-augmentation'
|
|
|
|
|
|
|
| 8 |
|
| 9 |
interface GenerationPanelProps {
|
| 10 |
config: any
|
|
@@ -46,6 +48,19 @@ export function GenerationPanel({
|
|
| 46 |
const logsEndRef = useRef<HTMLDivElement>(null)
|
| 47 |
const abortControllerRef = useRef<AbortController | null>(null)
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
const scrollToBottom = () => {
|
| 50 |
logsEndRef.current?.scrollIntoView({ behavior: 'smooth' })
|
| 51 |
}
|
|
@@ -123,6 +138,85 @@ export function GenerationPanel({
|
|
| 123 |
])
|
| 124 |
|
| 125 |
try {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
const generatorConfig: GeneratorConfig = {
|
| 127 |
dataset: config.dataset,
|
| 128 |
input: {
|
|
@@ -501,8 +595,8 @@ export function GenerationPanel({
|
|
| 501 |
<span className="text-sm">Augmentation</span>
|
| 502 |
</div>
|
| 503 |
<span className={`text-xs font-bold px-2 py-1 rounded ${isWebGLAvailable()
|
| 504 |
-
|
| 505 |
-
|
| 506 |
}`}>
|
| 507 |
{isWebGLAvailable() ? '🎮 GPU' : '💻 CPU'}
|
| 508 |
</span>
|
|
@@ -533,6 +627,161 @@ export function GenerationPanel({
|
|
| 533 |
</div>
|
| 534 |
</div>
|
| 535 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 536 |
</div>
|
| 537 |
|
| 538 |
{/* Progress & Logs */}
|
|
|
|
| 1 |
'use client'
|
| 2 |
|
| 3 |
import { useState, useEffect, useRef, Dispatch, SetStateAction } from 'react'
|
| 4 |
+
import { Play, Pause, StopCircle, Download, CheckCircle2, FileArchive, AlertCircle, DownloadCloud, Cpu, Zap, Layers, HelpCircle, FolderOpen, Database, Cloud, HardDrive, Settings2 } from 'lucide-react'
|
| 5 |
import { GenerationStats } from './stats-panel'
|
| 6 |
+
import { generateDataset, downloadDataset, GeneratorConfig, getGenerationState, buildPartialZip, clearCanvasPool } from '@/lib/generator'
|
| 7 |
import { isWebGLAvailable } from '@/lib/gpu-augmentation'
|
| 8 |
+
import { StorageMode, getStorageManager, resetStorageManager } from '@/lib/storage-manager'
|
| 9 |
+
import { generateDatasetDirect, isDirectGenerationSupported, DirectGenerationConfig } from '@/lib/direct-generator'
|
| 10 |
|
| 11 |
interface GenerationPanelProps {
|
| 12 |
config: any
|
|
|
|
| 48 |
const logsEndRef = useRef<HTMLDivElement>(null)
|
| 49 |
const abortControllerRef = useRef<AbortController | null>(null)
|
| 50 |
|
| 51 |
+
// Storage mode state
|
| 52 |
+
const [storageMode, setStorageMode] = useState<StorageMode>('memory')
|
| 53 |
+
const [selectedFolder, setSelectedFolder] = useState<string | null>(null)
|
| 54 |
+
const [outputPath, setOutputPath] = useState<string | null>(null)
|
| 55 |
+
const [showStorageSettings, setShowStorageSettings] = useState(false)
|
| 56 |
+
|
| 57 |
+
// Directory handle for local folder mode (File System Access API)
|
| 58 |
+
const directoryHandleRef = useRef<FileSystemDirectoryHandle | null>(null)
|
| 59 |
+
|
| 60 |
+
// Rolling rate calculation (last 100 samples)
|
| 61 |
+
const recentSamples = useRef<{ time: number; count: number }[]>([])
|
| 62 |
+
const ROLLING_WINDOW = 100 // Calculate rate from last 100 samples
|
| 63 |
+
|
| 64 |
const scrollToBottom = () => {
|
| 65 |
logsEndRef.current?.scrollIntoView({ behavior: 'smooth' })
|
| 66 |
}
|
|
|
|
| 138 |
])
|
| 139 |
|
| 140 |
try {
|
| 141 |
+
// === LOCAL FOLDER MODE: Direct-to-disk generation ===
|
| 142 |
+
if (storageMode === 'local' && directoryHandleRef.current) {
|
| 143 |
+
setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] 📂 Using DIRECT-TO-DISK mode (no memory limits)`]);
|
| 144 |
+
|
| 145 |
+
const directConfig: DirectGenerationConfig = {
|
| 146 |
+
dataset: config.dataset,
|
| 147 |
+
image: config.image,
|
| 148 |
+
fonts: config.fonts,
|
| 149 |
+
augmentation: config.augmentation,
|
| 150 |
+
output: config.output
|
| 151 |
+
};
|
| 152 |
+
|
| 153 |
+
const genStartTime = Date.now();
|
| 154 |
+
const directResult = await generateDatasetDirect(
|
| 155 |
+
directoryHandleRef.current,
|
| 156 |
+
directConfig,
|
| 157 |
+
data,
|
| 158 |
+
(prog, message) => {
|
| 159 |
+
if (abortControllerRef.current?.signal.aborted) return;
|
| 160 |
+
setProgress(prog);
|
| 161 |
+
|
| 162 |
+
const elapsed = (Date.now() - genStartTime) / 1000;
|
| 163 |
+
if (elapsed > 0 && prog > 0) {
|
| 164 |
+
const samplesGenerated = Math.round(prog * targetSize / 100);
|
| 165 |
+
setRate(Math.round(samplesGenerated / elapsed));
|
| 166 |
+
|
| 167 |
+
const remaining = (100 - prog) / prog * elapsed;
|
| 168 |
+
if (remaining < 60) {
|
| 169 |
+
setEta(`${Math.round(remaining)}s`);
|
| 170 |
+
} else if (remaining < 3600) {
|
| 171 |
+
setEta(`${Math.round(remaining / 60)}m ${Math.round(remaining % 60)}s`);
|
| 172 |
+
} else {
|
| 173 |
+
setEta(`${Math.floor(remaining / 3600)}h ${Math.round((remaining % 3600) / 60)}m`);
|
| 174 |
+
}
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
if (prog % 10 === 0 || prog >= 99) {
|
| 178 |
+
setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] ${message}`]);
|
| 179 |
+
}
|
| 180 |
+
},
|
| 181 |
+
abortControllerRef.current.signal
|
| 182 |
+
);
|
| 183 |
+
|
| 184 |
+
setGenerationStatus('completed');
|
| 185 |
+
setIsGenerating(false);
|
| 186 |
+
setOutputPath(directResult.outputPath);
|
| 187 |
+
|
| 188 |
+
setLogs(prev => [
|
| 189 |
+
...prev,
|
| 190 |
+
`[${new Date().toLocaleTimeString()}] ✅ Generation complete!`,
|
| 191 |
+
`[${new Date().toLocaleTimeString()}] 📁 Files saved to: ${directResult.outputPath}`,
|
| 192 |
+
`[${new Date().toLocaleTimeString()}] 📊 Total samples: ${directResult.totalSamples.toLocaleString()}`,
|
| 193 |
+
`[${new Date().toLocaleTimeString()}] ⏱️ Duration: ${directResult.durationSeconds.toFixed(1)}s`,
|
| 194 |
+
`[${new Date().toLocaleTimeString()}] 💡 No ZIP needed - files are ready to use!`,
|
| 195 |
+
]);
|
| 196 |
+
|
| 197 |
+
// Call onGenerationComplete with minimal stats (no ZIP blob)
|
| 198 |
+
if (onGenerationComplete) {
|
| 199 |
+
onGenerationComplete({
|
| 200 |
+
total_samples: directResult.totalSamples,
|
| 201 |
+
duration_seconds: directResult.durationSeconds,
|
| 202 |
+
samples_per_second: directResult.totalSamples / directResult.durationSeconds,
|
| 203 |
+
font_distribution: [],
|
| 204 |
+
clean_samples: directResult.cleanSamples,
|
| 205 |
+
augmented_samples: directResult.augmentedSamples,
|
| 206 |
+
augmentation_stats: [],
|
| 207 |
+
avg_transforms_per_sample: 0,
|
| 208 |
+
unique_tokens: data.length,
|
| 209 |
+
avg_chars_per_sample: 0,
|
| 210 |
+
unicode_valid: directResult.totalSamples,
|
| 211 |
+
script_pure: directResult.totalSamples,
|
| 212 |
+
rejected_samples: 0
|
| 213 |
+
}, new Blob()); // Empty blob since files are on disk
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
return; // Exit early - no ZIP needed
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
// === MEMORY MODE: Original ZIP-based generation ===
|
| 220 |
const generatorConfig: GeneratorConfig = {
|
| 221 |
dataset: config.dataset,
|
| 222 |
input: {
|
|
|
|
| 595 |
<span className="text-sm">Augmentation</span>
|
| 596 |
</div>
|
| 597 |
<span className={`text-xs font-bold px-2 py-1 rounded ${isWebGLAvailable()
|
| 598 |
+
? 'bg-green-500/20 text-green-400'
|
| 599 |
+
: 'bg-blue-500/20 text-blue-400'
|
| 600 |
}`}>
|
| 601 |
{isWebGLAvailable() ? '🎮 GPU' : '💻 CPU'}
|
| 602 |
</span>
|
|
|
|
| 627 |
</div>
|
| 628 |
</div>
|
| 629 |
</div>
|
| 630 |
+
|
| 631 |
+
{/* Storage Settings Panel */}
|
| 632 |
+
<div className="glass rounded-xl p-6">
|
| 633 |
+
<div className="flex items-center justify-between mb-4">
|
| 634 |
+
<h3 className="font-medium flex items-center gap-2">
|
| 635 |
+
<Database className="w-4 h-4 text-blue-400" />
|
| 636 |
+
Storage Settings
|
| 637 |
+
</h3>
|
| 638 |
+
<button
|
| 639 |
+
onClick={() => setShowStorageSettings(!showStorageSettings)}
|
| 640 |
+
className="text-xs text-muted-foreground hover:text-primary transition-colors flex items-center gap-1"
|
| 641 |
+
>
|
| 642 |
+
<Settings2 className="w-3 h-3" />
|
| 643 |
+
{showStorageSettings ? 'Hide' : 'Configure'}
|
| 644 |
+
</button>
|
| 645 |
+
</div>
|
| 646 |
+
|
| 647 |
+
{/* Current Mode Display */}
|
| 648 |
+
<div className="flex items-center justify-between p-2 rounded-lg bg-secondary/50 mb-3">
|
| 649 |
+
<div className="flex items-center gap-2">
|
| 650 |
+
{storageMode === 'memory' && <HardDrive className="w-4 h-4 text-yellow-400" />}
|
| 651 |
+
{storageMode === 'indexeddb' && <Database className="w-4 h-4 text-blue-400" />}
|
| 652 |
+
{storageMode === 'local' && <FolderOpen className="w-4 h-4 text-green-400" />}
|
| 653 |
+
{storageMode === 'huggingface' && <Cloud className="w-4 h-4 text-purple-400" />}
|
| 654 |
+
<span className="text-sm">Storage Mode</span>
|
| 655 |
+
</div>
|
| 656 |
+
<span className={`text-xs font-bold px-2 py-1 rounded ${storageMode === 'memory' ? 'bg-yellow-500/20 text-yellow-400' :
|
| 657 |
+
storageMode === 'indexeddb' ? 'bg-blue-500/20 text-blue-400' :
|
| 658 |
+
storageMode === 'local' ? 'bg-green-500/20 text-green-400' :
|
| 659 |
+
'bg-purple-500/20 text-purple-400'
|
| 660 |
+
}`}>
|
| 661 |
+
{storageMode === 'memory' ? '💾 Memory' :
|
| 662 |
+
storageMode === 'indexeddb' ? '🗄️ Browser' :
|
| 663 |
+
storageMode === 'local' ? '📁 Local' :
|
| 664 |
+
'🤗 HuggingFace'}
|
| 665 |
+
</span>
|
| 666 |
+
</div>
|
| 667 |
+
|
| 668 |
+
{/* Expanded Settings */}
|
| 669 |
+
{showStorageSettings && (
|
| 670 |
+
<div className="space-y-3 pt-2 border-t border-white/10">
|
| 671 |
+
<p className="text-xs text-muted-foreground">
|
| 672 |
+
Select where to store generated images:
|
| 673 |
+
</p>
|
| 674 |
+
|
| 675 |
+
{/* Storage Mode Buttons */}
|
| 676 |
+
<div className="grid grid-cols-2 gap-2">
|
| 677 |
+
<button
|
| 678 |
+
onClick={() => { setStorageMode('memory'); setSelectedFolder(null); }}
|
| 679 |
+
className={`p-2 rounded-lg text-xs flex flex-col items-center gap-1 transition-all ${storageMode === 'memory'
|
| 680 |
+
? 'bg-yellow-500/20 text-yellow-400 ring-1 ring-yellow-500/50'
|
| 681 |
+
: 'bg-secondary/50 hover:bg-secondary'
|
| 682 |
+
}`}
|
| 683 |
+
>
|
| 684 |
+
<HardDrive className="w-4 h-4" />
|
| 685 |
+
<span>Memory</span>
|
| 686 |
+
<span className="text-[10px] opacity-60">Fast, <2k samples</span>
|
| 687 |
+
</button>
|
| 688 |
+
|
| 689 |
+
<button
|
| 690 |
+
onClick={() => { setStorageMode('indexeddb'); setSelectedFolder(null); }}
|
| 691 |
+
className={`p-2 rounded-lg text-xs flex flex-col items-center gap-1 transition-all ${storageMode === 'indexeddb'
|
| 692 |
+
? 'bg-blue-500/20 text-blue-400 ring-1 ring-blue-500/50'
|
| 693 |
+
: 'bg-secondary/50 hover:bg-secondary'
|
| 694 |
+
}`}
|
| 695 |
+
>
|
| 696 |
+
<Database className="w-4 h-4" />
|
| 697 |
+
<span>Browser DB</span>
|
| 698 |
+
<span className="text-[10px] opacity-60">Persistent, resumable</span>
|
| 699 |
+
</button>
|
| 700 |
+
|
| 701 |
+
<button
|
| 702 |
+
onClick={async () => {
|
| 703 |
+
// Check if File System API is supported
|
| 704 |
+
if (!('showDirectoryPicker' in window)) {
|
| 705 |
+
setError('Folder picker not supported in this browser. Use Chrome, Edge, or Opera.');
|
| 706 |
+
setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] ❌ File System API not supported - try Chrome or Edge`]);
|
| 707 |
+
return;
|
| 708 |
+
}
|
| 709 |
+
|
| 710 |
+
setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] 📂 Opening folder picker...`]);
|
| 711 |
+
|
| 712 |
+
try {
|
| 713 |
+
// Directly use the File System Access API
|
| 714 |
+
// @ts-ignore - File System Access API
|
| 715 |
+
const directoryHandle = await window.showDirectoryPicker({
|
| 716 |
+
mode: 'readwrite',
|
| 717 |
+
startIn: 'documents'
|
| 718 |
+
});
|
| 719 |
+
|
| 720 |
+
// Store handle for generation - don't pre-create folders
|
| 721 |
+
directoryHandleRef.current = directoryHandle;
|
| 722 |
+
|
| 723 |
+
setStorageMode('local');
|
| 724 |
+
setSelectedFolder(directoryHandle.name);
|
| 725 |
+
setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] ✅ Output folder: ${directoryHandle.name} (direct-to-disk mode enabled)`]);
|
| 726 |
+
setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] 💡 Files will be written directly - no memory limits!`]);
|
| 727 |
+
|
| 728 |
+
} catch (err: any) {
|
| 729 |
+
if (err.name === 'AbortError') {
|
| 730 |
+
// User cancelled the picker
|
| 731 |
+
setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] ℹ️ Folder selection cancelled`]);
|
| 732 |
+
} else {
|
| 733 |
+
console.error('Folder selection failed:', err);
|
| 734 |
+
setError(`Failed to select folder: ${err.message}`);
|
| 735 |
+
setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] ❌ Error: ${err.message}`]);
|
| 736 |
+
}
|
| 737 |
+
}
|
| 738 |
+
}}
|
| 739 |
+
className={`p-2 rounded-lg text-xs flex flex-col items-center gap-1 transition-all ${storageMode === 'local'
|
| 740 |
+
? 'bg-green-500/20 text-green-400 ring-1 ring-green-500/50'
|
| 741 |
+
: 'bg-secondary/50 hover:bg-secondary'
|
| 742 |
+
}`}
|
| 743 |
+
>
|
| 744 |
+
<FolderOpen className="w-4 h-4" />
|
| 745 |
+
<span>Local Folder</span>
|
| 746 |
+
<span className="text-[10px] opacity-60">Direct disk access</span>
|
| 747 |
+
</button>
|
| 748 |
+
|
| 749 |
+
<button
|
| 750 |
+
onClick={() => { setStorageMode('huggingface'); setSelectedFolder(null); }}
|
| 751 |
+
className={`p-2 rounded-lg text-xs flex flex-col items-center gap-1 transition-all ${storageMode === 'huggingface'
|
| 752 |
+
? 'bg-purple-500/20 text-purple-400 ring-1 ring-purple-500/50'
|
| 753 |
+
: 'bg-secondary/50 hover:bg-secondary'
|
| 754 |
+
}`}
|
| 755 |
+
>
|
| 756 |
+
<Cloud className="w-4 h-4" />
|
| 757 |
+
<span>HuggingFace</span>
|
| 758 |
+
<span className="text-[10px] opacity-60">Cloud storage</span>
|
| 759 |
+
</button>
|
| 760 |
+
</div>
|
| 761 |
+
|
| 762 |
+
{/* Selected Folder Display */}
|
| 763 |
+
{storageMode === 'local' && selectedFolder && (
|
| 764 |
+
<div className="p-2 rounded-lg bg-green-500/10 text-xs">
|
| 765 |
+
<span className="text-green-400">📂 Output: </span>
|
| 766 |
+
<span className="text-muted-foreground">{selectedFolder}</span>
|
| 767 |
+
</div>
|
| 768 |
+
)}
|
| 769 |
+
|
| 770 |
+
{/* Output Path Display */}
|
| 771 |
+
{outputPath && (
|
| 772 |
+
<div className="p-2 rounded-lg bg-blue-500/10 text-xs">
|
| 773 |
+
<span className="text-blue-400">✅ Saved to: </span>
|
| 774 |
+
<span className="text-muted-foreground">{outputPath}</span>
|
| 775 |
+
</div>
|
| 776 |
+
)}
|
| 777 |
+
|
| 778 |
+
{/* Tip */}
|
| 779 |
+
<p className="text-[10px] text-muted-foreground">
|
| 780 |
+
💡 For large datasets (5k+), use Browser DB or Local Folder to avoid memory issues.
|
| 781 |
+
</p>
|
| 782 |
+
</div>
|
| 783 |
+
)}
|
| 784 |
+
</div>
|
| 785 |
</div>
|
| 786 |
|
| 787 |
{/* Progress & Logs */}
|
web/lib/cleanup.ts
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Auto-cleanup service for old datasets
|
| 2 |
+
// Runs on server startup and can be triggered via API
|
| 3 |
+
|
| 4 |
+
import { readdir, stat, rm } from 'fs/promises'
|
| 5 |
+
import { existsSync } from 'fs'
|
| 6 |
+
import path from 'path'
|
| 7 |
+
|
| 8 |
+
const OUTPUT_DIR = process.env.OUTPUT_DIR || './output/datasets'
|
| 9 |
+
const DATA_DIR = process.env.DATA_DIR || '/data/datasets'
|
| 10 |
+
const CLEANUP_HOURS = parseInt(process.env.CLEANUP_HOURS || '24')
|
| 11 |
+
|
| 12 |
+
/** Summary of one cleanup pass. */
export interface CleanupResult {
  /** Number of dataset directories that were removed. */
  cleaned: number
  /** Paths of the directories that were removed. */
  cleanedPaths: Array<string>
  /** Messages describing directories that could not be read or removed. */
  errors: Array<string>
}
|
| 17 |
+
|
| 18 |
+
/**
 * Clean up datasets older than the specified age.
 *
 * Every sub-directory of the selected base directories whose age exceeds `maxAgeHours`
 * is removed recursively. Failures are logged and collected in `errors`, and the pass
 * continues with the remaining directories.
 *
 * @param maxAgeHours - Age threshold in hours; defaults to `CLEANUP_HOURS`.
 * @param mode - Which base directory to scan: `'local'` (OUTPUT_DIR), `'huggingface'`
 *   (DATA_DIR) or `'both'`.
 * @returns Counts and paths of the removed datasets plus any error messages.
 */
export async function cleanupOldDatasets(
  maxAgeHours: number = CLEANUP_HOURS,
  mode: 'local' | 'huggingface' | 'both' = 'both'
): Promise<CleanupResult> {
  const result: CleanupResult = {
    cleaned: 0,
    cleanedPaths: [],
    errors: []
  }

  const dirsToClean: string[] = []
  if (mode === 'local' || mode === 'both') {
    dirsToClean.push(OUTPUT_DIR)
  }
  if (mode === 'huggingface' || mode === 'both') {
    dirsToClean.push(DATA_DIR)
  }

  for (const baseDir of dirsToClean) {
    // A base directory that was never created simply has nothing to clean.
    if (!existsSync(baseDir)) continue

    try {
      const entries = await readdir(baseDir, { withFileTypes: true })

      for (const entry of entries) {
        if (!entry.isDirectory()) continue

        const datasetPath = path.join(baseDir, entry.name)
        try {
          const stats = await stat(datasetPath)

          // On filesystems without creation times, birthtime can be the Unix epoch, which
          // would make a fresh dataset look decades old and get it deleted. Fall back to
          // mtime in that case.
          const bornMs = stats.birthtime.getTime()
          const createdMs = bornMs > 0 ? bornMs : stats.mtime.getTime()
          const ageHours = (Date.now() - createdMs) / (1000 * 60 * 60)

          if (ageHours > maxAgeHours) {
            await rm(datasetPath, { recursive: true, force: true })
            result.cleaned++
            result.cleanedPaths.push(datasetPath)
            console.log(`[Cleanup] Removed: ${entry.name} (${ageHours.toFixed(1)}h old)`)
          }
        } catch (err) {
          const errorMsg = `Failed to clean ${datasetPath}: ${err}`
          console.error(`[Cleanup] ${errorMsg}`)
          result.errors.push(errorMsg)
        }
      }
    } catch (err) {
      const errorMsg = `Failed to read ${baseDir}: ${err}`
      console.error(`[Cleanup] ${errorMsg}`)
      result.errors.push(errorMsg)
    }
  }

  console.log(`[Cleanup] Complete: ${result.cleaned} datasets removed`)
  return result
}
|
| 77 |
+
|
| 78 |
+
/**
 * Schedule cleanup to run periodically (call from server initialization).
 *
 * Runs one cleanup pass immediately, then repeats every `intervalHours` hours.
 *
 * @param intervalHours - Hours between passes. Values that are not finite or are
 *   non-positive fall back to 1 hour; values beyond the timer's maximum delay are
 *   clamped, because `setInterval` treats out-of-range delays as 1 ms.
 * @returns The interval handle, which callers can pass to `clearInterval`.
 */
export function scheduleCleanup(intervalHours: number = 1): NodeJS.Timeout {
  const MS_PER_HOUR = 60 * 60 * 1000
  // Largest delay a timer accepts (signed 32-bit milliseconds, roughly 24.8 days).
  const MAX_TIMER_DELAY_MS = 2 ** 31 - 1

  let effectiveHours = intervalHours
  if (!Number.isFinite(intervalHours) || intervalHours <= 0) {
    console.warn(`[Cleanup] Invalid interval "${intervalHours}", falling back to 1 hour`)
    effectiveHours = 1
  } else if (intervalHours * MS_PER_HOUR > MAX_TIMER_DELAY_MS) {
    effectiveHours = MAX_TIMER_DELAY_MS / MS_PER_HOUR
  }

  console.log(`[Cleanup] Scheduled every ${effectiveHours} hour(s), removing datasets older than ${CLEANUP_HOURS} hours`)

  // Run immediately on startup
  cleanupOldDatasets().catch(console.error)

  // Then run periodically
  return setInterval(() => {
    cleanupOldDatasets().catch(console.error)
  }, effectiveHours * MS_PER_HOUR)
}
|
| 92 |
+
|
| 93 |
+
/**
 * Get storage statistics.
 *
 * Counts the dataset directories directly under OUTPUT_DIR and DATA_DIR and sums the
 * sizes of the regular files they contain. `stat` on a directory reports only the size
 * of the directory entry itself, so the files are walked recursively instead.
 *
 * @returns The dataset count per location and the combined size in whole megabytes.
 */
export async function getStorageStats(): Promise<{
  localDatasets: number
  hfDatasets: number
  totalSizeMB: number
}> {
  // Total bytes of all regular files below `dir`. Entries that are neither files nor
  // directories (e.g. symlinks) are skipped.
  async function treeSize(dir: string): Promise<number> {
    const entries = await readdir(dir, { withFileTypes: true })
    let total = 0

    for (const entry of entries) {
      const entryPath = path.join(dir, entry.name)
      if (entry.isDirectory()) {
        total += await treeSize(entryPath)
      } else if (entry.isFile()) {
        total += (await stat(entryPath)).size
      }
    }

    return total
  }

  // Number of dataset directories directly under `dir` and their combined content size.
  async function countDir(dir: string): Promise<{ count: number; size: number }> {
    if (!existsSync(dir)) return { count: 0, size: 0 }

    const entries = await readdir(dir, { withFileTypes: true })
    let count = 0
    let size = 0

    for (const entry of entries) {
      if (entry.isDirectory()) {
        count++
        size += await treeSize(path.join(dir, entry.name))
      }
    }

    return { count, size }
  }

  const localStats = await countDir(OUTPUT_DIR)
  const hfStats = await countDir(DATA_DIR)

  return {
    localDatasets: localStats.count,
    hfDatasets: hfStats.count,
    totalSizeMB: Math.round((localStats.size + hfStats.size) / (1024 * 1024))
  }
}
|
web/lib/direct-generator.ts
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Direct-to-disk dataset generation
|
| 2 |
+
// Writes files directly to a user-selected folder without building ZIP in memory
|
| 3 |
+
|
| 4 |
+
/** Settings accepted by `generateDatasetDirect`. */
export interface DirectGenerationConfig {
  /** Dataset-level settings. */
  dataset: {
    /** Requested number of samples; generation never exceeds the number of supplied texts. */
    size: number
    /** Base seed; each sample derives its own generator from `seed + index * 1000`. */
    seed: number
  }
  /** Canvas and text appearance. */
  image: {
    /** Canvas width in pixels. */
    width: number
    /** Canvas height in pixels. */
    height: number
    /** Fill colour used when `backgroundStyle` does not name a known preset. */
    background: string
    /** Preset key looked up in `backgroundColors`; treated as `clean_white` when omitted. */
    backgroundStyle?: string
    /** NOTE(review): not read by `generateDatasetDirect` - confirm which generator consumes it. */
    backgroundMode?: 'single' | 'mix'
    /** NOTE(review): not read by `generateDatasetDirect` - presumably weights per background style. */
    backgroundPercentages?: Record<string, number>
    /** Text fill colour; black is used when empty. */
    textColor: string
    /** `'rtl'` right-aligns the text and anchors it at the right edge; anything else is left-aligned. */
    direction: string
  }
  /** Font selection. */
  fonts: {
    /**
     * Candidate fonts. `percentage` is each font's share out of 100; `dataUrl`, when present,
     * is loaded through `FontFace` under `family`.
     */
    distribution: Array<{ name: string; family: string; percentage: number; dataUrl?: string }>
  }
  /** Augmentation settings. */
  augmentation: {
    /** Master switch; when false no sample is marked as augmented. */
    enabled: boolean
    /** Chance (0-100) that a sample is marked as augmented. */
    applyPercentage: number
    /** NOTE(review): not read by `generateDatasetDirect` - confirm the accepted values. */
    preset: string
    /** NOTE(review): not read by `generateDatasetDirect`. */
    customMode?: boolean
    /** Per-augmentation strengths keyed by name (e.g. `rotation`, `gaussian_noise`) - TODO confirm where they are applied. */
    values?: Record<string, number>
  }
  /** Output selection. */
  output: {
    /** `jsonl`/`trocr` add `data.jsonl`; `huggingface`/`csv` add `metadata.csv`. */
    formats: string[]
  }
}
|
| 33 |
+
|
| 34 |
+
/** Summary returned by `generateDatasetDirect` once every file has been written. */
export interface DirectGenerationResult {
  /** Number of samples generated. */
  totalSamples: number
  /** Elapsed wall-clock time of the run, in seconds. */
  durationSeconds: number
  /** `<chosen folder name>/<dataset folder name>`; relative to the picked directory, not an absolute path. */
  outputPath: string
  /** Samples that were not selected for augmentation. */
  cleanSamples: number
  /** Samples that were selected for augmentation. */
  augmentedSamples: number
}
|
| 41 |
+
|
| 42 |
+
// Background colors
// Maps a background style key (config.image.backgroundStyle) to its solid fill colour.
const backgroundColors: Record<string, string> = {
  // Plain and lightly tinted papers
  clean_white: '#FFFFFF',
  aged_paper: '#F5E6D3',
  book_page: '#FAF0E6',
  newspaper: '#E8E8E8',
  notebook: '#FFFEF0',
  // Stronger tints
  parchment: '#F0E68C',
  weathered: '#D4C4A8',
  coffee_stain: '#E6D5C3',
  old_book: '#E8DCC4',
  recycled: '#D9D4C5',
  // Off-whites
  cream: '#FFFDD0',
  ivory: '#FFFFF0',
}
|
| 57 |
+
|
| 58 |
+
/**
 * Simple seeded random
 *
 * Returns a generator that yields the fractional part of `sin(state) * 10000`, feeding each
 * scaled value back in as the next state. The same seed always produces the same sequence.
 * Not suitable for anything security related.
 *
 * @param seed - Starting state. `0` and non-finite values are replaced by a fixed non-zero
 *   constant, because 0 is a fixed point of the recurrence (every draw would be 0) and
 *   NaN/Infinity would make every draw NaN.
 * @returns A function producing the next pseudo-random number on each call.
 */
function seededRandom(seed: number): () => number {
  let state = Number.isFinite(seed) && seed !== 0 ? seed : Math.E
  return () => {
    state = Math.sin(state) * 10000
    return state - Math.floor(state)
  }
}
|
| 66 |
+
|
| 67 |
+
// Canvas pool for performance
// Entries stay in the pool for the lifetime of the module; `inUse` marks the ones handed out.
const canvasPool: { canvas: HTMLCanvasElement; ctx: CanvasRenderingContext2D; inUse: boolean }[] = []

/**
 * Hands out a canvas of exactly `width` x `height` together with its 2D context.
 *
 * An idle pooled canvas of the same size is reused (cleared, identity transform). Otherwise a
 * new canvas is created; it joins the pool while the pool holds fewer than 4 entries, and is
 * a throw-away canvas beyond that. Pass the canvas to `releaseCanvas` when finished.
 *
 * @throws Error when the browser cannot provide a 2D rendering context.
 */
function getCanvas(width: number, height: number): { canvas: HTMLCanvasElement; ctx: CanvasRenderingContext2D } {
  const idle = canvasPool.find(
    (item) => !item.inUse && item.canvas.width === width && item.canvas.height === height
  )
  if (idle) {
    idle.inUse = true
    // Reset the transform BEFORE clearing: clearRect() is affected by the current transform,
    // so clearing first could leave parts of the previous image behind.
    idle.ctx.setTransform(1, 0, 0, 1, 0, 0)
    idle.ctx.clearRect(0, 0, width, height)
    return { canvas: idle.canvas, ctx: idle.ctx }
  }

  const canvas = document.createElement('canvas')
  canvas.width = width
  canvas.height = height
  const ctx = canvas.getContext('2d', { willReadFrequently: true })
  if (!ctx) {
    throw new Error('Failed to acquire 2D canvas context')
  }

  // Only the first 4 canvases are pooled; later ones are left to the garbage collector.
  if (canvasPool.length < 4) {
    canvasPool.push({ canvas, ctx, inUse: true })
  }
  return { canvas, ctx }
}
|
| 95 |
+
|
| 96 |
+
/**
 * Marks the pool entry that owns `canvas` as idle so `getCanvas` can hand it out again.
 * Canvases that are not in the pool are ignored.
 */
function releaseCanvas(canvas: HTMLCanvasElement) {
  const pooled = canvasPool.find((entry) => entry.canvas === canvas)
  if (pooled !== undefined) {
    pooled.inUse = false
  }
}
|
| 104 |
+
|
| 105 |
+
/**
 * Writes `data` to `filename` inside `directory`, creating or overwriting the file.
 * If writing fails the stream is aborted so that no partial content is committed,
 * and the original error is rethrown.
 */
async function writeFileToDirectory(
  directory: FileSystemDirectoryHandle,
  filename: string,
  data: Blob | string
): Promise<void> {
  const fileHandle = await directory.getFileHandle(filename, { create: true })
  const writable = await fileHandle.createWritable()
  try {
    await writable.write(data)
    await writable.close()
  } catch (err) {
    try {
      await writable.abort()
    } catch {
      // The stream is already errored or closed; nothing left to discard.
    }
    throw err
  }
}

/**
 * Generate dataset directly to a local folder
 * No ZIP, no image accumulation in memory - each image is written as soon as it is rendered
 *
 * Creates `dataset_<timestamp>/` inside `directoryHandle` containing `images/`, `labels.txt`
 * and `metadata.json`, plus `data.jsonl` (formats `jsonl`/`trocr`) and `metadata.csv`
 * (formats `huggingface`/`csv`) when requested.
 *
 * @param directoryHandle - Writable directory handle (File System Access API) that receives the dataset folder.
 * @param config - Generation settings.
 * @param textData - Texts to render; sample `i` renders `textData[i]`.
 * @param onProgress - Receives a 0-100 percentage and a human-readable status message.
 * @param abortSignal - When aborted, generation stops before the next sample is rendered.
 * @returns Sample counts, elapsed seconds and the output path relative to the chosen folder.
 * @throws Error with message `Generation aborted` when `abortSignal` is aborted; file-system
 *   and canvas-encoding failures are propagated unchanged.
 */
export async function generateDatasetDirect(
  directoryHandle: FileSystemDirectoryHandle,
  config: DirectGenerationConfig,
  textData: string[],
  onProgress: (progress: number, message: string) => void,
  abortSignal?: AbortSignal
): Promise<DirectGenerationResult> {
  const startTime = Date.now()

  // Create dataset folder, named after the current UTC timestamp
  const datasetName = `dataset_${new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19)}`
  const outputPath = `${directoryHandle.name}/${datasetName}`
  const datasetHandle = await directoryHandle.getDirectoryHandle(datasetName, { create: true })
  const imagesHandle = await datasetHandle.getDirectoryHandle('images', { create: true })

  onProgress(1, `Created folder: ${outputPath}`)

  // Never render more samples than there are texts; treat negative/NaN sizes as "nothing to do"
  // so the loop count and the reported totals always agree.
  const requestedSamples = Math.floor(config.dataset.size)
  const numSamples = Number.isNaN(requestedSamples)
    ? 0
    : Math.max(0, Math.min(requestedSamples, textData.length))
  const samples: { filename: string; text: string }[] = []
  let cleanCount = 0
  let augmentedCount = 0

  // Load fonts: maps each configured font name to the CSS family to render with
  const loadedFonts: Map<string, string> = new Map()
  if (config.fonts.distribution?.length) {
    for (const font of config.fonts.distribution) {
      if (font.dataUrl) {
        try {
          const fontFace = new FontFace(font.family, `url(${font.dataUrl})`)
          await fontFace.load()
          document.fonts.add(fontFace)
          loadedFonts.set(font.name, font.family)
        } catch (err) {
          // Best effort: a broken font must not stop the run, but leave a trace of the fallback.
          console.warn(`[DirectGenerator] Failed to load font "${font.name}", falling back to Arial`, err)
          loadedFonts.set(font.name, 'Arial')
        }
      } else {
        loadedFonts.set(font.name, font.family || 'Arial')
      }
    }
  }

  onProgress(3, `Loaded ${loadedFonts.size} fonts, starting generation...`)

  // NOTE(review): samples selected for augmentation are only counted below; this function
  // renders every image without applying any augmentation (config.augmentation.values is unused).

  // Generate one sample at a time, write immediately
  for (let i = 0; i < numSamples; i++) {
    if (abortSignal?.aborted) {
      throw new Error('Generation aborted')
    }

    const text = textData[i]
    // Per-sample generator so a sample's choices do not depend on the samples before it
    const random = seededRandom(config.dataset.seed + i * 1000)

    // Select font: first entry whose cumulative percentage exceeds a 0-100 roll
    let fontFamily = 'Arial'
    if (config.fonts.distribution?.length) {
      const roll = random() * 100
      let cumulative = 0
      for (const font of config.fonts.distribution) {
        cumulative += font.percentage
        if (roll < cumulative) {
          fontFamily = loadedFonts.get(font.name) || font.family || 'Arial'
          break
        }
      }
    }

    // Determine if should augment (the roll is only drawn when augmentation is enabled)
    const shouldAugment = config.augmentation.enabled &&
      (random() * 100) < config.augmentation.applyPercentage

    if (shouldAugment) {
      augmentedCount++
    } else {
      cleanCount++
    }

    // Render to canvas
    const { canvas, ctx } = getCanvas(config.image.width, config.image.height)

    try {
      // Background
      const bgStyle = config.image.backgroundStyle || 'clean_white'
      ctx.fillStyle = backgroundColors[bgStyle] || config.image.background || '#FFFFFF'
      ctx.fillRect(0, 0, canvas.width, canvas.height)

      // Text
      const isRtl = config.image.direction === 'rtl'
      const fontSize = Math.min(canvas.height * 0.6, 48)
      ctx.font = `${fontSize}px "${fontFamily}", Arial, sans-serif`
      ctx.fillStyle = config.image.textColor || '#000000'
      ctx.textAlign = isRtl ? 'right' : 'left'
      ctx.textBaseline = 'middle'
      ctx.direction = config.image.direction as CanvasDirection || 'ltr'

      // 10px margin from the edge the text starts at
      const x = isRtl ? canvas.width - 10 : 10
      ctx.fillText(text, x, canvas.height / 2)

      // Convert to blob
      const blob = await new Promise<Blob>((resolve, reject) => {
        canvas.toBlob((b) => {
          if (b) resolve(b)
          else reject(new Error('Failed to create blob'))
        }, 'image/png')
      })

      // Write immediately to disk
      const filename = `image_${String(i).padStart(6, '0')}.png`
      await writeFileToDirectory(imagesHandle, filename, blob)

      samples.push({ filename, text })
    } finally {
      // Always hand the canvas back, even when rendering or writing failed
      releaseCanvas(canvas)
    }

    // Progress update every 50 samples or at key milestones
    if (i % 50 === 0 || i === numSamples - 1) {
      const progress = Math.round((i / numSamples) * 90) + 5
      onProgress(progress, `Written ${i + 1}/${numSamples} images directly to disk`)
    }

    // Yield to prevent UI freeze
    if (i % 100 === 0) {
      await new Promise(resolve => setTimeout(resolve, 0))
    }
  }

  onProgress(95, 'Writing labels file...')

  // Write labels.txt: one "<filename>\t<text>" line per sample
  const labelsContent = samples.map(({ filename, text }) => `${filename}\t${text}`).join('\n')
  await writeFileToDirectory(datasetHandle, 'labels.txt', labelsContent)

  // Write other formats
  const formats = config.output.formats
  if (formats.includes('jsonl') || formats.includes('trocr')) {
    const jsonlContent = samples
      .map(({ filename, text }) => JSON.stringify({ image: `images/${filename}`, text }))
      .join('\n')
    await writeFileToDirectory(datasetHandle, 'data.jsonl', jsonlContent)
  }

  if (formats.includes('huggingface') || formats.includes('csv')) {
    // Both fields are quoted; embedded quotes are doubled as CSV requires
    const csvContent = 'file_name,text\n' + samples
      .map(({ filename, text }) => `"images/${filename}","${text.replace(/"/g, '""')}"`)
      .join('\n')
    await writeFileToDirectory(datasetHandle, 'metadata.csv', csvContent)
  }

  onProgress(98, 'Writing metadata...')

  // Write metadata
  const metadata = JSON.stringify({
    generated_at: new Date().toISOString(),
    total_samples: numSamples,
    clean_samples: cleanCount,
    augmented_samples: augmentedCount,
    image_size: `${config.image.width}x${config.image.height}`,
    output_path: outputPath
  }, null, 2)
  await writeFileToDirectory(datasetHandle, 'metadata.json', metadata)

  const endTime = Date.now()
  const durationSeconds = (endTime - startTime) / 1000

  onProgress(100, `Complete! Files saved to ${outputPath}`)

  return {
    totalSamples: numSamples,
    durationSeconds,
    outputPath,
    cleanSamples: cleanCount,
    augmentedSamples: augmentedCount
  }
}
|
| 301 |
+
|
| 302 |
+
/**
 * Check if File System Access API is available
 *
 * @returns `true` only when running in a window context that exposes `showDirectoryPicker`.
 *   Returns `false` (instead of throwing a ReferenceError) where `window` does not exist,
 *   e.g. during server-side rendering.
 */
export function isDirectGenerationSupported(): boolean {
  return typeof window !== 'undefined' && 'showDirectoryPicker' in window
}
|
web/lib/generator.ts
CHANGED
|
@@ -2,6 +2,64 @@ import JSZip from 'jszip'
|
|
| 2 |
import { saveAs } from 'file-saver'
|
| 3 |
import { getWorkerPool, isWorkerRenderingAvailable, WorkerTask, WorkerResult } from './worker-pool'
|
| 4 |
import { isWebGLAvailable, applyGPUAugmentation, GPUAugmentOptions } from './gpu-augmentation'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
export interface FontData {
|
| 7 |
name: string
|
|
@@ -279,139 +337,146 @@ async function renderTextToCanvas(
|
|
| 279 |
augValues: Record<string, number>,
|
| 280 |
random: () => number
|
| 281 |
): Promise<{ blob: Blob; augmentations: string[]; backgroundStyle: string }> {
|
| 282 |
-
|
| 283 |
-
canvas
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
try {
|
| 296 |
-
const img = await loadImage(bg.value)
|
| 297 |
-
ctx.drawImage(img, 0, 0, canvas.width, canvas.height)
|
| 298 |
-
} catch {
|
| 299 |
-
// Fallback to white if image fails
|
| 300 |
-
ctx.fillStyle = '#FFFFFF'
|
| 301 |
ctx.fillRect(0, 0, canvas.width, canvas.height)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
}
|
| 303 |
-
}
|
| 304 |
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
}
|
| 311 |
-
|
| 312 |
-
// Set text properties with the selected font
|
| 313 |
-
const fontSize = Math.min(canvas.height * 0.6, 48)
|
| 314 |
-
ctx.font = `${fontSize}px "${fontFamily}", Arial, sans-serif`
|
| 315 |
-
ctx.fillStyle = config.image.textColor
|
| 316 |
-
ctx.textAlign = config.image.direction === 'rtl' ? 'right' : 'left'
|
| 317 |
-
ctx.textBaseline = 'middle'
|
| 318 |
-
|
| 319 |
-
// Draw text
|
| 320 |
-
const x = config.image.direction === 'rtl'
|
| 321 |
-
? canvas.width - 10
|
| 322 |
-
: 10
|
| 323 |
-
const y = canvas.height / 2
|
| 324 |
-
|
| 325 |
-
ctx.direction = config.image.direction as CanvasDirection
|
| 326 |
-
ctx.fillText(text, x, y)
|
| 327 |
-
|
| 328 |
-
if (shouldAugment && config.augmentation.enabled) {
|
| 329 |
-
ctx.restore()
|
| 330 |
-
}
|
| 331 |
-
|
| 332 |
-
// Apply post-processing augmentations (GPU-accelerated when available)
|
| 333 |
-
if (shouldAugment && config.augmentation.enabled) {
|
| 334 |
-
const applyBrightness = augValues.brightness && random() > 0.5
|
| 335 |
-
const applyNoise = augValues.gaussian_noise && random() > 0.6
|
| 336 |
-
|
| 337 |
-
if (applyBrightness || applyNoise) {
|
| 338 |
-
// Try GPU-accelerated augmentation first
|
| 339 |
-
const useGPU = isWebGLAvailable()
|
| 340 |
-
|
| 341 |
-
if (useGPU) {
|
| 342 |
-
// GPU path - apply all augmentations in a single GPU pass
|
| 343 |
-
const gpuOptions: GPUAugmentOptions = {
|
| 344 |
-
brightness: applyBrightness ? (random() - 0.5) * augValues.brightness / 50 : 0,
|
| 345 |
-
contrast: 1, // Could add contrast augmentation here
|
| 346 |
-
noiseAmount: applyNoise ? augValues.gaussian_noise / 200 : 0,
|
| 347 |
-
seed: random() * 1000
|
| 348 |
-
}
|
| 349 |
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
const srcRow = (canvas.height - 1 - y) * rowSize
|
| 363 |
-
const dstRow = y * rowSize
|
| 364 |
-
imageData.data.set(pixels.subarray(srcRow, srcRow + rowSize), dstRow)
|
| 365 |
-
}
|
| 366 |
-
ctx.putImageData(imageData, 0, 0)
|
| 367 |
|
| 368 |
-
|
| 369 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
}
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
}
|
| 382 |
-
ctx.putImageData(imageData, 0, 0)
|
| 383 |
-
appliedAugmentations.push('brightness')
|
| 384 |
-
}
|
| 385 |
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
|
|
|
|
|
|
|
|
|
| 394 |
}
|
| 395 |
-
ctx.putImageData(imageData, 0, 0)
|
| 396 |
-
appliedAugmentations.push('noise')
|
| 397 |
}
|
| 398 |
}
|
| 399 |
}
|
| 400 |
-
}
|
| 401 |
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
}
|
| 416 |
|
| 417 |
// Render a single sample (for parallel processing)
|
|
@@ -866,18 +931,49 @@ export async function generateDataset(
|
|
| 866 |
// Generate zip blob with dynamic compression based on dataset size
|
| 867 |
onProgress(95, 'Compressing dataset...')
|
| 868 |
|
| 869 |
-
// Use lower compression for large datasets (faster, less memory)
|
| 870 |
-
|
| 871 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 872 |
|
| 873 |
-
console.log(`Compressing ${labels.length} files with level ${compressionLevel}`)
|
| 874 |
|
| 875 |
-
|
| 876 |
-
|
| 877 |
-
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 881 |
|
| 882 |
const endTime = Date.now()
|
| 883 |
const durationSeconds = (endTime - startTime) / 1000
|
|
|
|
| 2 |
import { saveAs } from 'file-saver'
|
| 3 |
import { getWorkerPool, isWorkerRenderingAvailable, WorkerTask, WorkerResult } from './worker-pool'
|
| 4 |
import { isWebGLAvailable, applyGPUAugmentation, GPUAugmentOptions } from './gpu-augmentation'
|
| 5 |
+
import { StorageManager, StorageMode, getStorageManager, StoredSample } from './storage-manager'
|
| 6 |
+
|
| 7 |
+
// ============================================
|
| 8 |
+
// Canvas Pool for Performance
|
| 9 |
+
// ============================================
|
| 10 |
+
/** A reusable canvas together with its bookkeeping data. */
interface PooledCanvas {
  canvas: HTMLCanvasElement
  ctx: CanvasRenderingContext2D
  /** True while the canvas is handed out and must not be given to another caller. */
  inUse: boolean
  /** Dimensions the canvas was created with; used to match requests. */
  width: number
  height: number
}

const canvasPool: PooledCanvas[] = []
const MAX_POOL_SIZE = 8

/**
 * Hands out a canvas of exactly `width` x `height` together with its 2D context.
 *
 * An idle pooled canvas with matching dimensions is reused (cleared, identity transform).
 * Otherwise a new canvas is created; it is added to the pool while the pool has fewer than
 * MAX_POOL_SIZE entries and is a temporary canvas beyond that. Pass the canvas to
 * `releaseCanvas` when finished.
 *
 * @throws Error when the browser cannot provide a 2D rendering context.
 */
function acquireCanvas(width: number, height: number): { canvas: HTMLCanvasElement; ctx: CanvasRenderingContext2D } {
  // Find available canvas with matching dimensions
  const idle = canvasPool.find((item) => !item.inUse && item.width === width && item.height === height)
  if (idle) {
    idle.inUse = true
    // Reset the transform BEFORE clearing: clearRect() is affected by the current transform,
    // so a transform left behind by an interrupted render would make the clear miss pixels.
    idle.ctx.setTransform(1, 0, 0, 1, 0, 0)
    idle.ctx.clearRect(0, 0, width, height)
    return { canvas: idle.canvas, ctx: idle.ctx }
  }

  const canvas = document.createElement('canvas')
  canvas.width = width
  canvas.height = height
  const ctx = canvas.getContext('2d', { willReadFrequently: true })
  if (!ctx) {
    throw new Error('Failed to acquire 2D canvas context')
  }

  // Pool the new canvas if there is room; otherwise it is temporary (will be GC'd)
  if (canvasPool.length < MAX_POOL_SIZE) {
    canvasPool.push({ canvas, ctx, inUse: true, width, height })
  }
  return { canvas, ctx }
}
|
| 50 |
+
|
| 51 |
+
/**
 * Returns `canvas` to the pool by flagging its entry as idle.
 * Canvases that were never pooled (temporary ones) are ignored.
 */
function releaseCanvas(canvas: HTMLCanvasElement): void {
  const owner = canvasPool.find((pooled) => pooled.canvas === canvas)
  if (owner !== undefined) {
    owner.inUse = false
  }
}
|
| 59 |
+
|
| 60 |
+
/**
 * Empties the canvas pool in place, dropping every entry whether or not it is in use.
 * A canvas still held by a caller simply becomes unpooled.
 */
export function clearCanvasPool(): void {
  canvasPool.splice(0, canvasPool.length)
}
|
| 63 |
|
| 64 |
export interface FontData {
|
| 65 |
name: string
|
|
|
|
| 337 |
augValues: Record<string, number>,
|
| 338 |
random: () => number
|
| 339 |
): Promise<{ blob: Blob; augmentations: string[]; backgroundStyle: string }> {
|
| 340 |
+
// Use canvas pool for better performance
|
| 341 |
+
const { canvas, ctx } = acquireCanvas(config.image.width, config.image.height)
|
| 342 |
+
|
| 343 |
+
// Declare variables outside try for proper scoping
|
| 344 |
+
let appliedAugmentations: string[] = []
|
| 345 |
+
let bg: ReturnType<typeof selectBackground>
|
| 346 |
+
|
| 347 |
+
try {
|
| 348 |
+
// Select and apply background
|
| 349 |
+
bg = selectBackground(config, random)
|
| 350 |
+
|
| 351 |
+
if (bg.type === 'color') {
|
| 352 |
+
ctx.fillStyle = bg.value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
ctx.fillRect(0, 0, canvas.width, canvas.height)
|
| 354 |
+
} else {
|
| 355 |
+
// Draw custom background image
|
| 356 |
+
try {
|
| 357 |
+
const img = await loadImage(bg.value)
|
| 358 |
+
ctx.drawImage(img, 0, 0, canvas.width, canvas.height)
|
| 359 |
+
} catch {
|
| 360 |
+
// Fallback to white if image fails
|
| 361 |
+
ctx.fillStyle = '#FFFFFF'
|
| 362 |
+
ctx.fillRect(0, 0, canvas.width, canvas.height)
|
| 363 |
+
}
|
| 364 |
}
|
|
|
|
| 365 |
|
| 366 |
+
// Apply augmentation transforms if enabled
|
| 367 |
+
if (shouldAugment && config.augmentation.enabled) {
|
| 368 |
+
ctx.save()
|
| 369 |
+
appliedAugmentations = applyAugmentation(ctx, canvas, augValues, random)
|
| 370 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
|
| 372 |
+
// Set text properties with the selected font
|
| 373 |
+
const fontSize = Math.min(canvas.height * 0.6, 48)
|
| 374 |
+
ctx.font = `${fontSize}px "${fontFamily}", Arial, sans-serif`
|
| 375 |
+
ctx.fillStyle = config.image.textColor
|
| 376 |
+
ctx.textAlign = config.image.direction === 'rtl' ? 'right' : 'left'
|
| 377 |
+
ctx.textBaseline = 'middle'
|
| 378 |
+
|
| 379 |
+
// Draw text
|
| 380 |
+
const x = config.image.direction === 'rtl'
|
| 381 |
+
? canvas.width - 10
|
| 382 |
+
: 10
|
| 383 |
+
const y = canvas.height / 2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
|
| 385 |
+
ctx.direction = config.image.direction as CanvasDirection
|
| 386 |
+
ctx.fillText(text, x, y)
|
| 387 |
+
|
| 388 |
+
if (shouldAugment && config.augmentation.enabled) {
|
| 389 |
+
ctx.restore()
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
// Apply post-processing augmentations (GPU-accelerated when available)
|
| 393 |
+
if (shouldAugment && config.augmentation.enabled) {
|
| 394 |
+
const applyBrightness = augValues.brightness && random() > 0.5
|
| 395 |
+
const applyNoise = augValues.gaussian_noise && random() > 0.6
|
| 396 |
+
|
| 397 |
+
if (applyBrightness || applyNoise) {
|
| 398 |
+
// Try GPU-accelerated augmentation first
|
| 399 |
+
const useGPU = isWebGLAvailable()
|
| 400 |
+
|
| 401 |
+
if (useGPU) {
|
| 402 |
+
// GPU path - apply all augmentations in a single GPU pass
|
| 403 |
+
const gpuOptions: GPUAugmentOptions = {
|
| 404 |
+
brightness: applyBrightness ? (random() - 0.5) * augValues.brightness / 50 : 0,
|
| 405 |
+
contrast: 1, // Could add contrast augmentation here
|
| 406 |
+
noiseAmount: applyNoise ? augValues.gaussian_noise / 200 : 0,
|
| 407 |
+
seed: random() * 1000
|
| 408 |
}
|
| 409 |
+
|
| 410 |
+
const gpuResult = applyGPUAugmentation(canvas, gpuOptions)
|
| 411 |
+
if (gpuResult && gpuResult !== canvas) {
|
| 412 |
+
// Copy GPU result back to main canvas
|
| 413 |
+
const gpuCtx = gpuResult.getContext('webgl')
|
| 414 |
+
if (gpuCtx) {
|
| 415 |
+
const pixels = new Uint8Array(canvas.width * canvas.height * 4)
|
| 416 |
+
gpuCtx.readPixels(0, 0, canvas.width, canvas.height, gpuCtx.RGBA, gpuCtx.UNSIGNED_BYTE, pixels)
|
| 417 |
+
|
| 418 |
+
// Flip Y and apply to main canvas
|
| 419 |
+
const imageData = ctx.createImageData(canvas.width, canvas.height)
|
| 420 |
+
const rowSize = canvas.width * 4
|
| 421 |
+
for (let y = 0; y < canvas.height; y++) {
|
| 422 |
+
const srcRow = (canvas.height - 1 - y) * rowSize
|
| 423 |
+
const dstRow = y * rowSize
|
| 424 |
+
imageData.data.set(pixels.subarray(srcRow, srcRow + rowSize), dstRow)
|
| 425 |
+
}
|
| 426 |
+
ctx.putImageData(imageData, 0, 0)
|
| 427 |
+
|
| 428 |
+
if (applyBrightness) appliedAugmentations.push('brightness_gpu')
|
| 429 |
+
if (applyNoise) appliedAugmentations.push('noise_gpu')
|
| 430 |
+
}
|
| 431 |
+
}
|
| 432 |
+
} else {
|
| 433 |
+
// CPU fallback path
|
| 434 |
+
if (applyBrightness) {
|
| 435 |
+
const adjustment = 1 + (random() - 0.5) * augValues.brightness / 50
|
| 436 |
+
const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height)
|
| 437 |
+
for (let i = 0; i < imageData.data.length; i += 4) {
|
| 438 |
+
imageData.data[i] = Math.min(255, imageData.data[i] * adjustment)
|
| 439 |
+
imageData.data[i + 1] = Math.min(255, imageData.data[i + 1] * adjustment)
|
| 440 |
+
imageData.data[i + 2] = Math.min(255, imageData.data[i + 2] * adjustment)
|
| 441 |
+
}
|
| 442 |
+
ctx.putImageData(imageData, 0, 0)
|
| 443 |
+
appliedAugmentations.push('brightness')
|
| 444 |
}
|
|
|
|
|
|
|
|
|
|
| 445 |
|
| 446 |
+
if (applyNoise) {
|
| 447 |
+
const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height)
|
| 448 |
+
const noiseLevel = augValues.gaussian_noise / 2
|
| 449 |
+
for (let i = 0; i < imageData.data.length; i += 4) {
|
| 450 |
+
const noise = (random() - 0.5) * noiseLevel
|
| 451 |
+
imageData.data[i] = Math.max(0, Math.min(255, imageData.data[i] + noise))
|
| 452 |
+
imageData.data[i + 1] = Math.max(0, Math.min(255, imageData.data[i + 1] + noise))
|
| 453 |
+
imageData.data[i + 2] = Math.max(0, Math.min(255, imageData.data[i + 2] + noise))
|
| 454 |
+
}
|
| 455 |
+
ctx.putImageData(imageData, 0, 0)
|
| 456 |
+
appliedAugmentations.push('noise')
|
| 457 |
}
|
|
|
|
|
|
|
| 458 |
}
|
| 459 |
}
|
| 460 |
}
|
|
|
|
| 461 |
|
| 462 |
+
// Convert to blob and release canvas when done
|
| 463 |
+
return new Promise((resolve, reject) => {
|
| 464 |
+
canvas.toBlob(
|
| 465 |
+
(blob) => {
|
| 466 |
+
releaseCanvas(canvas) // Return canvas to pool
|
| 467 |
+
if (blob) {
|
| 468 |
+
resolve({ blob, augmentations: appliedAugmentations, backgroundStyle: bg.styleName })
|
| 469 |
+
} else {
|
| 470 |
+
reject(new Error('Failed to convert canvas to blob'))
|
| 471 |
+
}
|
| 472 |
+
},
|
| 473 |
+
'image/png'
|
| 474 |
+
)
|
| 475 |
+
})
|
| 476 |
+
} catch (err) {
|
| 477 |
+
releaseCanvas(canvas) // Ensure canvas is released on error
|
| 478 |
+
throw err
|
| 479 |
+
}
|
| 480 |
}
|
| 481 |
|
| 482 |
// Render a single sample (for parallel processing)
|
|
|
|
| 931 |
// Generate zip blob with dynamic compression based on dataset size
|
| 932 |
onProgress(95, 'Compressing dataset...')
|
| 933 |
|
| 934 |
+
// Use lower/no compression for large datasets (faster, less memory)
|
| 935 |
+
// STORE = no compression, uses less memory
|
| 936 |
+
let compressionType: 'DEFLATE' | 'STORE' = 'DEFLATE'
|
| 937 |
+
let compressionLevel = 6
|
| 938 |
+
|
| 939 |
+
if (labels.length > 20000) {
|
| 940 |
+
// Very large: no compression (STORE) to prevent memory issues
|
| 941 |
+
compressionType = 'STORE'
|
| 942 |
+
compressionLevel = 0
|
| 943 |
+
console.log(`Large dataset (${labels.length}): Using STORE (no compression) to prevent memory issues`)
|
| 944 |
+
} else if (labels.length > 10000) {
|
| 945 |
+
compressionLevel = 1 // Minimal compression
|
| 946 |
+
console.log(`Medium-large dataset (${labels.length}): Using minimal compression level 1`)
|
| 947 |
+
} else if (labels.length > 5000) {
|
| 948 |
+
compressionLevel = 3
|
| 949 |
+
console.log(`Medium dataset (${labels.length}): Using compression level 3`)
|
| 950 |
+
}
|
| 951 |
|
| 952 |
+
console.log(`Compressing ${labels.length} files with ${compressionType}, level ${compressionLevel}`)
|
| 953 |
|
| 954 |
+
let zipBlob: Blob
|
| 955 |
+
try {
|
| 956 |
+
zipBlob = await zip.generateAsync({
|
| 957 |
+
type: 'blob',
|
| 958 |
+
compression: compressionType,
|
| 959 |
+
compressionOptions: compressionType === 'DEFLATE' ? { level: compressionLevel } : undefined,
|
| 960 |
+
streamFiles: true // Stream files to reduce memory usage
|
| 961 |
+
})
|
| 962 |
+
} catch (memoryError: any) {
|
| 963 |
+
console.error('ZIP generation failed, trying without compression:', memoryError)
|
| 964 |
+
onProgress(96, 'Retrying with no compression...')
|
| 965 |
+
|
| 966 |
+
// Retry with no compression at all
|
| 967 |
+
try {
|
| 968 |
+
zipBlob = await zip.generateAsync({
|
| 969 |
+
type: 'blob',
|
| 970 |
+
compression: 'STORE',
|
| 971 |
+
streamFiles: true
|
| 972 |
+
})
|
| 973 |
+
} catch (finalError) {
|
| 974 |
+
throw new Error(`Failed to create ZIP: Memory limit exceeded. Try using Local Folder storage mode instead of Memory for large datasets.`)
|
| 975 |
+
}
|
| 976 |
+
}
|
| 977 |
|
| 978 |
const endTime = Date.now()
|
| 979 |
const durationSeconds = (endTime - startTime) / 1000
|
web/lib/storage-manager.ts
ADDED
|
@@ -0,0 +1,468 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// Storage Manager for OCR Dataset Generator
|
| 2 |
+
// Supports multiple storage backends: Memory, IndexedDB, Local Folder, HuggingFace
|
| 3 |
+
|
| 4 |
+
/** Identifier of the backend a StorageManager writes samples to. */
export type StorageMode = 'memory' | 'indexeddb' | 'local' | 'huggingface'
|
| 5 |
+
|
| 6 |
+
/**
 * Options handed to StorageManager.
 *
 * Only `mode` is read by the code in this file; the optional fields are not
 * referenced here - presumably they are consumed by callers or by the server
 * route (TODO confirm).
 */
export interface StorageConfig {
|
| 7 |
+
// Selects which backend StorageManager instantiates and dispatches to
mode: StorageMode
|
| 8 |
+
localPath?: string // For 'local' mode - user selected folder
|
| 9 |
+
dataDir?: string // For 'huggingface' mode - /data directory
|
| 10 |
+
// NOTE(review): units look like hours (by name); not used in this file - confirm
cleanupHours?: number
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
/** One generated sample as passed to StorageManager.storeSample(). */
export interface StoredSample {
|
| 14 |
+
// Primary key of the IndexedDB object store (created with keyPath 'id')
id: string
|
| 15 |
+
// File name used when the blob is written to a folder or uploaded
filename: string
|
| 16 |
+
// Binary payload written/uploaded as the file content
blob: Blob
|
| 17 |
+
// Text associated with the sample - presumably the OCR ground truth (confirm)
label: string
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
// IndexedDB database name, object store name and schema version used by IndexedDBStorage
|
| 21 |
+
// Name passed to indexedDB.open()
const DB_NAME = 'ocr_dataset_generator'
|
| 22 |
+
// Object store that holds StoredSample records
const STORE_NAME = 'samples'
|
| 23 |
+
// Schema version passed to indexedDB.open(); the store is created in onupgradeneeded
const DB_VERSION = 1
|
| 24 |
+
|
| 25 |
+
// ============================================
|
| 26 |
+
// IndexedDB Storage Implementation
|
| 27 |
+
// ============================================
|
| 28 |
+
|
| 29 |
+
/**
 * Promise-based wrapper around the single IndexedDB object store that holds
 * generated samples, keyed by their `id`.
 *
 * The connection is opened lazily on first use and shared by every call.
 */
class IndexedDBStorage {
  private db: IDBDatabase | null = null
  private initPromise: Promise<void> | null = null

  /**
   * Opens the database, creating the object store on first run.
   * Concurrent callers share one pending open request.
   *
   * @throws when IndexedDB is unavailable or the open request fails. A failed
   *   attempt is not cached, so a later call tries again.
   */
  async init(): Promise<void> {
    if (this.db) return
    if (this.initPromise) return this.initPromise

    const opening = new Promise<void>((resolve, reject) => {
      // Guard for environments without IndexedDB (e.g. server-side rendering).
      if (typeof indexedDB === 'undefined') {
        reject(new Error('IndexedDB is not available in this environment'))
        return
      }

      const request = indexedDB.open(DB_NAME, DB_VERSION)

      request.onerror = () => reject(request.error)
      request.onsuccess = () => {
        this.db = request.result
        resolve()
      }

      request.onupgradeneeded = () => {
        const db = request.result
        if (!db.objectStoreNames.contains(STORE_NAME)) {
          db.createObjectStore(STORE_NAME, { keyPath: 'id' })
        }
      }
    })

    this.initPromise = opening
    // Forget a failed attempt so the next init() can retry instead of
    // replaying the same rejection forever.
    opening.catch(() => {
      if (this.initPromise === opening) this.initPromise = null
    })

    return opening
  }

  /**
   * Inserts or replaces a sample (matched by `id`).
   *
   * @throws when the write or its transaction fails (e.g. quota exceeded).
   */
  async store(sample: StoredSample): Promise<void> {
    await this.withStore('readwrite', (store) => store.put(sample))
  }

  /** Returns every stored sample. */
  async getAll(): Promise<StoredSample[]> {
    return this.withStore<StoredSample[]>('readonly', (store) => store.getAll())
  }

  /** Returns the number of stored samples. */
  async count(): Promise<number> {
    return this.withStore('readonly', (store) => store.count())
  }

  /** Removes every stored sample. */
  async clear(): Promise<void> {
    await this.withStore('readwrite', (store) => store.clear())
  }

  /** Closes the connection; the next operation reopens it. */
  close(): void {
    if (this.db) {
      this.db.close()
      this.db = null
      this.initPromise = null
    }
  }

  /**
   * Runs one request against the sample store inside its own transaction.
   *
   * The promise settles on the transaction outcome, not on the request's
   * success event: a request can succeed and the transaction still abort
   * afterwards, in which case nothing was persisted.
   */
  private async withStore<T>(
    mode: IDBTransactionMode,
    operation: (store: IDBObjectStore) => IDBRequest<T>
  ): Promise<T> {
    await this.init()

    const db = this.db
    if (!db) {
      throw new Error('IndexedDB connection was closed before the operation could start')
    }

    return new Promise<T>((resolve, reject) => {
      const transaction = db.transaction([STORE_NAME], mode)
      const request = operation(transaction.objectStore(STORE_NAME))

      const fail = () =>
        reject(request.error ?? transaction.error ?? new Error('IndexedDB transaction aborted'))

      transaction.oncomplete = () => resolve(request.result)
      transaction.onerror = fail
      transaction.onabort = fail
    })
  }
}
|
| 109 |
+
|
| 110 |
+
// ============================================
|
| 111 |
+
// File System Access API (Local Folder)
|
| 112 |
+
// ============================================
|
| 113 |
+
|
| 114 |
+
/**
 * View of `window` that includes the File System Access API picker, which
 * TypeScript's DOM typings may not declare.
 */
type DirectoryPickerWindow = Window & {
  showDirectoryPicker(options?: {
    mode?: 'read' | 'readwrite'
    startIn?: string
  }): Promise<FileSystemDirectoryHandle>
}

/**
 * Writes a dataset straight into a user-selected folder using the browser's
 * File System Access API.
 *
 * Layout created inside the chosen folder:
 *   dataset_<timestamp>/images/<image files>
 *   dataset_<timestamp>/labels.txt (or a caller-supplied name)
 *   dataset_<timestamp>/metadata.json
 */
class FileSystemStorage {
  private directoryHandle: FileSystemDirectoryHandle | null = null
  private datasetHandle: FileSystemDirectoryHandle | null = null
  private imagesHandle: FileSystemDirectoryHandle | null = null
  private datasetName: string = ''

  /**
   * Asks the user for a folder and creates a timestamped dataset folder with
   * an `images` sub-folder inside it.
   *
   * @returns `<picked folder>/<dataset folder>`, or `null` when the user
   *   cancels or anything fails (the error is logged). On failure the
   *   previously selected folder, if any, stays active.
   */
  async selectFolder(): Promise<string | null> {
    try {
      const root = await (window as DirectoryPickerWindow).showDirectoryPicker({
        mode: 'readwrite',
        startIn: 'documents'
      })

      // Create timestamped dataset folder, e.g. dataset_2024-05-01T12-34-56
      const datasetName = `dataset_${new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19)}`
      const dataset = await root.getDirectoryHandle(datasetName, { create: true })
      const images = await dataset.getDirectoryHandle('images', { create: true })

      // Commit only once every handle exists, so a failure above cannot leave
      // images and labels pointing at different folders.
      this.directoryHandle = root
      this.datasetHandle = dataset
      this.imagesHandle = images
      this.datasetName = datasetName

      return `${root.name}/${datasetName}`
    } catch (err) {
      console.error('Failed to select folder:', err)
      return null
    }
  }

  /**
   * Writes one image into the dataset's `images` folder.
   *
   * @throws if no folder has been selected or the write fails.
   */
  async writeImage(filename: string, blob: Blob): Promise<void> {
    if (!this.imagesHandle) throw new Error('No folder selected')
    await this.writeFile(this.imagesHandle, filename, blob)
  }

  /**
   * Writes the label file into the dataset folder.
   *
   * @throws if no folder has been selected or the write fails.
   */
  async writeLabels(content: string, filename: string = 'labels.txt'): Promise<void> {
    if (!this.datasetHandle || !this.datasetName) throw new Error('No folder selected')
    await this.writeFile(this.datasetHandle, filename, content)
  }

  /**
   * Writes `metadata.json` (pretty-printed) into the dataset folder.
   *
   * @throws if no folder has been selected or the write fails.
   */
  async writeMetadata(metadata: object): Promise<void> {
    if (!this.datasetHandle || !this.datasetName) throw new Error('No folder selected')
    await this.writeFile(this.datasetHandle, 'metadata.json', JSON.stringify(metadata, null, 2))
  }

  /** Returns `<picked folder>/<dataset folder>`, or `null` before a folder is selected. */
  getPath(): string | null {
    return this.directoryHandle ? `${this.directoryHandle.name}/${this.datasetName}` : null
  }

  /** Whether the File System Access API picker exists; `false` outside a browser. */
  isSupported(): boolean {
    return typeof window !== 'undefined' && 'showDirectoryPicker' in window
  }

  /** Creates or overwrites `filename` in `directory` with `data`. */
  private async writeFile(
    directory: FileSystemDirectoryHandle,
    filename: string,
    data: Blob | string
  ): Promise<void> {
    const fileHandle = await directory.getFileHandle(filename, { create: true })
    const writable = await fileHandle.createWritable()
    try {
      await writable.write(data)
    } catch (err) {
      // Best effort: discard the partial write so the stream is not left
      // open; the original write error is the one worth reporting.
      await writable.abort().catch(() => undefined)
      throw err
    }
    await writable.close()
  }
}
|
| 174 |
+
|
| 175 |
+
// ============================================
|
| 176 |
+
// Server Storage (Local/HuggingFace via API)
|
| 177 |
+
// ============================================
|
| 178 |
+
|
| 179 |
+
/**
 * Client for the `/api/storage` route: uploads images, labels and metadata
 * for one dataset session and asks the server to finalize it.
 *
 * Every request carries the session id generated in the constructor, so one
 * instance corresponds to exactly one server-side dataset.
 */
class ServerStorage {
  private sessionId: string = ''
  private mode: 'local' | 'huggingface' = 'local'

  /** @param mode value sent to the server as the `mode` field of every request. */
  constructor(mode: 'local' | 'huggingface' = 'local') {
    this.mode = mode
    this.sessionId = `dataset_${Date.now()}_${Math.random().toString(36).slice(2, 8)}`
  }

  /**
   * Uploads one image as multipart form data.
   *
   * @throws if the server answers with a non-2xx status.
   */
  async writeImage(filename: string, blob: Blob): Promise<void> {
    const formData = new FormData()
    formData.append('file', blob, filename)
    formData.append('sessionId', this.sessionId)
    formData.append('mode', this.mode)
    formData.append('type', 'image')

    // No explicit Content-Type: fetch derives the multipart boundary itself.
    await this.send('write image', { method: 'POST', body: formData })
  }

  /**
   * Uploads the label lines, joined with newlines.
   *
   * @throws if the server answers with a non-2xx status.
   */
  async writeLabels(labels: string[]): Promise<void> {
    await this.sendJson('write labels', 'labels', { content: labels.join('\n') })
  }

  /**
   * Uploads the metadata as pretty-printed JSON text.
   *
   * @throws if the server answers with a non-2xx status.
   */
  async writeMetadata(metadata: object): Promise<void> {
    await this.sendJson('write metadata', 'metadata', {
      content: JSON.stringify(metadata, null, 2)
    })
  }

  /**
   * Tells the server the session is complete.
   *
   * @returns the server's JSON reply, passed through without validation.
   * @throws if the server answers with a non-2xx status.
   */
  async finalize(): Promise<{ path: string; downloadUrl?: string }> {
    const response = await this.sendJson('finalize', 'finalize')
    return response.json()
  }

  /** Returns the session id sent with every request of this instance. */
  getSessionId(): string {
    return this.sessionId
  }

  /** POSTs a JSON body of the form `{ sessionId, mode, type, ...extra }`. */
  private sendJson(
    action: string,
    type: string,
    extra: Record<string, unknown> = {}
  ): Promise<Response> {
    return this.send(action, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        sessionId: this.sessionId,
        mode: this.mode,
        type,
        ...extra
      })
    })
  }

  /** Performs the request and turns a non-2xx reply into an Error naming `action`. */
  private async send(action: string, init: RequestInit): Promise<Response> {
    const response = await fetch('/api/storage', init)

    if (!response.ok) {
      // statusText is empty over HTTP/2, so always include the numeric status.
      const detail = `${response.status} ${response.statusText}`.trim()
      throw new Error(`Failed to ${action}: ${detail}`)
    }

    return response
  }
}
|
| 261 |
+
|
| 262 |
+
// ============================================
|
| 263 |
+
// Canvas Pool for Performance
|
| 264 |
+
// ============================================
|
| 265 |
+
|
| 266 |
+
/** A canvas owned by the pool, with its 2D context and checkout flag. */
interface PooledCanvas {
  canvas: HTMLCanvasElement
  ctx: CanvasRenderingContext2D
  inUse: boolean
}

/**
 * Small pool of reusable canvases, so repeated rendering does not create a
 * new DOM canvas for every sample.
 *
 * At most `maxSize` canvases are tracked. When all of them are checked out,
 * `acquire` hands out a temporary, untracked canvas instead of blocking.
 */
class CanvasPool {
  private pool: PooledCanvas[] = []
  private maxSize: number = 8

  /**
   * Returns a blank canvas of the requested size together with its 2D context.
   *
   * Preference order: an idle pooled canvas of the same size; a newly created
   * pooled canvas while the pool has room; any idle pooled canvas resized to
   * fit; finally a temporary canvas that is not tracked by the pool.
   *
   * @throws if the browser cannot provide a 2D context.
   */
  acquire(width: number, height: number): { canvas: HTMLCanvasElement; ctx: CanvasRenderingContext2D } {
    const idle = this.pool.filter((entry) => !entry.inUse)

    let reusable = idle.find(
      (entry) => entry.canvas.width === width && entry.canvas.height === height
    )
    // Pool is full: repurpose an idle canvas of another size rather than
    // allocating an untracked one while pooled canvases sit unused.
    if (!reusable && this.pool.length >= this.maxSize) {
      reusable = idle[0]
    }

    if (reusable) {
      reusable.inUse = true
      // Assigning width/height (even to the same values) clears the bitmap
      // and resets the context state, so nothing leaks from the previous user.
      reusable.canvas.width = width
      reusable.canvas.height = height
      return { canvas: reusable.canvas, ctx: reusable.ctx }
    }

    const fresh = this.createCanvas(width, height)
    if (this.pool.length < this.maxSize) {
      this.pool.push({ canvas: fresh.canvas, ctx: fresh.ctx, inUse: true })
    }
    // Otherwise every pooled canvas is busy: the canvas is temporary and is
    // simply garbage-collected once the caller drops it.
    return fresh
  }

  /** Marks a pooled canvas as idle again; ignores canvases the pool does not own. */
  release(canvas: HTMLCanvasElement): void {
    for (const item of this.pool) {
      if (item.canvas === canvas) {
        item.inUse = false
        return
      }
    }
  }

  /** Drops every pooled canvas. */
  clear(): void {
    this.pool = []
  }

  /** Creates a detached canvas of the given size with a 2D context. */
  private createCanvas(
    width: number,
    height: number
  ): { canvas: HTMLCanvasElement; ctx: CanvasRenderingContext2D } {
    const canvas = document.createElement('canvas')
    canvas.width = width
    canvas.height = height
    const ctx = canvas.getContext('2d')
    if (!ctx) {
      throw new Error('2D canvas context is not available')
    }
    return { canvas, ctx }
  }
}
|
| 319 |
+
|
| 320 |
+
// ============================================
|
| 321 |
+
// Unified Storage Manager
|
| 322 |
+
// ============================================
|
| 323 |
+
|
| 324 |
+
/**
 * Single entry point for storing generated samples. Dispatches every call to
 * the backend selected by `config.mode`:
 *
 * - `memory`: samples are kept in an in-process array
 * - `indexeddb`: samples are kept in the browser's IndexedDB
 * - `local`: images are written to a user-selected folder
 * - `huggingface`: images are uploaded through the `/api/storage` route
 *
 * Also owns the canvas pool shared by the renderer.
 */
export class StorageManager {
  private config: StorageConfig
  private indexedDB: IndexedDBStorage | null = null
  private fileSystem: FileSystemStorage | null = null
  private serverStorage: ServerStorage | null = null
  private memoryStorage: StoredSample[] = []
  private canvasPool: CanvasPool

  constructor(config: StorageConfig) {
    this.config = config
    this.canvasPool = new CanvasPool()
    this.initialize()
  }

  /** Instantiates the backend that matches the configured mode. */
  private initialize(): void {
    switch (this.config.mode) {
      case 'indexeddb':
        this.indexedDB = new IndexedDBStorage()
        break
      case 'local':
        this.fileSystem = new FileSystemStorage()
        break
      case 'huggingface':
        // NOTE(review): the upload session id is fixed when ServerStorage is
        // constructed, so a manager reused for several datasets keeps
        // uploading into the same session - confirm this is intended.
        this.serverStorage = new ServerStorage('huggingface')
        break
      // 'memory' uses memoryStorage array
    }
  }

  /**
   * Opens the folder picker (local mode only).
   *
   * @returns the selected dataset path, or `null` if nothing was selected.
   * @throws when the manager is not in `local` mode.
   */
  async selectLocalFolder(): Promise<string | null> {
    if (this.config.mode !== 'local' || !this.fileSystem) {
      throw new Error('Local folder selection only available in local mode')
    }
    return this.fileSystem.selectFolder()
  }

  /** Whether the browser offers the folder picker; `false` outside a browser. */
  isLocalFolderSupported(): boolean {
    // `window` does not exist during server-side rendering.
    return typeof window !== 'undefined' && 'showDirectoryPicker' in window
  }

  /** Returns the canvas pool owned by this manager. */
  getCanvasPool(): CanvasPool {
    return this.canvasPool
  }

  /**
   * Stores one sample in the active backend. The file-based backends persist
   * only `filename` and `blob`; labels are supplied later to `finalize`.
   */
  async storeSample(sample: StoredSample): Promise<void> {
    switch (this.config.mode) {
      case 'memory':
        this.memoryStorage.push(sample)
        break
      case 'indexeddb':
        await this.backend(this.indexedDB).store(sample)
        break
      case 'local':
        await this.backend(this.fileSystem).writeImage(sample.filename, sample.blob)
        break
      case 'huggingface':
        await this.backend(this.serverStorage).writeImage(sample.filename, sample.blob)
        break
    }
  }

  /** Number of samples held client-side; always 0 for the file-based modes. */
  async getSampleCount(): Promise<number> {
    switch (this.config.mode) {
      case 'memory':
        return this.memoryStorage.length
      case 'indexeddb':
        return this.backend(this.indexedDB).count()
      default:
        return 0 // Server modes don't track count locally
    }
  }

  /**
   * Samples held client-side; always empty for the file-based modes.
   * In memory mode this is the manager's own array, not a copy.
   */
  async getAllSamples(): Promise<StoredSample[]> {
    switch (this.config.mode) {
      case 'memory':
        return this.memoryStorage
      case 'indexeddb':
        return this.backend(this.indexedDB).getAll()
      default:
        return [] // Server modes don't support this
    }
  }

  /**
   * Completes the dataset.
   *
   * - `memory` / `indexeddb`: nothing is written; the caller builds the ZIP
   *   from `getAllSamples()` and receives `{ blob: undefined }`.
   * - `local`: writes the label file and `metadata.json`, returns the folder path.
   * - `huggingface`: uploads labels and metadata, finalizes the session and
   *   returns the path reported by the server.
   */
  async finalize(labels: string[], metadata: object): Promise<{ path?: string; blob?: Blob }> {
    switch (this.config.mode) {
      case 'memory':
        // Return samples for ZIP building
        return { blob: undefined }
      case 'indexeddb':
        // Return samples for ZIP building
        return { blob: undefined }
      case 'local': {
        const folder = this.backend(this.fileSystem)
        await folder.writeLabels(labels.join('\n'))
        await folder.writeMetadata(metadata)
        return { path: folder.getPath() || undefined }
      }
      case 'huggingface': {
        const server = this.backend(this.serverStorage)
        await server.writeLabels(labels)
        await server.writeMetadata(metadata)
        const result = await server.finalize()
        return { path: result.path }
      }
    }
  }

  /**
   * Discards samples held client-side (memory array and IndexedDB store) and
   * empties the canvas pool. Files already written to a folder or uploaded
   * to the server are not touched.
   */
  async clear(): Promise<void> {
    this.memoryStorage = []
    if (this.indexedDB) {
      await this.indexedDB.clear()
    }
    this.canvasPool.clear()
  }

  /** Returns the configured storage mode. */
  getMode(): StorageMode {
    return this.config.mode
  }

  /** Closes the IndexedDB connection (if any) and empties the canvas pool. */
  close(): void {
    if (this.indexedDB) {
      this.indexedDB.close()
    }
    this.canvasPool.clear()
  }

  /** Returns the backend instance, or throws a descriptive error if it was never created. */
  private backend<T>(instance: T | null): T {
    if (instance === null) {
      throw new Error(`Storage backend for mode '${this.config.mode}' is not initialised`)
    }
    return instance
  }
}
|
| 446 |
+
|
| 447 |
+
// ============================================
|
| 448 |
+
// Singleton Instance
|
| 449 |
+
// ============================================
|
| 450 |
+
|
| 451 |
+
// Module-level singleton: created lazily by getStorageManager(), dropped by resetStorageManager()
let storageManagerInstance: StorageManager | null = null
|
| 452 |
+
|
| 453 |
+
/**
 * Returns the shared StorageManager, creating it on first use.
 *
 * The existing instance is kept unless `config` is given and asks for a
 * different mode; in that case the old instance is closed and replaced.
 * Without a config and without an existing instance, a memory-mode manager
 * is created.
 */
export function getStorageManager(config?: StorageConfig): StorageManager {
  const existing = storageManagerInstance

  if (existing) {
    const wantsOtherMode = config ? config.mode !== existing.getMode() : false
    if (!wantsOtherMode) {
      return existing
    }
    existing.close()
  }

  const created = new StorageManager(config || { mode: 'memory' })
  storageManagerInstance = created
  return created
}
|
| 462 |
+
|
| 463 |
+
/** Closes and forgets the shared StorageManager; does nothing if none exists. */
export function resetStorageManager(): void {
  if (!storageManagerInstance) return

  storageManagerInstance.close()
  storageManagerInstance = null
}
|