Omarrran committed on
Commit
004fbbb
·
1 Parent(s): 599349f

Update OCR Dataset Generator

Browse files
web/app/api/storage/route.ts ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { NextRequest, NextResponse } from 'next/server'
2
+ import { writeFile, mkdir, readdir, rm, stat } from 'fs/promises'
3
+ import { existsSync } from 'fs'
4
+ import path from 'path'
5
+
6
// Storage directories (each can be overridden through an environment variable)
const OUTPUT_DIR = process.env.OUTPUT_DIR || './output/datasets'
const DATA_DIR = process.env.DATA_DIR || '/data/datasets'

/**
 * Parse the CLEANUP_HOURS setting.
 *
 * @param raw - Raw environment value, possibly undefined or empty.
 * @returns The parsed whole number of hours, or 24 when the value is missing,
 *   not numeric, or negative. Without this guard an invalid value becomes NaN,
 *   and every `age > NaN` comparison is false, silently disabling cleanup.
 */
function parseCleanupHours(raw: string | undefined): number {
  const hours = Number.parseInt(raw || '24', 10)
  return Number.isFinite(hours) && hours >= 0 ? hours : 24
}

// Default retention window (hours) for stored datasets
const CLEANUP_HOURS = parseCleanupHours(process.env.CLEANUP_HOURS)
10
+
11
/**
 * Pick the storage root for a storage mode.
 *
 * @param mode - `'huggingface'` selects DATA_DIR; every other value selects OUTPUT_DIR.
 * @returns The base directory under which session folders are created.
 */
function getBaseDir(mode: 'local' | 'huggingface'): string {
  if (mode === 'huggingface') {
    return DATA_DIR
  }
  return OUTPUT_DIR
}
15
+
16
/**
 * Make sure `dirPath` exists, creating missing parent directories as needed.
 *
 * `mkdir` with `recursive: true` already succeeds when the directory is
 * present, so no separate existence check is needed. Dropping the check also
 * removes a check-then-create race and a synchronous filesystem call.
 *
 * @param dirPath - Directory to create.
 * @throws If the directory cannot be created (for example a file already
 *   occupies the path, or permissions are missing).
 */
async function ensureDir(dirPath: string): Promise<void> {
  await mkdir(dirPath, { recursive: true })
}
22
+
23
/**
 * Accept `value` only when it is usable as a single path component.
 *
 * Rejects non-strings, empty strings, `.`/`..`, and anything containing a
 * path separator or NUL byte, so a client-supplied name can never escape the
 * directory it is joined onto.
 *
 * @returns The unchanged string when safe, otherwise `null`.
 */
function safePathSegment(value: unknown): string | null {
  if (typeof value !== 'string' || value.length === 0) return null
  if (value === '.' || value === '..') return null
  if (/[\/\\\0]/.test(value)) return null
  return value
}

/**
 * POST - Store an uploaded image, write labels/metadata, or finalize a session.
 *
 * - `multipart/form-data`: fields `file`, `sessionId` and optional `mode`;
 *   the file is written to `<baseDir>/<sessionId>/images/<file.name>`.
 * - Any other content type is parsed as JSON `{ sessionId, mode, type, content }`
 *   where `type` is `labels`, `metadata` or `finalize`.
 *
 * Responds with 400 for missing or unsafe input and 500 for unexpected errors.
 *
 * NOTE(review): no authentication is performed in this handler — confirm that
 * access is restricted elsewhere.
 */
export async function POST(request: NextRequest) {
  try {
    const contentType = request.headers.get('content-type') || ''

    if (contentType.includes('multipart/form-data')) {
      // Handle file upload
      const formData = await request.formData()
      const file = formData.get('file')
      const rawSessionId = formData.get('sessionId')
      const mode = (formData.get('mode') as 'local' | 'huggingface') || 'local'

      // A form field named "file" can also arrive as a plain string; only real uploads are accepted
      if (!file || typeof file === 'string' || !rawSessionId) {
        return NextResponse.json({ error: 'Missing file or sessionId' }, { status: 400 })
      }

      // Both names are client-controlled and end up in a filesystem path
      const sessionId = safePathSegment(rawSessionId)
      const fileName = safePathSegment(file.name)
      if (!sessionId || !fileName) {
        return NextResponse.json({ error: 'Invalid sessionId or file name' }, { status: 400 })
      }

      const imagesDir = path.join(getBaseDir(mode), sessionId, 'images')
      await ensureDir(imagesDir)

      const filePath = path.join(imagesDir, fileName)
      await writeFile(filePath, Buffer.from(await file.arrayBuffer()))

      return NextResponse.json({ success: true, path: filePath })
    }

    // Handle JSON requests
    const body = await request.json()
    const { sessionId: rawSessionId, mode, type, content } = body

    if (!rawSessionId || !mode || !type) {
      return NextResponse.json({ error: 'Missing required fields' }, { status: 400 })
    }

    const sessionId = safePathSegment(rawSessionId)
    if (!sessionId) {
      return NextResponse.json({ error: 'Invalid sessionId' }, { status: 400 })
    }

    const sessionDir = path.join(getBaseDir(mode), sessionId)
    await ensureDir(sessionDir)

    switch (type) {
      case 'labels':
      case 'metadata': {
        // writeFile would throw on a missing or non-string payload; report it as a client error instead
        if (typeof content !== 'string') {
          return NextResponse.json({ error: 'Missing content' }, { status: 400 })
        }
        const targetName = type === 'labels' ? 'labels.txt' : 'metadata.json'
        await writeFile(path.join(sessionDir, targetName), content)
        return NextResponse.json({ success: true })
      }

      case 'finalize':
        // Mark as complete by writing a timestamp file
        await writeFile(path.join(sessionDir, '.complete'), new Date().toISOString())
        return NextResponse.json({
          success: true,
          path: sessionDir,
          downloadUrl: `/api/storage/download?session=${encodeURIComponent(sessionId)}&mode=${encodeURIComponent(String(mode))}`
        })

      default:
        return NextResponse.json({ error: 'Unknown type' }, { status: 400 })
    }
  } catch (error) {
    console.error('Storage API error:', error)
    return NextResponse.json({ error: 'Internal server error' }, { status: 500 })
  }
}
92
+
93
/**
 * GET - List stored datasets (`action=list`) or report the storage
 * configuration (`action=config`).
 *
 * Query parameters:
 * - `action`: `list` or `config`; any other value yields HTTP 400.
 * - `mode`: `huggingface` lists DATA_DIR, anything else (default `local`)
 *   lists OUTPUT_DIR.
 */
export async function GET(request: NextRequest) {
  try {
    const { searchParams } = new URL(request.url)
    const action = searchParams.get('action')
    const mode = (searchParams.get('mode') as 'local' | 'huggingface') || 'local'

    switch (action) {
      // Braces give the case its own scope for the const declarations below
      case 'list': {
        const baseDir = getBaseDir(mode)
        if (!existsSync(baseDir)) {
          return NextResponse.json({ datasets: [] })
        }

        const entries = await readdir(baseDir, { withFileTypes: true })
        const datasets = await Promise.all(
          entries
            .filter(e => e.isDirectory())
            .map(async (entry) => {
              const datasetPath = path.join(baseDir, entry.name)
              const stats = await stat(datasetPath)
              // Some filesystems report no creation time (epoch 0); use the
              // modification time there so the age is not wildly overstated.
              const createdAt = stats.birthtime.getTime() > 0 ? stats.birthtime : stats.mtime

              return {
                name: entry.name,
                path: datasetPath,
                createdAt: createdAt.toISOString(),
                // A ".complete" marker file is written when a session is finalized
                isComplete: existsSync(path.join(datasetPath, '.complete')),
                ageHours: (Date.now() - createdAt.getTime()) / (1000 * 60 * 60)
              }
            })
        )

        return NextResponse.json({ datasets })
      }

      case 'config':
        return NextResponse.json({
          outputDir: OUTPUT_DIR,
          dataDir: DATA_DIR,
          cleanupHours: CLEANUP_HOURS,
          // Presence of /data is used as the signal for a Hugging Face environment
          isHuggingFace: existsSync('/data')
        })

      default:
        return NextResponse.json({ error: 'Unknown action' }, { status: 400 })
    }
  } catch (error) {
    console.error('Storage GET error:', error)
    return NextResponse.json({ error: 'Internal server error' }, { status: 500 })
  }
}
145
+
146
/**
 * DELETE - Remove dataset directories older than a maximum age.
 *
 * Query parameters:
 * - `mode`: `huggingface` cleans DATA_DIR, anything else (default `local`)
 *   cleans OUTPUT_DIR.
 * - `maxAge`: age threshold in whole hours; defaults to CLEANUP_HOURS. A
 *   non-numeric or negative value is rejected with HTTP 400, because a
 *   negative threshold would otherwise match (and delete) every dataset.
 *
 * A failure on one dataset is logged and reported in `errors` without
 * stopping the sweep.
 *
 * NOTE(review): no authentication is performed in this handler — confirm that
 * access is restricted elsewhere.
 */
export async function DELETE(request: NextRequest) {
  try {
    const { searchParams } = new URL(request.url)
    const mode = (searchParams.get('mode') as 'local' | 'huggingface') || 'local'
    const maxAgeHours = Number.parseInt(searchParams.get('maxAge') || String(CLEANUP_HOURS), 10)

    if (!Number.isFinite(maxAgeHours) || maxAgeHours < 0) {
      return NextResponse.json({ error: 'Invalid maxAge' }, { status: 400 })
    }

    const baseDir = getBaseDir(mode)
    if (!existsSync(baseDir)) {
      return NextResponse.json({ cleaned: 0, message: 'No datasets directory' })
    }

    const entries = await readdir(baseDir, { withFileTypes: true })
    let cleaned = 0
    const cleanedPaths: string[] = []
    const errors: string[] = []

    for (const entry of entries) {
      if (!entry.isDirectory()) continue

      const datasetPath = path.join(baseDir, entry.name)
      try {
        const stats = await stat(datasetPath)
        // Some filesystems report no creation time (epoch 0), which would make
        // every dataset look decades old; use the modification time there.
        const createdMs = stats.birthtime.getTime() > 0 ? stats.birthtime.getTime() : stats.mtime.getTime()
        const ageHours = (Date.now() - createdMs) / (1000 * 60 * 60)

        if (ageHours > maxAgeHours) {
          await rm(datasetPath, { recursive: true, force: true })
          cleaned++
          cleanedPaths.push(entry.name)
          console.log(`Cleaned up old dataset: ${entry.name} (${ageHours.toFixed(1)} hours old)`)
        }
      } catch (err) {
        const errorMsg = `Failed to clean ${entry.name}: ${err}`
        console.error(errorMsg)
        errors.push(errorMsg)
      }
    }

    return NextResponse.json({
      cleaned,
      cleanedPaths,
      errors,
      message: `Cleaned ${cleaned} datasets older than ${maxAgeHours} hours`
    })
  } catch (error) {
    console.error('Cleanup error:', error)
    return NextResponse.json({ error: 'Internal server error' }, { status: 500 })
  }
}
web/components/generation-panel.tsx CHANGED
@@ -1,10 +1,12 @@
1
  'use client'
2
 
3
  import { useState, useEffect, useRef, Dispatch, SetStateAction } from 'react'
4
- import { Play, Pause, StopCircle, Download, CheckCircle2, FileArchive, AlertCircle, DownloadCloud, Cpu, Zap, Layers, HelpCircle } from 'lucide-react'
5
  import { GenerationStats } from './stats-panel'
6
- import { generateDataset, downloadDataset, GeneratorConfig, getGenerationState, buildPartialZip } from '@/lib/generator'
7
  import { isWebGLAvailable } from '@/lib/gpu-augmentation'
 
 
8
 
9
  interface GenerationPanelProps {
10
  config: any
@@ -46,6 +48,19 @@ export function GenerationPanel({
46
  const logsEndRef = useRef<HTMLDivElement>(null)
47
  const abortControllerRef = useRef<AbortController | null>(null)
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  const scrollToBottom = () => {
50
  logsEndRef.current?.scrollIntoView({ behavior: 'smooth' })
51
  }
@@ -123,6 +138,85 @@ export function GenerationPanel({
123
  ])
124
 
125
  try {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  const generatorConfig: GeneratorConfig = {
127
  dataset: config.dataset,
128
  input: {
@@ -501,8 +595,8 @@ export function GenerationPanel({
501
  <span className="text-sm">Augmentation</span>
502
  </div>
503
  <span className={`text-xs font-bold px-2 py-1 rounded ${isWebGLAvailable()
504
- ? 'bg-green-500/20 text-green-400'
505
- : 'bg-blue-500/20 text-blue-400'
506
  }`}>
507
  {isWebGLAvailable() ? '🎮 GPU' : '💻 CPU'}
508
  </span>
@@ -533,6 +627,161 @@ export function GenerationPanel({
533
  </div>
534
  </div>
535
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
536
  </div>
537
 
538
  {/* Progress & Logs */}
 
1
  'use client'
2
 
3
  import { useState, useEffect, useRef, Dispatch, SetStateAction } from 'react'
4
+ import { Play, Pause, StopCircle, Download, CheckCircle2, FileArchive, AlertCircle, DownloadCloud, Cpu, Zap, Layers, HelpCircle, FolderOpen, Database, Cloud, HardDrive, Settings2 } from 'lucide-react'
5
  import { GenerationStats } from './stats-panel'
6
+ import { generateDataset, downloadDataset, GeneratorConfig, getGenerationState, buildPartialZip, clearCanvasPool } from '@/lib/generator'
7
  import { isWebGLAvailable } from '@/lib/gpu-augmentation'
8
+ import { StorageMode, getStorageManager, resetStorageManager } from '@/lib/storage-manager'
9
+ import { generateDatasetDirect, isDirectGenerationSupported, DirectGenerationConfig } from '@/lib/direct-generator'
10
 
11
  interface GenerationPanelProps {
12
  config: any
 
48
  const logsEndRef = useRef<HTMLDivElement>(null)
49
  const abortControllerRef = useRef<AbortController | null>(null)
50
 
51
+ // Storage mode state
52
+ const [storageMode, setStorageMode] = useState<StorageMode>('memory')
53
+ const [selectedFolder, setSelectedFolder] = useState<string | null>(null)
54
+ const [outputPath, setOutputPath] = useState<string | null>(null)
55
+ const [showStorageSettings, setShowStorageSettings] = useState(false)
56
+
57
+ // Directory handle for local folder mode (File System Access API)
58
+ const directoryHandleRef = useRef<FileSystemDirectoryHandle | null>(null)
59
+
60
+ // Rolling rate calculation (last 100 samples)
61
+ const recentSamples = useRef<{ time: number; count: number }[]>([])
62
+ const ROLLING_WINDOW = 100 // Calculate rate from last 100 samples
63
+
64
  const scrollToBottom = () => {
65
  logsEndRef.current?.scrollIntoView({ behavior: 'smooth' })
66
  }
 
138
  ])
139
 
140
  try {
141
+ // === LOCAL FOLDER MODE: Direct-to-disk generation ===
142
+ if (storageMode === 'local' && directoryHandleRef.current) {
143
+ setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] 📂 Using DIRECT-TO-DISK mode (no memory limits)`]);
144
+
145
+ const directConfig: DirectGenerationConfig = {
146
+ dataset: config.dataset,
147
+ image: config.image,
148
+ fonts: config.fonts,
149
+ augmentation: config.augmentation,
150
+ output: config.output
151
+ };
152
+
153
+ const genStartTime = Date.now();
154
+ const directResult = await generateDatasetDirect(
155
+ directoryHandleRef.current,
156
+ directConfig,
157
+ data,
158
+ (prog, message) => {
159
+ if (abortControllerRef.current?.signal.aborted) return;
160
+ setProgress(prog);
161
+
162
+ const elapsed = (Date.now() - genStartTime) / 1000;
163
+ if (elapsed > 0 && prog > 0) {
164
+ const samplesGenerated = Math.round(prog * targetSize / 100);
165
+ setRate(Math.round(samplesGenerated / elapsed));
166
+
167
+ const remaining = (100 - prog) / prog * elapsed;
168
+ if (remaining < 60) {
169
+ setEta(`${Math.round(remaining)}s`);
170
+ } else if (remaining < 3600) {
171
+ setEta(`${Math.round(remaining / 60)}m ${Math.round(remaining % 60)}s`);
172
+ } else {
173
+ setEta(`${Math.floor(remaining / 3600)}h ${Math.round((remaining % 3600) / 60)}m`);
174
+ }
175
+ }
176
+
177
+ if (prog % 10 === 0 || prog >= 99) {
178
+ setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] ${message}`]);
179
+ }
180
+ },
181
+ abortControllerRef.current.signal
182
+ );
183
+
184
+ setGenerationStatus('completed');
185
+ setIsGenerating(false);
186
+ setOutputPath(directResult.outputPath);
187
+
188
+ setLogs(prev => [
189
+ ...prev,
190
+ `[${new Date().toLocaleTimeString()}] ✅ Generation complete!`,
191
+ `[${new Date().toLocaleTimeString()}] 📁 Files saved to: ${directResult.outputPath}`,
192
+ `[${new Date().toLocaleTimeString()}] 📊 Total samples: ${directResult.totalSamples.toLocaleString()}`,
193
+ `[${new Date().toLocaleTimeString()}] ⏱️ Duration: ${directResult.durationSeconds.toFixed(1)}s`,
194
+ `[${new Date().toLocaleTimeString()}] 💡 No ZIP needed - files are ready to use!`,
195
+ ]);
196
+
197
+ // Call onGenerationComplete with minimal stats (no ZIP blob)
198
+ if (onGenerationComplete) {
199
+ onGenerationComplete({
200
+ total_samples: directResult.totalSamples,
201
+ duration_seconds: directResult.durationSeconds,
202
+ samples_per_second: directResult.totalSamples / directResult.durationSeconds,
203
+ font_distribution: [],
204
+ clean_samples: directResult.cleanSamples,
205
+ augmented_samples: directResult.augmentedSamples,
206
+ augmentation_stats: [],
207
+ avg_transforms_per_sample: 0,
208
+ unique_tokens: data.length,
209
+ avg_chars_per_sample: 0,
210
+ unicode_valid: directResult.totalSamples,
211
+ script_pure: directResult.totalSamples,
212
+ rejected_samples: 0
213
+ }, new Blob()); // Empty blob since files are on disk
214
+ }
215
+
216
+ return; // Exit early - no ZIP needed
217
+ }
218
+
219
+ // === MEMORY MODE: Original ZIP-based generation ===
220
  const generatorConfig: GeneratorConfig = {
221
  dataset: config.dataset,
222
  input: {
 
595
  <span className="text-sm">Augmentation</span>
596
  </div>
597
  <span className={`text-xs font-bold px-2 py-1 rounded ${isWebGLAvailable()
598
+ ? 'bg-green-500/20 text-green-400'
599
+ : 'bg-blue-500/20 text-blue-400'
600
  }`}>
601
  {isWebGLAvailable() ? '🎮 GPU' : '💻 CPU'}
602
  </span>
 
627
  </div>
628
  </div>
629
  </div>
630
+
631
+ {/* Storage Settings Panel */}
632
+ <div className="glass rounded-xl p-6">
633
+ <div className="flex items-center justify-between mb-4">
634
+ <h3 className="font-medium flex items-center gap-2">
635
+ <Database className="w-4 h-4 text-blue-400" />
636
+ Storage Settings
637
+ </h3>
638
+ <button
639
+ onClick={() => setShowStorageSettings(!showStorageSettings)}
640
+ className="text-xs text-muted-foreground hover:text-primary transition-colors flex items-center gap-1"
641
+ >
642
+ <Settings2 className="w-3 h-3" />
643
+ {showStorageSettings ? 'Hide' : 'Configure'}
644
+ </button>
645
+ </div>
646
+
647
+ {/* Current Mode Display */}
648
+ <div className="flex items-center justify-between p-2 rounded-lg bg-secondary/50 mb-3">
649
+ <div className="flex items-center gap-2">
650
+ {storageMode === 'memory' && <HardDrive className="w-4 h-4 text-yellow-400" />}
651
+ {storageMode === 'indexeddb' && <Database className="w-4 h-4 text-blue-400" />}
652
+ {storageMode === 'local' && <FolderOpen className="w-4 h-4 text-green-400" />}
653
+ {storageMode === 'huggingface' && <Cloud className="w-4 h-4 text-purple-400" />}
654
+ <span className="text-sm">Storage Mode</span>
655
+ </div>
656
+ <span className={`text-xs font-bold px-2 py-1 rounded ${storageMode === 'memory' ? 'bg-yellow-500/20 text-yellow-400' :
657
+ storageMode === 'indexeddb' ? 'bg-blue-500/20 text-blue-400' :
658
+ storageMode === 'local' ? 'bg-green-500/20 text-green-400' :
659
+ 'bg-purple-500/20 text-purple-400'
660
+ }`}>
661
+ {storageMode === 'memory' ? '💾 Memory' :
662
+ storageMode === 'indexeddb' ? '🗄️ Browser' :
663
+ storageMode === 'local' ? '📁 Local' :
664
+ '🤗 HuggingFace'}
665
+ </span>
666
+ </div>
667
+
668
+ {/* Expanded Settings */}
669
+ {showStorageSettings && (
670
+ <div className="space-y-3 pt-2 border-t border-white/10">
671
+ <p className="text-xs text-muted-foreground">
672
+ Select where to store generated images:
673
+ </p>
674
+
675
+ {/* Storage Mode Buttons */}
676
+ <div className="grid grid-cols-2 gap-2">
677
+ <button
678
+ onClick={() => { setStorageMode('memory'); setSelectedFolder(null); }}
679
+ className={`p-2 rounded-lg text-xs flex flex-col items-center gap-1 transition-all ${storageMode === 'memory'
680
+ ? 'bg-yellow-500/20 text-yellow-400 ring-1 ring-yellow-500/50'
681
+ : 'bg-secondary/50 hover:bg-secondary'
682
+ }`}
683
+ >
684
+ <HardDrive className="w-4 h-4" />
685
+ <span>Memory</span>
686
+ <span className="text-[10px] opacity-60">Fast, &lt;2k samples</span>
687
+ </button>
688
+
689
+ <button
690
+ onClick={() => { setStorageMode('indexeddb'); setSelectedFolder(null); }}
691
+ className={`p-2 rounded-lg text-xs flex flex-col items-center gap-1 transition-all ${storageMode === 'indexeddb'
692
+ ? 'bg-blue-500/20 text-blue-400 ring-1 ring-blue-500/50'
693
+ : 'bg-secondary/50 hover:bg-secondary'
694
+ }`}
695
+ >
696
+ <Database className="w-4 h-4" />
697
+ <span>Browser DB</span>
698
+ <span className="text-[10px] opacity-60">Persistent, resumable</span>
699
+ </button>
700
+
701
+ <button
702
+ onClick={async () => {
703
+ // Check if File System API is supported
704
+ if (!('showDirectoryPicker' in window)) {
705
+ setError('Folder picker not supported in this browser. Use Chrome, Edge, or Opera.');
706
+ setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] ❌ File System API not supported - try Chrome or Edge`]);
707
+ return;
708
+ }
709
+
710
+ setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] 📂 Opening folder picker...`]);
711
+
712
+ try {
713
+ // Directly use the File System Access API
714
+ // @ts-ignore - File System Access API
715
+ const directoryHandle = await window.showDirectoryPicker({
716
+ mode: 'readwrite',
717
+ startIn: 'documents'
718
+ });
719
+
720
+ // Store handle for generation - don't pre-create folders
721
+ directoryHandleRef.current = directoryHandle;
722
+
723
+ setStorageMode('local');
724
+ setSelectedFolder(directoryHandle.name);
725
+ setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] ✅ Output folder: ${directoryHandle.name} (direct-to-disk mode enabled)`]);
726
+ setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] 💡 Files will be written directly - no memory limits!`]);
727
+
728
+ } catch (err: any) {
729
+ if (err.name === 'AbortError') {
730
+ // User cancelled the picker
731
+ setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] ℹ️ Folder selection cancelled`]);
732
+ } else {
733
+ console.error('Folder selection failed:', err);
734
+ setError(`Failed to select folder: ${err.message}`);
735
+ setLogs(prev => [...prev, `[${new Date().toLocaleTimeString()}] ❌ Error: ${err.message}`]);
736
+ }
737
+ }
738
+ }}
739
+ className={`p-2 rounded-lg text-xs flex flex-col items-center gap-1 transition-all ${storageMode === 'local'
740
+ ? 'bg-green-500/20 text-green-400 ring-1 ring-green-500/50'
741
+ : 'bg-secondary/50 hover:bg-secondary'
742
+ }`}
743
+ >
744
+ <FolderOpen className="w-4 h-4" />
745
+ <span>Local Folder</span>
746
+ <span className="text-[10px] opacity-60">Direct disk access</span>
747
+ </button>
748
+
749
+ <button
750
+ onClick={() => { setStorageMode('huggingface'); setSelectedFolder(null); }}
751
+ className={`p-2 rounded-lg text-xs flex flex-col items-center gap-1 transition-all ${storageMode === 'huggingface'
752
+ ? 'bg-purple-500/20 text-purple-400 ring-1 ring-purple-500/50'
753
+ : 'bg-secondary/50 hover:bg-secondary'
754
+ }`}
755
+ >
756
+ <Cloud className="w-4 h-4" />
757
+ <span>HuggingFace</span>
758
+ <span className="text-[10px] opacity-60">Cloud storage</span>
759
+ </button>
760
+ </div>
761
+
762
+ {/* Selected Folder Display */}
763
+ {storageMode === 'local' && selectedFolder && (
764
+ <div className="p-2 rounded-lg bg-green-500/10 text-xs">
765
+ <span className="text-green-400">📂 Output: </span>
766
+ <span className="text-muted-foreground">{selectedFolder}</span>
767
+ </div>
768
+ )}
769
+
770
+ {/* Output Path Display */}
771
+ {outputPath && (
772
+ <div className="p-2 rounded-lg bg-blue-500/10 text-xs">
773
+ <span className="text-blue-400">✅ Saved to: </span>
774
+ <span className="text-muted-foreground">{outputPath}</span>
775
+ </div>
776
+ )}
777
+
778
+ {/* Tip */}
779
+ <p className="text-[10px] text-muted-foreground">
780
+ 💡 For large datasets (5k+), use Browser DB or Local Folder to avoid memory issues.
781
+ </p>
782
+ </div>
783
+ )}
784
+ </div>
785
  </div>
786
 
787
  {/* Progress & Logs */}
web/lib/cleanup.ts ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Auto-cleanup service for old datasets
2
+ // Runs on server startup and can be triggered via API
3
+
4
+ import { readdir, stat, rm } from 'fs/promises'
5
+ import { existsSync } from 'fs'
6
+ import path from 'path'
7
+
8
// Storage directories (each can be overridden through an environment variable)
const OUTPUT_DIR = process.env.OUTPUT_DIR || './output/datasets'
const DATA_DIR = process.env.DATA_DIR || '/data/datasets'

/**
 * Parse the CLEANUP_HOURS setting.
 *
 * @param raw - Raw environment value, possibly undefined or empty.
 * @returns The parsed whole number of hours, or 24 when the value is missing,
 *   not numeric, or negative. Without this guard an invalid value becomes NaN,
 *   and every `age > NaN` comparison is false, silently disabling cleanup.
 */
function parseCleanupHours(raw: string | undefined): number {
  const hours = Number.parseInt(raw || '24', 10)
  return Number.isFinite(hours) && hours >= 0 ? hours : 24
}

// Default retention window (hours) for stored datasets
const CLEANUP_HOURS = parseCleanupHours(process.env.CLEANUP_HOURS)
11
+
12
/** Summary of one cleanup pass. */
export interface CleanupResult {
  /** Number of dataset directories that were removed. */
  cleaned: number
  /** Paths of the dataset directories that were removed. */
  cleanedPaths: Array<string>
  /** Messages for directories that could not be read or removed. */
  errors: Array<string>
}
17
+
18
/**
 * Clean up datasets older than the specified age.
 *
 * Every sub-directory of the selected base directories whose age exceeds
 * `maxAgeHours` is removed recursively. Failures are collected in
 * `result.errors` instead of being thrown, so one bad entry does not stop
 * the pass.
 *
 * @param maxAgeHours - Age threshold in hours (defaults to CLEANUP_HOURS).
 *   A non-finite or negative value is reported as an error and nothing is
 *   deleted, since a negative threshold would match every dataset.
 * @param mode - Which storage roots to sweep: OUTPUT_DIR (`local`), DATA_DIR
 *   (`huggingface`) or both.
 * @returns Counts, removed paths and error messages for the pass.
 */
export async function cleanupOldDatasets(
  maxAgeHours: number = CLEANUP_HOURS,
  mode: 'local' | 'huggingface' | 'both' = 'both'
): Promise<CleanupResult> {
  const result: CleanupResult = {
    cleaned: 0,
    cleanedPaths: [],
    errors: []
  }

  if (!Number.isFinite(maxAgeHours) || maxAgeHours < 0) {
    const errorMsg = `Invalid maxAgeHours: ${maxAgeHours}`
    console.error(`[Cleanup] ${errorMsg}`)
    result.errors.push(errorMsg)
    return result
  }

  const dirsToClean: string[] = []
  if (mode === 'local' || mode === 'both') {
    dirsToClean.push(OUTPUT_DIR)
  }
  if (mode === 'huggingface' || mode === 'both') {
    dirsToClean.push(DATA_DIR)
  }

  for (const baseDir of dirsToClean) {
    // Missing storage roots are skipped silently rather than reported as errors
    if (!existsSync(baseDir)) continue

    try {
      const entries = await readdir(baseDir, { withFileTypes: true })

      for (const entry of entries) {
        if (!entry.isDirectory()) continue

        const datasetPath = path.join(baseDir, entry.name)
        try {
          const stats = await stat(datasetPath)
          // Some filesystems report no creation time (epoch 0), which would
          // make every dataset look decades old; use the modification time there.
          const createdMs = stats.birthtime.getTime() > 0 ? stats.birthtime.getTime() : stats.mtime.getTime()
          const ageHours = (Date.now() - createdMs) / (1000 * 60 * 60)

          if (ageHours > maxAgeHours) {
            await rm(datasetPath, { recursive: true, force: true })
            result.cleaned++
            result.cleanedPaths.push(datasetPath)
            console.log(`[Cleanup] Removed: ${entry.name} (${ageHours.toFixed(1)}h old)`)
          }
        } catch (err) {
          const errorMsg = `Failed to clean ${datasetPath}: ${err}`
          console.error(`[Cleanup] ${errorMsg}`)
          result.errors.push(errorMsg)
        }
      }
    } catch (err) {
      const errorMsg = `Failed to read ${baseDir}: ${err}`
      console.error(`[Cleanup] ${errorMsg}`)
      result.errors.push(errorMsg)
    }
  }

  console.log(`[Cleanup] Complete: ${result.cleaned} datasets removed`)
  return result
}
77
+
78
/**
 * Schedule cleanup to run periodically (call from server initialization).
 *
 * Runs one cleanup immediately, then repeats every `intervalHours`.
 *
 * @param intervalHours - Hours between runs (default 1). A non-finite or
 *   non-positive value falls back to 1 hour, and the delay is capped at the
 *   largest value timers accept; outside that range Node silently uses a
 *   1 ms delay, which would run cleanup in a tight loop.
 * @returns The interval handle; pass it to `clearInterval` to stop the schedule.
 */
export function scheduleCleanup(intervalHours: number = 1): NodeJS.Timeout {
  const MS_PER_HOUR = 60 * 60 * 1000
  const MAX_TIMER_DELAY_MS = 2147483647 // 2^31 - 1, the largest delay setInterval honours

  let hours = intervalHours
  if (!Number.isFinite(hours) || hours <= 0) {
    console.warn(`[Cleanup] Invalid interval ${intervalHours}; falling back to 1 hour`)
    hours = 1
  }

  let delayMs = hours * MS_PER_HOUR
  if (delayMs > MAX_TIMER_DELAY_MS) {
    console.warn(`[Cleanup] Interval of ${hours} hour(s) exceeds the timer limit; using the maximum delay`)
    delayMs = MAX_TIMER_DELAY_MS
  }

  console.log(`[Cleanup] Scheduled every ${hours} hour(s), removing datasets older than ${CLEANUP_HOURS} hours`)

  // Run immediately on startup
  cleanupOldDatasets().catch(console.error)

  // Then run periodically
  return setInterval(() => {
    cleanupOldDatasets().catch(console.error)
  }, Math.max(delayMs, 1))
}
92
+
93
/**
 * Get storage statistics.
 *
 * Counts the dataset directories directly under OUTPUT_DIR and DATA_DIR and
 * sums the size of every file inside them. A directory's own `stat().size`
 * does not include its contents, so the tree is walked recursively.
 *
 * @returns Dataset counts per storage root and the combined size, rounded to
 *   whole mebibytes.
 * @throws If a directory or file cannot be read while walking the tree.
 */
export async function getStorageStats(): Promise<{
  localDatasets: number
  hfDatasets: number
  totalSizeMB: number
}> {
  // Total size in bytes of all regular files below `dir`
  async function directorySize(dir: string): Promise<number> {
    const entries = await readdir(dir, { withFileTypes: true })
    let size = 0

    for (const entry of entries) {
      const entryPath = path.join(dir, entry.name)
      if (entry.isDirectory()) {
        size += await directorySize(entryPath)
      } else if (entry.isFile()) {
        size += (await stat(entryPath)).size
      }
    }

    return size
  }

  // Number of dataset directories in `dir` and their combined size
  async function countDir(dir: string): Promise<{ count: number; size: number }> {
    if (!existsSync(dir)) return { count: 0, size: 0 }

    const entries = await readdir(dir, { withFileTypes: true })
    let count = 0
    let size = 0

    for (const entry of entries) {
      if (entry.isDirectory()) {
        count++
        size += await directorySize(path.join(dir, entry.name))
      }
    }

    return { count, size }
  }

  const localStats = await countDir(OUTPUT_DIR)
  const hfStats = await countDir(DATA_DIR)

  return {
    localDatasets: localStats.count,
    hfDatasets: hfStats.count,
    totalSizeMB: Math.round((localStats.size + hfStats.size) / (1024 * 1024))
  }
}
web/lib/direct-generator.ts ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Direct-to-disk dataset generation
2
+ // Writes files directly to a user-selected folder without building ZIP in memory
3
+
4
/** Settings consumed by the direct-to-disk generator. */
export interface DirectGenerationConfig {
  /** Dataset-level settings. */
  dataset: {
    /** Requested number of samples. */
    size: number
    /** Base seed for the per-sample random generator. */
    seed: number
  }
  /** Rendering settings for each image. */
  image: {
    /** Canvas width in pixels. */
    width: number
    /** Canvas height in pixels. */
    height: number
    /** Background fill colour. */
    background: string
    /** Name of a predefined background colour. */
    backgroundStyle?: string
    /** NOTE(review): presumably selects one background vs. a mix — confirm against the generator. */
    backgroundMode?: 'single' | 'mix'
    /** NOTE(review): presumably per-style percentages for `mix` mode — confirm against the generator. */
    backgroundPercentages?: Record<string, number>
    /** Text fill colour. */
    textColor: string
    /** Text direction, e.g. `rtl`. */
    direction: string
  }
  /** Font selection. */
  fonts: {
    /** Candidate fonts with their selection percentages; `dataUrl` optionally carries the font data. */
    distribution: Array<{ name: string; family: string; percentage: number; dataUrl?: string }>
  }
  /** Augmentation settings. */
  augmentation: {
    /** Whether augmentation is applied at all. */
    enabled: boolean
    /** Percentage of samples to augment. */
    applyPercentage: number
    /** Name of the augmentation preset. */
    preset: string
    /** NOTE(review): presumably switches from the preset to `values` — confirm. */
    customMode?: boolean
    /** Per-transform strengths keyed by transform name. */
    values?: Record<string, number>
  }
  /** Output settings. */
  output: {
    /** Names of the label formats to write. */
    formats: Array<string>
  }
}
33
+
34
/** Summary returned once a direct-to-disk generation run has finished. */
export interface DirectGenerationResult {
  /** Number of samples in the run. */
  totalSamples: number
  /** Duration of the run in seconds. */
  durationSeconds: number
  /** Location the dataset was written to, for display to the user. */
  outputPath: string
  /** Number of samples generated without augmentation. */
  cleanSamples: number
  /** Number of samples selected for augmentation. */
  augmentedSamples: number
}
41
+
42
// Fill colour (hex) for each named background style
const backgroundColors: Record<string, string> = Object.fromEntries([
  ['clean_white', '#FFFFFF'],
  ['aged_paper', '#F5E6D3'],
  ['book_page', '#FAF0E6'],
  ['newspaper', '#E8E8E8'],
  ['notebook', '#FFFEF0'],
  ['parchment', '#F0E68C'],
  ['weathered', '#D4C4A8'],
  ['coffee_stain', '#E6D5C3'],
  ['old_book', '#E8DCC4'],
  ['recycled', '#D9D4C5'],
  ['cream', '#FFFDD0'],
  ['ivory', '#FFFFF0'],
])
57
+
58
/**
 * Create a simple deterministic pseudo-random generator.
 *
 * Each call replaces the internal state with `sin(state) * 10000` and returns
 * its fractional part, so the same seed always yields the same sequence.
 * Not suitable for anything security-related.
 *
 * @param seed - Starting state. A seed of 0 would stay at 0 forever
 *   (`sin(0) === 0`) and a non-finite seed would yield NaN, so both are
 *   replaced by a fixed non-zero value. All other seeds produce exactly the
 *   sequence they always did.
 * @returns A function that returns the next value in the range [0, 1).
 */
function seededRandom(seed: number) {
  let s = Number.isFinite(seed) && seed !== 0 ? seed : 0.5
  return function () {
    s = Math.sin(s) * 10000
    return s - Math.floor(s)
  }
}
66
+
67
// Canvas pool for performance: canvases are reused across samples instead of
// being recreated for every image.
const canvasPool: { canvas: HTMLCanvasElement; ctx: CanvasRenderingContext2D; inUse: boolean }[] = []

/**
 * Get a canvas of the requested size together with its 2D context.
 *
 * A free pooled canvas with matching dimensions is reused after being reset.
 * Otherwise a new canvas is created; it joins the pool while the pool holds
 * fewer than four canvases, and is a throwaway beyond that.
 *
 * @param width - Canvas width in pixels.
 * @param height - Canvas height in pixels.
 * @returns The canvas and its 2D rendering context.
 * @throws If the browser cannot provide a 2D context.
 */
function getCanvas(width: number, height: number): { canvas: HTMLCanvasElement; ctx: CanvasRenderingContext2D } {
  const pooled = canvasPool.find(
    item => !item.inUse && item.canvas.width === width && item.canvas.height === height
  )
  if (pooled) {
    pooled.inUse = true
    // Reset the transform first: clearRect is affected by the current
    // transform, so clearing before the reset could leave stale pixels.
    pooled.ctx.setTransform(1, 0, 0, 1, 0, 0)
    pooled.ctx.clearRect(0, 0, width, height)
    return { canvas: pooled.canvas, ctx: pooled.ctx }
  }

  const canvas = document.createElement('canvas')
  canvas.width = width
  canvas.height = height
  const ctx = canvas.getContext('2d', { willReadFrequently: true })
  if (!ctx) {
    throw new Error('2D canvas context is not available')
  }

  // Keep at most four canvases alive in the pool
  if (canvasPool.length < 4) {
    canvasPool.push({ canvas, ctx, inUse: true })
  }
  return { canvas, ctx }
}
95
+
96
/**
 * Hand a canvas back to the pool so it can be reused.
 *
 * Canvases that are not part of the pool are ignored.
 *
 * @param canvas - Canvas to release.
 */
function releaseCanvas(canvas: HTMLCanvasElement) {
  const pooledEntry = canvasPool.find(entry => entry.canvas === canvas)
  if (pooledEntry) {
    pooledEntry.inUse = false
  }
}
104
+
105
+ /**
106
+ * Generate dataset directly to a local folder
107
+ * No ZIP, no memory accumulation - writes each file immediately
108
+ */
109
+ export async function generateDatasetDirect(
110
+ directoryHandle: FileSystemDirectoryHandle,
111
+ config: DirectGenerationConfig,
112
+ textData: string[],
113
+ onProgress: (progress: number, message: string) => void,
114
+ abortSignal?: AbortSignal
115
+ ): Promise<DirectGenerationResult> {
116
+ const startTime = Date.now()
117
+
118
+ // Create dataset folder
119
+ const datasetName = `dataset_${new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19)}`
120
+ const datasetHandle = await directoryHandle.getDirectoryHandle(datasetName, { create: true })
121
+ const imagesHandle = await datasetHandle.getDirectoryHandle('images', { create: true })
122
+
123
+ onProgress(1, `Created folder: ${directoryHandle.name}/${datasetName}`)
124
+
125
+ const numSamples = Math.min(config.dataset.size, textData.length)
126
+ const labels: string[] = []
127
+ let cleanCount = 0
128
+ let augmentedCount = 0
129
+
130
+ // Load fonts
131
+ const loadedFonts: Map<string, string> = new Map()
132
+ if (config.fonts.distribution?.length) {
133
+ for (const font of config.fonts.distribution) {
134
+ if (font.dataUrl) {
135
+ try {
136
+ const fontFace = new FontFace(font.family, `url(${font.dataUrl})`)
137
+ await fontFace.load()
138
+ document.fonts.add(fontFace)
139
+ loadedFonts.set(font.name, font.family)
140
+ } catch {
141
+ loadedFonts.set(font.name, 'Arial')
142
+ }
143
+ } else {
144
+ loadedFonts.set(font.name, font.family || 'Arial')
145
+ }
146
+ }
147
+ }
148
+
149
+ onProgress(3, `Loaded ${loadedFonts.size} fonts, starting generation...`)
150
+
151
+ // Get augmentation values
152
+ const augValues = config.augmentation.values || {
153
+ rotation: 5, skew: 3, gaussian_noise: 10, brightness: 15
154
+ }
155
+
156
+ // Generate one sample at a time, write immediately
157
+ for (let i = 0; i < numSamples; i++) {
158
+ if (abortSignal?.aborted) {
159
+ throw new Error('Generation aborted')
160
+ }
161
+
162
+ const text = textData[i % textData.length]
163
+ const random = seededRandom(config.dataset.seed + i * 1000)
164
+
165
+ // Select font
166
+ let fontFamily = 'Arial'
167
+ if (config.fonts.distribution?.length) {
168
+ const roll = random() * 100
169
+ let cumulative = 0
170
+ for (const font of config.fonts.distribution) {
171
+ cumulative += font.percentage
172
+ if (roll < cumulative) {
173
+ fontFamily = loadedFonts.get(font.name) || font.family || 'Arial'
174
+ break
175
+ }
176
+ }
177
+ }
178
+
179
+ // Determine if should augment
180
+ const shouldAugment = config.augmentation.enabled &&
181
+ (random() * 100) < config.augmentation.applyPercentage
182
+
183
+ if (shouldAugment) {
184
+ augmentedCount++
185
+ } else {
186
+ cleanCount++
187
+ }
188
+
189
+ // Render to canvas
190
+ const { canvas, ctx } = getCanvas(config.image.width, config.image.height)
191
+
192
+ try {
193
+ // Background
194
+ const bgStyle = config.image.backgroundStyle || 'clean_white'
195
+ ctx.fillStyle = backgroundColors[bgStyle] || config.image.background || '#FFFFFF'
196
+ ctx.fillRect(0, 0, canvas.width, canvas.height)
197
+
198
+ // Text
199
+ const fontSize = Math.min(canvas.height * 0.6, 48)
200
+ ctx.font = `${fontSize}px "${fontFamily}", Arial, sans-serif`
201
+ ctx.fillStyle = config.image.textColor || '#000000'
202
+ ctx.textAlign = config.image.direction === 'rtl' ? 'right' : 'left'
203
+ ctx.textBaseline = 'middle'
204
+ ctx.direction = config.image.direction as CanvasDirection || 'ltr'
205
+
206
+ const x = config.image.direction === 'rtl' ? canvas.width - 10 : 10
207
+ ctx.fillText(text, x, canvas.height / 2)
208
+
209
+ // Convert to blob
210
+ const blob = await new Promise<Blob>((resolve, reject) => {
211
+ canvas.toBlob((b) => {
212
+ if (b) resolve(b)
213
+ else reject(new Error('Failed to create blob'))
214
+ }, 'image/png')
215
+ })
216
+
217
+ // Write immediately to disk
218
+ const filename = `image_${String(i).padStart(6, '0')}.png`
219
+ const fileHandle = await imagesHandle.getFileHandle(filename, { create: true })
220
+ const writable = await fileHandle.createWritable()
221
+ await writable.write(blob)
222
+ await writable.close()
223
+
224
+ labels.push(`${filename}\t${text}`)
225
+
226
+ } finally {
227
+ releaseCanvas(canvas)
228
+ }
229
+
230
+ // Progress update every 50 samples or at key milestones
231
+ if (i % 50 === 0 || i === numSamples - 1) {
232
+ const progress = Math.round((i / numSamples) * 90) + 5
233
+ onProgress(progress, `Written ${i + 1}/${numSamples} images directly to disk`)
234
+ }
235
+
236
+ // Yield to prevent UI freeze
237
+ if (i % 100 === 0) {
238
+ await new Promise(resolve => setTimeout(resolve, 0))
239
+ }
240
+ }
241
+
242
+ onProgress(95, 'Writing labels file...')
243
+
244
+ // Write labels.txt
245
+ const labelsHandle = await datasetHandle.getFileHandle('labels.txt', { create: true })
246
+ const labelsWritable = await labelsHandle.createWritable()
247
+ await labelsWritable.write(labels.join('\n'))
248
+ await labelsWritable.close()
249
+
250
+ // Write other formats
251
+ if (config.output.formats.includes('jsonl') || config.output.formats.includes('trocr')) {
252
+ const jsonlHandle = await datasetHandle.getFileHandle('data.jsonl', { create: true })
253
+ const jsonlWritable = await jsonlHandle.createWritable()
254
+ const jsonlContent = labels.map(l => {
255
+ const [filename, ...textParts] = l.split('\t')
256
+ return JSON.stringify({ image: `images/${filename}`, text: textParts.join('\t') })
257
+ }).join('\n')
258
+ await jsonlWritable.write(jsonlContent)
259
+ await jsonlWritable.close()
260
+ }
261
+
262
+ if (config.output.formats.includes('huggingface') || config.output.formats.includes('csv')) {
263
+ const csvHandle = await datasetHandle.getFileHandle('metadata.csv', { create: true })
264
+ const csvWritable = await csvHandle.createWritable()
265
+ const csvContent = 'file_name,text\n' + labels.map(l => {
266
+ const [filename, ...textParts] = l.split('\t')
267
+ return `"images/${filename}","${textParts.join('\t').replace(/"/g, '""')}"`
268
+ }).join('\n')
269
+ await csvWritable.write(csvContent)
270
+ await csvWritable.close()
271
+ }
272
+
273
+ onProgress(98, 'Writing metadata...')
274
+
275
+ // Write metadata
276
+ const metadataHandle = await datasetHandle.getFileHandle('metadata.json', { create: true })
277
+ const metadataWritable = await metadataHandle.createWritable()
278
+ await metadataWritable.write(JSON.stringify({
279
+ generated_at: new Date().toISOString(),
280
+ total_samples: numSamples,
281
+ clean_samples: cleanCount,
282
+ augmented_samples: augmentedCount,
283
+ image_size: `${config.image.width}x${config.image.height}`,
284
+ output_path: `${directoryHandle.name}/${datasetName}`
285
+ }, null, 2))
286
+ await metadataWritable.close()
287
+
288
+ const endTime = Date.now()
289
+ const durationSeconds = (endTime - startTime) / 1000
290
+
291
+ onProgress(100, `Complete! Files saved to ${directoryHandle.name}/${datasetName}`)
292
+
293
+ return {
294
+ totalSamples: numSamples,
295
+ durationSeconds,
296
+ outputPath: `${directoryHandle.name}/${datasetName}`,
297
+ cleanSamples: cleanCount,
298
+ augmentedSamples: augmentedCount
299
+ }
300
+ }
301
+
302
/**
 * Check if the File System Access API directory picker is available.
 *
 * Returns `false` (instead of throwing a ReferenceError) when there is no
 * global `window`, e.g. when this module is evaluated outside a browser.
 *
 * @returns `true` when `window.showDirectoryPicker` exists.
 */
export function isDirectGenerationSupported(): boolean {
  return typeof window !== 'undefined' && 'showDirectoryPicker' in window
}
web/lib/generator.ts CHANGED
@@ -2,6 +2,64 @@ import JSZip from 'jszip'
2
  import { saveAs } from 'file-saver'
3
  import { getWorkerPool, isWorkerRenderingAvailable, WorkerTask, WorkerResult } from './worker-pool'
4
  import { isWebGLAvailable, applyGPUAugmentation, GPUAugmentOptions } from './gpu-augmentation'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  export interface FontData {
7
  name: string
@@ -279,139 +337,146 @@ async function renderTextToCanvas(
279
  augValues: Record<string, number>,
280
  random: () => number
281
  ): Promise<{ blob: Blob; augmentations: string[]; backgroundStyle: string }> {
282
- const canvas = document.createElement('canvas')
283
- canvas.width = config.image.width
284
- canvas.height = config.image.height
285
- const ctx = canvas.getContext('2d')!
286
-
287
- // Select and apply background
288
- const bg = selectBackground(config, random)
289
-
290
- if (bg.type === 'color') {
291
- ctx.fillStyle = bg.value
292
- ctx.fillRect(0, 0, canvas.width, canvas.height)
293
- } else {
294
- // Draw custom background image
295
- try {
296
- const img = await loadImage(bg.value)
297
- ctx.drawImage(img, 0, 0, canvas.width, canvas.height)
298
- } catch {
299
- // Fallback to white if image fails
300
- ctx.fillStyle = '#FFFFFF'
301
  ctx.fillRect(0, 0, canvas.width, canvas.height)
 
 
 
 
 
 
 
 
 
 
302
  }
303
- }
304
 
305
- // Apply augmentation transforms if enabled
306
- let appliedAugmentations: string[] = []
307
- if (shouldAugment && config.augmentation.enabled) {
308
- ctx.save()
309
- appliedAugmentations = applyAugmentation(ctx, canvas, augValues, random)
310
- }
311
-
312
- // Set text properties with the selected font
313
- const fontSize = Math.min(canvas.height * 0.6, 48)
314
- ctx.font = `${fontSize}px "${fontFamily}", Arial, sans-serif`
315
- ctx.fillStyle = config.image.textColor
316
- ctx.textAlign = config.image.direction === 'rtl' ? 'right' : 'left'
317
- ctx.textBaseline = 'middle'
318
-
319
- // Draw text
320
- const x = config.image.direction === 'rtl'
321
- ? canvas.width - 10
322
- : 10
323
- const y = canvas.height / 2
324
-
325
- ctx.direction = config.image.direction as CanvasDirection
326
- ctx.fillText(text, x, y)
327
-
328
- if (shouldAugment && config.augmentation.enabled) {
329
- ctx.restore()
330
- }
331
-
332
- // Apply post-processing augmentations (GPU-accelerated when available)
333
- if (shouldAugment && config.augmentation.enabled) {
334
- const applyBrightness = augValues.brightness && random() > 0.5
335
- const applyNoise = augValues.gaussian_noise && random() > 0.6
336
-
337
- if (applyBrightness || applyNoise) {
338
- // Try GPU-accelerated augmentation first
339
- const useGPU = isWebGLAvailable()
340
-
341
- if (useGPU) {
342
- // GPU path - apply all augmentations in a single GPU pass
343
- const gpuOptions: GPUAugmentOptions = {
344
- brightness: applyBrightness ? (random() - 0.5) * augValues.brightness / 50 : 0,
345
- contrast: 1, // Could add contrast augmentation here
346
- noiseAmount: applyNoise ? augValues.gaussian_noise / 200 : 0,
347
- seed: random() * 1000
348
- }
349
 
350
- const gpuResult = applyGPUAugmentation(canvas, gpuOptions)
351
- if (gpuResult && gpuResult !== canvas) {
352
- // Copy GPU result back to main canvas
353
- const gpuCtx = gpuResult.getContext('webgl')
354
- if (gpuCtx) {
355
- const pixels = new Uint8Array(canvas.width * canvas.height * 4)
356
- gpuCtx.readPixels(0, 0, canvas.width, canvas.height, gpuCtx.RGBA, gpuCtx.UNSIGNED_BYTE, pixels)
357
-
358
- // Flip Y and apply to main canvas
359
- const imageData = ctx.createImageData(canvas.width, canvas.height)
360
- const rowSize = canvas.width * 4
361
- for (let y = 0; y < canvas.height; y++) {
362
- const srcRow = (canvas.height - 1 - y) * rowSize
363
- const dstRow = y * rowSize
364
- imageData.data.set(pixels.subarray(srcRow, srcRow + rowSize), dstRow)
365
- }
366
- ctx.putImageData(imageData, 0, 0)
367
 
368
- if (applyBrightness) appliedAugmentations.push('brightness_gpu')
369
- if (applyNoise) appliedAugmentations.push('noise_gpu')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
  }
371
- }
372
- } else {
373
- // CPU fallback path
374
- if (applyBrightness) {
375
- const adjustment = 1 + (random() - 0.5) * augValues.brightness / 50
376
- const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height)
377
- for (let i = 0; i < imageData.data.length; i += 4) {
378
- imageData.data[i] = Math.min(255, imageData.data[i] * adjustment)
379
- imageData.data[i + 1] = Math.min(255, imageData.data[i + 1] * adjustment)
380
- imageData.data[i + 2] = Math.min(255, imageData.data[i + 2] * adjustment)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  }
382
- ctx.putImageData(imageData, 0, 0)
383
- appliedAugmentations.push('brightness')
384
- }
385
 
386
- if (applyNoise) {
387
- const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height)
388
- const noiseLevel = augValues.gaussian_noise / 2
389
- for (let i = 0; i < imageData.data.length; i += 4) {
390
- const noise = (random() - 0.5) * noiseLevel
391
- imageData.data[i] = Math.max(0, Math.min(255, imageData.data[i] + noise))
392
- imageData.data[i + 1] = Math.max(0, Math.min(255, imageData.data[i + 1] + noise))
393
- imageData.data[i + 2] = Math.max(0, Math.min(255, imageData.data[i + 2] + noise))
 
 
 
394
  }
395
- ctx.putImageData(imageData, 0, 0)
396
- appliedAugmentations.push('noise')
397
  }
398
  }
399
  }
400
- }
401
 
402
- // Convert to blob
403
- return new Promise((resolve, reject) => {
404
- canvas.toBlob(
405
- (blob) => {
406
- if (blob) {
407
- resolve({ blob, augmentations: appliedAugmentations, backgroundStyle: bg.styleName })
408
- } else {
409
- reject(new Error('Failed to convert canvas to blob'))
410
- }
411
- },
412
- 'image/png'
413
- )
414
- })
 
 
 
 
 
415
  }
416
 
417
  // Render a single sample (for parallel processing)
@@ -866,18 +931,49 @@ export async function generateDataset(
866
  // Generate zip blob with dynamic compression based on dataset size
867
  onProgress(95, 'Compressing dataset...')
868
 
869
- // Use lower compression for large datasets (faster, less memory)
870
- const compressionLevel = labels.length > 50000 ? 3 :
871
- labels.length > 10000 ? 5 : 6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
872
 
873
- console.log(`Compressing ${labels.length} files with level ${compressionLevel}`)
874
 
875
- const zipBlob = await zip.generateAsync({
876
- type: 'blob',
877
- compression: 'DEFLATE',
878
- compressionOptions: { level: compressionLevel },
879
- streamFiles: true // Stream files to reduce memory usage
880
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
881
 
882
  const endTime = Date.now()
883
  const durationSeconds = (endTime - startTime) / 1000
 
2
  import { saveAs } from 'file-saver'
3
  import { getWorkerPool, isWorkerRenderingAvailable, WorkerTask, WorkerResult } from './worker-pool'
4
  import { isWebGLAvailable, applyGPUAugmentation, GPUAugmentOptions } from './gpu-augmentation'
5
+ import { StorageManager, StorageMode, getStorageManager, StoredSample } from './storage-manager'
6
+
7
+ // ============================================
8
+ // Canvas Pool for Performance
9
+ // ============================================
10
/** Upper bound on the number of canvases kept in the pool. */
const MAX_POOL_SIZE = 8

/** Bookkeeping record for one canvas owned by the pool. */
type PooledCanvas = {
  /** The canvas element itself. */
  canvas: HTMLCanvasElement
  /** 2D context obtained from `canvas` when it was created. */
  ctx: CanvasRenderingContext2D
  /** True while the canvas is handed out to a caller. */
  inUse: boolean
  /** Width the canvas was created with; used to match reuse requests. */
  width: number
  /** Height the canvas was created with; used to match reuse requests. */
  height: number
}

/** Module-level list of pooled canvases (both idle and in use). */
const canvasPool: PooledCanvas[] = []
20
+
21
/**
 * Hand out a 2D canvas of the requested size, reusing an idle pooled canvas
 * with identical dimensions when one exists.
 *
 * A reused canvas has its transform reset to identity and its pixels cleared.
 * Other context state (fill style, font, ...) is left as the previous user set it.
 * When the pool already holds MAX_POOL_SIZE canvases, a temporary canvas is
 * returned that is not tracked by the pool.
 *
 * @param width  Canvas width in pixels.
 * @param height Canvas height in pixels.
 * @returns The canvas and its 2D context.
 * @throws Error if a 2D context cannot be obtained for a new canvas.
 */
function acquireCanvas(width: number, height: number): { canvas: HTMLCanvasElement; ctx: CanvasRenderingContext2D } {
  // Find available canvas with matching dimensions
  const idle = canvasPool.find(item => !item.inUse && item.width === width && item.height === height)
  if (idle) {
    idle.inUse = true
    // Reset the transform BEFORE clearing: clearRect is affected by the current
    // transform, so a leftover rotation/skew would leave part of the bitmap dirty.
    idle.ctx.setTransform(1, 0, 0, 1, 0, 0)
    idle.ctx.clearRect(0, 0, width, height)
    return { canvas: idle.canvas, ctx: idle.ctx }
  }

  const canvas = document.createElement('canvas')
  canvas.width = width
  canvas.height = height
  const ctx = canvas.getContext('2d', { willReadFrequently: true })
  if (!ctx) {
    throw new Error('Failed to get 2D canvas context')
  }

  // Track the new canvas only while the pool has room; otherwise it is a
  // temporary canvas that releaseCanvas will not find in the pool.
  if (canvasPool.length < MAX_POOL_SIZE) {
    canvasPool.push({ canvas, ctx, inUse: true, width, height })
  }
  return { canvas, ctx }
}
50
+
51
/**
 * Mark a pooled canvas as idle so it can be handed out again.
 * A canvas that is not tracked by the pool is ignored.
 *
 * @param canvas The canvas being returned.
 */
function releaseCanvas(canvas: HTMLCanvasElement): void {
  const owner = canvasPool.find(entry => entry.canvas === canvas)
  if (owner !== undefined) {
    owner.inUse = false
  }
}
59
+
60
/**
 * Empty the canvas pool in place, dropping every tracked canvas
 * (including ones currently marked as in use).
 */
export function clearCanvasPool(): void {
  canvasPool.splice(0, canvasPool.length)
}
63
 
64
  export interface FontData {
65
  name: string
 
337
  augValues: Record<string, number>,
338
  random: () => number
339
  ): Promise<{ blob: Blob; augmentations: string[]; backgroundStyle: string }> {
340
+ // Use canvas pool for better performance
341
+ const { canvas, ctx } = acquireCanvas(config.image.width, config.image.height)
342
+
343
+ // Declare variables outside try for proper scoping
344
+ let appliedAugmentations: string[] = []
345
+ let bg: ReturnType<typeof selectBackground>
346
+
347
+ try {
348
+ // Select and apply background
349
+ bg = selectBackground(config, random)
350
+
351
+ if (bg.type === 'color') {
352
+ ctx.fillStyle = bg.value
 
 
 
 
 
 
353
  ctx.fillRect(0, 0, canvas.width, canvas.height)
354
+ } else {
355
+ // Draw custom background image
356
+ try {
357
+ const img = await loadImage(bg.value)
358
+ ctx.drawImage(img, 0, 0, canvas.width, canvas.height)
359
+ } catch {
360
+ // Fallback to white if image fails
361
+ ctx.fillStyle = '#FFFFFF'
362
+ ctx.fillRect(0, 0, canvas.width, canvas.height)
363
+ }
364
  }
 
365
 
366
+ // Apply augmentation transforms if enabled
367
+ if (shouldAugment && config.augmentation.enabled) {
368
+ ctx.save()
369
+ appliedAugmentations = applyAugmentation(ctx, canvas, augValues, random)
370
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
 
372
+ // Set text properties with the selected font
373
+ const fontSize = Math.min(canvas.height * 0.6, 48)
374
+ ctx.font = `${fontSize}px "${fontFamily}", Arial, sans-serif`
375
+ ctx.fillStyle = config.image.textColor
376
+ ctx.textAlign = config.image.direction === 'rtl' ? 'right' : 'left'
377
+ ctx.textBaseline = 'middle'
378
+
379
+ // Draw text
380
+ const x = config.image.direction === 'rtl'
381
+ ? canvas.width - 10
382
+ : 10
383
+ const y = canvas.height / 2
 
 
 
 
 
384
 
385
+ ctx.direction = config.image.direction as CanvasDirection
386
+ ctx.fillText(text, x, y)
387
+
388
+ if (shouldAugment && config.augmentation.enabled) {
389
+ ctx.restore()
390
+ }
391
+
392
+ // Apply post-processing augmentations (GPU-accelerated when available)
393
+ if (shouldAugment && config.augmentation.enabled) {
394
+ const applyBrightness = augValues.brightness && random() > 0.5
395
+ const applyNoise = augValues.gaussian_noise && random() > 0.6
396
+
397
+ if (applyBrightness || applyNoise) {
398
+ // Try GPU-accelerated augmentation first
399
+ const useGPU = isWebGLAvailable()
400
+
401
+ if (useGPU) {
402
+ // GPU path - apply all augmentations in a single GPU pass
403
+ const gpuOptions: GPUAugmentOptions = {
404
+ brightness: applyBrightness ? (random() - 0.5) * augValues.brightness / 50 : 0,
405
+ contrast: 1, // Could add contrast augmentation here
406
+ noiseAmount: applyNoise ? augValues.gaussian_noise / 200 : 0,
407
+ seed: random() * 1000
408
  }
409
+
410
+ const gpuResult = applyGPUAugmentation(canvas, gpuOptions)
411
+ if (gpuResult && gpuResult !== canvas) {
412
+ // Copy GPU result back to main canvas
413
+ const gpuCtx = gpuResult.getContext('webgl')
414
+ if (gpuCtx) {
415
+ const pixels = new Uint8Array(canvas.width * canvas.height * 4)
416
+ gpuCtx.readPixels(0, 0, canvas.width, canvas.height, gpuCtx.RGBA, gpuCtx.UNSIGNED_BYTE, pixels)
417
+
418
+ // Flip Y and apply to main canvas
419
+ const imageData = ctx.createImageData(canvas.width, canvas.height)
420
+ const rowSize = canvas.width * 4
421
+ for (let y = 0; y < canvas.height; y++) {
422
+ const srcRow = (canvas.height - 1 - y) * rowSize
423
+ const dstRow = y * rowSize
424
+ imageData.data.set(pixels.subarray(srcRow, srcRow + rowSize), dstRow)
425
+ }
426
+ ctx.putImageData(imageData, 0, 0)
427
+
428
+ if (applyBrightness) appliedAugmentations.push('brightness_gpu')
429
+ if (applyNoise) appliedAugmentations.push('noise_gpu')
430
+ }
431
+ }
432
+ } else {
433
+ // CPU fallback path
434
+ if (applyBrightness) {
435
+ const adjustment = 1 + (random() - 0.5) * augValues.brightness / 50
436
+ const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height)
437
+ for (let i = 0; i < imageData.data.length; i += 4) {
438
+ imageData.data[i] = Math.min(255, imageData.data[i] * adjustment)
439
+ imageData.data[i + 1] = Math.min(255, imageData.data[i + 1] * adjustment)
440
+ imageData.data[i + 2] = Math.min(255, imageData.data[i + 2] * adjustment)
441
+ }
442
+ ctx.putImageData(imageData, 0, 0)
443
+ appliedAugmentations.push('brightness')
444
  }
 
 
 
445
 
446
+ if (applyNoise) {
447
+ const imageData = ctx.getImageData(0, 0, canvas.width, canvas.height)
448
+ const noiseLevel = augValues.gaussian_noise / 2
449
+ for (let i = 0; i < imageData.data.length; i += 4) {
450
+ const noise = (random() - 0.5) * noiseLevel
451
+ imageData.data[i] = Math.max(0, Math.min(255, imageData.data[i] + noise))
452
+ imageData.data[i + 1] = Math.max(0, Math.min(255, imageData.data[i + 1] + noise))
453
+ imageData.data[i + 2] = Math.max(0, Math.min(255, imageData.data[i + 2] + noise))
454
+ }
455
+ ctx.putImageData(imageData, 0, 0)
456
+ appliedAugmentations.push('noise')
457
  }
 
 
458
  }
459
  }
460
  }
 
461
 
462
+ // Convert to blob and release canvas when done
463
+ return new Promise((resolve, reject) => {
464
+ canvas.toBlob(
465
+ (blob) => {
466
+ releaseCanvas(canvas) // Return canvas to pool
467
+ if (blob) {
468
+ resolve({ blob, augmentations: appliedAugmentations, backgroundStyle: bg.styleName })
469
+ } else {
470
+ reject(new Error('Failed to convert canvas to blob'))
471
+ }
472
+ },
473
+ 'image/png'
474
+ )
475
+ })
476
+ } catch (err) {
477
+ releaseCanvas(canvas) // Ensure canvas is released on error
478
+ throw err
479
+ }
480
  }
481
 
482
  // Render a single sample (for parallel processing)
 
931
  // Generate zip blob with dynamic compression based on dataset size
932
  onProgress(95, 'Compressing dataset...')
933
 
934
+ // Use lower/no compression for large datasets (faster, less memory)
935
+ // STORE = no compression, uses less memory
936
+ let compressionType: 'DEFLATE' | 'STORE' = 'DEFLATE'
937
+ let compressionLevel = 6
938
+
939
+ if (labels.length > 20000) {
940
+ // Very large: no compression (STORE) to prevent memory issues
941
+ compressionType = 'STORE'
942
+ compressionLevel = 0
943
+ console.log(`Large dataset (${labels.length}): Using STORE (no compression) to prevent memory issues`)
944
+ } else if (labels.length > 10000) {
945
+ compressionLevel = 1 // Minimal compression
946
+ console.log(`Medium-large dataset (${labels.length}): Using minimal compression level 1`)
947
+ } else if (labels.length > 5000) {
948
+ compressionLevel = 3
949
+ console.log(`Medium dataset (${labels.length}): Using compression level 3`)
950
+ }
951
 
952
+ console.log(`Compressing ${labels.length} files with ${compressionType}, level ${compressionLevel}`)
953
 
954
+ let zipBlob: Blob
955
+ try {
956
+ zipBlob = await zip.generateAsync({
957
+ type: 'blob',
958
+ compression: compressionType,
959
+ compressionOptions: compressionType === 'DEFLATE' ? { level: compressionLevel } : undefined,
960
+ streamFiles: true // Stream files to reduce memory usage
961
+ })
962
+ } catch (memoryError: any) {
963
+ console.error('ZIP generation failed, trying without compression:', memoryError)
964
+ onProgress(96, 'Retrying with no compression...')
965
+
966
+ // Retry with no compression at all
967
+ try {
968
+ zipBlob = await zip.generateAsync({
969
+ type: 'blob',
970
+ compression: 'STORE',
971
+ streamFiles: true
972
+ })
973
+ } catch (finalError) {
974
+ throw new Error(`Failed to create ZIP: Memory limit exceeded. Try using Local Folder storage mode instead of Memory for large datasets.`)
975
+ }
976
+ }
977
 
978
  const endTime = Date.now()
979
  const durationSeconds = (endTime - startTime) / 1000
web/lib/storage-manager.ts ADDED
@@ -0,0 +1,468 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Storage Manager for OCR Dataset Generator
2
+ // Supports multiple storage backends: Memory, IndexedDB, Local Folder, HuggingFace
3
+
4
/** Storage backend selector. */
export type StorageMode = 'memory' | 'indexeddb' | 'local' | 'huggingface'

/** Options describing where and how generated samples are stored. */
export interface StorageConfig {
  mode: StorageMode
  /** For 'local' mode - user selected folder. */
  localPath?: string
  /** For 'huggingface' mode - /data directory. */
  dataDir?: string
  cleanupHours?: number
}

/** One generated sample: the image blob plus its label text. */
export interface StoredSample {
  /** Unique key; used as the IndexedDB object store key path. */
  id: string
  filename: string
  blob: Blob
  label: string
}

// IndexedDB database name, object store name and schema version
const DB_NAME = 'ocr_dataset_generator'
const STORE_NAME = 'samples'
const DB_VERSION = 1
24
+
25
+ // ============================================
26
+ // IndexedDB Storage Implementation
27
+ // ============================================
28
+
29
/**
 * Promise wrapper around a single IndexedDB object store of samples.
 *
 * The connection is opened lazily by `init()` and shared by every operation.
 * Each operation runs in its own transaction and settles when that
 * transaction completes or fails, so aborted writes are reported as rejections.
 */
class IndexedDBStorage {
  private db: IDBDatabase | null = null
  private initPromise: Promise<void> | null = null

  /**
   * Open the database, creating the object store during an upgrade if needed.
   *
   * Concurrent callers share one in-flight open. A failed open is not cached:
   * the stored promise is dropped so that a later call can retry.
   */
  async init(): Promise<void> {
    if (this.db) return
    if (this.initPromise) return this.initPromise

    const opening: Promise<void> = new Promise<void>((resolve, reject) => {
      const request = indexedDB.open(DB_NAME, DB_VERSION)

      request.onerror = () => reject(request.error)
      request.onsuccess = () => {
        this.db = request.result
        resolve()
      }

      request.onupgradeneeded = () => {
        const db = request.result
        if (!db.objectStoreNames.contains(STORE_NAME)) {
          db.createObjectStore(STORE_NAME, { keyPath: 'id' })
        }
      }
    }).catch((err: unknown) => {
      // Forget the failed attempt (unless a newer one replaced it) so init() can retry.
      if (this.initPromise === opening) {
        this.initPromise = null
      }
      throw err
    })

    this.initPromise = opening
    return opening
  }

  /** Insert or overwrite a sample, keyed by `sample.id`. */
  async store(sample: StoredSample): Promise<void> {
    await this.withStore('readwrite', store => store.put(sample))
  }

  /** Read every stored sample. */
  async getAll(): Promise<StoredSample[]> {
    return this.withStore('readonly', store => store.getAll() as IDBRequest<StoredSample[]>)
  }

  /** Number of stored samples. */
  async count(): Promise<number> {
    return this.withStore('readonly', store => store.count())
  }

  /** Remove every stored sample. */
  async clear(): Promise<void> {
    await this.withStore('readwrite', store => store.clear())
  }

  /** Close the connection; the next operation reopens it. */
  close(): void {
    if (this.db) {
      this.db.close()
      this.db = null
      this.initPromise = null
    }
  }

  /**
   * Run one request against the sample store inside its own transaction.
   *
   * Resolves with the request result once the transaction completes; rejects
   * if the transaction errors or aborts.
   */
  private async withStore<T>(
    mode: IDBTransactionMode,
    operation: (store: IDBObjectStore) => IDBRequest<T>
  ): Promise<T> {
    await this.init()
    const db = this.db
    if (!db) {
      throw new Error('IndexedDB connection is not open')
    }

    return new Promise<T>((resolve, reject) => {
      const transaction = db.transaction([STORE_NAME], mode)
      const request = operation(transaction.objectStore(STORE_NAME))

      transaction.oncomplete = () => resolve(request.result)
      transaction.onerror = () => reject(transaction.error ?? request.error)
      transaction.onabort = () =>
        reject(transaction.error ?? request.error ?? new Error('IndexedDB transaction aborted'))
    })
  }
}
109
+
110
+ // ============================================
111
+ // File System Access API (Local Folder)
112
+ // ============================================
113
+
114
/**
 * Writes a dataset into a user-selected folder via the File System Access API.
 *
 * `selectFolder()` must succeed before any of the write methods are used.
 */
class FileSystemStorage {
  private directoryHandle: FileSystemDirectoryHandle | null = null
  private imagesHandle: FileSystemDirectoryHandle | null = null
  private datasetName: string = ''

  /**
   * Ask the user for a folder and create `<dataset_timestamp>/images` inside it.
   *
   * Instance state is updated only after every step has succeeded, so a
   * cancelled picker or a failed folder creation leaves any previous
   * selection untouched.
   *
   * @returns `<picked folder name>/<dataset name>`, or `null` on any failure
   *          (the error is logged).
   */
  async selectFolder(): Promise<string | null> {
    try {
      // @ts-ignore - File System Access API
      const root: FileSystemDirectoryHandle = await window.showDirectoryPicker({
        mode: 'readwrite',
        startIn: 'documents'
      })
      if (!root) throw new Error('No directory handle')

      // Create timestamped dataset folder
      const datasetName = `dataset_${new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19)}`
      const datasetHandle = await root.getDirectoryHandle(datasetName, { create: true })
      const imagesHandle = await datasetHandle.getDirectoryHandle('images', { create: true })

      this.directoryHandle = root
      this.datasetName = datasetName
      this.imagesHandle = imagesHandle

      return `${root.name}/${datasetName}`
    } catch (err) {
      console.error('Failed to select folder:', err)
      return null
    }
  }

  /** Write one image blob into the dataset's `images` folder. */
  async writeImage(filename: string, blob: Blob): Promise<void> {
    if (!this.imagesHandle) throw new Error('No folder selected')
    await this.writeFile(this.imagesHandle, filename, blob)
  }

  /** Write a text file (default `labels.txt`) into the dataset folder. */
  async writeLabels(content: string, filename: string = 'labels.txt'): Promise<void> {
    const datasetHandle = await this.getDatasetHandle()
    await this.writeFile(datasetHandle, filename, content)
  }

  /** Write `metadata.json` (pretty-printed) into the dataset folder. */
  async writeMetadata(metadata: object): Promise<void> {
    const datasetHandle = await this.getDatasetHandle()
    await this.writeFile(datasetHandle, 'metadata.json', JSON.stringify(metadata, null, 2))
  }

  /** `<picked folder name>/<dataset name>`, or `null` before a folder is selected. */
  getPath(): string | null {
    return this.directoryHandle ? `${this.directoryHandle.name}/${this.datasetName}` : null
  }

  /** Whether the directory picker exists; `false` when there is no global `window`. */
  isSupported(): boolean {
    return typeof window !== 'undefined' && 'showDirectoryPicker' in window
  }

  /** Resolve the handle of the dataset folder created by `selectFolder()`. */
  private async getDatasetHandle(): Promise<FileSystemDirectoryHandle> {
    if (!this.directoryHandle || !this.datasetName) throw new Error('No folder selected')
    return this.directoryHandle.getDirectoryHandle(this.datasetName, { create: false })
  }

  /** Create or overwrite `filename` in `dir`; the stream is aborted if the write fails. */
  private async writeFile(
    dir: FileSystemDirectoryHandle,
    filename: string,
    data: Blob | string
  ): Promise<void> {
    const fileHandle = await dir.getFileHandle(filename, { create: true })
    const writable = await fileHandle.createWritable()
    try {
      await writable.write(data)
    } catch (err) {
      // Do not leave the stream open after a failed write; report the write error itself.
      await writable.abort().catch(() => undefined)
      throw err
    }
    await writable.close()
  }
}
174
+
175
+ // ============================================
176
+ // Server Storage (Local/HuggingFace via API)
177
+ // ============================================
178
+
179
/**
 * Sends dataset files to the `/api/storage` endpoint.
 *
 * Every request carries the session id generated in the constructor and the
 * storage mode.
 */
class ServerStorage {
  private sessionId: string = ''
  private mode: 'local' | 'huggingface' = 'local'

  /**
   * @param mode Value sent as `mode` with every request.
   */
  constructor(mode: 'local' | 'huggingface' = 'local') {
    this.mode = mode
    const stamp = Date.now()
    const suffix = Math.random().toString(36).slice(2, 8)
    this.sessionId = `dataset_${stamp}_${suffix}`
  }

  /** Upload one image as multipart form data. */
  async writeImage(filename: string, blob: Blob): Promise<void> {
    const form = new FormData()
    form.append('file', blob, filename)
    form.append('sessionId', this.sessionId)
    form.append('mode', this.mode)
    form.append('type', 'image')

    const response = await fetch('/api/storage', { method: 'POST', body: form })
    ServerStorage.assertOk(response, 'Failed to write image')
  }

  /** Send the label lines, joined with newlines. */
  async writeLabels(labels: string[]): Promise<void> {
    await this.postJson('labels', 'Failed to write labels', labels.join('\n'))
  }

  /** Send the metadata object as pretty-printed JSON text. */
  async writeMetadata(metadata: object): Promise<void> {
    await this.postJson('metadata', 'Failed to write metadata', JSON.stringify(metadata, null, 2))
  }

  /** Send the `finalize` request and return the parsed JSON response. */
  async finalize(): Promise<{ path: string; downloadUrl?: string }> {
    const response = await this.postJson('finalize', 'Failed to finalize')
    return response.json()
  }

  /** Session id generated in the constructor. */
  getSessionId(): string {
    return this.sessionId
  }

  /** POST a JSON body `{ sessionId, mode, type[, content] }` and check the status. */
  private async postJson(type: string, failure: string, content?: string): Promise<Response> {
    const payload: Record<string, string> = {
      sessionId: this.sessionId,
      mode: this.mode,
      type
    }
    if (content !== undefined) {
      payload.content = content
    }

    const response = await fetch('/api/storage', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload)
    })
    ServerStorage.assertOk(response, failure)
    return response
  }

  /** Throw `<failure>: <statusText>` for a non-OK response. */
  private static assertOk(response: Response, failure: string): void {
    if (!response.ok) {
      throw new Error(`${failure}: ${response.statusText}`)
    }
  }
}
261
+
262
+ // ============================================
263
+ // Canvas Pool for Performance
264
+ // ============================================
265
+
266
/** Bookkeeping record for one canvas owned by a CanvasPool. */
interface PooledCanvas {
  canvas: HTMLCanvasElement
  ctx: CanvasRenderingContext2D
  /** True while the canvas is handed out to a caller. */
  inUse: boolean
}

/**
 * Small pool of reusable 2D canvases, matched by exact width and height.
 */
class CanvasPool {
  private pool: PooledCanvas[] = []
  private maxSize: number = 8

  /**
   * Hand out a canvas of the requested size.
   *
   * An idle pooled canvas with the same dimensions is reused after resetting
   * its transform to identity and clearing its pixels. Otherwise a new canvas
   * is created; it is tracked by the pool only while the pool holds fewer than
   * `maxSize` canvases, so callers never wait for one to be released.
   *
   * @throws Error if a 2D context cannot be obtained for a new canvas.
   */
  acquire(width: number, height: number): { canvas: HTMLCanvasElement; ctx: CanvasRenderingContext2D } {
    // Find available canvas with matching size
    const idle = this.pool.find(
      item => !item.inUse && item.canvas.width === width && item.canvas.height === height
    )
    if (idle) {
      idle.inUse = true
      // Reset the transform first: clearRect is affected by the current
      // transform, so a leftover one would leave part of the bitmap dirty.
      idle.ctx.setTransform(1, 0, 0, 1, 0, 0)
      idle.ctx.clearRect(0, 0, width, height)
      return { canvas: idle.canvas, ctx: idle.ctx }
    }

    const canvas = document.createElement('canvas')
    canvas.width = width
    canvas.height = height
    const ctx = canvas.getContext('2d')
    if (!ctx) {
      throw new Error('Failed to get 2D canvas context')
    }

    // Pool full: the canvas is temporary and release() will not find it.
    if (this.pool.length < this.maxSize) {
      this.pool.push({ canvas, ctx, inUse: true })
    }
    return { canvas, ctx }
  }

  /** Mark a pooled canvas as idle; canvases not tracked by the pool are ignored. */
  release(canvas: HTMLCanvasElement): void {
    const owner = this.pool.find(item => item.canvas === canvas)
    if (owner) {
      owner.inUse = false
    }
  }

  /** Drop every pooled canvas. */
  clear(): void {
    this.pool = []
  }
}
319
+
320
+ // ============================================
321
+ // Unified Storage Manager
322
+ // ============================================
323
+
324
/**
 * Single entry point over the storage backends, selected by `config.mode`:
 *
 * - `'memory'`      keeps samples in an in-process array;
 * - `'indexeddb'`   delegates to `IndexedDBStorage`;
 * - `'local'`       delegates to `FileSystemStorage`;
 * - `'huggingface'` delegates to `ServerStorage`.
 *
 * It also owns a `CanvasPool` that callers can borrow canvases from.
 */
export class StorageManager {
  private readonly config: StorageConfig
  private indexedDB: IndexedDBStorage | null = null
  private fileSystem: FileSystemStorage | null = null
  private serverStorage: ServerStorage | null = null
  private memoryStorage: StoredSample[] = []
  private readonly canvasPool: CanvasPool

  /**
   * @param config Storage configuration; `config.mode` decides which backend
   *               is instantiated.
   */
  constructor(config: StorageConfig) {
    this.config = config
    this.canvasPool = new CanvasPool()
    this.initialize()
  }

  /** Instantiate the backend for the configured mode (none for `'memory'`). */
  private initialize(): void {
    switch (this.config.mode) {
      case 'indexeddb':
        this.indexedDB = new IndexedDBStorage()
        break
      case 'local':
        this.fileSystem = new FileSystemStorage()
        break
      case 'huggingface':
        this.serverStorage = new ServerStorage('huggingface')
        break
      // 'memory' uses memoryStorage array
    }
  }

  /**
   * Let the file-system backend pick the output folder.
   *
   * @returns Whatever `FileSystemStorage.selectFolder()` resolves to.
   * @throws Error when the manager is not in `'local'` mode.
   */
  async selectLocalFolder(): Promise<string | null> {
    if (this.config.mode !== 'local' || !this.fileSystem) {
      throw new Error('Local folder selection only available in local mode')
    }
    return this.fileSystem.selectFolder()
  }

  /**
   * Whether the runtime exposes `window.showDirectoryPicker`.
   *
   * Returns false instead of throwing where `window` does not exist
   * (for example during server-side rendering).
   */
  isLocalFolderSupported(): boolean {
    return typeof window !== 'undefined' && 'showDirectoryPicker' in window
  }

  /** The canvas pool owned by this manager. */
  getCanvasPool(): CanvasPool {
    return this.canvasPool
  }

  /**
   * Persist one sample through the active backend.
   *
   * In `'local'` and `'huggingface'` modes only `sample.filename` and
   * `sample.blob` are written here; the other modes store the whole sample.
   */
  async storeSample(sample: StoredSample): Promise<void> {
    // The `!` assertions rely on initialize() having created the backend
    // that corresponds to the configured mode.
    switch (this.config.mode) {
      case 'memory':
        this.memoryStorage.push(sample)
        break
      case 'indexeddb':
        await this.indexedDB!.store(sample)
        break
      case 'local':
        await this.fileSystem!.writeImage(sample.filename, sample.blob)
        break
      case 'huggingface':
        await this.serverStorage!.writeImage(sample.filename, sample.blob)
        break
    }
  }

  /**
   * Number of samples held by the manager.
   *
   * @returns The count for `'memory'` and `'indexeddb'`; always 0 for the
   *          other modes, which do not track a count locally.
   */
  async getSampleCount(): Promise<number> {
    switch (this.config.mode) {
      case 'memory':
        return this.memoryStorage.length
      case 'indexeddb':
        return this.indexedDB!.count()
      default:
        return 0 // Server modes don't track count locally
    }
  }

  /**
   * All stored samples.
   *
   * @returns The internal array itself (not a copy) in `'memory'` mode, the
   *          backend's result in `'indexeddb'` mode, and an empty array for
   *          the other modes.
   */
  async getAllSamples(): Promise<StoredSample[]> {
    switch (this.config.mode) {
      case 'memory':
        return this.memoryStorage
      case 'indexeddb':
        return this.indexedDB!.getAll()
      default:
        return [] // Server modes don't support this
    }
  }

  /**
   * Finish the dataset by writing labels and metadata where the backend
   * stores files.
   *
   * @param labels   Label lines; joined with newlines for `'local'`, passed
   *                 as an array for `'huggingface'`.
   * @param metadata Metadata object handed to the backend.
   * @returns `{ blob: undefined }` for `'memory'` / `'indexeddb'` (nothing is
   *          written; `labels` and `metadata` are unused), or `{ path }` for
   *          `'local'` / `'huggingface'`.
   */
  async finalize(labels: string[], metadata: object): Promise<{ path?: string; blob?: Blob }> {
    switch (this.config.mode) {
      case 'memory':
      case 'indexeddb':
        // Nothing to write here; presumably the caller builds the archive
        // from getAllSamples() - confirm against the call site.
        return { blob: undefined }
      case 'local':
        await this.fileSystem!.writeLabels(labels.join('\n'))
        await this.fileSystem!.writeMetadata(metadata)
        return { path: this.fileSystem!.getPath() || undefined }
      case 'huggingface': {
        await this.serverStorage!.writeLabels(labels)
        await this.serverStorage!.writeMetadata(metadata)
        const result = await this.serverStorage!.finalize()
        return { path: result.path }
      }
    }
  }

  /**
   * Drop in-memory samples, clear the IndexedDB backend if present and
   * empty the canvas pool. File-system and server backends are untouched.
   */
  async clear(): Promise<void> {
    this.memoryStorage = []
    if (this.indexedDB) {
      await this.indexedDB.clear()
    }
    this.canvasPool.clear()
  }

  /** The storage mode this manager was configured with. */
  getMode(): StorageMode {
    return this.config.mode
  }

  /** Close the IndexedDB backend if present and empty the canvas pool. */
  close(): void {
    if (this.indexedDB) {
      this.indexedDB.close()
    }
    this.canvasPool.clear()
  }
}
446
+
447
+ // ============================================
448
+ // Singleton Instance
449
+ // ============================================
450
+
451
/** Module-wide manager shared by all callers; null until first requested. */
let storageManagerInstance: StorageManager | null = null

/**
 * Return the shared `StorageManager`, creating or replacing it on demand.
 *
 * The existing instance is reused unless there is none yet, or a `config`
 * is supplied whose mode differs from the current instance's mode. In the
 * latter case the old instance is closed before the new one is built.
 *
 * @param config Optional configuration; defaults to `{ mode: 'memory' }`
 *               when a new instance has to be created without one.
 * @returns The shared manager instance.
 */
export function getStorageManager(config?: StorageConfig): StorageManager {
  const current = storageManagerInstance
  const modeChanged = Boolean(current && config && config.mode !== current.getMode())

  // Fast path: an instance exists and nothing asks for a different mode.
  if (current && !modeChanged) {
    return current
  }

  if (current) {
    current.close()
  }

  const fresh = new StorageManager(config || { mode: 'memory' })
  storageManagerInstance = fresh
  return fresh
}
462
+
463
/**
 * Close and discard the shared `StorageManager`, if one exists.
 * Does nothing when no instance has been created.
 */
export function resetStorageManager(): void {
  if (!storageManagerInstance) {
    return
  }
  storageManagerInstance.close()
  storageManagerInstance = null
}