Spaces:
Running
Running
/**
 * Configuration types for the OCR Dataset Generator
 */
// ============ Input Configuration ============
/** Allowed values for `InputConfig.segmentation`. */
export type SegmentationMode = 'character' | 'word' | 'ngram' | 'sentence' | 'line';
/** Shape of the `input` section of `Config`. */
export interface InputConfig {
  /** Input file reference; `DEFAULT_CONFIG` uses the relative path './input/text.txt'. */
  file: string;
  /** Encoding name; `DEFAULT_CONFIG` uses 'utf-8'. */
  encoding?: string;
  segmentation: SegmentationMode;
  /** NOTE(review): presumably only relevant when `segmentation` is 'ngram' — confirm against the consumer. */
  ngram_size?: number;
  /** NOTE(review): length bounds; the unit (characters vs. tokens) is not specified in this file. */
  min_length?: number;
  max_length?: number;
  shuffle?: boolean;
}
// ============ Dataset Configuration ============
/** Shape of the `dataset` section of `Config`. */
export interface DatasetConfig {
  /** `DEFAULT_CONFIG` uses 10000. */
  size: number;
  seed?: number;
  /** `DEFAULT_CONFIG` expresses the splits as fractions (0.9 and 0.1). */
  train_split?: number;
  val_split?: number;
}
// ============ Image Configuration ============
/** Allowed values for `ImageConfig.direction`. */
export type TextDirection = 'rtl' | 'ltr';
/** Allowed values for `ImageConfig.alignment`. */
export type TextAlignment = 'left' | 'center' | 'right';
/**
 * Per-side padding amounts; the object form of `ImageConfig.padding`.
 * NOTE(review): the unit (presumably pixels) is not stated in this file.
 */
export interface PaddingConfig {
  left: number;
  right: number;
  top: number;
  bottom: number;
}
/** Shape of the `image` section of `Config`. */
export interface ImageConfig {
  width: number;
  height: number;
  dpi?: number;
  /**
   * Either per-side values or a single number.
   * NOTE(review): a bare number presumably applies to all four sides — confirm against the consumer.
   */
  padding?: PaddingConfig | number;
  /** `DEFAULT_CONFIG` uses a '#RRGGBB' hex string; other accepted formats are not specified here. */
  background: string;
  /** `DEFAULT_CONFIG` uses a '#RRGGBB' hex string; other accepted formats are not specified here. */
  text_color: string;
  direction: TextDirection;
  alignment?: TextAlignment;
  antialiasing?: boolean;
}
// ============ Font Configuration ============
/** Allowed values for `FontSizeConfig.distribution`. */
export type SizeDistribution = 'uniform' | 'normal' | 'random';
/** Font-size bounds plus an optional distribution selector; used as `FontConfig.size`. */
export interface FontSizeConfig {
  min: number;
  max: number;
  distribution?: SizeDistribution;
}
/** One element of `FontConfig.distribution`. */
export interface FontDistributionEntry {
  family: string;
  /** NOTE(review): scale (0–1 vs. 0–100) and whether entries must sum to a total are not specified here. */
  percentage: number;
  styles?: string[];
  weight?: number[];
}
/** Shape of the `fonts` section of `Config`. */
export interface FontConfig {
  directory: string;
  /** NOTE(review): presumably a font family name used when a requested font is unavailable — confirm. */
  fallback?: string;
  size: FontSizeConfig;
  distribution: FontDistributionEntry[];
}
// ============ Unicode Configuration ============
/** Allowed values for `UnicodeConfig.normalization`. */
export type NormalizationForm = 'NFC' | 'NFD' | 'none';
/** Shape of the `unicode` section of `Config`. */
export interface UnicodeConfig {
  normalization: NormalizationForm;
  /** NOTE(review): the script identifier format (name vs. code) is not specified in this file. */
  enforce_script?: string;
  reject_mixed?: boolean;
  allowed_scripts?: string[];
  preserve_diacritics?: boolean;
}
// ============ Augmentation Configuration ============
/** Fields shared by every per-transform config in this file (each one extends this interface). */
export interface RangeConfig {
  enabled: boolean;
  /** NOTE(review): scale (0–1 vs. 0–100) is not specified in this file — confirm against the consumer. */
  percentage: number;
}
/** Settings for the `rotation` entry of `TransformsConfig`. */
export interface RotationConfig extends RangeConfig {
  /** Two-number tuple. NOTE(review): presumably [min, max]; unit not specified here. */
  range: [number, number];
}
/** Settings for the `skew` entry of `TransformsConfig`, with separate tuples per axis. */
export interface SkewConfig extends RangeConfig {
  /** Two-number tuple. NOTE(review): presumably [min, max]; unit not specified here. */
  x_range: [number, number];
  /** Two-number tuple. NOTE(review): presumably [min, max]; unit not specified here. */
  y_range: [number, number];
}
/** Settings for the `perspective` entry of `TransformsConfig`. */
export interface PerspectiveConfig extends RangeConfig {
  /** Two-number tuple. NOTE(review): presumably [min, max]; scale not specified here. */
  intensity: [number, number];
}
/** Settings for the `gaussian_blur` entry of `TransformsConfig`. */
export interface GaussianBlurConfig extends RangeConfig {
  /** Two-number tuple. NOTE(review): presumably [min, max] for the blur sigma — confirm. */
  sigma: [number, number];
}
/** Settings for the `motion_blur` entry of `TransformsConfig`. */
export interface MotionBlurConfig extends RangeConfig {
  /** Two-number tuple. NOTE(review): presumably [min, max]; unit not specified here. */
  kernel_size: [number, number];
  /** Two-number tuple. NOTE(review): presumably [min, max]; angular unit not specified here. */
  angle: [number, number];
}
/** Settings for the `gaussian_noise` entry of `TransformsConfig`. */
export interface GaussianNoiseConfig extends RangeConfig {
  /** Single value, unlike `std`, which is a tuple. */
  mean: number;
  /** Two-number tuple. NOTE(review): presumably [min, max]; scale not specified here. */
  std: [number, number];
}
/** Settings for the `salt_pepper` entry of `TransformsConfig`. */
export interface SaltPepperConfig extends RangeConfig {
  /** Two-number tuple. NOTE(review): presumably [min, max]; scale not specified here. */
  amount: [number, number];
}
/** Settings for the `jpeg_artifacts` entry of `TransformsConfig`. */
export interface JpegArtifactsConfig extends RangeConfig {
  /** Two-number tuple. NOTE(review): presumably [min, max]; quality scale not specified here. */
  quality: [number, number];
}
/** Settings for the `resolution_degradation` entry of `TransformsConfig`. */
export interface ResolutionDegradationConfig extends RangeConfig {
  /** Two-number tuple. NOTE(review): presumably [min, max] — confirm against the consumer. */
  scale_factor: [number, number];
}
/** Settings for the `paper_texture` entry of `TransformsConfig`. */
export interface PaperTextureConfig extends RangeConfig {
  /** NOTE(review): presumably texture file paths or names — the format is not specified here. */
  textures?: string[];
  /** Two-number tuple. NOTE(review): presumably [min, max]; scale not specified here. */
  opacity: [number, number];
}
/** Settings for the `shadow` entry of `TransformsConfig`. */
export interface ShadowConfig extends RangeConfig {
  /** Optional list of sides, each one of 'left', 'right', 'top' or 'bottom'. */
  position?: ('left' | 'right' | 'top' | 'bottom')[];
  /** Two-number tuple. NOTE(review): presumably [min, max]; scale not specified here. */
  intensity: [number, number];
}
/** Settings for the `ink_bleed` entry of `TransformsConfig`. */
export interface InkBleedConfig extends RangeConfig {
  /** Two-number tuple. NOTE(review): presumably [min, max]; scale not specified here. */
  intensity: [number, number];
}
/** Settings for the `brightness` entry of `TransformsConfig`. */
export interface BrightnessConfig extends RangeConfig {
  /** Two-number tuple. NOTE(review): presumably [min, max]; scale not specified here. */
  range: [number, number];
}
/** Settings for the `contrast` entry of `TransformsConfig`. */
export interface ContrastConfig extends RangeConfig {
  /** Two-number tuple. NOTE(review): presumably [min, max]; scale not specified here. */
  range: [number, number];
}
/**
 * Per-transform settings, keyed by transform name; used as `AugmentationConfig.transforms`.
 * Every entry is optional.
 */
export interface TransformsConfig {
  rotation?: RotationConfig;
  skew?: SkewConfig;
  perspective?: PerspectiveConfig;
  gaussian_blur?: GaussianBlurConfig;
  motion_blur?: MotionBlurConfig;
  gaussian_noise?: GaussianNoiseConfig;
  salt_pepper?: SaltPepperConfig;
  jpeg_artifacts?: JpegArtifactsConfig;
  resolution_degradation?: ResolutionDegradationConfig;
  paper_texture?: PaperTextureConfig;
  shadow?: ShadowConfig;
  ink_bleed?: InkBleedConfig;
  brightness?: BrightnessConfig;
  contrast?: ContrastConfig;
}
/** Allowed values for `AugmentationConfig.preset`. */
export type AugmentationPreset = 'clean' | 'balanced' | 'heavy' | 'document_scan' | 'mobile_camera';
/** Shape of the `augmentation` section of `Config`. */
export interface AugmentationConfig {
  enabled: boolean;
  /** NOTE(review): how a preset interacts with explicit `transforms` is not specified in this file. */
  preset?: AugmentationPreset;
  /** NOTE(review): scale (0–1 vs. 0–100) is not specified in this file — confirm against the consumer. */
  apply_percentage: number;
  max_transforms_per_sample?: number;
  transforms?: TransformsConfig;
}
// ============ Output Configuration ============
/** Allowed elements of `OutputConfig.formats`. */
export type OutputFormat = 'crnn' | 'trocr' | 'sar' | 'paddleocr' | 'tesseract' | 'csv' | 'tsv' | 'json' | 'jsonl' | 'lmdb';
/**
 * Optional per-format settings, keyed by format name. Only 'crnn', 'trocr' and 'csv'
 * have option shapes declared here.
 */
export interface OutputOptions {
  crnn?: { delimiter?: string };
  trocr?: { format?: 'json' | 'jsonl' };
  csv?: { include_metadata?: boolean };
}
/** Shape of the `output` section of `Config`. */
export interface OutputConfig {
  directory: string;
  /** One or more formats may be listed. */
  formats: OutputFormat[];
  options?: OutputOptions;
}
// ============ Parallelism Configuration ============
/** Shape of the optional `parallelism` section of `Config`. */
export interface ParallelismConfig {
  /** Either the literal 'auto' or an explicit number; `DEFAULT_CONFIG` uses 'auto'. */
  workers: 'auto' | number;
  batch_size?: number;
  /** NOTE(review): the string format (e.g. size with a unit suffix) is not specified in this file. */
  memory_limit?: string;
}
// ============ Logging Configuration ============
/** Allowed values for `LoggingConfig.level`. */
export type LogLevel = 'debug' | 'info' | 'warn' | 'error';
/** Shape of the optional `logging` section of `Config`. */
export interface LoggingConfig {
  level: LogLevel;
  /** NOTE(review): presumably a log file path — confirm against the consumer. */
  file?: string;
  progress_bar?: boolean;
  stats_report?: boolean;
}
// ============ Recovery Configuration ============
/** Shape of the optional `recovery` section of `Config`. */
export interface RecoveryConfig {
  enabled: boolean;
  /** `DEFAULT_CONFIG` uses 1000. NOTE(review): the unit (presumably samples) is not specified in this file. */
  checkpoint_interval?: number;
  resume_on_failure?: boolean;
}
// ============ Main Configuration ============
/**
 * Top-level configuration object. Seven sections are required; `parallelism`,
 * `logging` and `recovery` are optional.
 */
export interface Config {
  input: InputConfig;
  dataset: DatasetConfig;
  image: ImageConfig;
  fonts: FontConfig;
  unicode: UnicodeConfig;
  augmentation: AugmentationConfig;
  output: OutputConfig;
  parallelism?: ParallelismConfig;
  logging?: LoggingConfig;
  recovery?: RecoveryConfig;
}
// ============ Sample Metadata ============
/** Metadata record describing a single sample: its text, image location, font, and applied augmentations. */
export interface SampleMetadata {
  id: string;
  text: string;
  image_path: string;
  font_family: string;
  font_size: number;
  /** NOTE(review): presumably transform names matching the keys of `TransformsConfig` — confirm against the producer. */
  augmentations_applied: string[];
  unicode_normalized: boolean;
}
// ============ Statistics ============
/** Per-font-family usage figures; element type of `GenerationStats.font_distribution`. */
export interface FontStats {
  family: string;
  count: number;
  /** NOTE(review): scale (0–1 vs. 0–100) is not specified in this file. */
  percentage: number;
}
/** Per-augmentation usage figures; element type of `GenerationStats.augmentation_stats`. */
export interface AugmentationStats {
  name: string;
  count: number;
  /** NOTE(review): scale (0–1 vs. 0–100) is not specified in this file. */
  percentage: number;
}
/** Aggregate statistics for a generation run: throughput, font/augmentation breakdowns, and text/Unicode counters. */
export interface GenerationStats {
  total_samples: number;
  duration_seconds: number;
  samples_per_second: number;
  font_distribution: FontStats[];
  clean_samples: number;
  augmented_samples: number;
  augmentation_stats: AugmentationStats[];
  avg_transforms_per_sample: number;
  unique_tokens: number;
  avg_chars_per_sample: number;
  /** NOTE(review): presumably a count of samples (not a flag) — confirm against the producer. */
  unicode_valid: number;
  /** NOTE(review): presumably a count of single-script samples — confirm against the producer. */
  script_pure: number;
  rejected_samples: number;
}
// ============ Default Values ============
/**
 * Default values for the `input`, `dataset`, `image`, `parallelism`, `logging`
 * and `recovery` sections.
 *
 * Typed as `Partial<Config>` because the `fonts`, `unicode`, `augmentation` and
 * `output` sections are absent from this object.
 */
export const DEFAULT_CONFIG: Partial<Config> = {
  input: {
    file: './input/text.txt',
    encoding: 'utf-8',
    segmentation: 'word',
    min_length: 1,
    max_length: 50,
    shuffle: true,
  },
  dataset: {
    size: 10000,
    seed: 42,
    train_split: 0.9,
    val_split: 0.1,
  },
  image: {
    width: 256,
    height: 64,
    dpi: 150,
    padding: { left: 10, right: 10, top: 5, bottom: 5 },
    background: '#FFFFFF',
    text_color: '#000000',
    direction: 'rtl',
    alignment: 'center',
    antialiasing: true,
  },
  parallelism: {
    workers: 'auto',
    batch_size: 100,
  },
  logging: {
    level: 'info',
    progress_bar: true,
    stats_report: true,
  },
  recovery: {
    enabled: true,
    checkpoint_interval: 1000,
    resume_on_failure: true,
  },
};