// OCR_DATASET_MAKER/src/core/config.ts
// Last commit by Omarrran: "OCR Dataset Generator for HF Spaces" (24a732c)
/**
* Configuration types for the OCR Dataset Generator
*/
// ============ Input Configuration ============
/**
 * Names of the supported segmentation modes.
 *
 * NOTE(review): the segmentation logic itself is not in this file; the
 * meaning of each value is presumably "split the input text by that unit" —
 * confirm against the consumer.
 */
export type SegmentationMode =
  | 'character'
  | 'word'
  | 'ngram'
  | 'sentence'
  | 'line';

/**
 * Settings describing the text input.
 *
 * Only `file` and `segmentation` are required; every other field may be
 * omitted.
 */
export interface InputConfig {
  /** Input file, given as a string (`DEFAULT_CONFIG` uses './input/text.txt'). */
  file: string;
  /** Optional encoding name (`DEFAULT_CONFIG` uses 'utf-8'). */
  encoding?: string;
  /** Segmentation mode to use. */
  segmentation: SegmentationMode;
  /**
   * Optional n-gram size.
   * NOTE(review): presumably only read when `segmentation` is 'ngram' — confirm.
   */
  ngram_size?: number;
  /**
   * Optional lower length bound (`DEFAULT_CONFIG` uses 1).
   * NOTE(review): the unit (characters vs. tokens) is not visible here.
   */
  min_length?: number;
  /** Optional upper length bound (`DEFAULT_CONFIG` uses 50); same unit caveat as `min_length`. */
  max_length?: number;
  /** Optional shuffle flag (`DEFAULT_CONFIG` sets it to `true`). */
  shuffle?: boolean;
}
// ============ Dataset Configuration ============
/**
 * Dataset-level settings. Only `size` is required.
 */
export interface DatasetConfig {
  /**
   * Dataset size (`DEFAULT_CONFIG` uses 10000).
   * NOTE(review): presumably the number of samples to generate — confirm.
   */
  size: number;
  /** Optional seed value (`DEFAULT_CONFIG` uses 42). */
  seed?: number;
  /**
   * Optional training split (`DEFAULT_CONFIG` uses 0.9).
   * NOTE(review): the default suggests a 0–1 fraction; no validation is visible here.
   */
  train_split?: number;
  /** Optional validation split (`DEFAULT_CONFIG` uses 0.1); same caveat as `train_split`. */
  val_split?: number;
}
// ============ Image Configuration ============
/** Text direction: right-to-left ('rtl') or left-to-right ('ltr'). */
export type TextDirection = 'rtl' | 'ltr';

/** Horizontal alignment keywords accepted by `ImageConfig.alignment`. */
export type TextAlignment = 'left' | 'center' | 'right';

/**
 * Per-side padding values.
 * NOTE(review): the unit is presumably pixels — confirm against the renderer.
 */
export interface PaddingConfig {
  left: number;
  right: number;
  top: number;
  bottom: number;
}

/**
 * Settings for the generated images.
 *
 * `width`, `height`, `background`, `text_color` and `direction` are required;
 * the remaining fields are optional.
 */
export interface ImageConfig {
  /** Image width (`DEFAULT_CONFIG` uses 256). */
  width: number;
  /** Image height (`DEFAULT_CONFIG` uses 64). */
  height: number;
  /** Optional DPI value (`DEFAULT_CONFIG` uses 150). */
  dpi?: number;
  /**
   * Optional padding: either a per-side object or a single number.
   * NOTE(review): a bare number presumably applies to all four sides — confirm.
   */
  padding?: PaddingConfig | number;
  /** Background colour string (`DEFAULT_CONFIG` uses the hex form '#FFFFFF'). */
  background: string;
  /** Text colour string (`DEFAULT_CONFIG` uses the hex form '#000000'). */
  text_color: string;
  /** Text direction. */
  direction: TextDirection;
  /** Optional text alignment. */
  alignment?: TextAlignment;
  /** Optional antialiasing flag. */
  antialiasing?: boolean;
}
// ============ Font Configuration ============
/**
 * Names of the supported font-size distributions.
 * NOTE(review): the sampling behaviour behind each name is not defined in this file.
 */
export type SizeDistribution = 'uniform' | 'normal' | 'random';

/** Font size bounds plus an optional distribution name. */
export interface FontSizeConfig {
  /** Lower size bound. */
  min: number;
  /** Upper size bound. */
  max: number;
  /** Optional distribution name. */
  distribution?: SizeDistribution;
}

/**
 * One entry of the font distribution list.
 */
export interface FontDistributionEntry {
  /** Font family name. */
  family: string;
  /**
   * Share assigned to this family.
   * NOTE(review): scale (0–1 vs. 0–100) and whether entries must sum to a
   * total are not visible here — confirm against the consumer.
   */
  percentage: number;
  /** Optional list of style names. */
  styles?: string[];
  /** Optional list of numeric weights. */
  weight?: number[];
}

/**
 * Font-related settings. Everything except `fallback` is required.
 */
export interface FontConfig {
  /** Font directory, given as a string. */
  directory: string;
  /**
   * Optional fallback.
   * NOTE(review): presumably a font family or file used when a requested font is unavailable — confirm.
   */
  fallback?: string;
  /** Font size settings. */
  size: FontSizeConfig;
  /** Font distribution entries. */
  distribution: FontDistributionEntry[];
}
// ============ Unicode Configuration ============
/** Accepted normalization settings: 'NFC', 'NFD', or 'none'. */
export type NormalizationForm = 'NFC' | 'NFD' | 'none';

/**
 * Unicode-handling settings. Only `normalization` is required.
 *
 * NOTE(review): how the script-related fields interact is decided by code
 * outside this file.
 */
export interface UnicodeConfig {
  /** Normalization setting. */
  normalization: NormalizationForm;
  /** Optional script name to enforce. */
  enforce_script?: string;
  /** Optional flag; presumably rejects text mixing several scripts — confirm. */
  reject_mixed?: boolean;
  /** Optional list of allowed script names. */
  allowed_scripts?: string[];
  /** Optional flag for preserving diacritics. */
  preserve_diacritics?: boolean;
}
// ============ Augmentation Configuration ============
/**
 * Fields shared by every transform config in this section: an on/off flag and
 * a percentage.
 *
 * NOTE(review): despite its name this interface carries no range of its own,
 * and the scale of `percentage` (0–1 vs. 0–100) is not visible here — confirm
 * against the augmentation pipeline.
 */
export interface RangeConfig {
  enabled: boolean;
  percentage: number;
}

/** Rotation transform settings: the shared fields plus a two-number `range`. */
export interface RotationConfig extends RangeConfig {
  /** Two-number tuple; presumably [min, max] — unit not visible here. */
  range: [number, number];
}

/** Skew transform settings with separate two-number tuples per axis. */
export interface SkewConfig extends RangeConfig {
  x_range: [number, number];
  y_range: [number, number];
}

/** Perspective transform settings. */
export interface PerspectiveConfig extends RangeConfig {
  intensity: [number, number];
}

/** Gaussian blur settings. */
export interface GaussianBlurConfig extends RangeConfig {
  sigma: [number, number];
}

/** Motion blur settings. */
export interface MotionBlurConfig extends RangeConfig {
  kernel_size: [number, number];
  angle: [number, number];
}

/** Gaussian noise settings: a single `mean` and a two-number `std` tuple. */
export interface GaussianNoiseConfig extends RangeConfig {
  mean: number;
  std: [number, number];
}

/** Salt-and-pepper noise settings. */
export interface SaltPepperConfig extends RangeConfig {
  amount: [number, number];
}

/** JPEG artifact settings. */
export interface JpegArtifactsConfig extends RangeConfig {
  quality: [number, number];
}

/** Resolution degradation settings. */
export interface ResolutionDegradationConfig extends RangeConfig {
  scale_factor: [number, number];
}

/** Paper texture settings. */
export interface PaperTextureConfig extends RangeConfig {
  /** Optional list of texture strings; presumably file paths or names — confirm. */
  textures?: string[];
  opacity: [number, number];
}

/** Shadow settings. */
export interface ShadowConfig extends RangeConfig {
  /** Optional list of sides, each one of 'left', 'right', 'top' or 'bottom'. */
  position?: Array<'left' | 'right' | 'top' | 'bottom'>;
  intensity: [number, number];
}

/** Ink bleed settings. */
export interface InkBleedConfig extends RangeConfig {
  intensity: [number, number];
}

/** Brightness settings. */
export interface BrightnessConfig extends RangeConfig {
  range: [number, number];
}

/** Contrast settings. */
export interface ContrastConfig extends RangeConfig {
  range: [number, number];
}

/**
 * Optional per-transform settings, keyed by transform name. Every key may be
 * omitted.
 */
export interface TransformsConfig {
  rotation?: RotationConfig;
  skew?: SkewConfig;
  perspective?: PerspectiveConfig;
  gaussian_blur?: GaussianBlurConfig;
  motion_blur?: MotionBlurConfig;
  gaussian_noise?: GaussianNoiseConfig;
  salt_pepper?: SaltPepperConfig;
  jpeg_artifacts?: JpegArtifactsConfig;
  resolution_degradation?: ResolutionDegradationConfig;
  paper_texture?: PaperTextureConfig;
  shadow?: ShadowConfig;
  ink_bleed?: InkBleedConfig;
  brightness?: BrightnessConfig;
  contrast?: ContrastConfig;
}

/**
 * Names of the augmentation presets.
 * NOTE(review): what each preset expands to is defined outside this file.
 */
export type AugmentationPreset =
  | 'clean'
  | 'balanced'
  | 'heavy'
  | 'document_scan'
  | 'mobile_camera';

/**
 * Top-level augmentation settings. `enabled` and `apply_percentage` are
 * required.
 */
export interface AugmentationConfig {
  /** Augmentation on/off flag. */
  enabled: boolean;
  /** Optional preset name. */
  preset?: AugmentationPreset;
  /**
   * Percentage value.
   * NOTE(review): presumably the share of samples that get augmented; scale not visible here.
   */
  apply_percentage: number;
  /** Optional cap on the number of transforms per sample. */
  max_transforms_per_sample?: number;
  /**
   * Optional explicit per-transform settings.
   * NOTE(review): precedence relative to `preset` is not visible here.
   */
  transforms?: TransformsConfig;
}
// ============ Output Configuration ============
/**
 * Names of the supported output formats.
 * NOTE(review): the writers for these formats are defined outside this file.
 */
export type OutputFormat =
  | 'crnn'
  | 'trocr'
  | 'sar'
  | 'paddleocr'
  | 'tesseract'
  | 'csv'
  | 'tsv'
  | 'json'
  | 'jsonl'
  | 'lmdb';

/**
 * Optional per-format options. Only 'crnn', 'trocr' and 'csv' have option
 * objects declared here.
 */
export interface OutputOptions {
  /** CRNN options: an optional delimiter string. */
  crnn?: {
    delimiter?: string;
  };
  /** TrOCR options: an optional sub-format, 'json' or 'jsonl'. */
  trocr?: {
    format?: 'json' | 'jsonl';
  };
  /** CSV options: an optional flag for including metadata. */
  csv?: {
    include_metadata?: boolean;
  };
}

/**
 * Output settings. `directory` and `formats` are required.
 */
export interface OutputConfig {
  /** Output directory, given as a string. */
  directory: string;
  /** List of output formats. */
  formats: OutputFormat[];
  /** Optional per-format options. */
  options?: OutputOptions;
}
// ============ Parallelism Configuration ============
/**
 * Parallelism settings. Only `workers` is required.
 */
export interface ParallelismConfig {
  /**
   * Either a number or the literal 'auto' (`DEFAULT_CONFIG` uses 'auto').
   * NOTE(review): how 'auto' is resolved to a worker count is not visible here.
   */
  workers: number | 'auto';
  /** Optional batch size (`DEFAULT_CONFIG` uses 100). */
  batch_size?: number;
  /**
   * Optional memory limit, given as a string.
   * NOTE(review): the accepted format (e.g. unit suffixes) is not visible here.
   */
  memory_limit?: string;
}
// ============ Logging Configuration ============
/** Accepted log level names. */
export type LogLevel = 'debug' | 'info' | 'warn' | 'error';

/**
 * Logging settings. Only `level` is required.
 */
export interface LoggingConfig {
  /** Log level (`DEFAULT_CONFIG` uses 'info'). */
  level: LogLevel;
  /** Optional log file, given as a string. */
  file?: string;
  /** Optional progress-bar flag (`DEFAULT_CONFIG` sets it to `true`). */
  progress_bar?: boolean;
  /** Optional stats-report flag (`DEFAULT_CONFIG` sets it to `true`). */
  stats_report?: boolean;
}
// ============ Recovery Configuration ============
/**
 * Recovery settings. Only `enabled` is required.
 */
export interface RecoveryConfig {
  /** Recovery on/off flag (`DEFAULT_CONFIG` sets it to `true`). */
  enabled: boolean;
  /**
   * Optional checkpoint interval (`DEFAULT_CONFIG` uses 1000).
   * NOTE(review): presumably counted in samples — confirm against the generator.
   */
  checkpoint_interval?: number;
  /** Optional resume-on-failure flag (`DEFAULT_CONFIG` sets it to `true`). */
  resume_on_failure?: boolean;
}
// ============ Main Configuration ============
/**
 * Root configuration object, composed of one sub-config per concern.
 *
 * `parallelism`, `logging` and `recovery` are optional; all other sections
 * are required.
 */
export interface Config {
  /** Text input settings. */
  input: InputConfig;
  /** Dataset-level settings. */
  dataset: DatasetConfig;
  /** Image settings. */
  image: ImageConfig;
  /** Font settings. */
  fonts: FontConfig;
  /** Unicode-handling settings. */
  unicode: UnicodeConfig;
  /** Augmentation settings. */
  augmentation: AugmentationConfig;
  /** Output settings. */
  output: OutputConfig;
  /** Optional parallelism settings. */
  parallelism?: ParallelismConfig;
  /** Optional logging settings. */
  logging?: LoggingConfig;
  /** Optional recovery settings. */
  recovery?: RecoveryConfig;
}
// ============ Sample Metadata ============
/**
 * Metadata record describing a single sample. Every field is required.
 */
export interface SampleMetadata {
  /** Sample identifier. */
  id: string;
  /** Text associated with the sample. */
  text: string;
  /** Image path, given as a string. */
  image_path: string;
  /** Font family name. */
  font_family: string;
  /** Font size. */
  font_size: number;
  /** Names of the augmentations applied. */
  augmentations_applied: string[];
  /** NOTE(review): presumably whether Unicode normalization was applied to `text` — confirm. */
  unicode_normalized: boolean;
}
// ============ Statistics ============
/** Usage figures for one font family: a count and a percentage. */
export interface FontStats {
  family: string;
  count: number;
  percentage: number;
}

/** Usage figures for one augmentation: a count and a percentage. */
export interface AugmentationStats {
  name: string;
  count: number;
  percentage: number;
}

/**
 * Summary statistics for a generation run. Every field is required.
 *
 * NOTE(review): the code that computes these numbers is not in this file, so
 * exact definitions (e.g. what counts as "script pure") should be confirmed
 * against it.
 */
export interface GenerationStats {
  total_samples: number;
  duration_seconds: number;
  samples_per_second: number;
  /** Per-font usage figures. */
  font_distribution: FontStats[];
  clean_samples: number;
  augmented_samples: number;
  /** Per-augmentation usage figures. */
  augmentation_stats: AugmentationStats[];
  avg_transforms_per_sample: number;
  unique_tokens: number;
  avg_chars_per_sample: number;
  unicode_valid: number;
  script_pure: number;
  rejected_samples: number;
}
// ============ Default Values ============
/**
 * Default values for part of the configuration.
 *
 * Typed as `Partial<Config>`: only the `input`, `dataset`, `image`,
 * `parallelism`, `logging` and `recovery` sections are given defaults here.
 * No defaults are provided for `fonts`, `unicode`, `augmentation` or `output`.
 *
 * NOTE(review): how these defaults are merged with user-supplied settings is
 * decided by code outside this file.
 */
export const DEFAULT_CONFIG: Partial<Config> = {
  // Text input defaults.
  input: {
    file: './input/text.txt',
    encoding: 'utf-8',
    segmentation: 'word',
    min_length: 1,
    max_length: 50,
    shuffle: true,
  },

  // Dataset defaults; the two splits add up to 1.
  dataset: {
    size: 10000,
    seed: 42,
    train_split: 0.9,
    val_split: 0.1,
  },

  // Image defaults; note the right-to-left text direction.
  image: {
    width: 256,
    height: 64,
    dpi: 150,
    padding: {
      left: 10,
      right: 10,
      top: 5,
      bottom: 5,
    },
    background: '#FFFFFF',
    text_color: '#000000',
    direction: 'rtl',
    alignment: 'center',
    antialiasing: true,
  },

  // Parallelism defaults.
  parallelism: {
    workers: 'auto',
    batch_size: 100,
  },

  // Logging defaults.
  logging: {
    level: 'info',
    progress_bar: true,
    stats_report: true,
  },

  // Recovery defaults.
  recovery: {
    enabled: true,
    checkpoint_interval: 1000,
    resume_on_failure: true,
  },
};