"""Configuration file with exact hyperparameters from the LREC-COLING 2024 paper."""
import os
from dataclasses import dataclass
from typing import Optional


@dataclass
class Config:
    """Central hyperparameter and path configuration.

    Note: instantiating this class creates ``checkpoint_dir`` and ``log_dir``
    on disk as a side effect (see ``__post_init__``).
    """

    # Paths
    data_dir: str = "data"
    db_path: str = "data/ideograph.db"  # Not used in demo
    font_dir: str = "data/font"  # Not used in demo
    real_data_dir: str = "data/real"
    # Don't load local resnet weights file; allow torchvision download, or None.
    resnet_weights: Optional[str] = None
    checkpoint_dir: str = "checkpoints"
    log_dir: str = "logs"

    # Model configuration
    roberta_model: str = "ethanyt/guwenbert-base"
    image_size: int = 64
    vocab_size: int = 23292  # GuwenBERT vocab size (actual)
    hidden_dim: int = 768  # RoBERTa-base hidden size (guwenbert-base)
    resnet_out_dim: int = 2048  # ResNet50 output dimension

    # Image decoder configuration
    num_deconv_layers: int = 5

    # Training hyperparameters (exact from paper, optimized for 4090 D)
    batch_size: int = 256  # Match paper's batch size (256)
    # Use natural distribution for eval (match paper)
    use_weighted_sampling_for_eval: bool = False
    num_epochs: int = 30
    curriculum_epochs: int = 10  # First 10 epochs use curriculum learning
    learning_rate: float = 0.0001
    min_lr: float = 1e-5
    alpha: float = 100.0  # Loss weight for image reconstruction

    # Optimizer
    optimizer: str = "adam"
    weight_decay: float = 0.0

    # Data sampling
    max_seq_length: int = 50
    num_masks_min: int = 1
    num_masks_max: int = 5

    # Font filtering threshold (from Appendix)
    min_black_pixels: int = 510

    # Image augmentation parameters (from Appendix)
    rotation_degrees: float = 5.0
    translation_percent: float = 0.05
    scale_percent: float = 0.10
    brightness_range: tuple = (0.7, 1.3)
    contrast_range: tuple = (0.2, 1.0)
    blur_kernel_range: tuple = (2, 10)  # Must be odd
    blur_sigma_range: tuple = (1.0, 10.0)

    # Damage simulation (from Appendix)
    num_small_masks_min: int = 1
    num_small_masks_max: int = 20

    # Evaluation
    num_eval_samples: int = 30  # Number of random samplings for evaluation
    # Filled with [5, 10, 20] in __post_init__ when left as None.
    top_k_values: Optional[list] = None

    # Device
    device: str = "cuda"  # NVIDIA GeForce RTX 4090 D
    num_workers: int = 4  # Reduced from 16 to 4 to prevent hanging/deadlocks
    pin_memory: bool = True

    # Optimization
    use_amp: bool = True  # Automatic Mixed Precision
    gradient_accumulation_steps: int = 1  # Simulated batch size multiplier

    # Seed for reproducibility
    seed: int = 42

    # TensorBoard configuration
    tensorboard_log_dir: str = "logs/tensorboard"
    tensorboard_enabled: bool = True
    tensorboard_log_images_interval: int = 5  # Log sample images every N epochs

    def __post_init__(self) -> None:
        """Apply default top-k list and ensure output directories exist."""
        if self.top_k_values is None:
            self.top_k_values = [5, 10, 20]
        # Create directories if they don't exist
        os.makedirs(self.checkpoint_dir, exist_ok=True)
        os.makedirs(self.log_dir, exist_ok=True)

    def get_phase1_checkpoint_path(self) -> str:
        """Path for Phase 1 (RoBERTa fine-tuning) checkpoint."""
        return os.path.join(self.checkpoint_dir, "phase1_roberta_finetuned.pt")

    def get_phase2_checkpoint_path(self, epoch: Optional[int] = None) -> str:
        """Path for Phase 2 (MMRM) checkpoint.

        Returns the per-epoch checkpoint path when ``epoch`` is given,
        otherwise the best-model checkpoint path.
        """
        if epoch is not None:
            return os.path.join(self.checkpoint_dir, f"phase2_mmrm_epoch{epoch}.pt")
        return os.path.join(self.checkpoint_dir, "phase2_mmrm_best.pt")

    def get_baseline_checkpoint_path(self, baseline_name: str) -> str:
        """Path for baseline model checkpoints."""
        return os.path.join(self.checkpoint_dir, f"baseline_{baseline_name}.pt")