"""Static configuration: filesystem locations, dataset caps and training settings.

Three frozen dataclasses hold the defaults; one shared instance of each is
created at import time (``PATHS``, ``DATA_CONFIG``, ``TRAINING_CONFIG``).
"""

from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class Paths:
    """Relative filesystem locations used by the project.

    Every default is a literal relative path. The sub-paths are NOT derived
    from ``project_root``: overriding ``project_root`` leaves the other fields
    at their literal defaults.
    """

    # --- top-level directories ---
    project_root: Path = Path(".")
    model_dir: Path = Path("./model")
    data_dir: Path = Path("./data")
    output_dir: Path = Path("./output")
    logs_dir: Path = Path("./logs")

    # --- locations under ./data ---
    train_jsonl: Path = Path("./data/final/train.jsonl")
    dataset_cache_dir: Path = Path("./data/cache")
    raw_dataset_dir: Path = Path("./data/cache/raw")

    # --- locations under ./output ---
    checkpoint_dir: Path = Path("./output/checkpoints")
    lora_output_dir: Path = Path("./output/lora_adapters")
    tokenizer_output_dir: Path = Path("./output/tokenizer")


@dataclass(frozen=True)
class DataConfig:
    """Numeric limits for dataset assembly.

    How these limits are applied is decided by the consuming code, which is
    not part of this module.
    """

    # Overall cap.
    # NOTE(review): the three per-source caps below sum to 250000, which is
    # more than this value -- confirm how the consumer reconciles them.
    max_total_samples: int = 200000

    # Per-source caps (source inferred from the field names -- confirm
    # against the loader).
    max_humaneval_samples: int = 20000
    max_mbpp_samples: int = 50000
    max_codesearchnet_samples: int = 180000

    # Presumably a minimum output length, in characters, for a sample to be
    # kept -- verify against the filtering code.
    min_output_chars: int = 40


@dataclass(frozen=True)
class TrainingConfig:
    """Default hyperparameters and bookkeeping intervals for a training run."""

    # --- optimisation schedule ---
    num_train_epochs: int = 5
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 16
    learning_rate: float = 2e-5

    # Presumably the maximum sequence length in tokens -- confirm with the
    # tokenization code.
    max_length: int = 1024

    # --- checkpointing / logging cadence (unit presumably optimizer steps) ---
    save_steps: int = 250
    logging_steps: int = 20

    # --- evaluation / resumption ---
    eval_max_new_tokens: int = 220
    resume_training: bool = True


# Shared default instances; they are frozen, so attribute assignment raises.
PATHS = Paths()
DATA_CONFIG = DataConfig()
TRAINING_CONFIG = TrainingConfig()