| from dataclasses import dataclass |
| from pathlib import Path |
|
|
|
|
@dataclass(frozen=True)
class Paths:
    """Default filesystem locations, all relative to the working directory.

    Instances are immutable (``frozen=True``).

    NOTE: the derived defaults below are computed once, while the class body
    executes, from the *default* base directories. Passing a different
    ``data_dir`` or ``output_dir`` to the constructor does not move the
    derived paths; override each one explicitly if needed.
    """

    # Base directories.
    project_root: Path = Path(".")
    model_dir: Path = project_root / "model"
    data_dir: Path = project_root / "data"
    output_dir: Path = project_root / "output"
    logs_dir: Path = project_root / "logs"

    # Locations under the default data directory.
    train_jsonl: Path = data_dir / "final" / "train.jsonl"
    dataset_cache_dir: Path = data_dir / "cache"
    raw_dataset_dir: Path = dataset_cache_dir / "raw"

    # Locations under the default output directory.
    checkpoint_dir: Path = output_dir / "checkpoints"
    lora_output_dir: Path = output_dir / "lora_adapters"
    tokenizer_output_dir: Path = output_dir / "tokenizer"
|
|
|
|
@dataclass(frozen=True)
class DataConfig:
    """Immutable size limits for dataset preparation.

    The field meanings below are inferred from the names only; the code that
    consumes them is not part of this module -- TODO confirm against callers.
    """

    # Presumably the overall cap on the combined dataset.
    max_total_samples: int = 200_000

    # Presumably per-source caps. NOTE(review): these sum to 250_000, which
    # exceeds max_total_samples -- confirm that is intended.
    max_humaneval_samples: int = 20_000
    max_mbpp_samples: int = 50_000
    max_codesearchnet_samples: int = 180_000

    # Presumably the minimum output length (in characters) for a sample to
    # be kept.
    min_output_chars: int = 40
|
|
|
|
@dataclass(frozen=True)
class TrainingConfig:
    """Immutable default hyperparameters for a training run.

    The trainer that reads these values is not part of this module, so the
    comments below describe the presumed intent of each group -- TODO confirm
    against the consuming code.
    """

    # Optimisation schedule.
    num_train_epochs: int = 5
    per_device_train_batch_size: int = 2
    # With the batch size above this presumably yields 2 * 16 = 32 samples
    # per device per optimizer step.
    gradient_accumulation_steps: int = 16
    learning_rate: float = 2e-5

    # Presumably the maximum sequence length, in tokens.
    max_length: int = 1024

    # Presumably checkpoint and logging intervals, in steps.
    save_steps: int = 250
    logging_steps: int = 20

    # Presumably the generation budget used during evaluation.
    eval_max_new_tokens: int = 220

    # Presumably whether to continue from an existing checkpoint.
    resume_training: bool = True
|
|
|
|
# Module-level instances built from each class's default values.
PATHS: Paths = Paths()
DATA_CONFIG: DataConfig = DataConfig()
TRAINING_CONFIG: TrainingConfig = TrainingConfig()
|
|