File size: 1,270 Bytes
53f0cc2
 
 
 
 
 
 
 
 
 
 
 
0f8853a
53f0cc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0f8853a
 
 
53f0cc2
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class Paths:
    """Immutable bundle of default filesystem locations.

    Every default is a relative path, so it is resolved against the
    process's current working directory wherever it is eventually used.

    NOTE(review): each default is an independent literal. Passing a
    different ``data_dir`` or ``output_dir`` does not move the nested
    defaults (``train_jsonl``, ``checkpoint_dir``, ...); override those
    explicitly as well.
    """

    # Top-level directories.
    project_root: Path = Path(".")
    model_dir: Path = Path("model")
    data_dir: Path = Path("data")
    output_dir: Path = Path("output")
    logs_dir: Path = Path("logs")

    # Defaults located beneath the default data directory.
    train_jsonl: Path = Path("data") / "final" / "train.jsonl"
    dataset_cache_dir: Path = Path("data") / "cache"
    raw_dataset_dir: Path = Path("data") / "cache" / "raw"

    # Defaults located beneath the default output directory.
    checkpoint_dir: Path = Path("output") / "checkpoints"
    lora_output_dir: Path = Path("output") / "lora_adapters"
    tokenizer_output_dir: Path = Path("output") / "tokenizer"


@dataclass(frozen=True)
class DataConfig:
    """Immutable default limits for dataset assembly.

    NOTE(review): the field names suggest an overall sample cap, per-source
    sample caps, and a minimum output length in characters; how each limit
    is enforced is decided by the consuming code, which is not visible here.
    """

    max_total_samples: int = 200_000

    # Per-source caps. Together they add up to 250,000, which exceeds
    # max_total_samples, so the two kinds of limit are not redundant.
    max_humaneval_samples: int = 20_000
    max_mbpp_samples: int = 50_000
    max_codesearchnet_samples: int = 180_000

    min_output_chars: int = 40


@dataclass(frozen=True)
class TrainingConfig:
    """Immutable default hyperparameters for a training run.

    NOTE(review): this class only stores values; the meaning of each knob is
    determined by whichever trainer reads it, which is not visible here.
    """

    # Schedule and batch shape.
    num_train_epochs: int = 5
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 16

    # Optimisation and sequence limits.
    learning_rate: float = 2e-5
    max_length: int = 1024

    # Step intervals for saving and logging.
    save_steps: int = 250
    logging_steps: int = 20

    # Evaluation and resume behaviour.
    eval_max_new_tokens: int = 220
    resume_training: bool = True


# Shared module-level instances built entirely from the dataclass defaults.
# The classes are declared frozen, so attributes on these objects cannot be
# reassigned after construction.
PATHS: Paths = Paths()
DATA_CONFIG: DataConfig = DataConfig()
TRAINING_CONFIG: TrainingConfig = TrainingConfig()