# mindi-backup / config.py
# Mindigenous
# Sync latest dataset/config updates and logs
# 0f8853a
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class Paths:
    """Immutable collection of the filesystem locations used by the project.

    Every default is a *relative* path, so it is resolved against the
    process's current working directory at the moment it is used.

    Each default is an independent literal: overriding a parent directory
    (for example ``data_dir``) does NOT relocate the paths that live under it
    by default (for example ``train_jsonl``); override those explicitly too.
    """

    # Top-level directories.
    project_root: Path = Path(".")
    model_dir: Path = Path("./model")
    data_dir: Path = Path("./data")
    output_dir: Path = Path("./output")
    logs_dir: Path = Path("./logs")

    # Locations under ./data.
    train_jsonl: Path = Path("./data/final/train.jsonl")
    dataset_cache_dir: Path = Path("./data/cache")
    raw_dataset_dir: Path = Path("./data/cache/raw")

    # Locations under ./output.
    checkpoint_dir: Path = Path("./output/checkpoints")
    lora_output_dir: Path = Path("./output/lora_adapters")
    tokenizer_output_dir: Path = Path("./output/tokenizer")
@dataclass(frozen=True)
class DataConfig:
    """Immutable dataset-size limits and a minimum-length threshold.

    This class only stores the values; how they are enforced is decided by
    the code that reads them (not visible in this module).
    """

    # Overall cap on the number of samples.
    max_total_samples: int = 200_000

    # Per-source caps. NOTE(review): the source names (HumanEval, MBPP,
    # CodeSearchNet) are inferred from the field names -- confirm against the
    # dataset-building code.
    max_humaneval_samples: int = 20_000
    max_mbpp_samples: int = 50_000
    max_codesearchnet_samples: int = 180_000

    # Minimum number of characters for a sample's output.
    min_output_chars: int = 40
@dataclass(frozen=True)
class TrainingConfig:
    """Immutable hyperparameters and bookkeeping settings for training.

    This class only stores the values. NOTE(review): the field names mirror
    common trainer arguments; confirm how each is consumed against the
    training script, which is not visible in this module.
    """

    # Optimisation schedule.
    num_train_epochs: int = 5
    per_device_train_batch_size: int = 2
    gradient_accumulation_steps: int = 16
    learning_rate: float = 2e-5

    # Length limit -- presumably in tokens; verify against the tokenizer call.
    max_length: int = 1024

    # Checkpointing and logging cadence, expressed in steps.
    save_steps: int = 250
    logging_steps: int = 20

    # Generation budget used during evaluation.
    eval_max_new_tokens: int = 220

    # Whether training should resume from existing state when available.
    resume_training: bool = True
# Shared module-level instances built from the class defaults; import these
# instead of constructing new config objects at each call site.
PATHS: Paths = Paths()
DATA_CONFIG: DataConfig = DataConfig()
TRAINING_CONFIG: TrainingConfig = TrainingConfig()