| """ | |
| Data Config | |
| Specifies the hyperparameters for the dataset, dataloader, and tokenizer. | |
| """ | |
| from dataclasses import dataclass, field | |
| from ._constants import BATCH_SIZE, VOCAB_SIZE | |
@dataclass
class DatasetConfig:
    """Settings that identify the dataset.

    Declared as a dataclass so the annotated attribute below becomes a
    constructor keyword (``DatasetConfig(name=...)``) and instances get a
    generated ``__repr__`` and ``__eq__``.
    """

    # Defines the HuggingFace name of a dataset
    name: str = "pico-lm/pretokenized-dolma"
@dataclass
class DataLoaderConfig:
    """Settings for the dataloader.

    Declared as a dataclass so ``batch_size`` can be overridden by keyword at
    construction time; it defaults to the ``BATCH_SIZE`` constant imported
    from ``._constants``.
    """

    # NOTE: You should only change these values jointly with the training config, so that the
    # sub-batch size is consistent with the gradient accumulation steps
    batch_size: int = BATCH_SIZE
@dataclass
class TokenizerConfig:
    """Settings that select the tokenizer.

    Declared as a dataclass so both attributes can be overridden by keyword at
    construction time. ``vocab_size`` defaults to the ``VOCAB_SIZE`` constant
    imported from ``._constants``.
    """

    # Specify a tokenizer to use
    name: str = "allenai/OLMo-7B-0724-hf"
    # NOTE(review): presumably must match the vocabulary of the tokenizer
    # named above; confirm before changing either value independently.
    vocab_size: int = VOCAB_SIZE
@dataclass
class DataConfig:
    """Top-level data configuration grouping the dataset, dataloader and tokenizer settings.

    The ``@dataclass`` decorator is required here: ``field(default_factory=...)``
    is only interpreted by the dataclass machinery. Without it, each attribute
    would remain a raw ``dataclasses.Field`` object on the class instead of an
    instance of the corresponding config.
    """

    # default_factory builds a fresh sub-config for every DataConfig instance,
    # so instances never share (and cannot accidentally mutate) one another's
    # sub-config objects.
    dataset: DatasetConfig = field(default_factory=DatasetConfig)
    dataloader: DataLoaderConfig = field(default_factory=DataLoaderConfig)
    tokenizer: TokenizerConfig = field(default_factory=TokenizerConfig)