"""
Data Config
Specifies the hyperparameters for the dataset, dataloader, and tokenizer.
"""
from dataclasses import dataclass, field
from ._constants import BATCH_SIZE, VOCAB_SIZE
@dataclass
class DatasetConfig:
    """Settings for the pretraining dataset."""

    # HuggingFace Hub identifier of the dataset to load.
    name: str = "pico-lm/pretokenized-dolma"
@dataclass
class DataLoaderConfig:
    """Settings for the training dataloader."""

    # NOTE: Change this only together with the training config, so the
    # sub-batch size stays consistent with the gradient accumulation steps.
    batch_size: int = BATCH_SIZE
@dataclass
class TokenizerConfig:
    """Settings for the tokenizer."""

    # HuggingFace Hub identifier of the tokenizer to use.
    name: str = "allenai/OLMo-7B-0724-hf"
    # Size of the tokenizer's vocabulary.
    vocab_size: int = VOCAB_SIZE
@dataclass
class DataConfig:
    """Top-level data configuration bundling dataset, dataloader, and tokenizer.

    Each sub-config uses ``default_factory`` so every ``DataConfig`` instance
    gets its own mutable sub-config objects rather than shared defaults.
    """

    dataset: DatasetConfig = field(default_factory=DatasetConfig)
    dataloader: DataLoaderConfig = field(default_factory=DataLoaderConfig)
    tokenizer: TokenizerConfig = field(default_factory=TokenizerConfig)