# 5M config — Chinchilla-optimal BPE training (~5M params)
# Target: 100M tokens at 20 tokens/param
# RTX 3060 12GB: batch=32, ctx=256 → 8192 tokens/step → ~12300 steps
# Model architecture. ~5M parameters with weight tying.
[model]
arch = "transformer"
embed_dim = 256
n_layers = 6
n_heads = 4
head_dim = 64  # embed_dim / n_heads
ffn_mult = 4   # FFN hidden size = ffn_mult * embed_dim
context_length = 256
dropout = 0.0
bias = false
weight_tying = true  # share input embedding and output projection

# Optimization schedule: linear warmup then decay from lr to min_lr.
[training]
optimizer = "adamw"
lr = 6e-4
min_lr = 6e-5
warmup_steps = 500
max_steps = 12305  # ~100M tokens at 8192 tokens/step (batch 32 x ctx 256)
batch_size = 32
grad_clip = 1.0
precision = "f16"
eval_interval = 500  # steps between eval passes
eval_steps = 25      # batches averaged per eval
checkpoint_interval = 2000
seed = 42

[training.curriculum]
enabled = false

[training.coreset]
enabled = false

# Dataset and tokenizer produced by the text-pipeline step.
[data]
train_path = "../text-pipeline/output/train.txt"
val_path = "../text-pipeline/output/val.txt"
tokenizer_dir = "../text-pipeline/output"

# Sampling defaults for generation.
[inference]
precision = "f16"
compile = false
temperature = 0.8
top_k = 40  # restrict sampling to the 40 most likely tokens
max_new_tokens = 500