# 5M config — Chinchilla-optimal BPE training (~5M params)
# Target: 100M tokens at 20 tokens/param
# RTX 3060 12GB: batch=32, ctx=256 → 8192 tokens/step → ~12300 steps

[model]
arch = "transformer"
embed_dim = 256
n_layers = 6
n_heads = 4
head_dim = 64  # embed_dim / n_heads
ffn_mult = 4
context_length = 256
dropout = 0.0
bias = false
weight_tying = true

[training]
optimizer = "adamw"
lr = 6e-4
min_lr = 6e-5
warmup_steps = 500
# batch_size * context_length = 8192 tokens/step; 12305 steps ≈ 100M-token target
max_steps = 12305
batch_size = 32
grad_clip = 1.0
precision = "f16"
eval_interval = 500
eval_steps = 25
checkpoint_interval = 2000
seed = 42

[training.curriculum]
enabled = false

[training.coreset]
enabled = false

[data]
train_path = "../text-pipeline/output/train.txt"
val_path = "../text-pipeline/output/val.txt"
tokenizer_dir = "../text-pipeline/output"

[inference]
precision = "f16"
compile = false
temperature = 0.8
top_k = 40
max_new_tokens = 500