# 5M config — Chinchilla-optimal BPE training (~5M params)
# Target: 100M tokens at 20 tokens/param
# RTX 3060 12GB: batch=32, ctx=256 → 8192 tokens/step → ~12300 steps

[model]
arch = "transformer"
# embed_dim must equal n_heads * head_dim (4 * 64 = 256)
embed_dim = 256
n_layers = 6
n_heads = 4
head_dim = 64
# FFN hidden size = ffn_mult * embed_dim
ffn_mult = 4
context_length = 256
dropout = 0.0
bias = false
weight_tying = true

[training]
optimizer = "adamw"
lr = 6e-4
min_lr = 6e-5
warmup_steps = 500
# batch_size * context_length = 8192 tokens/step; ~12300 steps ≈ 100M tokens
max_steps = 12305
batch_size = 32
grad_clip = 1.0
precision = "f16"
eval_interval = 500
eval_steps = 25
checkpoint_interval = 2000
seed = 42

[training.curriculum]
enabled = false

[training.coreset]
enabled = false

[data]
train_path = "../text-pipeline/output/train.txt"
val_path = "../text-pipeline/output/val.txt"
tokenizer_dir = "../text-pipeline/output"

[inference]
precision = "f16"
compile = false
temperature = 0.8
top_k = 40
max_new_tokens = 500