# LisaMegaWatts/JuliaSLM
#
# 5M config — Chinchilla-optimal BPE training (~5M params)
# Target: ~100M tokens at 20 tokens/param
# RTX 3060 12GB: batch=32, ctx=256 → 8192 tokens/step → ~12300 steps
# (max_steps = 12305 → 12305 × 8192 ≈ 100.8M tokens)

[model]
arch = "transformer"
embed_dim = 256
n_layers = 6
# n_heads × head_dim = 4 × 64 = 256, matching embed_dim.
n_heads = 4
head_dim = 64
# NOTE(review): presumably the FFN hidden width is ffn_mult × embed_dim —
# confirm against the model code.
ffn_mult = 4
# Same value as the ctx=256 used in the tokens/step estimate above.
context_length = 256
dropout = 0.0
bias = false
weight_tying = true

[training]
optimizer = "adamw"
lr = 6e-4
# One tenth of the peak lr.
min_lr = 6e-5
warmup_steps = 500
max_steps = 12305
batch_size = 32
grad_clip = 1.0
precision = "f16"
eval_interval = 500
# NOTE(review): presumably the number of batches per evaluation pass — confirm.
eval_steps = 25
# NOTE(review): max_steps (12305) is not a multiple of checkpoint_interval;
# confirm the trainer also saves a checkpoint at the final step.
checkpoint_interval = 2000
seed = 42

[training.curriculum]
enabled = false

[training.coreset]
enabled = false

[data]
# Relative paths; tokenizer_dir is the same directory that holds train.txt
# and val.txt.
# NOTE(review): confirm which base directory the loader resolves these against.
train_path = "../text-pipeline/output/train.txt"
val_path = "../text-pipeline/output/val.txt"
tokenizer_dir = "../text-pipeline/output"

[inference]
# Same precision as [training].
precision = "f16"
compile = false
temperature = 0.8
top_k = 40
max_new_tokens = 500