# 5M Monarch Mixer config — same param budget as 5m.toml
# 8 blocks (vs 6 for transformer) due to cheaper sequence mixing
# Monarch sequence mixer: 67K params/block vs 262K for attention

[model]
arch = "monarch"
embed_dim = 256
n_layers = 8
n_heads = 4  # unused by Monarch, kept for struct compat
head_dim = 64  # unused by Monarch
n_monarch_heads = 8
conv_kernel_size = 4
ffn_mult = 4
context_length = 256
dropout = 0.0
bias = false
weight_tying = true

[training]
optimizer = "adamw"
lr = 6e-4
min_lr = 6e-5
warmup_steps = 500
max_steps = 12305
batch_size = 32
grad_clip = 1.0
precision = "f16"
eval_interval = 500
eval_steps = 25
checkpoint_interval = 2000
seed = 42

[training.curriculum]
enabled = false

[training.coreset]
enabled = false

[data]
train_path = "../text-pipeline/output/train.txt"
val_path = "../text-pipeline/output/val.txt"
tokenizer_dir = "../text-pipeline/output"

[inference]
precision = "f16"
compile = false
temperature = 0.8
top_k = 40
max_new_tokens = 500