# 5M Monarch Mixer config — same param budget as 5m.toml
# 8 blocks (vs 6 for transformer) due to cheaper sequence mixing
# Monarch sequence mixer: 67K params/block vs 262K for attention

[model]
arch = "monarch"
embed_dim = 256
n_layers = 8
n_heads = 4  # unused by Monarch, kept for struct compat
head_dim = 64  # unused by Monarch
n_monarch_heads = 8
conv_kernel_size = 4
ffn_mult = 4
context_length = 256
dropout = 0.0
bias = false
weight_tying = true

[training]
optimizer = "adamw"
lr = 6e-4
min_lr = 6e-5
warmup_steps = 500
max_steps = 12305
batch_size = 32
grad_clip = 1.0
precision = "f16"
eval_interval = 500
eval_steps = 25
checkpoint_interval = 2000
seed = 42

[training.curriculum]
enabled = false

[training.coreset]
enabled = false

[data]
train_path = "../text-pipeline/output/train.txt"
val_path = "../text-pipeline/output/val.txt"
tokenizer_dir = "../text-pipeline/output"

[inference]
precision = "f16"
compile = false
temperature = 0.8
top_k = 40
max_new_tokens = 500