# 5M config — Chinchilla-optimal BPE training (~5M params)
# Target: 100M tokens at 20 tokens/param
# RTX 3060 12GB: batch=32, ctx=256 → 8192 tokens/step → ~12300 steps

[model]
arch = "transformer"
embed_dim = 256
n_layers = 6
n_heads = 4
head_dim = 64  # embed_dim / n_heads
ffn_mult = 4
context_length = 256
dropout = 0.0
bias = false
weight_tying = true

[training]
optimizer = "adamw"
lr = 6e-4
min_lr = 6e-5
warmup_steps = 500
# batch_size * context_length = 8192 tokens/step; 12305 steps ≈ 100M-token target
max_steps = 12305
batch_size = 32
grad_clip = 1.0
precision = "f16"
eval_interval = 500
eval_steps = 25
checkpoint_interval = 2000
seed = 42

[training.curriculum]
enabled = false

[training.coreset]
enabled = false

[data]
train_path = "../text-pipeline/output/train.txt"
val_path = "../text-pipeline/output/val.txt"
tokenizer_dir = "../text-pipeline/output"

[inference]
precision = "f16"
compile = false
temperature = 0.8
top_k = 40
max_new_tokens = 500