---
# Small LLM ~125M parameters — FP8 variant (B200 TransformerEngine)
# Based on small.yaml; only changed fields are listed explicitly.

model:
  vocab_size: 32000
  d_model: 768
  n_layers: 12
  n_heads: 12
  n_kv_heads: 12  # MHA (same as n_heads)
  max_seq_len: 2048
  rope_theta: 10000.0
  dropout: 0.0
  bias: false
  use_flash_attn: true
  use_fp8: true  # Enable TransformerEngine FP8 kernels

train:
  max_steps: 100000
  batch_size: 8  # per GPU; 8 * 2048 = 16384 tokens → divisible by 8 ✓
  grad_accum_steps: 4  # effective batch = 8 * 8 GPUs * 4 = 256
  lr: 3.0e-4
  weight_decay: 0.1
  warmup_steps: 2000
  max_grad_norm: 1.0
  log_interval: 10
  save_interval: 1000
  eval_interval: 500
  use_amp: false  # fp8_autocast replaces torch.autocast
  compile_model: false  # torch.compile + TE 2.10 stability not verified
  fp8_amax_history_len: 16
  fp8_amax_compute_algo: "max"
  fp8_format: "MXFP8"  # B200 native block scaling (better than HYBRID on Blackwell)

tokenizer:
  vocab_size: 32000
  type: bpe