---
# frankenstallm — source/configs/small_fp8.yaml
# provenance: pathcosmos, "Upload folder using huggingface_hub (#16)", commit 09ea133
# Small LLM ~125M parameters — FP8 variant (B200 TransformerEngine)
# Based on small.yaml; only changed fields are listed explicitly.
model:
  vocab_size: 32000
  d_model: 768
  n_layers: 12
  n_heads: 12
  n_kv_heads: 12  # MHA (same as n_heads)
  max_seq_len: 2048
  rope_theta: 10000.0
  dropout: 0.0
  bias: false
  use_flash_attn: true
  use_fp8: true  # Enable TransformerEngine FP8 kernels
train:
  max_steps: 100000
  batch_size: 8  # per GPU; 8 * 2048 = 16384 tokens → divisible by 8 ✓
  grad_accum_steps: 4  # effective batch = 8 * 8 GPUs * 4 = 256
  lr: 3.0e-4
  weight_decay: 0.1
  warmup_steps: 2000
  max_grad_norm: 1.0
  log_interval: 10
  save_interval: 1000
  eval_interval: 500
  use_amp: false  # fp8_autocast replaces torch.autocast
  compile_model: false  # torch.compile + TE 2.10 stability not verified
  fp8_amax_history_len: 16
  fp8_amax_compute_algo: "max"
  fp8_format: "MXFP8"  # B200 native block scaling (better than HYBRID on Blackwell)
tokenizer:
  vocab_size: 32000
  type: bpe