---
# Small LLM ~125M parameters — FP8 variant (B200 TransformerEngine)
# Based on small.yaml; only changed fields are listed explicitly.

model:
  vocab_size: 32000
  d_model: 768
  n_layers: 12
  n_heads: 12
  n_kv_heads: 12  # MHA (same as n_heads)
  max_seq_len: 2048
  rope_theta: 10000.0
  dropout: 0.0
  bias: false
  use_flash_attn: true
  use_fp8: true  # Enable TransformerEngine FP8 kernels

train:
  max_steps: 100000
  batch_size: 8  # per GPU; 8 * 2048 = 16384 tokens → divisible by 8 ✓
  grad_accum_steps: 4  # effective batch = 8 * 8 GPUs * 4 = 256
  lr: 3.0e-4
  weight_decay: 0.1
  warmup_steps: 2000
  max_grad_norm: 1.0
  log_interval: 10
  save_interval: 1000
  eval_interval: 500
  use_amp: false  # fp8_autocast replaces torch.autocast
  compile_model: false  # torch.compile + TE 2.10 stability not verified
  fp8_amax_history_len: 16
  fp8_amax_compute_algo: "max"
  fp8_format: "MXFP8"  # B200 native block scaling (better than HYBRID on Blackwell)

tokenizer:
  vocab_size: 32000
  type: bpe