aetheris / config.yaml
rcgalbo's picture
Full vocab config for SFT model
709e6da verified
raw
history blame contribute delete
316 Bytes
# Model hyperparameter configuration: a flat YAML mapping with keys in alphabetical order.
# NOTE(review): the code that loads this file is not visible here, so every description
# below is inferred from the key name alone — confirm against the loader before relying on it.

# Presumably enables activation checkpointing specifically for the SSM layers — TODO confirm.
checkpoint_ssm_layers: true
# Presumably the hidden width of the feed-forward / expert MLP — TODO confirm.
d_ff: 3072
# In this config d_inner == ssm_expand * d_model (2 * 1024 = 2048).
# NOTE(review): if the loader derives d_inner itself, keep these three values in sync — verify.
d_inner: 2048
d_model: 1024
# Presumably the parameter/compute precision used when building the model — TODO confirm.
dtype: float16
gradient_checkpointing: true
# Presumably the weight of an auxiliary expert load-balancing loss — TODO confirm.
load_balancing_coef: 0.01
max_seq_len: 2048
n_layer: 24
# num_experts together with top_k (below) looks like mixture-of-experts routing:
# 4 experts, 1 selected per token — presumably; verify against the router implementation.
num_experts: 4
# Presumably the weight of a router z-loss regularizer — TODO confirm.
router_z_loss_coef: 0.001
ssm_d_state: 16
# See d_inner above: 2 * d_model matches d_inner in this config.
ssm_expand: 2
top_k: 1
use_cpu_offload: false
use_flash_attention: false
# NOTE(review): must presumably match the tokenizer's vocabulary size and the checkpoint's
# embedding table — confirm before changing.
vocab_size: 261019