# SHOREKEEPER — configs/training.yaml
# Source commit 73400c8 (geoore): "Restructure to src/ layout with
# attention, per-layer MoE, and working chat"
---
# Training hyperparameters, checkpointing policy, and GRPO fine-tuning
# settings.
# NOTE(review): the original file's indentation was lost; `checkpoint` and
# `grpo` are assumed to be top-level siblings of `training` — confirm
# against the config loader before relying on this structure.
training:
  batch_size: 2
  gradient_accumulation: 16  # effective batch = batch_size * 16
  # Exponent floats are written with a decimal point (3.0e-4, not 3e-4):
  # YAML 1.1 resolvers such as PyYAML require the dot, otherwise the value
  # loads as the string "3e-4" instead of a float.
  learning_rate: 3.0e-4
  min_lr: 3.0e-5  # floor for the LR schedule
  warmup_steps: 2000
  total_steps: 250000
  weight_decay: 0.1
  beta1: 0.9  # Adam-style first-moment coefficient
  beta2: 0.95  # Adam-style second-moment coefficient
  grad_clip: 1.0  # max gradient norm

checkpoint:
  save_every_steps: 5000
  keep_last_n: 3  # most recent checkpoints to retain
  keep_best_n: 2  # best-scoring checkpoints to retain
  max_space_gb: 50.0  # disk budget for checkpoint_dir
  save_optimizer: true
  save_scheduler: true
  save_experts_only: false
  checkpoint_dir: "./outputs/checkpoints"
  resume_from: null  # path to a checkpoint to resume, or null to start fresh

grpo:
  group_size: 8
  epsilon: 0.2  # presumably the PPO-style clip range — confirm in trainer
  beta: 0.04  # presumably a KL penalty coefficient — confirm in trainer
  learning_rate: 1.0e-6