---
# Training configuration.
# NOTE(review): reconstructed from a pipe-mangled (markdown-table) dump in which
# all indentation was lost. `checkpoint` and `grpo` are restored as top-level
# siblings of `training` — confirm against the consuming loader's schema.

# Core optimizer / schedule hyperparameters.
training:
  batch_size: 2
  gradient_accumulation: 16  # effective batch size = batch_size * 16
  # Exponent floats are written with an explicit decimal point (3.0e-4, not
  # 3e-4): YAML 1.1 resolvers such as PyYAML treat a bare `3e-4` as a string.
  learning_rate: 3.0e-4
  min_lr: 3.0e-5             # floor for the LR schedule
  warmup_steps: 2000
  total_steps: 250000
  weight_decay: 0.1
  beta1: 0.9                 # Adam first-moment decay
  beta2: 0.95                # Adam second-moment decay
  grad_clip: 1.0             # gradient-norm clipping threshold

# Checkpoint retention and persistence policy.
checkpoint:
  save_every_steps: 5000
  keep_last_n: 3             # most recent checkpoints retained
  keep_best_n: 2             # best-metric checkpoints retained
  max_space_gb: 50.0         # disk budget for the checkpoint directory
  save_optimizer: true
  save_scheduler: true
  save_experts_only: false
  checkpoint_dir: "./outputs/checkpoints"
  resume_from: null          # path to a checkpoint to resume from, or null

# GRPO (Group Relative Policy Optimization) fine-tuning settings.
grpo:
  group_size: 8              # completions sampled per prompt group
  epsilon: 0.2               # clipping range for the policy ratio
  beta: 0.04                 # KL-penalty coefficient
  learning_rate: 1.0e-6      # decimal point required for float parsing (see above)