Arko007's picture
step 2000
fc5f706 verified
{
"step": 2000,
"tokens_consumed": 1048576000,
"sequences_consumed": 512000,
"config": {
"vocab_size": 129280,
"dim": 1536,
"n_layers": 16,
"n_heads": 12,
"head_dim": 128,
"rope_head_dim": 32,
"q_lora_rank": 384,
"o_groups": 4,
"o_lora_rank": 256,
"norm_eps": 1e-06,
"n_routed_experts": 12,
"n_shared_experts": 1,
"n_activated_experts": 2,
"moe_inter_dim": 1408,
"score_func": "sqrtsoftplus",
"route_scale": 2.5,
"swiglu_limit": 10.0,
"n_dense_layers": 2,
"load_balance_alpha": 0.0001,
"hc_mult": 3,
"hc_sinkhorn_iters": 8,
"hc_eps": 1e-06,
"window_size": 128,
"compress_ratio": 4,
"n_mtp_layers": 1,
"mtp_loss_weight": 0.3,
"rope_theta": 10000.0,
"rope_factor": 1.0,
"original_seq_len": 0,
"beta_fast": 32,
"beta_slow": 1,
"ctx_phase_boundaries": [
60000,
80000,
100000
],
"ctx_phase_lengths": [
2048,
4096,
8192
],
"yarn_scale": 40.0,
"yarn_alpha": 1.0,
"yarn_beta": 32.0,
"yarn_original_seq_len": 4096,
"micro_batch_size": 1,
"gradient_accumulation_steps": 32,
"max_lr": 0.00042,
"min_lr": 4.2e-05,
"warmup_steps": 2000,
"total_steps": 200000,
"decay_steps": 180000,
"weight_decay": 0.1,
"muon_beta": 0.95,
"muon_ns_steps": 5,
"adam_beta1": 0.9,
"adam_beta2": 0.95,
"grad_clip_norm": 1.0,
"init_std": 0.006,
"dtype": "bfloat16",
"fineweb_config": "default",
"approx_tokens_per_fineweb_doc": 800,
"approx_tokens_per_code_doc": 1200,
"approx_tokens_per_math_doc": 600,
"data_mix_probs": [
0.6,
0.25,
0.15
],
"save_every_steps": 400,
"log_every_steps": 10,
"push_every_steps": 400,
"checkpoint_dir": "/kaggle/working/zenyxv3checkpoints"
}
}