step 2800
Browse files
checkpoints/step_2800/metadata.json
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"step": 2800,
|
| 3 |
+
"tokens_consumed": 1468006400,
|
| 4 |
+
"sequences_consumed": 716800,
|
| 5 |
+
"config": {
|
| 6 |
+
"vocab_size": 129280,
|
| 7 |
+
"dim": 1536,
|
| 8 |
+
"n_layers": 16,
|
| 9 |
+
"n_heads": 12,
|
| 10 |
+
"head_dim": 128,
|
| 11 |
+
"rope_head_dim": 32,
|
| 12 |
+
"q_lora_rank": 384,
|
| 13 |
+
"o_groups": 4,
|
| 14 |
+
"o_lora_rank": 256,
|
| 15 |
+
"norm_eps": 1e-06,
|
| 16 |
+
"n_routed_experts": 12,
|
| 17 |
+
"n_shared_experts": 1,
|
| 18 |
+
"n_activated_experts": 2,
|
| 19 |
+
"moe_inter_dim": 1408,
|
| 20 |
+
"score_func": "sqrtsoftplus",
|
| 21 |
+
"route_scale": 2.5,
|
| 22 |
+
"swiglu_limit": 10.0,
|
| 23 |
+
"n_dense_layers": 2,
|
| 24 |
+
"load_balance_alpha": 0.0001,
|
| 25 |
+
"hc_mult": 3,
|
| 26 |
+
"hc_sinkhorn_iters": 8,
|
| 27 |
+
"hc_eps": 1e-06,
|
| 28 |
+
"window_size": 128,
|
| 29 |
+
"compress_ratio": 4,
|
| 30 |
+
"n_mtp_layers": 1,
|
| 31 |
+
"mtp_loss_weight": 0.3,
|
| 32 |
+
"rope_theta": 10000.0,
|
| 33 |
+
"rope_factor": 1.0,
|
| 34 |
+
"original_seq_len": 0,
|
| 35 |
+
"beta_fast": 32,
|
| 36 |
+
"beta_slow": 1,
|
| 37 |
+
"ctx_phase_boundaries": [
|
| 38 |
+
60000,
|
| 39 |
+
80000,
|
| 40 |
+
100000
|
| 41 |
+
],
|
| 42 |
+
"ctx_phase_lengths": [
|
| 43 |
+
2048,
|
| 44 |
+
4096,
|
| 45 |
+
8192
|
| 46 |
+
],
|
| 47 |
+
"yarn_scale": 40.0,
|
| 48 |
+
"yarn_alpha": 1.0,
|
| 49 |
+
"yarn_beta": 32.0,
|
| 50 |
+
"yarn_original_seq_len": 4096,
|
| 51 |
+
"micro_batch_size": 1,
|
| 52 |
+
"gradient_accumulation_steps": 32,
|
| 53 |
+
"max_lr": 0.00042,
|
| 54 |
+
"min_lr": 4.2e-05,
|
| 55 |
+
"warmup_steps": 2000,
|
| 56 |
+
"total_steps": 200000,
|
| 57 |
+
"decay_steps": 180000,
|
| 58 |
+
"weight_decay": 0.1,
|
| 59 |
+
"muon_beta": 0.95,
|
| 60 |
+
"muon_ns_steps": 5,
|
| 61 |
+
"adam_beta1": 0.9,
|
| 62 |
+
"adam_beta2": 0.95,
|
| 63 |
+
"grad_clip_norm": 1.0,
|
| 64 |
+
"init_std": 0.006,
|
| 65 |
+
"dtype": "bfloat16",
|
| 66 |
+
"fineweb_config": "default",
|
| 67 |
+
"approx_tokens_per_fineweb_doc": 800,
|
| 68 |
+
"approx_tokens_per_code_doc": 1200,
|
| 69 |
+
"approx_tokens_per_math_doc": 600,
|
| 70 |
+
"data_mix_probs": [
|
| 71 |
+
0.6,
|
| 72 |
+
0.25,
|
| 73 |
+
0.15
|
| 74 |
+
],
|
| 75 |
+
"save_every_steps": 400,
|
| 76 |
+
"log_every_steps": 10,
|
| 77 |
+
"push_every_steps": 400,
|
| 78 |
+
"checkpoint_dir": "/kaggle/working/zenyxv3checkpoints"
|
| 79 |
+
}
|
| 80 |
+
}
|
checkpoints/step_2800/opt_state.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e69f2d9252378fea67694546cf4b05ce5f57ec838bdcc65fc0ad0069920f745
|
| 3 |
+
size 3154940698
|
checkpoints/step_2800/params.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0b8ba4b6facd725943eb0eebca6ee1028a87f0e9f94e424c0250fec3c920beec
|
| 3 |
+
size 3154768094
|