Upload 7 files
#1
by NLPGroupProject - opened
- GPU_Run_Checkpoints/final_c2_muon_bs512_lr12_seed3_mix3to1/architecture.txt +300 -0
- GPU_Run_Checkpoints/final_c2_muon_bs512_lr12_seed3_mix3to1/config_snapshot.json +131 -0
- GPU_Run_Checkpoints/final_c2_muon_bs512_lr12_seed3_mix3to1/eval_metrics.jsonl +20 -0
- GPU_Run_Checkpoints/final_c2_muon_bs512_lr12_seed3_mix3to1/events.jsonl +29 -0
- GPU_Run_Checkpoints/final_c2_muon_bs512_lr12_seed3_mix3to1/metrics.png +3 -0
- GPU_Run_Checkpoints/final_c2_muon_bs512_lr12_seed3_mix3to1/results.md +24 -0
- GPU_Run_Checkpoints/final_c2_muon_bs512_lr12_seed3_mix3to1/train_metrics.jsonl +205 -0
GPU_Run_Checkpoints/final_c2_muon_bs512_lr12_seed3_mix3to1/architecture.txt
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
=== Raw Model ===
|
| 2 |
+
GPT(
|
| 3 |
+
(transformer): ModuleDict(
|
| 4 |
+
(drop): Dropout(p=0.0, inplace=False)
|
| 5 |
+
(h): ModuleList(
|
| 6 |
+
(0-17): 18 x Block(
|
| 7 |
+
(ln_1): RMSNorm()
|
| 8 |
+
(attn): CausalSelfAttention(
|
| 9 |
+
(rotary): RotaryEmbedding()
|
| 10 |
+
(q_proj): Linear(in_features=320, out_features=320, bias=False)
|
| 11 |
+
(k_proj): Linear(in_features=320, out_features=64, bias=False)
|
| 12 |
+
(v_proj): Linear(in_features=320, out_features=64, bias=False)
|
| 13 |
+
(c_proj): Linear(in_features=320, out_features=320, bias=False)
|
| 14 |
+
(resid_dropout): Dropout(p=0.0, inplace=False)
|
| 15 |
+
)
|
| 16 |
+
(ln_2): RMSNorm()
|
| 17 |
+
(mlp): MLP(
|
| 18 |
+
(c_fc): Linear(in_features=320, out_features=2048, bias=False)
|
| 19 |
+
(c_proj): Linear(in_features=1024, out_features=320, bias=False)
|
| 20 |
+
(dropout): Dropout(p=0.0, inplace=False)
|
| 21 |
+
)
|
| 22 |
+
)
|
| 23 |
+
)
|
| 24 |
+
(ln_f): RMSNorm()
|
| 25 |
+
(wte): Embedding(50304, 320)
|
| 26 |
+
)
|
| 27 |
+
(lm_head): Linear(in_features=320, out_features=50304, bias=False)
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
=== Forward Summary (torchinfo, uncompiled model) ===
|
| 31 |
+
====================================================================================================
|
| 32 |
+
Layer (type:depth-idx) Output Shape Param #
|
| 33 |
+
====================================================================================================
|
| 34 |
+
GPT [1, 1, 50304] --
|
| 35 |
+
├─ModuleDict: 1-1 -- --
|
| 36 |
+
│ └─Embedding: 2-1 [1, 1024, 320] 16,097,280
|
| 37 |
+
│ └─Dropout: 2-2 [1, 1024, 320] --
|
| 38 |
+
│ └─ModuleList: 2-3 -- --
|
| 39 |
+
│ │ └─Block: 3-1 [1, 1024, 320] --
|
| 40 |
+
│ │ │ └─RMSNorm: 4-1 [1, 1024, 320] 320
|
| 41 |
+
│ │ │ └─CausalSelfAttention: 4-2 [1, 1024, 320] --
|
| 42 |
+
│ │ │ │ └─Linear: 5-1 [1, 1024, 320] 102,400
|
| 43 |
+
│ │ │ │ └─Linear: 5-2 [1, 1024, 64] 20,480
|
| 44 |
+
│ │ │ │ └─Linear: 5-3 [1, 1024, 64] 20,480
|
| 45 |
+
│ │ │ │ └─RotaryEmbedding: 5-4 [1, 1, 1024, 64] --
|
| 46 |
+
│ │ │ │ └─Linear: 5-5 [1, 1024, 320] 102,400
|
| 47 |
+
│ │ │ │ └─Dropout: 5-6 [1, 1024, 320] --
|
| 48 |
+
│ │ │ └─RMSNorm: 4-3 [1, 1024, 320] 320
|
| 49 |
+
│ │ │ └─MLP: 4-4 [1, 1024, 320] --
|
| 50 |
+
│ │ │ │ └─Linear: 5-7 [1, 1024, 2048] 655,360
|
| 51 |
+
│ │ │ │ └─Linear: 5-8 [1, 1024, 320] 327,680
|
| 52 |
+
│ │ │ │ └─Dropout: 5-9 [1, 1024, 320] --
|
| 53 |
+
│ │ └─Block: 3-2 [1, 1024, 320] --
|
| 54 |
+
│ │ │ └─RMSNorm: 4-5 [1, 1024, 320] 320
|
| 55 |
+
│ │ │ └─CausalSelfAttention: 4-6 [1, 1024, 320] --
|
| 56 |
+
│ │ │ │ └─Linear: 5-10 [1, 1024, 320] 102,400
|
| 57 |
+
│ │ │ │ └─Linear: 5-11 [1, 1024, 64] 20,480
|
| 58 |
+
│ │ │ │ └─Linear: 5-12 [1, 1024, 64] 20,480
|
| 59 |
+
│ │ │ │ └─RotaryEmbedding: 5-13 [1, 1, 1024, 64] --
|
| 60 |
+
│ │ │ │ └─Linear: 5-14 [1, 1024, 320] 102,400
|
| 61 |
+
│ │ │ │ └─Dropout: 5-15 [1, 1024, 320] --
|
| 62 |
+
│ │ │ └─RMSNorm: 4-7 [1, 1024, 320] 320
|
| 63 |
+
│ │ │ └─MLP: 4-8 [1, 1024, 320] --
|
| 64 |
+
│ │ │ │ └─Linear: 5-16 [1, 1024, 2048] 655,360
|
| 65 |
+
│ │ │ │ └─Linear: 5-17 [1, 1024, 320] 327,680
|
| 66 |
+
│ │ │ │ └─Dropout: 5-18 [1, 1024, 320] --
|
| 67 |
+
│ │ └─Block: 3-3 [1, 1024, 320] --
|
| 68 |
+
│ │ │ └─RMSNorm: 4-9 [1, 1024, 320] 320
|
| 69 |
+
│ │ │ └─CausalSelfAttention: 4-10 [1, 1024, 320] --
|
| 70 |
+
│ │ │ │ └─Linear: 5-19 [1, 1024, 320] 102,400
|
| 71 |
+
│ │ │ │ └─Linear: 5-20 [1, 1024, 64] 20,480
|
| 72 |
+
│ │ │ │ └─Linear: 5-21 [1, 1024, 64] 20,480
|
| 73 |
+
│ │ │ │ └─RotaryEmbedding: 5-22 [1, 1, 1024, 64] --
|
| 74 |
+
│ │ │ │ └─Linear: 5-23 [1, 1024, 320] 102,400
|
| 75 |
+
│ │ │ │ └─Dropout: 5-24 [1, 1024, 320] --
|
| 76 |
+
│ │ │ └─RMSNorm: 4-11 [1, 1024, 320] 320
|
| 77 |
+
│ │ │ └─MLP: 4-12 [1, 1024, 320] --
|
| 78 |
+
│ │ │ │ └─Linear: 5-25 [1, 1024, 2048] 655,360
|
| 79 |
+
│ │ │ │ └─Linear: 5-26 [1, 1024, 320] 327,680
|
| 80 |
+
│ │ │ │ └─Dropout: 5-27 [1, 1024, 320] --
|
| 81 |
+
│ │ └─Block: 3-4 [1, 1024, 320] --
|
| 82 |
+
│ │ │ └─RMSNorm: 4-13 [1, 1024, 320] 320
|
| 83 |
+
│ │ │ └─CausalSelfAttention: 4-14 [1, 1024, 320] --
|
| 84 |
+
│ │ │ │ └─Linear: 5-28 [1, 1024, 320] 102,400
|
| 85 |
+
│ │ │ │ └─Linear: 5-29 [1, 1024, 64] 20,480
|
| 86 |
+
│ │ │ │ └─Linear: 5-30 [1, 1024, 64] 20,480
|
| 87 |
+
│ │ │ │ └─RotaryEmbedding: 5-31 [1, 1, 1024, 64] --
|
| 88 |
+
│ │ │ │ └─Linear: 5-32 [1, 1024, 320] 102,400
|
| 89 |
+
│ │ │ │ └─Dropout: 5-33 [1, 1024, 320] --
|
| 90 |
+
│ │ │ └─RMSNorm: 4-15 [1, 1024, 320] 320
|
| 91 |
+
│ │ │ └─MLP: 4-16 [1, 1024, 320] --
|
| 92 |
+
│ │ │ │ └─Linear: 5-34 [1, 1024, 2048] 655,360
|
| 93 |
+
│ │ │ │ └─Linear: 5-35 [1, 1024, 320] 327,680
|
| 94 |
+
│ │ │ │ └─Dropout: 5-36 [1, 1024, 320] --
|
| 95 |
+
│ │ └─Block: 3-5 [1, 1024, 320] --
|
| 96 |
+
│ │ │ └─RMSNorm: 4-17 [1, 1024, 320] 320
|
| 97 |
+
│ │ │ └─CausalSelfAttention: 4-18 [1, 1024, 320] --
|
| 98 |
+
│ │ │ │ └─Linear: 5-37 [1, 1024, 320] 102,400
|
| 99 |
+
│ │ │ │ └─Linear: 5-38 [1, 1024, 64] 20,480
|
| 100 |
+
│ │ │ │ └─Linear: 5-39 [1, 1024, 64] 20,480
|
| 101 |
+
│ │ │ │ └─RotaryEmbedding: 5-40 [1, 1, 1024, 64] --
|
| 102 |
+
│ │ │ │ └─Linear: 5-41 [1, 1024, 320] 102,400
|
| 103 |
+
│ │ │ │ └─Dropout: 5-42 [1, 1024, 320] --
|
| 104 |
+
│ │ │ └─RMSNorm: 4-19 [1, 1024, 320] 320
|
| 105 |
+
│ │ │ └─MLP: 4-20 [1, 1024, 320] --
|
| 106 |
+
│ │ │ │ └─Linear: 5-43 [1, 1024, 2048] 655,360
|
| 107 |
+
│ │ │ │ └─Linear: 5-44 [1, 1024, 320] 327,680
|
| 108 |
+
│ │ │ │ └─Dropout: 5-45 [1, 1024, 320] --
|
| 109 |
+
│ │ └─Block: 3-6 [1, 1024, 320] --
|
| 110 |
+
│ │ │ └─RMSNorm: 4-21 [1, 1024, 320] 320
|
| 111 |
+
│ │ │ └─CausalSelfAttention: 4-22 [1, 1024, 320] --
|
| 112 |
+
│ │ │ │ └─Linear: 5-46 [1, 1024, 320] 102,400
|
| 113 |
+
│ │ │ │ └─Linear: 5-47 [1, 1024, 64] 20,480
|
| 114 |
+
│ │ │ │ └─Linear: 5-48 [1, 1024, 64] 20,480
|
| 115 |
+
│ │ │ │ └─RotaryEmbedding: 5-49 [1, 1, 1024, 64] --
|
| 116 |
+
│ │ │ │ └─Linear: 5-50 [1, 1024, 320] 102,400
|
| 117 |
+
│ │ │ │ └─Dropout: 5-51 [1, 1024, 320] --
|
| 118 |
+
│ │ │ └─RMSNorm: 4-23 [1, 1024, 320] 320
|
| 119 |
+
│ │ │ └─MLP: 4-24 [1, 1024, 320] --
|
| 120 |
+
│ │ │ │ └─Linear: 5-52 [1, 1024, 2048] 655,360
|
| 121 |
+
│ │ │ │ └─Linear: 5-53 [1, 1024, 320] 327,680
|
| 122 |
+
│ │ │ │ └─Dropout: 5-54 [1, 1024, 320] --
|
| 123 |
+
│ │ └─Block: 3-7 [1, 1024, 320] --
|
| 124 |
+
│ │ │ └─RMSNorm: 4-25 [1, 1024, 320] 320
|
| 125 |
+
│ │ │ └─CausalSelfAttention: 4-26 [1, 1024, 320] --
|
| 126 |
+
│ │ │ │ └─Linear: 5-55 [1, 1024, 320] 102,400
|
| 127 |
+
│ │ │ │ └─Linear: 5-56 [1, 1024, 64] 20,480
|
| 128 |
+
│ │ │ │ └─Linear: 5-57 [1, 1024, 64] 20,480
|
| 129 |
+
│ │ │ │ └─RotaryEmbedding: 5-58 [1, 1, 1024, 64] --
|
| 130 |
+
│ │ │ │ └─Linear: 5-59 [1, 1024, 320] 102,400
|
| 131 |
+
│ │ │ │ └─Dropout: 5-60 [1, 1024, 320] --
|
| 132 |
+
│ │ │ └─RMSNorm: 4-27 [1, 1024, 320] 320
|
| 133 |
+
│ │ │ └─MLP: 4-28 [1, 1024, 320] --
|
| 134 |
+
│ │ │ │ └─Linear: 5-61 [1, 1024, 2048] 655,360
|
| 135 |
+
│ │ │ │ └─Linear: 5-62 [1, 1024, 320] 327,680
|
| 136 |
+
│ │ │ │ └─Dropout: 5-63 [1, 1024, 320] --
|
| 137 |
+
│ │ └─Block: 3-8 [1, 1024, 320] --
|
| 138 |
+
│ │ │ └─RMSNorm: 4-29 [1, 1024, 320] 320
|
| 139 |
+
│ │ │ └─CausalSelfAttention: 4-30 [1, 1024, 320] --
|
| 140 |
+
│ │ │ │ └─Linear: 5-64 [1, 1024, 320] 102,400
|
| 141 |
+
│ │ │ │ └─Linear: 5-65 [1, 1024, 64] 20,480
|
| 142 |
+
│ │ │ │ └─Linear: 5-66 [1, 1024, 64] 20,480
|
| 143 |
+
│ │ │ │ └─RotaryEmbedding: 5-67 [1, 1, 1024, 64] --
|
| 144 |
+
│ │ │ │ └─Linear: 5-68 [1, 1024, 320] 102,400
|
| 145 |
+
│ │ │ │ └─Dropout: 5-69 [1, 1024, 320] --
|
| 146 |
+
│ │ │ └─RMSNorm: 4-31 [1, 1024, 320] 320
|
| 147 |
+
│ │ │ └─MLP: 4-32 [1, 1024, 320] --
|
| 148 |
+
│ │ │ │ └─Linear: 5-70 [1, 1024, 2048] 655,360
|
| 149 |
+
│ │ │ │ └─Linear: 5-71 [1, 1024, 320] 327,680
|
| 150 |
+
│ │ │ │ └─Dropout: 5-72 [1, 1024, 320] --
|
| 151 |
+
│ │ └─Block: 3-9 [1, 1024, 320] --
|
| 152 |
+
│ │ │ └─RMSNorm: 4-33 [1, 1024, 320] 320
|
| 153 |
+
│ │ │ └─CausalSelfAttention: 4-34 [1, 1024, 320] --
|
| 154 |
+
│ │ │ │ └─Linear: 5-73 [1, 1024, 320] 102,400
|
| 155 |
+
│ │ │ │ └─Linear: 5-74 [1, 1024, 64] 20,480
|
| 156 |
+
│ │ │ │ └─Linear: 5-75 [1, 1024, 64] 20,480
|
| 157 |
+
│ │ │ │ └─RotaryEmbedding: 5-76 [1, 1, 1024, 64] --
|
| 158 |
+
│ │ │ │ └─Linear: 5-77 [1, 1024, 320] 102,400
|
| 159 |
+
│ │ │ │ └─Dropout: 5-78 [1, 1024, 320] --
|
| 160 |
+
│ │ │ └─RMSNorm: 4-35 [1, 1024, 320] 320
|
| 161 |
+
│ │ │ └─MLP: 4-36 [1, 1024, 320] --
|
| 162 |
+
│ │ │ │ └─Linear: 5-79 [1, 1024, 2048] 655,360
|
| 163 |
+
│ │ │ │ └─Linear: 5-80 [1, 1024, 320] 327,680
|
| 164 |
+
│ │ │ │ └─Dropout: 5-81 [1, 1024, 320] --
|
| 165 |
+
│ │ └─Block: 3-10 [1, 1024, 320] --
|
| 166 |
+
│ │ │ └─RMSNorm: 4-37 [1, 1024, 320] 320
|
| 167 |
+
│ │ │ └─CausalSelfAttention: 4-38 [1, 1024, 320] --
|
| 168 |
+
│ │ │ │ └─Linear: 5-82 [1, 1024, 320] 102,400
|
| 169 |
+
│ │ │ │ └─Linear: 5-83 [1, 1024, 64] 20,480
|
| 170 |
+
│ │ │ │ └─Linear: 5-84 [1, 1024, 64] 20,480
|
| 171 |
+
│ │ │ │ └─RotaryEmbedding: 5-85 [1, 1, 1024, 64] --
|
| 172 |
+
│ │ │ │ └─Linear: 5-86 [1, 1024, 320] 102,400
|
| 173 |
+
│ │ │ │ └─Dropout: 5-87 [1, 1024, 320] --
|
| 174 |
+
│ │ │ └─RMSNorm: 4-39 [1, 1024, 320] 320
|
| 175 |
+
│ │ │ └─MLP: 4-40 [1, 1024, 320] --
|
| 176 |
+
│ │ │ │ └─Linear: 5-88 [1, 1024, 2048] 655,360
|
| 177 |
+
│ │ │ │ └─Linear: 5-89 [1, 1024, 320] 327,680
|
| 178 |
+
│ │ │ │ └─Dropout: 5-90 [1, 1024, 320] --
|
| 179 |
+
│ │ └─Block: 3-11 [1, 1024, 320] --
|
| 180 |
+
│ │ │ └─RMSNorm: 4-41 [1, 1024, 320] 320
|
| 181 |
+
│ │ │ └─CausalSelfAttention: 4-42 [1, 1024, 320] --
|
| 182 |
+
│ │ │ │ └─Linear: 5-91 [1, 1024, 320] 102,400
|
| 183 |
+
│ │ │ │ └─Linear: 5-92 [1, 1024, 64] 20,480
|
| 184 |
+
│ │ │ │ └─Linear: 5-93 [1, 1024, 64] 20,480
|
| 185 |
+
│ │ │ │ └─RotaryEmbedding: 5-94 [1, 1, 1024, 64] --
|
| 186 |
+
│ │ │ │ └─Linear: 5-95 [1, 1024, 320] 102,400
|
| 187 |
+
│ │ │ │ └─Dropout: 5-96 [1, 1024, 320] --
|
| 188 |
+
│ │ │ └─RMSNorm: 4-43 [1, 1024, 320] 320
|
| 189 |
+
│ │ │ └─MLP: 4-44 [1, 1024, 320] --
|
| 190 |
+
│ │ │ │ └─Linear: 5-97 [1, 1024, 2048] 655,360
|
| 191 |
+
│ │ │ │ └─Linear: 5-98 [1, 1024, 320] 327,680
|
| 192 |
+
│ │ │ │ └─Dropout: 5-99 [1, 1024, 320] --
|
| 193 |
+
│ │ └─Block: 3-12 [1, 1024, 320] --
|
| 194 |
+
│ │ │ └─RMSNorm: 4-45 [1, 1024, 320] 320
|
| 195 |
+
│ │ │ └─CausalSelfAttention: 4-46 [1, 1024, 320] --
|
| 196 |
+
│ │ │ │ └─Linear: 5-100 [1, 1024, 320] 102,400
|
| 197 |
+
│ │ │ │ └─Linear: 5-101 [1, 1024, 64] 20,480
|
| 198 |
+
│ │ │ │ └─Linear: 5-102 [1, 1024, 64] 20,480
|
| 199 |
+
│ │ │ │ └─RotaryEmbedding: 5-103 [1, 1, 1024, 64] --
|
| 200 |
+
│ │ │ │ └─Linear: 5-104 [1, 1024, 320] 102,400
|
| 201 |
+
│ │ │ │ └─Dropout: 5-105 [1, 1024, 320] --
|
| 202 |
+
│ │ │ └─RMSNorm: 4-47 [1, 1024, 320] 320
|
| 203 |
+
│ │ │ └─MLP: 4-48 [1, 1024, 320] --
|
| 204 |
+
│ │ │ │ └─Linear: 5-106 [1, 1024, 2048] 655,360
|
| 205 |
+
│ │ │ │ └─Linear: 5-107 [1, 1024, 320] 327,680
|
| 206 |
+
│ │ │ │ └─Dropout: 5-108 [1, 1024, 320] --
|
| 207 |
+
│ │ └─Block: 3-13 [1, 1024, 320] --
|
| 208 |
+
│ │ │ └─RMSNorm: 4-49 [1, 1024, 320] 320
|
| 209 |
+
│ │ │ └─CausalSelfAttention: 4-50 [1, 1024, 320] --
|
| 210 |
+
│ │ │ │ └─Linear: 5-109 [1, 1024, 320] 102,400
|
| 211 |
+
│ │ │ │ └─Linear: 5-110 [1, 1024, 64] 20,480
|
| 212 |
+
│ │ │ │ └─Linear: 5-111 [1, 1024, 64] 20,480
|
| 213 |
+
│ │ │ │ └─RotaryEmbedding: 5-112 [1, 1, 1024, 64] --
|
| 214 |
+
│ │ │ │ └─Linear: 5-113 [1, 1024, 320] 102,400
|
| 215 |
+
│ │ │ │ └─Dropout: 5-114 [1, 1024, 320] --
|
| 216 |
+
│ │ │ └─RMSNorm: 4-51 [1, 1024, 320] 320
|
| 217 |
+
│ │ │ └─MLP: 4-52 [1, 1024, 320] --
|
| 218 |
+
│ │ │ │ └─Linear: 5-115 [1, 1024, 2048] 655,360
|
| 219 |
+
│ │ │ │ └─Linear: 5-116 [1, 1024, 320] 327,680
|
| 220 |
+
│ │ │ │ └─Dropout: 5-117 [1, 1024, 320] --
|
| 221 |
+
│ │ └─Block: 3-14 [1, 1024, 320] --
|
| 222 |
+
│ │ │ └─RMSNorm: 4-53 [1, 1024, 320] 320
|
| 223 |
+
│ │ │ └─CausalSelfAttention: 4-54 [1, 1024, 320] --
|
| 224 |
+
│ │ │ │ └─Linear: 5-118 [1, 1024, 320] 102,400
|
| 225 |
+
│ │ │ │ └─Linear: 5-119 [1, 1024, 64] 20,480
|
| 226 |
+
│ │ │ │ └─Linear: 5-120 [1, 1024, 64] 20,480
|
| 227 |
+
│ │ │ │ └─RotaryEmbedding: 5-121 [1, 1, 1024, 64] --
|
| 228 |
+
│ │ │ │ └─Linear: 5-122 [1, 1024, 320] 102,400
|
| 229 |
+
│ │ │ │ └─Dropout: 5-123 [1, 1024, 320] --
|
| 230 |
+
│ │ │ └─RMSNorm: 4-55 [1, 1024, 320] 320
|
| 231 |
+
│ │ │ └─MLP: 4-56 [1, 1024, 320] --
|
| 232 |
+
│ │ │ │ └─Linear: 5-124 [1, 1024, 2048] 655,360
|
| 233 |
+
│ │ │ │ └─Linear: 5-125 [1, 1024, 320] 327,680
|
| 234 |
+
│ │ │ │ └─Dropout: 5-126 [1, 1024, 320] --
|
| 235 |
+
│ │ └─Block: 3-15 [1, 1024, 320] --
|
| 236 |
+
│ │ │ └─RMSNorm: 4-57 [1, 1024, 320] 320
|
| 237 |
+
│ │ │ └─CausalSelfAttention: 4-58 [1, 1024, 320] --
|
| 238 |
+
│ │ │ │ └─Linear: 5-127 [1, 1024, 320] 102,400
|
| 239 |
+
│ │ │ │ └─Linear: 5-128 [1, 1024, 64] 20,480
|
| 240 |
+
│ │ │ │ └─Linear: 5-129 [1, 1024, 64] 20,480
|
| 241 |
+
│ │ │ │ └─RotaryEmbedding: 5-130 [1, 1, 1024, 64] --
|
| 242 |
+
│ │ │ │ └─Linear: 5-131 [1, 1024, 320] 102,400
|
| 243 |
+
│ │ │ │ └─Dropout: 5-132 [1, 1024, 320] --
|
| 244 |
+
│ │ │ └─RMSNorm: 4-59 [1, 1024, 320] 320
|
| 245 |
+
│ │ │ └─MLP: 4-60 [1, 1024, 320] --
|
| 246 |
+
│ │ │ │ └─Linear: 5-133 [1, 1024, 2048] 655,360
|
| 247 |
+
│ │ │ │ └─Linear: 5-134 [1, 1024, 320] 327,680
|
| 248 |
+
│ │ │ │ └─Dropout: 5-135 [1, 1024, 320] --
|
| 249 |
+
│ │ └─Block: 3-16 [1, 1024, 320] --
|
| 250 |
+
│ │ │ └─RMSNorm: 4-61 [1, 1024, 320] 320
|
| 251 |
+
│ │ │ └─CausalSelfAttention: 4-62 [1, 1024, 320] --
|
| 252 |
+
│ │ │ │ └─Linear: 5-136 [1, 1024, 320] 102,400
|
| 253 |
+
│ │ │ │ └─Linear: 5-137 [1, 1024, 64] 20,480
|
| 254 |
+
│ │ │ │ └─Linear: 5-138 [1, 1024, 64] 20,480
|
| 255 |
+
│ │ │ │ └─RotaryEmbedding: 5-139 [1, 1, 1024, 64] --
|
| 256 |
+
│ │ │ │ └─Linear: 5-140 [1, 1024, 320] 102,400
|
| 257 |
+
│ │ │ │ └─Dropout: 5-141 [1, 1024, 320] --
|
| 258 |
+
│ │ │ └─RMSNorm: 4-63 [1, 1024, 320] 320
|
| 259 |
+
│ │ │ └─MLP: 4-64 [1, 1024, 320] --
|
| 260 |
+
│ │ │ │ └─Linear: 5-142 [1, 1024, 2048] 655,360
|
| 261 |
+
│ │ │ │ └─Linear: 5-143 [1, 1024, 320] 327,680
|
| 262 |
+
│ │ │ │ └─Dropout: 5-144 [1, 1024, 320] --
|
| 263 |
+
│ │ └─Block: 3-17 [1, 1024, 320] --
|
| 264 |
+
│ │ │ └─RMSNorm: 4-65 [1, 1024, 320] 320
|
| 265 |
+
│ │ │ └─CausalSelfAttention: 4-66 [1, 1024, 320] --
|
| 266 |
+
│ │ │ │ └─Linear: 5-145 [1, 1024, 320] 102,400
|
| 267 |
+
│ │ │ │ └─Linear: 5-146 [1, 1024, 64] 20,480
|
| 268 |
+
│ │ │ │ └─Linear: 5-147 [1, 1024, 64] 20,480
|
| 269 |
+
│ │ │ │ └─RotaryEmbedding: 5-148 [1, 1, 1024, 64] --
|
| 270 |
+
│ │ │ │ └─Linear: 5-149 [1, 1024, 320] 102,400
|
| 271 |
+
│ │ │ │ └─Dropout: 5-150 [1, 1024, 320] --
|
| 272 |
+
│ │ │ └─RMSNorm: 4-67 [1, 1024, 320] 320
|
| 273 |
+
│ │ │ └─MLP: 4-68 [1, 1024, 320] --
|
| 274 |
+
│ │ │ │ └─Linear: 5-151 [1, 1024, 2048] 655,360
|
| 275 |
+
│ │ │ │ └─Linear: 5-152 [1, 1024, 320] 327,680
|
| 276 |
+
│ │ │ │ └─Dropout: 5-153 [1, 1024, 320] --
|
| 277 |
+
│ │ └─Block: 3-18 [1, 1024, 320] --
|
| 278 |
+
│ │ │ └─RMSNorm: 4-69 [1, 1024, 320] 320
|
| 279 |
+
│ │ │ └─CausalSelfAttention: 4-70 [1, 1024, 320] --
|
| 280 |
+
│ │ │ │ └─Linear: 5-154 [1, 1024, 320] 102,400
|
| 281 |
+
│ │ │ │ └─Linear: 5-155 [1, 1024, 64] 20,480
|
| 282 |
+
│ │ │ │ └─Linear: 5-156 [1, 1024, 64] 20,480
|
| 283 |
+
│ │ │ │ └─RotaryEmbedding: 5-157 [1, 1, 1024, 64] --
|
| 284 |
+
│ │ │ │ └─Linear: 5-158 [1, 1024, 320] 102,400
|
| 285 |
+
│ │ │ │ └─Dropout: 5-159 [1, 1024, 320] --
|
| 286 |
+
│ │ │ └─RMSNorm: 4-71 [1, 1024, 320] 320
|
| 287 |
+
│ │ │ └─MLP: 4-72 [1, 1024, 320] --
|
| 288 |
+
│ │ │ │ └─Linear: 5-160 [1, 1024, 2048] 655,360
|
| 289 |
+
│ │ │ │ └─Linear: 5-161 [1, 1024, 320] 327,680
|
| 290 |
+
│ │ │ │ └─Dropout: 5-162 [1, 1024, 320] --
|
| 291 |
+
│ └─RMSNorm: 2-4 [1, 1024, 320] 320
|
| 292 |
+
├─Linear: 1-2 [1, 1, 50304] 16,097,280
|
| 293 |
+
====================================================================================================
|
| 294 |
+
|
| 295 |
+
=== Parameter Counts (unique tensors) ===
|
| 296 |
+
Total params: 38,227,520
|
| 297 |
+
Trainable params: 38,227,520
|
| 298 |
+
Weight tying (wte = lm_head): True
|
| 299 |
+
Embedding mode: standard tied token embedding
|
| 300 |
+
Note: module-level torchinfo totals may double-count the tied LM head; use the unique counts above.
|
GPU_Run_Checkpoints/final_c2_muon_bs512_lr12_seed3_mix3to1/config_snapshot.json
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"run": {
|
| 3 |
+
"name": "final_c2_muon_bs512_lr12_seed3_mix3to1",
|
| 4 |
+
"artifacts_root": "artifacts/final_c2_muon",
|
| 5 |
+
"resume": true,
|
| 6 |
+
"deterministic": false
|
| 7 |
+
},
|
| 8 |
+
"distributed": {
|
| 9 |
+
"enabled": false,
|
| 10 |
+
"backend": "nccl"
|
| 11 |
+
},
|
| 12 |
+
"preprocessing": {
|
| 13 |
+
"data_dir": "data",
|
| 14 |
+
"processed_dir": "data/processed_owt",
|
| 15 |
+
"processed_dirs": [
|
| 16 |
+
"data/processed_owt",
|
| 17 |
+
"data/processed_nonwiki_2b"
|
| 18 |
+
],
|
| 19 |
+
"log_dir": "logs/preprocessing",
|
| 20 |
+
"train_split": 0.9,
|
| 21 |
+
"dataset_name": "openwebtext",
|
| 22 |
+
"dataset_config_name": null,
|
| 23 |
+
"dataset_split": "train",
|
| 24 |
+
"dataset_text_column": "text",
|
| 25 |
+
"dataset_repo_id": null,
|
| 26 |
+
"dataset_repo_ids": [
|
| 27 |
+
"huiting123/processedOWT",
|
| 28 |
+
"huiting123/extra_mixed"
|
| 29 |
+
],
|
| 30 |
+
"dataset_weights": [
|
| 31 |
+
3.0,
|
| 32 |
+
1.0
|
| 33 |
+
],
|
| 34 |
+
"num_proc": 4,
|
| 35 |
+
"tokenization_num_proc": 0,
|
| 36 |
+
"tokenization_batch_size": 1000,
|
| 37 |
+
"tokenization_chunk_size": 100000,
|
| 38 |
+
"shard_write_batch_size": 5000,
|
| 39 |
+
"seed": 42,
|
| 40 |
+
"subset_size": 0,
|
| 41 |
+
"raw_data_path": null,
|
| 42 |
+
"test_data_path": null,
|
| 43 |
+
"skip_language_filter": false,
|
| 44 |
+
"skip_repetition_filter": false,
|
| 45 |
+
"skip_quality_filter": false,
|
| 46 |
+
"min_words": 100,
|
| 47 |
+
"max_words": 10000,
|
| 48 |
+
"max_non_ascii": 0.3,
|
| 49 |
+
"min_line_uniqueness": 0.7,
|
| 50 |
+
"min_sentence_uniqueness": 0.8,
|
| 51 |
+
"max_train_tokens": 0
|
| 52 |
+
},
|
| 53 |
+
"model": {
|
| 54 |
+
"vocab_size": 50304,
|
| 55 |
+
"n_layers": 18,
|
| 56 |
+
"n_heads": 5,
|
| 57 |
+
"n_kv_heads": 1,
|
| 58 |
+
"n_embd": 320,
|
| 59 |
+
"embedding_dim": null,
|
| 60 |
+
"tie_embeddings": true,
|
| 61 |
+
"context_len": 1024,
|
| 62 |
+
"dropout": 0.0,
|
| 63 |
+
"bias": false,
|
| 64 |
+
"norm_type": "rmsnorm",
|
| 65 |
+
"norm_eps": 1e-05,
|
| 66 |
+
"positional_embedding": "rope",
|
| 67 |
+
"rope_theta": 10000.0,
|
| 68 |
+
"rope_fraction": 1.0,
|
| 69 |
+
"mlp_type": "swiglu",
|
| 70 |
+
"mlp_hidden_mult": 4.0,
|
| 71 |
+
"mlp_hidden_dim": 1024,
|
| 72 |
+
"qk_norm": false,
|
| 73 |
+
"block_style": "sequential"
|
| 74 |
+
},
|
| 75 |
+
"training": {
|
| 76 |
+
"seed": 3,
|
| 77 |
+
"optimizer": "muon",
|
| 78 |
+
"learning_rate": 0.011088,
|
| 79 |
+
"min_lr": 0.0011088,
|
| 80 |
+
"weight_decay": 0.03,
|
| 81 |
+
"beta1": 0.9,
|
| 82 |
+
"beta2": 0.95,
|
| 83 |
+
"muon_momentum": 0.95,
|
| 84 |
+
"muon_ns_steps": 5,
|
| 85 |
+
"grad_clip": 1.0,
|
| 86 |
+
"max_iters": 15259,
|
| 87 |
+
"warmup_steps": 153,
|
| 88 |
+
"lr_schedule": "wsd",
|
| 89 |
+
"wsd_stable_frac": 0.85,
|
| 90 |
+
"batch_size": 4,
|
| 91 |
+
"gradient_accumulation_steps": 128,
|
| 92 |
+
"dtype": "bfloat16",
|
| 93 |
+
"device": "cuda",
|
| 94 |
+
"eval_step_interval": 100,
|
| 95 |
+
"eval_batches": 20,
|
| 96 |
+
"log_interval": 10,
|
| 97 |
+
"max_checkpoints": 3
|
| 98 |
+
},
|
| 99 |
+
"inference": {
|
| 100 |
+
"checkpoint": null,
|
| 101 |
+
"prompt": "",
|
| 102 |
+
"max_tokens": 100,
|
| 103 |
+
"temperature": 1.0,
|
| 104 |
+
"seed": 3,
|
| 105 |
+
"device": "auto",
|
| 106 |
+
"leaderboard": false
|
| 107 |
+
},
|
| 108 |
+
"post_training": {
|
| 109 |
+
"base_checkpoint": null,
|
| 110 |
+
"learning_rate": 1e-05,
|
| 111 |
+
"max_iters": 1000,
|
| 112 |
+
"checkpoint_dir": "checkpoints/post",
|
| 113 |
+
"log_dir": "logs/post"
|
| 114 |
+
},
|
| 115 |
+
"evaluation": {
|
| 116 |
+
"checkpoint": null,
|
| 117 |
+
"batch_size": 4,
|
| 118 |
+
"device": "cuda",
|
| 119 |
+
"log_dir": "logs/evaluation"
|
| 120 |
+
},
|
| 121 |
+
"notifications": {
|
| 122 |
+
"enabled": false,
|
| 123 |
+
"smtp_host": "smtp.gmail.com",
|
| 124 |
+
"smtp_port": 587,
|
| 125 |
+
"smtp_user": "",
|
| 126 |
+
"to_addresses": [],
|
| 127 |
+
"cooldown_minutes": 5,
|
| 128 |
+
"periodic_status_hours": 4.0,
|
| 129 |
+
"disk_min_gb": 5.0
|
| 130 |
+
}
|
| 131 |
+
}
|
GPU_Run_Checkpoints/final_c2_muon_bs512_lr12_seed3_mix3to1/eval_metrics.jsonl
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 13300, "epoch": 0, "val_loss": 3.6002248883247376, "val_ppl": 36.6064658848532, "is_best": false, "timestamp": "2026-05-07T06:41:19.954963"}
|
| 2 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 13400, "epoch": 0, "val_loss": 3.367728364467621, "val_ppl": 29.012546213092545, "is_best": true, "timestamp": "2026-05-07T06:47:14.957164"}
|
| 3 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 13500, "epoch": 1, "val_loss": 3.4373834848403932, "val_ppl": 31.105463681753438, "is_best": false, "timestamp": "2026-05-07T06:53:18.567216"}
|
| 4 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 13600, "epoch": 1, "val_loss": 3.422644519805908, "val_ppl": 30.65036343219071, "is_best": false, "timestamp": "2026-05-07T06:59:13.002678"}
|
| 5 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 13700, "epoch": 1, "val_loss": 3.4340498089790343, "val_ppl": 31.0019408000613, "is_best": false, "timestamp": "2026-05-07T07:05:10.895979"}
|
| 6 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 13800, "epoch": 1, "val_loss": 3.519092357158661, "val_ppl": 33.75377818106467, "is_best": false, "timestamp": "2026-05-07T07:11:04.596778"}
|
| 7 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 13900, "epoch": 1, "val_loss": 3.468717622756958, "val_ppl": 32.09555743058299, "is_best": false, "timestamp": "2026-05-07T07:16:58.613087"}
|
| 8 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 14000, "epoch": 1, "val_loss": 3.4682725429534913, "val_ppl": 32.08127552471933, "is_best": false, "timestamp": "2026-05-07T07:22:53.412319"}
|
| 9 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 14100, "epoch": 1, "val_loss": 3.589690363407135, "val_ppl": 36.22285826744947, "is_best": false, "timestamp": "2026-05-07T07:28:47.081624"}
|
| 10 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 14200, "epoch": 1, "val_loss": 3.379741358757019, "val_ppl": 29.363175602839853, "is_best": false, "timestamp": "2026-05-07T07:34:46.517647"}
|
| 11 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 14300, "epoch": 1, "val_loss": 3.428925859928131, "val_ppl": 30.84349471641736, "is_best": false, "timestamp": "2026-05-07T07:40:39.426837"}
|
| 12 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 14400, "epoch": 1, "val_loss": 3.60848673582077, "val_ppl": 36.9101557154563, "is_best": false, "timestamp": "2026-05-07T07:46:33.703787"}
|
| 13 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 14500, "epoch": 1, "val_loss": 3.5552117228507996, "val_ppl": 34.99522847292765, "is_best": false, "timestamp": "2026-05-07T07:52:32.770855"}
|
| 14 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 14600, "epoch": 1, "val_loss": 3.3107362270355223, "val_ppl": 27.405294565251683, "is_best": true, "timestamp": "2026-05-07T07:58:28.773036"}
|
| 15 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 14700, "epoch": 1, "val_loss": 3.319306743144989, "val_ppl": 27.64118147619665, "is_best": false, "timestamp": "2026-05-07T08:04:28.107628"}
|
| 16 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 14800, "epoch": 1, "val_loss": 3.324813175201416, "val_ppl": 27.793805585458443, "is_best": false, "timestamp": "2026-05-07T08:10:20.501603"}
|
| 17 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 14900, "epoch": 1, "val_loss": 3.4178273677825928, "val_ppl": 30.503071021760462, "is_best": false, "timestamp": "2026-05-07T08:16:11.295289"}
|
| 18 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 15000, "epoch": 1, "val_loss": 3.4002161502838133, "val_ppl": 29.970577496152444, "is_best": false, "timestamp": "2026-05-07T08:21:59.647460"}
|
| 19 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 15100, "epoch": 1, "val_loss": 3.379251945018768, "val_ppl": 29.3488083773467, "is_best": false, "timestamp": "2026-05-07T08:27:51.392961"}
|
| 20 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "eval", "step": 15200, "epoch": 1, "val_loss": 3.240513730049133, "val_ppl": 25.546842557467986, "is_best": true, "timestamp": "2026-05-07T08:33:46.115803"}
|
GPU_Run_Checkpoints/final_c2_muon_bs512_lr12_seed3_mix3to1/events.jsonl
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "model_summary", "total_params": 38227520, "trainable_params": 38227520, "weight_tied_lm_head": true, "timestamp": "2026-05-07T06:34:20.360062"}
|
| 2 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "config", "model": {"vocab_size": 50304, "n_layers": 18, "n_heads": 5, "n_kv_heads": 1, "n_embd": 320, "embedding_dim": null, "tie_embeddings": true, "context_len": 1024, "dropout": 0.0, "bias": false, "norm_type": "rmsnorm", "norm_eps": 1e-05, "positional_embedding": "rope", "rope_theta": 10000.0, "rope_fraction": 1.0, "mlp_type": "swiglu", "mlp_hidden_mult": 4.0, "mlp_hidden_dim": 1024, "qk_norm": false, "block_style": "sequential"}, "training": {"seed": 3, "optimizer": "muon", "learning_rate": 0.011088, "min_lr": 0.0011088, "weight_decay": 0.03, "beta1": 0.9, "beta2": 0.95, "muon_momentum": 0.95, "muon_ns_steps": 5, "grad_clip": 1.0, "max_iters": 15259, "warmup_steps": 153, "lr_schedule": "wsd", "wsd_stable_frac": 0.85, "batch_size": 4, "gradient_accumulation_steps": 128, "dtype": "bfloat16", "device": "cuda", "eval_step_interval": 100, "eval_batches": 20, "log_interval": 10, "max_checkpoints": 3}, "distributed": {"enabled": false, "backend": "nccl"}, "timestamp": "2026-05-07T06:34:20.360323"}
|
| 3 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "resume", "checkpoint": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/latest_ckpt.pt", "step": 13200, "best_val_loss": 3.5190807700157167, "timestamp": "2026-05-07T06:34:26.585075"}
|
| 4 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 13300, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0013300.pt", "timestamp": "2026-05-07T06:41:29.848711"}
|
| 5 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 13400, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0013400.pt", "timestamp": "2026-05-07T06:47:24.766605"}
|
| 6 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "best_checkpoint_saved", "step": 13400, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/best_ckpt.pt", "timestamp": "2026-05-07T06:47:25.278303"}
|
| 7 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 13500, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0013500.pt", "timestamp": "2026-05-07T06:53:27.334976"}
|
| 8 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 13600, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0013600.pt", "timestamp": "2026-05-07T06:59:24.971980"}
|
| 9 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 13700, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0013700.pt", "timestamp": "2026-05-07T07:05:21.263653"}
|
| 10 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 13800, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0013800.pt", "timestamp": "2026-05-07T07:11:14.510639"}
|
| 11 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 13900, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0013900.pt", "timestamp": "2026-05-07T07:17:08.466509"}
|
| 12 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 14000, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0014000.pt", "timestamp": "2026-05-07T07:23:02.934107"}
|
| 13 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 14100, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0014100.pt", "timestamp": "2026-05-07T07:29:01.698085"}
|
| 14 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 14200, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0014200.pt", "timestamp": "2026-05-07T07:34:55.663753"}
|
| 15 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 14300, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0014300.pt", "timestamp": "2026-05-07T07:40:49.115477"}
|
| 16 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 14400, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0014400.pt", "timestamp": "2026-05-07T07:46:47.951120"}
|
| 17 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 14500, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0014500.pt", "timestamp": "2026-05-07T07:52:43.436517"}
|
| 18 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 14600, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0014600.pt", "timestamp": "2026-05-07T07:58:38.011313"}
|
| 19 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "best_checkpoint_saved", "step": 14600, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/best_ckpt.pt", "timestamp": "2026-05-07T07:58:38.760394"}
|
| 20 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 14700, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0014700.pt", "timestamp": "2026-05-07T08:04:38.510177"}
|
| 21 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 14800, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0014800.pt", "timestamp": "2026-05-07T08:10:30.543314"}
|
| 22 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 14900, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0014900.pt", "timestamp": "2026-05-07T08:16:19.978518"}
|
| 23 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 15000, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0015000.pt", "timestamp": "2026-05-07T08:22:09.744903"}
|
| 24 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 15100, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0015100.pt", "timestamp": "2026-05-07T08:28:05.563922"}
|
| 25 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "checkpoint_saved", "step": 15200, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0015200.pt", "timestamp": "2026-05-07T08:33:55.757161"}
|
| 26 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "best_checkpoint_saved", "step": 15200, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/best_ckpt.pt", "timestamp": "2026-05-07T08:33:56.544454"}
|
| 27 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "final_checkpoint_saved", "step": 15259, "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/checkpoints/ckpt_step0015259.pt", "best_val_loss_so_far": 3.240513730049133, "timestamp": "2026-05-07T08:37:32.797718"}
|
| 28 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "metrics_plot_saved", "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/metrics.png", "timestamp": "2026-05-07T08:37:34.088645"}
|
| 29 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "results_doc_saved", "path": "artifacts/final_c2_muon/final_c2_muon_bs512_lr12_seed3_mix3to1/results.md", "timestamp": "2026-05-07T08:37:34.088801"}
|
GPU_Run_Checkpoints/final_c2_muon_bs512_lr12_seed3_mix3to1/metrics.png
ADDED
|
Git LFS Details
|
GPU_Run_Checkpoints/final_c2_muon_bs512_lr12_seed3_mix3to1/results.md
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Results: final_c2_muon_bs512_lr12_seed3_mix3to1
|
| 2 |
+
|
| 3 |
+
Automatically generated after pretraining.
|
| 4 |
+
|
| 5 |
+
## Summary
|
| 6 |
+
- Model: `18L / 5H / 320d`
|
| 7 |
+
- Total parameters: `38227520`
|
| 8 |
+
- Last logged train step: `15250`
|
| 9 |
+
- Best validation loss: `3.2405`
|
| 10 |
+
- Best validation perplexity: `25.55`
|
| 11 |
+
- Last validation step: `15200`
|
| 12 |
+
- Learning rate: `0.011088`
|
| 13 |
+
- Effective tokens/update: `524288`
|
| 14 |
+
|
| 15 |
+
## Files
|
| 16 |
+
- [Config snapshot](config_snapshot.json)
|
| 17 |
+
- [Train metrics](train_metrics.jsonl)
|
| 18 |
+
- [Eval metrics](eval_metrics.jsonl)
|
| 19 |
+
- [Events](events.jsonl)
|
| 20 |
+
- [Metrics plot](metrics.png)
|
| 21 |
+
|
| 22 |
+
## Metrics Plot
|
| 23 |
+
|
| 24 |
+

|
GPU_Run_Checkpoints/final_c2_muon_bs512_lr12_seed3_mix3to1/train_metrics.jsonl
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13210, "epoch": 0, "train_loss": 3.5143189132213593, "train_ppl": 33.59304035495711, "lr": 0.010041688073394495, "grad_norm": 0.2339, "tokens_per_sec": 62857, "dt_s": 83.41, "eta_s": 17091, "world_size": 1, "timestamp": "2026-05-07T06:35:49.995042"}
|
| 2 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13220, "epoch": 0, "train_loss": 3.4969701636582613, "train_ppl": 33.01526940388632, "lr": 0.009998091743119266, "grad_norm": 0.2137, "tokens_per_sec": 153670, "dt_s": 34.118, "eta_s": 11982, "world_size": 1, "timestamp": "2026-05-07T06:36:24.112854"}
|
| 3 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13230, "epoch": 0, "train_loss": 3.5149079747498035, "train_ppl": 33.612834552081324, "lr": 0.009954495412844037, "grad_norm": 0.1912, "tokens_per_sec": 152248, "dt_s": 34.436, "eta_s": 10278, "world_size": 1, "timestamp": "2026-05-07T06:36:58.549233"}
|
| 4 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13240, "epoch": 0, "train_loss": 3.504879495128989, "train_ppl": 33.27743351794254, "lr": 0.009910899082568808, "grad_norm": 0.1723, "tokens_per_sec": 154801, "dt_s": 33.869, "eta_s": 9380, "world_size": 1, "timestamp": "2026-05-07T06:37:32.417776"}
|
| 5 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13250, "epoch": 0, "train_loss": 3.510416094213724, "train_ppl": 33.46218831055335, "lr": 0.00986730275229358, "grad_norm": 0.209, "tokens_per_sec": 151398, "dt_s": 34.63, "eta_s": 8858, "world_size": 1, "timestamp": "2026-05-07T06:38:07.047500"}
|
| 6 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13260, "epoch": 0, "train_loss": 3.4893480129539967, "train_ppl": 32.76457865996413, "lr": 0.009823706422018349, "grad_norm": 0.2544, "tokens_per_sec": 152207, "dt_s": 34.446, "eta_s": 6856, "world_size": 1, "timestamp": "2026-05-07T06:38:41.493316"}
|
| 7 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13270, "epoch": 0, "train_loss": 3.5076797120273113, "train_ppl": 33.37074813922697, "lr": 0.00978011009174312, "grad_norm": 0.2213, "tokens_per_sec": 155484, "dt_s": 33.72, "eta_s": 6806, "world_size": 1, "timestamp": "2026-05-07T06:39:15.212967"}
|
| 8 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13280, "epoch": 0, "train_loss": 3.53508198261261, "train_ppl": 34.29782642438871, "lr": 0.009736513761467891, "grad_norm": 0.2009, "tokens_per_sec": 153405, "dt_s": 34.177, "eta_s": 6762, "world_size": 1, "timestamp": "2026-05-07T06:39:49.389623"}
|
| 9 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13290, "epoch": 0, "train_loss": 3.5499034132808447, "train_ppl": 34.80995514586437, "lr": 0.00969291743119266, "grad_norm": 0.204, "tokens_per_sec": 153248, "dt_s": 34.212, "eta_s": 6741, "world_size": 1, "timestamp": "2026-05-07T06:40:23.601295"}
|
| 10 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13300, "epoch": 0, "train_loss": 3.513550203293562, "train_ppl": 33.56722697410355, "lr": 0.009649321100917431, "grad_norm": 0.1732, "tokens_per_sec": 152395, "dt_s": 34.403, "eta_s": 6698, "world_size": 1, "timestamp": "2026-05-07T06:40:58.004521"}
|
| 11 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13310, "epoch": 0, "train_loss": 3.5649184621870518, "train_ppl": 35.3365720200108, "lr": 0.009605724770642202, "grad_norm": 0.1931, "tokens_per_sec": 79074, "dt_s": 66.304, "eta_s": 6664, "world_size": 1, "timestamp": "2026-05-07T06:42:04.308265"}
|
| 12 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13320, "epoch": 0, "train_loss": 3.513483438640833, "train_ppl": 33.56498594466316, "lr": 0.009562128440366973, "grad_norm": 0.2556, "tokens_per_sec": 151578, "dt_s": 34.589, "eta_s": 6664, "world_size": 1, "timestamp": "2026-05-07T06:42:38.896914"}
|
| 13 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13330, "epoch": 0, "train_loss": 3.517059152945876, "train_ppl": 33.685219577508285, "lr": 0.009518532110091744, "grad_norm": 0.2058, "tokens_per_sec": 151737, "dt_s": 34.553, "eta_s": 6644, "world_size": 1, "timestamp": "2026-05-07T06:43:13.449403"}
|
| 14 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13340, "epoch": 0, "train_loss": 3.5744987931102514, "train_ppl": 35.67673490842049, "lr": 0.009474935779816514, "grad_norm": 0.2414, "tokens_per_sec": 152574, "dt_s": 34.363, "eta_s": 6615, "world_size": 1, "timestamp": "2026-05-07T06:43:47.812224"}
|
| 15 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13350, "epoch": 0, "train_loss": 3.460279941558838, "train_ppl": 31.825884655491954, "lr": 0.009431339449541285, "grad_norm": 0.2067, "tokens_per_sec": 149734, "dt_s": 35.015, "eta_s": 6604, "world_size": 1, "timestamp": "2026-05-07T06:44:22.826770"}
|
| 16 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13360, "epoch": 0, "train_loss": 3.4766224175691605, "train_ppl": 32.35027163187021, "lr": 0.009387743119266056, "grad_norm": 0.1474, "tokens_per_sec": 152701, "dt_s": 34.334, "eta_s": 6565, "world_size": 1, "timestamp": "2026-05-07T06:44:57.160951"}
|
| 17 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13370, "epoch": 0, "train_loss": 3.4896799102425575, "train_ppl": 32.77545493958623, "lr": 0.009344146788990827, "grad_norm": 0.2011, "tokens_per_sec": 152448, "dt_s": 34.391, "eta_s": 6523, "world_size": 1, "timestamp": "2026-05-07T06:45:31.552113"}
|
| 18 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13380, "epoch": 0, "train_loss": 3.525322986766696, "train_ppl": 33.964742007032164, "lr": 0.009300550458715596, "grad_norm": 0.1959, "tokens_per_sec": 152591, "dt_s": 34.359, "eta_s": 6481, "world_size": 1, "timestamp": "2026-05-07T06:46:05.911190"}
|
| 19 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13390, "epoch": 0, "train_loss": 3.5303658582270145, "train_ppl": 34.13645443310793, "lr": 0.009256954128440367, "grad_norm": 0.1618, "tokens_per_sec": 152636, "dt_s": 34.349, "eta_s": 6446, "world_size": 1, "timestamp": "2026-05-07T06:46:40.260115"}
|
| 20 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13400, "epoch": 0, "train_loss": 3.519121842458844, "train_ppl": 33.75477343601928, "lr": 0.009213357798165138, "grad_norm": 0.1619, "tokens_per_sec": 151920, "dt_s": 34.511, "eta_s": 6393, "world_size": 1, "timestamp": "2026-05-07T06:47:14.771007"}
|
| 21 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13410, "epoch": 0, "train_loss": 3.4871773403137922, "train_ppl": 32.6935346200715, "lr": 0.00916976146788991, "grad_norm": 0.1367, "tokens_per_sec": 103772, "dt_s": 50.523, "eta_s": 6373, "world_size": 1, "timestamp": "2026-05-07T06:48:05.293906"}
|
| 22 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13420, "epoch": 0, "train_loss": 3.514303194358945, "train_ppl": 33.5925123147278, "lr": 0.009126165137614679, "grad_norm": 0.1996, "tokens_per_sec": 151484, "dt_s": 34.61, "eta_s": 6347, "world_size": 1, "timestamp": "2026-05-07T06:48:39.903932"}
|
| 23 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13430, "epoch": 0, "train_loss": 3.493043350055814, "train_ppl": 32.88587880754496, "lr": 0.00908256880733945, "grad_norm": 0.1526, "tokens_per_sec": 151349, "dt_s": 34.641, "eta_s": 6323, "world_size": 1, "timestamp": "2026-05-07T06:49:14.544982"}
|
| 24 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13440, "epoch": 0, "train_loss": 3.5093121714890003, "train_ppl": 33.42526902222832, "lr": 0.00903897247706422, "grad_norm": 0.182, "tokens_per_sec": 149296, "dt_s": 35.117, "eta_s": 6316, "world_size": 1, "timestamp": "2026-05-07T06:49:49.662415"}
|
| 25 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13450, "epoch": 0, "train_loss": 3.5217913668602705, "train_ppl": 33.845003008810465, "lr": 0.008995376146788992, "grad_norm": 0.1737, "tokens_per_sec": 150061, "dt_s": 34.938, "eta_s": 6297, "world_size": 1, "timestamp": "2026-05-07T06:50:24.600595"}
|
| 26 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13460, "epoch": 0, "train_loss": 3.533335978165269, "train_ppl": 34.237994515507054, "lr": 0.008951779816513763, "grad_norm": 0.1609, "tokens_per_sec": 150049, "dt_s": 34.941, "eta_s": 6269, "world_size": 1, "timestamp": "2026-05-07T06:50:59.541840"}
|
| 27 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13470, "epoch": 0, "train_loss": 3.4956674315035343, "train_ppl": 32.97228735396494, "lr": 0.008908183486238532, "grad_norm": 0.1745, "tokens_per_sec": 150506, "dt_s": 34.835, "eta_s": 6242, "world_size": 1, "timestamp": "2026-05-07T06:51:34.376928"}
|
| 28 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13480, "epoch": 1, "train_loss": 3.5247141402214766, "train_ppl": 33.944068985189496, "lr": 0.008864587155963303, "grad_norm": 0.1525, "tokens_per_sec": 151003, "dt_s": 34.72, "eta_s": 6210, "world_size": 1, "timestamp": "2026-05-07T06:52:09.097411"}
|
| 29 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13490, "epoch": 1, "train_loss": 3.5071450378745794, "train_ppl": 33.35291043184509, "lr": 0.008820990825688074, "grad_norm": 0.2675, "tokens_per_sec": 151329, "dt_s": 34.646, "eta_s": 6159, "world_size": 1, "timestamp": "2026-05-07T06:52:43.742931"}
|
| 30 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13500, "epoch": 1, "train_loss": 3.758951438590884, "train_ppl": 42.90341551893903, "lr": 0.008777394495412844, "grad_norm": 0.1915, "tokens_per_sec": 151359, "dt_s": 34.639, "eta_s": 6113, "world_size": 1, "timestamp": "2026-05-07T06:53:18.381714"}
|
| 31 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13510, "epoch": 1, "train_loss": 3.5037634279578924, "train_ppl": 33.24031438443095, "lr": 0.008733798165137615, "grad_norm": 0.221, "tokens_per_sec": 118344, "dt_s": 44.302, "eta_s": 6093, "world_size": 1, "timestamp": "2026-05-07T06:54:02.683645"}
|
| 32 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13520, "epoch": 1, "train_loss": 3.529634101316333, "train_ppl": 34.111483983933944, "lr": 0.008690201834862386, "grad_norm": 0.1713, "tokens_per_sec": 151680, "dt_s": 34.565, "eta_s": 6049, "world_size": 1, "timestamp": "2026-05-07T06:54:37.249123"}
|
| 33 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13530, "epoch": 1, "train_loss": 3.5473738480359316, "train_ppl": 34.72201236854023, "lr": 0.008646605504587157, "grad_norm": 0.1501, "tokens_per_sec": 152653, "dt_s": 34.345, "eta_s": 6001, "world_size": 1, "timestamp": "2026-05-07T06:55:11.594254"}
|
| 34 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13540, "epoch": 1, "train_loss": 3.5175323486328125, "train_ppl": 33.70116305001959, "lr": 0.008603009174311928, "grad_norm": 0.1561, "tokens_per_sec": 153649, "dt_s": 34.123, "eta_s": 5948, "world_size": 1, "timestamp": "2026-05-07T06:55:45.716786"}
|
| 35 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13550, "epoch": 1, "train_loss": 3.5146703850477934, "train_ppl": 33.60484943736446, "lr": 0.008559412844036697, "grad_norm": 0.156, "tokens_per_sec": 153547, "dt_s": 34.145, "eta_s": 5897, "world_size": 1, "timestamp": "2026-05-07T06:56:19.861820"}
|
| 36 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13560, "epoch": 1, "train_loss": 3.462028205394745, "train_ppl": 31.88157336377424, "lr": 0.008515816513761468, "grad_norm": 0.1891, "tokens_per_sec": 151807, "dt_s": 34.536, "eta_s": 5835, "world_size": 1, "timestamp": "2026-05-07T06:56:54.398339"}
|
| 37 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13570, "epoch": 1, "train_loss": 3.511418791487813, "train_ppl": 33.49575758265365, "lr": 0.00847222018348624, "grad_norm": 0.1574, "tokens_per_sec": 151382, "dt_s": 34.634, "eta_s": 5803, "world_size": 1, "timestamp": "2026-05-07T06:57:29.031860"}
|
| 38 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13580, "epoch": 1, "train_loss": 3.500875186175108, "train_ppl": 33.14444683053379, "lr": 0.00842862385321101, "grad_norm": 0.1755, "tokens_per_sec": 151383, "dt_s": 34.633, "eta_s": 5778, "world_size": 1, "timestamp": "2026-05-07T06:58:03.665024"}
|
| 39 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13590, "epoch": 1, "train_loss": 3.4892140440642834, "train_ppl": 32.760189519749815, "lr": 0.008385027522935781, "grad_norm": 0.141, "tokens_per_sec": 151435, "dt_s": 34.621, "eta_s": 5760, "world_size": 1, "timestamp": "2026-05-07T06:58:38.286396"}
|
| 40 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13600, "epoch": 1, "train_loss": 3.5014922376722097, "train_ppl": 33.16490497228555, "lr": 0.00834143119266055, "grad_norm": 0.1823, "tokens_per_sec": 151818, "dt_s": 34.534, "eta_s": 5739, "world_size": 1, "timestamp": "2026-05-07T06:59:12.820429"}
|
| 41 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13610, "epoch": 1, "train_loss": 3.470853390172124, "train_ppl": 32.16417933043875, "lr": 0.008297834862385322, "grad_norm": 0.18, "tokens_per_sec": 112351, "dt_s": 46.665, "eta_s": 5703, "world_size": 1, "timestamp": "2026-05-07T06:59:59.485476"}
|
| 42 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13620, "epoch": 1, "train_loss": 3.4917505197227, "train_ppl": 32.84339041694951, "lr": 0.008254238532110093, "grad_norm": 0.2021, "tokens_per_sec": 151243, "dt_s": 34.665, "eta_s": 5670, "world_size": 1, "timestamp": "2026-05-07T07:00:34.150674"}
|
| 43 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13630, "epoch": 1, "train_loss": 3.4871725477278233, "train_ppl": 32.69337793387167, "lr": 0.008210642201834864, "grad_norm": 0.1497, "tokens_per_sec": 151868, "dt_s": 34.523, "eta_s": 5632, "world_size": 1, "timestamp": "2026-05-07T07:01:08.673195"}
|
| 44 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13640, "epoch": 1, "train_loss": 3.494139740243554, "train_ppl": 32.92195433519826, "lr": 0.008167045871559633, "grad_norm": 0.1623, "tokens_per_sec": 152140, "dt_s": 34.461, "eta_s": 5592, "world_size": 1, "timestamp": "2026-05-07T07:01:43.134043"}
|
| 45 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13650, "epoch": 1, "train_loss": 3.504551747813821, "train_ppl": 33.266528715558806, "lr": 0.008123449541284404, "grad_norm": 0.1658, "tokens_per_sec": 151366, "dt_s": 34.637, "eta_s": 5561, "world_size": 1, "timestamp": "2026-05-07T07:02:17.771134"}
|
| 46 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13660, "epoch": 1, "train_loss": 3.4701273441314697, "train_ppl": 32.1408351308923, "lr": 0.008079853211009175, "grad_norm": 0.162, "tokens_per_sec": 149956, "dt_s": 34.963, "eta_s": 5540, "world_size": 1, "timestamp": "2026-05-07T07:02:52.733855"}
|
| 47 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13670, "epoch": 1, "train_loss": 3.494923708960414, "train_ppl": 32.94777423717023, "lr": 0.008036256880733946, "grad_norm": 0.1802, "tokens_per_sec": 152174, "dt_s": 34.453, "eta_s": 5499, "world_size": 1, "timestamp": "2026-05-07T07:03:27.187067"}
|
| 48 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13680, "epoch": 1, "train_loss": 3.449562318623066, "train_ppl": 31.48660819041427, "lr": 0.007992660550458715, "grad_norm": 0.2095, "tokens_per_sec": 151514, "dt_s": 34.603, "eta_s": 5467, "world_size": 1, "timestamp": "2026-05-07T07:04:01.790404"}
|
| 49 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13690, "epoch": 1, "train_loss": 3.4082366973161697, "train_ppl": 30.211924496226768, "lr": 0.007949064220183487, "grad_norm": 0.1791, "tokens_per_sec": 152613, "dt_s": 34.354, "eta_s": 5429, "world_size": 1, "timestamp": "2026-05-07T07:04:36.144403"}
|
| 50 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13700, "epoch": 1, "train_loss": 3.492555683478713, "train_ppl": 32.86984537339467, "lr": 0.007905467889908258, "grad_norm": 0.1366, "tokens_per_sec": 151697, "dt_s": 34.562, "eta_s": 5392, "world_size": 1, "timestamp": "2026-05-07T07:05:10.706065"}
|
| 51 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13710, "epoch": 1, "train_loss": 3.497991234064102, "train_ppl": 33.04899753488333, "lr": 0.007861871559633029, "grad_norm": 0.1236, "tokens_per_sec": 115943, "dt_s": 45.219, "eta_s": 5348, "world_size": 1, "timestamp": "2026-05-07T07:05:55.925433"}
|
| 52 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13720, "epoch": 1, "train_loss": 3.4796548020094633, "train_ppl": 32.44851897878438, "lr": 0.007818275229357798, "grad_norm": 0.1313, "tokens_per_sec": 153333, "dt_s": 34.193, "eta_s": 5306, "world_size": 1, "timestamp": "2026-05-07T07:06:30.118202"}
|
| 53 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13730, "epoch": 1, "train_loss": 3.459994599223137, "train_ppl": 31.8168046787411, "lr": 0.007774678899082569, "grad_norm": 0.1853, "tokens_per_sec": 152836, "dt_s": 34.304, "eta_s": 5262, "world_size": 1, "timestamp": "2026-05-07T07:07:04.422154"}
|
| 54 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13740, "epoch": 1, "train_loss": 3.519534735009074, "train_ppl": 33.768713408162995, "lr": 0.00773108256880734, "grad_norm": 0.1929, "tokens_per_sec": 154307, "dt_s": 33.977, "eta_s": 5216, "world_size": 1, "timestamp": "2026-05-07T07:07:38.399018"}
|
| 55 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13750, "epoch": 1, "train_loss": 3.5011332165449858, "train_ppl": 33.153000207876964, "lr": 0.007687486238532111, "grad_norm": 0.1532, "tokens_per_sec": 154438, "dt_s": 33.948, "eta_s": 5163, "world_size": 1, "timestamp": "2026-05-07T07:08:12.347089"}
|
| 56 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13760, "epoch": 1, "train_loss": 3.5007860623300076, "train_ppl": 33.14149300161877, "lr": 0.007643889908256881, "grad_norm": 0.1783, "tokens_per_sec": 152290, "dt_s": 34.427, "eta_s": 5122, "world_size": 1, "timestamp": "2026-05-07T07:08:46.774010"}
|
| 57 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13770, "epoch": 1, "train_loss": 3.520602088421583, "train_ppl": 33.80477580189109, "lr": 0.007600293577981651, "grad_norm": 0.1535, "tokens_per_sec": 150564, "dt_s": 34.822, "eta_s": 5106, "world_size": 1, "timestamp": "2026-05-07T07:09:21.595598"}
|
| 58 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13780, "epoch": 1, "train_loss": 3.4685318246483803, "train_ppl": 32.0895946906687, "lr": 0.0075566972477064225, "grad_norm": 0.17, "tokens_per_sec": 151656, "dt_s": 34.571, "eta_s": 5080, "world_size": 1, "timestamp": "2026-05-07T07:09:56.166514"}
|
| 59 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13790, "epoch": 1, "train_loss": 3.445944856852293, "train_ppl": 31.37291235814307, "lr": 0.0075131009174311935, "grad_norm": 0.143, "tokens_per_sec": 153118, "dt_s": 34.241, "eta_s": 5053, "world_size": 1, "timestamp": "2026-05-07T07:10:30.407306"}
|
| 60 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13800, "epoch": 1, "train_loss": 3.4981111753731966, "train_ppl": 33.052961712641505, "lr": 0.0074695045871559645, "grad_norm": 0.1988, "tokens_per_sec": 154181, "dt_s": 34.005, "eta_s": 5021, "world_size": 1, "timestamp": "2026-05-07T07:11:04.412138"}
|
| 61 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13810, "epoch": 1, "train_loss": 3.4679648485034704, "train_ppl": 32.07140581279644, "lr": 0.007425908256880734, "grad_norm": 0.156, "tokens_per_sec": 117634, "dt_s": 44.569, "eta_s": 4988, "world_size": 1, "timestamp": "2026-05-07T07:11:48.981422"}
|
| 62 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13820, "epoch": 1, "train_loss": 3.5082423221319914, "train_ppl": 33.38952814174505, "lr": 0.007382311926605505, "grad_norm": 0.1935, "tokens_per_sec": 151137, "dt_s": 34.69, "eta_s": 4949, "world_size": 1, "timestamp": "2026-05-07T07:12:23.671108"}
|
| 63 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13830, "epoch": 1, "train_loss": 3.4733642991632223, "train_ppl": 32.245042134597455, "lr": 0.007338715596330276, "grad_norm": 0.1638, "tokens_per_sec": 150502, "dt_s": 34.836, "eta_s": 4923, "world_size": 1, "timestamp": "2026-05-07T07:12:58.506976"}
|
| 64 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13840, "epoch": 1, "train_loss": 3.4963016640394926, "train_ppl": 32.993206084358285, "lr": 0.007295119266055047, "grad_norm": 0.1712, "tokens_per_sec": 152626, "dt_s": 34.351, "eta_s": 4891, "world_size": 1, "timestamp": "2026-05-07T07:13:32.858036"}
|
| 65 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13850, "epoch": 1, "train_loss": 3.5225153751671314, "train_ppl": 33.86951594484845, "lr": 0.007251522935779816, "grad_norm": 0.2007, "tokens_per_sec": 152034, "dt_s": 34.485, "eta_s": 4870, "world_size": 1, "timestamp": "2026-05-07T07:14:07.342888"}
|
| 66 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13860, "epoch": 1, "train_loss": 3.4580016750842333, "train_ppl": 31.753459342864502, "lr": 0.007207926605504587, "grad_norm": 0.1546, "tokens_per_sec": 153459, "dt_s": 34.165, "eta_s": 4827, "world_size": 1, "timestamp": "2026-05-07T07:14:41.507464"}
|
| 67 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13870, "epoch": 1, "train_loss": 3.4459212739020586, "train_ppl": 31.372172501036257, "lr": 0.007164330275229358, "grad_norm": 0.1532, "tokens_per_sec": 153586, "dt_s": 34.136, "eta_s": 4777, "world_size": 1, "timestamp": "2026-05-07T07:15:15.643890"}
|
| 68 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13880, "epoch": 1, "train_loss": 3.486993245780468, "train_ppl": 32.68751647304388, "lr": 0.007120733944954129, "grad_norm": 0.1392, "tokens_per_sec": 153034, "dt_s": 34.26, "eta_s": 4727, "world_size": 1, "timestamp": "2026-05-07T07:15:49.903463"}
|
| 69 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13890, "epoch": 1, "train_loss": 3.4775203093886375, "train_ppl": 32.379331720581916, "lr": 0.0070771376146789, "grad_norm": 0.1849, "tokens_per_sec": 152978, "dt_s": 34.272, "eta_s": 4691, "world_size": 1, "timestamp": "2026-05-07T07:16:24.175632"}
|
| 70 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13900, "epoch": 1, "train_loss": 3.4698590599000454, "train_ppl": 32.1322134082297, "lr": 0.00703354128440367, "grad_norm": 0.1381, "tokens_per_sec": 153054, "dt_s": 34.255, "eta_s": 4650, "world_size": 1, "timestamp": "2026-05-07T07:16:58.430630"}
|
| 71 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13910, "epoch": 1, "train_loss": 3.4849653728306293, "train_ppl": 32.621297507108196, "lr": 0.006989944954128441, "grad_norm": 0.1439, "tokens_per_sec": 117699, "dt_s": 44.545, "eta_s": 4625, "world_size": 1, "timestamp": "2026-05-07T07:17:42.975463"}
|
| 72 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13920, "epoch": 1, "train_loss": 3.435989737510681, "train_ppl": 31.062140722449975, "lr": 0.006946348623853212, "grad_norm": 0.1394, "tokens_per_sec": 152245, "dt_s": 34.437, "eta_s": 4599, "world_size": 1, "timestamp": "2026-05-07T07:18:17.412710"}
|
| 73 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13930, "epoch": 1, "train_loss": 3.4891370572149754, "train_ppl": 32.75766751305788, "lr": 0.006902752293577982, "grad_norm": 0.1375, "tokens_per_sec": 152256, "dt_s": 34.435, "eta_s": 4569, "world_size": 1, "timestamp": "2026-05-07T07:18:51.847360"}
|
| 74 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13940, "epoch": 1, "train_loss": 3.4434200935065746, "train_ppl": 31.29380308708976, "lr": 0.006859155963302752, "grad_norm": 0.1494, "tokens_per_sec": 152648, "dt_s": 34.346, "eta_s": 4537, "world_size": 1, "timestamp": "2026-05-07T07:19:26.193590"}
|
| 75 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13950, "epoch": 1, "train_loss": 3.4169581700116396, "train_ppl": 30.476569339691125, "lr": 0.006815559633027523, "grad_norm": 0.1984, "tokens_per_sec": 152938, "dt_s": 34.281, "eta_s": 4503, "world_size": 1, "timestamp": "2026-05-07T07:20:00.474592"}
|
| 76 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13960, "epoch": 1, "train_loss": 3.41582078486681, "train_ppl": 30.44192544791746, "lr": 0.006771963302752294, "grad_norm": 0.1647, "tokens_per_sec": 152071, "dt_s": 34.477, "eta_s": 4468, "world_size": 1, "timestamp": "2026-05-07T07:20:34.951130"}
|
| 77 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13970, "epoch": 1, "train_loss": 3.4778663869947195, "train_ppl": 32.39053942144559, "lr": 0.0067283669724770645, "grad_norm": 0.1244, "tokens_per_sec": 151425, "dt_s": 34.624, "eta_s": 4438, "world_size": 1, "timestamp": "2026-05-07T07:21:09.574749"}
|
| 78 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13980, "epoch": 1, "train_loss": 3.4563572965562344, "train_ppl": 31.701287542981117, "lr": 0.006684770642201835, "grad_norm": 0.1316, "tokens_per_sec": 150473, "dt_s": 34.843, "eta_s": 4414, "world_size": 1, "timestamp": "2026-05-07T07:21:44.417427"}
|
| 79 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 13990, "epoch": 1, "train_loss": 3.4518488124012947, "train_ppl": 31.55868449374718, "lr": 0.006641174311926606, "grad_norm": 0.1335, "tokens_per_sec": 152531, "dt_s": 34.373, "eta_s": 4380, "world_size": 1, "timestamp": "2026-05-07T07:22:18.790053"}
|
| 80 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14000, "epoch": 1, "train_loss": 3.4840596299618483, "train_ppl": 32.59176437625109, "lr": 0.006597577981651377, "grad_norm": 0.1736, "tokens_per_sec": 152238, "dt_s": 34.439, "eta_s": 4350, "world_size": 1, "timestamp": "2026-05-07T07:22:53.228758"}
|
| 81 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14010, "epoch": 1, "train_loss": 3.470248404890299, "train_ppl": 32.14472636031548, "lr": 0.006553981651376147, "grad_norm": 0.1324, "tokens_per_sec": 118424, "dt_s": 44.272, "eta_s": 4318, "world_size": 1, "timestamp": "2026-05-07T07:23:37.500819"}
|
| 82 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14020, "epoch": 1, "train_loss": 3.4516573287546635, "train_ppl": 31.55264210028566, "lr": 0.006510385321100917, "grad_norm": 0.143, "tokens_per_sec": 151804, "dt_s": 34.537, "eta_s": 4281, "world_size": 1, "timestamp": "2026-05-07T07:24:12.038045"}
|
| 83 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14030, "epoch": 1, "train_loss": 3.4393704757094383, "train_ppl": 31.167331399005967, "lr": 0.006466788990825688, "grad_norm": 0.1643, "tokens_per_sec": 152830, "dt_s": 34.305, "eta_s": 4233, "world_size": 1, "timestamp": "2026-05-07T07:24:46.343411"}
|
| 84 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14040, "epoch": 1, "train_loss": 3.430293381214142, "train_ppl": 30.88570270556101, "lr": 0.006423192660550459, "grad_norm": 0.1522, "tokens_per_sec": 152635, "dt_s": 34.349, "eta_s": 4198, "world_size": 1, "timestamp": "2026-05-07T07:25:20.692460"}
|
| 85 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14050, "epoch": 1, "train_loss": 3.453440722078085, "train_ppl": 31.608963077843924, "lr": 0.0063795963302752294, "grad_norm": 0.1418, "tokens_per_sec": 151776, "dt_s": 34.544, "eta_s": 4166, "world_size": 1, "timestamp": "2026-05-07T07:25:55.236000"}
|
| 86 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14060, "epoch": 1, "train_loss": 3.4545179046690464, "train_ppl": 31.64303004746897, "lr": 0.0063360000000000005, "grad_norm": 0.1739, "tokens_per_sec": 152962, "dt_s": 34.276, "eta_s": 4125, "world_size": 1, "timestamp": "2026-05-07T07:26:29.511733"}
|
| 87 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14070, "epoch": 1, "train_loss": 3.4558912720531225, "train_ppl": 31.686517408095593, "lr": 0.006292403669724771, "grad_norm": 0.1395, "tokens_per_sec": 152673, "dt_s": 34.341, "eta_s": 4086, "world_size": 1, "timestamp": "2026-05-07T07:27:03.852267"}
|
| 88 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14080, "epoch": 1, "train_loss": 3.4684044271707535, "train_ppl": 32.08550681764493, "lr": 0.006248807339449542, "grad_norm": 0.1235, "tokens_per_sec": 153056, "dt_s": 34.255, "eta_s": 4050, "world_size": 1, "timestamp": "2026-05-07T07:27:38.106862"}
|
| 89 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14090, "epoch": 1, "train_loss": 3.4567455425858498, "train_ppl": 31.713597831559074, "lr": 0.006205211009174313, "grad_norm": 0.137, "tokens_per_sec": 152422, "dt_s": 34.397, "eta_s": 4017, "world_size": 1, "timestamp": "2026-05-07T07:28:12.504098"}
|
| 90 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14100, "epoch": 1, "train_loss": 3.5002449862658978, "train_ppl": 33.123565783458425, "lr": 0.006161614678899083, "grad_norm": 0.1736, "tokens_per_sec": 152435, "dt_s": 34.394, "eta_s": 3979, "world_size": 1, "timestamp": "2026-05-07T07:28:46.898389"}
|
| 91 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14110, "epoch": 1, "train_loss": 3.4513726085424423, "train_ppl": 31.54365970412628, "lr": 0.006118018348623853, "grad_norm": 0.161, "tokens_per_sec": 106171, "dt_s": 49.382, "eta_s": 3952, "world_size": 1, "timestamp": "2026-05-07T07:29:36.280091"}
|
| 92 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14120, "epoch": 1, "train_loss": 3.4739826042205095, "train_ppl": 32.2649855721505, "lr": 0.006074422018348624, "grad_norm": 0.1583, "tokens_per_sec": 152263, "dt_s": 34.433, "eta_s": 3919, "world_size": 1, "timestamp": "2026-05-07T07:30:10.713145"}
|
| 93 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14130, "epoch": 1, "train_loss": 3.480673646554351, "train_ppl": 32.48159582255524, "lr": 0.006030825688073394, "grad_norm": 0.1412, "tokens_per_sec": 152506, "dt_s": 34.378, "eta_s": 3888, "world_size": 1, "timestamp": "2026-05-07T07:30:45.091371"}
|
| 94 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14140, "epoch": 1, "train_loss": 3.4537435304373503, "train_ppl": 31.618535985396672, "lr": 0.005987229357798165, "grad_norm": 0.1403, "tokens_per_sec": 152032, "dt_s": 34.485, "eta_s": 3855, "world_size": 1, "timestamp": "2026-05-07T07:31:19.576765"}
|
| 95 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14150, "epoch": 1, "train_loss": 3.4652359262108803, "train_ppl": 31.98400474811586, "lr": 0.005943633027522936, "grad_norm": 0.1312, "tokens_per_sec": 152932, "dt_s": 34.282, "eta_s": 3818, "world_size": 1, "timestamp": "2026-05-07T07:31:53.859099"}
|
| 96 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14160, "epoch": 1, "train_loss": 3.43232717551291, "train_ppl": 30.9485817915263, "lr": 0.0059000366972477075, "grad_norm": 0.1761, "tokens_per_sec": 152641, "dt_s": 34.348, "eta_s": 3779, "world_size": 1, "timestamp": "2026-05-07T07:32:28.206928"}
|
| 97 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14170, "epoch": 1, "train_loss": 3.462757198140025, "train_ppl": 31.904823272942465, "lr": 0.005856440366972478, "grad_norm": 0.1323, "tokens_per_sec": 151978, "dt_s": 34.498, "eta_s": 3746, "world_size": 1, "timestamp": "2026-05-07T07:33:02.704464"}
|
| 98 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14180, "epoch": 1, "train_loss": 3.416829116642475, "train_ppl": 30.472636489516503, "lr": 0.005812844036697248, "grad_norm": 0.1534, "tokens_per_sec": 153132, "dt_s": 34.238, "eta_s": 3708, "world_size": 1, "timestamp": "2026-05-07T07:33:36.942022"}
|
| 99 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14190, "epoch": 1, "train_loss": 3.4522590106353164, "train_ppl": 31.571632465830646, "lr": 0.005769247706422019, "grad_norm": 0.1279, "tokens_per_sec": 150572, "dt_s": 34.82, "eta_s": 3681, "world_size": 1, "timestamp": "2026-05-07T07:34:11.761676"}
|
| 100 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14200, "epoch": 1, "train_loss": 3.4197248332202435, "train_ppl": 30.561004490762162, "lr": 0.005725651376146789, "grad_norm": 0.1247, "tokens_per_sec": 151661, "dt_s": 34.57, "eta_s": 3653, "world_size": 1, "timestamp": "2026-05-07T07:34:46.331446"}
|
| 101 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14210, "epoch": 1, "train_loss": 3.45742904022336, "train_ppl": 31.735281410247143, "lr": 0.00568205504587156, "grad_norm": 0.1278, "tokens_per_sec": 118807, "dt_s": 44.129, "eta_s": 3628, "world_size": 1, "timestamp": "2026-05-07T07:35:30.460676"}
|
| 102 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14220, "epoch": 1, "train_loss": 3.425647424533963, "train_ppl": 30.742541885747073, "lr": 0.00563845871559633, "grad_norm": 0.1222, "tokens_per_sec": 152252, "dt_s": 34.435, "eta_s": 3592, "world_size": 1, "timestamp": "2026-05-07T07:36:04.896164"}
|
| 103 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14230, "epoch": 1, "train_loss": 3.4246704038232565, "train_ppl": 30.712520453793452, "lr": 0.005594862385321101, "grad_norm": 0.1318, "tokens_per_sec": 153466, "dt_s": 34.163, "eta_s": 3556, "world_size": 1, "timestamp": "2026-05-07T07:36:39.059329"}
|
| 104 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14240, "epoch": 1, "train_loss": 3.444010604172945, "train_ppl": 31.312287868797224, "lr": 0.005551266055045872, "grad_norm": 0.1329, "tokens_per_sec": 151733, "dt_s": 34.553, "eta_s": 3516, "world_size": 1, "timestamp": "2026-05-07T07:37:13.612698"}
|
| 105 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14250, "epoch": 1, "train_loss": 3.432269701734185, "train_ppl": 30.946803110698816, "lr": 0.005507669724770643, "grad_norm": 0.1587, "tokens_per_sec": 152645, "dt_s": 34.347, "eta_s": 3477, "world_size": 1, "timestamp": "2026-05-07T07:37:47.959639"}
|
| 106 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14260, "epoch": 1, "train_loss": 3.484632920473814, "train_ppl": 32.61045428239726, "lr": 0.005464073394495413, "grad_norm": 0.1441, "tokens_per_sec": 154318, "dt_s": 33.974, "eta_s": 3426, "world_size": 1, "timestamp": "2026-05-07T07:38:21.934089"}
|
| 107 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14270, "epoch": 1, "train_loss": 3.4777634125202894, "train_ppl": 32.38720419439673, "lr": 0.005420477064220184, "grad_norm": 0.1558, "tokens_per_sec": 152058, "dt_s": 34.48, "eta_s": 3393, "world_size": 1, "timestamp": "2026-05-07T07:38:56.413676"}
|
| 108 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14280, "epoch": 1, "train_loss": 3.4490829911082983, "train_ppl": 31.471519409284767, "lr": 0.005376880733944954, "grad_norm": 0.1633, "tokens_per_sec": 151448, "dt_s": 34.618, "eta_s": 3367, "world_size": 1, "timestamp": "2026-05-07T07:39:31.032093"}
|
| 109 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14290, "epoch": 1, "train_loss": 3.4691202118992805, "train_ppl": 32.108481354862576, "lr": 0.005333284403669726, "grad_norm": 0.1322, "tokens_per_sec": 153703, "dt_s": 34.11, "eta_s": 3324, "world_size": 1, "timestamp": "2026-05-07T07:40:05.142509"}
|
| 110 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14300, "epoch": 1, "train_loss": 3.4538164604455233, "train_ppl": 31.620842009572662, "lr": 0.005289688073394496, "grad_norm": 0.1264, "tokens_per_sec": 153741, "dt_s": 34.102, "eta_s": 3285, "world_size": 1, "timestamp": "2026-05-07T07:40:39.244463"}
|
| 111 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14310, "epoch": 1, "train_loss": 3.4575282502919436, "train_ppl": 31.738430025876973, "lr": 0.005246091743119266, "grad_norm": 0.1382, "tokens_per_sec": 119316, "dt_s": 43.941, "eta_s": 3253, "world_size": 1, "timestamp": "2026-05-07T07:41:23.185722"}
|
| 112 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14320, "epoch": 1, "train_loss": 3.428134234622121, "train_ppl": 30.819087887280435, "lr": 0.005202495412844037, "grad_norm": 0.1213, "tokens_per_sec": 152489, "dt_s": 34.382, "eta_s": 3217, "world_size": 1, "timestamp": "2026-05-07T07:41:57.567845"}
|
| 113 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14330, "epoch": 1, "train_loss": 3.4154378306120634, "train_ppl": 30.430269814974064, "lr": 0.0051588990825688075, "grad_norm": 0.1233, "tokens_per_sec": 151956, "dt_s": 34.503, "eta_s": 3180, "world_size": 1, "timestamp": "2026-05-07T07:42:32.070482"}
|
| 114 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14340, "epoch": 1, "train_loss": 3.466863000765443, "train_ppl": 32.036087468138575, "lr": 0.005115302752293578, "grad_norm": 0.1526, "tokens_per_sec": 152135, "dt_s": 34.462, "eta_s": 3152, "world_size": 1, "timestamp": "2026-05-07T07:43:06.532469"}
|
| 115 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14350, "epoch": 1, "train_loss": 3.439975783228874, "train_ppl": 31.186202930026145, "lr": 0.005071706422018349, "grad_norm": 0.1267, "tokens_per_sec": 152711, "dt_s": 34.332, "eta_s": 3122, "world_size": 1, "timestamp": "2026-05-07T07:43:40.864522"}
|
| 116 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14360, "epoch": 1, "train_loss": 3.4511474445462227, "train_ppl": 31.53655800720514, "lr": 0.005028110091743119, "grad_norm": 0.1551, "tokens_per_sec": 151908, "dt_s": 34.514, "eta_s": 3096, "world_size": 1, "timestamp": "2026-05-07T07:44:15.378111"}
|
| 117 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14370, "epoch": 1, "train_loss": 3.4377310797572136, "train_ppl": 31.116277662147983, "lr": 0.004984513761467891, "grad_norm": 0.1251, "tokens_per_sec": 151289, "dt_s": 34.655, "eta_s": 3066, "world_size": 1, "timestamp": "2026-05-07T07:44:50.033003"}
|
| 118 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14380, "epoch": 1, "train_loss": 3.434554073959589, "train_ppl": 31.017577935434527, "lr": 0.004940917431192661, "grad_norm": 0.1217, "tokens_per_sec": 151648, "dt_s": 34.573, "eta_s": 3033, "world_size": 1, "timestamp": "2026-05-07T07:45:24.605487"}
|
| 119 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14390, "epoch": 1, "train_loss": 3.4117383658885956, "train_ppl": 30.317902083652733, "lr": 0.004897321100917431, "grad_norm": 0.1266, "tokens_per_sec": 152506, "dt_s": 34.378, "eta_s": 2997, "world_size": 1, "timestamp": "2026-05-07T07:45:58.983658"}
|
| 120 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14400, "epoch": 1, "train_loss": 3.393234556540847, "train_ppl": 29.762063825659236, "lr": 0.004853724770642202, "grad_norm": 0.1388, "tokens_per_sec": 151797, "dt_s": 34.539, "eta_s": 2966, "world_size": 1, "timestamp": "2026-05-07T07:46:33.522465"}
|
| 121 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14410, "epoch": 1, "train_loss": 3.422414356842637, "train_ppl": 30.64330966550688, "lr": 0.004810128440366972, "grad_norm": 0.1204, "tokens_per_sec": 106982, "dt_s": 49.007, "eta_s": 2933, "world_size": 1, "timestamp": "2026-05-07T07:47:22.529454"}
|
| 122 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14420, "epoch": 1, "train_loss": 3.4418497644364834, "train_ppl": 31.24470008241662, "lr": 0.004766532110091744, "grad_norm": 0.1668, "tokens_per_sec": 152486, "dt_s": 34.383, "eta_s": 2894, "world_size": 1, "timestamp": "2026-05-07T07:47:56.912239"}
|
| 123 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14430, "epoch": 1, "train_loss": 3.435611156746745, "train_ppl": 31.050383419170636, "lr": 0.0047229357798165145, "grad_norm": 0.1183, "tokens_per_sec": 152077, "dt_s": 34.475, "eta_s": 2858, "world_size": 1, "timestamp": "2026-05-07T07:48:31.387326"}
|
| 124 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14440, "epoch": 1, "train_loss": 3.4387031169608235, "train_ppl": 31.146538546643537, "lr": 0.004679339449541285, "grad_norm": 0.123, "tokens_per_sec": 151973, "dt_s": 34.499, "eta_s": 2825, "world_size": 1, "timestamp": "2026-05-07T07:49:05.886149"}
|
| 125 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14450, "epoch": 1, "train_loss": 3.3902548905462027, "train_ppl": 29.673514804915005, "lr": 0.004635743119266056, "grad_norm": 0.1182, "tokens_per_sec": 152690, "dt_s": 34.337, "eta_s": 2787, "world_size": 1, "timestamp": "2026-05-07T07:49:40.222964"}
|
| 126 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14460, "epoch": 1, "train_loss": 3.417823538184166, "train_ppl": 30.50295420747134, "lr": 0.004592146788990826, "grad_norm": 0.1165, "tokens_per_sec": 151932, "dt_s": 34.508, "eta_s": 2752, "world_size": 1, "timestamp": "2026-05-07T07:50:14.730992"}
|
| 127 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14470, "epoch": 1, "train_loss": 3.4487975910305977, "train_ppl": 31.462538716806126, "lr": 0.004548550458715596, "grad_norm": 0.1236, "tokens_per_sec": 152842, "dt_s": 34.303, "eta_s": 2716, "world_size": 1, "timestamp": "2026-05-07T07:50:49.033501"}
|
| 128 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14480, "epoch": 1, "train_loss": 3.4574535097926855, "train_ppl": 31.736057968416656, "lr": 0.004504954128440367, "grad_norm": 0.1148, "tokens_per_sec": 151941, "dt_s": 34.506, "eta_s": 2682, "world_size": 1, "timestamp": "2026-05-07T07:51:23.539517"}
|
| 129 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14490, "epoch": 1, "train_loss": 3.422091230750084, "train_ppl": 30.633409612160687, "lr": 0.004461357798165137, "grad_norm": 0.1149, "tokens_per_sec": 150989, "dt_s": 34.724, "eta_s": 2651, "world_size": 1, "timestamp": "2026-05-07T07:51:58.263064"}
|
| 130 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14500, "epoch": 1, "train_loss": 3.4096069000661373, "train_ppl": 30.253349331984136, "lr": 0.004417761467889909, "grad_norm": 0.1274, "tokens_per_sec": 152749, "dt_s": 34.324, "eta_s": 2616, "world_size": 1, "timestamp": "2026-05-07T07:52:32.586646"}
|
| 131 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14510, "epoch": 1, "train_loss": 3.416939763352275, "train_ppl": 30.47600837302346, "lr": 0.004374165137614679, "grad_norm": 0.1178, "tokens_per_sec": 114789, "dt_s": 45.674, "eta_s": 2587, "world_size": 1, "timestamp": "2026-05-07T07:53:18.260888"}
|
| 132 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14520, "epoch": 1, "train_loss": 3.413985440507531, "train_ppl": 30.38610527205263, "lr": 0.0043305688073394496, "grad_norm": 0.1385, "tokens_per_sec": 151585, "dt_s": 34.587, "eta_s": 2556, "world_size": 1, "timestamp": "2026-05-07T07:53:52.847982"}
|
| 133 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14530, "epoch": 1, "train_loss": 3.436718450859189, "train_ppl": 31.08478436838916, "lr": 0.004286972477064221, "grad_norm": 0.1377, "tokens_per_sec": 150728, "dt_s": 34.784, "eta_s": 2526, "world_size": 1, "timestamp": "2026-05-07T07:54:27.631830"}
|
| 134 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14540, "epoch": 1, "train_loss": 3.413869872689247, "train_ppl": 30.382593819069577, "lr": 0.004243376146788991, "grad_norm": 0.1249, "tokens_per_sec": 151163, "dt_s": 34.684, "eta_s": 2491, "world_size": 1, "timestamp": "2026-05-07T07:55:02.315532"}
|
| 135 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14550, "epoch": 1, "train_loss": 3.39662572927773, "train_ppl": 29.863163451357813, "lr": 0.004199779816513762, "grad_norm": 0.1179, "tokens_per_sec": 151862, "dt_s": 34.524, "eta_s": 2459, "world_size": 1, "timestamp": "2026-05-07T07:55:36.839562"}
|
| 136 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14560, "epoch": 1, "train_loss": 3.4174587689340115, "train_ppl": 30.49182969680064, "lr": 0.004156183486238533, "grad_norm": 0.1235, "tokens_per_sec": 152585, "dt_s": 34.36, "eta_s": 2418, "world_size": 1, "timestamp": "2026-05-07T07:56:11.199915"}
|
| 137 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14570, "epoch": 1, "train_loss": 3.4058915209025145, "train_ppl": 30.141155219141844, "lr": 0.004112587155963303, "grad_norm": 0.1181, "tokens_per_sec": 152674, "dt_s": 34.34, "eta_s": 2380, "world_size": 1, "timestamp": "2026-05-07T07:56:45.540177"}
|
| 138 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14580, "epoch": 1, "train_loss": 3.394247941672802, "train_ppl": 29.79223954586715, "lr": 0.004068990825688074, "grad_norm": 0.151, "tokens_per_sec": 152645, "dt_s": 34.347, "eta_s": 2339, "world_size": 1, "timestamp": "2026-05-07T07:57:19.886987"}
|
| 139 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14590, "epoch": 1, "train_loss": 3.367717931047082, "train_ppl": 29.012243514576088, "lr": 0.004025394495412844, "grad_norm": 0.1219, "tokens_per_sec": 152415, "dt_s": 34.399, "eta_s": 2301, "world_size": 1, "timestamp": "2026-05-07T07:57:54.285608"}
|
| 140 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14600, "epoch": 1, "train_loss": 3.426645915955305, "train_ppl": 30.773253380120497, "lr": 0.0039817981651376145, "grad_norm": 0.1196, "tokens_per_sec": 152832, "dt_s": 34.305, "eta_s": 2264, "world_size": 1, "timestamp": "2026-05-07T07:58:28.590493"}
|
| 141 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14610, "epoch": 1, "train_loss": 3.4511796198785305, "train_ppl": 31.537572722763183, "lr": 0.0039382018348623855, "grad_norm": 0.1115, "tokens_per_sec": 105630, "dt_s": 49.634, "eta_s": 2229, "world_size": 1, "timestamp": "2026-05-07T07:59:18.224817"}
|
| 142 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14620, "epoch": 1, "train_loss": 3.4053111728280783, "train_ppl": 30.123667932596483, "lr": 0.0038946055045871557, "grad_norm": 0.1121, "tokens_per_sec": 153166, "dt_s": 34.23, "eta_s": 2194, "world_size": 1, "timestamp": "2026-05-07T07:59:52.454757"}
|
| 143 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14630, "epoch": 1, "train_loss": 3.4237597342580557, "train_ppl": 30.68456422751984, "lr": 0.0038510091743119276, "grad_norm": 0.125, "tokens_per_sec": 152211, "dt_s": 34.445, "eta_s": 2160, "world_size": 1, "timestamp": "2026-05-07T08:00:26.899592"}
|
| 144 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14640, "epoch": 1, "train_loss": 3.417356174439192, "train_ppl": 30.488701563404145, "lr": 0.0038074128440366978, "grad_norm": 0.1336, "tokens_per_sec": 153292, "dt_s": 34.202, "eta_s": 2124, "world_size": 1, "timestamp": "2026-05-07T08:01:01.101447"}
|
| 145 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14650, "epoch": 1, "train_loss": 3.3914885986596346, "train_ppl": 29.710145852247873, "lr": 0.003763816513761468, "grad_norm": 0.1134, "tokens_per_sec": 152236, "dt_s": 34.439, "eta_s": 2091, "world_size": 1, "timestamp": "2026-05-07T08:01:35.540592"}
|
| 146 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14660, "epoch": 1, "train_loss": 3.393052676692605, "train_ppl": 29.75665119824611, "lr": 0.003720220183486239, "grad_norm": 0.113, "tokens_per_sec": 152106, "dt_s": 34.469, "eta_s": 2058, "world_size": 1, "timestamp": "2026-05-07T08:02:10.009108"}
|
| 147 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14670, "epoch": 1, "train_loss": 3.4062552098184824, "train_ppl": 30.15211921683088, "lr": 0.003676623853211009, "grad_norm": 0.1128, "tokens_per_sec": 154091, "dt_s": 34.025, "eta_s": 2021, "world_size": 1, "timestamp": "2026-05-07T08:02:44.033747"}
|
| 148 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14680, "epoch": 1, "train_loss": 3.4487738721072674, "train_ppl": 31.46179246811267, "lr": 0.0036330275229357802, "grad_norm": 0.1149, "tokens_per_sec": 152393, "dt_s": 34.404, "eta_s": 1986, "world_size": 1, "timestamp": "2026-05-07T08:03:18.437490"}
|
| 149 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14690, "epoch": 1, "train_loss": 3.3708790000528097, "train_ppl": 29.104098321541144, "lr": 0.0035894311926605504, "grad_norm": 0.1117, "tokens_per_sec": 150673, "dt_s": 34.796, "eta_s": 1959, "world_size": 1, "timestamp": "2026-05-07T08:03:53.233905"}
|
| 150 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14700, "epoch": 1, "train_loss": 3.413738513365388, "train_ppl": 30.378603044206898, "lr": 0.0035458348623853206, "grad_norm": 0.1193, "tokens_per_sec": 151135, "dt_s": 34.69, "eta_s": 1927, "world_size": 1, "timestamp": "2026-05-07T08:04:27.923992"}
|
| 151 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14710, "epoch": 1, "train_loss": 3.4028570540249348, "train_ppl": 30.049831511436086, "lr": 0.0035022385321100925, "grad_norm": 0.1076, "tokens_per_sec": 116214, "dt_s": 45.114, "eta_s": 1893, "world_size": 1, "timestamp": "2026-05-07T08:05:13.038049"}
|
| 152 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14720, "epoch": 1, "train_loss": 3.433362228795886, "train_ppl": 30.980631806588374, "lr": 0.0034586422018348627, "grad_norm": 0.1099, "tokens_per_sec": 152544, "dt_s": 34.37, "eta_s": 1863, "world_size": 1, "timestamp": "2026-05-07T08:05:47.407679"}
|
| 153 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14730, "epoch": 1, "train_loss": 3.4365407302975655, "train_ppl": 31.07926045392439, "lr": 0.003415045871559633, "grad_norm": 0.111, "tokens_per_sec": 153781, "dt_s": 34.093, "eta_s": 1825, "world_size": 1, "timestamp": "2026-05-07T08:06:21.500781"}
|
| 154 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14740, "epoch": 1, "train_loss": 3.4651960730552673, "train_ppl": 31.98273010999685, "lr": 0.003371449541284404, "grad_norm": 0.1012, "tokens_per_sec": 154222, "dt_s": 33.996, "eta_s": 1782, "world_size": 1, "timestamp": "2026-05-07T08:06:55.496440"}
|
| 155 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14750, "epoch": 1, "train_loss": 3.4456415846943855, "train_ppl": 31.36339926991268, "lr": 0.003327853211009174, "grad_norm": 0.1212, "tokens_per_sec": 152991, "dt_s": 34.269, "eta_s": 1743, "world_size": 1, "timestamp": "2026-05-07T08:07:29.765580"}
|
| 156 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14760, "epoch": 1, "train_loss": 3.4012427739799023, "train_ppl": 30.001361800439813, "lr": 0.003284256880733946, "grad_norm": 0.1123, "tokens_per_sec": 154420, "dt_s": 33.952, "eta_s": 1703, "world_size": 1, "timestamp": "2026-05-07T08:08:03.717563"}
|
| 157 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14770, "epoch": 1, "train_loss": 3.380577217787504, "train_ppl": 29.387729338632553, "lr": 0.003240660550458716, "grad_norm": 0.1136, "tokens_per_sec": 153758, "dt_s": 34.098, "eta_s": 1667, "world_size": 1, "timestamp": "2026-05-07T08:08:37.815879"}
|
| 158 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14780, "epoch": 1, "train_loss": 3.4051584862172604, "train_ppl": 30.119068802956146, "lr": 0.0031970642201834855, "grad_norm": 0.1021, "tokens_per_sec": 152513, "dt_s": 34.377, "eta_s": 1635, "world_size": 1, "timestamp": "2026-05-07T08:09:12.192514"}
|
| 159 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14790, "epoch": 1, "train_loss": 3.4182691629976034, "train_ppl": 30.516550109860077, "lr": 0.0031534678899082565, "grad_norm": 0.1069, "tokens_per_sec": 154377, "dt_s": 33.962, "eta_s": 1601, "world_size": 1, "timestamp": "2026-05-07T08:09:46.154082"}
|
| 160 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14800, "epoch": 1, "train_loss": 3.4209798015654087, "train_ppl": 30.599381660023994, "lr": 0.0031098715596330276, "grad_norm": 0.1097, "tokens_per_sec": 153453, "dt_s": 34.166, "eta_s": 1566, "world_size": 1, "timestamp": "2026-05-07T08:10:20.320124"}
|
| 161 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14810, "epoch": 1, "train_loss": 3.4488363061100245, "train_ppl": 31.46375681507075, "lr": 0.0030662752293577986, "grad_norm": 0.1225, "tokens_per_sec": 118052, "dt_s": 44.412, "eta_s": 1534, "world_size": 1, "timestamp": "2026-05-07T08:11:04.731737"}
|
| 162 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14820, "epoch": 1, "train_loss": 3.4084146730601788, "train_ppl": 30.21730196448219, "lr": 0.0030226788990825697, "grad_norm": 0.1086, "tokens_per_sec": 153894, "dt_s": 34.068, "eta_s": 1499, "world_size": 1, "timestamp": "2026-05-07T08:11:38.799892"}
|
| 163 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14830, "epoch": 1, "train_loss": 3.4228743985295296, "train_ppl": 30.657410108524484, "lr": 0.002979082568807339, "grad_norm": 0.1107, "tokens_per_sec": 153730, "dt_s": 34.104, "eta_s": 1463, "world_size": 1, "timestamp": "2026-05-07T08:12:12.904342"}
|
| 164 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14840, "epoch": 1, "train_loss": 3.414470875635743, "train_ppl": 30.400859335741902, "lr": 0.00293548623853211, "grad_norm": 0.1059, "tokens_per_sec": 155096, "dt_s": 33.804, "eta_s": 1427, "world_size": 1, "timestamp": "2026-05-07T08:12:46.708552"}
|
| 165 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14850, "epoch": 1, "train_loss": 3.3855159170925617, "train_ppl": 29.53322548221618, "lr": 0.002891889908256881, "grad_norm": 0.1123, "tokens_per_sec": 154632, "dt_s": 33.906, "eta_s": 1391, "world_size": 1, "timestamp": "2026-05-07T08:13:20.614092"}
|
| 166 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14860, "epoch": 1, "train_loss": 3.3912165705114603, "train_ppl": 29.702064955455224, "lr": 0.0028482935779816504, "grad_norm": 0.1162, "tokens_per_sec": 153715, "dt_s": 34.108, "eta_s": 1356, "world_size": 1, "timestamp": "2026-05-07T08:13:54.721879"}
|
| 167 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14870, "epoch": 1, "train_loss": 3.4287780951708555, "train_ppl": 30.83893747161544, "lr": 0.002804697247706423, "grad_norm": 0.1059, "tokens_per_sec": 152306, "dt_s": 34.423, "eta_s": 1325, "world_size": 1, "timestamp": "2026-05-07T08:14:29.145258"}
|
| 168 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14880, "epoch": 1, "train_loss": 3.4119360465556383, "train_ppl": 30.323895939175127, "lr": 0.0027611009174311925, "grad_norm": 0.1112, "tokens_per_sec": 154637, "dt_s": 33.904, "eta_s": 1290, "world_size": 1, "timestamp": "2026-05-07T08:15:03.049596"}
|
| 169 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14890, "epoch": 1, "train_loss": 3.4452989771962166, "train_ppl": 31.352655774660846, "lr": 0.0027175045871559635, "grad_norm": 0.111, "tokens_per_sec": 153984, "dt_s": 34.048, "eta_s": 1257, "world_size": 1, "timestamp": "2026-05-07T08:15:37.097906"}
|
| 170 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14900, "epoch": 1, "train_loss": 3.3889274951070547, "train_ppl": 29.634152447186167, "lr": 0.0026739082568807346, "grad_norm": 0.1041, "tokens_per_sec": 154124, "dt_s": 34.017, "eta_s": 1224, "world_size": 1, "timestamp": "2026-05-07T08:16:11.115123"}
|
| 171 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14910, "epoch": 1, "train_loss": 3.3937618620693684, "train_ppl": 29.777761664869683, "lr": 0.002630311926605504, "grad_norm": 0.0972, "tokens_per_sec": 121696, "dt_s": 43.082, "eta_s": 1191, "world_size": 1, "timestamp": "2026-05-07T08:16:54.196848"}
|
| 172 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14920, "epoch": 1, "train_loss": 3.394768614321947, "train_ppl": 29.80775558919462, "lr": 0.002586715596330275, "grad_norm": 0.1085, "tokens_per_sec": 154682, "dt_s": 33.895, "eta_s": 1153, "world_size": 1, "timestamp": "2026-05-07T08:17:28.091379"}
|
| 173 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14930, "epoch": 1, "train_loss": 3.387140903621912, "train_ppl": 29.581255589362655, "lr": 0.002543119266055046, "grad_norm": 0.1058, "tokens_per_sec": 154991, "dt_s": 33.827, "eta_s": 1119, "world_size": 1, "timestamp": "2026-05-07T08:18:01.918472"}
|
| 174 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14940, "epoch": 1, "train_loss": 3.403923511505127, "train_ppl": 30.081895473417074, "lr": 0.0024995229357798153, "grad_norm": 0.1013, "tokens_per_sec": 154661, "dt_s": 33.899, "eta_s": 1084, "world_size": 1, "timestamp": "2026-05-07T08:18:35.817563"}
|
| 175 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14950, "epoch": 1, "train_loss": 3.392174554988742, "train_ppl": 29.730532706280304, "lr": 0.002455926605504588, "grad_norm": 0.1095, "tokens_per_sec": 154772, "dt_s": 33.875, "eta_s": 1049, "world_size": 1, "timestamp": "2026-05-07T08:19:09.692347"}
|
| 176 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14960, "epoch": 1, "train_loss": 3.389118816703558, "train_ppl": 29.639822642941542, "lr": 0.0024123302752293574, "grad_norm": 0.0948, "tokens_per_sec": 155168, "dt_s": 33.788, "eta_s": 1012, "world_size": 1, "timestamp": "2026-05-07T08:19:43.480735"}
|
| 177 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14970, "epoch": 1, "train_loss": 3.3476616255939007, "train_ppl": 28.436161446606526, "lr": 0.0023687339449541284, "grad_norm": 0.1109, "tokens_per_sec": 153630, "dt_s": 34.127, "eta_s": 980, "world_size": 1, "timestamp": "2026-05-07T08:20:17.607382"}
|
| 178 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14980, "epoch": 1, "train_loss": 3.39289803057909, "train_ppl": 29.752049803590154, "lr": 0.0023251376146788995, "grad_norm": 0.1062, "tokens_per_sec": 154363, "dt_s": 33.965, "eta_s": 947, "world_size": 1, "timestamp": "2026-05-07T08:20:51.571898"}
|
| 179 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 14990, "epoch": 1, "train_loss": 3.376034192740917, "train_ppl": 29.25452295725296, "lr": 0.002281541284403669, "grad_norm": 0.0975, "tokens_per_sec": 154167, "dt_s": 34.008, "eta_s": 913, "world_size": 1, "timestamp": "2026-05-07T08:21:25.579728"}
|
| 180 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15000, "epoch": 1, "train_loss": 3.432103695347905, "train_ppl": 30.941666170141758, "lr": 0.0022379449541284416, "grad_norm": 0.1055, "tokens_per_sec": 154725, "dt_s": 33.885, "eta_s": 879, "world_size": 1, "timestamp": "2026-05-07T08:21:59.464907"}
|
| 181 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15010, "epoch": 1, "train_loss": 3.4614342227578163, "train_ppl": 31.86264188579941, "lr": 0.002194348623853211, "grad_norm": 0.1088, "tokens_per_sec": 117543, "dt_s": 44.604, "eta_s": 848, "world_size": 1, "timestamp": "2026-05-07T08:22:44.068867"}
|
| 182 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15020, "epoch": 1, "train_loss": 3.4082186073064804, "train_ppl": 30.21137796716327, "lr": 0.002150752293577982, "grad_norm": 0.1085, "tokens_per_sec": 154250, "dt_s": 33.99, "eta_s": 813, "world_size": 1, "timestamp": "2026-05-07T08:23:18.058444"}
|
| 183 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15030, "epoch": 1, "train_loss": 3.4596411362290382, "train_ppl": 31.805560602996106, "lr": 0.002107155963302753, "grad_norm": 0.1049, "tokens_per_sec": 153836, "dt_s": 34.081, "eta_s": 780, "world_size": 1, "timestamp": "2026-05-07T08:23:52.139318"}
|
| 184 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15040, "epoch": 1, "train_loss": 3.35403717122972, "train_ppl": 28.618036652388945, "lr": 0.0020635596330275223, "grad_norm": 0.1013, "tokens_per_sec": 153237, "dt_s": 34.214, "eta_s": 747, "world_size": 1, "timestamp": "2026-05-07T08:24:26.353419"}
|
| 185 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15050, "epoch": 1, "train_loss": 3.3611203990876675, "train_ppl": 28.82146433863964, "lr": 0.0020199633027522933, "grad_norm": 0.0991, "tokens_per_sec": 153471, "dt_s": 34.162, "eta_s": 714, "world_size": 1, "timestamp": "2026-05-07T08:25:00.515350"}
|
| 186 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15060, "epoch": 1, "train_loss": 3.379062168300152, "train_ppl": 29.343239185265762, "lr": 0.0019763669724770644, "grad_norm": 0.1018, "tokens_per_sec": 153378, "dt_s": 34.183, "eta_s": 679, "world_size": 1, "timestamp": "2026-05-07T08:25:34.698167"}
|
| 187 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15070, "epoch": 1, "train_loss": 3.339954435825348, "train_ppl": 28.21784095348321, "lr": 0.0019327706422018337, "grad_norm": 0.1014, "tokens_per_sec": 153378, "dt_s": 34.183, "eta_s": 646, "world_size": 1, "timestamp": "2026-05-07T08:26:08.881001"}
|
| 188 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15080, "epoch": 1, "train_loss": 3.401428686454892, "train_ppl": 30.00693994637259, "lr": 0.0018891743119266065, "grad_norm": 0.1071, "tokens_per_sec": 153371, "dt_s": 34.184, "eta_s": 612, "world_size": 1, "timestamp": "2026-05-07T08:26:43.065387"}
|
| 189 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15090, "epoch": 1, "train_loss": 3.3325652200728655, "train_ppl": 28.010101698916387, "lr": 0.0018455779816513758, "grad_norm": 0.1127, "tokens_per_sec": 153738, "dt_s": 34.103, "eta_s": 577, "world_size": 1, "timestamp": "2026-05-07T08:27:17.168142"}
|
| 190 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15100, "epoch": 1, "train_loss": 3.376005206257105, "train_ppl": 29.253674983786762, "lr": 0.0018019816513761468, "grad_norm": 0.1039, "tokens_per_sec": 154034, "dt_s": 34.037, "eta_s": 543, "world_size": 1, "timestamp": "2026-05-07T08:27:51.205227"}
|
| 191 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15110, "epoch": 1, "train_loss": 3.434177067130804, "train_ppl": 31.00588630079203, "lr": 0.0017583853211009179, "grad_norm": 0.1001, "tokens_per_sec": 107982, "dt_s": 48.553, "eta_s": 509, "world_size": 1, "timestamp": "2026-05-07T08:28:39.758496"}
|
| 192 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15120, "epoch": 1, "train_loss": 3.4106926154345274, "train_ppl": 30.286213695743236, "lr": 0.0017147889908256872, "grad_norm": 0.1035, "tokens_per_sec": 154103, "dt_s": 34.022, "eta_s": 474, "world_size": 1, "timestamp": "2026-05-07T08:29:13.780510"}
|
| 193 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15130, "epoch": 1, "train_loss": 3.363372951745987, "train_ppl": 28.88645937964647, "lr": 0.0016711926605504582, "grad_norm": 0.1032, "tokens_per_sec": 153676, "dt_s": 34.116, "eta_s": 440, "world_size": 1, "timestamp": "2026-05-07T08:29:47.896904"}
|
| 194 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15140, "epoch": 1, "train_loss": 3.3809038754552603, "train_ppl": 29.397330633841932, "lr": 0.0016275963302752293, "grad_norm": 0.1015, "tokens_per_sec": 153277, "dt_s": 34.205, "eta_s": 406, "world_size": 1, "timestamp": "2026-05-07T08:30:22.102239"}
|
| 195 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15150, "epoch": 1, "train_loss": 3.3895953968167305, "train_ppl": 29.653951759531694, "lr": 0.0015840000000000003, "grad_norm": 0.0997, "tokens_per_sec": 154173, "dt_s": 34.006, "eta_s": 372, "world_size": 1, "timestamp": "2026-05-07T08:30:56.108618"}
|
| 196 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15160, "epoch": 1, "train_loss": 3.38456766679883, "train_ppl": 29.505233866104383, "lr": 0.0015404036697247714, "grad_norm": 0.1098, "tokens_per_sec": 153587, "dt_s": 34.136, "eta_s": 338, "world_size": 1, "timestamp": "2026-05-07T08:31:30.244878"}
|
| 197 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15170, "epoch": 1, "train_loss": 3.447007045149803, "train_ppl": 31.40625400292636, "lr": 0.0014968073394495407, "grad_norm": 0.1013, "tokens_per_sec": 154615, "dt_s": 33.909, "eta_s": 303, "world_size": 1, "timestamp": "2026-05-07T08:32:04.154182"}
|
| 198 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15180, "epoch": 1, "train_loss": 3.3957910407334566, "train_ppl": 29.838247410942547, "lr": 0.0014532110091743117, "grad_norm": 0.1007, "tokens_per_sec": 154628, "dt_s": 33.906, "eta_s": 269, "world_size": 1, "timestamp": "2026-05-07T08:32:38.060481"}
|
| 199 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15190, "epoch": 1, "train_loss": 3.349813250824809, "train_ppl": 28.497411278757724, "lr": 0.0014096146788990828, "grad_norm": 0.1003, "tokens_per_sec": 154749, "dt_s": 33.88, "eta_s": 234, "world_size": 1, "timestamp": "2026-05-07T08:33:11.940391"}
|
| 200 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15200, "epoch": 1, "train_loss": 3.402709014713764, "train_ppl": 30.0453832843427, "lr": 0.001366018348623852, "grad_norm": 0.1053, "tokens_per_sec": 154312, "dt_s": 33.976, "eta_s": 200, "world_size": 1, "timestamp": "2026-05-07T08:33:45.916330"}
|
| 201 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15210, "epoch": 1, "train_loss": 3.354764824733138, "train_ppl": 28.63886824519158, "lr": 0.0013224220183486249, "grad_norm": 0.1059, "tokens_per_sec": 105182, "dt_s": 49.846, "eta_s": 167, "world_size": 1, "timestamp": "2026-05-07T08:34:35.762038"}
|
| 202 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15220, "epoch": 1, "train_loss": 3.382032288238406, "train_ppl": 29.43052168059836, "lr": 0.0012788256880733942, "grad_norm": 0.0998, "tokens_per_sec": 153139, "dt_s": 34.236, "eta_s": 133, "world_size": 1, "timestamp": "2026-05-07T08:35:09.998222"}
|
| 203 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15230, "epoch": 1, "train_loss": 3.3915587421506643, "train_ppl": 29.712229898687244, "lr": 0.0012352293577981652, "grad_norm": 0.1019, "tokens_per_sec": 153865, "dt_s": 34.074, "eta_s": 99, "world_size": 1, "timestamp": "2026-05-07T08:35:44.072675"}
|
| 204 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15240, "epoch": 1, "train_loss": 3.392764439806342, "train_ppl": 29.74807546973913, "lr": 0.0011916330275229363, "grad_norm": 0.1005, "tokens_per_sec": 154637, "dt_s": 33.904, "eta_s": 65, "world_size": 1, "timestamp": "2026-05-07T08:36:17.977171"}
|
| 205 |
+
{"run_name": "final_c2_muon_bs512_lr12_seed3_mix3to1", "stage": "pretraining", "event": "train_step", "step": 15250, "epoch": 1, "train_loss": 3.368881171569228, "train_ppl": 29.04601136813073, "lr": 0.0011480366972477056, "grad_norm": 0.0951, "tokens_per_sec": 154289, "dt_s": 33.981, "eta_s": 31, "world_size": 1, "timestamp": "2026-05-07T08:36:51.958178"}
|