fullrun / final /train_metrics.jsonl
huiting tang
Upload folder using huggingface_hub
ba9bb9a verified
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 10, "epoch": 0, "train_loss": 9.714061617851257, "train_ppl": 16548.679975757317, "lr": 0.00044, "grad_norm": 5.8358, "tokens_per_sec": 93514, "dt_s": 56.065, "eta_s": 1121234, "world_size": 2, "timestamp": "2026-05-01T22:51:47.504120"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 20, "epoch": 0, "train_loss": 8.264804065227509, "train_ppl": 3884.7117531178224, "lr": 0.00066, "grad_norm": 1.2947, "tokens_per_sec": 217691, "dt_s": 24.084, "eta_s": 801390, "world_size": 2, "timestamp": "2026-05-01T22:52:11.588206"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 30, "epoch": 0, "train_loss": 7.582939103245735, "train_ppl": 1964.3940460710432, "lr": 0.00066, "grad_norm": 0.487, "tokens_per_sec": 217784, "dt_s": 24.074, "eta_s": 694695, "world_size": 2, "timestamp": "2026-05-01T22:52:35.661979"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 40, "epoch": 0, "train_loss": 7.30265711247921, "train_ppl": 1484.2384812679115, "lr": 0.00066, "grad_norm": 0.5469, "tokens_per_sec": 217254, "dt_s": 24.133, "eta_s": 641629, "world_size": 2, "timestamp": "2026-05-01T22:52:59.794518"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 50, "epoch": 0, "train_loss": 7.118218153715134, "train_ppl": 1234.249230539367, "lr": 0.00066, "grad_norm": 1.0615, "tokens_per_sec": 217361, "dt_s": 24.121, "eta_s": 609732, "world_size": 2, "timestamp": "2026-05-01T22:53:23.915142"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 60, "epoch": 0, "train_loss": 7.022111967206001, "train_ppl": 1121.1519552721263, "lr": 0.00066, "grad_norm": 0.3998, "tokens_per_sec": 217252, "dt_s": 24.133, "eta_s": 482009, "world_size": 2, "timestamp": "2026-05-01T22:53:48.047877"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 70, "epoch": 0, "train_loss": 6.927608221769333, "train_ppl": 1020.0513230919989, "lr": 0.00066, "grad_norm": 0.2864, "tokens_per_sec": 217079, "dt_s": 24.152, "eta_s": 482258, "world_size": 2, "timestamp": "2026-05-01T22:54:12.199790"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 80, "epoch": 0, "train_loss": 6.765780031681061, "train_ppl": 867.6427327194519, "lr": 0.00066, "grad_norm": 0.588, "tokens_per_sec": 217043, "dt_s": 24.156, "eta_s": 482562, "world_size": 2, "timestamp": "2026-05-01T22:54:36.355802"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 90, "epoch": 0, "train_loss": 6.560252025723457, "train_ppl": 706.4497156621557, "lr": 0.00066, "grad_norm": 0.4138, "tokens_per_sec": 216935, "dt_s": 24.168, "eta_s": 482680, "world_size": 2, "timestamp": "2026-05-01T22:55:00.523767"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 100, "epoch": 0, "train_loss": 6.55733186006546, "train_ppl": 704.3897746117563, "lr": 0.00066, "grad_norm": 0.3154, "tokens_per_sec": 216627, "dt_s": 24.202, "eta_s": 482983, "world_size": 2, "timestamp": "2026-05-01T22:55:24.726155"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 110, "epoch": 0, "train_loss": 6.48424918949604, "train_ppl": 654.7471885803924, "lr": 0.00066, "grad_norm": 0.4683, "tokens_per_sec": 216530, "dt_s": 24.213, "eta_s": 483280, "world_size": 2, "timestamp": "2026-05-01T22:55:48.939399"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 120, "epoch": 0, "train_loss": 6.376413524150848, "train_ppl": 587.8157357142162, "lr": 0.00066, "grad_norm": 0.3407, "tokens_per_sec": 216580, "dt_s": 24.208, "eta_s": 483479, "world_size": 2, "timestamp": "2026-05-01T22:56:13.146977"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 130, "epoch": 0, "train_loss": 6.322140544652939, "train_ppl": 556.7634949570315, "lr": 0.00066, "grad_norm": 0.6832, "tokens_per_sec": 216398, "dt_s": 24.228, "eta_s": 483742, "world_size": 2, "timestamp": "2026-05-01T22:56:37.374938"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 140, "epoch": 0, "train_loss": 6.3091679364442825, "train_ppl": 549.5874668095711, "lr": 0.00066, "grad_norm": 0.3633, "tokens_per_sec": 216494, "dt_s": 24.217, "eta_s": 483914, "world_size": 2, "timestamp": "2026-05-01T22:57:01.592100"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 150, "epoch": 0, "train_loss": 6.175947576761246, "train_ppl": 481.0386290523523, "lr": 0.00066, "grad_norm": 0.5308, "tokens_per_sec": 216450, "dt_s": 24.222, "eta_s": 483969, "world_size": 2, "timestamp": "2026-05-01T22:57:25.814218"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 160, "epoch": 0, "train_loss": 6.094743117690086, "train_ppl": 443.5200980749687, "lr": 0.00066, "grad_norm": 0.5934, "tokens_per_sec": 216427, "dt_s": 24.225, "eta_s": 483991, "world_size": 2, "timestamp": "2026-05-01T22:57:50.038898"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 170, "epoch": 0, "train_loss": 6.066118851304054, "train_ppl": 431.0046381022878, "lr": 0.00066, "grad_norm": 0.4757, "tokens_per_sec": 216419, "dt_s": 24.226, "eta_s": 484039, "world_size": 2, "timestamp": "2026-05-01T22:58:14.264547"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 180, "epoch": 0, "train_loss": 5.994265928864479, "train_ppl": 401.1221237172093, "lr": 0.00066, "grad_norm": 0.7316, "tokens_per_sec": 216340, "dt_s": 24.234, "eta_s": 484041, "world_size": 2, "timestamp": "2026-05-01T22:58:38.499046"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 190, "epoch": 0, "train_loss": 5.897585526108742, "train_ppl": 364.157157600573, "lr": 0.00066, "grad_norm": 0.443, "tokens_per_sec": 216086, "dt_s": 24.263, "eta_s": 484200, "world_size": 2, "timestamp": "2026-05-01T22:59:02.761987"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 200, "epoch": 0, "train_loss": 5.917451128363609, "train_ppl": 371.4636929267322, "lr": 0.00066, "grad_norm": 0.5644, "tokens_per_sec": 216035, "dt_s": 24.269, "eta_s": 484361, "world_size": 2, "timestamp": "2026-05-01T22:59:27.030633"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 210, "epoch": 0, "train_loss": 5.840434417128563, "train_ppl": 343.9287167424116, "lr": 0.00066, "grad_norm": 0.7434, "tokens_per_sec": 216051, "dt_s": 24.267, "eta_s": 484506, "world_size": 2, "timestamp": "2026-05-01T22:59:51.297526"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 220, "epoch": 0, "train_loss": 5.710009142756462, "train_ppl": 301.8738282290672, "lr": 0.00066, "grad_norm": 0.6202, "tokens_per_sec": 216211, "dt_s": 24.249, "eta_s": 484574, "world_size": 2, "timestamp": "2026-05-01T23:00:15.546488"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 230, "epoch": 0, "train_loss": 5.66944494843483, "train_ppl": 289.87359493844673, "lr": 0.00066, "grad_norm": 0.575, "tokens_per_sec": 216341, "dt_s": 24.234, "eta_s": 484549, "world_size": 2, "timestamp": "2026-05-01T23:00:39.780821"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 240, "epoch": 0, "train_loss": 5.755365818738937, "train_ppl": 315.8810816095787, "lr": 0.00066, "grad_norm": 0.5618, "tokens_per_sec": 216264, "dt_s": 24.243, "eta_s": 484445, "world_size": 2, "timestamp": "2026-05-01T23:01:04.023811"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 250, "epoch": 0, "train_loss": 5.613779231905937, "train_ppl": 274.17846648763236, "lr": 0.00066, "grad_norm": 0.3584, "tokens_per_sec": 216642, "dt_s": 24.201, "eta_s": 484150, "world_size": 2, "timestamp": "2026-05-01T23:01:28.224514"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 260, "epoch": 0, "train_loss": 5.602131277322769, "train_ppl": 271.00337571709616, "lr": 0.00066, "grad_norm": 1.0164, "tokens_per_sec": 216777, "dt_s": 24.186, "eta_s": 483801, "world_size": 2, "timestamp": "2026-05-01T23:01:52.410170"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 270, "epoch": 0, "train_loss": 5.5301821529865265, "train_ppl": 252.18984397322512, "lr": 0.00066, "grad_norm": 0.774, "tokens_per_sec": 216781, "dt_s": 24.185, "eta_s": 483522, "world_size": 2, "timestamp": "2026-05-01T23:02:16.595254"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 280, "epoch": 0, "train_loss": 5.556942284107208, "train_ppl": 259.0295852932062, "lr": 0.00066, "grad_norm": 0.3995, "tokens_per_sec": 216828, "dt_s": 24.18, "eta_s": 483280, "world_size": 2, "timestamp": "2026-05-01T23:02:40.775212"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 290, "epoch": 0, "train_loss": 5.503597214818001, "train_ppl": 245.57372676005505, "lr": 0.00066, "grad_norm": 0.9132, "tokens_per_sec": 216823, "dt_s": 24.181, "eta_s": 483007, "world_size": 2, "timestamp": "2026-05-01T23:03:04.955726"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 300, "epoch": 0, "train_loss": 5.459054425358772, "train_ppl": 234.8752272710654, "lr": 0.00066, "grad_norm": 0.6277, "tokens_per_sec": 216849, "dt_s": 24.178, "eta_s": 482890, "world_size": 2, "timestamp": "2026-05-01T23:03:29.133337"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 310, "epoch": 0, "train_loss": 5.420411080121994, "train_ppl": 225.97199600718537, "lr": 0.00066, "grad_norm": 0.4227, "tokens_per_sec": 216910, "dt_s": 24.171, "eta_s": 482807, "world_size": 2, "timestamp": "2026-05-01T23:03:53.304041"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 320, "epoch": 0, "train_loss": 5.451010346412659, "train_ppl": 232.9934511253768, "lr": 0.00066, "grad_norm": 0.5027, "tokens_per_sec": 216767, "dt_s": 24.187, "eta_s": 482789, "world_size": 2, "timestamp": "2026-05-01T23:04:17.490855"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 330, "epoch": 0, "train_loss": 5.384555801749229, "train_ppl": 218.01324149544388, "lr": 0.00066, "grad_norm": 0.7347, "tokens_per_sec": 216817, "dt_s": 24.181, "eta_s": 482769, "world_size": 2, "timestamp": "2026-05-01T23:04:41.671932"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 340, "epoch": 0, "train_loss": 5.29522867500782, "train_ppl": 199.38321471205222, "lr": 0.00066, "grad_norm": 0.4429, "tokens_per_sec": 216661, "dt_s": 24.199, "eta_s": 482817, "world_size": 2, "timestamp": "2026-05-01T23:05:05.870474"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 350, "epoch": 0, "train_loss": 5.304015561938286, "train_ppl": 201.14289219715462, "lr": 0.00066, "grad_norm": 0.8687, "tokens_per_sec": 216712, "dt_s": 24.193, "eta_s": 482853, "world_size": 2, "timestamp": "2026-05-01T23:05:30.063337"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 360, "epoch": 0, "train_loss": 5.277590945363045, "train_ppl": 195.89737895635275, "lr": 0.00066, "grad_norm": 0.7429, "tokens_per_sec": 216756, "dt_s": 24.188, "eta_s": 482898, "world_size": 2, "timestamp": "2026-05-01T23:05:54.251256"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 370, "epoch": 0, "train_loss": 5.2177044451236725, "train_ppl": 184.51014439202532, "lr": 0.00066, "grad_norm": 0.4458, "tokens_per_sec": 216869, "dt_s": 24.175, "eta_s": 482828, "world_size": 2, "timestamp": "2026-05-01T23:06:18.426602"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 380, "epoch": 0, "train_loss": 5.204479023814201, "train_ppl": 182.08598559168618, "lr": 0.00066, "grad_norm": 0.9497, "tokens_per_sec": 216886, "dt_s": 24.173, "eta_s": 482773, "world_size": 2, "timestamp": "2026-05-01T23:06:42.600032"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 390, "epoch": 0, "train_loss": 5.1971063911914825, "train_ppl": 180.74846908156732, "lr": 0.00066, "grad_norm": 0.4673, "tokens_per_sec": 216969, "dt_s": 24.164, "eta_s": 482612, "world_size": 2, "timestamp": "2026-05-01T23:07:06.764222"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 400, "epoch": 0, "train_loss": 5.177317887544632, "train_ppl": 177.20688424273442, "lr": 0.00066, "grad_norm": 0.7051, "tokens_per_sec": 216779, "dt_s": 24.185, "eta_s": 482558, "world_size": 2, "timestamp": "2026-05-01T23:07:30.949631"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 410, "epoch": 0, "train_loss": 5.12025149166584, "train_ppl": 167.3774583642263, "lr": 0.00066, "grad_norm": 0.6419, "tokens_per_sec": 216788, "dt_s": 24.184, "eta_s": 482519, "world_size": 2, "timestamp": "2026-05-01T23:07:55.133999"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 420, "epoch": 0, "train_loss": 5.103575512766838, "train_ppl": 164.6094194261373, "lr": 0.00066, "grad_norm": 0.637, "tokens_per_sec": 216790, "dt_s": 24.184, "eta_s": 482530, "world_size": 2, "timestamp": "2026-05-01T23:08:19.318188"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 430, "epoch": 0, "train_loss": 5.1082246750593185, "train_ppl": 165.37649708450314, "lr": 0.00066, "grad_norm": 0.515, "tokens_per_sec": 216706, "dt_s": 24.194, "eta_s": 482587, "world_size": 2, "timestamp": "2026-05-01T23:08:43.511776"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 440, "epoch": 0, "train_loss": 5.097349151968956, "train_ppl": 163.58768593158993, "lr": 0.00066, "grad_norm": 0.5139, "tokens_per_sec": 216851, "dt_s": 24.177, "eta_s": 482615, "world_size": 2, "timestamp": "2026-05-01T23:09:07.689157"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 450, "epoch": 0, "train_loss": 5.091397240757942, "train_ppl": 162.61691837666163, "lr": 0.00066, "grad_norm": 0.5417, "tokens_per_sec": 216238, "dt_s": 24.246, "eta_s": 482833, "world_size": 2, "timestamp": "2026-05-01T23:09:31.935066"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 460, "epoch": 0, "train_loss": 5.042646273970604, "train_ppl": 154.87932634197423, "lr": 0.00066, "grad_norm": 0.5235, "tokens_per_sec": 216734, "dt_s": 24.19, "eta_s": 482832, "world_size": 2, "timestamp": "2026-05-01T23:09:56.125423"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 470, "epoch": 0, "train_loss": 5.022189408540726, "train_ppl": 151.7431681565104, "lr": 0.00066, "grad_norm": 0.5028, "tokens_per_sec": 216765, "dt_s": 24.187, "eta_s": 482819, "world_size": 2, "timestamp": "2026-05-01T23:10:20.312315"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 480, "epoch": 0, "train_loss": 5.021097660064697, "train_ppl": 151.57759318348596, "lr": 0.00066, "grad_norm": 0.8012, "tokens_per_sec": 216658, "dt_s": 24.199, "eta_s": 482816, "world_size": 2, "timestamp": "2026-05-01T23:10:44.511202"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 490, "epoch": 0, "train_loss": 4.962590724229813, "train_ppl": 142.96369604678782, "lr": 0.00066, "grad_norm": 0.5379, "tokens_per_sec": 216808, "dt_s": 24.182, "eta_s": 482811, "world_size": 2, "timestamp": "2026-05-01T23:11:08.693398"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 500, "epoch": 0, "train_loss": 4.9745132476091385, "train_ppl": 144.67838545620066, "lr": 0.00066, "grad_norm": 0.5531, "tokens_per_sec": 216703, "dt_s": 24.194, "eta_s": 482579, "world_size": 2, "timestamp": "2026-05-01T23:11:32.887254"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 510, "epoch": 0, "train_loss": 4.97058829665184, "train_ppl": 144.11164283476822, "lr": 0.00066, "grad_norm": 0.4682, "tokens_per_sec": 120644, "dt_s": 43.457, "eta_s": 482213, "world_size": 2, "timestamp": "2026-05-01T23:12:16.344568"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 520, "epoch": 0, "train_loss": 4.858549058437347, "train_ppl": 128.8371312771928, "lr": 0.00066, "grad_norm": 0.4908, "tokens_per_sec": 217099, "dt_s": 24.15, "eta_s": 482041, "world_size": 2, "timestamp": "2026-05-01T23:12:40.494339"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 530, "epoch": 0, "train_loss": 4.83178748190403, "train_ppl": 125.43497313060017, "lr": 0.00066, "grad_norm": 0.5068, "tokens_per_sec": 217093, "dt_s": 24.15, "eta_s": 481823, "world_size": 2, "timestamp": "2026-05-01T23:13:04.644696"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 540, "epoch": 0, "train_loss": 4.815204486250877, "train_ppl": 123.37203761641175, "lr": 0.00066, "grad_norm": 0.6602, "tokens_per_sec": 218300, "dt_s": 24.017, "eta_s": 481139, "world_size": 2, "timestamp": "2026-05-01T23:13:28.661544"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 550, "epoch": 0, "train_loss": 4.870279416441917, "train_ppl": 130.3573357911475, "lr": 0.00066, "grad_norm": 0.6222, "tokens_per_sec": 216844, "dt_s": 24.178, "eta_s": 481051, "world_size": 2, "timestamp": "2026-05-01T23:13:52.839620"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 560, "epoch": 0, "train_loss": 4.7692625522613525, "train_ppl": 117.83231473842494, "lr": 0.00066, "grad_norm": 0.4479, "tokens_per_sec": 216425, "dt_s": 24.225, "eta_s": 481506, "world_size": 2, "timestamp": "2026-05-01T23:14:17.064518"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 570, "epoch": 0, "train_loss": 4.80383725464344, "train_ppl": 121.97757966942281, "lr": 0.00066, "grad_norm": 0.5289, "tokens_per_sec": 216254, "dt_s": 24.244, "eta_s": 481858, "world_size": 2, "timestamp": "2026-05-01T23:14:41.308593"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 580, "epoch": 0, "train_loss": 4.849370688199997, "train_ppl": 127.66002660078323, "lr": 0.00066, "grad_norm": 0.5438, "tokens_per_sec": 216519, "dt_s": 24.214, "eta_s": 482090, "world_size": 2, "timestamp": "2026-05-01T23:15:05.523072"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 590, "epoch": 0, "train_loss": 4.726563081145287, "train_ppl": 112.90684307521103, "lr": 0.00066, "grad_norm": 0.4503, "tokens_per_sec": 216416, "dt_s": 24.226, "eta_s": 482900, "world_size": 2, "timestamp": "2026-05-01T23:15:29.748972"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 600, "epoch": 0, "train_loss": 4.749731004238129, "train_ppl": 115.55319702590141, "lr": 0.00066, "grad_norm": 0.6285, "tokens_per_sec": 216657, "dt_s": 24.199, "eta_s": 482958, "world_size": 2, "timestamp": "2026-05-01T23:15:53.947909"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 610, "epoch": 0, "train_loss": 4.748478531837463, "train_ppl": 115.40856043139667, "lr": 0.00066, "grad_norm": 0.5496, "tokens_per_sec": 216734, "dt_s": 24.19, "eta_s": 482797, "world_size": 2, "timestamp": "2026-05-01T23:16:18.138307"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 620, "epoch": 0, "train_loss": 4.710269808769226, "train_ppl": 111.08212679476617, "lr": 0.00066, "grad_norm": 0.4179, "tokens_per_sec": 216649, "dt_s": 24.2, "eta_s": 482596, "world_size": 2, "timestamp": "2026-05-01T23:16:42.338169"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 630, "epoch": 0, "train_loss": 4.765258684754372, "train_ppl": 117.36147298517322, "lr": 0.00066, "grad_norm": 0.419, "tokens_per_sec": 216756, "dt_s": 24.188, "eta_s": 482466, "world_size": 2, "timestamp": "2026-05-01T23:17:06.526148"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 640, "epoch": 0, "train_loss": 4.713030740618706, "train_ppl": 111.38924074179309, "lr": 0.00066, "grad_norm": 0.8006, "tokens_per_sec": 216720, "dt_s": 24.192, "eta_s": 482307, "world_size": 2, "timestamp": "2026-05-01T23:17:30.718073"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 650, "epoch": 0, "train_loss": 4.67011758685112, "train_ppl": 106.71028942167109, "lr": 0.00066, "grad_norm": 0.554, "tokens_per_sec": 216739, "dt_s": 24.19, "eta_s": 482246, "world_size": 2, "timestamp": "2026-05-01T23:17:54.907931"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 660, "epoch": 0, "train_loss": 4.616837337613106, "train_ppl": 101.17354783058941, "lr": 0.00066, "grad_norm": 0.5135, "tokens_per_sec": 216819, "dt_s": 24.181, "eta_s": 482184, "world_size": 2, "timestamp": "2026-05-01T23:18:19.088855"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 670, "epoch": 0, "train_loss": 4.666563987731934, "train_ppl": 106.33175680610326, "lr": 0.00066, "grad_norm": 0.4335, "tokens_per_sec": 216701, "dt_s": 24.194, "eta_s": 482137, "world_size": 2, "timestamp": "2026-05-01T23:18:43.282930"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 680, "epoch": 0, "train_loss": 4.653371065855026, "train_ppl": 104.9381433755967, "lr": 0.00066, "grad_norm": 0.5608, "tokens_per_sec": 216850, "dt_s": 24.177, "eta_s": 482071, "world_size": 2, "timestamp": "2026-05-01T23:19:07.460407"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 690, "epoch": 0, "train_loss": 4.6472422778606415, "train_ppl": 104.29696656767591, "lr": 0.00066, "grad_norm": 0.4134, "tokens_per_sec": 216803, "dt_s": 24.183, "eta_s": 482009, "world_size": 2, "timestamp": "2026-05-01T23:19:31.642975"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 700, "epoch": 0, "train_loss": 4.663647904992104, "train_ppl": 106.02213626434985, "lr": 0.00066, "grad_norm": 0.4698, "tokens_per_sec": 216632, "dt_s": 24.202, "eta_s": 482033, "world_size": 2, "timestamp": "2026-05-01T23:19:55.844773"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 710, "epoch": 0, "train_loss": 4.625503018498421, "train_ppl": 102.05409527137347, "lr": 0.00066, "grad_norm": 0.5234, "tokens_per_sec": 216655, "dt_s": 24.199, "eta_s": 482081, "world_size": 2, "timestamp": "2026-05-01T23:20:20.043983"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 720, "epoch": 0, "train_loss": 4.523751184344292, "train_ppl": 92.18073717119445, "lr": 0.00066, "grad_norm": 0.4558, "tokens_per_sec": 216946, "dt_s": 24.167, "eta_s": 481948, "world_size": 2, "timestamp": "2026-05-01T23:20:44.210690"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 730, "epoch": 0, "train_loss": 4.620301708579063, "train_ppl": 101.5246583696302, "lr": 0.00066, "grad_norm": 0.4345, "tokens_per_sec": 216792, "dt_s": 24.184, "eta_s": 481950, "world_size": 2, "timestamp": "2026-05-01T23:21:08.394946"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 740, "epoch": 0, "train_loss": 4.503212496638298, "train_ppl": 90.3067760244673, "lr": 0.00066, "grad_norm": 0.5062, "tokens_per_sec": 216824, "dt_s": 24.18, "eta_s": 481916, "world_size": 2, "timestamp": "2026-05-01T23:21:32.574957"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 750, "epoch": 0, "train_loss": 4.545242458581924, "train_ppl": 94.18325998095025, "lr": 0.00066, "grad_norm": 0.4143, "tokens_per_sec": 216809, "dt_s": 24.182, "eta_s": 481812, "world_size": 2, "timestamp": "2026-05-01T23:21:56.756935"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 760, "epoch": 0, "train_loss": 4.48856021463871, "train_ppl": 88.99322245476031, "lr": 0.00066, "grad_norm": 0.5738, "tokens_per_sec": 216837, "dt_s": 24.179, "eta_s": 481707, "world_size": 2, "timestamp": "2026-05-01T23:22:20.935837"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 770, "epoch": 0, "train_loss": 4.519600808620453, "train_ppl": 91.7989453153086, "lr": 0.00066, "grad_norm": 0.4533, "tokens_per_sec": 216765, "dt_s": 24.187, "eta_s": 481764, "world_size": 2, "timestamp": "2026-05-01T23:22:45.122856"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 780, "epoch": 0, "train_loss": 4.538744002580643, "train_ppl": 93.57319858537805, "lr": 0.00066, "grad_norm": 0.6213, "tokens_per_sec": 216742, "dt_s": 24.19, "eta_s": 481762, "world_size": 2, "timestamp": "2026-05-01T23:23:09.312363"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 790, "epoch": 0, "train_loss": 4.434737667441368, "train_ppl": 84.3299994817768, "lr": 0.00066, "grad_norm": 0.5772, "tokens_per_sec": 216749, "dt_s": 24.189, "eta_s": 481772, "world_size": 2, "timestamp": "2026-05-01T23:23:33.501097"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 800, "epoch": 0, "train_loss": 4.470322519540787, "train_ppl": 87.38490180749353, "lr": 0.00066, "grad_norm": 0.4702, "tokens_per_sec": 216806, "dt_s": 24.182, "eta_s": 481750, "world_size": 2, "timestamp": "2026-05-01T23:23:57.683433"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 810, "epoch": 0, "train_loss": 4.369812771677971, "train_ppl": 79.0288338783631, "lr": 0.00066, "grad_norm": 0.5209, "tokens_per_sec": 216697, "dt_s": 24.195, "eta_s": 481788, "world_size": 2, "timestamp": "2026-05-01T23:24:21.877913"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 820, "epoch": 0, "train_loss": 4.4195940643548965, "train_ppl": 83.06256045967349, "lr": 0.00066, "grad_norm": 0.5341, "tokens_per_sec": 216824, "dt_s": 24.18, "eta_s": 481737, "world_size": 2, "timestamp": "2026-05-01T23:24:46.058303"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 830, "epoch": 0, "train_loss": 4.408064007759094, "train_ppl": 82.11034453743561, "lr": 0.00066, "grad_norm": 0.4836, "tokens_per_sec": 216781, "dt_s": 24.185, "eta_s": 481696, "world_size": 2, "timestamp": "2026-05-01T23:25:10.243516"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 840, "epoch": 0, "train_loss": 4.439011737704277, "train_ppl": 84.6912031799631, "lr": 0.00066, "grad_norm": 0.5845, "tokens_per_sec": 216702, "dt_s": 24.194, "eta_s": 481692, "world_size": 2, "timestamp": "2026-05-01T23:25:34.437431"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 850, "epoch": 0, "train_loss": 4.408279716968536, "train_ppl": 82.12805840539635, "lr": 0.00066, "grad_norm": 0.6451, "tokens_per_sec": 216901, "dt_s": 24.172, "eta_s": 481626, "world_size": 2, "timestamp": "2026-05-01T23:25:58.609235"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 860, "epoch": 0, "train_loss": 4.376768663525581, "train_ppl": 79.58046622292834, "lr": 0.00066, "grad_norm": 0.5594, "tokens_per_sec": 216752, "dt_s": 24.188, "eta_s": 481577, "world_size": 2, "timestamp": "2026-05-01T23:26:22.797635"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 870, "epoch": 0, "train_loss": 4.320889040827751, "train_ppl": 75.25550377547005, "lr": 0.00066, "grad_norm": 0.4469, "tokens_per_sec": 216774, "dt_s": 24.186, "eta_s": 481575, "world_size": 2, "timestamp": "2026-05-01T23:26:46.983569"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 880, "epoch": 0, "train_loss": 4.345994517207146, "train_ppl": 77.16874497424865, "lr": 0.00066, "grad_norm": 0.5091, "tokens_per_sec": 216887, "dt_s": 24.173, "eta_s": 481504, "world_size": 2, "timestamp": "2026-05-01T23:27:11.156890"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 890, "epoch": 0, "train_loss": 4.347221061587334, "train_ppl": 77.26345393522146, "lr": 0.00066, "grad_norm": 0.5202, "tokens_per_sec": 216653, "dt_s": 24.199, "eta_s": 481502, "world_size": 2, "timestamp": "2026-05-01T23:27:35.746444"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 900, "epoch": 0, "train_loss": 4.312873691320419, "train_ppl": 74.65471559050825, "lr": 0.00066, "grad_norm": 0.5521, "tokens_per_sec": 213496, "dt_s": 24.557, "eta_s": 481459, "world_size": 2, "timestamp": "2026-05-01T23:27:59.913585"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 910, "epoch": 0, "train_loss": 4.360861703753471, "train_ppl": 78.32459795216616, "lr": 0.00066, "grad_norm": 0.42, "tokens_per_sec": 216962, "dt_s": 24.165, "eta_s": 481342, "world_size": 2, "timestamp": "2026-05-01T23:28:24.078551"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 920, "epoch": 0, "train_loss": 4.383555501699448, "train_ppl": 80.12240290708712, "lr": 0.00066, "grad_norm": 0.5427, "tokens_per_sec": 216914, "dt_s": 24.17, "eta_s": 481256, "world_size": 2, "timestamp": "2026-05-01T23:28:48.248930"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 930, "epoch": 0, "train_loss": 4.280279219150543, "train_ppl": 72.26061373793397, "lr": 0.00066, "grad_norm": 0.4604, "tokens_per_sec": 216934, "dt_s": 24.168, "eta_s": 481210, "world_size": 2, "timestamp": "2026-05-01T23:29:12.417009"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 940, "epoch": 0, "train_loss": 4.254790976643562, "train_ppl": 70.44209160706873, "lr": 0.00066, "grad_norm": 0.4387, "tokens_per_sec": 216834, "dt_s": 24.179, "eta_s": 481106, "world_size": 2, "timestamp": "2026-05-01T23:29:36.596253"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 950, "epoch": 0, "train_loss": 4.337277486920357, "train_ppl": 76.498986082441, "lr": 0.00066, "grad_norm": 0.4928, "tokens_per_sec": 216844, "dt_s": 24.178, "eta_s": 481125, "world_size": 2, "timestamp": "2026-05-01T23:30:00.774363"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 960, "epoch": 0, "train_loss": 4.2899879813194275, "train_ppl": 72.96559154422533, "lr": 0.00066, "grad_norm": 0.4714, "tokens_per_sec": 216911, "dt_s": 24.171, "eta_s": 481123, "world_size": 2, "timestamp": "2026-05-01T23:30:24.945004"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 970, "epoch": 0, "train_loss": 4.288772277534008, "train_ppl": 72.87694089576243, "lr": 0.00066, "grad_norm": 0.4992, "tokens_per_sec": 216880, "dt_s": 24.174, "eta_s": 481114, "world_size": 2, "timestamp": "2026-05-01T23:30:49.119123"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 980, "epoch": 0, "train_loss": 4.27728271484375, "train_ppl": 72.04440858926247, "lr": 0.00066, "grad_norm": 0.4109, "tokens_per_sec": 216980, "dt_s": 24.163, "eta_s": 481070, "world_size": 2, "timestamp": "2026-05-01T23:31:13.282181"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 990, "epoch": 0, "train_loss": 4.233216822147369, "train_ppl": 68.93863921081365, "lr": 0.00066, "grad_norm": 0.5187, "tokens_per_sec": 217003, "dt_s": 24.16, "eta_s": 480971, "world_size": 2, "timestamp": "2026-05-01T23:31:37.442540"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1000, "epoch": 0, "train_loss": 4.267744272947311, "train_ppl": 71.3604841560902, "lr": 0.00066, "grad_norm": 0.4928, "tokens_per_sec": 216931, "dt_s": 24.168, "eta_s": 480908, "world_size": 2, "timestamp": "2026-05-01T23:32:01.610976"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1010, "epoch": 0, "train_loss": 4.253322139382362, "train_ppl": 70.33869958977205, "lr": 0.00066, "grad_norm": 0.5036, "tokens_per_sec": 199164, "dt_s": 26.324, "eta_s": 480917, "world_size": 2, "timestamp": "2026-05-01T23:32:27.935381"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1020, "epoch": 0, "train_loss": 4.204853542149067, "train_ppl": 67.01078269235978, "lr": 0.00066, "grad_norm": 0.4104, "tokens_per_sec": 216949, "dt_s": 24.166, "eta_s": 480862, "world_size": 2, "timestamp": "2026-05-01T23:32:52.101838"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1030, "epoch": 0, "train_loss": 4.212213516235352, "train_ppl": 67.50579973823697, "lr": 0.00066, "grad_norm": 0.7294, "tokens_per_sec": 216844, "dt_s": 24.178, "eta_s": 480898, "world_size": 2, "timestamp": "2026-05-01T23:33:16.279931"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1040, "epoch": 0, "train_loss": 4.165649831295013, "train_ppl": 64.43454042252341, "lr": 0.00066, "grad_norm": 0.4593, "tokens_per_sec": 216912, "dt_s": 24.171, "eta_s": 480915, "world_size": 2, "timestamp": "2026-05-01T23:33:40.450522"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1050, "epoch": 0, "train_loss": 4.1890241876244545, "train_ppl": 65.95839652747473, "lr": 0.00066, "grad_norm": 0.488, "tokens_per_sec": 216961, "dt_s": 24.165, "eta_s": 480877, "world_size": 2, "timestamp": "2026-05-01T23:34:04.615553"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1060, "epoch": 0, "train_loss": 4.206989154219627, "train_ppl": 67.15404465026901, "lr": 0.00066, "grad_norm": 0.4994, "tokens_per_sec": 216919, "dt_s": 24.17, "eta_s": 480816, "world_size": 2, "timestamp": "2026-05-01T23:34:28.785333"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1070, "epoch": 0, "train_loss": 4.2545151039958, "train_ppl": 70.42266124101967, "lr": 0.00066, "grad_norm": 0.5118, "tokens_per_sec": 216856, "dt_s": 24.177, "eta_s": 480833, "world_size": 2, "timestamp": "2026-05-01T23:34:52.962099"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1080, "epoch": 0, "train_loss": 4.216456174850464, "train_ppl": 67.7928122185303, "lr": 0.00066, "grad_norm": 0.5129, "tokens_per_sec": 216860, "dt_s": 24.176, "eta_s": 480802, "world_size": 2, "timestamp": "2026-05-01T23:35:17.138463"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1090, "epoch": 0, "train_loss": 4.1820008382201195, "train_ppl": 65.49677064003033, "lr": 0.00066, "grad_norm": 0.5812, "tokens_per_sec": 216757, "dt_s": 24.188, "eta_s": 480846, "world_size": 2, "timestamp": "2026-05-01T23:35:41.326275"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1100, "epoch": 0, "train_loss": 4.174636386334896, "train_ppl": 65.01619458563684, "lr": 0.00066, "grad_norm": 0.528, "tokens_per_sec": 216871, "dt_s": 24.175, "eta_s": 480862, "world_size": 2, "timestamp": "2026-05-01T23:36:05.501379"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1110, "epoch": 0, "train_loss": 4.2105367705225945, "train_ppl": 67.39270452039752, "lr": 0.00066, "grad_norm": 0.4485, "tokens_per_sec": 216911, "dt_s": 24.171, "eta_s": 480842, "world_size": 2, "timestamp": "2026-05-01T23:36:29.672027"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1120, "epoch": 0, "train_loss": 4.179254099726677, "train_ppl": 65.3171149854203, "lr": 0.00066, "grad_norm": 0.8633, "tokens_per_sec": 216858, "dt_s": 24.177, "eta_s": 480817, "world_size": 2, "timestamp": "2026-05-01T23:36:53.848592"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1130, "epoch": 0, "train_loss": 4.186238966882229, "train_ppl": 65.77494343062183, "lr": 0.00066, "grad_norm": 0.4486, "tokens_per_sec": 217019, "dt_s": 24.159, "eta_s": 480722, "world_size": 2, "timestamp": "2026-05-01T23:37:18.007211"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1140, "epoch": 0, "train_loss": 4.137194745242596, "train_ppl": 62.626890416609385, "lr": 0.00066, "grad_norm": 0.5124, "tokens_per_sec": 216815, "dt_s": 24.181, "eta_s": 480672, "world_size": 2, "timestamp": "2026-05-01T23:37:42.188543"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1150, "epoch": 0, "train_loss": 4.211117766797543, "train_ppl": 67.4318708073142, "lr": 0.00066, "grad_norm": 0.439, "tokens_per_sec": 216884, "dt_s": 24.174, "eta_s": 480642, "world_size": 2, "timestamp": "2026-05-01T23:38:06.362203"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1160, "epoch": 0, "train_loss": 4.1624238938093185, "train_ppl": 64.22701353755163, "lr": 0.00066, "grad_norm": 0.4071, "tokens_per_sec": 216956, "dt_s": 24.166, "eta_s": 480598, "world_size": 2, "timestamp": "2026-05-01T23:38:30.527846"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1170, "epoch": 0, "train_loss": 4.164682507514954, "train_ppl": 64.37224149578996, "lr": 0.00066, "grad_norm": 0.575, "tokens_per_sec": 216810, "dt_s": 24.182, "eta_s": 480595, "world_size": 2, "timestamp": "2026-05-01T23:38:54.709713"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1180, "epoch": 0, "train_loss": 4.1749816462397575, "train_ppl": 65.03864594634824, "lr": 0.00066, "grad_norm": 0.3936, "tokens_per_sec": 216852, "dt_s": 24.177, "eta_s": 480645, "world_size": 2, "timestamp": "2026-05-01T23:39:18.886986"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1190, "epoch": 0, "train_loss": 4.080268956720829, "train_ppl": 59.16137956092093, "lr": 0.00066, "grad_norm": 0.542, "tokens_per_sec": 216982, "dt_s": 24.163, "eta_s": 480547, "world_size": 2, "timestamp": "2026-05-01T23:39:43.049675"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1200, "epoch": 0, "train_loss": 4.1528176590800285, "train_ppl": 63.61298772940824, "lr": 0.00066, "grad_norm": 0.4851, "tokens_per_sec": 216905, "dt_s": 24.171, "eta_s": 480513, "world_size": 2, "timestamp": "2026-05-01T23:40:07.220964"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1210, "epoch": 0, "train_loss": 4.147434026002884, "train_ppl": 63.271438956108554, "lr": 0.00066, "grad_norm": 0.4437, "tokens_per_sec": 216889, "dt_s": 24.173, "eta_s": 480519, "world_size": 2, "timestamp": "2026-05-01T23:40:31.394029"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1220, "epoch": 0, "train_loss": 4.1342137679457664, "train_ppl": 62.440479060158275, "lr": 0.00066, "grad_norm": 0.5077, "tokens_per_sec": 216859, "dt_s": 24.176, "eta_s": 480473, "world_size": 2, "timestamp": "2026-05-01T23:40:55.570477"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1230, "epoch": 0, "train_loss": 4.115422308444977, "train_ppl": 61.27808705287302, "lr": 0.00066, "grad_norm": 0.4435, "tokens_per_sec": 216902, "dt_s": 24.172, "eta_s": 480426, "world_size": 2, "timestamp": "2026-05-01T23:41:19.742112"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1240, "epoch": 0, "train_loss": 4.089035093784332, "train_ppl": 59.68227611260798, "lr": 0.00066, "grad_norm": 0.5809, "tokens_per_sec": 216876, "dt_s": 24.175, "eta_s": 480449, "world_size": 2, "timestamp": "2026-05-01T23:41:43.916681"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1250, "epoch": 0, "train_loss": 4.073485217988491, "train_ppl": 58.76140242301771, "lr": 0.00066, "grad_norm": 0.4159, "tokens_per_sec": 216782, "dt_s": 24.185, "eta_s": 480479, "world_size": 2, "timestamp": "2026-05-01T23:42:08.101705"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1260, "epoch": 0, "train_loss": 4.091670364141464, "train_ppl": 59.83976246410996, "lr": 0.00066, "grad_norm": 0.4491, "tokens_per_sec": 216813, "dt_s": 24.182, "eta_s": 480489, "world_size": 2, "timestamp": "2026-05-01T23:42:32.283240"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1270, "epoch": 0, "train_loss": 4.037875227630138, "train_ppl": 56.705727947332555, "lr": 0.00066, "grad_norm": 0.4595, "tokens_per_sec": 216894, "dt_s": 24.173, "eta_s": 480449, "world_size": 2, "timestamp": "2026-05-01T23:42:56.455830"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1280, "epoch": 0, "train_loss": 4.082754954695702, "train_ppl": 59.30863759644894, "lr": 0.00066, "grad_norm": 0.441, "tokens_per_sec": 216716, "dt_s": 24.192, "eta_s": 480508, "world_size": 2, "timestamp": "2026-05-01T23:43:20.648200"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1290, "epoch": 0, "train_loss": 4.105241373181343, "train_ppl": 60.65738384647883, "lr": 0.00066, "grad_norm": 0.4184, "tokens_per_sec": 216870, "dt_s": 24.175, "eta_s": 480486, "world_size": 2, "timestamp": "2026-05-01T23:43:44.823452"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1300, "epoch": 0, "train_loss": 4.130386285483837, "train_ppl": 62.201946003388265, "lr": 0.00066, "grad_norm": 0.5519, "tokens_per_sec": 216893, "dt_s": 24.173, "eta_s": 480413, "world_size": 2, "timestamp": "2026-05-01T23:44:08.996553"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1310, "epoch": 0, "train_loss": 4.101701967418194, "train_ppl": 60.44307224410575, "lr": 0.00066, "grad_norm": 0.4773, "tokens_per_sec": 216876, "dt_s": 24.175, "eta_s": 480360, "world_size": 2, "timestamp": "2026-05-01T23:44:33.170722"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1320, "epoch": 0, "train_loss": 4.056430846452713, "train_ppl": 57.7677606852235, "lr": 0.00066, "grad_norm": 0.4126, "tokens_per_sec": 216825, "dt_s": 24.18, "eta_s": 480366, "world_size": 2, "timestamp": "2026-05-01T23:44:57.350932"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1330, "epoch": 0, "train_loss": 4.090197987854481, "train_ppl": 59.751720648085595, "lr": 0.00066, "grad_norm": 0.4045, "tokens_per_sec": 216770, "dt_s": 24.186, "eta_s": 480318, "world_size": 2, "timestamp": "2026-05-01T23:45:21.537261"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1340, "epoch": 0, "train_loss": 4.073062211275101, "train_ppl": 58.736551211789696, "lr": 0.00066, "grad_norm": 0.4701, "tokens_per_sec": 216824, "dt_s": 24.18, "eta_s": 480314, "world_size": 2, "timestamp": "2026-05-01T23:45:45.717607"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1350, "epoch": 0, "train_loss": 4.0197117775678635, "train_ppl": 55.68505383198514, "lr": 0.00066, "grad_norm": 0.4167, "tokens_per_sec": 216884, "dt_s": 24.174, "eta_s": 480293, "world_size": 2, "timestamp": "2026-05-01T23:46:09.891245"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1360, "epoch": 0, "train_loss": 4.0113003477454185, "train_ppl": 55.218627316522785, "lr": 0.00066, "grad_norm": 0.4354, "tokens_per_sec": 216916, "dt_s": 24.17, "eta_s": 480253, "world_size": 2, "timestamp": "2026-05-01T23:46:34.061318"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1370, "epoch": 0, "train_loss": 4.056912779808044, "train_ppl": 57.795607605601425, "lr": 0.00066, "grad_norm": 0.4077, "tokens_per_sec": 216836, "dt_s": 24.179, "eta_s": 480224, "world_size": 2, "timestamp": "2026-05-01T23:46:58.240317"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1380, "epoch": 0, "train_loss": 4.115881018340588, "train_ppl": 61.30620236568138, "lr": 0.00066, "grad_norm": 0.4925, "tokens_per_sec": 216999, "dt_s": 24.161, "eta_s": 480099, "world_size": 2, "timestamp": "2026-05-01T23:47:22.401240"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1390, "epoch": 0, "train_loss": 4.06686557084322, "train_ppl": 58.373707291425156, "lr": 0.00066, "grad_norm": 0.3619, "tokens_per_sec": 216768, "dt_s": 24.187, "eta_s": 480099, "world_size": 2, "timestamp": "2026-05-01T23:47:46.587837"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1400, "epoch": 0, "train_loss": 4.031082317233086, "train_ppl": 56.321836365124916, "lr": 0.00066, "grad_norm": 0.3838, "tokens_per_sec": 216920, "dt_s": 24.17, "eta_s": 480059, "world_size": 2, "timestamp": "2026-05-01T23:48:10.757461"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1410, "epoch": 0, "train_loss": 4.039397664368153, "train_ppl": 56.792124580832244, "lr": 0.00066, "grad_norm": 0.4567, "tokens_per_sec": 216877, "dt_s": 24.174, "eta_s": 480052, "world_size": 2, "timestamp": "2026-05-01T23:48:34.931883"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1420, "epoch": 0, "train_loss": 4.024139933288097, "train_ppl": 55.9321826803345, "lr": 0.00066, "grad_norm": 0.3738, "tokens_per_sec": 216798, "dt_s": 24.183, "eta_s": 480045, "world_size": 2, "timestamp": "2026-05-01T23:48:59.115151"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1430, "epoch": 0, "train_loss": 3.9526471197605133, "train_ppl": 52.073028093182565, "lr": 0.00066, "grad_norm": 0.5036, "tokens_per_sec": 216911, "dt_s": 24.171, "eta_s": 480059, "world_size": 2, "timestamp": "2026-05-01T23:49:23.285777"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1440, "epoch": 0, "train_loss": 4.052224300801754, "train_ppl": 57.525268347866565, "lr": 0.00066, "grad_norm": 0.4718, "tokens_per_sec": 216893, "dt_s": 24.173, "eta_s": 479980, "world_size": 2, "timestamp": "2026-05-01T23:49:47.458473"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1450, "epoch": 0, "train_loss": 4.027884058654308, "train_ppl": 56.141992315984076, "lr": 0.00066, "grad_norm": 0.3888, "tokens_per_sec": 216742, "dt_s": 24.19, "eta_s": 480035, "world_size": 2, "timestamp": "2026-05-01T23:50:11.647998"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1460, "epoch": 0, "train_loss": 4.003548629581928, "train_ppl": 54.79224282149513, "lr": 0.00066, "grad_norm": 0.3863, "tokens_per_sec": 216965, "dt_s": 24.165, "eta_s": 479972, "world_size": 2, "timestamp": "2026-05-01T23:50:35.812670"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1470, "epoch": 0, "train_loss": 4.06822719424963, "train_ppl": 58.45324443512176, "lr": 0.00066, "grad_norm": 0.6008, "tokens_per_sec": 216801, "dt_s": 24.183, "eta_s": 479946, "world_size": 2, "timestamp": "2026-05-01T23:50:59.995565"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1480, "epoch": 0, "train_loss": 4.0035864263772964, "train_ppl": 54.794313831823345, "lr": 0.00066, "grad_norm": 0.3577, "tokens_per_sec": 216932, "dt_s": 24.168, "eta_s": 479913, "world_size": 2, "timestamp": "2026-05-01T23:51:24.163933"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1490, "epoch": 0, "train_loss": 3.993551269173622, "train_ppl": 54.24719408641924, "lr": 0.00066, "grad_norm": 0.3745, "tokens_per_sec": 216857, "dt_s": 24.177, "eta_s": 479905, "world_size": 2, "timestamp": "2026-05-01T23:51:48.340627"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1500, "epoch": 0, "train_loss": 4.006705395877361, "train_ppl": 54.96548242148682, "lr": 0.00066, "grad_norm": 0.3953, "tokens_per_sec": 216991, "dt_s": 24.162, "eta_s": 479771, "world_size": 2, "timestamp": "2026-05-01T23:52:12.502441"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1510, "epoch": 0, "train_loss": 4.006954044103622, "train_ppl": 54.97915119048367, "lr": 0.00066, "grad_norm": 0.4963, "tokens_per_sec": 198734, "dt_s": 26.381, "eta_s": 479792, "world_size": 2, "timestamp": "2026-05-01T23:52:38.883916"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1520, "epoch": 0, "train_loss": 3.9796829000115395, "train_ppl": 53.50006666690399, "lr": 0.00066, "grad_norm": 0.3873, "tokens_per_sec": 216951, "dt_s": 24.166, "eta_s": 479702, "world_size": 2, "timestamp": "2026-05-01T23:53:03.050063"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1530, "epoch": 0, "train_loss": 4.037487745285034, "train_ppl": 56.6837597353087, "lr": 0.00066, "grad_norm": 0.403, "tokens_per_sec": 217064, "dt_s": 24.154, "eta_s": 479619, "world_size": 2, "timestamp": "2026-05-01T23:53:27.203708"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1540, "epoch": 0, "train_loss": 4.030413657426834, "train_ppl": 56.2841888050411, "lr": 0.00066, "grad_norm": 0.4157, "tokens_per_sec": 216933, "dt_s": 24.168, "eta_s": 479561, "world_size": 2, "timestamp": "2026-05-01T23:53:51.371865"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1550, "epoch": 0, "train_loss": 4.017305485904217, "train_ppl": 55.55122043682833, "lr": 0.00066, "grad_norm": 0.5525, "tokens_per_sec": 217020, "dt_s": 24.158, "eta_s": 479524, "world_size": 2, "timestamp": "2026-05-01T23:54:15.530337"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1560, "epoch": 0, "train_loss": 3.9920295774936676, "train_ppl": 54.16470935659704, "lr": 0.00066, "grad_norm": 0.4376, "tokens_per_sec": 217032, "dt_s": 24.157, "eta_s": 479424, "world_size": 2, "timestamp": "2026-05-01T23:54:39.687561"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1570, "epoch": 0, "train_loss": 3.9936564564704895, "train_ppl": 54.2529005022438, "lr": 0.00066, "grad_norm": 0.5739, "tokens_per_sec": 217135, "dt_s": 24.146, "eta_s": 479319, "world_size": 2, "timestamp": "2026-05-01T23:55:03.833235"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1580, "epoch": 0, "train_loss": 4.0018918588757515, "train_ppl": 54.70153979646764, "lr": 0.00066, "grad_norm": 0.421, "tokens_per_sec": 217022, "dt_s": 24.158, "eta_s": 479313, "world_size": 2, "timestamp": "2026-05-01T23:55:27.991467"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1590, "epoch": 0, "train_loss": 4.018896035850048, "train_ppl": 55.63964773285814, "lr": 0.00066, "grad_norm": 0.408, "tokens_per_sec": 217027, "dt_s": 24.158, "eta_s": 479247, "world_size": 2, "timestamp": "2026-05-01T23:55:52.149171"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1600, "epoch": 0, "train_loss": 3.954232983291149, "train_ppl": 52.15567432486668, "lr": 0.00066, "grad_norm": 0.3934, "tokens_per_sec": 216939, "dt_s": 24.168, "eta_s": 479259, "world_size": 2, "timestamp": "2026-05-01T23:56:16.316716"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1610, "epoch": 0, "train_loss": 3.98891818523407, "train_ppl": 53.99644360528468, "lr": 0.00066, "grad_norm": 0.521, "tokens_per_sec": 216595, "dt_s": 24.206, "eta_s": 479428, "world_size": 2, "timestamp": "2026-05-01T23:56:40.522635"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1620, "epoch": 0, "train_loss": 4.02095115929842, "train_ppl": 55.75411165603251, "lr": 0.00066, "grad_norm": 0.3441, "tokens_per_sec": 216643, "dt_s": 24.201, "eta_s": 479622, "world_size": 2, "timestamp": "2026-05-01T23:57:04.723174"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1630, "epoch": 0, "train_loss": 4.012794718146324, "train_ppl": 55.30120608502942, "lr": 0.00066, "grad_norm": 0.4834, "tokens_per_sec": 216617, "dt_s": 24.203, "eta_s": 479777, "world_size": 2, "timestamp": "2026-05-01T23:57:28.926592"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1640, "epoch": 0, "train_loss": 3.9680802896618843, "train_ppl": 52.882913457240264, "lr": 0.00066, "grad_norm": 0.411, "tokens_per_sec": 216510, "dt_s": 24.215, "eta_s": 479982, "world_size": 2, "timestamp": "2026-05-01T23:57:53.141986"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1650, "epoch": 0, "train_loss": 3.9354239851236343, "train_ppl": 51.18384654632274, "lr": 0.00066, "grad_norm": 0.397, "tokens_per_sec": 216606, "dt_s": 24.205, "eta_s": 480105, "world_size": 2, "timestamp": "2026-05-01T23:58:17.360078"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1660, "epoch": 0, "train_loss": 4.019203029572964, "train_ppl": 55.65673137760905, "lr": 0.00066, "grad_norm": 0.449, "tokens_per_sec": 216547, "dt_s": 24.211, "eta_s": 480049, "world_size": 2, "timestamp": "2026-05-01T23:58:41.557904"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1670, "epoch": 0, "train_loss": 3.9411566853523254, "train_ppl": 51.47811085376765, "lr": 0.00066, "grad_norm": 0.3944, "tokens_per_sec": 216574, "dt_s": 24.208, "eta_s": 480055, "world_size": 2, "timestamp": "2026-05-01T23:59:05.766207"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1680, "epoch": 0, "train_loss": 3.987380787730217, "train_ppl": 53.913493387736054, "lr": 0.00066, "grad_norm": 0.3906, "tokens_per_sec": 216621, "dt_s": 24.203, "eta_s": 480029, "world_size": 2, "timestamp": "2026-05-01T23:59:29.969215"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1690, "epoch": 0, "train_loss": 3.978803865611553, "train_ppl": 53.45305893163732, "lr": 0.00066, "grad_norm": 0.4349, "tokens_per_sec": 216590, "dt_s": 24.206, "eta_s": 479970, "world_size": 2, "timestamp": "2026-05-01T23:59:54.175630"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1700, "epoch": 0, "train_loss": 3.952532798051834, "train_ppl": 52.06707535590488, "lr": 0.00066, "grad_norm": 0.3662, "tokens_per_sec": 216660, "dt_s": 24.199, "eta_s": 479922, "world_size": 2, "timestamp": "2026-05-02T00:00:18.374297"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1710, "epoch": 0, "train_loss": 3.9494395405054092, "train_ppl": 51.90626732069501, "lr": 0.00066, "grad_norm": 0.4009, "tokens_per_sec": 216629, "dt_s": 24.202, "eta_s": 479914, "world_size": 2, "timestamp": "2026-05-02T00:00:42.576356"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1720, "epoch": 0, "train_loss": 3.9179621413350105, "train_ppl": 50.297840377097515, "lr": 0.00066, "grad_norm": 0.3932, "tokens_per_sec": 216564, "dt_s": 24.209, "eta_s": 479894, "world_size": 2, "timestamp": "2026-05-02T00:01:06.785726"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1730, "epoch": 0, "train_loss": 3.9257531613111496, "train_ppl": 50.69124236765956, "lr": 0.00066, "grad_norm": 0.366, "tokens_per_sec": 216518, "dt_s": 24.215, "eta_s": 479916, "world_size": 2, "timestamp": "2026-05-02T00:01:31.000239"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1740, "epoch": 0, "train_loss": 3.915371425449848, "train_ppl": 50.16770161211493, "lr": 0.00066, "grad_norm": 0.4635, "tokens_per_sec": 216599, "dt_s": 24.205, "eta_s": 479887, "world_size": 2, "timestamp": "2026-05-02T00:01:55.205658"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1750, "epoch": 0, "train_loss": 3.921862542629242, "train_ppl": 50.49440523050855, "lr": 0.00066, "grad_norm": 0.4344, "tokens_per_sec": 216738, "dt_s": 24.19, "eta_s": 479828, "world_size": 2, "timestamp": "2026-05-02T00:02:19.395578"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1760, "epoch": 0, "train_loss": 3.9266701117157936, "train_ppl": 50.73774503992429, "lr": 0.00066, "grad_norm": 0.3637, "tokens_per_sec": 217107, "dt_s": 24.149, "eta_s": 479593, "world_size": 2, "timestamp": "2026-05-02T00:02:43.544455"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1770, "epoch": 0, "train_loss": 3.919258065521717, "train_ppl": 50.36306481881883, "lr": 0.00066, "grad_norm": 0.3989, "tokens_per_sec": 216910, "dt_s": 24.171, "eta_s": 479416, "world_size": 2, "timestamp": "2026-05-02T00:03:07.721073"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1780, "epoch": 0, "train_loss": 3.9075947999954224, "train_ppl": 49.77907923064224, "lr": 0.00066, "grad_norm": 0.4515, "tokens_per_sec": 217091, "dt_s": 24.151, "eta_s": 479115, "world_size": 2, "timestamp": "2026-05-02T00:03:31.865866"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1790, "epoch": 0, "train_loss": 3.9113713428378105, "train_ppl": 49.96742748478074, "lr": 0.00066, "grad_norm": 0.5611, "tokens_per_sec": 217134, "dt_s": 24.146, "eta_s": 478855, "world_size": 2, "timestamp": "2026-05-02T00:03:56.011749"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1800, "epoch": 0, "train_loss": 3.970505513250828, "train_ppl": 53.011321993175535, "lr": 0.00066, "grad_norm": 0.4289, "tokens_per_sec": 217175, "dt_s": 24.141, "eta_s": 478638, "world_size": 2, "timestamp": "2026-05-02T00:04:20.153063"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1810, "epoch": 0, "train_loss": 3.8969727605581284, "train_ppl": 49.253122201258215, "lr": 0.00066, "grad_norm": 0.3587, "tokens_per_sec": 216979, "dt_s": 24.163, "eta_s": 478671, "world_size": 2, "timestamp": "2026-05-02T00:04:44.316148"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1820, "epoch": 0, "train_loss": 4.008155845105648, "train_ppl": 55.045264909284185, "lr": 0.00066, "grad_norm": 0.3688, "tokens_per_sec": 217181, "dt_s": 24.141, "eta_s": 478527, "world_size": 2, "timestamp": "2026-05-02T00:05:08.456789"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1830, "epoch": 0, "train_loss": 3.8779801353812218, "train_ppl": 48.32650342799684, "lr": 0.00066, "grad_norm": 0.4787, "tokens_per_sec": 217111, "dt_s": 24.148, "eta_s": 478517, "world_size": 2, "timestamp": "2026-05-02T00:05:32.605184"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1840, "epoch": 0, "train_loss": 3.9097212329506874, "train_ppl": 49.885043728472716, "lr": 0.00066, "grad_norm": 0.4453, "tokens_per_sec": 217039, "dt_s": 24.156, "eta_s": 478534, "world_size": 2, "timestamp": "2026-05-02T00:05:56.761546"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1850, "epoch": 0, "train_loss": 3.9880776032805443, "train_ppl": 53.95107424024169, "lr": 0.00066, "grad_norm": 0.4613, "tokens_per_sec": 217154, "dt_s": 24.144, "eta_s": 478519, "world_size": 2, "timestamp": "2026-05-02T00:06:20.905112"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1860, "epoch": 0, "train_loss": 3.915185645222664, "train_ppl": 50.158382310809884, "lr": 0.00066, "grad_norm": 0.3655, "tokens_per_sec": 217012, "dt_s": 24.159, "eta_s": 478480, "world_size": 2, "timestamp": "2026-05-02T00:06:45.064547"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1870, "epoch": 0, "train_loss": 3.9326749593019485, "train_ppl": 51.04333405523624, "lr": 0.00066, "grad_norm": 0.4301, "tokens_per_sec": 216996, "dt_s": 24.161, "eta_s": 478537, "world_size": 2, "timestamp": "2026-05-02T00:07:09.225702"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1880, "epoch": 0, "train_loss": 3.9051739647984505, "train_ppl": 49.65871802966451, "lr": 0.00066, "grad_norm": 0.3818, "tokens_per_sec": 217010, "dt_s": 24.16, "eta_s": 478558, "world_size": 2, "timestamp": "2026-05-02T00:07:33.385302"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1890, "epoch": 0, "train_loss": 3.928481660783291, "train_ppl": 50.82974225821677, "lr": 0.00066, "grad_norm": 0.4297, "tokens_per_sec": 217025, "dt_s": 24.158, "eta_s": 478540, "world_size": 2, "timestamp": "2026-05-02T00:07:57.543289"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1900, "epoch": 0, "train_loss": 3.924814023077488, "train_ppl": 50.64365863119072, "lr": 0.00066, "grad_norm": 0.4409, "tokens_per_sec": 217097, "dt_s": 24.15, "eta_s": 478541, "world_size": 2, "timestamp": "2026-05-02T00:08:21.693189"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1910, "epoch": 0, "train_loss": 3.848757281899452, "train_ppl": 46.93470037310257, "lr": 0.00066, "grad_norm": 0.3747, "tokens_per_sec": 217048, "dt_s": 24.155, "eta_s": 478501, "world_size": 2, "timestamp": "2026-05-02T00:08:45.848607"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1920, "epoch": 0, "train_loss": 3.8430111184716225, "train_ppl": 46.66577928652364, "lr": 0.00066, "grad_norm": 0.3796, "tokens_per_sec": 216953, "dt_s": 24.166, "eta_s": 478496, "world_size": 2, "timestamp": "2026-05-02T00:09:10.014594"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1930, "epoch": 0, "train_loss": 3.891435332596302, "train_ppl": 48.98114032019418, "lr": 0.00066, "grad_norm": 0.3897, "tokens_per_sec": 217067, "dt_s": 24.153, "eta_s": 478447, "world_size": 2, "timestamp": "2026-05-02T00:09:34.167927"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1940, "epoch": 0, "train_loss": 3.957089513540268, "train_ppl": 52.304871578021014, "lr": 0.00066, "grad_norm": 0.369, "tokens_per_sec": 217117, "dt_s": 24.148, "eta_s": 478382, "world_size": 2, "timestamp": "2026-05-02T00:09:58.315636"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1950, "epoch": 0, "train_loss": 3.908643700182438, "train_ppl": 49.83131990899581, "lr": 0.00066, "grad_norm": 0.4534, "tokens_per_sec": 217003, "dt_s": 24.16, "eta_s": 478400, "world_size": 2, "timestamp": "2026-05-02T00:10:22.476069"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1510, "epoch": 0, "train_loss": 4.08025835454464, "train_ppl": 59.160752324876256, "lr": 0.00066, "grad_norm": 0.8219, "tokens_per_sec": 386914, "dt_s": 13.55, "eta_s": 13653, "world_size": 8, "timestamp": "2026-05-02T12:58:48.204512"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1520, "epoch": 0, "train_loss": 4.044106662273407, "train_ppl": 57.06018923912211, "lr": 0.00066, "grad_norm": 0.7959, "tokens_per_sec": 402141, "dt_s": 13.037, "eta_s": 13381, "world_size": 8, "timestamp": "2026-05-02T12:59:01.241984"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1530, "epoch": 0, "train_loss": 4.318581238389015, "train_ppl": 75.08202918985769, "lr": 0.00066, "grad_norm": 1.132, "tokens_per_sec": 401532, "dt_s": 13.057, "eta_s": 13288, "world_size": 8, "timestamp": "2026-05-02T12:59:14.299132"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1540, "epoch": 0, "train_loss": 4.298528328537941, "train_ppl": 73.5914115874405, "lr": 0.00066, "grad_norm": 0.7478, "tokens_per_sec": 402614, "dt_s": 13.022, "eta_s": 13226, "world_size": 8, "timestamp": "2026-05-02T12:59:27.321285"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1550, "epoch": 0, "train_loss": 4.2405827939510345, "train_ppl": 69.4483141043977, "lr": 0.00066, "grad_norm": 0.7332, "tokens_per_sec": 401824, "dt_s": 13.048, "eta_s": 13189, "world_size": 8, "timestamp": "2026-05-02T12:59:40.369010"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1560, "epoch": 0, "train_loss": 4.10242073237896, "train_ppl": 60.48653222343999, "lr": 0.00066, "grad_norm": 0.6635, "tokens_per_sec": 401889, "dt_s": 13.046, "eta_s": 13075, "world_size": 8, "timestamp": "2026-05-02T12:59:53.414613"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1570, "epoch": 0, "train_loss": 4.188880577683449, "train_ppl": 65.94892492616492, "lr": 0.00066, "grad_norm": 0.7064, "tokens_per_sec": 402500, "dt_s": 13.026, "eta_s": 13059, "world_size": 8, "timestamp": "2026-05-02T13:00:06.440264"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1580, "epoch": 0, "train_loss": 4.176458060741425, "train_ppl": 65.1347408669037, "lr": 0.00066, "grad_norm": 0.6607, "tokens_per_sec": 399268, "dt_s": 13.131, "eta_s": 13061, "world_size": 8, "timestamp": "2026-05-02T13:00:19.571614"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1590, "epoch": 0, "train_loss": 4.336529836058617, "train_ppl": 76.44181292501766, "lr": 0.00066, "grad_norm": 0.6677, "tokens_per_sec": 402100, "dt_s": 13.039, "eta_s": 13051, "world_size": 8, "timestamp": "2026-05-02T13:00:32.610233"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1600, "epoch": 0, "train_loss": 4.1488978266716, "train_ppl": 63.36412355009878, "lr": 0.00066, "grad_norm": 0.6567, "tokens_per_sec": 402306, "dt_s": 13.032, "eta_s": 13035, "world_size": 8, "timestamp": "2026-05-02T13:00:45.642397"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1610, "epoch": 0, "train_loss": 4.204032331705093, "train_ppl": 66.95577532715673, "lr": 0.00066, "grad_norm": 0.704, "tokens_per_sec": 402135, "dt_s": 13.038, "eta_s": 13021, "world_size": 8, "timestamp": "2026-05-02T13:00:58.679896"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1620, "epoch": 0, "train_loss": 4.113686308264732, "train_ppl": 61.171800566077245, "lr": 0.00066, "grad_norm": 0.6451, "tokens_per_sec": 402339, "dt_s": 13.031, "eta_s": 13009, "world_size": 8, "timestamp": "2026-05-02T13:01:11.711007"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1630, "epoch": 0, "train_loss": 4.239570125937462, "train_ppl": 69.37802161559219, "lr": 0.00066, "grad_norm": 0.6688, "tokens_per_sec": 402420, "dt_s": 13.028, "eta_s": 12975, "world_size": 8, "timestamp": "2026-05-02T13:01:24.739270"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1640, "epoch": 0, "train_loss": 4.103269085288048, "train_ppl": 60.537867921327845, "lr": 0.00066, "grad_norm": 0.6454, "tokens_per_sec": 402461, "dt_s": 13.027, "eta_s": 12960, "world_size": 8, "timestamp": "2026-05-02T13:01:37.766372"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1650, "epoch": 0, "train_loss": 4.103435322642326, "train_ppl": 60.5479324128488, "lr": 0.00066, "grad_norm": 0.5701, "tokens_per_sec": 403131, "dt_s": 13.005, "eta_s": 12941, "world_size": 8, "timestamp": "2026-05-02T13:01:50.771864"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1660, "epoch": 0, "train_loss": 4.203580856323242, "train_ppl": 66.92555326569547, "lr": 0.00066, "grad_norm": 0.6279, "tokens_per_sec": 402236, "dt_s": 13.034, "eta_s": 12928, "world_size": 8, "timestamp": "2026-05-02T13:02:03.806076"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1670, "epoch": 0, "train_loss": 4.137563109397888, "train_ppl": 62.6499641677068, "lr": 0.00066, "grad_norm": 0.6245, "tokens_per_sec": 402943, "dt_s": 13.011, "eta_s": 12911, "world_size": 8, "timestamp": "2026-05-02T13:02:16.817573"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1680, "epoch": 0, "train_loss": 4.170594796538353, "train_ppl": 64.75395608417536, "lr": 0.00066, "grad_norm": 0.6559, "tokens_per_sec": 402350, "dt_s": 13.031, "eta_s": 12898, "world_size": 8, "timestamp": "2026-05-02T13:02:29.848329"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1690, "epoch": 0, "train_loss": 4.125308856368065, "train_ppl": 61.88692046839186, "lr": 0.00066, "grad_norm": 0.6778, "tokens_per_sec": 401418, "dt_s": 13.061, "eta_s": 12892, "world_size": 8, "timestamp": "2026-05-02T13:02:42.909124"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1700, "epoch": 0, "train_loss": 4.1953219920396805, "train_ppl": 66.37510039077513, "lr": 0.00066, "grad_norm": 0.6202, "tokens_per_sec": 402695, "dt_s": 13.019, "eta_s": 12882, "world_size": 8, "timestamp": "2026-05-02T13:02:55.928591"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1710, "epoch": 0, "train_loss": 4.171405792236328, "train_ppl": 64.80649256453677, "lr": 0.00066, "grad_norm": 0.5975, "tokens_per_sec": 401591, "dt_s": 13.055, "eta_s": 12873, "world_size": 8, "timestamp": "2026-05-02T13:03:08.983968"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1720, "epoch": 0, "train_loss": 4.216330036520958, "train_ppl": 67.78426148574346, "lr": 0.00066, "grad_norm": 0.5667, "tokens_per_sec": 402039, "dt_s": 13.041, "eta_s": 12866, "world_size": 8, "timestamp": "2026-05-02T13:03:22.024614"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1730, "epoch": 0, "train_loss": 4.164383515715599, "train_ppl": 64.35299760050346, "lr": 0.00066, "grad_norm": 0.6403, "tokens_per_sec": 402524, "dt_s": 13.025, "eta_s": 12851, "world_size": 8, "timestamp": "2026-05-02T13:03:35.049576"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1740, "epoch": 0, "train_loss": 4.093825280666351, "train_ppl": 59.968851194854984, "lr": 0.00066, "grad_norm": 0.5624, "tokens_per_sec": 401984, "dt_s": 13.043, "eta_s": 12835, "world_size": 8, "timestamp": "2026-05-02T13:03:48.092091"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1750, "epoch": 0, "train_loss": 4.235491126775742, "train_ppl": 69.09560510351429, "lr": 0.00066, "grad_norm": 0.5657, "tokens_per_sec": 402450, "dt_s": 13.027, "eta_s": 12823, "world_size": 8, "timestamp": "2026-05-02T13:04:01.119527"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1760, "epoch": 0, "train_loss": 4.203767150640488, "train_ppl": 66.93802227736433, "lr": 0.00066, "grad_norm": 0.8599, "tokens_per_sec": 402126, "dt_s": 13.038, "eta_s": 12807, "world_size": 8, "timestamp": "2026-05-02T13:04:14.157507"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1770, "epoch": 0, "train_loss": 4.162021055817604, "train_ppl": 64.20114566703467, "lr": 0.00066, "grad_norm": 0.549, "tokens_per_sec": 401956, "dt_s": 13.043, "eta_s": 12794, "world_size": 8, "timestamp": "2026-05-02T13:04:27.200877"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1780, "epoch": 0, "train_loss": 4.131020337343216, "train_ppl": 62.2413977688196, "lr": 0.00066, "grad_norm": 0.577, "tokens_per_sec": 402803, "dt_s": 13.016, "eta_s": 12780, "world_size": 8, "timestamp": "2026-05-02T13:04:40.216837"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1790, "epoch": 0, "train_loss": 4.123115390539169, "train_ppl": 61.751322392328845, "lr": 0.00066, "grad_norm": 0.5563, "tokens_per_sec": 402269, "dt_s": 13.033, "eta_s": 12765, "world_size": 8, "timestamp": "2026-05-02T13:04:53.250087"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1800, "epoch": 0, "train_loss": 4.1946277767419815, "train_ppl": 66.32903777123505, "lr": 0.00066, "grad_norm": 0.6351, "tokens_per_sec": 402456, "dt_s": 13.027, "eta_s": 12752, "world_size": 8, "timestamp": "2026-05-02T13:05:06.277328"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1810, "epoch": 0, "train_loss": 4.153491228818893, "train_ppl": 63.65584994666726, "lr": 0.00066, "grad_norm": 0.5245, "tokens_per_sec": 402622, "dt_s": 13.022, "eta_s": 12736, "world_size": 8, "timestamp": "2026-05-02T13:05:19.299251"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1820, "epoch": 0, "train_loss": 4.235090970993042, "train_ppl": 69.06796162878999, "lr": 0.00066, "grad_norm": 0.5395, "tokens_per_sec": 401877, "dt_s": 13.046, "eta_s": 12723, "world_size": 8, "timestamp": "2026-05-02T13:05:32.345181"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1830, "epoch": 0, "train_loss": 4.176015689969063, "train_ppl": 65.10593353350976, "lr": 0.00066, "grad_norm": 0.6015, "tokens_per_sec": 402785, "dt_s": 13.017, "eta_s": 12710, "world_size": 8, "timestamp": "2026-05-02T13:05:45.361688"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1840, "epoch": 0, "train_loss": 4.1768176555633545, "train_ppl": 65.15816719418774, "lr": 0.00066, "grad_norm": 0.5535, "tokens_per_sec": 402174, "dt_s": 13.036, "eta_s": 12698, "world_size": 8, "timestamp": "2026-05-02T13:05:58.398043"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1850, "epoch": 0, "train_loss": 4.2145766466856, "train_ppl": 67.66551338693486, "lr": 0.00066, "grad_norm": 0.5399, "tokens_per_sec": 402288, "dt_s": 13.033, "eta_s": 12686, "world_size": 8, "timestamp": "2026-05-02T13:06:11.430711"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1860, "epoch": 0, "train_loss": 4.176976829767227, "train_ppl": 65.16853951905806, "lr": 0.00066, "grad_norm": 0.5597, "tokens_per_sec": 402800, "dt_s": 13.016, "eta_s": 12672, "world_size": 8, "timestamp": "2026-05-02T13:06:24.446854"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1870, "epoch": 0, "train_loss": 4.195066541433334, "train_ppl": 66.35814699660362, "lr": 0.00066, "grad_norm": 0.6112, "tokens_per_sec": 401576, "dt_s": 13.056, "eta_s": 12660, "world_size": 8, "timestamp": "2026-05-02T13:06:37.502560"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1880, "epoch": 0, "train_loss": 4.139525353908539, "train_ppl": 62.77301940873859, "lr": 0.00066, "grad_norm": 0.5662, "tokens_per_sec": 402761, "dt_s": 13.017, "eta_s": 12648, "world_size": 8, "timestamp": "2026-05-02T13:06:50.519882"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1890, "epoch": 0, "train_loss": 4.1271697878837585, "train_ppl": 62.002195014852, "lr": 0.00066, "grad_norm": 0.5329, "tokens_per_sec": 402318, "dt_s": 13.032, "eta_s": 12634, "world_size": 8, "timestamp": "2026-05-02T13:07:03.551555"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1900, "epoch": 0, "train_loss": 4.151518374681473, "train_ppl": 63.530390037471854, "lr": 0.00066, "grad_norm": 0.577, "tokens_per_sec": 402397, "dt_s": 13.029, "eta_s": 12620, "world_size": 8, "timestamp": "2026-05-02T13:07:16.580735"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1910, "epoch": 0, "train_loss": 4.08414301276207, "train_ppl": 59.391018590844894, "lr": 0.00066, "grad_norm": 0.5829, "tokens_per_sec": 403031, "dt_s": 13.009, "eta_s": 12605, "world_size": 8, "timestamp": "2026-05-02T13:07:29.589331"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1920, "epoch": 0, "train_loss": 4.039629861712456, "train_ppl": 56.80531309244668, "lr": 0.00066, "grad_norm": 0.6316, "tokens_per_sec": 400934, "dt_s": 13.077, "eta_s": 12597, "world_size": 8, "timestamp": "2026-05-02T13:07:42.666083"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1930, "epoch": 0, "train_loss": 4.152038872241974, "train_ppl": 63.56346605775123, "lr": 0.00066, "grad_norm": 0.5588, "tokens_per_sec": 402442, "dt_s": 13.028, "eta_s": 12585, "world_size": 8, "timestamp": "2026-05-02T13:07:55.693721"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1940, "epoch": 0, "train_loss": 4.205876260995865, "train_ppl": 67.07935093980082, "lr": 0.00066, "grad_norm": 0.5298, "tokens_per_sec": 402406, "dt_s": 13.029, "eta_s": 12572, "world_size": 8, "timestamp": "2026-05-02T13:08:08.722496"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1950, "epoch": 0, "train_loss": 4.13373863697052, "train_ppl": 62.410818701283176, "lr": 0.00066, "grad_norm": 0.5398, "tokens_per_sec": 402819, "dt_s": 13.015, "eta_s": 12556, "world_size": 8, "timestamp": "2026-05-02T13:08:21.737989"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1960, "epoch": 0, "train_loss": 4.159528881311417, "train_ppl": 64.04134441754483, "lr": 0.00066, "grad_norm": 0.5706, "tokens_per_sec": 402304, "dt_s": 13.032, "eta_s": 12548, "world_size": 8, "timestamp": "2026-05-02T13:08:34.770132"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1970, "epoch": 0, "train_loss": 4.148401722311974, "train_ppl": 63.33269612844667, "lr": 0.00066, "grad_norm": 0.5589, "tokens_per_sec": 380076, "dt_s": 13.794, "eta_s": 12673, "world_size": 8, "timestamp": "2026-05-02T13:08:48.564512"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1980, "epoch": 0, "train_loss": 4.210332423448563, "train_ppl": 67.3789344254049, "lr": 0.00066, "grad_norm": 0.5603, "tokens_per_sec": 376752, "dt_s": 13.916, "eta_s": 12830, "world_size": 8, "timestamp": "2026-05-02T13:09:02.480476"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 1990, "epoch": 0, "train_loss": 4.273190826177597, "train_ppl": 71.7502132081587, "lr": 0.00066, "grad_norm": 0.5811, "tokens_per_sec": 376806, "dt_s": 13.914, "eta_s": 12987, "world_size": 8, "timestamp": "2026-05-02T13:09:16.394423"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2000, "epoch": 0, "train_loss": 4.104986235499382, "train_ppl": 60.64190983625591, "lr": 0.00066, "grad_norm": 0.5236, "tokens_per_sec": 377441, "dt_s": 13.891, "eta_s": 13141, "world_size": 8, "timestamp": "2026-05-02T13:09:30.285028"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2010, "epoch": 0, "train_loss": 4.183995470404625, "train_ppl": 65.6275429847368, "lr": 0.00066, "grad_norm": 0.5025, "tokens_per_sec": 344221, "dt_s": 15.231, "eta_s": 13297, "world_size": 8, "timestamp": "2026-05-02T13:09:45.516215"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2020, "epoch": 0, "train_loss": 4.141069516539574, "train_ppl": 62.870026037276645, "lr": 0.00066, "grad_norm": 0.5175, "tokens_per_sec": 377340, "dt_s": 13.894, "eta_s": 13302, "world_size": 8, "timestamp": "2026-05-02T13:09:59.410570"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2030, "epoch": 0, "train_loss": 4.092688545584679, "train_ppl": 59.900721228092124, "lr": 0.00066, "grad_norm": 0.5449, "tokens_per_sec": 376832, "dt_s": 13.913, "eta_s": 13287, "world_size": 8, "timestamp": "2026-05-02T13:10:13.323570"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2040, "epoch": 0, "train_loss": 4.1826736479997635, "train_ppl": 65.54085233548707, "lr": 0.00066, "grad_norm": 0.572, "tokens_per_sec": 376675, "dt_s": 13.919, "eta_s": 13274, "world_size": 8, "timestamp": "2026-05-02T13:10:27.242363"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2050, "epoch": 0, "train_loss": 4.0788718312978745, "train_ppl": 59.07878140690542, "lr": 0.00066, "grad_norm": 0.5004, "tokens_per_sec": 377151, "dt_s": 13.901, "eta_s": 13263, "world_size": 8, "timestamp": "2026-05-02T13:10:41.143694"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2060, "epoch": 0, "train_loss": 4.179611071944237, "train_ppl": 65.34043554294918, "lr": 0.00066, "grad_norm": 0.5055, "tokens_per_sec": 377029, "dt_s": 13.906, "eta_s": 13247, "world_size": 8, "timestamp": "2026-05-02T13:10:55.049536"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2070, "epoch": 0, "train_loss": 4.207634940743446, "train_ppl": 67.19742583330816, "lr": 0.00066, "grad_norm": 0.7288, "tokens_per_sec": 375903, "dt_s": 13.947, "eta_s": 13243, "world_size": 8, "timestamp": "2026-05-02T13:11:08.996912"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2080, "epoch": 0, "train_loss": 4.1705711632966995, "train_ppl": 64.75242575636658, "lr": 0.00066, "grad_norm": 0.5731, "tokens_per_sec": 377826, "dt_s": 13.876, "eta_s": 13222, "world_size": 8, "timestamp": "2026-05-02T13:11:22.873325"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2090, "epoch": 0, "train_loss": 4.149628937244415, "train_ppl": 63.41046666967147, "lr": 0.00066, "grad_norm": 0.6159, "tokens_per_sec": 376596, "dt_s": 13.922, "eta_s": 13208, "world_size": 8, "timestamp": "2026-05-02T13:11:36.795110"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2100, "epoch": 0, "train_loss": 4.110012501478195, "train_ppl": 60.94747949844558, "lr": 0.00066, "grad_norm": 0.4774, "tokens_per_sec": 377723, "dt_s": 13.88, "eta_s": 13191, "world_size": 8, "timestamp": "2026-05-02T13:11:50.675398"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2110, "epoch": 0, "train_loss": 4.056714683771133, "train_ppl": 57.78415965871778, "lr": 0.00066, "grad_norm": 0.4945, "tokens_per_sec": 376508, "dt_s": 13.925, "eta_s": 13180, "world_size": 8, "timestamp": "2026-05-02T13:12:04.600381"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2120, "epoch": 0, "train_loss": 4.077954038977623, "train_ppl": 59.02458422972123, "lr": 0.00066, "grad_norm": 0.5183, "tokens_per_sec": 376837, "dt_s": 13.913, "eta_s": 13160, "world_size": 8, "timestamp": "2026-05-02T13:12:18.513218"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2130, "epoch": 0, "train_loss": 4.171571984887123, "train_ppl": 64.81726382235195, "lr": 0.00066, "grad_norm": 0.4831, "tokens_per_sec": 377571, "dt_s": 13.886, "eta_s": 13148, "world_size": 8, "timestamp": "2026-05-02T13:12:32.399038"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2140, "epoch": 0, "train_loss": 4.108861044049263, "train_ppl": 60.8773414586353, "lr": 0.00066, "grad_norm": 0.5155, "tokens_per_sec": 376526, "dt_s": 13.924, "eta_s": 13134, "world_size": 8, "timestamp": "2026-05-02T13:12:46.323428"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2150, "epoch": 0, "train_loss": 4.094742938876152, "train_ppl": 60.02390736099414, "lr": 0.00066, "grad_norm": 0.6879, "tokens_per_sec": 376981, "dt_s": 13.908, "eta_s": 13126, "world_size": 8, "timestamp": "2026-05-02T13:13:00.231084"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2160, "epoch": 0, "train_loss": 4.103731215000153, "train_ppl": 60.56585073414838, "lr": 0.00066, "grad_norm": 0.529, "tokens_per_sec": 376834, "dt_s": 13.913, "eta_s": 13109, "world_size": 8, "timestamp": "2026-05-02T13:13:14.143960"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2170, "epoch": 0, "train_loss": 4.234869867563248, "train_ppl": 69.05269215371382, "lr": 0.00066, "grad_norm": 0.5715, "tokens_per_sec": 377075, "dt_s": 13.904, "eta_s": 13094, "world_size": 8, "timestamp": "2026-05-02T13:13:28.047990"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2180, "epoch": 0, "train_loss": 4.244007915258408, "train_ppl": 69.68659083520731, "lr": 0.00066, "grad_norm": 0.6076, "tokens_per_sec": 377480, "dt_s": 13.889, "eta_s": 13081, "world_size": 8, "timestamp": "2026-05-02T13:13:41.937177"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2190, "epoch": 0, "train_loss": 4.228204235434532, "train_ppl": 68.59394293562592, "lr": 0.00066, "grad_norm": 0.5142, "tokens_per_sec": 376538, "dt_s": 13.924, "eta_s": 13067, "world_size": 8, "timestamp": "2026-05-02T13:13:55.861149"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2200, "epoch": 0, "train_loss": 4.109453946352005, "train_ppl": 60.91344647688749, "lr": 0.00066, "grad_norm": 0.4888, "tokens_per_sec": 377357, "dt_s": 13.894, "eta_s": 13050, "world_size": 8, "timestamp": "2026-05-02T13:14:09.754759"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2210, "epoch": 0, "train_loss": 4.225642427802086, "train_ppl": 68.41844334315316, "lr": 0.00066, "grad_norm": 0.5405, "tokens_per_sec": 376653, "dt_s": 13.92, "eta_s": 13037, "world_size": 8, "timestamp": "2026-05-02T13:14:23.674368"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2220, "epoch": 0, "train_loss": 4.127228021621704, "train_ppl": 62.00580573956052, "lr": 0.00066, "grad_norm": 0.5492, "tokens_per_sec": 377406, "dt_s": 13.892, "eta_s": 13021, "world_size": 8, "timestamp": "2026-05-02T13:14:37.566267"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2230, "epoch": 0, "train_loss": 4.212728068232536, "train_ppl": 67.54054392039141, "lr": 0.00066, "grad_norm": 0.5771, "tokens_per_sec": 394986, "dt_s": 13.274, "eta_s": 12892, "world_size": 8, "timestamp": "2026-05-02T13:14:50.839945"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2240, "epoch": 0, "train_loss": 4.0451339185237885, "train_ppl": 57.11883479201416, "lr": 0.00066, "grad_norm": 0.5147, "tokens_per_sec": 399755, "dt_s": 13.115, "eta_s": 12727, "world_size": 8, "timestamp": "2026-05-02T13:15:03.955092"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2250, "epoch": 0, "train_loss": 4.120509281754494, "train_ppl": 61.590601247935524, "lr": 0.00066, "grad_norm": 0.5034, "tokens_per_sec": 400434, "dt_s": 13.093, "eta_s": 12564, "world_size": 8, "timestamp": "2026-05-02T13:15:17.048050"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2260, "epoch": 0, "train_loss": 4.088390201330185, "train_ppl": 59.64379987094905, "lr": 0.00066, "grad_norm": 0.5157, "tokens_per_sec": 403007, "dt_s": 13.009, "eta_s": 12381, "world_size": 8, "timestamp": "2026-05-02T13:15:30.057443"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2270, "epoch": 0, "train_loss": 4.105512037873268, "train_ppl": 60.67380388065212, "lr": 0.00066, "grad_norm": 0.5008, "tokens_per_sec": 403388, "dt_s": 12.997, "eta_s": 12201, "world_size": 8, "timestamp": "2026-05-02T13:15:43.054586"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2280, "epoch": 0, "train_loss": 4.097683936357498, "train_ppl": 60.20069736341105, "lr": 0.00066, "grad_norm": 0.4833, "tokens_per_sec": 403440, "dt_s": 12.995, "eta_s": 12136, "world_size": 8, "timestamp": "2026-05-02T13:15:56.050060"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2290, "epoch": 0, "train_loss": 4.141538918018341, "train_ppl": 62.89954424787254, "lr": 0.00066, "grad_norm": 0.5127, "tokens_per_sec": 402897, "dt_s": 13.013, "eta_s": 12104, "world_size": 8, "timestamp": "2026-05-02T13:16:09.062994"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2300, "epoch": 0, "train_loss": 4.156313702464104, "train_ppl": 63.83577069685026, "lr": 0.00066, "grad_norm": 0.5003, "tokens_per_sec": 396744, "dt_s": 13.215, "eta_s": 12114, "world_size": 8, "timestamp": "2026-05-02T13:16:22.277710"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2310, "epoch": 0, "train_loss": 4.1929592192173, "train_ppl": 66.21845623763768, "lr": 0.00066, "grad_norm": 0.5223, "tokens_per_sec": 376951, "dt_s": 13.909, "eta_s": 12267, "world_size": 8, "timestamp": "2026-05-02T13:16:36.186384"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2320, "epoch": 0, "train_loss": 4.131801754236221, "train_ppl": 62.290053256114476, "lr": 0.00066, "grad_norm": 0.5103, "tokens_per_sec": 377048, "dt_s": 13.905, "eta_s": 12422, "world_size": 8, "timestamp": "2026-05-02T13:16:50.091540"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2330, "epoch": 0, "train_loss": 4.151615664362907, "train_ppl": 63.5365711895563, "lr": 0.00066, "grad_norm": 0.4965, "tokens_per_sec": 377659, "dt_s": 13.883, "eta_s": 12573, "world_size": 8, "timestamp": "2026-05-02T13:17:03.974043"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2340, "epoch": 0, "train_loss": 4.146425411105156, "train_ppl": 63.207654612503674, "lr": 0.00066, "grad_norm": 0.4981, "tokens_per_sec": 376931, "dt_s": 13.909, "eta_s": 12725, "world_size": 8, "timestamp": "2026-05-02T13:17:17.883409"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2350, "epoch": 0, "train_loss": 4.160045877099037, "train_ppl": 64.07446208295137, "lr": 0.00066, "grad_norm": 0.5012, "tokens_per_sec": 377890, "dt_s": 13.874, "eta_s": 12833, "world_size": 8, "timestamp": "2026-05-02T13:17:31.757507"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2360, "epoch": 0, "train_loss": 4.097192093729973, "train_ppl": 60.1710953745977, "lr": 0.00066, "grad_norm": 0.496, "tokens_per_sec": 377612, "dt_s": 13.884, "eta_s": 12815, "world_size": 8, "timestamp": "2026-05-02T13:17:45.641872"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2370, "epoch": 0, "train_loss": 4.261464506387711, "train_ppl": 70.91376110032347, "lr": 0.00066, "grad_norm": 0.512, "tokens_per_sec": 377317, "dt_s": 13.895, "eta_s": 12799, "world_size": 8, "timestamp": "2026-05-02T13:17:59.536984"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2380, "epoch": 0, "train_loss": 4.2061797976493835, "train_ppl": 67.0997150719792, "lr": 0.00066, "grad_norm": 0.629, "tokens_per_sec": 377965, "dt_s": 13.871, "eta_s": 12783, "world_size": 8, "timestamp": "2026-05-02T13:18:13.408278"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2390, "epoch": 0, "train_loss": 4.067231193184853, "train_ppl": 58.39505392514014, "lr": 0.00066, "grad_norm": 0.497, "tokens_per_sec": 377087, "dt_s": 13.904, "eta_s": 12768, "world_size": 8, "timestamp": "2026-05-02T13:18:27.311937"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2400, "epoch": 0, "train_loss": 4.083262234926224, "train_ppl": 59.33873132813305, "lr": 0.00066, "grad_norm": 0.5166, "tokens_per_sec": 377552, "dt_s": 13.887, "eta_s": 12757, "world_size": 8, "timestamp": "2026-05-02T13:18:41.198513"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2410, "epoch": 0, "train_loss": 4.0780985951423645, "train_ppl": 59.03311721397582, "lr": 0.00066, "grad_norm": 0.4666, "tokens_per_sec": 376880, "dt_s": 13.911, "eta_s": 12748, "world_size": 8, "timestamp": "2026-05-02T13:18:55.109712"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2420, "epoch": 0, "train_loss": 4.09953448176384, "train_ppl": 60.31220463018406, "lr": 0.00066, "grad_norm": 0.5125, "tokens_per_sec": 377177, "dt_s": 13.9, "eta_s": 12735, "world_size": 8, "timestamp": "2026-05-02T13:19:09.010033"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2430, "epoch": 0, "train_loss": 4.156326204538345, "train_ppl": 63.83656878138363, "lr": 0.00066, "grad_norm": 0.4961, "tokens_per_sec": 377659, "dt_s": 13.883, "eta_s": 12723, "world_size": 8, "timestamp": "2026-05-02T13:19:22.892571"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2440, "epoch": 0, "train_loss": 4.129678949713707, "train_ppl": 62.157963898900014, "lr": 0.00066, "grad_norm": 0.47, "tokens_per_sec": 376747, "dt_s": 13.916, "eta_s": 12711, "world_size": 8, "timestamp": "2026-05-02T13:19:36.808849"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2450, "epoch": 0, "train_loss": 4.096526697278023, "train_ppl": 60.13107105872112, "lr": 0.00066, "grad_norm": 0.4844, "tokens_per_sec": 377929, "dt_s": 13.873, "eta_s": 12695, "world_size": 8, "timestamp": "2026-05-02T13:19:50.681383"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2460, "epoch": 0, "train_loss": 4.10742025077343, "train_ppl": 60.78969295126348, "lr": 0.00066, "grad_norm": 0.4641, "tokens_per_sec": 376791, "dt_s": 13.915, "eta_s": 12682, "world_size": 8, "timestamp": "2026-05-02T13:20:04.595911"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2470, "epoch": 0, "train_loss": 4.047896459698677, "train_ppl": 57.27684608084416, "lr": 0.00066, "grad_norm": 0.5077, "tokens_per_sec": 377876, "dt_s": 13.875, "eta_s": 12663, "world_size": 8, "timestamp": "2026-05-02T13:20:18.470560"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2480, "epoch": 0, "train_loss": 4.073474436998367, "train_ppl": 58.760768920333426, "lr": 0.00066, "grad_norm": 0.5007, "tokens_per_sec": 377315, "dt_s": 13.895, "eta_s": 12652, "world_size": 8, "timestamp": "2026-05-02T13:20:32.365911"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2490, "epoch": 0, "train_loss": 4.149632394313812, "train_ppl": 63.410685884434166, "lr": 0.00066, "grad_norm": 0.5142, "tokens_per_sec": 376640, "dt_s": 13.92, "eta_s": 12638, "world_size": 8, "timestamp": "2026-05-02T13:20:46.285918"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2500, "epoch": 0, "train_loss": 4.151930049061775, "train_ppl": 63.556549255591406, "lr": 0.00066, "grad_norm": 0.5811, "tokens_per_sec": 377750, "dt_s": 13.879, "eta_s": 12626, "world_size": 8, "timestamp": "2026-05-02T13:21:00.165172"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2510, "epoch": 0, "train_loss": 4.1793200969696045, "train_ppl": 65.32142587717725, "lr": 0.00066, "grad_norm": 0.4704, "tokens_per_sec": 377032, "dt_s": 13.906, "eta_s": 12610, "world_size": 8, "timestamp": "2026-05-02T13:21:14.070940"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2520, "epoch": 0, "train_loss": 4.100276470184326, "train_ppl": 60.3569721940868, "lr": 0.00066, "grad_norm": 0.5155, "tokens_per_sec": 377997, "dt_s": 13.87, "eta_s": 12595, "world_size": 8, "timestamp": "2026-05-02T13:21:27.941047"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2530, "epoch": 0, "train_loss": 4.105367198586464, "train_ppl": 60.66501656655986, "lr": 0.00066, "grad_norm": 0.5078, "tokens_per_sec": 377432, "dt_s": 13.891, "eta_s": 12581, "world_size": 8, "timestamp": "2026-05-02T13:21:41.831909"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2540, "epoch": 0, "train_loss": 4.0546534061431885, "train_ppl": 57.66517313742066, "lr": 0.00066, "grad_norm": 0.4866, "tokens_per_sec": 377682, "dt_s": 13.882, "eta_s": 12560, "world_size": 8, "timestamp": "2026-05-02T13:21:55.713686"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2550, "epoch": 0, "train_loss": 4.124544009566307, "train_ppl": 61.83960455222116, "lr": 0.00066, "grad_norm": 0.5007, "tokens_per_sec": 377607, "dt_s": 13.884, "eta_s": 12547, "world_size": 8, "timestamp": "2026-05-02T13:22:09.598250"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2560, "epoch": 0, "train_loss": 4.093259930610657, "train_ppl": 59.934957383328914, "lr": 0.00066, "grad_norm": 0.5041, "tokens_per_sec": 377043, "dt_s": 13.905, "eta_s": 12533, "world_size": 8, "timestamp": "2026-05-02T13:22:23.503466"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2570, "epoch": 0, "train_loss": 4.157974645495415, "train_ppl": 63.941886376991896, "lr": 0.00066, "grad_norm": 0.4693, "tokens_per_sec": 377959, "dt_s": 13.872, "eta_s": 12519, "world_size": 8, "timestamp": "2026-05-02T13:22:37.374982"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2580, "epoch": 0, "train_loss": 4.11640228331089, "train_ppl": 61.33816747185816, "lr": 0.00066, "grad_norm": 0.4567, "tokens_per_sec": 377869, "dt_s": 13.875, "eta_s": 12503, "world_size": 8, "timestamp": "2026-05-02T13:22:51.249850"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2590, "epoch": 0, "train_loss": 4.079595074057579, "train_ppl": 59.12152516300299, "lr": 0.00066, "grad_norm": 0.4692, "tokens_per_sec": 377312, "dt_s": 13.895, "eta_s": 12491, "world_size": 8, "timestamp": "2026-05-02T13:23:05.145329"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2600, "epoch": 0, "train_loss": 4.094775035977364, "train_ppl": 60.025833985343176, "lr": 0.00066, "grad_norm": 0.4695, "tokens_per_sec": 378402, "dt_s": 13.855, "eta_s": 12472, "world_size": 8, "timestamp": "2026-05-02T13:23:19.000503"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2610, "epoch": 0, "train_loss": 4.078152850270271, "train_ppl": 59.036320150188075, "lr": 0.00066, "grad_norm": 0.4904, "tokens_per_sec": 376904, "dt_s": 13.91, "eta_s": 12459, "world_size": 8, "timestamp": "2026-05-02T13:23:32.910935"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2620, "epoch": 0, "train_loss": 4.211201578378677, "train_ppl": 67.43752261586563, "lr": 0.00066, "grad_norm": 0.4899, "tokens_per_sec": 378037, "dt_s": 13.869, "eta_s": 12445, "world_size": 8, "timestamp": "2026-05-02T13:23:46.779704"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2630, "epoch": 0, "train_loss": 4.069534540176392, "train_ppl": 58.529713020695574, "lr": 0.00066, "grad_norm": 0.5117, "tokens_per_sec": 377299, "dt_s": 13.896, "eta_s": 12435, "world_size": 8, "timestamp": "2026-05-02T13:24:00.675484"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2640, "epoch": 0, "train_loss": 4.173486590385437, "train_ppl": 64.94148218868973, "lr": 0.00066, "grad_norm": 0.441, "tokens_per_sec": 377217, "dt_s": 13.899, "eta_s": 12421, "world_size": 8, "timestamp": "2026-05-02T13:24:14.574273"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2650, "epoch": 0, "train_loss": 4.380282253026962, "train_ppl": 79.86057111219822, "lr": 0.00066, "grad_norm": 1.0931, "tokens_per_sec": 377871, "dt_s": 13.875, "eta_s": 12411, "world_size": 8, "timestamp": "2026-05-02T13:24:28.449112"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2660, "epoch": 0, "train_loss": 4.04567277431488, "train_ppl": 57.14962190108331, "lr": 0.00066, "grad_norm": 0.4686, "tokens_per_sec": 377097, "dt_s": 13.903, "eta_s": 12396, "world_size": 8, "timestamp": "2026-05-02T13:24:42.352490"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2670, "epoch": 0, "train_loss": 4.050413981080055, "train_ppl": 57.42122342578277, "lr": 0.00066, "grad_norm": 0.4761, "tokens_per_sec": 377493, "dt_s": 13.889, "eta_s": 12385, "world_size": 8, "timestamp": "2026-05-02T13:24:56.241050"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2680, "epoch": 0, "train_loss": 4.054984033107758, "train_ppl": 57.68424195073466, "lr": 0.00066, "grad_norm": 0.4445, "tokens_per_sec": 376701, "dt_s": 13.918, "eta_s": 12376, "world_size": 8, "timestamp": "2026-05-02T13:25:10.158988"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2690, "epoch": 0, "train_loss": 4.069229647517204, "train_ppl": 58.51187046102218, "lr": 0.00066, "grad_norm": 0.4599, "tokens_per_sec": 377621, "dt_s": 13.884, "eta_s": 12359, "world_size": 8, "timestamp": "2026-05-02T13:25:24.042984"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2700, "epoch": 0, "train_loss": 4.49939788877964, "train_ppl": 89.96294728977857, "lr": 0.00066, "grad_norm": 1.5748, "tokens_per_sec": 377577, "dt_s": 13.886, "eta_s": 12347, "world_size": 8, "timestamp": "2026-05-02T13:25:37.928548"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2710, "epoch": 0, "train_loss": 4.084277540445328, "train_ppl": 59.399008864426705, "lr": 0.00066, "grad_norm": 0.6078, "tokens_per_sec": 376985, "dt_s": 13.907, "eta_s": 12334, "world_size": 8, "timestamp": "2026-05-02T13:25:51.835903"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2720, "epoch": 0, "train_loss": 4.055330738425255, "train_ppl": 57.704244851510445, "lr": 0.00066, "grad_norm": 0.4728, "tokens_per_sec": 377199, "dt_s": 13.9, "eta_s": 12322, "world_size": 8, "timestamp": "2026-05-02T13:26:05.735452"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2730, "epoch": 0, "train_loss": 4.0352505296468735, "train_ppl": 56.55708769077281, "lr": 0.00066, "grad_norm": 0.4579, "tokens_per_sec": 376677, "dt_s": 13.919, "eta_s": 12308, "world_size": 8, "timestamp": "2026-05-02T13:26:19.654323"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2740, "epoch": 0, "train_loss": 4.020890533924103, "train_ppl": 55.7507316446019, "lr": 0.00066, "grad_norm": 0.4587, "tokens_per_sec": 378007, "dt_s": 13.87, "eta_s": 12292, "world_size": 8, "timestamp": "2026-05-02T13:26:33.523993"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2750, "epoch": 0, "train_loss": 4.142129302024841, "train_ppl": 62.936690096891276, "lr": 0.00066, "grad_norm": 0.4762, "tokens_per_sec": 377751, "dt_s": 13.879, "eta_s": 12277, "world_size": 8, "timestamp": "2026-05-02T13:26:47.403206"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2760, "epoch": 0, "train_loss": 4.077779024839401, "train_ppl": 59.014254996885626, "lr": 0.00066, "grad_norm": 0.4705, "tokens_per_sec": 377248, "dt_s": 13.898, "eta_s": 12261, "world_size": 8, "timestamp": "2026-05-02T13:27:01.300963"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2770, "epoch": 0, "train_loss": 4.0503289103508, "train_ppl": 57.416338768204824, "lr": 0.00066, "grad_norm": 0.4536, "tokens_per_sec": 377107, "dt_s": 13.903, "eta_s": 12248, "world_size": 8, "timestamp": "2026-05-02T13:27:15.203819"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2780, "epoch": 0, "train_loss": 4.155963703989983, "train_ppl": 63.81343218396277, "lr": 0.00066, "grad_norm": 0.4606, "tokens_per_sec": 377018, "dt_s": 13.906, "eta_s": 12232, "world_size": 8, "timestamp": "2026-05-02T13:27:29.109945"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2790, "epoch": 0, "train_loss": 4.0590600073337555, "train_ppl": 57.91984125634212, "lr": 0.00066, "grad_norm": 0.4602, "tokens_per_sec": 378065, "dt_s": 13.868, "eta_s": 12217, "world_size": 8, "timestamp": "2026-05-02T13:27:42.977726"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2800, "epoch": 0, "train_loss": 4.124105915427208, "train_ppl": 61.81251891736477, "lr": 0.00066, "grad_norm": 0.4414, "tokens_per_sec": 377770, "dt_s": 13.878, "eta_s": 12203, "world_size": 8, "timestamp": "2026-05-02T13:27:56.856152"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2810, "epoch": 0, "train_loss": 4.096522703766823, "train_ppl": 60.13083092509487, "lr": 0.00066, "grad_norm": 0.5507, "tokens_per_sec": 376410, "dt_s": 13.929, "eta_s": 12195, "world_size": 8, "timestamp": "2026-05-02T13:28:10.784743"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2820, "epoch": 0, "train_loss": 4.031416207551956, "train_ppl": 56.34064482083414, "lr": 0.00066, "grad_norm": 0.6287, "tokens_per_sec": 378430, "dt_s": 13.854, "eta_s": 12173, "world_size": 8, "timestamp": "2026-05-02T13:28:24.639044"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2830, "epoch": 0, "train_loss": 4.094673648476601, "train_ppl": 60.01974842455931, "lr": 0.00066, "grad_norm": 0.4538, "tokens_per_sec": 377157, "dt_s": 13.901, "eta_s": 12158, "world_size": 8, "timestamp": "2026-05-02T13:28:38.540211"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2840, "epoch": 0, "train_loss": 3.9968659728765488, "train_ppl": 54.42730580511015, "lr": 0.00066, "grad_norm": 0.4725, "tokens_per_sec": 378008, "dt_s": 13.87, "eta_s": 12144, "world_size": 8, "timestamp": "2026-05-02T13:28:52.409822"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2850, "epoch": 0, "train_loss": 4.036990985274315, "train_ppl": 56.65560850299712, "lr": 0.00066, "grad_norm": 0.5102, "tokens_per_sec": 377689, "dt_s": 13.881, "eta_s": 12131, "world_size": 8, "timestamp": "2026-05-02T13:29:06.291309"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2860, "epoch": 0, "train_loss": 4.130854904651642, "train_ppl": 62.23110185852293, "lr": 0.00066, "grad_norm": 0.5417, "tokens_per_sec": 377026, "dt_s": 13.906, "eta_s": 12113, "world_size": 8, "timestamp": "2026-05-02T13:29:20.197326"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2870, "epoch": 0, "train_loss": 4.029271736741066, "train_ppl": 56.219953408407186, "lr": 0.00066, "grad_norm": 0.7296, "tokens_per_sec": 378298, "dt_s": 13.859, "eta_s": 12100, "world_size": 8, "timestamp": "2026-05-02T13:29:34.056287"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2880, "epoch": 0, "train_loss": 4.083948865532875, "train_ppl": 59.37948910839118, "lr": 0.00066, "grad_norm": 0.5753, "tokens_per_sec": 376928, "dt_s": 13.909, "eta_s": 12088, "world_size": 8, "timestamp": "2026-05-02T13:29:47.965817"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2890, "epoch": 0, "train_loss": 3.949729770421982, "train_ppl": 51.921334258661396, "lr": 0.00066, "grad_norm": 0.4291, "tokens_per_sec": 377598, "dt_s": 13.885, "eta_s": 12076, "world_size": 8, "timestamp": "2026-05-02T13:30:01.850686"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2900, "epoch": 0, "train_loss": 4.0317835211753845, "train_ppl": 56.36134330841509, "lr": 0.00066, "grad_norm": 0.4528, "tokens_per_sec": 377189, "dt_s": 13.9, "eta_s": 12066, "world_size": 8, "timestamp": "2026-05-02T13:30:15.750524"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2910, "epoch": 0, "train_loss": 4.03309141099453, "train_ppl": 56.43510596171844, "lr": 0.00066, "grad_norm": 0.427, "tokens_per_sec": 376269, "dt_s": 13.934, "eta_s": 12057, "world_size": 8, "timestamp": "2026-05-02T13:30:29.684341"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2920, "epoch": 0, "train_loss": 4.040656715631485, "train_ppl": 56.86367380965483, "lr": 0.00066, "grad_norm": 0.4976, "tokens_per_sec": 376690, "dt_s": 13.918, "eta_s": 12053, "world_size": 8, "timestamp": "2026-05-02T13:30:43.602746"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2930, "epoch": 0, "train_loss": 4.0443983525037766, "train_ppl": 57.07683556653372, "lr": 0.00066, "grad_norm": 0.5504, "tokens_per_sec": 375421, "dt_s": 13.965, "eta_s": 12049, "world_size": 8, "timestamp": "2026-05-02T13:30:57.567997"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2940, "epoch": 0, "train_loss": 4.026310816407204, "train_ppl": 56.05373680368955, "lr": 0.00066, "grad_norm": 0.4527, "tokens_per_sec": 377628, "dt_s": 13.884, "eta_s": 12034, "world_size": 8, "timestamp": "2026-05-02T13:31:11.451714"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2950, "epoch": 0, "train_loss": 4.081385359168053, "train_ppl": 59.227464351591124, "lr": 0.00066, "grad_norm": 0.4577, "tokens_per_sec": 376518, "dt_s": 13.925, "eta_s": 12025, "world_size": 8, "timestamp": "2026-05-02T13:31:25.376463"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2960, "epoch": 0, "train_loss": 4.078494742512703, "train_ppl": 59.056507660849135, "lr": 0.00066, "grad_norm": 0.5914, "tokens_per_sec": 376225, "dt_s": 13.935, "eta_s": 12011, "world_size": 8, "timestamp": "2026-05-02T13:31:39.311872"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2970, "epoch": 0, "train_loss": 4.077367916703224, "train_ppl": 58.98999874281931, "lr": 0.00066, "grad_norm": 0.4787, "tokens_per_sec": 376900, "dt_s": 13.911, "eta_s": 11996, "world_size": 8, "timestamp": "2026-05-02T13:31:53.222347"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2980, "epoch": 0, "train_loss": 4.017661288380623, "train_ppl": 55.57098921530876, "lr": 0.00066, "grad_norm": 0.4838, "tokens_per_sec": 376067, "dt_s": 13.941, "eta_s": 11978, "world_size": 8, "timestamp": "2026-05-02T13:32:07.163728"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 2990, "epoch": 0, "train_loss": 4.041722282767296, "train_ppl": 56.92429816561229, "lr": 0.00066, "grad_norm": 0.5115, "tokens_per_sec": 376666, "dt_s": 13.919, "eta_s": 11970, "world_size": 8, "timestamp": "2026-05-02T13:32:21.082990"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3000, "epoch": 0, "train_loss": 4.030602693557739, "train_ppl": 56.29482955603545, "lr": 0.00066, "grad_norm": 0.4499, "tokens_per_sec": 375915, "dt_s": 13.947, "eta_s": 11960, "world_size": 8, "timestamp": "2026-05-02T13:32:35.029848"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3010, "epoch": 0, "train_loss": 4.097916483879089, "train_ppl": 60.21469851428434, "lr": 0.00066, "grad_norm": 0.4654, "tokens_per_sec": 344168, "dt_s": 15.234, "eta_s": 11943, "world_size": 8, "timestamp": "2026-05-02T13:32:50.263377"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3020, "epoch": 0, "train_loss": 4.09340038895607, "train_ppl": 59.943376339517876, "lr": 0.00066, "grad_norm": 0.4504, "tokens_per_sec": 377402, "dt_s": 13.892, "eta_s": 11926, "world_size": 8, "timestamp": "2026-05-02T13:33:04.155519"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3030, "epoch": 0, "train_loss": 4.098719775676727, "train_ppl": 60.26308792050261, "lr": 0.00066, "grad_norm": 0.4746, "tokens_per_sec": 376469, "dt_s": 13.926, "eta_s": 11910, "world_size": 8, "timestamp": "2026-05-02T13:33:18.081846"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3040, "epoch": 0, "train_loss": 4.19888561964035, "train_ppl": 66.61205849497551, "lr": 0.00066, "grad_norm": 0.5199, "tokens_per_sec": 377251, "dt_s": 13.898, "eta_s": 11892, "world_size": 8, "timestamp": "2026-05-02T13:33:31.979449"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3050, "epoch": 0, "train_loss": 4.182038381695747, "train_ppl": 65.4992296626024, "lr": 0.00066, "grad_norm": 0.4689, "tokens_per_sec": 376549, "dt_s": 13.924, "eta_s": 11874, "world_size": 8, "timestamp": "2026-05-02T13:33:45.903041"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3060, "epoch": 0, "train_loss": 3.9954008013010025, "train_ppl": 54.34761885549321, "lr": 0.00066, "grad_norm": 0.4931, "tokens_per_sec": 377484, "dt_s": 13.889, "eta_s": 11855, "world_size": 8, "timestamp": "2026-05-02T13:33:59.791914"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3070, "epoch": 0, "train_loss": 3.9313859939575195, "train_ppl": 50.97758335086793, "lr": 0.00066, "grad_norm": 0.5042, "tokens_per_sec": 377469, "dt_s": 13.89, "eta_s": 11841, "world_size": 8, "timestamp": "2026-05-02T13:34:13.681494"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3080, "epoch": 0, "train_loss": 3.9382409900426865, "train_ppl": 51.32823496978627, "lr": 0.00066, "grad_norm": 0.4727, "tokens_per_sec": 402618, "dt_s": 13.022, "eta_s": 11673, "world_size": 8, "timestamp": "2026-05-02T13:34:26.703570"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3090, "epoch": 0, "train_loss": 3.9767421185970306, "train_ppl": 53.34296577808192, "lr": 0.00066, "grad_norm": 0.4795, "tokens_per_sec": 402368, "dt_s": 13.03, "eta_s": 11512, "world_size": 8, "timestamp": "2026-05-02T13:34:39.733495"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3100, "epoch": 0, "train_loss": 4.042598322033882, "train_ppl": 56.97418793553633, "lr": 0.00066, "grad_norm": 0.4712, "tokens_per_sec": 402702, "dt_s": 13.019, "eta_s": 11345, "world_size": 8, "timestamp": "2026-05-02T13:34:52.752766"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3110, "epoch": 0, "train_loss": 4.04414339363575, "train_ppl": 57.062285176101476, "lr": 0.00066, "grad_norm": 0.5036, "tokens_per_sec": 401912, "dt_s": 13.045, "eta_s": 11188, "world_size": 8, "timestamp": "2026-05-02T13:35:05.797650"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3120, "epoch": 0, "train_loss": 4.0250211507081985, "train_ppl": 55.98149281734159, "lr": 0.00066, "grad_norm": 0.4454, "tokens_per_sec": 402450, "dt_s": 13.027, "eta_s": 11029, "world_size": 8, "timestamp": "2026-05-02T13:35:18.825010"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3130, "epoch": 0, "train_loss": 4.101960062980652, "train_ppl": 60.45867434616333, "lr": 0.00066, "grad_norm": 0.4611, "tokens_per_sec": 402445, "dt_s": 13.028, "eta_s": 11017, "world_size": 8, "timestamp": "2026-05-02T13:35:31.852525"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3140, "epoch": 0, "train_loss": 4.019771367311478, "train_ppl": 55.68837218893516, "lr": 0.00066, "grad_norm": 0.4352, "tokens_per_sec": 401750, "dt_s": 13.05, "eta_s": 11008, "world_size": 8, "timestamp": "2026-05-02T13:35:44.902758"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3150, "epoch": 0, "train_loss": 4.08763886988163, "train_ppl": 59.59900443861047, "lr": 0.00066, "grad_norm": 0.4403, "tokens_per_sec": 401212, "dt_s": 13.068, "eta_s": 11003, "world_size": 8, "timestamp": "2026-05-02T13:35:57.970286"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3160, "epoch": 0, "train_loss": 4.005921840667725, "train_ppl": 54.92243080024898, "lr": 0.00066, "grad_norm": 0.4564, "tokens_per_sec": 401315, "dt_s": 13.064, "eta_s": 10993, "world_size": 8, "timestamp": "2026-05-02T13:36:11.034507"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3170, "epoch": 0, "train_loss": 4.035490557551384, "train_ppl": 56.570664599369714, "lr": 0.00066, "grad_norm": 0.4668, "tokens_per_sec": 402446, "dt_s": 13.028, "eta_s": 10980, "world_size": 8, "timestamp": "2026-05-02T13:36:24.062051"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3180, "epoch": 0, "train_loss": 4.141602054238319, "train_ppl": 62.903515612701874, "lr": 0.00066, "grad_norm": 0.4697, "tokens_per_sec": 402573, "dt_s": 13.023, "eta_s": 10966, "world_size": 8, "timestamp": "2026-05-02T13:36:37.085587"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3190, "epoch": 0, "train_loss": 4.019229471683502, "train_ppl": 55.65820307850969, "lr": 0.00066, "grad_norm": 0.4543, "tokens_per_sec": 402806, "dt_s": 13.016, "eta_s": 10947, "world_size": 8, "timestamp": "2026-05-02T13:36:50.101380"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3200, "epoch": 0, "train_loss": 4.177674159407616, "train_ppl": 65.21399932167729, "lr": 0.00066, "grad_norm": 0.6159, "tokens_per_sec": 402823, "dt_s": 13.015, "eta_s": 10926, "world_size": 8, "timestamp": "2026-05-02T13:37:03.116720"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3210, "epoch": 0, "train_loss": 4.0893707275390625, "train_ppl": 59.702310861011334, "lr": 0.00066, "grad_norm": 0.4521, "tokens_per_sec": 402102, "dt_s": 13.039, "eta_s": 10908, "world_size": 8, "timestamp": "2026-05-02T13:37:16.155429"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3220, "epoch": 0, "train_loss": 4.054269030690193, "train_ppl": 57.64301231968393, "lr": 0.00066, "grad_norm": 0.4795, "tokens_per_sec": 402712, "dt_s": 13.019, "eta_s": 10894, "world_size": 8, "timestamp": "2026-05-02T13:37:29.174325"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3230, "epoch": 0, "train_loss": 3.996289685368538, "train_ppl": 54.39594906479666, "lr": 0.00066, "grad_norm": 0.4263, "tokens_per_sec": 402822, "dt_s": 13.015, "eta_s": 10879, "world_size": 8, "timestamp": "2026-05-02T13:37:42.189692"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3240, "epoch": 0, "train_loss": 4.041922718286514, "train_ppl": 56.93570896039735, "lr": 0.00066, "grad_norm": 0.5208, "tokens_per_sec": 402641, "dt_s": 13.021, "eta_s": 10867, "world_size": 8, "timestamp": "2026-05-02T13:37:55.210975"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3250, "epoch": 0, "train_loss": 4.026430770754814, "train_ppl": 56.060461096414976, "lr": 0.00066, "grad_norm": 0.4956, "tokens_per_sec": 402782, "dt_s": 13.017, "eta_s": 10854, "world_size": 8, "timestamp": "2026-05-02T13:38:08.227581"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3260, "epoch": 0, "train_loss": 4.046999841928482, "train_ppl": 57.22551365904708, "lr": 0.00066, "grad_norm": 0.501, "tokens_per_sec": 402078, "dt_s": 13.039, "eta_s": 10842, "world_size": 8, "timestamp": "2026-05-02T13:38:21.267000"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3270, "epoch": 0, "train_loss": 4.096561133861542, "train_ppl": 60.13314180302619, "lr": 0.00066, "grad_norm": 0.4691, "tokens_per_sec": 402519, "dt_s": 13.025, "eta_s": 10830, "world_size": 8, "timestamp": "2026-05-02T13:38:34.292278"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3280, "epoch": 0, "train_loss": 4.089725911617279, "train_ppl": 59.72351993760209, "lr": 0.00066, "grad_norm": 0.427, "tokens_per_sec": 402540, "dt_s": 13.024, "eta_s": 10818, "world_size": 8, "timestamp": "2026-05-02T13:38:47.316695"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3290, "epoch": 0, "train_loss": 3.978618696331978, "train_ppl": 53.4431619835575, "lr": 0.00066, "grad_norm": 0.4959, "tokens_per_sec": 402650, "dt_s": 13.021, "eta_s": 10805, "world_size": 8, "timestamp": "2026-05-02T13:39:00.337572"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3300, "epoch": 0, "train_loss": 3.9675633013248444, "train_ppl": 52.855580673728106, "lr": 0.00066, "grad_norm": 0.4463, "tokens_per_sec": 402628, "dt_s": 13.022, "eta_s": 10793, "world_size": 8, "timestamp": "2026-05-02T13:39:13.359222"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3310, "epoch": 0, "train_loss": 4.055207997560501, "train_ppl": 57.69716261724608, "lr": 0.00066, "grad_norm": 0.4451, "tokens_per_sec": 402657, "dt_s": 13.021, "eta_s": 10777, "world_size": 8, "timestamp": "2026-05-02T13:39:26.380064"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3320, "epoch": 0, "train_loss": 4.06026092171669, "train_ppl": 57.98943980935126, "lr": 0.00066, "grad_norm": 0.5824, "tokens_per_sec": 402139, "dt_s": 13.037, "eta_s": 10766, "world_size": 8, "timestamp": "2026-05-02T13:39:39.417405"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3330, "epoch": 0, "train_loss": 4.035500228404999, "train_ppl": 56.57121168863136, "lr": 0.00066, "grad_norm": 0.4411, "tokens_per_sec": 401884, "dt_s": 13.046, "eta_s": 10756, "world_size": 8, "timestamp": "2026-05-02T13:39:52.463280"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3340, "epoch": 0, "train_loss": 4.094229608774185, "train_ppl": 59.993103189538736, "lr": 0.00066, "grad_norm": 0.4993, "tokens_per_sec": 402708, "dt_s": 13.019, "eta_s": 10743, "world_size": 8, "timestamp": "2026-05-02T13:40:05.482270"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3350, "epoch": 0, "train_loss": 4.113725125789642, "train_ppl": 61.174175150056946, "lr": 0.00066, "grad_norm": 0.4181, "tokens_per_sec": 402668, "dt_s": 13.02, "eta_s": 10730, "world_size": 8, "timestamp": "2026-05-02T13:40:18.502589"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3360, "epoch": 0, "train_loss": 4.016866594552994, "train_ppl": 55.52684483614017, "lr": 0.00066, "grad_norm": 0.4503, "tokens_per_sec": 402751, "dt_s": 13.018, "eta_s": 10716, "world_size": 8, "timestamp": "2026-05-02T13:40:31.520290"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3370, "epoch": 0, "train_loss": 4.069400891661644, "train_ppl": 58.521891134185154, "lr": 0.00066, "grad_norm": 0.4591, "tokens_per_sec": 402686, "dt_s": 13.02, "eta_s": 10700, "world_size": 8, "timestamp": "2026-05-02T13:40:44.540174"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3380, "epoch": 0, "train_loss": 4.046653211116791, "train_ppl": 57.20568097030707, "lr": 0.00066, "grad_norm": 0.4408, "tokens_per_sec": 379288, "dt_s": 13.823, "eta_s": 10815, "world_size": 8, "timestamp": "2026-05-02T13:40:58.363008"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3390, "epoch": 0, "train_loss": 4.130770310759544, "train_ppl": 62.22583771006775, "lr": 0.00066, "grad_norm": 0.4781, "tokens_per_sec": 376147, "dt_s": 13.938, "eta_s": 10952, "world_size": 8, "timestamp": "2026-05-02T13:41:12.301414"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3400, "epoch": 0, "train_loss": 4.088705956935883, "train_ppl": 59.66263570870759, "lr": 0.00066, "grad_norm": 0.4565, "tokens_per_sec": 377840, "dt_s": 13.876, "eta_s": 11079, "world_size": 8, "timestamp": "2026-05-02T13:41:26.177452"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3410, "epoch": 0, "train_loss": 4.019984751939774, "train_ppl": 55.7002564994546, "lr": 0.00066, "grad_norm": 0.4596, "tokens_per_sec": 377361, "dt_s": 13.894, "eta_s": 11209, "world_size": 8, "timestamp": "2026-05-02T13:41:40.070859"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3420, "epoch": 0, "train_loss": 4.041004493832588, "train_ppl": 56.88345319506065, "lr": 0.00066, "grad_norm": 0.4325, "tokens_per_sec": 377017, "dt_s": 13.906, "eta_s": 11340, "world_size": 8, "timestamp": "2026-05-02T13:41:53.977190"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3430, "epoch": 0, "train_loss": 4.076733037829399, "train_ppl": 58.95255912491932, "lr": 0.00066, "grad_norm": 0.4392, "tokens_per_sec": 377345, "dt_s": 13.894, "eta_s": 11337, "world_size": 8, "timestamp": "2026-05-02T13:42:07.871233"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3440, "epoch": 0, "train_loss": 4.0827136635780334, "train_ppl": 59.30618872707383, "lr": 0.00066, "grad_norm": 0.4728, "tokens_per_sec": 376344, "dt_s": 13.931, "eta_s": 11322, "world_size": 8, "timestamp": "2026-05-02T13:42:21.802306"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3450, "epoch": 0, "train_loss": 3.990983620285988, "train_ppl": 54.108085006929336, "lr": 0.00066, "grad_norm": 0.4374, "tokens_per_sec": 377484, "dt_s": 13.889, "eta_s": 11310, "world_size": 8, "timestamp": "2026-05-02T13:42:35.691434"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3460, "epoch": 0, "train_loss": 3.9955261945724487, "train_ppl": 54.35443410850132, "lr": 0.00066, "grad_norm": 0.4562, "tokens_per_sec": 377011, "dt_s": 13.906, "eta_s": 11299, "world_size": 8, "timestamp": "2026-05-02T13:42:49.597714"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3470, "epoch": 0, "train_loss": 4.014970153570175, "train_ppl": 55.42164123968253, "lr": 0.00066, "grad_norm": 0.454, "tokens_per_sec": 379134, "dt_s": 13.829, "eta_s": 11272, "world_size": 8, "timestamp": "2026-05-02T13:43:03.426310"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3480, "epoch": 0, "train_loss": 3.9460033178329468, "train_ppl": 51.72821192228416, "lr": 0.00066, "grad_norm": 0.4766, "tokens_per_sec": 401799, "dt_s": 13.049, "eta_s": 11121, "world_size": 8, "timestamp": "2026-05-02T13:43:16.474923"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3490, "epoch": 0, "train_loss": 3.987846091389656, "train_ppl": 53.93858537074556, "lr": 0.00066, "grad_norm": 0.4927, "tokens_per_sec": 385779, "dt_s": 13.59, "eta_s": 11052, "world_size": 8, "timestamp": "2026-05-02T13:43:30.065188"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3500, "epoch": 0, "train_loss": 3.936375766992569, "train_ppl": 51.23258559423958, "lr": 0.00066, "grad_norm": 0.4305, "tokens_per_sec": 395968, "dt_s": 13.241, "eta_s": 10934, "world_size": 8, "timestamp": "2026-05-02T13:43:43.305874"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3510, "epoch": 0, "train_loss": 4.078239515423775, "train_ppl": 59.04143676364901, "lr": 0.00066, "grad_norm": 0.4294, "tokens_per_sec": 402498, "dt_s": 13.026, "eta_s": 10778, "world_size": 8, "timestamp": "2026-05-02T13:43:56.331844"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3520, "epoch": 0, "train_loss": 4.096453547477722, "train_ppl": 60.126672643754866, "lr": 0.00066, "grad_norm": 0.4463, "tokens_per_sec": 402695, "dt_s": 13.019, "eta_s": 10634, "world_size": 8, "timestamp": "2026-05-02T13:44:09.351191"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3530, "epoch": 0, "train_loss": 4.127837851643562, "train_ppl": 62.043630273525984, "lr": 0.00066, "grad_norm": 0.5049, "tokens_per_sec": 402672, "dt_s": 13.02, "eta_s": 10616, "world_size": 8, "timestamp": "2026-05-02T13:44:22.371442"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3540, "epoch": 0, "train_loss": 4.063577741384506, "train_ppl": 58.18209965619558, "lr": 0.00066, "grad_norm": 0.435, "tokens_per_sec": 402634, "dt_s": 13.021, "eta_s": 10512, "world_size": 8, "timestamp": "2026-05-02T13:44:35.393020"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3550, "epoch": 0, "train_loss": 4.119883447885513, "train_ppl": 61.552067822682694, "lr": 0.00066, "grad_norm": 0.5167, "tokens_per_sec": 402904, "dt_s": 13.013, "eta_s": 10462, "world_size": 8, "timestamp": "2026-05-02T13:44:48.405611"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3560, "epoch": 0, "train_loss": 3.939982831478119, "train_ppl": 51.41771852672338, "lr": 0.00066, "grad_norm": 0.6291, "tokens_per_sec": 392899, "dt_s": 13.344, "eta_s": 10500, "world_size": 8, "timestamp": "2026-05-02T13:45:01.749727"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3570, "epoch": 0, "train_loss": 4.058350130915642, "train_ppl": 57.87873991705511, "lr": 0.00066, "grad_norm": 0.4707, "tokens_per_sec": 375940, "dt_s": 13.946, "eta_s": 10636, "world_size": 8, "timestamp": "2026-05-02T13:45:15.695873"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3580, "epoch": 0, "train_loss": 4.005937322974205, "train_ppl": 54.92328113273782, "lr": 0.00066, "grad_norm": 0.5009, "tokens_per_sec": 377748, "dt_s": 13.879, "eta_s": 10760, "world_size": 8, "timestamp": "2026-05-02T13:45:29.575055"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3590, "epoch": 0, "train_loss": 3.9693574011325836, "train_ppl": 52.95049397736383, "lr": 0.00066, "grad_norm": 0.4546, "tokens_per_sec": 376781, "dt_s": 13.915, "eta_s": 10889, "world_size": 8, "timestamp": "2026-05-02T13:45:43.490086"}
{"run_name": "final_c2_18l320_standard", "stage": "pretraining", "event": "train_step", "step": 3600, "epoch": 0, "train_loss": 4.020142674446106, "train_ppl": 55.70905351816969, "lr": 0.00066, "grad_norm": 0.4499, "tokens_per_sec": 377443, "dt_s": 13.891, "eta_s": 11016, "world_size": 8, "timestamp": "2026-05-02T13:45:57.380549"}