{"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10, "epoch": 0, "train_loss": 10.813395261764526, "train_ppl": 49681.86527842387, "lr": 6.040992448759438e-06, "grad_norm": 0.5695, "tokens_per_sec": 13659, "dt_s": 47.982, "eta_s": 444668, "world_size": 1, "timestamp": "2026-05-04T20:24:00.150688"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20, "epoch": 0, "train_loss": 10.795455694198608, "train_ppl": 48798.541016849835, "lr": 1.2081984897518876e-05, "grad_norm": 0.4217, "tokens_per_sec": 153224, "dt_s": 4.277, "eta_s": 242123, "world_size": 1, "timestamp": "2026-05-04T20:24:04.427823"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30, "epoch": 0, "train_loss": 10.779916226863861, "train_ppl": 48046.09909731803, "lr": 1.8122977346278317e-05, "grad_norm": 0.4781, "tokens_per_sec": 153446, "dt_s": 4.271, "eta_s": 174587, "world_size": 1, "timestamp": "2026-05-04T20:24:08.698742"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40, "epoch": 0, "train_loss": 10.751462876796722, "train_ppl": 46698.29235884729, "lr": 2.416396979503775e-05, "grad_norm": 0.5757, "tokens_per_sec": 150988, "dt_s": 4.34, "eta_s": 140977, "world_size": 1, "timestamp": "2026-05-04T20:24:13.039220"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50, "epoch": 0, "train_loss": 10.712764024734497, "train_ppl": 44925.64301984376, "lr": 3.0204962243797193e-05, "grad_norm": 0.6597, "tokens_per_sec": 135039, "dt_s": 4.853, "eta_s": 121760, "world_size": 1, "timestamp": "2026-05-04T20:24:17.892337"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60, "epoch": 0, "train_loss": 10.638461887836456, "train_ppl": 41708.56939845686, "lr": 3.6245954692556634e-05, "grad_norm": 0.8304, "tokens_per_sec": 151220, "dt_s": 4.334, "eta_s": 40888, "world_size": 1, "timestamp": "2026-05-04T20:24:22.226150"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70, "epoch": 0, "train_loss": 10.530482530593872, "train_ppl": 37439.535746966125, "lr": 4.2286947141316075e-05, "grad_norm": 1.0244, "tokens_per_sec": 150736, "dt_s": 4.348, "eta_s": 41014, "world_size": 1, "timestamp": "2026-05-04T20:24:26.573892"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80, "epoch": 0, "train_loss": 10.376741230487823, "train_ppl": 32104.1707468966, "lr": 4.83279395900755e-05, "grad_norm": 1.2307, "tokens_per_sec": 151107, "dt_s": 4.337, "eta_s": 41132, "world_size": 1, "timestamp": "2026-05-04T20:24:30.910951"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90, "epoch": 0, "train_loss": 10.164317607879639, "train_ppl": 25960.141146503167, "lr": 5.4368932038834944e-05, "grad_norm": 1.5098, "tokens_per_sec": 150804, "dt_s": 4.346, "eta_s": 41138, "world_size": 1, "timestamp": "2026-05-04T20:24:35.256734"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 100, "epoch": 0, "train_loss": 9.89901614189148, "train_ppl": 19910.77142457402, "lr": 6.0409924487594385e-05, "grad_norm": 1.7651, "tokens_per_sec": 151693, "dt_s": 4.32, "eta_s": 40147, "world_size": 1, "timestamp": "2026-05-04T20:24:39.577036"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 110, "epoch": 0, "train_loss": 9.596240639686584, "train_ppl": 14709.379634881803, "lr": 6.645091693635382e-05, "grad_norm": 2.0156, "tokens_per_sec": 151388, "dt_s": 4.329, "eta_s": 40133, "world_size": 1, "timestamp": "2026-05-04T20:24:43.906032"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 120, "epoch": 0, "train_loss": 9.209220945835114, "train_ppl": 9988.81200182644, "lr": 7.249190938511327e-05, "grad_norm": 2.2756, "tokens_per_sec": 148064, "dt_s": 4.426, "eta_s": 40274, "world_size": 1, "timestamp": "2026-05-04T20:24:48.332250"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 130, "epoch": 0, "train_loss": 8.810535967350006, "train_ppl": 6704.511703588624, "lr": 7.85329018338727e-05, "grad_norm": 2.4511, "tokens_per_sec": 151830, "dt_s": 4.316, "eta_s": 40232, "world_size": 1, "timestamp": "2026-05-04T20:24:52.648648"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 140, "epoch": 0, "train_loss": 8.381231665611267, "train_ppl": 4364.381075262094, "lr": 8.457389428263215e-05, "grad_norm": 2.5689, "tokens_per_sec": 151126, "dt_s": 4.337, "eta_s": 40210, "world_size": 1, "timestamp": "2026-05-04T20:24:56.985157"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 150, "epoch": 0, "train_loss": 8.041115999221802, "train_ppl": 3106.0776370341887, "lr": 9.061488673139157e-05, "grad_norm": 2.4544, "tokens_per_sec": 147576, "dt_s": 4.441, "eta_s": 40429, "world_size": 1, "timestamp": "2026-05-04T20:25:01.425992"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 160, "epoch": 0, "train_loss": 7.815774112939835, "train_ppl": 2479.405550231845, "lr": 9.6655879180151e-05, "grad_norm": 1.6757, "tokens_per_sec": 152174, "dt_s": 4.307, "eta_s": 40383, "world_size": 1, "timestamp": "2026-05-04T20:25:05.732663"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 170, "epoch": 0, "train_loss": 7.5937148332595825, "train_ppl": 1985.6762858530228, "lr": 0.00010269687162891045, "grad_norm": 1.1427, "tokens_per_sec": 150485, "dt_s": 4.355, "eta_s": 40247, "world_size": 1, "timestamp": "2026-05-04T20:25:10.087647"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 180, "epoch": 0, "train_loss": 7.590886503458023, "train_ppl": 1980.0680731136576, "lr": 0.00010873786407766989, "grad_norm": 0.6641, "tokens_per_sec": 148721, "dt_s": 4.407, "eta_s": 40410, "world_size": 1, "timestamp": "2026-05-04T20:25:14.494311"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 190, "epoch": 0, "train_loss": 7.753774851560593, "train_ppl": 2330.352567323899, "lr": 0.00011477885652642934, "grad_norm": 0.8623, "tokens_per_sec": 149285, "dt_s": 4.39, "eta_s": 40504, "world_size": 1, "timestamp": "2026-05-04T20:25:18.884289"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 200, "epoch": 0, "train_loss": 7.6607730984687805, "train_ppl": 2123.398390228366, "lr": 0.00012081984897518877, "grad_norm": 0.5303, "tokens_per_sec": 147215, "dt_s": 4.452, "eta_s": 40520, "world_size": 1, "timestamp": "2026-05-04T20:25:23.335978"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 210, "epoch": 0, "train_loss": 7.565940588712692, "train_ppl": 1931.2844693862971, "lr": 0.00012686084142394822, "grad_norm": 0.4335, "tokens_per_sec": 149116, "dt_s": 4.395, "eta_s": 40679, "world_size": 1, "timestamp": "2026-05-04T20:25:27.730948"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 220, "epoch": 0, "train_loss": 7.559975683689117, "train_ppl": 1919.7988303807917, "lr": 0.00013290183387270764, "grad_norm": 0.5581, "tokens_per_sec": 150299, "dt_s": 4.36, "eta_s": 40685, "world_size": 1, "timestamp": "2026-05-04T20:25:32.091329"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 230, "epoch": 0, "train_loss": 7.587352067232132, "train_ppl": 1973.082001970415, "lr": 0.0001389428263214671, "grad_norm": 0.6232, "tokens_per_sec": 148176, "dt_s": 4.423, "eta_s": 40710, "world_size": 1, "timestamp": "2026-05-04T20:25:36.514153"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 240, "epoch": 0, "train_loss": 7.628476291894913, "train_ppl": 2055.915017653997, "lr": 0.00014498381877022654, "grad_norm": 0.4652, "tokens_per_sec": 150898, "dt_s": 4.343, "eta_s": 40619, "world_size": 1, "timestamp": "2026-05-04T20:25:40.857241"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 250, "epoch": 0, "train_loss": 7.8391848504543304, "train_ppl": 2538.1350307247662, "lr": 0.00015102481121898596, "grad_norm": 0.6461, "tokens_per_sec": 152600, "dt_s": 4.295, "eta_s": 40324, "world_size": 1, "timestamp": "2026-05-04T20:25:45.151844"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 260, "epoch": 0, "train_loss": 7.501410961151123, "train_ppl": 1810.5952926448924, "lr": 0.0001570658036677454, "grad_norm": 0.5953, "tokens_per_sec": 147737, "dt_s": 4.436, "eta_s": 40396, "world_size": 1, "timestamp": "2026-05-04T20:25:49.587852"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 270, "epoch": 0, "train_loss": 7.5918175876140594, "train_ppl": 1981.9125416688812, "lr": 0.00016310679611650483, "grad_norm": 0.9775, "tokens_per_sec": 149437, "dt_s": 4.386, "eta_s": 40438, "world_size": 1, "timestamp": "2026-05-04T20:25:53.973393"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 280, "epoch": 0, "train_loss": 7.590016454458237, "train_ppl": 1978.346066091008, "lr": 0.0001691477885652643, "grad_norm": 0.6265, "tokens_per_sec": 150641, "dt_s": 4.35, "eta_s": 40300, "world_size": 1, "timestamp": "2026-05-04T20:25:58.323865"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 290, "epoch": 0, "train_loss": 7.508354604244232, "train_ppl": 1823.2111695276337, "lr": 0.00017518878101402372, "grad_norm": 1.4259, "tokens_per_sec": 146251, "dt_s": 4.481, "eta_s": 40550, "world_size": 1, "timestamp": "2026-05-04T20:26:02.804904"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 300, "epoch": 0, "train_loss": 7.64470836520195, "train_ppl": 2089.5590988203126, "lr": 0.00018122977346278314, "grad_norm": 0.6843, "tokens_per_sec": 137203, "dt_s": 4.777, "eta_s": 41436, "world_size": 1, "timestamp": "2026-05-04T20:26:07.581483"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 310, "epoch": 0, "train_loss": 7.430827379226685, "train_ppl": 1687.2029527073921, "lr": 0.0001872707659115426, "grad_norm": 0.8534, "tokens_per_sec": 149453, "dt_s": 4.385, "eta_s": 41338, "world_size": 1, "timestamp": "2026-05-04T20:26:11.966537"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 320, "epoch": 0, "train_loss": 7.599418371915817, "train_ppl": 1997.0340261743806, "lr": 0.000193311758360302, "grad_norm": 1.2102, "tokens_per_sec": 148586, "dt_s": 4.411, "eta_s": 41380, "world_size": 1, "timestamp": "2026-05-04T20:26:16.377195"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 330, "epoch": 0, "train_loss": 7.763778805732727, "train_ppl": 2353.7823072325327, "lr": 0.0001993527508090615, "grad_norm": 1.1405, "tokens_per_sec": 150855, "dt_s": 4.344, "eta_s": 41364, "world_size": 1, "timestamp": "2026-05-04T20:26:20.721488"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 340, "epoch": 0, "train_loss": 7.268510013818741, "train_ppl": 1434.411606534459, "lr": 0.0002053937432578209, "grad_norm": 0.9679, "tokens_per_sec": 148245, "dt_s": 4.421, "eta_s": 41248, "world_size": 1, "timestamp": "2026-05-04T20:26:25.142309"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 350, "epoch": 0, "train_loss": 7.360440641641617, "train_ppl": 1572.529332221458, "lr": 0.00021143473570658036, "grad_norm": 1.0323, "tokens_per_sec": 146072, "dt_s": 4.487, "eta_s": 40708, "world_size": 1, "timestamp": "2026-05-04T20:26:29.628851"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 360, "epoch": 0, "train_loss": 7.344170033931732, "train_ppl": 1547.150349961495, "lr": 0.00021747572815533978, "grad_norm": 1.3562, "tokens_per_sec": 150954, "dt_s": 4.341, "eta_s": 40623, "world_size": 1, "timestamp": "2026-05-04T20:26:33.970311"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 370, "epoch": 0, "train_loss": 7.308372259140015, "train_ppl": 1492.745407879514, "lr": 0.00022351672060409925, "grad_norm": 3.6676, "tokens_per_sec": 147833, "dt_s": 4.433, "eta_s": 40660, "world_size": 1, "timestamp": "2026-05-04T20:26:38.403424"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 380, "epoch": 0, "train_loss": 7.392580032348633, "train_ppl": 1623.890402580581, "lr": 0.00022955771305285867, "grad_norm": 1.1417, "tokens_per_sec": 149491, "dt_s": 4.384, "eta_s": 40729, "world_size": 1, "timestamp": "2026-05-04T20:26:42.787385"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 390, "epoch": 0, "train_loss": 7.351307570934296, "train_ppl": 1558.2326961121223, "lr": 0.0002355987055016181, "grad_norm": 3.1273, "tokens_per_sec": 149697, "dt_s": 4.378, "eta_s": 40645, "world_size": 1, "timestamp": "2026-05-04T20:26:47.165295"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 400, "epoch": 0, "train_loss": 7.319361686706543, "train_ppl": 1509.240294089748, "lr": 0.00024163969795037754, "grad_norm": 2.0324, "tokens_per_sec": 146398, "dt_s": 4.477, "eta_s": 40623, "world_size": 1, "timestamp": "2026-05-04T20:26:51.641842"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 410, "epoch": 0, "train_loss": 7.185497134923935, "train_ppl": 1320.1453655370763, "lr": 0.00024768069039913696, "grad_norm": 0.981, "tokens_per_sec": 149849, "dt_s": 4.373, "eta_s": 40677, "world_size": 1, "timestamp": "2026-05-04T20:26:56.015380"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 420, "epoch": 0, "train_loss": 7.129296451807022, "train_ppl": 1247.9986307920572, "lr": 0.00025372168284789644, "grad_norm": 2.1616, "tokens_per_sec": 148151, "dt_s": 4.424, "eta_s": 40655, "world_size": 1, "timestamp": "2026-05-04T20:27:00.438913"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 430, "epoch": 0, "train_loss": 7.223040968179703, "train_ppl": 1370.650835164456, "lr": 0.00025976267529665586, "grad_norm": 1.1968, "tokens_per_sec": 145580, "dt_s": 4.502, "eta_s": 40868, "world_size": 1, "timestamp": "2026-05-04T20:27:04.940625"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 440, "epoch": 0, "train_loss": 7.123153865337372, "train_ppl": 1240.3561875027558, "lr": 0.0002658036677454153, "grad_norm": 0.8095, "tokens_per_sec": 150301, "dt_s": 4.36, "eta_s": 40831, "world_size": 1, "timestamp": "2026-05-04T20:27:09.300939"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 450, "epoch": 0, "train_loss": 7.098129898309708, "train_ppl": 1209.7026907918146, "lr": 0.00027184466019417475, "grad_norm": 2.3146, "tokens_per_sec": 149019, "dt_s": 4.398, "eta_s": 40681, "world_size": 1, "timestamp": "2026-05-04T20:27:13.698781"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 460, "epoch": 0, "train_loss": 7.077949434518814, "train_ppl": 1185.535007162192, "lr": 0.0002778856526429342, "grad_norm": 1.6442, "tokens_per_sec": 148042, "dt_s": 4.427, "eta_s": 40775, "world_size": 1, "timestamp": "2026-05-04T20:27:18.125605"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 470, "epoch": 0, "train_loss": 7.005278170108795, "train_ppl": 1102.4366772921003, "lr": 0.00028392664509169365, "grad_norm": 1.1071, "tokens_per_sec": 150852, "dt_s": 4.344, "eta_s": 40625, "world_size": 1, "timestamp": "2026-05-04T20:27:22.469982"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 480, "epoch": 0, "train_loss": 6.8543645441532135, "train_ppl": 948.0095197672839, "lr": 0.00028996763754045307, "grad_norm": 1.6326, "tokens_per_sec": 150029, "dt_s": 4.368, "eta_s": 40375, "world_size": 1, "timestamp": "2026-05-04T20:27:26.838213"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 490, "epoch": 0, "train_loss": 7.139170825481415, "train_ppl": 1260.3828783363854, "lr": 0.0002960086299892125, "grad_norm": 3.4507, "tokens_per_sec": 148428, "dt_s": 4.415, "eta_s": 40472, "world_size": 1, "timestamp": "2026-05-04T20:27:31.253530"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 500, "epoch": 0, "train_loss": 6.922876238822937, "train_ppl": 1015.2358599569116, "lr": 0.0003020496224379719, "grad_norm": 1.3174, "tokens_per_sec": 149780, "dt_s": 4.375, "eta_s": 40426, "world_size": 1, "timestamp": "2026-05-04T20:27:35.629028"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 510, "epoch": 0, "train_loss": 6.926956206560135, "train_ppl": 1019.3864508921281, "lr": 0.00030809061488673133, "grad_norm": 1.7169, "tokens_per_sec": 26849, "dt_s": 24.409, "eta_s": 40304, "world_size": 1, "timestamp": "2026-05-04T20:28:00.037754"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 520, "epoch": 0, "train_loss": 6.724045038223267, "train_ppl": 832.1768998420694, "lr": 0.0003141316073354908, "grad_norm": 1.4151, "tokens_per_sec": 150416, "dt_s": 4.357, "eta_s": 40323, "world_size": 1, "timestamp": "2026-05-04T20:28:04.394733"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 530, "epoch": 0, "train_loss": 6.884079724550247, "train_ppl": 976.6025127195289, "lr": 0.00032017259978425023, "grad_norm": 1.048, "tokens_per_sec": 132985, "dt_s": 4.928, "eta_s": 41350, "world_size": 1, "timestamp": "2026-05-04T20:28:09.322807"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 540, "epoch": 0, "train_loss": 6.8941148817539215, "train_ppl": 986.4522114388565, "lr": 0.00032621359223300965, "grad_norm": 1.9618, "tokens_per_sec": 149659, "dt_s": 4.379, "eta_s": 41279, "world_size": 1, "timestamp": "2026-05-04T20:28:13.701817"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 550, "epoch": 0, "train_loss": 6.780481964349747, "train_ppl": 880.4929880899944, "lr": 0.0003322545846817691, "grad_norm": 1.676, "tokens_per_sec": 146331, "dt_s": 4.479, "eta_s": 41464, "world_size": 1, "timestamp": "2026-05-04T20:28:18.180453"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 560, "epoch": 0, "train_loss": 6.797888159751892, "train_ppl": 895.9531823353552, "lr": 0.0003382955771305286, "grad_norm": 1.3102, "tokens_per_sec": 148998, "dt_s": 4.398, "eta_s": 41525, "world_size": 1, "timestamp": "2026-05-04T20:28:22.578893"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 570, "epoch": 0, "train_loss": 6.780531227588654, "train_ppl": 880.5363650948595, "lr": 0.000344336569579288, "grad_norm": 2.1173, "tokens_per_sec": 148925, "dt_s": 4.401, "eta_s": 41601, "world_size": 1, "timestamp": "2026-05-04T20:28:26.979492"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 580, "epoch": 0, "train_loss": 6.6542483270168304, "train_ppl": 776.0743496644287, "lr": 0.00035037756202804744, "grad_norm": 2.0963, "tokens_per_sec": 146308, "dt_s": 4.479, "eta_s": 40770, "world_size": 1, "timestamp": "2026-05-04T20:28:31.458800"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 590, "epoch": 0, "train_loss": 6.632586628198624, "train_ppl": 759.4440315929323, "lr": 0.00035641855447680686, "grad_norm": 1.7614, "tokens_per_sec": 149241, "dt_s": 4.391, "eta_s": 40788, "world_size": 1, "timestamp": "2026-05-04T20:28:35.850066"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 600, "epoch": 0, "train_loss": 6.800557464361191, "train_ppl": 898.3479490536686, "lr": 0.0003624595469255663, "grad_norm": 1.3487, "tokens_per_sec": 147143, "dt_s": 4.454, "eta_s": 40738, "world_size": 1, "timestamp": "2026-05-04T20:28:40.303957"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 610, "epoch": 0, "train_loss": 6.483335018157959, "train_ppl": 654.148910972689, "lr": 0.00036850053937432576, "grad_norm": 1.4737, "tokens_per_sec": 148809, "dt_s": 4.404, "eta_s": 40744, "world_size": 1, "timestamp": "2026-05-04T20:28:44.708048"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 620, "epoch": 0, "train_loss": 6.530223906040192, "train_ppl": 685.5516934734287, "lr": 0.0003745415318230852, "grad_norm": 1.5747, "tokens_per_sec": 149776, "dt_s": 4.376, "eta_s": 40693, "world_size": 1, "timestamp": "2026-05-04T20:28:49.083604"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 630, "epoch": 0, "train_loss": 6.596829891204834, "train_ppl": 732.7685473197658, "lr": 0.0003805825242718446, "grad_norm": 2.0225, "tokens_per_sec": 146344, "dt_s": 4.478, "eta_s": 40687, "world_size": 1, "timestamp": "2026-05-04T20:28:53.561818"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 640, "epoch": 0, "train_loss": 6.427870869636536, "train_ppl": 618.8549215173341, "lr": 0.000386623516720604, "grad_norm": 1.6753, "tokens_per_sec": 148748, "dt_s": 4.406, "eta_s": 40709, "world_size": 1, "timestamp": "2026-05-04T20:28:57.967675"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 650, "epoch": 0, "train_loss": 6.472074806690216, "train_ppl": 646.8243712281012, "lr": 0.00039266450916936355, "grad_norm": 1.3521, "tokens_per_sec": 149332, "dt_s": 4.389, "eta_s": 40585, "world_size": 1, "timestamp": "2026-05-04T20:29:02.356298"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 660, "epoch": 0, "train_loss": 6.495208889245987, "train_ppl": 661.9624877134025, "lr": 0.000398705501618123, "grad_norm": 1.3704, "tokens_per_sec": 145515, "dt_s": 4.504, "eta_s": 40764, "world_size": 1, "timestamp": "2026-05-04T20:29:06.860020"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 670, "epoch": 0, "train_loss": 6.595969885587692, "train_ppl": 732.1386331566896, "lr": 0.0004047464940668824, "grad_norm": 1.1882, "tokens_per_sec": 148159, "dt_s": 4.423, "eta_s": 40847, "world_size": 1, "timestamp": "2026-05-04T20:29:11.283379"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 680, "epoch": 0, "train_loss": 6.3862179815769196, "train_ppl": 593.6072952248454, "lr": 0.0004107874865156418, "grad_norm": 2.0838, "tokens_per_sec": 147282, "dt_s": 4.45, "eta_s": 40790, "world_size": 1, "timestamp": "2026-05-04T20:29:15.733061"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 690, "epoch": 0, "train_loss": 6.391629248857498, "train_ppl": 596.8281696056679, "lr": 0.00041682847896440124, "grad_norm": 1.3944, "tokens_per_sec": 147184, "dt_s": 4.453, "eta_s": 40872, "world_size": 1, "timestamp": "2026-05-04T20:29:20.185738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 700, "epoch": 0, "train_loss": 6.5539476573467255, "train_ppl": 702.0100059025904, "lr": 0.0004228694714131607, "grad_norm": 1.1169, "tokens_per_sec": 150086, "dt_s": 4.367, "eta_s": 40827, "world_size": 1, "timestamp": "2026-05-04T20:29:24.552267"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 710, "epoch": 0, "train_loss": 6.280904084444046, "train_ppl": 534.2714720697595, "lr": 0.00042891046386192013, "grad_norm": 1.4467, "tokens_per_sec": 149765, "dt_s": 4.376, "eta_s": 40587, "world_size": 1, "timestamp": "2026-05-04T20:29:28.928207"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 720, "epoch": 0, "train_loss": 6.250024378299713, "train_ppl": 518.0254530941662, "lr": 0.00043495145631067955, "grad_norm": 1.5574, "tokens_per_sec": 147946, "dt_s": 4.43, "eta_s": 40595, "world_size": 1, "timestamp": "2026-05-04T20:29:33.357912"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 730, "epoch": 0, "train_loss": 6.244597166776657, "train_ppl": 515.221634725691, "lr": 0.000440992448759439, "grad_norm": 1.7781, "tokens_per_sec": 149427, "dt_s": 4.386, "eta_s": 40473, "world_size": 1, "timestamp": "2026-05-04T20:29:37.743731"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 740, "epoch": 0, "train_loss": 6.234754145145416, "train_ppl": 510.17517398514605, "lr": 0.0004470334412081985, "grad_norm": 1.4212, "tokens_per_sec": 148141, "dt_s": 4.424, "eta_s": 40416, "world_size": 1, "timestamp": "2026-05-04T20:29:42.167634"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 750, "epoch": 0, "train_loss": 6.287113010883331, "train_ppl": 537.599043973003, "lr": 0.0004530744336569579, "grad_norm": 1.6785, "tokens_per_sec": 148619, "dt_s": 4.41, "eta_s": 40491, "world_size": 1, "timestamp": "2026-05-04T20:29:46.577319"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 760, "epoch": 0, "train_loss": 6.242779076099396, "train_ppl": 514.2857660796923, "lr": 0.00045911542610571735, "grad_norm": 1.3898, "tokens_per_sec": 149733, "dt_s": 4.377, "eta_s": 40488, "world_size": 1, "timestamp": "2026-05-04T20:29:50.954146"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 770, "epoch": 0, "train_loss": 6.354778349399567, "train_ppl": 575.234825383624, "lr": 0.00046515641855447677, "grad_norm": 1.1873, "tokens_per_sec": 147826, "dt_s": 4.433, "eta_s": 40490, "world_size": 1, "timestamp": "2026-05-04T20:29:55.387465"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 780, "epoch": 0, "train_loss": 6.229099333286285, "train_ppl": 507.2983709170459, "lr": 0.0004711974110032362, "grad_norm": 0.8227, "tokens_per_sec": 148505, "dt_s": 4.413, "eta_s": 40536, "world_size": 1, "timestamp": "2026-05-04T20:29:59.800540"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 790, "epoch": 0, "train_loss": 6.259915500879288, "train_ppl": 523.1747304354872, "lr": 0.00047723840345199566, "grad_norm": 1.3318, "tokens_per_sec": 149268, "dt_s": 4.39, "eta_s": 40470, "world_size": 1, "timestamp": "2026-05-04T20:30:04.190992"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 800, "epoch": 0, "train_loss": 6.28540575504303, "train_ppl": 536.682007895781, "lr": 0.0004832793959007551, "grad_norm": 2.18, "tokens_per_sec": 146367, "dt_s": 4.478, "eta_s": 40590, "world_size": 1, "timestamp": "2026-05-04T20:30:08.668504"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 810, "epoch": 0, "train_loss": 6.189121901988983, "train_ppl": 487.417917542215, "lr": 0.0004893203883495146, "grad_norm": 1.0681, "tokens_per_sec": 149344, "dt_s": 4.388, "eta_s": 40607, "world_size": 1, "timestamp": "2026-05-04T20:30:13.056781"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 820, "epoch": 0, "train_loss": 6.302308291196823, "train_ppl": 545.830392585313, "lr": 0.0004953613807982739, "grad_norm": 1.0202, "tokens_per_sec": 150217, "dt_s": 4.363, "eta_s": 40473, "world_size": 1, "timestamp": "2026-05-04T20:30:17.419524"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 830, "epoch": 0, "train_loss": 6.256552159786224, "train_ppl": 521.4180711439052, "lr": 0.0005014023732470334, "grad_norm": 1.0335, "tokens_per_sec": 131385, "dt_s": 4.988, "eta_s": 41525, "world_size": 1, "timestamp": "2026-05-04T20:30:22.407628"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 840, "epoch": 0, "train_loss": 6.11391606926918, "train_ppl": 452.1057303896296, "lr": 0.0005074433656957929, "grad_norm": 1.0823, "tokens_per_sec": 149611, "dt_s": 4.38, "eta_s": 41502, "world_size": 1, "timestamp": "2026-05-04T20:30:26.788029"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 850, "epoch": 0, "train_loss": 6.227150857448578, "train_ppl": 506.3108746675205, "lr": 0.0005134843581445524, "grad_norm": 0.9022, "tokens_per_sec": 147620, "dt_s": 4.439, "eta_s": 41427, "world_size": 1, "timestamp": "2026-05-04T20:30:31.227537"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 860, "epoch": 0, "train_loss": 6.096544414758682, "train_ppl": 444.319729498152, "lr": 0.0005195253505933117, "grad_norm": 1.1542, "tokens_per_sec": 149277, "dt_s": 4.39, "eta_s": 41426, "world_size": 1, "timestamp": "2026-05-04T20:30:35.617761"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 870, "epoch": 0, "train_loss": 6.040245622396469, "train_ppl": 419.9961826873507, "lr": 0.0005255663430420712, "grad_norm": 1.4877, "tokens_per_sec": 149457, "dt_s": 4.385, "eta_s": 41462, "world_size": 1, "timestamp": "2026-05-04T20:30:40.002715"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 880, "epoch": 0, "train_loss": 6.031282842159271, "train_ppl": 416.24866836202716, "lr": 0.0005316073354908306, "grad_norm": 1.0784, "tokens_per_sec": 148375, "dt_s": 4.417, "eta_s": 40409, "world_size": 1, "timestamp": "2026-05-04T20:30:44.419598"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 890, "epoch": 0, "train_loss": 6.074007600545883, "train_ppl": 434.41817216880855, "lr": 0.00053764832793959, "grad_norm": 1.6015, "tokens_per_sec": 149548, "dt_s": 4.382, "eta_s": 40408, "world_size": 1, "timestamp": "2026-05-04T20:30:48.801881"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 900, "epoch": 0, "train_loss": 6.132917046546936, "train_ppl": 460.77831395605665, "lr": 0.0005436893203883495, "grad_norm": 0.7685, "tokens_per_sec": 149064, "dt_s": 4.396, "eta_s": 40325, "world_size": 1, "timestamp": "2026-05-04T20:30:53.198397"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 910, "epoch": 0, "train_loss": 6.060025930404663, "train_ppl": 428.38654495173046, "lr": 0.0005497303128371089, "grad_norm": 0.8103, "tokens_per_sec": 148694, "dt_s": 4.407, "eta_s": 40352, "world_size": 1, "timestamp": "2026-05-04T20:30:57.605818"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 920, "epoch": 0, "train_loss": 6.036933660507202, "train_ppl": 418.60747228402556, "lr": 0.0005557713052858684, "grad_norm": 1.1935, "tokens_per_sec": 149228, "dt_s": 4.392, "eta_s": 40360, "world_size": 1, "timestamp": "2026-05-04T20:31:01.997495"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 930, "epoch": 0, "train_loss": 6.0230401158332825, "train_ppl": 412.83174622834895, "lr": 0.00056, "grad_norm": 1.0685, "tokens_per_sec": 150498, "dt_s": 4.355, "eta_s": 40241, "world_size": 1, "timestamp": "2026-05-04T20:31:06.352109"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 940, "epoch": 0, "train_loss": 6.028549879789352, "train_ppl": 415.1126294976957, "lr": 0.00056, "grad_norm": 1.1283, "tokens_per_sec": 147339, "dt_s": 4.448, "eta_s": 40357, "world_size": 1, "timestamp": "2026-05-04T20:31:10.800116"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 950, "epoch": 0, "train_loss": 6.034033387899399, "train_ppl": 417.3951553732942, "lr": 0.00056, "grad_norm": 0.9823, "tokens_per_sec": 150273, "dt_s": 4.361, "eta_s": 40288, "world_size": 1, "timestamp": "2026-05-04T20:31:15.161224"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 960, "epoch": 0, "train_loss": 5.998370349407196, "train_ppl": 402.77188093461893, "lr": 0.00056, "grad_norm": 1.0388, "tokens_per_sec": 150628, "dt_s": 4.351, "eta_s": 40180, "world_size": 1, "timestamp": "2026-05-04T20:31:19.512069"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 970, "epoch": 0, "train_loss": 5.829013824462891, "train_ppl": 340.0231911208001, "lr": 0.00056, "grad_norm": 0.9033, "tokens_per_sec": 149983, "dt_s": 4.37, "eta_s": 40135, "world_size": 1, "timestamp": "2026-05-04T20:31:23.881620"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 980, "epoch": 0, "train_loss": 5.881161272525787, "train_ppl": 358.22499710665994, "lr": 0.00056, "grad_norm": 0.719, "tokens_per_sec": 149454, "dt_s": 4.385, "eta_s": 40187, "world_size": 1, "timestamp": "2026-05-04T20:31:28.266659"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 990, "epoch": 0, "train_loss": 5.80232161283493, "train_ppl": 331.0672784350716, "lr": 0.00056, "grad_norm": 0.8111, "tokens_per_sec": 148285, "dt_s": 4.42, "eta_s": 40130, "world_size": 1, "timestamp": "2026-05-04T20:31:32.686295"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1000, "epoch": 0, "train_loss": 5.883974224328995, "train_ppl": 359.23408535113947, "lr": 0.00056, "grad_norm": 0.7732, "tokens_per_sec": 149250, "dt_s": 4.391, "eta_s": 40181, "world_size": 1, "timestamp": "2026-05-04T20:31:37.077289"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1010, "epoch": 0, "train_loss": 5.817328304052353, "train_ppl": 336.07296833598866, "lr": 0.00056, "grad_norm": 0.8426, "tokens_per_sec": 101700, "dt_s": 6.444, "eta_s": 40342, "world_size": 1, "timestamp": "2026-05-04T20:31:43.521310"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1020, "epoch": 0, "train_loss": 5.9445657432079315, "train_ppl": 381.6735809597684, "lr": 0.00056, "grad_norm": 0.8777, "tokens_per_sec": 145091, "dt_s": 4.517, "eta_s": 40608, "world_size": 1, "timestamp": "2026-05-04T20:31:48.038198"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1030, "epoch": 0, "train_loss": 5.834527254104614, "train_ppl": 341.9030625690711, "lr": 0.00056, "grad_norm": 0.9327, "tokens_per_sec": 149434, "dt_s": 4.386, "eta_s": 40605, "world_size": 1, "timestamp": "2026-05-04T20:31:52.423805"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1040, "epoch": 0, "train_loss": 5.607234865427017, "train_ppl": 272.3900006932448, "lr": 0.00056, "grad_norm": 0.7053, "tokens_per_sec": 148937, "dt_s": 4.4, "eta_s": 40565, "world_size": 1, "timestamp": "2026-05-04T20:31:56.824032"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1050, "epoch": 0, "train_loss": 5.815402001142502, "train_ppl": 335.42621312237554, "lr": 0.00056, "grad_norm": 0.9045, "tokens_per_sec": 147494, "dt_s": 4.443, "eta_s": 40656, "world_size": 1, "timestamp": "2026-05-04T20:32:01.267349"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1060, "epoch": 0, "train_loss": 5.679002493619919, "train_ppl": 292.6573566938344, "lr": 0.00056, "grad_norm": 0.9439, "tokens_per_sec": 149109, "dt_s": 4.395, "eta_s": 40567, "world_size": 1, "timestamp": "2026-05-04T20:32:05.662509"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1070, "epoch": 0, "train_loss": 5.876029819250107, "train_ppl": 356.39149057167793, "lr": 0.00056, "grad_norm": 1.0391, "tokens_per_sec": 146197, "dt_s": 4.483, "eta_s": 40500, "world_size": 1, "timestamp": "2026-05-04T20:32:10.145222"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1080, "epoch": 0, "train_loss": 5.676663368940353, "train_ppl": 291.97359466222423, "lr": 0.00056, "grad_norm": 0.9807, "tokens_per_sec": 150250, "dt_s": 4.362, "eta_s": 40452, "world_size": 1, "timestamp": "2026-05-04T20:32:14.507003"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1090, "epoch": 0, "train_loss": 5.706742465496063, "train_ppl": 300.88931278148937, "lr": 0.00056, "grad_norm": 0.8455, "tokens_per_sec": 148659, "dt_s": 4.408, "eta_s": 40462, "world_size": 1, "timestamp": "2026-05-04T20:32:18.915482"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1100, "epoch": 0, "train_loss": 5.706462770700455, "train_ppl": 300.80516737471487, "lr": 0.00056, "grad_norm": 0.6516, "tokens_per_sec": 147552, "dt_s": 4.442, "eta_s": 40455, "world_size": 1, "timestamp": "2026-05-04T20:32:23.357031"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1110, "epoch": 0, "train_loss": 5.783355474472046, "train_ppl": 324.84738079227674, "lr": 0.00056, "grad_norm": 0.6674, "tokens_per_sec": 148695, "dt_s": 4.407, "eta_s": 40473, "world_size": 1, "timestamp": "2026-05-04T20:32:27.764464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1120, "epoch": 0, "train_loss": 5.813569575548172, "train_ppl": 334.8121323449369, "lr": 0.00056, "grad_norm": 0.8684, "tokens_per_sec": 131228, "dt_s": 4.994, "eta_s": 41405, "world_size": 1, "timestamp": "2026-05-04T20:32:32.758529"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1130, "epoch": 0, "train_loss": 5.524959534406662, "train_ppl": 250.87618596172203, "lr": 0.00056, "grad_norm": 0.9045, "tokens_per_sec": 148069, "dt_s": 4.426, "eta_s": 41518, "world_size": 1, "timestamp": "2026-05-04T20:32:37.184557"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1140, "epoch": 0, "train_loss": 5.595505595207214, "train_ppl": 269.21372886000927, "lr": 0.00056, "grad_norm": 0.6924, "tokens_per_sec": 148993, "dt_s": 4.399, "eta_s": 41495, "world_size": 1, "timestamp": "2026-05-04T20:32:41.583137"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1150, "epoch": 0, "train_loss": 5.69536092877388, "train_ppl": 297.484144799503, "lr": 0.00056, "grad_norm": 0.6615, "tokens_per_sec": 147251, "dt_s": 4.451, "eta_s": 41507, "world_size": 1, "timestamp": "2026-05-04T20:32:46.033765"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1160, "epoch": 0, "train_loss": 5.6913145780563354, "train_ppl": 296.28285168242786, "lr": 0.00056, "grad_norm": 0.769, "tokens_per_sec": 147018, "dt_s": 4.458, "eta_s": 41595, "world_size": 1, "timestamp": "2026-05-04T20:32:50.491447"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1170, "epoch": 0, "train_loss": 5.639537751674652, "train_ppl": 281.33264287094835, "lr": 0.00056, "grad_norm": 0.8267, "tokens_per_sec": 149419, "dt_s": 4.386, "eta_s": 40477, "world_size": 1, "timestamp": "2026-05-04T20:32:54.877484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1180, "epoch": 0, "train_loss": 5.579875469207764, "train_ppl": 265.03859826442806, "lr": 0.00056, "grad_norm": 1.0337, "tokens_per_sec": 147746, "dt_s": 4.436, "eta_s": 40491, "world_size": 1, "timestamp": "2026-05-04T20:32:59.313209"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1190, "epoch": 0, "train_loss": 5.502661347389221, "train_ppl": 245.3440098168852, "lr": 0.00056, "grad_norm": 0.7409, "tokens_per_sec": 148862, "dt_s": 4.402, "eta_s": 40493, "world_size": 1, "timestamp": "2026-05-04T20:33:03.715684"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1200, "epoch": 0, "train_loss": 5.635561227798462, "train_ppl": 280.2161382742632, "lr": 0.00056, "grad_norm": 0.7322, "tokens_per_sec": 149123, "dt_s": 4.395, "eta_s": 40387, "world_size": 1, "timestamp": "2026-05-04T20:33:08.110452"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1210, "epoch": 0, "train_loss": 5.500985115766525, "train_ppl": 244.9331009146518, "lr": 0.00056, "grad_norm": 0.7319, "tokens_per_sec": 148348, "dt_s": 4.418, "eta_s": 40309, "world_size": 1, "timestamp": "2026-05-04T20:33:12.528166"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1220, "epoch": 0, "train_loss": 5.645490407943726, "train_ppl": 283.0123136829222, "lr": 0.00056, "grad_norm": 0.7623, "tokens_per_sec": 150334, "dt_s": 4.359, "eta_s": 40256, "world_size": 1, "timestamp": "2026-05-04T20:33:16.887508"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1230, "epoch": 0, "train_loss": 5.521970897912979, "train_ppl": 250.12752752816687, "lr": 0.00056, "grad_norm": 0.7655, "tokens_per_sec": 146539, "dt_s": 4.472, "eta_s": 40318, "world_size": 1, "timestamp": "2026-05-04T20:33:21.359793"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1240, "epoch": 0, "train_loss": 5.625215083360672, "train_ppl": 277.33192759179633, "lr": 0.00056, "grad_norm": 0.7741, "tokens_per_sec": 147108, "dt_s": 4.455, "eta_s": 40410, "world_size": 1, "timestamp": "2026-05-04T20:33:25.814726"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1250, "epoch": 0, "train_loss": 5.553938210010529, "train_ppl": 258.25260885746144, "lr": 0.00056, "grad_norm": 0.6814, "tokens_per_sec": 148844, "dt_s": 4.403, "eta_s": 40421, "world_size": 1, "timestamp": "2026-05-04T20:33:30.217717"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1260, "epoch": 0, "train_loss": 5.624082922935486, "train_ppl": 277.01812103193987, "lr": 0.00056, "grad_norm": 0.6614, "tokens_per_sec": 148308, "dt_s": 4.419, "eta_s": 40418, "world_size": 1, "timestamp": "2026-05-04T20:33:34.636627"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1270, "epoch": 0, "train_loss": 5.455497562885284, "train_ppl": 234.0412923643151, "lr": 0.00056, "grad_norm": 0.9236, "tokens_per_sec": 148961, "dt_s": 4.4, "eta_s": 40488, "world_size": 1, "timestamp": "2026-05-04T20:33:39.036170"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1280, "epoch": 0, "train_loss": 5.582897037267685, "train_ppl": 265.84064153153673, "lr": 0.00056, "grad_norm": 0.6704, "tokens_per_sec": 149562, "dt_s": 4.382, "eta_s": 40318, "world_size": 1, "timestamp": "2026-05-04T20:33:43.418023"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1290, "epoch": 0, "train_loss": 5.483137339353561, "train_ppl": 240.60036951688744, "lr": 0.00056, "grad_norm": 0.6902, "tokens_per_sec": 145432, "dt_s": 4.506, "eta_s": 40407, "world_size": 1, "timestamp": "2026-05-04T20:33:47.924343"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1300, "epoch": 0, "train_loss": 5.477509945631027, "train_ppl": 239.250218985138, "lr": 0.00056, "grad_norm": 0.624, "tokens_per_sec": 149475, "dt_s": 4.384, "eta_s": 40369, "world_size": 1, "timestamp": "2026-05-04T20:33:52.308744"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1310, "epoch": 0, "train_loss": 5.525235295295715, "train_ppl": 250.94537734150487, "lr": 0.00056, "grad_norm": 0.8487, "tokens_per_sec": 150690, "dt_s": 4.349, "eta_s": 40237, "world_size": 1, "timestamp": "2026-05-04T20:33:56.657803"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1320, "epoch": 0, "train_loss": 5.455865532159805, "train_ppl": 234.12742821557575, "lr": 0.00056, "grad_norm": 0.8103, "tokens_per_sec": 146973, "dt_s": 4.459, "eta_s": 40341, "world_size": 1, "timestamp": "2026-05-04T20:34:01.116885"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1330, "epoch": 0, "train_loss": 5.5093288123607635, "train_ppl": 246.98529794356713, "lr": 0.00056, "grad_norm": 0.6306, "tokens_per_sec": 149735, "dt_s": 4.377, "eta_s": 40327, "world_size": 1, "timestamp": "2026-05-04T20:34:05.493653"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1340, "epoch": 0, "train_loss": 5.491524040699005, "train_ppl": 242.6266981892282, "lr": 0.00056, "grad_norm": 0.6941, "tokens_per_sec": 148943, "dt_s": 4.4, "eta_s": 40129, "world_size": 1, "timestamp": "2026-05-04T20:34:09.893750"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1350, "epoch": 0, "train_loss": 5.434653639793396, "train_ppl": 229.21344410946233, "lr": 0.00056, "grad_norm": 0.7848, "tokens_per_sec": 145989, "dt_s": 4.489, "eta_s": 40315, "world_size": 1, "timestamp": "2026-05-04T20:34:14.382837"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1360, "epoch": 0, "train_loss": 5.341572314500809, "train_ppl": 208.84081572195726, "lr": 0.00056, "grad_norm": 0.7774, "tokens_per_sec": 149275, "dt_s": 4.39, "eta_s": 40386, "world_size": 1, "timestamp": "2026-05-04T20:34:18.773112"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1370, "epoch": 0, "train_loss": 5.463345885276794, "train_ppl": 235.88535079669145, "lr": 0.00056, "grad_norm": 0.849, "tokens_per_sec": 148125, "dt_s": 4.424, "eta_s": 40319, "world_size": 1, "timestamp": "2026-05-04T20:34:23.197508"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1380, "epoch": 0, "train_loss": 5.502787262201309, "train_ppl": 245.37490420676806, "lr": 0.00056, "grad_norm": 0.6944, "tokens_per_sec": 148151, "dt_s": 4.424, "eta_s": 40400, "world_size": 1, "timestamp": "2026-05-04T20:34:27.621071"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1390, "epoch": 0, "train_loss": 5.34774374961853, "train_ppl": 210.1336484795592, "lr": 0.00056, "grad_norm": 0.5813, "tokens_per_sec": 150669, "dt_s": 4.35, "eta_s": 40303, "world_size": 1, "timestamp": "2026-05-04T20:34:31.970786"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1400, "epoch": 0, "train_loss": 5.326720803976059, "train_ppl": 205.76213228686763, "lr": 0.00056, "grad_norm": 0.6659, "tokens_per_sec": 147601, "dt_s": 4.44, "eta_s": 40209, "world_size": 1, "timestamp": "2026-05-04T20:34:36.410818"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1410, "epoch": 0, "train_loss": 5.426266133785248, "train_ppl": 227.2989550876521, "lr": 0.00056, "grad_norm": 0.802, "tokens_per_sec": 148581, "dt_s": 4.411, "eta_s": 40242, "world_size": 1, "timestamp": "2026-05-04T20:34:40.821626"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1420, "epoch": 0, "train_loss": 5.299907594919205, "train_ppl": 200.3182986909612, "lr": 0.00056, "grad_norm": 0.6575, "tokens_per_sec": 133558, "dt_s": 4.907, "eta_s": 41119, "world_size": 1, "timestamp": "2026-05-04T20:34:45.728563"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1430, "epoch": 0, "train_loss": 5.411452442407608, "train_ppl": 223.95663568178156, "lr": 0.00056, "grad_norm": 0.6408, "tokens_per_sec": 146330, "dt_s": 4.479, "eta_s": 41215, "world_size": 1, "timestamp": "2026-05-04T20:34:50.207199"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1440, "epoch": 0, "train_loss": 5.31511577963829, "train_ppl": 203.38805996171448, "lr": 0.00056, "grad_norm": 0.9606, "tokens_per_sec": 148361, "dt_s": 4.417, "eta_s": 41334, "world_size": 1, "timestamp": "2026-05-04T20:34:54.624520"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1450, "epoch": 0, "train_loss": 5.274558663368225, "train_ppl": 195.30426256385218, "lr": 0.00056, "grad_norm": 0.6223, "tokens_per_sec": 150721, "dt_s": 4.348, "eta_s": 41162, "world_size": 1, "timestamp": "2026-05-04T20:34:58.972708"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1460, "epoch": 0, "train_loss": 5.309698939323425, "train_ppl": 202.28931786082634, "lr": 0.00056, "grad_norm": 0.6286, "tokens_per_sec": 146994, "dt_s": 4.458, "eta_s": 41244, "world_size": 1, "timestamp": "2026-05-04T20:35:03.431116"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1470, "epoch": 0, "train_loss": 5.2186969220638275, "train_ppl": 184.69335735784205, "lr": 0.00056, "grad_norm": 0.5987, "tokens_per_sec": 150176, "dt_s": 4.364, "eta_s": 40249, "world_size": 1, "timestamp": "2026-05-04T20:35:07.795065"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1480, "epoch": 0, "train_loss": 5.310608357191086, "train_ppl": 202.47336705705493, "lr": 0.00056, "grad_norm": 0.6195, "tokens_per_sec": 149955, "dt_s": 4.37, "eta_s": 40047, "world_size": 1, "timestamp": "2026-05-04T20:35:12.165441"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1490, "epoch": 0, "train_loss": 5.3414303958415985, "train_ppl": 208.81117941642376, "lr": 0.00056, "grad_norm": 0.7754, "tokens_per_sec": 146070, "dt_s": 4.487, "eta_s": 40169, "world_size": 1, "timestamp": "2026-05-04T20:35:16.652059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1500, "epoch": 0, "train_loss": 5.304797142744064, "train_ppl": 201.30016307285757, "lr": 0.00056, "grad_norm": 0.6306, "tokens_per_sec": 149571, "dt_s": 4.382, "eta_s": 40226, "world_size": 1, "timestamp": "2026-05-04T20:35:21.033671"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1510, "epoch": 0, "train_loss": 5.2693502604961395, "train_ppl": 194.2896837429842, "lr": 0.00056, "grad_norm": 0.6553, "tokens_per_sec": 109917, "dt_s": 5.962, "eta_s": 40297, "world_size": 1, "timestamp": "2026-05-04T20:35:26.995975"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1520, "epoch": 0, "train_loss": 5.22486138343811, "train_ppl": 185.83540887504395, "lr": 0.00056, "grad_norm": 0.7267, "tokens_per_sec": 144584, "dt_s": 4.533, "eta_s": 40600, "world_size": 1, "timestamp": "2026-05-04T20:35:31.528701"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1530, "epoch": 0, "train_loss": 5.240292310714722, "train_ppl": 188.72526075944293, "lr": 0.00056, "grad_norm": 0.7991, "tokens_per_sec": 148040, "dt_s": 4.427, "eta_s": 40699, "world_size": 1, "timestamp": "2026-05-04T20:35:35.955612"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1540, "epoch": 0, "train_loss": 5.248469740152359, "train_ppl": 190.2748755599821, "lr": 0.00056, "grad_norm": 0.7265, "tokens_per_sec": 146504, "dt_s": 4.473, "eta_s": 40670, "world_size": 1, "timestamp": "2026-05-04T20:35:40.428926"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1550, "epoch": 0, "train_loss": 5.236302614212036, "train_ppl": 187.97380428504306, "lr": 0.00056, "grad_norm": 0.7477, "tokens_per_sec": 150144, "dt_s": 4.365, "eta_s": 40635, "world_size": 1, "timestamp": "2026-05-04T20:35:44.793797"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1560, "epoch": 0, "train_loss": 5.498194932937622, "train_ppl": 244.25064531295024, "lr": 0.00056, "grad_norm": 0.7153, "tokens_per_sec": 147610, "dt_s": 4.44, "eta_s": 40521, "world_size": 1, "timestamp": "2026-05-04T20:35:49.233622"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1570, "epoch": 0, "train_loss": 5.30110689997673, "train_ppl": 200.55868555947677, "lr": 0.00056, "grad_norm": 0.9148, "tokens_per_sec": 148325, "dt_s": 4.418, "eta_s": 40308, "world_size": 1, "timestamp": "2026-05-04T20:35:53.652044"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1580, "epoch": 0, "train_loss": 5.287310540676117, "train_ppl": 197.81070551992525, "lr": 0.00056, "grad_norm": 0.7272, "tokens_per_sec": 149358, "dt_s": 4.388, "eta_s": 40233, "world_size": 1, "timestamp": "2026-05-04T20:35:58.039887"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1590, "epoch": 0, "train_loss": 5.219916790723801, "train_ppl": 184.91879647128678, "lr": 0.00056, "grad_norm": 1.2122, "tokens_per_sec": 146050, "dt_s": 4.487, "eta_s": 40254, "world_size": 1, "timestamp": "2026-05-04T20:36:02.527140"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1600, "epoch": 0, "train_loss": 5.27328759431839, "train_ppl": 195.0561750619802, "lr": 0.00056, "grad_norm": 0.639, "tokens_per_sec": 148920, "dt_s": 4.401, "eta_s": 40315, "world_size": 1, "timestamp": "2026-05-04T20:36:06.927894"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1610, "epoch": 0, "train_loss": 5.3283205926418304, "train_ppl": 206.0915716603712, "lr": 0.00056, "grad_norm": 0.6314, "tokens_per_sec": 149897, "dt_s": 4.372, "eta_s": 40187, "world_size": 1, "timestamp": "2026-05-04T20:36:11.299985"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1620, "epoch": 0, "train_loss": 5.093294531106949, "train_ppl": 162.92574275860272, "lr": 0.00056, "grad_norm": 0.6478, "tokens_per_sec": 145708, "dt_s": 4.498, "eta_s": 40327, "world_size": 1, "timestamp": "2026-05-04T20:36:15.797731"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1630, "epoch": 0, "train_loss": 5.200967401266098, "train_ppl": 181.44768972200126, "lr": 0.00056, "grad_norm": 0.5764, "tokens_per_sec": 147934, "dt_s": 4.43, "eta_s": 40399, "world_size": 1, "timestamp": "2026-05-04T20:36:20.227802"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1640, "epoch": 0, "train_loss": 5.121986299753189, "train_ppl": 167.6680781445226, "lr": 0.00056, "grad_norm": 0.6738, "tokens_per_sec": 147639, "dt_s": 4.439, "eta_s": 40307, "world_size": 1, "timestamp": "2026-05-04T20:36:24.666725"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1650, "epoch": 0, "train_loss": 5.152184098958969, "train_ppl": 172.80850932889058, "lr": 0.00056, "grad_norm": 0.7502, "tokens_per_sec": 146595, "dt_s": 4.471, "eta_s": 40430, "world_size": 1, "timestamp": "2026-05-04T20:36:29.137264"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1660, "epoch": 0, "train_loss": 5.149110287427902, "train_ppl": 172.27814407924717, "lr": 0.00056, "grad_norm": 0.6483, "tokens_per_sec": 150098, "dt_s": 4.366, "eta_s": 40414, "world_size": 1, "timestamp": "2026-05-04T20:36:33.503474"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1670, "epoch": 0, "train_loss": 5.113145291805267, "train_ppl": 166.1922568199886, "lr": 0.00056, "grad_norm": 0.7066, "tokens_per_sec": 150098, "dt_s": 4.366, "eta_s": 40171, "world_size": 1, "timestamp": "2026-05-04T20:36:37.869705"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1680, "epoch": 0, "train_loss": 5.010351359844208, "train_ppl": 149.95741590802004, "lr": 0.00056, "grad_norm": 0.6484, "tokens_per_sec": 147886, "dt_s": 4.432, "eta_s": 40169, "world_size": 1, "timestamp": "2026-05-04T20:36:42.301240"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1690, "epoch": 0, "train_loss": 5.1066476702690125, "train_ppl": 165.11590308939304, "lr": 0.00056, "grad_norm": 0.6673, "tokens_per_sec": 149538, "dt_s": 4.383, "eta_s": 40062, "world_size": 1, "timestamp": "2026-05-04T20:36:46.683826"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1700, "epoch": 0, "train_loss": 5.160762131214142, "train_ppl": 174.29724236716217, "lr": 0.00056, "grad_norm": 0.6313, "tokens_per_sec": 147349, "dt_s": 4.448, "eta_s": 40016, "world_size": 1, "timestamp": "2026-05-04T20:36:51.131475"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1710, "epoch": 0, "train_loss": 5.049985945224762, "train_ppl": 156.02027164113457, "lr": 0.00056, "grad_norm": 0.6158, "tokens_per_sec": 130508, "dt_s": 5.022, "eta_s": 41204, "world_size": 1, "timestamp": "2026-05-04T20:36:56.153061"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1720, "epoch": 0, "train_loss": 5.093592405319214, "train_ppl": 162.97428136472527, "lr": 0.00056, "grad_norm": 0.6764, "tokens_per_sec": 149377, "dt_s": 4.387, "eta_s": 41238, "world_size": 1, "timestamp": "2026-05-04T20:37:00.540386"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1730, "epoch": 0, "train_loss": 5.115907192230225, "train_ppl": 166.65189773375988, "lr": 0.00056, "grad_norm": 0.639, "tokens_per_sec": 146853, "dt_s": 4.463, "eta_s": 41290, "world_size": 1, "timestamp": "2026-05-04T20:37:05.003057"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1740, "epoch": 0, "train_loss": 5.066944420337677, "train_ppl": 158.68869982053508, "lr": 0.00056, "grad_norm": 0.6805, "tokens_per_sec": 147528, "dt_s": 4.442, "eta_s": 41394, "world_size": 1, "timestamp": "2026-05-04T20:37:09.445335"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1750, "epoch": 0, "train_loss": 5.15871986746788, "train_ppl": 173.94164466374644, "lr": 0.00056, "grad_norm": 0.6709, "tokens_per_sec": 149921, "dt_s": 4.371, "eta_s": 41251, "world_size": 1, "timestamp": "2026-05-04T20:37:13.816689"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1760, "epoch": 0, "train_loss": 5.071189701557159, "train_ppl": 159.36380997994183, "lr": 0.00056, "grad_norm": 0.6677, "tokens_per_sec": 147122, "dt_s": 4.455, "eta_s": 40215, "world_size": 1, "timestamp": "2026-05-04T20:37:18.271229"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1770, "epoch": 0, "train_loss": 4.959928929805756, "train_ppl": 142.58366208832678, "lr": 0.00056, "grad_norm": 0.5807, "tokens_per_sec": 148956, "dt_s": 4.4, "eta_s": 40233, "world_size": 1, "timestamp": "2026-05-04T20:37:22.670915"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1780, "epoch": 0, "train_loss": 5.004991769790649, "train_ppl": 149.15585556941903, "lr": 0.00056, "grad_norm": 0.6096, "tokens_per_sec": 148514, "dt_s": 4.413, "eta_s": 40138, "world_size": 1, "timestamp": "2026-05-04T20:37:27.083712"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1790, "epoch": 0, "train_loss": 5.1493527591228485, "train_ppl": 172.31992171758696, "lr": 0.00056, "grad_norm": 0.7377, "tokens_per_sec": 145835, "dt_s": 4.494, "eta_s": 40227, "world_size": 1, "timestamp": "2026-05-04T20:37:31.577552"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1800, "epoch": 0, "train_loss": 5.0667319893836975, "train_ppl": 158.65499300895826, "lr": 0.00056, "grad_norm": 0.6464, "tokens_per_sec": 149857, "dt_s": 4.373, "eta_s": 40226, "world_size": 1, "timestamp": "2026-05-04T20:37:35.950802"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1810, "epoch": 0, "train_loss": 5.127521187067032, "train_ppl": 168.5986750619841, "lr": 0.00056, "grad_norm": 0.6807, "tokens_per_sec": 148657, "dt_s": 4.409, "eta_s": 40138, "world_size": 1, "timestamp": "2026-05-04T20:37:40.359332"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1820, "epoch": 0, "train_loss": 5.038280010223389, "train_ppl": 154.20455653763986, "lr": 0.00056, "grad_norm": 0.6342, "tokens_per_sec": 145535, "dt_s": 4.503, "eta_s": 40322, "world_size": 1, "timestamp": "2026-05-04T20:37:44.862425"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1830, "epoch": 0, "train_loss": 5.0841885805130005, "train_ppl": 161.44888330405416, "lr": 0.00056, "grad_norm": 0.7276, "tokens_per_sec": 147418, "dt_s": 4.446, "eta_s": 40377, "world_size": 1, "timestamp": "2026-05-04T20:37:49.308029"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1840, "epoch": 0, "train_loss": 4.9834849536418915, "train_ppl": 145.98223754288284, "lr": 0.00056, "grad_norm": 0.6123, "tokens_per_sec": 145627, "dt_s": 4.5, "eta_s": 40384, "world_size": 1, "timestamp": "2026-05-04T20:37:53.808286"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1850, "epoch": 0, "train_loss": 5.053500026464462, "train_ppl": 156.5695040091068, "lr": 0.00056, "grad_norm": 0.6822, "tokens_per_sec": 147398, "dt_s": 4.446, "eta_s": 40512, "world_size": 1, "timestamp": "2026-05-04T20:37:58.254477"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1860, "epoch": 0, "train_loss": 4.960765212774277, "train_ppl": 142.70295224973344, "lr": 0.00056, "grad_norm": 0.6614, "tokens_per_sec": 148394, "dt_s": 4.416, "eta_s": 40522, "world_size": 1, "timestamp": "2026-05-04T20:38:02.670840"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1870, "epoch": 0, "train_loss": 4.973268002271652, "train_ppl": 144.4983374964832, "lr": 0.00056, "grad_norm": 0.6331, "tokens_per_sec": 147895, "dt_s": 4.431, "eta_s": 40387, "world_size": 1, "timestamp": "2026-05-04T20:38:07.102106"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1880, "epoch": 0, "train_loss": 4.912342220544815, "train_ppl": 135.9574841809465, "lr": 0.00056, "grad_norm": 0.6641, "tokens_per_sec": 149091, "dt_s": 4.396, "eta_s": 40292, "world_size": 1, "timestamp": "2026-05-04T20:38:11.497790"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1890, "epoch": 0, "train_loss": 4.907777726650238, "train_ppl": 135.33832123223098, "lr": 0.00056, "grad_norm": 0.6391, "tokens_per_sec": 151175, "dt_s": 4.335, "eta_s": 39987, "world_size": 1, "timestamp": "2026-05-04T20:38:15.832875"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1900, "epoch": 0, "train_loss": 5.206290066242218, "train_ppl": 182.416049827093, "lr": 0.00056, "grad_norm": 1.1517, "tokens_per_sec": 147367, "dt_s": 4.447, "eta_s": 39985, "world_size": 1, "timestamp": "2026-05-04T20:38:20.280003"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1910, "epoch": 0, "train_loss": 5.108527421951294, "train_ppl": 165.42657188461382, "lr": 0.00056, "grad_norm": 0.6877, "tokens_per_sec": 149079, "dt_s": 4.396, "eta_s": 39944, "world_size": 1, "timestamp": "2026-05-04T20:38:24.676066"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1920, "epoch": 0, "train_loss": 5.101145893335342, "train_ppl": 164.20996663771984, "lr": 0.00056, "grad_norm": 0.7235, "tokens_per_sec": 148270, "dt_s": 4.42, "eta_s": 39919, "world_size": 1, "timestamp": "2026-05-04T20:38:29.096139"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1930, "epoch": 0, "train_loss": 4.960522949695587, "train_ppl": 142.66838478056314, "lr": 0.00056, "grad_norm": 0.6893, "tokens_per_sec": 146997, "dt_s": 4.458, "eta_s": 40028, "world_size": 1, "timestamp": "2026-05-04T20:38:33.554449"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1940, "epoch": 0, "train_loss": 4.955588400363922, "train_ppl": 141.9661147159429, "lr": 0.00056, "grad_norm": 0.7172, "tokens_per_sec": 147707, "dt_s": 4.437, "eta_s": 40208, "world_size": 1, "timestamp": "2026-05-04T20:38:37.991348"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1950, "epoch": 0, "train_loss": 4.922532439231873, "train_ppl": 137.35000366595256, "lr": 0.00056, "grad_norm": 0.6085, "tokens_per_sec": 147557, "dt_s": 4.441, "eta_s": 40193, "world_size": 1, "timestamp": "2026-05-04T20:38:42.432743"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1960, "epoch": 0, "train_loss": 5.011225342750549, "train_ppl": 150.08853341508663, "lr": 0.00056, "grad_norm": 0.6863, "tokens_per_sec": 150046, "dt_s": 4.368, "eta_s": 40138, "world_size": 1, "timestamp": "2026-05-04T20:38:46.800484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1970, "epoch": 0, "train_loss": 4.880510061979294, "train_ppl": 131.69782080842805, "lr": 0.00056, "grad_norm": 0.6235, "tokens_per_sec": 148178, "dt_s": 4.423, "eta_s": 40138, "world_size": 1, "timestamp": "2026-05-04T20:38:51.223250"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1980, "epoch": 0, "train_loss": 4.961283892393112, "train_ppl": 142.7769885615162, "lr": 0.00056, "grad_norm": 0.6295, "tokens_per_sec": 146498, "dt_s": 4.474, "eta_s": 40161, "world_size": 1, "timestamp": "2026-05-04T20:38:55.696771"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 1990, "epoch": 0, "train_loss": 4.902181565761566, "train_ppl": 134.58306146084857, "lr": 0.00056, "grad_norm": 0.6991, "tokens_per_sec": 148773, "dt_s": 4.405, "eta_s": 40099, "world_size": 1, "timestamp": "2026-05-04T20:39:00.101840"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2000, "epoch": 0, "train_loss": 4.989797383546829, "train_ppl": 146.90665477464267, "lr": 0.00056, "grad_norm": 0.5958, "tokens_per_sec": 148450, "dt_s": 4.415, "eta_s": 40046, "world_size": 1, "timestamp": "2026-05-04T20:39:04.516529"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2010, "epoch": 0, "train_loss": 4.91170135140419, "train_ppl": 135.87038113870503, "lr": 0.00056, "grad_norm": 0.6894, "tokens_per_sec": 98107, "dt_s": 6.68, "eta_s": 41400, "world_size": 1, "timestamp": "2026-05-04T20:39:11.196618"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2020, "epoch": 0, "train_loss": 4.873417496681213, "train_ppl": 130.76705009259416, "lr": 0.00056, "grad_norm": 0.6895, "tokens_per_sec": 148489, "dt_s": 4.414, "eta_s": 41379, "world_size": 1, "timestamp": "2026-05-04T20:39:15.610149"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2030, "epoch": 0, "train_loss": 4.94978129863739, "train_ppl": 141.14409214047416, "lr": 0.00056, "grad_norm": 0.8881, "tokens_per_sec": 147025, "dt_s": 4.457, "eta_s": 41345, "world_size": 1, "timestamp": "2026-05-04T20:39:20.067634"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2040, "epoch": 0, "train_loss": 4.85649374127388, "train_ppl": 128.57260204884005, "lr": 0.00056, "grad_norm": 0.6215, "tokens_per_sec": 149383, "dt_s": 4.387, "eta_s": 41308, "world_size": 1, "timestamp": "2026-05-04T20:39:24.454748"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2050, "epoch": 0, "train_loss": 4.76366126537323, "train_ppl": 117.17414715241932, "lr": 0.00056, "grad_norm": 0.678, "tokens_per_sec": 146425, "dt_s": 4.476, "eta_s": 41414, "world_size": 1, "timestamp": "2026-05-04T20:39:28.930484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2060, "epoch": 0, "train_loss": 4.884895235300064, "train_ppl": 132.2766066893507, "lr": 0.00056, "grad_norm": 0.6866, "tokens_per_sec": 145840, "dt_s": 4.494, "eta_s": 40281, "world_size": 1, "timestamp": "2026-05-04T20:39:33.424168"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2070, "epoch": 0, "train_loss": 4.75858548283577, "train_ppl": 116.58090352395836, "lr": 0.00056, "grad_norm": 0.7096, "tokens_per_sec": 146950, "dt_s": 4.46, "eta_s": 40360, "world_size": 1, "timestamp": "2026-05-04T20:39:37.883933"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2080, "epoch": 0, "train_loss": 4.908489286899567, "train_ppl": 135.43465687207194, "lr": 0.00056, "grad_norm": 0.6863, "tokens_per_sec": 146221, "dt_s": 4.482, "eta_s": 40400, "world_size": 1, "timestamp": "2026-05-04T20:39:42.365913"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2090, "epoch": 0, "train_loss": 4.7211754322052, "train_ppl": 112.30017636288663, "lr": 0.00056, "grad_norm": 0.5873, "tokens_per_sec": 147566, "dt_s": 4.441, "eta_s": 40493, "world_size": 1, "timestamp": "2026-05-04T20:39:46.807050"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2100, "epoch": 0, "train_loss": 4.813448488712311, "train_ppl": 123.15558672126828, "lr": 0.00056, "grad_norm": 0.6483, "tokens_per_sec": 147647, "dt_s": 4.439, "eta_s": 40422, "world_size": 1, "timestamp": "2026-05-04T20:39:51.245764"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2110, "epoch": 0, "train_loss": 4.809849262237549, "train_ppl": 122.71311862213469, "lr": 0.00056, "grad_norm": 0.6329, "tokens_per_sec": 144592, "dt_s": 4.532, "eta_s": 40488, "world_size": 1, "timestamp": "2026-05-04T20:39:55.778248"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2120, "epoch": 0, "train_loss": 4.85774353146553, "train_ppl": 128.7333912812694, "lr": 0.00056, "grad_norm": 0.6025, "tokens_per_sec": 147946, "dt_s": 4.43, "eta_s": 40429, "world_size": 1, "timestamp": "2026-05-04T20:40:00.207954"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2130, "epoch": 0, "train_loss": 4.885583758354187, "train_ppl": 132.367713543563, "lr": 0.00056, "grad_norm": 0.6158, "tokens_per_sec": 148791, "dt_s": 4.405, "eta_s": 40284, "world_size": 1, "timestamp": "2026-05-04T20:40:04.612534"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2140, "epoch": 0, "train_loss": 4.720530778169632, "train_ppl": 112.22780493077349, "lr": 0.00056, "grad_norm": 0.6429, "tokens_per_sec": 145184, "dt_s": 4.514, "eta_s": 40411, "world_size": 1, "timestamp": "2026-05-04T20:40:09.126528"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2150, "epoch": 0, "train_loss": 4.822426974773407, "train_ppl": 124.26631631345856, "lr": 0.00056, "grad_norm": 0.594, "tokens_per_sec": 148970, "dt_s": 4.399, "eta_s": 40336, "world_size": 1, "timestamp": "2026-05-04T20:40:13.525830"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2160, "epoch": 0, "train_loss": 4.822132498025894, "train_ppl": 124.22972816024975, "lr": 0.00056, "grad_norm": 0.6362, "tokens_per_sec": 147562, "dt_s": 4.441, "eta_s": 40166, "world_size": 1, "timestamp": "2026-05-04T20:40:17.967084"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2170, "epoch": 0, "train_loss": 4.796190291643143, "train_ppl": 121.04837894061548, "lr": 0.00056, "grad_norm": 0.5819, "tokens_per_sec": 144900, "dt_s": 4.523, "eta_s": 40330, "world_size": 1, "timestamp": "2026-05-04T20:40:22.489911"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2180, "epoch": 0, "train_loss": 4.700107544660568, "train_ppl": 109.95899731930457, "lr": 0.00056, "grad_norm": 0.5922, "tokens_per_sec": 149148, "dt_s": 4.394, "eta_s": 40307, "world_size": 1, "timestamp": "2026-05-04T20:40:26.883941"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2190, "epoch": 0, "train_loss": 4.756842225790024, "train_ppl": 116.37785008108563, "lr": 0.00056, "grad_norm": 0.618, "tokens_per_sec": 147755, "dt_s": 4.435, "eta_s": 40160, "world_size": 1, "timestamp": "2026-05-04T20:40:31.319391"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2200, "epoch": 0, "train_loss": 4.840478986501694, "train_ppl": 126.52994335259766, "lr": 0.00056, "grad_norm": 0.6393, "tokens_per_sec": 148023, "dt_s": 4.427, "eta_s": 40207, "world_size": 1, "timestamp": "2026-05-04T20:40:35.746839"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2210, "epoch": 0, "train_loss": 4.852214813232422, "train_ppl": 128.02362449191384, "lr": 0.00056, "grad_norm": 0.621, "tokens_per_sec": 149580, "dt_s": 4.381, "eta_s": 40094, "world_size": 1, "timestamp": "2026-05-04T20:40:40.128156"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2220, "epoch": 0, "train_loss": 4.854077905416489, "train_ppl": 128.26236663633384, "lr": 0.00056, "grad_norm": 0.6344, "tokens_per_sec": 145052, "dt_s": 4.518, "eta_s": 40081, "world_size": 1, "timestamp": "2026-05-04T20:40:44.646273"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2230, "epoch": 0, "train_loss": 4.763171404600143, "train_ppl": 117.11676219057789, "lr": 0.00056, "grad_norm": 0.6444, "tokens_per_sec": 148885, "dt_s": 4.402, "eta_s": 40090, "world_size": 1, "timestamp": "2026-05-04T20:40:49.048040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2240, "epoch": 0, "train_loss": 4.845648616552353, "train_ppl": 127.1857500284714, "lr": 0.00056, "grad_norm": 0.585, "tokens_per_sec": 148861, "dt_s": 4.403, "eta_s": 40026, "world_size": 1, "timestamp": "2026-05-04T20:40:53.450574"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2250, "epoch": 0, "train_loss": 4.894436746835709, "train_ppl": 133.5447659147788, "lr": 0.00056, "grad_norm": 0.6903, "tokens_per_sec": 146480, "dt_s": 4.474, "eta_s": 40106, "world_size": 1, "timestamp": "2026-05-04T20:40:57.924639"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2260, "epoch": 0, "train_loss": 4.660080671310425, "train_ppl": 105.64460430157689, "lr": 0.00056, "grad_norm": 0.5593, "tokens_per_sec": 150065, "dt_s": 4.367, "eta_s": 40076, "world_size": 1, "timestamp": "2026-05-04T20:41:02.291798"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2270, "epoch": 0, "train_loss": 4.697937160730362, "train_ppl": 109.72060287585677, "lr": 0.00056, "grad_norm": 0.6165, "tokens_per_sec": 148609, "dt_s": 4.41, "eta_s": 39876, "world_size": 1, "timestamp": "2026-05-04T20:41:06.701772"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2280, "epoch": 0, "train_loss": 4.70198467373848, "train_ppl": 110.16559839833191, "lr": 0.00056, "grad_norm": 0.6056, "tokens_per_sec": 146974, "dt_s": 4.459, "eta_s": 39975, "world_size": 1, "timestamp": "2026-05-04T20:41:11.160795"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2290, "epoch": 0, "train_loss": 4.690825670957565, "train_ppl": 108.94309383121255, "lr": 0.00056, "grad_norm": 0.6418, "tokens_per_sec": 149614, "dt_s": 4.38, "eta_s": 39931, "world_size": 1, "timestamp": "2026-05-04T20:41:15.541118"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2300, "epoch": 0, "train_loss": 4.705232322216034, "train_ppl": 110.5239591361256, "lr": 0.00056, "grad_norm": 0.6637, "tokens_per_sec": 131622, "dt_s": 4.979, "eta_s": 40839, "world_size": 1, "timestamp": "2026-05-04T20:41:20.520248"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2310, "epoch": 0, "train_loss": 5.0447534918785095, "train_ppl": 155.20603493421234, "lr": 0.00056, "grad_norm": 1.2264, "tokens_per_sec": 150611, "dt_s": 4.351, "eta_s": 40806, "world_size": 1, "timestamp": "2026-05-04T20:41:24.871570"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2320, "epoch": 0, "train_loss": 4.809573769569397, "train_ppl": 122.67931671397095, "lr": 0.00056, "grad_norm": 0.7386, "tokens_per_sec": 150340, "dt_s": 4.359, "eta_s": 40710, "world_size": 1, "timestamp": "2026-05-04T20:41:29.230774"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2330, "epoch": 0, "train_loss": 4.786202758550644, "train_ppl": 119.84542153823404, "lr": 0.00056, "grad_norm": 0.6555, "tokens_per_sec": 147495, "dt_s": 4.443, "eta_s": 40677, "world_size": 1, "timestamp": "2026-05-04T20:41:33.674053"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2340, "epoch": 0, "train_loss": 4.729464203119278, "train_ppl": 113.23487519919833, "lr": 0.00056, "grad_norm": 0.6255, "tokens_per_sec": 149284, "dt_s": 4.39, "eta_s": 40690, "world_size": 1, "timestamp": "2026-05-04T20:41:38.064076"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2350, "epoch": 0, "train_loss": 4.745012521743774, "train_ppl": 115.00924561039825, "lr": 0.00056, "grad_norm": 0.5538, "tokens_per_sec": 148535, "dt_s": 4.412, "eta_s": 39661, "world_size": 1, "timestamp": "2026-05-04T20:41:42.476235"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2360, "epoch": 0, "train_loss": 4.661291480064392, "train_ppl": 105.77259718508948, "lr": 0.00056, "grad_norm": 0.6048, "tokens_per_sec": 145844, "dt_s": 4.494, "eta_s": 39914, "world_size": 1, "timestamp": "2026-05-04T20:41:46.969797"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2370, "epoch": 0, "train_loss": 4.757176488637924, "train_ppl": 116.41675737495541, "lr": 0.00056, "grad_norm": 0.6173, "tokens_per_sec": 148491, "dt_s": 4.413, "eta_s": 40007, "world_size": 1, "timestamp": "2026-05-04T20:41:51.383267"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2380, "epoch": 0, "train_loss": 4.676348239183426, "train_ppl": 107.37723974549253, "lr": 0.00056, "grad_norm": 0.6725, "tokens_per_sec": 148913, "dt_s": 4.401, "eta_s": 39926, "world_size": 1, "timestamp": "2026-05-04T20:41:55.784214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2390, "epoch": 0, "train_loss": 4.754651188850403, "train_ppl": 116.1231410529463, "lr": 0.00056, "grad_norm": 0.5563, "tokens_per_sec": 147959, "dt_s": 4.429, "eta_s": 39993, "world_size": 1, "timestamp": "2026-05-04T20:42:00.213526"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2400, "epoch": 0, "train_loss": 4.782497674226761, "train_ppl": 119.40220572860359, "lr": 0.00056, "grad_norm": 0.6916, "tokens_per_sec": 150005, "dt_s": 4.369, "eta_s": 39910, "world_size": 1, "timestamp": "2026-05-04T20:42:04.582439"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2410, "epoch": 0, "train_loss": 4.653554141521454, "train_ppl": 104.9573567548263, "lr": 0.00056, "grad_norm": 0.6096, "tokens_per_sec": 148954, "dt_s": 4.4, "eta_s": 39737, "world_size": 1, "timestamp": "2026-05-04T20:42:08.982204"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2420, "epoch": 0, "train_loss": 4.763200342655182, "train_ppl": 117.120151370926, "lr": 0.00056, "grad_norm": 0.5708, "tokens_per_sec": 147383, "dt_s": 4.447, "eta_s": 39792, "world_size": 1, "timestamp": "2026-05-04T20:42:13.428841"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2430, "epoch": 0, "train_loss": 4.67936447262764, "train_ppl": 107.70160349964927, "lr": 0.00056, "grad_norm": 0.6398, "tokens_per_sec": 149651, "dt_s": 4.379, "eta_s": 39748, "world_size": 1, "timestamp": "2026-05-04T20:42:17.808093"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2440, "epoch": 0, "train_loss": 4.760272979736328, "train_ppl": 116.77779952128778, "lr": 0.00056, "grad_norm": 0.6937, "tokens_per_sec": 147024, "dt_s": 4.457, "eta_s": 39795, "world_size": 1, "timestamp": "2026-05-04T20:42:22.265572"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2450, "epoch": 0, "train_loss": 4.599454551935196, "train_ppl": 99.43006691076626, "lr": 0.00056, "grad_norm": 0.6039, "tokens_per_sec": 149250, "dt_s": 4.391, "eta_s": 39830, "world_size": 1, "timestamp": "2026-05-04T20:42:26.656608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2460, "epoch": 0, "train_loss": 4.736077547073364, "train_ppl": 113.98621808253384, "lr": 0.00056, "grad_norm": 0.6596, "tokens_per_sec": 149896, "dt_s": 4.372, "eta_s": 39776, "world_size": 1, "timestamp": "2026-05-04T20:42:31.028712"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2470, "epoch": 0, "train_loss": 4.8404639065265656, "train_ppl": 126.52803529858566, "lr": 0.00056, "grad_norm": 0.7337, "tokens_per_sec": 145338, "dt_s": 4.509, "eta_s": 39885, "world_size": 1, "timestamp": "2026-05-04T20:42:35.537911"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2480, "epoch": 0, "train_loss": 4.683907061815262, "train_ppl": 108.19196054127976, "lr": 0.00056, "grad_norm": 0.6153, "tokens_per_sec": 150669, "dt_s": 4.35, "eta_s": 39827, "world_size": 1, "timestamp": "2026-05-04T20:42:39.887583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2490, "epoch": 0, "train_loss": 4.7080395221710205, "train_ppl": 110.83465788195502, "lr": 0.00056, "grad_norm": 0.5993, "tokens_per_sec": 151141, "dt_s": 4.336, "eta_s": 39603, "world_size": 1, "timestamp": "2026-05-04T20:42:44.223675"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2500, "epoch": 0, "train_loss": 4.667839229106903, "train_ppl": 106.4674419591281, "lr": 0.00056, "grad_norm": 0.5957, "tokens_per_sec": 146857, "dt_s": 4.463, "eta_s": 39728, "world_size": 1, "timestamp": "2026-05-04T20:42:48.686256"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2510, "epoch": 0, "train_loss": 4.695744514465332, "train_ppl": 109.48028796484034, "lr": 0.00056, "grad_norm": 0.6705, "tokens_per_sec": 111138, "dt_s": 5.897, "eta_s": 39771, "world_size": 1, "timestamp": "2026-05-04T20:42:54.583048"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2520, "epoch": 0, "train_loss": 4.694962859153748, "train_ppl": 109.39474555293457, "lr": 0.00056, "grad_norm": 0.6662, "tokens_per_sec": 139621, "dt_s": 4.694, "eta_s": 40100, "world_size": 1, "timestamp": "2026-05-04T20:42:59.276869"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2530, "epoch": 0, "train_loss": 4.686710745096207, "train_ppl": 108.4957221588863, "lr": 0.00056, "grad_norm": 0.6153, "tokens_per_sec": 147324, "dt_s": 4.448, "eta_s": 40273, "world_size": 1, "timestamp": "2026-05-04T20:43:03.725315"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2540, "epoch": 0, "train_loss": 4.650404930114746, "train_ppl": 104.62734376279836, "lr": 0.00056, "grad_norm": 0.6701, "tokens_per_sec": 149850, "dt_s": 4.373, "eta_s": 40336, "world_size": 1, "timestamp": "2026-05-04T20:43:08.098748"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2550, "epoch": 0, "train_loss": 4.686528921127319, "train_ppl": 108.47599682939926, "lr": 0.00056, "grad_norm": 1.0349, "tokens_per_sec": 146907, "dt_s": 4.461, "eta_s": 40329, "world_size": 1, "timestamp": "2026-05-04T20:43:12.559789"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2560, "epoch": 0, "train_loss": 4.8516334891319275, "train_ppl": 127.94922290137814, "lr": 0.00056, "grad_norm": 0.6444, "tokens_per_sec": 150406, "dt_s": 4.357, "eta_s": 40250, "world_size": 1, "timestamp": "2026-05-04T20:43:16.917041"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2570, "epoch": 0, "train_loss": 4.633613705635071, "train_ppl": 102.88518992749817, "lr": 0.00056, "grad_norm": 0.6343, "tokens_per_sec": 149186, "dt_s": 4.393, "eta_s": 39704, "world_size": 1, "timestamp": "2026-05-04T20:43:21.309944"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2580, "epoch": 0, "train_loss": 4.702022463083267, "train_ppl": 110.16976156277464, "lr": 0.00056, "grad_norm": 0.6656, "tokens_per_sec": 147061, "dt_s": 4.456, "eta_s": 39713, "world_size": 1, "timestamp": "2026-05-04T20:43:25.766350"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2590, "epoch": 0, "train_loss": 4.700495153665543, "train_ppl": 110.00162667807143, "lr": 0.00056, "grad_norm": 0.6883, "tokens_per_sec": 150034, "dt_s": 4.368, "eta_s": 39699, "world_size": 1, "timestamp": "2026-05-04T20:43:30.134422"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2600, "epoch": 0, "train_loss": 4.659658998250961, "train_ppl": 105.60006620898206, "lr": 0.00056, "grad_norm": 0.5993, "tokens_per_sec": 133061, "dt_s": 4.925, "eta_s": 40531, "world_size": 1, "timestamp": "2026-05-04T20:43:35.059688"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2610, "epoch": 0, "train_loss": 4.689574509859085, "train_ppl": 108.80687370469438, "lr": 0.00056, "grad_norm": 0.6491, "tokens_per_sec": 148460, "dt_s": 4.414, "eta_s": 40630, "world_size": 1, "timestamp": "2026-05-04T20:43:39.474079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2620, "epoch": 0, "train_loss": 4.604752421379089, "train_ppl": 99.95823226424817, "lr": 0.00056, "grad_norm": 0.6225, "tokens_per_sec": 150023, "dt_s": 4.368, "eta_s": 40581, "world_size": 1, "timestamp": "2026-05-04T20:43:43.842492"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2630, "epoch": 0, "train_loss": 4.680998504161835, "train_ppl": 107.8777351792202, "lr": 0.00056, "grad_norm": 0.6348, "tokens_per_sec": 148071, "dt_s": 4.426, "eta_s": 40522, "world_size": 1, "timestamp": "2026-05-04T20:43:48.268475"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2640, "epoch": 0, "train_loss": 4.628092139959335, "train_ppl": 102.31866807738174, "lr": 0.00056, "grad_norm": 0.6067, "tokens_per_sec": 148518, "dt_s": 4.413, "eta_s": 40598, "world_size": 1, "timestamp": "2026-05-04T20:43:52.681131"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2650, "epoch": 0, "train_loss": 4.623812317848206, "train_ppl": 101.8816981231923, "lr": 0.00056, "grad_norm": 0.648, "tokens_per_sec": 149557, "dt_s": 4.382, "eta_s": 39615, "world_size": 1, "timestamp": "2026-05-04T20:43:57.063151"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2660, "epoch": 0, "train_loss": 4.734550356864929, "train_ppl": 113.81227230433424, "lr": 0.00056, "grad_norm": 0.6301, "tokens_per_sec": 147042, "dt_s": 4.457, "eta_s": 39687, "world_size": 1, "timestamp": "2026-05-04T20:44:01.520114"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2670, "epoch": 0, "train_loss": 4.628888696432114, "train_ppl": 102.40020314405622, "lr": 0.00056, "grad_norm": 0.6222, "tokens_per_sec": 148244, "dt_s": 4.421, "eta_s": 39777, "world_size": 1, "timestamp": "2026-05-04T20:44:05.940928"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2680, "epoch": 0, "train_loss": 4.717226177453995, "train_ppl": 111.85754895730709, "lr": 0.00056, "grad_norm": 0.6156, "tokens_per_sec": 150363, "dt_s": 4.359, "eta_s": 39651, "world_size": 1, "timestamp": "2026-05-04T20:44:10.299444"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2690, "epoch": 0, "train_loss": 4.536201179027557, "train_ppl": 93.3355607157099, "lr": 0.00056, "grad_norm": 0.8126, "tokens_per_sec": 146702, "dt_s": 4.467, "eta_s": 39745, "world_size": 1, "timestamp": "2026-05-04T20:44:14.766746"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2700, "epoch": 0, "train_loss": 4.575514882802963, "train_ppl": 97.07801006917306, "lr": 0.00056, "grad_norm": 0.6395, "tokens_per_sec": 149815, "dt_s": 4.374, "eta_s": 39727, "world_size": 1, "timestamp": "2026-05-04T20:44:19.141210"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2710, "epoch": 0, "train_loss": 4.623826622962952, "train_ppl": 101.88315556299887, "lr": 0.00056, "grad_norm": 0.5925, "tokens_per_sec": 148841, "dt_s": 4.403, "eta_s": 39626, "world_size": 1, "timestamp": "2026-05-04T20:44:23.544314"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2720, "epoch": 0, "train_loss": 4.567667245864868, "train_ppl": 96.31915858147089, "lr": 0.00056, "grad_norm": 0.627, "tokens_per_sec": 145865, "dt_s": 4.493, "eta_s": 39751, "world_size": 1, "timestamp": "2026-05-04T20:44:28.037262"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2730, "epoch": 0, "train_loss": 4.673119008541107, "train_ppl": 107.03105313166355, "lr": 0.00056, "grad_norm": 0.7025, "tokens_per_sec": 149092, "dt_s": 4.396, "eta_s": 39813, "world_size": 1, "timestamp": "2026-05-04T20:44:32.432894"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2740, "epoch": 0, "train_loss": 4.593325585126877, "train_ppl": 98.82252702862043, "lr": 0.00056, "grad_norm": 0.6316, "tokens_per_sec": 147422, "dt_s": 4.445, "eta_s": 39770, "world_size": 1, "timestamp": "2026-05-04T20:44:36.878403"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2750, "epoch": 0, "train_loss": 4.5441475212574005, "train_ppl": 94.08019165122964, "lr": 0.00056, "grad_norm": 0.6233, "tokens_per_sec": 149135, "dt_s": 4.394, "eta_s": 39801, "world_size": 1, "timestamp": "2026-05-04T20:44:41.272811"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2760, "epoch": 0, "train_loss": 4.619656145572662, "train_ppl": 101.45913895667931, "lr": 0.00056, "grad_norm": 0.6174, "tokens_per_sec": 150165, "dt_s": 4.364, "eta_s": 39727, "world_size": 1, "timestamp": "2026-05-04T20:44:45.637037"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2770, "epoch": 0, "train_loss": 4.544961303472519, "train_ppl": 94.15678359834567, "lr": 0.00056, "grad_norm": 0.6619, "tokens_per_sec": 149069, "dt_s": 4.396, "eta_s": 39549, "world_size": 1, "timestamp": "2026-05-04T20:44:50.033425"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2780, "epoch": 0, "train_loss": 4.557483524084091, "train_ppl": 95.3432486998169, "lr": 0.00056, "grad_norm": 0.5991, "tokens_per_sec": 149215, "dt_s": 4.392, "eta_s": 39538, "world_size": 1, "timestamp": "2026-05-04T20:44:54.425464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2790, "epoch": 0, "train_loss": 4.513565123081207, "train_ppl": 91.24654448437737, "lr": 0.00056, "grad_norm": 0.6966, "tokens_per_sec": 151320, "dt_s": 4.331, "eta_s": 39328, "world_size": 1, "timestamp": "2026-05-04T20:44:58.756439"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2800, "epoch": 0, "train_loss": 4.5987690687179565, "train_ppl": 99.36193262373303, "lr": 0.00056, "grad_norm": 0.6128, "tokens_per_sec": 148184, "dt_s": 4.423, "eta_s": 39374, "world_size": 1, "timestamp": "2026-05-04T20:45:03.179027"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2810, "epoch": 0, "train_loss": 4.559743374586105, "train_ppl": 95.55895382712582, "lr": 0.00056, "grad_norm": 0.5996, "tokens_per_sec": 149309, "dt_s": 4.389, "eta_s": 39415, "world_size": 1, "timestamp": "2026-05-04T20:45:07.568339"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2820, "epoch": 0, "train_loss": 4.6369359493255615, "train_ppl": 103.22756801739337, "lr": 0.00056, "grad_norm": 0.6492, "tokens_per_sec": 148733, "dt_s": 4.406, "eta_s": 39428, "world_size": 1, "timestamp": "2026-05-04T20:45:11.974606"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2830, "epoch": 0, "train_loss": 4.578732132911682, "train_ppl": 97.39083725944667, "lr": 0.00056, "grad_norm": 0.5596, "tokens_per_sec": 148590, "dt_s": 4.411, "eta_s": 39457, "world_size": 1, "timestamp": "2026-05-04T20:45:16.385128"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2840, "epoch": 0, "train_loss": 4.707865446805954, "train_ppl": 110.81536597759356, "lr": 0.00056, "grad_norm": 0.7921, "tokens_per_sec": 149057, "dt_s": 4.397, "eta_s": 39571, "world_size": 1, "timestamp": "2026-05-04T20:45:20.781853"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2850, "epoch": 0, "train_loss": 4.577597051858902, "train_ppl": 97.28035348124308, "lr": 0.00056, "grad_norm": 0.5665, "tokens_per_sec": 148835, "dt_s": 4.403, "eta_s": 39531, "world_size": 1, "timestamp": "2026-05-04T20:45:25.185097"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2860, "epoch": 0, "train_loss": 4.512232631444931, "train_ppl": 91.12504019671428, "lr": 0.00056, "grad_norm": 0.5646, "tokens_per_sec": 148256, "dt_s": 4.42, "eta_s": 39583, "world_size": 1, "timestamp": "2026-05-04T20:45:29.605564"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2870, "epoch": 0, "train_loss": 4.567695677280426, "train_ppl": 96.32189711042464, "lr": 0.00056, "grad_norm": 0.8234, "tokens_per_sec": 148588, "dt_s": 4.411, "eta_s": 39586, "world_size": 1, "timestamp": "2026-05-04T20:45:34.016168"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2880, "epoch": 0, "train_loss": 4.6567299365997314, "train_ppl": 105.29120965544236, "lr": 0.00056, "grad_norm": 0.6263, "tokens_per_sec": 145842, "dt_s": 4.494, "eta_s": 39731, "world_size": 1, "timestamp": "2026-05-04T20:45:38.509776"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2890, "epoch": 0, "train_loss": 4.6319176852703094, "train_ppl": 102.71084244038546, "lr": 0.00056, "grad_norm": 0.7065, "tokens_per_sec": 133784, "dt_s": 4.899, "eta_s": 40628, "world_size": 1, "timestamp": "2026-05-04T20:45:43.408398"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2900, "epoch": 0, "train_loss": 4.604326605796814, "train_ppl": 99.91567755224608, "lr": 0.00056, "grad_norm": 0.6429, "tokens_per_sec": 149582, "dt_s": 4.381, "eta_s": 40584, "world_size": 1, "timestamp": "2026-05-04T20:45:47.789705"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2910, "epoch": 0, "train_loss": 4.622061163187027, "train_ppl": 101.70344363377716, "lr": 0.00056, "grad_norm": 0.6733, "tokens_per_sec": 147795, "dt_s": 4.434, "eta_s": 40604, "world_size": 1, "timestamp": "2026-05-04T20:45:52.223961"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2920, "epoch": 0, "train_loss": 4.57199826836586, "train_ppl": 96.73722369577555, "lr": 0.00056, "grad_norm": 0.6227, "tokens_per_sec": 149626, "dt_s": 4.38, "eta_s": 40545, "world_size": 1, "timestamp": "2026-05-04T20:45:56.603979"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2930, "epoch": 0, "train_loss": 4.589256793260574, "train_ppl": 98.42125563292385, "lr": 0.00056, "grad_norm": 0.5752, "tokens_per_sec": 148168, "dt_s": 4.423, "eta_s": 40414, "world_size": 1, "timestamp": "2026-05-04T20:46:01.027000"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2940, "epoch": 0, "train_loss": 4.642751187086105, "train_ppl": 103.82960967995987, "lr": 0.00056, "grad_norm": 0.6585, "tokens_per_sec": 147792, "dt_s": 4.434, "eta_s": 39576, "world_size": 1, "timestamp": "2026-05-04T20:46:05.461382"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2950, "epoch": 0, "train_loss": 4.574923813343048, "train_ppl": 97.02064717658885, "lr": 0.00056, "grad_norm": 0.6273, "tokens_per_sec": 148287, "dt_s": 4.42, "eta_s": 39640, "world_size": 1, "timestamp": "2026-05-04T20:46:09.880897"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2960, "epoch": 0, "train_loss": 4.51192370057106, "train_ppl": 91.09689320637581, "lr": 0.00056, "grad_norm": 0.5888, "tokens_per_sec": 147870, "dt_s": 4.432, "eta_s": 39632, "world_size": 1, "timestamp": "2026-05-04T20:46:14.312897"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2970, "epoch": 0, "train_loss": 4.59219953417778, "train_ppl": 98.71131045778209, "lr": 0.00056, "grad_norm": 0.6353, "tokens_per_sec": 147326, "dt_s": 4.448, "eta_s": 39750, "world_size": 1, "timestamp": "2026-05-04T20:46:18.761278"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2980, "epoch": 0, "train_loss": 4.591606438159943, "train_ppl": 98.65278253068996, "lr": 0.00056, "grad_norm": 0.6397, "tokens_per_sec": 148922, "dt_s": 4.401, "eta_s": 39706, "world_size": 1, "timestamp": "2026-05-04T20:46:23.161965"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 2990, "epoch": 0, "train_loss": 4.605912059545517, "train_ppl": 100.07421488136774, "lr": 0.00056, "grad_norm": 0.7746, "tokens_per_sec": 146084, "dt_s": 4.486, "eta_s": 39794, "world_size": 1, "timestamp": "2026-05-04T20:46:27.648128"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3000, "epoch": 0, "train_loss": 4.490844041109085, "train_ppl": 89.19669979691405, "lr": 0.00056, "grad_norm": 0.6156, "tokens_per_sec": 149243, "dt_s": 4.391, "eta_s": 39739, "world_size": 1, "timestamp": "2026-05-04T20:46:32.039394"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3010, "epoch": 0, "train_loss": 4.506465882062912, "train_ppl": 90.60105721883478, "lr": 0.00056, "grad_norm": 1.0022, "tokens_per_sec": 108561, "dt_s": 6.037, "eta_s": 39699, "world_size": 1, "timestamp": "2026-05-04T20:46:38.076147"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3020, "epoch": 0, "train_loss": 4.725346058607101, "train_ppl": 112.76951648420138, "lr": 0.00056, "grad_norm": 0.6713, "tokens_per_sec": 143312, "dt_s": 4.573, "eta_s": 39918, "world_size": 1, "timestamp": "2026-05-04T20:46:42.649104"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3030, "epoch": 0, "train_loss": 4.50160014629364, "train_ppl": 90.16128718402894, "lr": 0.00056, "grad_norm": 0.5875, "tokens_per_sec": 147941, "dt_s": 4.43, "eta_s": 39966, "world_size": 1, "timestamp": "2026-05-04T20:46:47.078974"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3040, "epoch": 0, "train_loss": 4.6365578174591064, "train_ppl": 103.18854176342946, "lr": 0.00056, "grad_norm": 0.6427, "tokens_per_sec": 147077, "dt_s": 4.456, "eta_s": 39907, "world_size": 1, "timestamp": "2026-05-04T20:46:51.534885"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3050, "epoch": 0, "train_loss": 4.55188250541687, "train_ppl": 94.8107221218827, "lr": 0.00056, "grad_norm": 0.6108, "tokens_per_sec": 147134, "dt_s": 4.454, "eta_s": 40015, "world_size": 1, "timestamp": "2026-05-04T20:46:55.989074"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3060, "epoch": 0, "train_loss": 4.484782963991165, "train_ppl": 88.65770680987687, "lr": 0.00056, "grad_norm": 0.5875, "tokens_per_sec": 147264, "dt_s": 4.45, "eta_s": 40079, "world_size": 1, "timestamp": "2026-05-04T20:47:00.439296"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3070, "epoch": 0, "train_loss": 4.528352200984955, "train_ppl": 92.6058394783341, "lr": 0.00056, "grad_norm": 0.5643, "tokens_per_sec": 146042, "dt_s": 4.487, "eta_s": 39921, "world_size": 1, "timestamp": "2026-05-04T20:47:04.926740"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3080, "epoch": 0, "train_loss": 4.485745847225189, "train_ppl": 88.74311494176143, "lr": 0.00056, "grad_norm": 0.5946, "tokens_per_sec": 147358, "dt_s": 4.447, "eta_s": 39948, "world_size": 1, "timestamp": "2026-05-04T20:47:09.374135"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3090, "epoch": 0, "train_loss": 4.5468045473098755, "train_ppl": 94.33049755886879, "lr": 0.00056, "grad_norm": 0.5882, "tokens_per_sec": 147699, "dt_s": 4.437, "eta_s": 39910, "world_size": 1, "timestamp": "2026-05-04T20:47:13.811305"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3100, "epoch": 0, "train_loss": 4.443058609962463, "train_ppl": 85.03463209840636, "lr": 0.00056, "grad_norm": 0.6195, "tokens_per_sec": 144935, "dt_s": 4.522, "eta_s": 40027, "world_size": 1, "timestamp": "2026-05-04T20:47:18.333023"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3110, "epoch": 0, "train_loss": 4.598532319068909, "train_ppl": 99.33841150547376, "lr": 0.00056, "grad_norm": 0.679, "tokens_per_sec": 147941, "dt_s": 4.43, "eta_s": 39986, "world_size": 1, "timestamp": "2026-05-04T20:47:22.762884"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3120, "epoch": 0, "train_loss": 4.570098668336868, "train_ppl": 96.55363608955555, "lr": 0.00056, "grad_norm": 0.5975, "tokens_per_sec": 148436, "dt_s": 4.415, "eta_s": 39852, "world_size": 1, "timestamp": "2026-05-04T20:47:27.177983"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3130, "epoch": 0, "train_loss": 4.520143538713455, "train_ppl": 91.84878088784626, "lr": 0.00056, "grad_norm": 0.6025, "tokens_per_sec": 145720, "dt_s": 4.497, "eta_s": 39937, "world_size": 1, "timestamp": "2026-05-04T20:47:31.675371"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3140, "epoch": 0, "train_loss": 4.557549819350243, "train_ppl": 95.34956971538959, "lr": 0.00056, "grad_norm": 0.7301, "tokens_per_sec": 148274, "dt_s": 4.42, "eta_s": 39901, "world_size": 1, "timestamp": "2026-05-04T20:47:36.095297"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3150, "epoch": 0, "train_loss": 4.5247092843055725, "train_ppl": 92.2690978543333, "lr": 0.00056, "grad_norm": 0.6903, "tokens_per_sec": 145120, "dt_s": 4.516, "eta_s": 39887, "world_size": 1, "timestamp": "2026-05-04T20:47:40.611272"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3160, "epoch": 0, "train_loss": 4.47326523065567, "train_ppl": 87.642429057627, "lr": 0.00056, "grad_norm": 0.5634, "tokens_per_sec": 149515, "dt_s": 4.383, "eta_s": 39799, "world_size": 1, "timestamp": "2026-05-04T20:47:44.994489"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3170, "epoch": 0, "train_loss": 4.46873551607132, "train_ppl": 87.24633164988977, "lr": 0.00056, "grad_norm": 0.6431, "tokens_per_sec": 148835, "dt_s": 4.403, "eta_s": 39773, "world_size": 1, "timestamp": "2026-05-04T20:47:49.397776"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3180, "epoch": 0, "train_loss": 4.619095146656036, "train_ppl": 101.40223645225684, "lr": 0.00056, "grad_norm": 0.6182, "tokens_per_sec": 145817, "dt_s": 4.494, "eta_s": 39764, "world_size": 1, "timestamp": "2026-05-04T20:47:53.892196"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3190, "epoch": 0, "train_loss": 4.467887133359909, "train_ppl": 87.17234475952104, "lr": 0.00056, "grad_norm": 0.5663, "tokens_per_sec": 134031, "dt_s": 4.89, "eta_s": 40600, "world_size": 1, "timestamp": "2026-05-04T20:47:58.781807"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3200, "epoch": 0, "train_loss": 4.390060603618622, "train_ppl": 80.64530622976541, "lr": 0.00056, "grad_norm": 0.5937, "tokens_per_sec": 147476, "dt_s": 4.444, "eta_s": 40466, "world_size": 1, "timestamp": "2026-05-04T20:48:03.225657"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3210, "epoch": 0, "train_loss": 4.464035898447037, "train_ppl": 86.83726922340135, "lr": 0.00056, "grad_norm": 0.5998, "tokens_per_sec": 146202, "dt_s": 4.483, "eta_s": 40640, "world_size": 1, "timestamp": "2026-05-04T20:48:07.708253"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3220, "epoch": 0, "train_loss": 4.428210467100143, "train_ppl": 83.78135319040557, "lr": 0.00056, "grad_norm": 0.5979, "tokens_per_sec": 146130, "dt_s": 4.485, "eta_s": 40781, "world_size": 1, "timestamp": "2026-05-04T20:48:12.193009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3230, "epoch": 0, "train_loss": 4.57257741689682, "train_ppl": 96.79326514336273, "lr": 0.00056, "grad_norm": 0.6279, "tokens_per_sec": 146482, "dt_s": 4.474, "eta_s": 40740, "world_size": 1, "timestamp": "2026-05-04T20:48:16.667009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3240, "epoch": 0, "train_loss": 4.505987673997879, "train_ppl": 90.55774142037977, "lr": 0.00056, "grad_norm": 0.5895, "tokens_per_sec": 146656, "dt_s": 4.469, "eta_s": 39982, "world_size": 1, "timestamp": "2026-05-04T20:48:21.135709"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3250, "epoch": 0, "train_loss": 4.482577085494995, "train_ppl": 88.46235422215527, "lr": 0.00056, "grad_norm": 0.6104, "tokens_per_sec": 148015, "dt_s": 4.428, "eta_s": 39949, "world_size": 1, "timestamp": "2026-05-04T20:48:25.563366"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3260, "epoch": 0, "train_loss": 4.454596281051636, "train_ppl": 86.02141535857774, "lr": 0.00056, "grad_norm": 0.6809, "tokens_per_sec": 145506, "dt_s": 4.504, "eta_s": 39982, "world_size": 1, "timestamp": "2026-05-04T20:48:30.067363"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3270, "epoch": 0, "train_loss": 4.466284483671188, "train_ppl": 87.03274991901992, "lr": 0.00056, "grad_norm": 0.5853, "tokens_per_sec": 148383, "dt_s": 4.417, "eta_s": 39856, "world_size": 1, "timestamp": "2026-05-04T20:48:34.484017"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3280, "epoch": 0, "train_loss": 4.482581853866577, "train_ppl": 88.46277604453692, "lr": 0.00056, "grad_norm": 0.5697, "tokens_per_sec": 149495, "dt_s": 4.384, "eta_s": 39691, "world_size": 1, "timestamp": "2026-05-04T20:48:38.867836"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3290, "epoch": 0, "train_loss": 4.387831419706345, "train_ppl": 80.46573323549438, "lr": 0.00056, "grad_norm": 0.6021, "tokens_per_sec": 145364, "dt_s": 4.508, "eta_s": 39757, "world_size": 1, "timestamp": "2026-05-04T20:48:43.376286"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3300, "epoch": 0, "train_loss": 4.470541208982468, "train_ppl": 87.4040140526281, "lr": 0.00056, "grad_norm": 0.7618, "tokens_per_sec": 148821, "dt_s": 4.404, "eta_s": 39710, "world_size": 1, "timestamp": "2026-05-04T20:48:47.779922"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3310, "epoch": 0, "train_loss": 4.561055153608322, "train_ppl": 95.68438831131681, "lr": 0.00056, "grad_norm": 0.6416, "tokens_per_sec": 147247, "dt_s": 4.451, "eta_s": 39610, "world_size": 1, "timestamp": "2026-05-04T20:48:52.230685"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3320, "epoch": 0, "train_loss": 4.4707847237586975, "train_ppl": 87.4253008132669, "lr": 0.00056, "grad_norm": 0.6383, "tokens_per_sec": 147762, "dt_s": 4.435, "eta_s": 39639, "world_size": 1, "timestamp": "2026-05-04T20:48:56.665917"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3330, "epoch": 0, "train_loss": 4.405995756387711, "train_ppl": 81.94069520394982, "lr": 0.00056, "grad_norm": 0.7615, "tokens_per_sec": 149156, "dt_s": 4.394, "eta_s": 39652, "world_size": 1, "timestamp": "2026-05-04T20:49:01.059705"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3340, "epoch": 0, "train_loss": 4.386690199375153, "train_ppl": 80.37395648347014, "lr": 0.00056, "grad_norm": 0.6058, "tokens_per_sec": 149233, "dt_s": 4.392, "eta_s": 39439, "world_size": 1, "timestamp": "2026-05-04T20:49:05.451234"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3350, "epoch": 0, "train_loss": 4.476532757282257, "train_ppl": 87.9292714054847, "lr": 0.00056, "grad_norm": 0.6705, "tokens_per_sec": 149206, "dt_s": 4.392, "eta_s": 39414, "world_size": 1, "timestamp": "2026-05-04T20:49:09.843574"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3360, "epoch": 0, "train_loss": 4.519047409296036, "train_ppl": 91.74815789514705, "lr": 0.00056, "grad_norm": 0.6898, "tokens_per_sec": 150006, "dt_s": 4.369, "eta_s": 39263, "world_size": 1, "timestamp": "2026-05-04T20:49:14.212427"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3370, "epoch": 0, "train_loss": 4.397198885679245, "train_ppl": 81.22303471375449, "lr": 0.00056, "grad_norm": 0.6614, "tokens_per_sec": 148540, "dt_s": 4.412, "eta_s": 39218, "world_size": 1, "timestamp": "2026-05-04T20:49:18.624426"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3380, "epoch": 0, "train_loss": 4.505610674619675, "train_ppl": 90.52360764278343, "lr": 0.00056, "grad_norm": 0.6451, "tokens_per_sec": 149373, "dt_s": 4.387, "eta_s": 39202, "world_size": 1, "timestamp": "2026-05-04T20:49:23.011839"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3390, "epoch": 0, "train_loss": 4.540797472000122, "train_ppl": 93.76554570903441, "lr": 0.00056, "grad_norm": 0.6415, "tokens_per_sec": 150733, "dt_s": 4.348, "eta_s": 39119, "world_size": 1, "timestamp": "2026-05-04T20:49:27.359647"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3400, "epoch": 0, "train_loss": 4.5111647844314575, "train_ppl": 91.02778453101534, "lr": 0.00056, "grad_norm": 0.6527, "tokens_per_sec": 146749, "dt_s": 4.466, "eta_s": 39246, "world_size": 1, "timestamp": "2026-05-04T20:49:31.825502"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3410, "epoch": 0, "train_loss": 4.501115381717682, "train_ppl": 90.11759077796991, "lr": 0.00056, "grad_norm": 0.6345, "tokens_per_sec": 150168, "dt_s": 4.364, "eta_s": 39234, "world_size": 1, "timestamp": "2026-05-04T20:49:36.189676"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3420, "epoch": 0, "train_loss": 4.461815893650055, "train_ppl": 86.64470389623054, "lr": 0.00056, "grad_norm": 0.5898, "tokens_per_sec": 151081, "dt_s": 4.338, "eta_s": 39097, "world_size": 1, "timestamp": "2026-05-04T20:49:40.527466"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3430, "epoch": 0, "train_loss": 4.467222720384598, "train_ppl": 87.11444555919442, "lr": 0.00056, "grad_norm": 0.5555, "tokens_per_sec": 146380, "dt_s": 4.477, "eta_s": 39253, "world_size": 1, "timestamp": "2026-05-04T20:49:45.004570"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3440, "epoch": 0, "train_loss": 4.437831223011017, "train_ppl": 84.59128296046708, "lr": 0.00056, "grad_norm": 0.6375, "tokens_per_sec": 149924, "dt_s": 4.371, "eta_s": 39290, "world_size": 1, "timestamp": "2026-05-04T20:49:49.375859"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3450, "epoch": 0, "train_loss": 4.298752158880234, "train_ppl": 73.60788542188916, "lr": 0.00056, "grad_norm": 0.7177, "tokens_per_sec": 148734, "dt_s": 4.406, "eta_s": 39179, "world_size": 1, "timestamp": "2026-05-04T20:49:53.782119"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3460, "epoch": 0, "train_loss": 4.46194052696228, "train_ppl": 86.655503385638, "lr": 0.00056, "grad_norm": 0.6128, "tokens_per_sec": 144752, "dt_s": 4.527, "eta_s": 39466, "world_size": 1, "timestamp": "2026-05-04T20:49:58.309615"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3470, "epoch": 0, "train_loss": 4.461916089057922, "train_ppl": 86.65338573260978, "lr": 0.00056, "grad_norm": 0.6366, "tokens_per_sec": 148287, "dt_s": 4.42, "eta_s": 39608, "world_size": 1, "timestamp": "2026-05-04T20:50:02.729138"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3480, "epoch": 0, "train_loss": 4.448735564947128, "train_ppl": 85.51874271390074, "lr": 0.00056, "grad_norm": 0.7133, "tokens_per_sec": 131628, "dt_s": 4.979, "eta_s": 40498, "world_size": 1, "timestamp": "2026-05-04T20:50:07.708012"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3490, "epoch": 0, "train_loss": 4.389607518911362, "train_ppl": 80.60877535121654, "lr": 0.00056, "grad_norm": 0.6259, "tokens_per_sec": 148270, "dt_s": 4.42, "eta_s": 40581, "world_size": 1, "timestamp": "2026-05-04T20:50:12.128057"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3500, "epoch": 0, "train_loss": 4.509768187999725, "train_ppl": 90.90074418464721, "lr": 0.00056, "grad_norm": 0.6117, "tokens_per_sec": 150214, "dt_s": 4.363, "eta_s": 40499, "world_size": 1, "timestamp": "2026-05-04T20:50:16.490926"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3510, "epoch": 0, "train_loss": 4.484131395816803, "train_ppl": 88.59995908504047, "lr": 0.00056, "grad_norm": 0.5811, "tokens_per_sec": 108548, "dt_s": 6.038, "eta_s": 40510, "world_size": 1, "timestamp": "2026-05-04T20:50:22.528409"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3520, "epoch": 0, "train_loss": 4.427523046731949, "train_ppl": 83.72377997251607, "lr": 0.00056, "grad_norm": 0.6721, "tokens_per_sec": 147977, "dt_s": 4.429, "eta_s": 40522, "world_size": 1, "timestamp": "2026-05-04T20:50:26.957231"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3530, "epoch": 0, "train_loss": 4.410459205508232, "train_ppl": 82.30725077039848, "lr": 0.00056, "grad_norm": 0.7747, "tokens_per_sec": 148533, "dt_s": 4.412, "eta_s": 39507, "world_size": 1, "timestamp": "2026-05-04T20:50:31.369441"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3540, "epoch": 0, "train_loss": 4.355221897363663, "train_ppl": 77.8841056972494, "lr": 0.00056, "grad_norm": 0.6347, "tokens_per_sec": 146433, "dt_s": 4.475, "eta_s": 39601, "world_size": 1, "timestamp": "2026-05-04T20:50:35.844934"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3550, "epoch": 0, "train_loss": 4.431745886802673, "train_ppl": 84.07807965440304, "lr": 0.00056, "grad_norm": 0.5842, "tokens_per_sec": 150060, "dt_s": 4.367, "eta_s": 39605, "world_size": 1, "timestamp": "2026-05-04T20:50:40.212262"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3560, "epoch": 0, "train_loss": 4.424792498350143, "train_ppl": 83.49547997453688, "lr": 0.00056, "grad_norm": 0.6401, "tokens_per_sec": 146122, "dt_s": 4.485, "eta_s": 39509, "world_size": 1, "timestamp": "2026-05-04T20:50:44.697269"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3570, "epoch": 0, "train_loss": 4.381742462515831, "train_ppl": 79.97726945720261, "lr": 0.00056, "grad_norm": 0.7309, "tokens_per_sec": 147211, "dt_s": 4.452, "eta_s": 39546, "world_size": 1, "timestamp": "2026-05-04T20:50:49.149091"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3580, "epoch": 0, "train_loss": 4.4621422290802, "train_ppl": 86.672983747054, "lr": 0.00056, "grad_norm": 0.6923, "tokens_per_sec": 148392, "dt_s": 4.416, "eta_s": 39549, "world_size": 1, "timestamp": "2026-05-04T20:50:53.565487"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3590, "epoch": 0, "train_loss": 4.360183298587799, "train_ppl": 78.2714801600457, "lr": 0.00056, "grad_norm": 0.6336, "tokens_per_sec": 145755, "dt_s": 4.496, "eta_s": 39581, "world_size": 1, "timestamp": "2026-05-04T20:50:58.061841"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3600, "epoch": 0, "train_loss": 4.400655418634415, "train_ppl": 81.504270580474, "lr": 0.00056, "grad_norm": 0.6452, "tokens_per_sec": 148555, "dt_s": 4.412, "eta_s": 39656, "world_size": 1, "timestamp": "2026-05-04T20:51:02.473405"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3610, "epoch": 0, "train_loss": 4.4985634088516235, "train_ppl": 89.88790633044181, "lr": 0.00056, "grad_norm": 0.6022, "tokens_per_sec": 148730, "dt_s": 4.406, "eta_s": 39511, "world_size": 1, "timestamp": "2026-05-04T20:51:06.879750"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3620, "epoch": 0, "train_loss": 4.388376414775848, "train_ppl": 80.5095986154939, "lr": 0.00056, "grad_norm": 0.5938, "tokens_per_sec": 145903, "dt_s": 4.492, "eta_s": 39578, "world_size": 1, "timestamp": "2026-05-04T20:51:11.371482"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3630, "epoch": 0, "train_loss": 4.449130415916443, "train_ppl": 85.55251653973102, "lr": 0.00056, "grad_norm": 0.5906, "tokens_per_sec": 149631, "dt_s": 4.38, "eta_s": 39508, "world_size": 1, "timestamp": "2026-05-04T20:51:15.751357"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3640, "epoch": 0, "train_loss": 4.504203051328659, "train_ppl": 90.39627414410285, "lr": 0.00056, "grad_norm": 0.6255, "tokens_per_sec": 148650, "dt_s": 4.409, "eta_s": 39348, "world_size": 1, "timestamp": "2026-05-04T20:51:20.160092"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3650, "epoch": 0, "train_loss": 4.307334005832672, "train_ppl": 74.24229533989767, "lr": 0.00056, "grad_norm": 0.6426, "tokens_per_sec": 144892, "dt_s": 4.523, "eta_s": 39542, "world_size": 1, "timestamp": "2026-05-04T20:51:24.683208"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3660, "epoch": 0, "train_loss": 4.353347584605217, "train_ppl": 77.73826324414216, "lr": 0.00056, "grad_norm": 0.7014, "tokens_per_sec": 147833, "dt_s": 4.433, "eta_s": 39585, "world_size": 1, "timestamp": "2026-05-04T20:51:29.116322"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3670, "epoch": 0, "train_loss": 4.444893449544907, "train_ppl": 85.19080023518501, "lr": 0.00056, "grad_norm": 0.6333, "tokens_per_sec": 147500, "dt_s": 4.443, "eta_s": 39494, "world_size": 1, "timestamp": "2026-05-04T20:51:33.559433"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3680, "epoch": 0, "train_loss": 4.613307803869247, "train_ppl": 100.8170818289855, "lr": 0.00056, "grad_norm": 0.6562, "tokens_per_sec": 147695, "dt_s": 4.437, "eta_s": 39592, "world_size": 1, "timestamp": "2026-05-04T20:51:37.996695"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3690, "epoch": 0, "train_loss": 4.38960000872612, "train_ppl": 80.60816996665481, "lr": 0.00056, "grad_norm": 0.6396, "tokens_per_sec": 148493, "dt_s": 4.413, "eta_s": 39596, "world_size": 1, "timestamp": "2026-05-04T20:51:42.410089"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3700, "epoch": 0, "train_loss": 4.499440580606461, "train_ppl": 89.96678805432856, "lr": 0.00056, "grad_norm": 0.5994, "tokens_per_sec": 145393, "dt_s": 4.508, "eta_s": 39564, "world_size": 1, "timestamp": "2026-05-04T20:51:46.917603"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3710, "epoch": 0, "train_loss": 4.444548696279526, "train_ppl": 85.16143549071015, "lr": 0.00056, "grad_norm": 0.5611, "tokens_per_sec": 147035, "dt_s": 4.457, "eta_s": 39602, "world_size": 1, "timestamp": "2026-05-04T20:51:51.374765"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3720, "epoch": 0, "train_loss": 4.34054434299469, "train_ppl": 76.74930591661514, "lr": 0.00056, "grad_norm": 0.7761, "tokens_per_sec": 149125, "dt_s": 4.395, "eta_s": 39511, "world_size": 1, "timestamp": "2026-05-04T20:51:55.769472"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3730, "epoch": 0, "train_loss": 4.425445169210434, "train_ppl": 83.54999282882287, "lr": 0.00056, "grad_norm": 0.6392, "tokens_per_sec": 143894, "dt_s": 4.554, "eta_s": 39715, "world_size": 1, "timestamp": "2026-05-04T20:52:00.323935"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3740, "epoch": 0, "train_loss": 4.504476815462112, "train_ppl": 90.42102478952624, "lr": 0.00056, "grad_norm": 0.6163, "tokens_per_sec": 148456, "dt_s": 4.415, "eta_s": 39713, "world_size": 1, "timestamp": "2026-05-04T20:52:04.738478"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3750, "epoch": 0, "train_loss": 4.4021444618701935, "train_ppl": 81.62572436579657, "lr": 0.00056, "grad_norm": 0.6056, "tokens_per_sec": 148215, "dt_s": 4.422, "eta_s": 39556, "world_size": 1, "timestamp": "2026-05-04T20:52:09.160105"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3760, "epoch": 0, "train_loss": 4.3678871393203735, "train_ppl": 78.87679982648788, "lr": 0.00056, "grad_norm": 0.6389, "tokens_per_sec": 144106, "dt_s": 4.548, "eta_s": 39712, "world_size": 1, "timestamp": "2026-05-04T20:52:13.707871"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3770, "epoch": 0, "train_loss": 4.39124870300293, "train_ppl": 80.74117780964929, "lr": 0.00056, "grad_norm": 0.5901, "tokens_per_sec": 147647, "dt_s": 4.439, "eta_s": 39786, "world_size": 1, "timestamp": "2026-05-04T20:52:18.146559"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3780, "epoch": 0, "train_loss": 4.295281678438187, "train_ppl": 73.35287345812036, "lr": 0.00056, "grad_norm": 0.5714, "tokens_per_sec": 131514, "dt_s": 4.983, "eta_s": 40544, "world_size": 1, "timestamp": "2026-05-04T20:52:23.129764"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3790, "epoch": 0, "train_loss": 4.473129600286484, "train_ppl": 87.63054288869849, "lr": 0.00056, "grad_norm": 0.691, "tokens_per_sec": 146819, "dt_s": 4.464, "eta_s": 40627, "world_size": 1, "timestamp": "2026-05-04T20:52:27.593497"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3800, "epoch": 0, "train_loss": 4.348014414310455, "train_ppl": 77.32477542836918, "lr": 0.00056, "grad_norm": 0.6144, "tokens_per_sec": 149023, "dt_s": 4.398, "eta_s": 40580, "world_size": 1, "timestamp": "2026-05-04T20:52:31.991204"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3810, "epoch": 0, "train_loss": 4.439426928758621, "train_ppl": 84.72637351060315, "lr": 0.00056, "grad_norm": 0.6611, "tokens_per_sec": 146912, "dt_s": 4.461, "eta_s": 40421, "world_size": 1, "timestamp": "2026-05-04T20:52:36.452096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3820, "epoch": 0, "train_loss": 4.407910645008087, "train_ppl": 82.09775283468498, "lr": 0.00056, "grad_norm": 0.672, "tokens_per_sec": 148346, "dt_s": 4.418, "eta_s": 40379, "world_size": 1, "timestamp": "2026-05-04T20:52:40.869875"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3830, "epoch": 0, "train_loss": 4.348201215267181, "train_ppl": 77.33922111959029, "lr": 0.00056, "grad_norm": 0.5753, "tokens_per_sec": 149949, "dt_s": 4.371, "eta_s": 39286, "world_size": 1, "timestamp": "2026-05-04T20:52:45.240445"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3840, "epoch": 0, "train_loss": 4.314000353217125, "train_ppl": 74.7388736138903, "lr": 0.00056, "grad_norm": 0.6519, "tokens_per_sec": 146166, "dt_s": 4.484, "eta_s": 39317, "world_size": 1, "timestamp": "2026-05-04T20:52:49.724114"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3850, "epoch": 0, "train_loss": 4.302910685539246, "train_ppl": 73.91462312213822, "lr": 0.00056, "grad_norm": 0.6483, "tokens_per_sec": 148641, "dt_s": 4.409, "eta_s": 39333, "world_size": 1, "timestamp": "2026-05-04T20:52:54.133116"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3860, "epoch": 0, "train_loss": 4.47040182352066, "train_ppl": 87.39183205278212, "lr": 0.00056, "grad_norm": 0.6713, "tokens_per_sec": 149424, "dt_s": 4.386, "eta_s": 39195, "world_size": 1, "timestamp": "2026-05-04T20:52:58.519009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3870, "epoch": 0, "train_loss": 4.690927594900131, "train_ppl": 108.95419830674751, "lr": 0.00056, "grad_norm": 1.0823, "tokens_per_sec": 147126, "dt_s": 4.454, "eta_s": 39256, "world_size": 1, "timestamp": "2026-05-04T20:53:02.973445"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3880, "epoch": 0, "train_loss": 4.3642071187496185, "train_ppl": 78.58706502256929, "lr": 0.00056, "grad_norm": 0.7089, "tokens_per_sec": 149593, "dt_s": 4.381, "eta_s": 39270, "world_size": 1, "timestamp": "2026-05-04T20:53:07.354410"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3890, "epoch": 0, "train_loss": 4.337668061256409, "train_ppl": 76.52887045879412, "lr": 0.00056, "grad_norm": 0.6141, "tokens_per_sec": 149293, "dt_s": 4.39, "eta_s": 39098, "world_size": 1, "timestamp": "2026-05-04T20:53:11.744156"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3900, "epoch": 0, "train_loss": 4.424040377140045, "train_ppl": 83.43270486330626, "lr": 0.00056, "grad_norm": 0.6095, "tokens_per_sec": 147977, "dt_s": 4.429, "eta_s": 39129, "world_size": 1, "timestamp": "2026-05-04T20:53:16.172960"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3910, "epoch": 0, "train_loss": 4.440147668123245, "train_ppl": 84.78746115470457, "lr": 0.00056, "grad_norm": 0.6588, "tokens_per_sec": 148403, "dt_s": 4.416, "eta_s": 39178, "world_size": 1, "timestamp": "2026-05-04T20:53:20.589071"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3920, "epoch": 0, "train_loss": 4.3605081886053085, "train_ppl": 78.29691391396805, "lr": 0.00056, "grad_norm": 0.6453, "tokens_per_sec": 145132, "dt_s": 4.516, "eta_s": 39283, "world_size": 1, "timestamp": "2026-05-04T20:53:25.104652"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3930, "epoch": 0, "train_loss": 4.373070001602173, "train_ppl": 79.28666864654858, "lr": 0.00056, "grad_norm": 0.5781, "tokens_per_sec": 148367, "dt_s": 4.417, "eta_s": 39342, "world_size": 1, "timestamp": "2026-05-04T20:53:29.521790"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3940, "epoch": 0, "train_loss": 4.333546012639999, "train_ppl": 76.21406400370745, "lr": 0.00056, "grad_norm": 0.6736, "tokens_per_sec": 149867, "dt_s": 4.373, "eta_s": 39308, "world_size": 1, "timestamp": "2026-05-04T20:53:33.894734"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3950, "epoch": 0, "train_loss": 4.381703466176987, "train_ppl": 79.97415069731352, "lr": 0.00056, "grad_norm": 0.6496, "tokens_per_sec": 146720, "dt_s": 4.467, "eta_s": 39371, "world_size": 1, "timestamp": "2026-05-04T20:53:38.361480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3960, "epoch": 0, "train_loss": 4.469572126865387, "train_ppl": 87.3193534138297, "lr": 0.00056, "grad_norm": 0.6506, "tokens_per_sec": 149175, "dt_s": 4.393, "eta_s": 39326, "world_size": 1, "timestamp": "2026-05-04T20:53:42.754731"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3970, "epoch": 0, "train_loss": 4.360292434692383, "train_ppl": 78.28002287064098, "lr": 0.00056, "grad_norm": 0.627, "tokens_per_sec": 150303, "dt_s": 4.36, "eta_s": 39046, "world_size": 1, "timestamp": "2026-05-04T20:53:47.114980"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3980, "epoch": 0, "train_loss": 4.451965481042862, "train_ppl": 85.7954076392047, "lr": 0.00056, "grad_norm": 0.7019, "tokens_per_sec": 146850, "dt_s": 4.463, "eta_s": 39123, "world_size": 1, "timestamp": "2026-05-04T20:53:51.577777"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 3990, "epoch": 0, "train_loss": 4.365232557058334, "train_ppl": 78.66769254182132, "lr": 0.00056, "grad_norm": 0.5989, "tokens_per_sec": 146907, "dt_s": 4.461, "eta_s": 39275, "world_size": 1, "timestamp": "2026-05-04T20:53:56.038822"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4000, "epoch": 0, "train_loss": 4.388825416564941, "train_ppl": 80.54575568599864, "lr": 0.00056, "grad_norm": 0.6467, "tokens_per_sec": 148963, "dt_s": 4.399, "eta_s": 39151, "world_size": 1, "timestamp": "2026-05-04T20:54:00.438280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4010, "epoch": 0, "train_loss": 4.467906147241592, "train_ppl": 87.17400225992806, "lr": 0.00056, "grad_norm": 0.6253, "tokens_per_sec": 107840, "dt_s": 6.077, "eta_s": 39291, "world_size": 1, "timestamp": "2026-05-04T20:54:06.515453"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4020, "epoch": 0, "train_loss": 4.421781063079834, "train_ppl": 83.24441696097266, "lr": 0.00056, "grad_norm": 0.7688, "tokens_per_sec": 145700, "dt_s": 4.498, "eta_s": 39530, "world_size": 1, "timestamp": "2026-05-04T20:54:11.013464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4030, "epoch": 0, "train_loss": 4.326944559812546, "train_ppl": 75.71259748032142, "lr": 0.00056, "grad_norm": 0.6332, "tokens_per_sec": 144192, "dt_s": 4.545, "eta_s": 39672, "world_size": 1, "timestamp": "2026-05-04T20:54:15.558539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4040, "epoch": 0, "train_loss": 4.330802574753761, "train_ppl": 76.00526200167853, "lr": 0.00056, "grad_norm": 0.5908, "tokens_per_sec": 148695, "dt_s": 4.407, "eta_s": 39572, "world_size": 1, "timestamp": "2026-05-04T20:54:19.965943"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4050, "epoch": 0, "train_loss": 4.3703117072582245, "train_ppl": 79.06827401367619, "lr": 0.00056, "grad_norm": 0.6097, "tokens_per_sec": 148670, "dt_s": 4.408, "eta_s": 39583, "world_size": 1, "timestamp": "2026-05-04T20:54:24.374077"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4060, "epoch": 0, "train_loss": 4.4139544069767, "train_ppl": 82.5954345309268, "lr": 0.00056, "grad_norm": 0.6455, "tokens_per_sec": 144360, "dt_s": 4.54, "eta_s": 39694, "world_size": 1, "timestamp": "2026-05-04T20:54:28.913825"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4070, "epoch": 0, "train_loss": 4.302342623472214, "train_ppl": 73.8726469522073, "lr": 0.00056, "grad_norm": 0.5905, "tokens_per_sec": 131318, "dt_s": 4.991, "eta_s": 40563, "world_size": 1, "timestamp": "2026-05-04T20:54:33.904460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4080, "epoch": 0, "train_loss": 4.486555486917496, "train_ppl": 88.81499398417361, "lr": 0.00056, "grad_norm": 0.711, "tokens_per_sec": 146053, "dt_s": 4.487, "eta_s": 40456, "world_size": 1, "timestamp": "2026-05-04T20:54:38.391606"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4090, "epoch": 0, "train_loss": 4.395519405603409, "train_ppl": 81.08673673216346, "lr": 0.00056, "grad_norm": 0.6402, "tokens_per_sec": 148821, "dt_s": 4.404, "eta_s": 40445, "world_size": 1, "timestamp": "2026-05-04T20:54:42.795304"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4100, "epoch": 0, "train_loss": 4.344006448984146, "train_ppl": 77.01548064500828, "lr": 0.00056, "grad_norm": 0.6138, "tokens_per_sec": 148847, "dt_s": 4.403, "eta_s": 40431, "world_size": 1, "timestamp": "2026-05-04T20:54:47.198171"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4110, "epoch": 0, "train_loss": 4.350329101085663, "train_ppl": 77.50396536773563, "lr": 0.00056, "grad_norm": 0.5896, "tokens_per_sec": 145227, "dt_s": 4.513, "eta_s": 40378, "world_size": 1, "timestamp": "2026-05-04T20:54:51.710861"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4120, "epoch": 0, "train_loss": 4.402533739805222, "train_ppl": 81.65750564469683, "lr": 0.00056, "grad_norm": 0.622, "tokens_per_sec": 148344, "dt_s": 4.418, "eta_s": 39359, "world_size": 1, "timestamp": "2026-05-04T20:54:56.128693"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4130, "epoch": 0, "train_loss": 4.332747012376785, "train_ppl": 76.15319326761328, "lr": 0.00056, "grad_norm": 0.6538, "tokens_per_sec": 146561, "dt_s": 4.472, "eta_s": 39327, "world_size": 1, "timestamp": "2026-05-04T20:55:00.600274"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4140, "epoch": 0, "train_loss": 4.237434238195419, "train_ppl": 69.22999608888486, "lr": 0.00056, "grad_norm": 0.6109, "tokens_per_sec": 146603, "dt_s": 4.47, "eta_s": 39440, "world_size": 1, "timestamp": "2026-05-04T20:55:05.070565"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4150, "epoch": 0, "train_loss": 4.317662805318832, "train_ppl": 75.01310302815149, "lr": 0.00056, "grad_norm": 0.6472, "tokens_per_sec": 149323, "dt_s": 4.389, "eta_s": 39411, "world_size": 1, "timestamp": "2026-05-04T20:55:09.459420"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4160, "epoch": 0, "train_loss": 4.410228282213211, "train_ppl": 82.288246303218, "lr": 0.00056, "grad_norm": 0.647, "tokens_per_sec": 146496, "dt_s": 4.474, "eta_s": 39337, "world_size": 1, "timestamp": "2026-05-04T20:55:13.933007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4170, "epoch": 0, "train_loss": 4.373025178909302, "train_ppl": 79.28311488419625, "lr": 0.00056, "grad_norm": 0.6004, "tokens_per_sec": 147724, "dt_s": 4.436, "eta_s": 39366, "world_size": 1, "timestamp": "2026-05-04T20:55:18.369378"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4180, "epoch": 0, "train_loss": 4.297491282224655, "train_ppl": 73.51513344418409, "lr": 0.00056, "grad_norm": 0.5912, "tokens_per_sec": 148716, "dt_s": 4.407, "eta_s": 39247, "world_size": 1, "timestamp": "2026-05-04T20:55:22.776167"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4190, "epoch": 0, "train_loss": 4.226806819438934, "train_ppl": 68.49815560552773, "lr": 0.00056, "grad_norm": 0.6754, "tokens_per_sec": 148196, "dt_s": 4.422, "eta_s": 39157, "world_size": 1, "timestamp": "2026-05-04T20:55:27.198397"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4200, "epoch": 0, "train_loss": 4.263214990496635, "train_ppl": 71.03800322243872, "lr": 0.00056, "grad_norm": 0.5917, "tokens_per_sec": 149527, "dt_s": 4.383, "eta_s": 39142, "world_size": 1, "timestamp": "2026-05-04T20:55:31.581294"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4210, "epoch": 0, "train_loss": 4.346652656793594, "train_ppl": 77.21954949655259, "lr": 0.00056, "grad_norm": 0.6319, "tokens_per_sec": 149474, "dt_s": 4.384, "eta_s": 38980, "world_size": 1, "timestamp": "2026-05-04T20:55:35.965740"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4220, "epoch": 0, "train_loss": 4.336307302117348, "train_ppl": 76.42480391972039, "lr": 0.00056, "grad_norm": 0.6495, "tokens_per_sec": 146527, "dt_s": 4.473, "eta_s": 39040, "world_size": 1, "timestamp": "2026-05-04T20:55:40.438355"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4230, "epoch": 0, "train_loss": 4.325331389904022, "train_ppl": 75.59055865751364, "lr": 0.00056, "grad_norm": 0.63, "tokens_per_sec": 150351, "dt_s": 4.359, "eta_s": 38951, "world_size": 1, "timestamp": "2026-05-04T20:55:44.797233"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4240, "epoch": 0, "train_loss": 4.477369964122772, "train_ppl": 88.00291721707387, "lr": 0.00056, "grad_norm": 0.6974, "tokens_per_sec": 149194, "dt_s": 4.393, "eta_s": 38894, "world_size": 1, "timestamp": "2026-05-04T20:55:49.189910"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4250, "epoch": 0, "train_loss": 4.328737571835518, "train_ppl": 75.84847285458187, "lr": 0.00056, "grad_norm": 0.6266, "tokens_per_sec": 148278, "dt_s": 4.42, "eta_s": 38955, "world_size": 1, "timestamp": "2026-05-04T20:55:53.609689"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4260, "epoch": 0, "train_loss": 4.341316223144531, "train_ppl": 76.8085700518286, "lr": 0.00056, "grad_norm": 0.6536, "tokens_per_sec": 150374, "dt_s": 4.358, "eta_s": 38904, "world_size": 1, "timestamp": "2026-05-04T20:55:57.967907"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4270, "epoch": 0, "train_loss": 4.361646205186844, "train_ppl": 78.38606781997507, "lr": 0.00056, "grad_norm": 0.6541, "tokens_per_sec": 147818, "dt_s": 4.434, "eta_s": 38831, "world_size": 1, "timestamp": "2026-05-04T20:56:02.401459"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4280, "epoch": 0, "train_loss": 4.298981100320816, "train_ppl": 73.62473924641168, "lr": 0.00056, "grad_norm": 0.6408, "tokens_per_sec": 145267, "dt_s": 4.511, "eta_s": 39096, "world_size": 1, "timestamp": "2026-05-04T20:56:06.912872"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4290, "epoch": 0, "train_loss": 4.239734649658203, "train_ppl": 69.3894368848616, "lr": 0.00056, "grad_norm": 0.6422, "tokens_per_sec": 149130, "dt_s": 4.395, "eta_s": 39095, "world_size": 1, "timestamp": "2026-05-04T20:56:11.307433"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4300, "epoch": 0, "train_loss": 4.274439036846161, "train_ppl": 71.83982850750795, "lr": 0.00056, "grad_norm": 0.6137, "tokens_per_sec": 148033, "dt_s": 4.427, "eta_s": 39103, "world_size": 1, "timestamp": "2026-05-04T20:56:15.734561"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4310, "epoch": 0, "train_loss": 4.435435503721237, "train_ppl": 84.38886855299437, "lr": 0.00056, "grad_norm": 0.7042, "tokens_per_sec": 148402, "dt_s": 4.416, "eta_s": 39201, "world_size": 1, "timestamp": "2026-05-04T20:56:20.150660"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4320, "epoch": 0, "train_loss": 4.3188886642456055, "train_ppl": 75.1051148953855, "lr": 0.00056, "grad_norm": 0.6082, "tokens_per_sec": 149720, "dt_s": 4.377, "eta_s": 39097, "world_size": 1, "timestamp": "2026-05-04T20:56:24.527886"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4330, "epoch": 0, "train_loss": 4.342755168676376, "train_ppl": 76.91917295720614, "lr": 0.00056, "grad_norm": 0.653, "tokens_per_sec": 146477, "dt_s": 4.474, "eta_s": 39027, "world_size": 1, "timestamp": "2026-05-04T20:56:29.002054"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4340, "epoch": 0, "train_loss": 4.382180899381638, "train_ppl": 80.01234212857342, "lr": 0.00056, "grad_norm": 0.5825, "tokens_per_sec": 148495, "dt_s": 4.413, "eta_s": 39056, "world_size": 1, "timestamp": "2026-05-04T20:56:33.415409"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4350, "epoch": 0, "train_loss": 4.278020814061165, "train_ppl": 72.09760414024356, "lr": 0.00056, "grad_norm": 0.6248, "tokens_per_sec": 149516, "dt_s": 4.383, "eta_s": 38974, "world_size": 1, "timestamp": "2026-05-04T20:56:37.798652"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4360, "epoch": 0, "train_loss": 4.261677175760269, "train_ppl": 70.92884388916922, "lr": 0.00056, "grad_norm": 0.6318, "tokens_per_sec": 145744, "dt_s": 4.497, "eta_s": 39112, "world_size": 1, "timestamp": "2026-05-04T20:56:42.295301"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4370, "epoch": 0, "train_loss": 4.209203988313675, "train_ppl": 67.30294455133345, "lr": 0.00056, "grad_norm": 0.6133, "tokens_per_sec": 133422, "dt_s": 4.912, "eta_s": 40052, "world_size": 1, "timestamp": "2026-05-04T20:56:47.207252"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4380, "epoch": 0, "train_loss": 4.2980697453022, "train_ppl": 73.55767153669413, "lr": 0.00056, "grad_norm": 0.726, "tokens_per_sec": 148568, "dt_s": 4.411, "eta_s": 39936, "world_size": 1, "timestamp": "2026-05-04T20:56:51.618403"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4390, "epoch": 0, "train_loss": 4.307597041130066, "train_ppl": 74.26182625267872, "lr": 0.00056, "grad_norm": 0.6156, "tokens_per_sec": 147246, "dt_s": 4.451, "eta_s": 39997, "world_size": 1, "timestamp": "2026-05-04T20:56:56.069212"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4400, "epoch": 0, "train_loss": 4.390267223119736, "train_ppl": 80.66197084426365, "lr": 0.00056, "grad_norm": 0.6332, "tokens_per_sec": 149830, "dt_s": 4.374, "eta_s": 39977, "world_size": 1, "timestamp": "2026-05-04T20:57:00.443232"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4410, "epoch": 0, "train_loss": 4.323569685220718, "train_ppl": 75.45750764901699, "lr": 0.00056, "grad_norm": 0.7448, "tokens_per_sec": 148255, "dt_s": 4.42, "eta_s": 39838, "world_size": 1, "timestamp": "2026-05-04T20:57:04.863698"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4420, "epoch": 0, "train_loss": 4.325176447629929, "train_ppl": 75.57884739176373, "lr": 0.00056, "grad_norm": 0.6268, "tokens_per_sec": 147880, "dt_s": 4.432, "eta_s": 38985, "world_size": 1, "timestamp": "2026-05-04T20:57:09.295402"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4430, "epoch": 0, "train_loss": 4.3199352622032166, "train_ppl": 75.183760903438, "lr": 0.00056, "grad_norm": 0.7206, "tokens_per_sec": 147185, "dt_s": 4.453, "eta_s": 39054, "world_size": 1, "timestamp": "2026-05-04T20:57:13.748026"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4440, "epoch": 0, "train_loss": 4.329987525939941, "train_ppl": 75.94333924150408, "lr": 0.00056, "grad_norm": 0.6731, "tokens_per_sec": 146898, "dt_s": 4.461, "eta_s": 39068, "world_size": 1, "timestamp": "2026-05-04T20:57:18.209371"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4450, "epoch": 0, "train_loss": 4.284863829612732, "train_ppl": 72.59266107593217, "lr": 0.00056, "grad_norm": 0.6608, "tokens_per_sec": 150093, "dt_s": 4.366, "eta_s": 39050, "world_size": 1, "timestamp": "2026-05-04T20:57:22.575718"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4460, "epoch": 0, "train_loss": 4.3836424350738525, "train_ppl": 80.12936852070503, "lr": 0.00056, "grad_norm": 0.632, "tokens_per_sec": 149686, "dt_s": 4.378, "eta_s": 38972, "world_size": 1, "timestamp": "2026-05-04T20:57:26.953963"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4470, "epoch": 0, "train_loss": 4.260209530591965, "train_ppl": 70.82482186652312, "lr": 0.00056, "grad_norm": 0.6106, "tokens_per_sec": 144596, "dt_s": 4.532, "eta_s": 39145, "world_size": 1, "timestamp": "2026-05-04T20:57:31.486317"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4480, "epoch": 0, "train_loss": 4.317172467708588, "train_ppl": 74.97633029873853, "lr": 0.00056, "grad_norm": 0.6729, "tokens_per_sec": 149584, "dt_s": 4.381, "eta_s": 39014, "world_size": 1, "timestamp": "2026-05-04T20:57:35.867516"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4490, "epoch": 0, "train_loss": 4.243657186627388, "train_ppl": 69.66215403819594, "lr": 0.00056, "grad_norm": 0.7161, "tokens_per_sec": 148667, "dt_s": 4.408, "eta_s": 38916, "world_size": 1, "timestamp": "2026-05-04T20:57:40.275770"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4500, "epoch": 0, "train_loss": 4.408740818500519, "train_ppl": 82.16593651110098, "lr": 0.00056, "grad_norm": 0.6065, "tokens_per_sec": 143667, "dt_s": 4.562, "eta_s": 39256, "world_size": 1, "timestamp": "2026-05-04T20:57:44.837449"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4510, "epoch": 0, "train_loss": 4.271683156490326, "train_ppl": 71.64211909223593, "lr": 0.00056, "grad_norm": 0.5977, "tokens_per_sec": 108443, "dt_s": 6.043, "eta_s": 39369, "world_size": 1, "timestamp": "2026-05-04T20:57:50.880800"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4520, "epoch": 0, "train_loss": 4.2726351618766785, "train_ppl": 71.71035525094315, "lr": 0.00056, "grad_norm": 0.5848, "tokens_per_sec": 141117, "dt_s": 4.644, "eta_s": 39562, "world_size": 1, "timestamp": "2026-05-04T20:57:55.524894"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4530, "epoch": 0, "train_loss": 4.226913571357727, "train_ppl": 68.50546830538781, "lr": 0.00056, "grad_norm": 0.6916, "tokens_per_sec": 148511, "dt_s": 4.413, "eta_s": 39613, "world_size": 1, "timestamp": "2026-05-04T20:57:59.937794"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4540, "epoch": 0, "train_loss": 4.226975530385971, "train_ppl": 68.50971296912968, "lr": 0.00056, "grad_norm": 0.6204, "tokens_per_sec": 148506, "dt_s": 4.413, "eta_s": 39617, "world_size": 1, "timestamp": "2026-05-04T20:58:04.350813"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4550, "epoch": 0, "train_loss": 4.3171166479587555, "train_ppl": 74.97214525554304, "lr": 0.00056, "grad_norm": 0.6151, "tokens_per_sec": 144070, "dt_s": 4.549, "eta_s": 39590, "world_size": 1, "timestamp": "2026-05-04T20:58:08.899682"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4560, "epoch": 0, "train_loss": 4.262698829174042, "train_ppl": 71.00134561417465, "lr": 0.00056, "grad_norm": 0.6484, "tokens_per_sec": 150144, "dt_s": 4.365, "eta_s": 39445, "world_size": 1, "timestamp": "2026-05-04T20:58:13.264566"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4570, "epoch": 0, "train_loss": 4.309691205620766, "train_ppl": 74.41750568450595, "lr": 0.00056, "grad_norm": 0.6861, "tokens_per_sec": 147043, "dt_s": 4.457, "eta_s": 39110, "world_size": 1, "timestamp": "2026-05-04T20:58:17.721477"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4580, "epoch": 0, "train_loss": 4.218689501285553, "train_ppl": 67.94438489079437, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 146971, "dt_s": 4.459, "eta_s": 39187, "world_size": 1, "timestamp": "2026-05-04T20:58:22.180580"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4590, "epoch": 0, "train_loss": 4.213250130414963, "train_ppl": 67.5758134898059, "lr": 0.00056, "grad_norm": 0.7035, "tokens_per_sec": 148454, "dt_s": 4.415, "eta_s": 39186, "world_size": 1, "timestamp": "2026-05-04T20:58:26.595158"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4600, "epoch": 0, "train_loss": 4.234935104846954, "train_ppl": 69.05719711072655, "lr": 0.00056, "grad_norm": 0.5926, "tokens_per_sec": 148109, "dt_s": 4.425, "eta_s": 38963, "world_size": 1, "timestamp": "2026-05-04T20:58:31.020007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4610, "epoch": 0, "train_loss": 4.303822576999664, "train_ppl": 73.98205597681275, "lr": 0.00056, "grad_norm": 0.8209, "tokens_per_sec": 148146, "dt_s": 4.424, "eta_s": 39062, "world_size": 1, "timestamp": "2026-05-04T20:58:35.443775"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4620, "epoch": 0, "train_loss": 4.435318559408188, "train_ppl": 84.37900033175985, "lr": 0.00056, "grad_norm": 0.6163, "tokens_per_sec": 148929, "dt_s": 4.4, "eta_s": 38958, "world_size": 1, "timestamp": "2026-05-04T20:58:39.844276"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4630, "epoch": 0, "train_loss": 4.3053664565086365, "train_ppl": 74.09636357293971, "lr": 0.00056, "grad_norm": 0.6382, "tokens_per_sec": 145528, "dt_s": 4.503, "eta_s": 39031, "world_size": 1, "timestamp": "2026-05-04T20:58:44.347583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4640, "epoch": 0, "train_loss": 4.308509200811386, "train_ppl": 74.32959580007311, "lr": 0.00056, "grad_norm": 0.6715, "tokens_per_sec": 148506, "dt_s": 4.413, "eta_s": 39024, "world_size": 1, "timestamp": "2026-05-04T20:58:48.760620"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4650, "epoch": 0, "train_loss": 4.254397660493851, "train_ppl": 70.41439104271697, "lr": 0.00056, "grad_norm": 0.6449, "tokens_per_sec": 148785, "dt_s": 4.405, "eta_s": 38985, "world_size": 1, "timestamp": "2026-05-04T20:58:53.165363"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4660, "epoch": 0, "train_loss": 4.316674739122391, "train_ppl": 74.93902172140365, "lr": 0.00056, "grad_norm": 0.7068, "tokens_per_sec": 127634, "dt_s": 5.135, "eta_s": 40232, "world_size": 1, "timestamp": "2026-05-04T20:58:58.300032"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4670, "epoch": 0, "train_loss": 4.357365906238556, "train_ppl": 78.05126904689568, "lr": 0.00056, "grad_norm": 0.6236, "tokens_per_sec": 148395, "dt_s": 4.416, "eta_s": 40255, "world_size": 1, "timestamp": "2026-05-04T20:59:02.716361"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4680, "epoch": 0, "train_loss": 4.241868197917938, "train_ppl": 69.5376406408852, "lr": 0.00056, "grad_norm": 0.6384, "tokens_per_sec": 149547, "dt_s": 4.382, "eta_s": 40037, "world_size": 1, "timestamp": "2026-05-04T20:59:07.098656"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4690, "epoch": 0, "train_loss": 4.361058056354523, "train_ppl": 78.33997870067594, "lr": 0.00056, "grad_norm": 0.593, "tokens_per_sec": 145117, "dt_s": 4.516, "eta_s": 40214, "world_size": 1, "timestamp": "2026-05-04T20:59:11.614751"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4700, "epoch": 0, "train_loss": 4.26446507871151, "train_ppl": 71.12686252248533, "lr": 0.00056, "grad_norm": 0.6475, "tokens_per_sec": 147265, "dt_s": 4.45, "eta_s": 40290, "world_size": 1, "timestamp": "2026-05-04T20:59:16.064944"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4710, "epoch": 0, "train_loss": 4.395565003156662, "train_ppl": 81.09043417325623, "lr": 0.00056, "grad_norm": 0.6002, "tokens_per_sec": 146562, "dt_s": 4.472, "eta_s": 39118, "world_size": 1, "timestamp": "2026-05-04T20:59:20.536497"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4720, "epoch": 0, "train_loss": 4.2673525512218475, "train_ppl": 71.33253617837904, "lr": 0.00056, "grad_norm": 0.7777, "tokens_per_sec": 144170, "dt_s": 4.546, "eta_s": 39342, "world_size": 1, "timestamp": "2026-05-04T20:59:25.082272"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4730, "epoch": 0, "train_loss": 4.192655608057976, "train_ppl": 66.19835462706173, "lr": 0.00056, "grad_norm": 0.677, "tokens_per_sec": 149871, "dt_s": 4.373, "eta_s": 39321, "world_size": 1, "timestamp": "2026-05-04T20:59:29.455094"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4740, "epoch": 0, "train_loss": 4.383620858192444, "train_ppl": 80.12763959747556, "lr": 0.00056, "grad_norm": 0.6586, "tokens_per_sec": 145856, "dt_s": 4.493, "eta_s": 39276, "world_size": 1, "timestamp": "2026-05-04T20:59:33.948317"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4750, "epoch": 0, "train_loss": 4.310334458947182, "train_ppl": 74.46539039192864, "lr": 0.00056, "grad_norm": 0.645, "tokens_per_sec": 148906, "dt_s": 4.401, "eta_s": 39185, "world_size": 1, "timestamp": "2026-05-04T20:59:38.349457"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4760, "epoch": 0, "train_loss": 4.303843170404434, "train_ppl": 73.98357953492476, "lr": 0.00056, "grad_norm": 0.6495, "tokens_per_sec": 149707, "dt_s": 4.378, "eta_s": 39015, "world_size": 1, "timestamp": "2026-05-04T20:59:42.727096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4770, "epoch": 0, "train_loss": 4.253696411848068, "train_ppl": 70.36503035543498, "lr": 0.00056, "grad_norm": 0.7391, "tokens_per_sec": 146790, "dt_s": 4.465, "eta_s": 38868, "world_size": 1, "timestamp": "2026-05-04T20:59:47.191702"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4780, "epoch": 0, "train_loss": 4.2175465524196625, "train_ppl": 67.86677229520285, "lr": 0.00056, "grad_norm": 0.6272, "tokens_per_sec": 148252, "dt_s": 4.421, "eta_s": 38948, "world_size": 1, "timestamp": "2026-05-04T20:59:51.612280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4790, "epoch": 0, "train_loss": 4.336539134383202, "train_ppl": 76.44252370911063, "lr": 0.00056, "grad_norm": 0.6243, "tokens_per_sec": 148434, "dt_s": 4.415, "eta_s": 38806, "world_size": 1, "timestamp": "2026-05-04T20:59:56.027407"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4800, "epoch": 0, "train_loss": 4.331640541553497, "train_ppl": 76.0689785803015, "lr": 0.00056, "grad_norm": 0.6407, "tokens_per_sec": 143811, "dt_s": 4.557, "eta_s": 39076, "world_size": 1, "timestamp": "2026-05-04T21:00:00.584531"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4810, "epoch": 0, "train_loss": 4.3400397300720215, "train_ppl": 76.7105869948995, "lr": 0.00056, "grad_norm": 0.6347, "tokens_per_sec": 149594, "dt_s": 4.381, "eta_s": 39077, "world_size": 1, "timestamp": "2026-05-04T21:00:04.965446"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4820, "epoch": 0, "train_loss": 4.212261006236076, "train_ppl": 67.50900566483959, "lr": 0.00056, "grad_norm": 0.6133, "tokens_per_sec": 148200, "dt_s": 4.422, "eta_s": 38998, "world_size": 1, "timestamp": "2026-05-04T21:00:09.387571"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4830, "epoch": 0, "train_loss": 4.224258571863174, "train_ppl": 68.32382755641041, "lr": 0.00056, "grad_norm": 0.6293, "tokens_per_sec": 147624, "dt_s": 4.439, "eta_s": 39027, "world_size": 1, "timestamp": "2026-05-04T21:00:13.826948"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4840, "epoch": 0, "train_loss": 4.22142718732357, "train_ppl": 68.13065013621546, "lr": 0.00056, "grad_norm": 0.6391, "tokens_per_sec": 149827, "dt_s": 4.374, "eta_s": 38950, "world_size": 1, "timestamp": "2026-05-04T21:00:18.201059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4850, "epoch": 0, "train_loss": 4.2879675924777985, "train_ppl": 72.81832149872085, "lr": 0.00056, "grad_norm": 0.6482, "tokens_per_sec": 146226, "dt_s": 4.482, "eta_s": 38813, "world_size": 1, "timestamp": "2026-05-04T21:00:22.682901"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4860, "epoch": 0, "train_loss": 4.2024092227220535, "train_ppl": 66.8471869559767, "lr": 0.00056, "grad_norm": 0.6509, "tokens_per_sec": 148795, "dt_s": 4.404, "eta_s": 38850, "world_size": 1, "timestamp": "2026-05-04T21:00:27.087352"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4870, "epoch": 0, "train_loss": 4.190324783325195, "train_ppl": 66.04423754455465, "lr": 0.00056, "grad_norm": 0.6888, "tokens_per_sec": 147792, "dt_s": 4.434, "eta_s": 38868, "world_size": 1, "timestamp": "2026-05-04T21:00:31.521685"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4880, "epoch": 0, "train_loss": 4.347435384988785, "train_ppl": 77.28001507613335, "lr": 0.00056, "grad_norm": 0.7232, "tokens_per_sec": 146778, "dt_s": 4.465, "eta_s": 38908, "world_size": 1, "timestamp": "2026-05-04T21:00:35.986666"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4890, "epoch": 0, "train_loss": 4.294275015592575, "train_ppl": 73.27906900012668, "lr": 0.00056, "grad_norm": 0.65, "tokens_per_sec": 148553, "dt_s": 4.412, "eta_s": 38970, "world_size": 1, "timestamp": "2026-05-04T21:00:40.398286"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4900, "epoch": 0, "train_loss": 4.367116183042526, "train_ppl": 78.81601269760532, "lr": 0.00056, "grad_norm": 0.658, "tokens_per_sec": 150186, "dt_s": 4.364, "eta_s": 38758, "world_size": 1, "timestamp": "2026-05-04T21:00:44.761915"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4910, "epoch": 0, "train_loss": 4.255106538534164, "train_ppl": 70.46432395433051, "lr": 0.00056, "grad_norm": 0.6179, "tokens_per_sec": 147596, "dt_s": 4.44, "eta_s": 38816, "world_size": 1, "timestamp": "2026-05-04T21:00:49.202165"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4920, "epoch": 0, "train_loss": 4.287682354450226, "train_ppl": 72.79755390631975, "lr": 0.00056, "grad_norm": 0.6562, "tokens_per_sec": 148909, "dt_s": 4.401, "eta_s": 38753, "world_size": 1, "timestamp": "2026-05-04T21:00:53.603255"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4930, "epoch": 0, "train_loss": 4.39381967484951, "train_ppl": 80.94902817889236, "lr": 0.00056, "grad_norm": 0.6738, "tokens_per_sec": 146848, "dt_s": 4.463, "eta_s": 38745, "world_size": 1, "timestamp": "2026-05-04T21:00:58.066097"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4940, "epoch": 0, "train_loss": 4.325668752193451, "train_ppl": 75.61606436353131, "lr": 0.00056, "grad_norm": 0.6172, "tokens_per_sec": 145268, "dt_s": 4.511, "eta_s": 38915, "world_size": 1, "timestamp": "2026-05-04T21:01:02.577461"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4950, "epoch": 0, "train_loss": 4.316487550735474, "train_ppl": 74.9249953196412, "lr": 0.00056, "grad_norm": 0.5936, "tokens_per_sec": 148254, "dt_s": 4.421, "eta_s": 39011, "world_size": 1, "timestamp": "2026-05-04T21:01:06.997998"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4960, "epoch": 0, "train_loss": 4.28408670425415, "train_ppl": 72.5362693927047, "lr": 0.00056, "grad_norm": 0.6933, "tokens_per_sec": 130808, "dt_s": 5.01, "eta_s": 40006, "world_size": 1, "timestamp": "2026-05-04T21:01:12.008092"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4970, "epoch": 0, "train_loss": 4.220591112971306, "train_ppl": 68.07371165275308, "lr": 0.00056, "grad_norm": 0.5803, "tokens_per_sec": 149382, "dt_s": 4.387, "eta_s": 39977, "world_size": 1, "timestamp": "2026-05-04T21:01:16.395242"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4980, "epoch": 0, "train_loss": 4.399163872003555, "train_ppl": 81.38279377696844, "lr": 0.00056, "grad_norm": 0.6936, "tokens_per_sec": 149561, "dt_s": 4.382, "eta_s": 39831, "world_size": 1, "timestamp": "2026-05-04T21:01:20.777137"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 4990, "epoch": 0, "train_loss": 4.1973157078027725, "train_ppl": 66.50756547969097, "lr": 0.00056, "grad_norm": 0.6732, "tokens_per_sec": 147007, "dt_s": 4.458, "eta_s": 39733, "world_size": 1, "timestamp": "2026-05-04T21:01:25.235142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5000, "epoch": 0, "train_loss": 4.2251458168029785, "train_ppl": 68.38447442701396, "lr": 0.00056, "grad_norm": 0.7638, "tokens_per_sec": 149014, "dt_s": 4.398, "eta_s": 39689, "world_size": 1, "timestamp": "2026-05-04T21:01:29.633108"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5010, "epoch": 0, "train_loss": 4.276016011834145, "train_ppl": 71.95320749462236, "lr": 0.00056, "grad_norm": 0.7296, "tokens_per_sec": 107503, "dt_s": 6.096, "eta_s": 38720, "world_size": 1, "timestamp": "2026-05-04T21:01:35.729323"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5020, "epoch": 0, "train_loss": 4.2302165031433105, "train_ppl": 68.73211128124034, "lr": 0.00056, "grad_norm": 0.6247, "tokens_per_sec": 145165, "dt_s": 4.515, "eta_s": 38939, "world_size": 1, "timestamp": "2026-05-04T21:01:40.243886"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5030, "epoch": 0, "train_loss": 4.220630243420601, "train_ppl": 68.07637545979287, "lr": 0.00056, "grad_norm": 0.5913, "tokens_per_sec": 149285, "dt_s": 4.39, "eta_s": 38949, "world_size": 1, "timestamp": "2026-05-04T21:01:44.633874"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5040, "epoch": 0, "train_loss": 4.221027672290802, "train_ppl": 68.10343635381332, "lr": 0.00056, "grad_norm": 0.649, "tokens_per_sec": 146767, "dt_s": 4.465, "eta_s": 38957, "world_size": 1, "timestamp": "2026-05-04T21:01:49.099171"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5050, "epoch": 0, "train_loss": 4.208939656615257, "train_ppl": 67.28515660075486, "lr": 0.00056, "grad_norm": 0.691, "tokens_per_sec": 150198, "dt_s": 4.363, "eta_s": 38892, "world_size": 1, "timestamp": "2026-05-04T21:01:53.462494"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5060, "epoch": 0, "train_loss": 4.285094618797302, "train_ppl": 72.60941661041137, "lr": 0.00056, "grad_norm": 0.6836, "tokens_per_sec": 149947, "dt_s": 4.371, "eta_s": 38730, "world_size": 1, "timestamp": "2026-05-04T21:01:57.833072"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5070, "epoch": 0, "train_loss": 4.270564243197441, "train_ppl": 71.56200260291423, "lr": 0.00056, "grad_norm": 0.6339, "tokens_per_sec": 146995, "dt_s": 4.458, "eta_s": 38627, "world_size": 1, "timestamp": "2026-05-04T21:02:02.291456"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5080, "epoch": 0, "train_loss": 4.252205342054367, "train_ppl": 70.26018936617956, "lr": 0.00056, "grad_norm": 0.6543, "tokens_per_sec": 149973, "dt_s": 4.37, "eta_s": 38588, "world_size": 1, "timestamp": "2026-05-04T21:02:06.661348"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5090, "epoch": 0, "train_loss": 4.162865236401558, "train_ppl": 64.25536591029271, "lr": 0.00056, "grad_norm": 0.6305, "tokens_per_sec": 149050, "dt_s": 4.397, "eta_s": 38464, "world_size": 1, "timestamp": "2026-05-04T21:02:11.058250"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5100, "epoch": 0, "train_loss": 4.24941186606884, "train_ppl": 70.06419309733654, "lr": 0.00056, "grad_norm": 0.6351, "tokens_per_sec": 147761, "dt_s": 4.435, "eta_s": 38585, "world_size": 1, "timestamp": "2026-05-04T21:02:15.493495"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5110, "epoch": 0, "train_loss": 4.176509842276573, "train_ppl": 65.13811373110258, "lr": 0.00056, "grad_norm": 0.6733, "tokens_per_sec": 149657, "dt_s": 4.379, "eta_s": 38596, "world_size": 1, "timestamp": "2026-05-04T21:02:19.872582"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5120, "epoch": 0, "train_loss": 4.290759861469269, "train_ppl": 73.02193397796545, "lr": 0.00056, "grad_norm": 0.596, "tokens_per_sec": 150393, "dt_s": 4.358, "eta_s": 38415, "world_size": 1, "timestamp": "2026-05-04T21:02:24.230217"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5130, "epoch": 0, "train_loss": 4.510385096073151, "train_ppl": 90.9568388884739, "lr": 0.00056, "grad_norm": 1.552, "tokens_per_sec": 147027, "dt_s": 4.457, "eta_s": 38564, "world_size": 1, "timestamp": "2026-05-04T21:02:28.687630"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5140, "epoch": 0, "train_loss": 4.409415900707245, "train_ppl": 82.22142400003955, "lr": 0.00056, "grad_norm": 0.7029, "tokens_per_sec": 147984, "dt_s": 4.429, "eta_s": 38615, "world_size": 1, "timestamp": "2026-05-04T21:02:33.116223"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5150, "epoch": 0, "train_loss": 4.2184188812971115, "train_ppl": 67.92600026987571, "lr": 0.00056, "grad_norm": 0.6827, "tokens_per_sec": 146599, "dt_s": 4.47, "eta_s": 38672, "world_size": 1, "timestamp": "2026-05-04T21:02:37.586623"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5160, "epoch": 0, "train_loss": 4.165805831551552, "train_ppl": 64.4445930114421, "lr": 0.00056, "grad_norm": 0.6323, "tokens_per_sec": 148306, "dt_s": 4.419, "eta_s": 38737, "world_size": 1, "timestamp": "2026-05-04T21:02:42.005601"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5170, "epoch": 0, "train_loss": 4.240254610776901, "train_ppl": 69.42552607576502, "lr": 0.00056, "grad_norm": 0.7107, "tokens_per_sec": 149221, "dt_s": 4.392, "eta_s": 38793, "world_size": 1, "timestamp": "2026-05-04T21:02:46.397463"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5180, "epoch": 0, "train_loss": 4.234516397118568, "train_ppl": 69.0282883811722, "lr": 0.00056, "grad_norm": 0.6749, "tokens_per_sec": 143862, "dt_s": 4.555, "eta_s": 38960, "world_size": 1, "timestamp": "2026-05-04T21:02:50.952961"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5190, "epoch": 0, "train_loss": 4.215909838676453, "train_ppl": 67.75578466854337, "lr": 0.00056, "grad_norm": 0.6081, "tokens_per_sec": 148695, "dt_s": 4.407, "eta_s": 38918, "world_size": 1, "timestamp": "2026-05-04T21:02:55.360382"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5200, "epoch": 0, "train_loss": 4.179803714156151, "train_ppl": 65.35302408148263, "lr": 0.00056, "grad_norm": 0.59, "tokens_per_sec": 147337, "dt_s": 4.448, "eta_s": 38875, "world_size": 1, "timestamp": "2026-05-04T21:02:59.808414"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5210, "epoch": 0, "train_loss": 4.219574108719826, "train_ppl": 68.00451559089686, "lr": 0.00056, "grad_norm": 0.5967, "tokens_per_sec": 146038, "dt_s": 4.488, "eta_s": 38990, "world_size": 1, "timestamp": "2026-05-04T21:03:04.295990"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5220, "epoch": 0, "train_loss": 4.151086524128914, "train_ppl": 63.50296032662407, "lr": 0.00056, "grad_norm": 0.6134, "tokens_per_sec": 149213, "dt_s": 4.392, "eta_s": 38986, "world_size": 1, "timestamp": "2026-05-04T21:03:08.688114"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5230, "epoch": 0, "train_loss": 4.188400536775589, "train_ppl": 65.91727434176643, "lr": 0.00056, "grad_norm": 0.6737, "tokens_per_sec": 147150, "dt_s": 4.454, "eta_s": 38804, "world_size": 1, "timestamp": "2026-05-04T21:03:13.141838"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5240, "epoch": 0, "train_loss": 4.35436749458313, "train_ppl": 77.8175897205465, "lr": 0.00056, "grad_norm": 0.6504, "tokens_per_sec": 144407, "dt_s": 4.538, "eta_s": 39028, "world_size": 1, "timestamp": "2026-05-04T21:03:17.680110"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5250, "epoch": 0, "train_loss": 4.234778717160225, "train_ppl": 69.04639825984404, "lr": 0.00056, "grad_norm": 0.6606, "tokens_per_sec": 147167, "dt_s": 4.453, "eta_s": 39033, "world_size": 1, "timestamp": "2026-05-04T21:03:22.133283"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5260, "epoch": 0, "train_loss": 4.103359118103981, "train_ppl": 60.54331856141196, "lr": 0.00056, "grad_norm": 0.6288, "tokens_per_sec": 130223, "dt_s": 5.033, "eta_s": 39981, "world_size": 1, "timestamp": "2026-05-04T21:03:27.165872"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5270, "epoch": 0, "train_loss": 4.247607409954071, "train_ppl": 69.93787933377101, "lr": 0.00056, "grad_norm": 0.6632, "tokens_per_sec": 147969, "dt_s": 4.429, "eta_s": 40041, "world_size": 1, "timestamp": "2026-05-04T21:03:31.594905"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5280, "epoch": 0, "train_loss": 4.240450128912926, "train_ppl": 69.43910135227901, "lr": 0.00056, "grad_norm": 0.6406, "tokens_per_sec": 147643, "dt_s": 4.439, "eta_s": 40010, "world_size": 1, "timestamp": "2026-05-04T21:03:36.033721"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5290, "epoch": 0, "train_loss": 4.252002626657486, "train_ppl": 70.24594798752938, "lr": 0.00056, "grad_norm": 0.6561, "tokens_per_sec": 143915, "dt_s": 4.554, "eta_s": 40033, "world_size": 1, "timestamp": "2026-05-04T21:03:40.587548"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5300, "epoch": 0, "train_loss": 4.297153532505035, "train_ppl": 73.49030792111823, "lr": 0.00056, "grad_norm": 0.6342, "tokens_per_sec": 148524, "dt_s": 4.412, "eta_s": 39957, "world_size": 1, "timestamp": "2026-05-04T21:03:45.000033"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5310, "epoch": 0, "train_loss": 4.156032413244247, "train_ppl": 63.817816907934045, "lr": 0.00056, "grad_norm": 0.6689, "tokens_per_sec": 146406, "dt_s": 4.476, "eta_s": 38981, "world_size": 1, "timestamp": "2026-05-04T21:03:49.476373"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5320, "epoch": 0, "train_loss": 4.293216645717621, "train_ppl": 73.20155366821403, "lr": 0.00056, "grad_norm": 0.6654, "tokens_per_sec": 147693, "dt_s": 4.437, "eta_s": 38991, "world_size": 1, "timestamp": "2026-05-04T21:03:53.913681"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5330, "epoch": 0, "train_loss": 4.214258149266243, "train_ppl": 67.64396552719265, "lr": 0.00056, "grad_norm": 0.6571, "tokens_per_sec": 149068, "dt_s": 4.396, "eta_s": 38912, "world_size": 1, "timestamp": "2026-05-04T21:03:58.310049"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5340, "epoch": 0, "train_loss": 4.184565469622612, "train_ppl": 65.66496129610752, "lr": 0.00056, "grad_norm": 0.644, "tokens_per_sec": 147580, "dt_s": 4.441, "eta_s": 38710, "world_size": 1, "timestamp": "2026-05-04T21:04:02.750747"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5350, "epoch": 0, "train_loss": 4.260554388165474, "train_ppl": 70.84925055470319, "lr": 0.00056, "grad_norm": 0.6837, "tokens_per_sec": 147869, "dt_s": 4.432, "eta_s": 38740, "world_size": 1, "timestamp": "2026-05-04T21:04:07.182783"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5360, "epoch": 0, "train_loss": 4.269705444574356, "train_ppl": 71.50057163580425, "lr": 0.00056, "grad_norm": 0.9052, "tokens_per_sec": 148524, "dt_s": 4.412, "eta_s": 38624, "world_size": 1, "timestamp": "2026-05-04T21:04:11.595256"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5370, "epoch": 0, "train_loss": 4.297957628965378, "train_ppl": 73.54942498231162, "lr": 0.00056, "grad_norm": 0.6667, "tokens_per_sec": 145941, "dt_s": 4.491, "eta_s": 38712, "world_size": 1, "timestamp": "2026-05-04T21:04:16.085887"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5380, "epoch": 0, "train_loss": 4.162021219730377, "train_ppl": 64.20115619042335, "lr": 0.00056, "grad_norm": 0.649, "tokens_per_sec": 148069, "dt_s": 4.426, "eta_s": 38760, "world_size": 1, "timestamp": "2026-05-04T21:04:20.511850"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5390, "epoch": 0, "train_loss": 4.199293777346611, "train_ppl": 66.63925226927687, "lr": 0.00056, "grad_norm": 0.627, "tokens_per_sec": 149023, "dt_s": 4.398, "eta_s": 38680, "world_size": 1, "timestamp": "2026-05-04T21:04:24.909580"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5400, "epoch": 0, "train_loss": 4.406953871250153, "train_ppl": 82.01924142400529, "lr": 0.00056, "grad_norm": 0.7119, "tokens_per_sec": 146544, "dt_s": 4.472, "eta_s": 38746, "world_size": 1, "timestamp": "2026-05-04T21:04:29.381690"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5410, "epoch": 0, "train_loss": 4.338565215468407, "train_ppl": 76.59755946497404, "lr": 0.00056, "grad_norm": 0.6494, "tokens_per_sec": 149595, "dt_s": 4.381, "eta_s": 38686, "world_size": 1, "timestamp": "2026-05-04T21:04:33.762557"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5420, "epoch": 0, "train_loss": 4.305461689829826, "train_ppl": 74.10342035174577, "lr": 0.00056, "grad_norm": 0.6208, "tokens_per_sec": 148853, "dt_s": 4.403, "eta_s": 38528, "world_size": 1, "timestamp": "2026-05-04T21:04:38.165315"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5430, "epoch": 0, "train_loss": 4.158201307058334, "train_ppl": 63.95638118754029, "lr": 0.00056, "grad_norm": 0.6474, "tokens_per_sec": 146191, "dt_s": 4.483, "eta_s": 38623, "world_size": 1, "timestamp": "2026-05-04T21:04:42.648227"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5440, "epoch": 0, "train_loss": 4.233456373214722, "train_ppl": 68.95515551358841, "lr": 0.00056, "grad_norm": 0.6381, "tokens_per_sec": 150163, "dt_s": 4.364, "eta_s": 38561, "world_size": 1, "timestamp": "2026-05-04T21:04:47.012539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5450, "epoch": 0, "train_loss": 4.289284840226173, "train_ppl": 72.91430447155423, "lr": 0.00056, "grad_norm": 0.6364, "tokens_per_sec": 147547, "dt_s": 4.442, "eta_s": 38503, "world_size": 1, "timestamp": "2026-05-04T21:04:51.454267"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5460, "epoch": 0, "train_loss": 4.17428931593895, "train_ppl": 64.9936333046419, "lr": 0.00056, "grad_norm": 0.6649, "tokens_per_sec": 146323, "dt_s": 4.479, "eta_s": 38670, "world_size": 1, "timestamp": "2026-05-04T21:04:55.933107"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5470, "epoch": 0, "train_loss": 4.2667533457279205, "train_ppl": 71.28980613412236, "lr": 0.00056, "grad_norm": 0.6279, "tokens_per_sec": 149503, "dt_s": 4.384, "eta_s": 38632, "world_size": 1, "timestamp": "2026-05-04T21:05:00.316702"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5480, "epoch": 0, "train_loss": 4.129506558179855, "train_ppl": 62.14724931574064, "lr": 0.00056, "grad_norm": 0.6613, "tokens_per_sec": 144842, "dt_s": 4.525, "eta_s": 38700, "world_size": 1, "timestamp": "2026-05-04T21:05:04.841360"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5490, "epoch": 0, "train_loss": 4.191029578447342, "train_ppl": 66.0908016081569, "lr": 0.00056, "grad_norm": 0.6171, "tokens_per_sec": 148498, "dt_s": 4.413, "eta_s": 38781, "world_size": 1, "timestamp": "2026-05-04T21:05:09.254609"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5500, "epoch": 0, "train_loss": 4.268797442317009, "train_ppl": 71.43567842140844, "lr": 0.00056, "grad_norm": 0.7154, "tokens_per_sec": 147719, "dt_s": 4.437, "eta_s": 38768, "world_size": 1, "timestamp": "2026-05-04T21:05:13.691150"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5510, "epoch": 0, "train_loss": 4.225894525647163, "train_ppl": 68.43569365957596, "lr": 0.00056, "grad_norm": 0.6418, "tokens_per_sec": 123888, "dt_s": 5.29, "eta_s": 38819, "world_size": 1, "timestamp": "2026-05-04T21:05:18.981085"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5520, "epoch": 0, "train_loss": 4.2957737147808075, "train_ppl": 73.38897461851055, "lr": 0.00056, "grad_norm": 0.6507, "tokens_per_sec": 145422, "dt_s": 4.507, "eta_s": 39029, "world_size": 1, "timestamp": "2026-05-04T21:05:23.487705"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5530, "epoch": 0, "train_loss": 4.259839713573456, "train_ppl": 70.79863448463222, "lr": 0.00056, "grad_norm": 0.6826, "tokens_per_sec": 146355, "dt_s": 4.478, "eta_s": 38943, "world_size": 1, "timestamp": "2026-05-04T21:05:27.965588"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5540, "epoch": 0, "train_loss": 4.209895968437195, "train_ppl": 67.34953296848765, "lr": 0.00056, "grad_norm": 0.6248, "tokens_per_sec": 147452, "dt_s": 4.445, "eta_s": 38993, "world_size": 1, "timestamp": "2026-05-04T21:05:32.410156"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5550, "epoch": 0, "train_loss": 4.146329089999199, "train_ppl": 63.201566674509614, "lr": 0.00056, "grad_norm": 0.6234, "tokens_per_sec": 134054, "dt_s": 4.889, "eta_s": 39777, "world_size": 1, "timestamp": "2026-05-04T21:05:37.298928"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5560, "epoch": 0, "train_loss": 4.059459000825882, "train_ppl": 57.942955506979544, "lr": 0.00056, "grad_norm": 0.6477, "tokens_per_sec": 146754, "dt_s": 4.466, "eta_s": 39693, "world_size": 1, "timestamp": "2026-05-04T21:05:41.764635"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5570, "epoch": 0, "train_loss": 4.309033185243607, "train_ppl": 74.36855355686326, "lr": 0.00056, "grad_norm": 0.6279, "tokens_per_sec": 149369, "dt_s": 4.388, "eta_s": 39482, "world_size": 1, "timestamp": "2026-05-04T21:05:46.152148"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5580, "epoch": 0, "train_loss": 4.168010652065277, "train_ppl": 64.58683852740414, "lr": 0.00056, "grad_norm": 0.6325, "tokens_per_sec": 148740, "dt_s": 4.406, "eta_s": 39352, "world_size": 1, "timestamp": "2026-05-04T21:05:50.558239"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5590, "epoch": 0, "train_loss": 4.310768634080887, "train_ppl": 74.497728432437, "lr": 0.00056, "grad_norm": 0.6851, "tokens_per_sec": 145061, "dt_s": 4.518, "eta_s": 39475, "world_size": 1, "timestamp": "2026-05-04T21:05:55.076049"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5600, "epoch": 0, "train_loss": 4.279965177178383, "train_ppl": 72.23792443516936, "lr": 0.00056, "grad_norm": 0.5998, "tokens_per_sec": 147562, "dt_s": 4.441, "eta_s": 38691, "world_size": 1, "timestamp": "2026-05-04T21:05:59.517312"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5610, "epoch": 0, "train_loss": 4.266209781169891, "train_ppl": 71.25106605198074, "lr": 0.00056, "grad_norm": 0.6333, "tokens_per_sec": 149429, "dt_s": 4.386, "eta_s": 38547, "world_size": 1, "timestamp": "2026-05-04T21:06:03.903086"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5620, "epoch": 0, "train_loss": 4.115268170833588, "train_ppl": 61.26864252280154, "lr": 0.00056, "grad_norm": 0.6782, "tokens_per_sec": 146632, "dt_s": 4.469, "eta_s": 38685, "world_size": 1, "timestamp": "2026-05-04T21:06:08.372497"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5630, "epoch": 0, "train_loss": 4.045636668801308, "train_ppl": 57.147558521883994, "lr": 0.00056, "grad_norm": 0.7113, "tokens_per_sec": 149924, "dt_s": 4.371, "eta_s": 38620, "world_size": 1, "timestamp": "2026-05-04T21:06:12.743791"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5640, "epoch": 0, "train_loss": 4.211455598473549, "train_ppl": 67.45465527768648, "lr": 0.00056, "grad_norm": 0.785, "tokens_per_sec": 148517, "dt_s": 4.413, "eta_s": 38433, "world_size": 1, "timestamp": "2026-05-04T21:06:17.156469"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5650, "epoch": 0, "train_loss": 4.25043623149395, "train_ppl": 70.13600120687683, "lr": 0.00056, "grad_norm": 0.5951, "tokens_per_sec": 146628, "dt_s": 4.47, "eta_s": 38478, "world_size": 1, "timestamp": "2026-05-04T21:06:21.626007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5660, "epoch": 0, "train_loss": 4.249726206064224, "train_ppl": 70.08622053734267, "lr": 0.00056, "grad_norm": 0.6225, "tokens_per_sec": 148993, "dt_s": 4.399, "eta_s": 38496, "world_size": 1, "timestamp": "2026-05-04T21:06:26.024596"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5670, "epoch": 0, "train_loss": 4.143714815378189, "train_ppl": 63.03655620806109, "lr": 0.00056, "grad_norm": 0.6374, "tokens_per_sec": 148917, "dt_s": 4.401, "eta_s": 38372, "world_size": 1, "timestamp": "2026-05-04T21:06:30.425456"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5680, "epoch": 0, "train_loss": 4.205345183610916, "train_ppl": 67.04373607148229, "lr": 0.00056, "grad_norm": 0.6406, "tokens_per_sec": 146035, "dt_s": 4.488, "eta_s": 38570, "world_size": 1, "timestamp": "2026-05-04T21:06:34.913141"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5690, "epoch": 0, "train_loss": 4.358521193265915, "train_ppl": 78.14149277260935, "lr": 0.00056, "grad_norm": 0.6897, "tokens_per_sec": 150110, "dt_s": 4.366, "eta_s": 38484, "world_size": 1, "timestamp": "2026-05-04T21:06:39.278998"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5700, "epoch": 0, "train_loss": 4.252868220210075, "train_ppl": 70.30677875076302, "lr": 0.00056, "grad_norm": 0.6096, "tokens_per_sec": 146434, "dt_s": 4.475, "eta_s": 38490, "world_size": 1, "timestamp": "2026-05-04T21:06:43.754455"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5710, "epoch": 0, "train_loss": 4.244481205940247, "train_ppl": 69.71958065557021, "lr": 0.00056, "grad_norm": 0.6119, "tokens_per_sec": 150325, "dt_s": 4.36, "eta_s": 38418, "world_size": 1, "timestamp": "2026-05-04T21:06:48.114074"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5720, "epoch": 0, "train_loss": 4.103241801261902, "train_ppl": 60.5362162270892, "lr": 0.00056, "grad_norm": 0.65, "tokens_per_sec": 146184, "dt_s": 4.483, "eta_s": 38556, "world_size": 1, "timestamp": "2026-05-04T21:06:52.597196"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5730, "epoch": 0, "train_loss": 4.206212401390076, "train_ppl": 67.10190280935394, "lr": 0.00056, "grad_norm": 0.6884, "tokens_per_sec": 143401, "dt_s": 4.57, "eta_s": 38695, "world_size": 1, "timestamp": "2026-05-04T21:06:57.167308"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5740, "epoch": 0, "train_loss": 4.235073506832123, "train_ppl": 69.06675542532643, "lr": 0.00056, "grad_norm": 0.6159, "tokens_per_sec": 146939, "dt_s": 4.46, "eta_s": 38855, "world_size": 1, "timestamp": "2026-05-04T21:07:01.627374"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5750, "epoch": 0, "train_loss": 4.204848200082779, "train_ppl": 67.0104247172728, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 145523, "dt_s": 4.503, "eta_s": 38899, "world_size": 1, "timestamp": "2026-05-04T21:07:06.130861"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5760, "epoch": 0, "train_loss": 4.15662594139576, "train_ppl": 63.85570582179317, "lr": 0.00056, "grad_norm": 0.6314, "tokens_per_sec": 145080, "dt_s": 4.517, "eta_s": 39168, "world_size": 1, "timestamp": "2026-05-04T21:07:10.648081"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5770, "epoch": 0, "train_loss": 4.211246684193611, "train_ppl": 67.44056450888336, "lr": 0.00056, "grad_norm": 0.6659, "tokens_per_sec": 147784, "dt_s": 4.435, "eta_s": 39080, "world_size": 1, "timestamp": "2026-05-04T21:07:15.082662"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5780, "epoch": 0, "train_loss": 4.200694724917412, "train_ppl": 66.73267579331319, "lr": 0.00056, "grad_norm": 0.6396, "tokens_per_sec": 147380, "dt_s": 4.447, "eta_s": 38861, "world_size": 1, "timestamp": "2026-05-04T21:07:19.529405"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5790, "epoch": 0, "train_loss": 4.173955366015434, "train_ppl": 64.97193230949534, "lr": 0.00056, "grad_norm": 0.6496, "tokens_per_sec": 144830, "dt_s": 4.525, "eta_s": 38969, "world_size": 1, "timestamp": "2026-05-04T21:07:24.054435"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5800, "epoch": 0, "train_loss": 4.239209890365601, "train_ppl": 69.35303368534179, "lr": 0.00056, "grad_norm": 0.6411, "tokens_per_sec": 145387, "dt_s": 4.508, "eta_s": 38972, "world_size": 1, "timestamp": "2026-05-04T21:07:28.562169"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5810, "epoch": 0, "train_loss": 4.131619215011597, "train_ppl": 62.27868391579813, "lr": 0.00056, "grad_norm": 0.6781, "tokens_per_sec": 143101, "dt_s": 4.58, "eta_s": 39076, "world_size": 1, "timestamp": "2026-05-04T21:07:33.141835"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5820, "epoch": 0, "train_loss": 4.226594462990761, "train_ppl": 68.4836111248587, "lr": 0.00056, "grad_norm": 0.6154, "tokens_per_sec": 147228, "dt_s": 4.451, "eta_s": 39100, "world_size": 1, "timestamp": "2026-05-04T21:07:37.593151"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5830, "epoch": 0, "train_loss": 4.103095427155495, "train_ppl": 60.52735594100774, "lr": 0.00056, "grad_norm": 0.7053, "tokens_per_sec": 149339, "dt_s": 4.388, "eta_s": 38994, "world_size": 1, "timestamp": "2026-05-04T21:07:41.981576"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5840, "epoch": 0, "train_loss": 4.308139219880104, "train_ppl": 74.30210035370624, "lr": 0.00056, "grad_norm": 1.0086, "tokens_per_sec": 142613, "dt_s": 4.595, "eta_s": 39112, "world_size": 1, "timestamp": "2026-05-04T21:07:46.576982"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5850, "epoch": 0, "train_loss": 4.167475447058678, "train_ppl": 64.55228057668288, "lr": 0.00056, "grad_norm": 0.6652, "tokens_per_sec": 130796, "dt_s": 5.011, "eta_s": 39981, "world_size": 1, "timestamp": "2026-05-04T21:07:51.587499"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5860, "epoch": 0, "train_loss": 4.01548570394516, "train_ppl": 55.45022125419148, "lr": 0.00056, "grad_norm": 0.6793, "tokens_per_sec": 145404, "dt_s": 4.507, "eta_s": 39851, "world_size": 1, "timestamp": "2026-05-04T21:07:56.094672"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5870, "epoch": 0, "train_loss": 4.211437910795212, "train_ppl": 67.45346217199327, "lr": 0.00056, "grad_norm": 0.6495, "tokens_per_sec": 139531, "dt_s": 4.697, "eta_s": 40272, "world_size": 1, "timestamp": "2026-05-04T21:08:00.791550"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5880, "epoch": 0, "train_loss": 4.149816110730171, "train_ppl": 63.42233653857916, "lr": 0.00056, "grad_norm": 0.667, "tokens_per_sec": 146885, "dt_s": 4.462, "eta_s": 40395, "world_size": 1, "timestamp": "2026-05-04T21:08:05.253275"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5890, "epoch": 0, "train_loss": 4.078993991017342, "train_ppl": 59.08599889510274, "lr": 0.00056, "grad_norm": 0.6326, "tokens_per_sec": 144321, "dt_s": 4.541, "eta_s": 40296, "world_size": 1, "timestamp": "2026-05-04T21:08:09.794307"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5900, "epoch": 0, "train_loss": 4.25205272436142, "train_ppl": 70.24946723638661, "lr": 0.00056, "grad_norm": 0.6806, "tokens_per_sec": 146404, "dt_s": 4.476, "eta_s": 39364, "world_size": 1, "timestamp": "2026-05-04T21:08:14.270667"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5910, "epoch": 0, "train_loss": 4.2083950489759445, "train_ppl": 67.24852256695053, "lr": 0.00056, "grad_norm": 0.6956, "tokens_per_sec": 147526, "dt_s": 4.442, "eta_s": 39247, "world_size": 1, "timestamp": "2026-05-04T21:08:18.712990"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5920, "epoch": 0, "train_loss": 4.256101816892624, "train_ppl": 70.53449048282725, "lr": 0.00056, "grad_norm": 0.6998, "tokens_per_sec": 145888, "dt_s": 4.492, "eta_s": 38888, "world_size": 1, "timestamp": "2026-05-04T21:08:23.205209"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5930, "epoch": 0, "train_loss": 4.294087648391724, "train_ppl": 73.26534019229143, "lr": 0.00056, "grad_norm": 0.7469, "tokens_per_sec": 146094, "dt_s": 4.486, "eta_s": 38925, "world_size": 1, "timestamp": "2026-05-04T21:08:27.691098"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5940, "epoch": 0, "train_loss": 4.19545978307724, "train_ppl": 66.38424691486614, "lr": 0.00056, "grad_norm": 0.6844, "tokens_per_sec": 145411, "dt_s": 4.507, "eta_s": 38862, "world_size": 1, "timestamp": "2026-05-04T21:08:32.198047"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5950, "epoch": 0, "train_loss": 4.082882851362228, "train_ppl": 59.31622345858667, "lr": 0.00056, "grad_norm": 0.6505, "tokens_per_sec": 142479, "dt_s": 4.6, "eta_s": 39071, "world_size": 1, "timestamp": "2026-05-04T21:08:36.797758"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5960, "epoch": 0, "train_loss": 4.151504695415497, "train_ppl": 63.529520994312925, "lr": 0.00056, "grad_norm": 0.6564, "tokens_per_sec": 147236, "dt_s": 4.451, "eta_s": 39082, "world_size": 1, "timestamp": "2026-05-04T21:08:41.248819"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5970, "epoch": 0, "train_loss": 4.088237151503563, "train_ppl": 59.63467209623971, "lr": 0.00056, "grad_norm": 0.6636, "tokens_per_sec": 147061, "dt_s": 4.456, "eta_s": 39015, "world_size": 1, "timestamp": "2026-05-04T21:08:45.705200"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5980, "epoch": 0, "train_loss": 4.176146969199181, "train_ppl": 65.11448115139021, "lr": 0.00056, "grad_norm": 0.6422, "tokens_per_sec": 145386, "dt_s": 4.508, "eta_s": 39048, "world_size": 1, "timestamp": "2026-05-04T21:08:50.212924"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 5990, "epoch": 0, "train_loss": 4.170759215950966, "train_ppl": 64.76460376691796, "lr": 0.00056, "grad_norm": 0.6569, "tokens_per_sec": 147623, "dt_s": 4.439, "eta_s": 38927, "world_size": 1, "timestamp": "2026-05-04T21:08:54.652350"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6000, "epoch": 0, "train_loss": 4.172603353857994, "train_ppl": 64.88414882266314, "lr": 0.00056, "grad_norm": 0.6742, "tokens_per_sec": 142019, "dt_s": 4.615, "eta_s": 38948, "world_size": 1, "timestamp": "2026-05-04T21:08:59.266939"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6010, "epoch": 0, "train_loss": 4.10586179792881, "train_ppl": 60.69502886527689, "lr": 0.00056, "grad_norm": 0.6126, "tokens_per_sec": 107308, "dt_s": 6.107, "eta_s": 38966, "world_size": 1, "timestamp": "2026-05-04T21:09:05.374236"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6020, "epoch": 0, "train_loss": 4.176052242517471, "train_ppl": 65.1083133647911, "lr": 0.00056, "grad_norm": 0.6869, "tokens_per_sec": 143377, "dt_s": 4.571, "eta_s": 39160, "world_size": 1, "timestamp": "2026-05-04T21:09:09.945095"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6030, "epoch": 0, "train_loss": 4.138758793473244, "train_ppl": 62.72491853412493, "lr": 0.00056, "grad_norm": 0.6748, "tokens_per_sec": 144227, "dt_s": 4.544, "eta_s": 39218, "world_size": 1, "timestamp": "2026-05-04T21:09:14.489034"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6040, "epoch": 0, "train_loss": 4.28746697306633, "train_ppl": 72.78187635680193, "lr": 0.00056, "grad_norm": 0.6383, "tokens_per_sec": 145840, "dt_s": 4.494, "eta_s": 39307, "world_size": 1, "timestamp": "2026-05-04T21:09:18.982731"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6050, "epoch": 0, "train_loss": 4.235015034675598, "train_ppl": 69.06271706125963, "lr": 0.00056, "grad_norm": 0.7081, "tokens_per_sec": 146917, "dt_s": 4.461, "eta_s": 39036, "world_size": 1, "timestamp": "2026-05-04T21:09:23.443483"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6060, "epoch": 0, "train_loss": 4.124987900257111, "train_ppl": 61.86706067031165, "lr": 0.00056, "grad_norm": 0.6689, "tokens_per_sec": 147054, "dt_s": 4.457, "eta_s": 39019, "world_size": 1, "timestamp": "2026-05-04T21:09:27.900086"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6070, "epoch": 0, "train_loss": 4.180617421865463, "train_ppl": 65.40622398265407, "lr": 0.00056, "grad_norm": 0.7405, "tokens_per_sec": 147850, "dt_s": 4.433, "eta_s": 38775, "world_size": 1, "timestamp": "2026-05-04T21:09:32.332677"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6080, "epoch": 0, "train_loss": 4.207821175456047, "train_ppl": 67.20994149198461, "lr": 0.00056, "grad_norm": 0.6997, "tokens_per_sec": 145530, "dt_s": 4.503, "eta_s": 38700, "world_size": 1, "timestamp": "2026-05-04T21:09:36.835937"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6090, "epoch": 0, "train_loss": 4.1783188581466675, "train_ppl": 65.25605626037947, "lr": 0.00056, "grad_norm": 0.6934, "tokens_per_sec": 147925, "dt_s": 4.43, "eta_s": 38586, "world_size": 1, "timestamp": "2026-05-04T21:09:41.266293"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6100, "epoch": 0, "train_loss": 4.139556005597115, "train_ppl": 62.77494353726921, "lr": 0.00056, "grad_norm": 0.7113, "tokens_per_sec": 148520, "dt_s": 4.413, "eta_s": 38498, "world_size": 1, "timestamp": "2026-05-04T21:09:45.678893"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6110, "epoch": 0, "train_loss": 4.260023608803749, "train_ppl": 70.81165521301345, "lr": 0.00056, "grad_norm": 0.6947, "tokens_per_sec": 146463, "dt_s": 4.475, "eta_s": 38525, "world_size": 1, "timestamp": "2026-05-04T21:09:50.153459"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6120, "epoch": 0, "train_loss": 4.206273004412651, "train_ppl": 67.10596951071071, "lr": 0.00056, "grad_norm": 0.6084, "tokens_per_sec": 147850, "dt_s": 4.433, "eta_s": 38521, "world_size": 1, "timestamp": "2026-05-04T21:09:54.586072"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6130, "epoch": 0, "train_loss": 4.21924914419651, "train_ppl": 67.98242012622008, "lr": 0.00056, "grad_norm": 0.615, "tokens_per_sec": 146136, "dt_s": 4.485, "eta_s": 38484, "world_size": 1, "timestamp": "2026-05-04T21:09:59.070663"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6140, "epoch": 0, "train_loss": 4.169140011072159, "train_ppl": 64.65982145942971, "lr": 0.00056, "grad_norm": 0.636, "tokens_per_sec": 130723, "dt_s": 5.013, "eta_s": 39489, "world_size": 1, "timestamp": "2026-05-04T21:10:04.084033"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6150, "epoch": 0, "train_loss": 4.126420110464096, "train_ppl": 61.95573078804682, "lr": 0.00056, "grad_norm": 0.5947, "tokens_per_sec": 147241, "dt_s": 4.451, "eta_s": 39550, "world_size": 1, "timestamp": "2026-05-04T21:10:08.534975"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6160, "epoch": 0, "train_loss": 4.227859646081924, "train_ppl": 68.57031026525405, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 144622, "dt_s": 4.532, "eta_s": 39644, "world_size": 1, "timestamp": "2026-05-04T21:10:13.066509"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6170, "epoch": 0, "train_loss": 4.248331516981125, "train_ppl": 69.98854018337884, "lr": 0.00056, "grad_norm": 0.6998, "tokens_per_sec": 147083, "dt_s": 4.456, "eta_s": 39680, "world_size": 1, "timestamp": "2026-05-04T21:10:17.522238"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6180, "epoch": 0, "train_loss": 4.176368728280067, "train_ppl": 65.12892248007147, "lr": 0.00056, "grad_norm": 0.6523, "tokens_per_sec": 148936, "dt_s": 4.4, "eta_s": 39529, "world_size": 1, "timestamp": "2026-05-04T21:10:21.922525"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6190, "epoch": 0, "train_loss": 4.1266635954380035, "train_ppl": 61.9708179142109, "lr": 0.00056, "grad_norm": 0.658, "tokens_per_sec": 144189, "dt_s": 4.545, "eta_s": 38715, "world_size": 1, "timestamp": "2026-05-04T21:10:26.467657"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6200, "epoch": 0, "train_loss": 4.13851960003376, "train_ppl": 62.70991693933247, "lr": 0.00056, "grad_norm": 0.659, "tokens_per_sec": 147904, "dt_s": 4.431, "eta_s": 38676, "world_size": 1, "timestamp": "2026-05-04T21:10:30.898686"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6210, "epoch": 0, "train_loss": 4.1734806299209595, "train_ppl": 64.94109510844561, "lr": 0.00056, "grad_norm": 0.6799, "tokens_per_sec": 149619, "dt_s": 4.38, "eta_s": 38409, "world_size": 1, "timestamp": "2026-05-04T21:10:35.278819"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6220, "epoch": 0, "train_loss": 4.201279744505882, "train_ppl": 66.77172713763024, "lr": 0.00056, "grad_norm": 0.6681, "tokens_per_sec": 147612, "dt_s": 4.44, "eta_s": 38377, "world_size": 1, "timestamp": "2026-05-04T21:10:39.718552"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6230, "epoch": 0, "train_loss": 4.220003664493561, "train_ppl": 68.03373359814157, "lr": 0.00056, "grad_norm": 0.7378, "tokens_per_sec": 147207, "dt_s": 4.452, "eta_s": 38462, "world_size": 1, "timestamp": "2026-05-04T21:10:44.170503"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6240, "epoch": 0, "train_loss": 4.144225358963013, "train_ppl": 63.068747334229734, "lr": 0.00056, "grad_norm": 0.6458, "tokens_per_sec": 147298, "dt_s": 4.449, "eta_s": 38292, "world_size": 1, "timestamp": "2026-05-04T21:10:48.619730"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6250, "epoch": 0, "train_loss": 4.019290655851364, "train_ppl": 55.66160858353018, "lr": 0.00056, "grad_norm": 0.6418, "tokens_per_sec": 146487, "dt_s": 4.474, "eta_s": 38362, "world_size": 1, "timestamp": "2026-05-04T21:10:53.093582"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6260, "epoch": 0, "train_loss": 3.222409211099148, "train_ppl": 25.08849089997139, "lr": 0.00056, "grad_norm": 2.4717, "tokens_per_sec": 146739, "dt_s": 4.466, "eta_s": 38506, "world_size": 1, "timestamp": "2026-05-04T21:10:57.559723"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6270, "epoch": 0, "train_loss": 4.092846751213074, "train_ppl": 59.91019860900307, "lr": 0.00056, "grad_norm": 0.5977, "tokens_per_sec": 143285, "dt_s": 4.574, "eta_s": 38733, "world_size": 1, "timestamp": "2026-05-04T21:11:02.133555"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6280, "epoch": 0, "train_loss": 4.126436248421669, "train_ppl": 61.956730635069405, "lr": 0.00056, "grad_norm": 0.6711, "tokens_per_sec": 147882, "dt_s": 4.432, "eta_s": 38694, "world_size": 1, "timestamp": "2026-05-04T21:11:06.565193"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6290, "epoch": 0, "train_loss": 4.227060228586197, "train_ppl": 68.51551586426007, "lr": 0.00056, "grad_norm": 0.6292, "tokens_per_sec": 147870, "dt_s": 4.432, "eta_s": 38659, "world_size": 1, "timestamp": "2026-05-04T21:11:10.997196"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6300, "epoch": 0, "train_loss": 4.166829153895378, "train_ppl": 64.51057435774456, "lr": 0.00056, "grad_norm": 0.7643, "tokens_per_sec": 147686, "dt_s": 4.438, "eta_s": 38592, "world_size": 1, "timestamp": "2026-05-04T21:11:15.434715"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6310, "epoch": 0, "train_loss": 4.187493547797203, "train_ppl": 65.85751520498904, "lr": 0.00056, "grad_norm": 0.6904, "tokens_per_sec": 148093, "dt_s": 4.425, "eta_s": 38517, "world_size": 1, "timestamp": "2026-05-04T21:11:19.860050"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6320, "epoch": 0, "train_loss": 4.1129070818424225, "train_ppl": 61.124152449522555, "lr": 0.00056, "grad_norm": 0.6412, "tokens_per_sec": 148611, "dt_s": 4.41, "eta_s": 38229, "world_size": 1, "timestamp": "2026-05-04T21:11:24.269959"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6330, "epoch": 0, "train_loss": 4.142111569643021, "train_ppl": 62.93557408936672, "lr": 0.00056, "grad_norm": 0.6303, "tokens_per_sec": 144546, "dt_s": 4.534, "eta_s": 38402, "world_size": 1, "timestamp": "2026-05-04T21:11:28.803864"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6340, "epoch": 0, "train_loss": 4.141112253069878, "train_ppl": 62.87271294146367, "lr": 0.00056, "grad_norm": 0.642, "tokens_per_sec": 146972, "dt_s": 4.459, "eta_s": 38444, "world_size": 1, "timestamp": "2026-05-04T21:11:33.262928"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6350, "epoch": 0, "train_loss": 4.140204668045044, "train_ppl": 62.81567649535463, "lr": 0.00056, "grad_norm": 0.6268, "tokens_per_sec": 147938, "dt_s": 4.43, "eta_s": 38426, "world_size": 1, "timestamp": "2026-05-04T21:11:37.692895"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6360, "epoch": 0, "train_loss": 4.156542181968689, "train_ppl": 63.850357528446466, "lr": 0.00056, "grad_norm": 0.629, "tokens_per_sec": 146719, "dt_s": 4.467, "eta_s": 38493, "world_size": 1, "timestamp": "2026-05-04T21:11:42.159662"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6370, "epoch": 0, "train_loss": 4.177627116441727, "train_ppl": 65.2109315338912, "lr": 0.00056, "grad_norm": 0.6913, "tokens_per_sec": 148095, "dt_s": 4.425, "eta_s": 38516, "world_size": 1, "timestamp": "2026-05-04T21:11:46.584934"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6380, "epoch": 0, "train_loss": 4.055338814854622, "train_ppl": 57.70471089765016, "lr": 0.00056, "grad_norm": 0.6606, "tokens_per_sec": 145672, "dt_s": 4.499, "eta_s": 38451, "world_size": 1, "timestamp": "2026-05-04T21:11:51.083801"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6390, "epoch": 0, "train_loss": 4.292610928416252, "train_ppl": 73.15722764654448, "lr": 0.00056, "grad_norm": 0.6651, "tokens_per_sec": 150068, "dt_s": 4.367, "eta_s": 38287, "world_size": 1, "timestamp": "2026-05-04T21:11:55.450911"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6400, "epoch": 0, "train_loss": 4.134614184498787, "train_ppl": 62.46548626786833, "lr": 0.00056, "grad_norm": 0.6389, "tokens_per_sec": 148050, "dt_s": 4.427, "eta_s": 38277, "world_size": 1, "timestamp": "2026-05-04T21:11:59.877498"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6410, "epoch": 0, "train_loss": 4.153760477900505, "train_ppl": 63.67299153337929, "lr": 0.00056, "grad_norm": 0.6362, "tokens_per_sec": 145932, "dt_s": 4.491, "eta_s": 38314, "world_size": 1, "timestamp": "2026-05-04T21:12:04.368382"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6420, "epoch": 0, "train_loss": 4.225700467824936, "train_ppl": 68.42241446640908, "lr": 0.00056, "grad_norm": 0.6676, "tokens_per_sec": 149669, "dt_s": 4.379, "eta_s": 38230, "world_size": 1, "timestamp": "2026-05-04T21:12:08.747091"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6430, "epoch": 0, "train_loss": 4.178141757845879, "train_ppl": 65.24450041648939, "lr": 0.00056, "grad_norm": 0.917, "tokens_per_sec": 147335, "dt_s": 4.448, "eta_s": 38138, "world_size": 1, "timestamp": "2026-05-04T21:12:13.195204"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6440, "epoch": 0, "train_loss": 4.131904974579811, "train_ppl": 62.296483188658016, "lr": 0.00056, "grad_norm": 0.7028, "tokens_per_sec": 132428, "dt_s": 4.949, "eta_s": 39137, "world_size": 1, "timestamp": "2026-05-04T21:12:18.144000"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6450, "epoch": 0, "train_loss": 4.148843973875046, "train_ppl": 63.36071130672463, "lr": 0.00056, "grad_norm": 0.69, "tokens_per_sec": 149110, "dt_s": 4.395, "eta_s": 39078, "world_size": 1, "timestamp": "2026-05-04T21:12:22.539142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6460, "epoch": 0, "train_loss": 4.118281751871109, "train_ppl": 61.45355903260078, "lr": 0.00056, "grad_norm": 0.6712, "tokens_per_sec": 147519, "dt_s": 4.443, "eta_s": 38990, "world_size": 1, "timestamp": "2026-05-04T21:12:26.981691"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6470, "epoch": 0, "train_loss": 4.150504365563393, "train_ppl": 63.46600229309565, "lr": 0.00056, "grad_norm": 0.666, "tokens_per_sec": 146646, "dt_s": 4.469, "eta_s": 39141, "world_size": 1, "timestamp": "2026-05-04T21:12:31.450701"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6480, "epoch": 0, "train_loss": 4.117190256714821, "train_ppl": 61.386519363972525, "lr": 0.00056, "grad_norm": 0.6271, "tokens_per_sec": 148530, "dt_s": 4.412, "eta_s": 39075, "world_size": 1, "timestamp": "2026-05-04T21:12:35.862993"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6490, "epoch": 0, "train_loss": 4.209953397512436, "train_ppl": 67.35340090094877, "lr": 0.00056, "grad_norm": 0.6383, "tokens_per_sec": 146283, "dt_s": 4.48, "eta_s": 38262, "world_size": 1, "timestamp": "2026-05-04T21:12:40.343093"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6500, "epoch": 0, "train_loss": 4.101628586649895, "train_ppl": 60.43863704775717, "lr": 0.00056, "grad_norm": 0.6845, "tokens_per_sec": 148454, "dt_s": 4.415, "eta_s": 38291, "world_size": 1, "timestamp": "2026-05-04T21:12:44.757663"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6510, "epoch": 0, "train_loss": 4.3592169135808945, "train_ppl": 78.19587631224717, "lr": 0.00056, "grad_norm": 0.7322, "tokens_per_sec": 109886, "dt_s": 5.964, "eta_s": 38354, "world_size": 1, "timestamp": "2026-05-04T21:12:50.721649"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6520, "epoch": 0, "train_loss": 4.130664080381393, "train_ppl": 62.219227786890585, "lr": 0.00056, "grad_norm": 0.6335, "tokens_per_sec": 142452, "dt_s": 4.601, "eta_s": 38576, "world_size": 1, "timestamp": "2026-05-04T21:12:55.322200"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6530, "epoch": 0, "train_loss": 4.163325443863869, "train_ppl": 64.28494351457321, "lr": 0.00056, "grad_norm": 0.6386, "tokens_per_sec": 148637, "dt_s": 4.409, "eta_s": 38566, "world_size": 1, "timestamp": "2026-05-04T21:12:59.731347"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6540, "epoch": 0, "train_loss": 4.126679360866547, "train_ppl": 61.971794918413906, "lr": 0.00056, "grad_norm": 0.7039, "tokens_per_sec": 145679, "dt_s": 4.499, "eta_s": 38593, "world_size": 1, "timestamp": "2026-05-04T21:13:04.229981"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6550, "epoch": 0, "train_loss": 4.1177608370780945, "train_ppl": 61.42155530094962, "lr": 0.00056, "grad_norm": 0.7119, "tokens_per_sec": 146545, "dt_s": 4.472, "eta_s": 38688, "world_size": 1, "timestamp": "2026-05-04T21:13:08.702040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6560, "epoch": 0, "train_loss": 4.111550971865654, "train_ppl": 61.04131755587177, "lr": 0.00056, "grad_norm": 0.6084, "tokens_per_sec": 147515, "dt_s": 4.443, "eta_s": 38617, "world_size": 1, "timestamp": "2026-05-04T21:13:13.144699"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6570, "epoch": 0, "train_loss": 4.093784123659134, "train_ppl": 59.966383107203434, "lr": 0.00056, "grad_norm": 0.6326, "tokens_per_sec": 144244, "dt_s": 4.543, "eta_s": 38514, "world_size": 1, "timestamp": "2026-05-04T21:13:17.688116"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6580, "epoch": 0, "train_loss": 4.148689806461334, "train_ppl": 63.35094390265861, "lr": 0.00056, "grad_norm": 0.6461, "tokens_per_sec": 147914, "dt_s": 4.431, "eta_s": 38547, "world_size": 1, "timestamp": "2026-05-04T21:13:22.118805"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6590, "epoch": 0, "train_loss": 4.135688215494156, "train_ppl": 62.53261217746184, "lr": 0.00056, "grad_norm": 0.657, "tokens_per_sec": 147354, "dt_s": 4.448, "eta_s": 38454, "world_size": 1, "timestamp": "2026-05-04T21:13:26.566356"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6600, "epoch": 0, "train_loss": 4.07896488904953, "train_ppl": 59.08427940128523, "lr": 0.00056, "grad_norm": 0.6632, "tokens_per_sec": 145217, "dt_s": 4.513, "eta_s": 38520, "world_size": 1, "timestamp": "2026-05-04T21:13:31.079311"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6610, "epoch": 0, "train_loss": 4.257215678691864, "train_ppl": 70.61309992920715, "lr": 0.00056, "grad_norm": 0.6573, "tokens_per_sec": 148072, "dt_s": 4.426, "eta_s": 38487, "world_size": 1, "timestamp": "2026-05-04T21:13:35.505258"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6620, "epoch": 0, "train_loss": 4.1229918003082275, "train_ppl": 61.74369100372567, "lr": 0.00056, "grad_norm": 0.6482, "tokens_per_sec": 146830, "dt_s": 4.463, "eta_s": 38345, "world_size": 1, "timestamp": "2026-05-04T21:13:39.968643"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6630, "epoch": 0, "train_loss": 4.286884009838104, "train_ppl": 72.7394595641211, "lr": 0.00056, "grad_norm": 0.756, "tokens_per_sec": 146465, "dt_s": 4.475, "eta_s": 38416, "world_size": 1, "timestamp": "2026-05-04T21:13:44.443159"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6640, "epoch": 0, "train_loss": 4.021046653389931, "train_ppl": 55.75943609849544, "lr": 0.00056, "grad_norm": 0.6408, "tokens_per_sec": 149260, "dt_s": 4.391, "eta_s": 38313, "world_size": 1, "timestamp": "2026-05-04T21:13:48.833877"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6650, "epoch": 0, "train_loss": 4.176767081022263, "train_ppl": 65.15487193311236, "lr": 0.00056, "grad_norm": 0.6973, "tokens_per_sec": 145064, "dt_s": 4.518, "eta_s": 38317, "world_size": 1, "timestamp": "2026-05-04T21:13:53.351628"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6660, "epoch": 0, "train_loss": 4.1811684519052505, "train_ppl": 65.44227470846182, "lr": 0.00056, "grad_norm": 0.6938, "tokens_per_sec": 144943, "dt_s": 4.522, "eta_s": 38477, "world_size": 1, "timestamp": "2026-05-04T21:13:57.873138"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6670, "epoch": 0, "train_loss": 4.0353065729141235, "train_ppl": 56.56025742357341, "lr": 0.00056, "grad_norm": 0.7599, "tokens_per_sec": 144211, "dt_s": 4.544, "eta_s": 38612, "world_size": 1, "timestamp": "2026-05-04T21:14:02.417561"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6680, "epoch": 0, "train_loss": 4.073251768946648, "train_ppl": 58.74768623100291, "lr": 0.00056, "grad_norm": 0.6533, "tokens_per_sec": 143229, "dt_s": 4.576, "eta_s": 38781, "world_size": 1, "timestamp": "2026-05-04T21:14:06.993171"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6690, "epoch": 0, "train_loss": 4.045966178178787, "train_ppl": 57.16639228109674, "lr": 0.00056, "grad_norm": 0.6493, "tokens_per_sec": 145882, "dt_s": 4.492, "eta_s": 38952, "world_size": 1, "timestamp": "2026-05-04T21:14:11.485608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6700, "epoch": 0, "train_loss": 4.055097132921219, "train_ppl": 57.6907663966888, "lr": 0.00056, "grad_norm": 0.6448, "tokens_per_sec": 147005, "dt_s": 4.458, "eta_s": 38845, "world_size": 1, "timestamp": "2026-05-04T21:14:15.943677"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6710, "epoch": 0, "train_loss": 4.127217993140221, "train_ppl": 62.00518391860376, "lr": 0.00056, "grad_norm": 0.6925, "tokens_per_sec": 145394, "dt_s": 4.507, "eta_s": 38816, "world_size": 1, "timestamp": "2026-05-04T21:14:20.451154"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6720, "epoch": 0, "train_loss": 4.169299349188805, "train_ppl": 64.67012505445962, "lr": 0.00056, "grad_norm": 0.6623, "tokens_per_sec": 149532, "dt_s": 4.383, "eta_s": 38534, "world_size": 1, "timestamp": "2026-05-04T21:14:24.833916"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6730, "epoch": 0, "train_loss": 4.067310869693756, "train_ppl": 58.39970682453506, "lr": 0.00056, "grad_norm": 0.6484, "tokens_per_sec": 132327, "dt_s": 4.953, "eta_s": 39177, "world_size": 1, "timestamp": "2026-05-04T21:14:29.786493"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6740, "epoch": 0, "train_loss": 4.113424822688103, "train_ppl": 61.15580711365244, "lr": 0.00056, "grad_norm": 0.6776, "tokens_per_sec": 147929, "dt_s": 4.43, "eta_s": 39066, "world_size": 1, "timestamp": "2026-05-04T21:14:34.216725"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6750, "epoch": 0, "train_loss": 4.078921273350716, "train_ppl": 59.08170245534826, "lr": 0.00056, "grad_norm": 0.7071, "tokens_per_sec": 147996, "dt_s": 4.428, "eta_s": 39010, "world_size": 1, "timestamp": "2026-05-04T21:14:38.644951"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6760, "epoch": 0, "train_loss": 4.062904387712479, "train_ppl": 58.14293571281811, "lr": 0.00056, "grad_norm": 0.6614, "tokens_per_sec": 145495, "dt_s": 4.504, "eta_s": 39000, "world_size": 1, "timestamp": "2026-05-04T21:14:43.149323"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6770, "epoch": 0, "train_loss": 4.221356198191643, "train_ppl": 68.12581377217131, "lr": 0.00056, "grad_norm": 0.6776, "tokens_per_sec": 147341, "dt_s": 4.448, "eta_s": 39107, "world_size": 1, "timestamp": "2026-05-04T21:14:47.597223"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6780, "epoch": 0, "train_loss": 4.107729062438011, "train_ppl": 60.80846841642527, "lr": 0.00056, "grad_norm": 0.7053, "tokens_per_sec": 149414, "dt_s": 4.386, "eta_s": 38130, "world_size": 1, "timestamp": "2026-05-04T21:14:51.983404"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6790, "epoch": 0, "train_loss": 4.120369553565979, "train_ppl": 61.58199590601204, "lr": 0.00056, "grad_norm": 0.6076, "tokens_per_sec": 146116, "dt_s": 4.485, "eta_s": 38220, "world_size": 1, "timestamp": "2026-05-04T21:14:56.468599"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6800, "epoch": 0, "train_loss": 4.1408867090940475, "train_ppl": 62.85853397886553, "lr": 0.00056, "grad_norm": 0.7226, "tokens_per_sec": 148005, "dt_s": 4.428, "eta_s": 38215, "world_size": 1, "timestamp": "2026-05-04T21:15:00.896579"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6810, "epoch": 0, "train_loss": 4.189174756407738, "train_ppl": 65.96832855069475, "lr": 0.00056, "grad_norm": 0.7449, "tokens_per_sec": 148355, "dt_s": 4.418, "eta_s": 38061, "world_size": 1, "timestamp": "2026-05-04T21:15:05.314114"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6820, "epoch": 0, "train_loss": 4.073340758681297, "train_ppl": 58.752914404635334, "lr": 0.00056, "grad_norm": 0.6858, "tokens_per_sec": 146137, "dt_s": 4.485, "eta_s": 38120, "world_size": 1, "timestamp": "2026-05-04T21:15:09.798665"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6830, "epoch": 0, "train_loss": 4.224680453538895, "train_ppl": 68.3526582084058, "lr": 0.00056, "grad_norm": 0.6363, "tokens_per_sec": 149496, "dt_s": 4.384, "eta_s": 38111, "world_size": 1, "timestamp": "2026-05-04T21:15:14.182447"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6840, "epoch": 0, "train_loss": 4.2040043622255325, "train_ppl": 66.95390263515647, "lr": 0.00056, "grad_norm": 0.6631, "tokens_per_sec": 146036, "dt_s": 4.488, "eta_s": 38111, "world_size": 1, "timestamp": "2026-05-04T21:15:18.670132"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6850, "epoch": 0, "train_loss": 4.1103163957595825, "train_ppl": 60.96600390351795, "lr": 0.00056, "grad_norm": 0.6248, "tokens_per_sec": 146098, "dt_s": 4.486, "eta_s": 38206, "world_size": 1, "timestamp": "2026-05-04T21:15:23.155892"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6860, "epoch": 0, "train_loss": 4.142198100686073, "train_ppl": 62.94102020586346, "lr": 0.00056, "grad_norm": 0.6895, "tokens_per_sec": 148830, "dt_s": 4.403, "eta_s": 38177, "world_size": 1, "timestamp": "2026-05-04T21:15:27.559339"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6870, "epoch": 0, "train_loss": 4.113089874386787, "train_ppl": 61.13532651010732, "lr": 0.00056, "grad_norm": 0.6313, "tokens_per_sec": 146970, "dt_s": 4.459, "eta_s": 38129, "world_size": 1, "timestamp": "2026-05-04T21:15:32.018454"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6880, "epoch": 0, "train_loss": 3.899005025625229, "train_ppl": 49.35331938008127, "lr": 0.00056, "grad_norm": 0.7407, "tokens_per_sec": 147798, "dt_s": 4.434, "eta_s": 38211, "world_size": 1, "timestamp": "2026-05-04T21:15:36.452630"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6890, "epoch": 0, "train_loss": 4.160606175661087, "train_ppl": 64.11037297139067, "lr": 0.00056, "grad_norm": 0.662, "tokens_per_sec": 148878, "dt_s": 4.402, "eta_s": 38060, "world_size": 1, "timestamp": "2026-05-04T21:15:40.854633"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6900, "epoch": 0, "train_loss": 4.1611657589674, "train_ppl": 64.14625810529147, "lr": 0.00056, "grad_norm": 0.66, "tokens_per_sec": 144870, "dt_s": 4.524, "eta_s": 38121, "world_size": 1, "timestamp": "2026-05-04T21:15:45.378412"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6910, "epoch": 0, "train_loss": 4.1267901211977005, "train_ppl": 61.97865931508567, "lr": 0.00056, "grad_norm": 0.6446, "tokens_per_sec": 148120, "dt_s": 4.425, "eta_s": 38152, "world_size": 1, "timestamp": "2026-05-04T21:15:49.802956"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6920, "epoch": 0, "train_loss": 4.228478834033012, "train_ppl": 68.61278132259922, "lr": 0.00056, "grad_norm": 0.7783, "tokens_per_sec": 147643, "dt_s": 4.439, "eta_s": 38113, "world_size": 1, "timestamp": "2026-05-04T21:15:54.241750"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6930, "epoch": 0, "train_loss": 4.1188890635967255, "train_ppl": 61.49089183475894, "lr": 0.00056, "grad_norm": 0.6057, "tokens_per_sec": 144342, "dt_s": 4.54, "eta_s": 38291, "world_size": 1, "timestamp": "2026-05-04T21:15:58.782079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6940, "epoch": 0, "train_loss": 4.15230268239975, "train_ppl": 63.580236957830344, "lr": 0.00056, "grad_norm": 0.6505, "tokens_per_sec": 147623, "dt_s": 4.439, "eta_s": 38350, "world_size": 1, "timestamp": "2026-05-04T21:16:03.221508"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6950, "epoch": 0, "train_loss": 4.168433964252472, "train_ppl": 64.61418471086319, "lr": 0.00056, "grad_norm": 0.6362, "tokens_per_sec": 148476, "dt_s": 4.414, "eta_s": 38157, "world_size": 1, "timestamp": "2026-05-04T21:16:07.635417"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6960, "epoch": 0, "train_loss": 4.150896787643433, "train_ppl": 63.49091264109294, "lr": 0.00056, "grad_norm": 0.7036, "tokens_per_sec": 144840, "dt_s": 4.525, "eta_s": 38325, "world_size": 1, "timestamp": "2026-05-04T21:16:12.160143"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6970, "epoch": 0, "train_loss": 4.128288179636002, "train_ppl": 62.07157654901349, "lr": 0.00056, "grad_norm": 0.6412, "tokens_per_sec": 149322, "dt_s": 4.389, "eta_s": 38235, "world_size": 1, "timestamp": "2026-05-04T21:16:16.549036"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6980, "epoch": 0, "train_loss": 4.156558096408844, "train_ppl": 63.851373679225944, "lr": 0.00056, "grad_norm": 0.7081, "tokens_per_sec": 144640, "dt_s": 4.531, "eta_s": 38214, "world_size": 1, "timestamp": "2026-05-04T21:16:21.080012"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 6990, "epoch": 0, "train_loss": 4.220891654491425, "train_ppl": 68.09417370422733, "lr": 0.00056, "grad_norm": 0.6429, "tokens_per_sec": 146390, "dt_s": 4.477, "eta_s": 38274, "world_size": 1, "timestamp": "2026-05-04T21:16:25.556799"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7000, "epoch": 0, "train_loss": 4.003091797232628, "train_ppl": 54.76721766906835, "lr": 0.00056, "grad_norm": 0.6485, "tokens_per_sec": 147739, "dt_s": 4.436, "eta_s": 38307, "world_size": 1, "timestamp": "2026-05-04T21:16:29.992730"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7010, "epoch": 0, "train_loss": 4.125967368483543, "train_ppl": 61.92768717652239, "lr": 0.00056, "grad_norm": 0.609, "tokens_per_sec": 107452, "dt_s": 6.099, "eta_s": 38438, "world_size": 1, "timestamp": "2026-05-04T21:16:36.091805"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7020, "epoch": 0, "train_loss": 4.142830699682236, "train_ppl": 62.9808492286348, "lr": 0.00056, "grad_norm": 0.774, "tokens_per_sec": 146418, "dt_s": 4.476, "eta_s": 38583, "world_size": 1, "timestamp": "2026-05-04T21:16:40.567768"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7030, "epoch": 0, "train_loss": 4.118341237306595, "train_ppl": 61.45721473305137, "lr": 0.00056, "grad_norm": 0.7636, "tokens_per_sec": 131348, "dt_s": 4.99, "eta_s": 39364, "world_size": 1, "timestamp": "2026-05-04T21:16:45.557270"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7040, "epoch": 0, "train_loss": 4.073037400841713, "train_ppl": 58.73509395057614, "lr": 0.00056, "grad_norm": 0.6346, "tokens_per_sec": 146193, "dt_s": 4.483, "eta_s": 39370, "world_size": 1, "timestamp": "2026-05-04T21:16:50.040096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7050, "epoch": 0, "train_loss": 4.094413682818413, "train_ppl": 60.004147379111735, "lr": 0.00056, "grad_norm": 0.6585, "tokens_per_sec": 146977, "dt_s": 4.459, "eta_s": 39404, "world_size": 1, "timestamp": "2026-05-04T21:16:54.499041"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7060, "epoch": 0, "train_loss": 4.016217052936554, "train_ppl": 55.490789550561395, "lr": 0.00056, "grad_norm": 0.6312, "tokens_per_sec": 143770, "dt_s": 4.558, "eta_s": 39322, "world_size": 1, "timestamp": "2026-05-04T21:16:59.057457"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7070, "epoch": 0, "train_loss": 4.106552347540855, "train_ppl": 60.73695626872286, "lr": 0.00056, "grad_norm": 0.6751, "tokens_per_sec": 147481, "dt_s": 4.444, "eta_s": 39262, "world_size": 1, "timestamp": "2026-05-04T21:17:03.501146"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7080, "epoch": 0, "train_loss": 4.083963334560394, "train_ppl": 59.38034827806883, "lr": 0.00056, "grad_norm": 0.6381, "tokens_per_sec": 147589, "dt_s": 4.44, "eta_s": 38318, "world_size": 1, "timestamp": "2026-05-04T21:17:07.941582"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7090, "epoch": 0, "train_loss": 4.154433876276016, "train_ppl": 63.71588326243131, "lr": 0.00056, "grad_norm": 0.6577, "tokens_per_sec": 145705, "dt_s": 4.498, "eta_s": 38339, "world_size": 1, "timestamp": "2026-05-04T21:17:12.439465"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7100, "epoch": 0, "train_loss": 4.119606226682663, "train_ppl": 61.535006649371724, "lr": 0.00056, "grad_norm": 0.7221, "tokens_per_sec": 148824, "dt_s": 4.404, "eta_s": 38239, "world_size": 1, "timestamp": "2026-05-04T21:17:16.843040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7110, "epoch": 0, "train_loss": 4.163777217268944, "train_ppl": 64.31399230364096, "lr": 0.00056, "grad_norm": 0.6992, "tokens_per_sec": 148125, "dt_s": 4.424, "eta_s": 38006, "world_size": 1, "timestamp": "2026-05-04T21:17:21.267407"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7120, "epoch": 0, "train_loss": 4.113631084561348, "train_ppl": 61.168422525982095, "lr": 0.00056, "grad_norm": 0.7236, "tokens_per_sec": 146899, "dt_s": 4.461, "eta_s": 38031, "world_size": 1, "timestamp": "2026-05-04T21:17:25.728717"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7130, "epoch": 0, "train_loss": 4.055821478366852, "train_ppl": 57.73256957872782, "lr": 0.00056, "grad_norm": 0.7688, "tokens_per_sec": 148363, "dt_s": 4.417, "eta_s": 37987, "world_size": 1, "timestamp": "2026-05-04T21:17:30.145989"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7140, "epoch": 0, "train_loss": 4.1985228806734085, "train_ppl": 66.58790008755395, "lr": 0.00056, "grad_norm": 0.7046, "tokens_per_sec": 146500, "dt_s": 4.473, "eta_s": 37941, "world_size": 1, "timestamp": "2026-05-04T21:17:34.619422"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7150, "epoch": 0, "train_loss": 4.155847296118736, "train_ppl": 63.80600423050983, "lr": 0.00056, "grad_norm": 0.6942, "tokens_per_sec": 145196, "dt_s": 4.514, "eta_s": 38125, "world_size": 1, "timestamp": "2026-05-04T21:17:39.133079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7160, "epoch": 0, "train_loss": 4.05357725918293, "train_ppl": 57.603150315457384, "lr": 0.00056, "grad_norm": 0.6541, "tokens_per_sec": 149583, "dt_s": 4.381, "eta_s": 38047, "world_size": 1, "timestamp": "2026-05-04T21:17:43.514305"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7170, "epoch": 0, "train_loss": 4.079940602183342, "train_ppl": 59.14195684244149, "lr": 0.00056, "grad_norm": 0.6953, "tokens_per_sec": 147314, "dt_s": 4.449, "eta_s": 38021, "world_size": 1, "timestamp": "2026-05-04T21:17:47.963004"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7180, "epoch": 0, "train_loss": 4.017084136605263, "train_ppl": 55.53892557390844, "lr": 0.00056, "grad_norm": 0.6932, "tokens_per_sec": 148577, "dt_s": 4.411, "eta_s": 38005, "world_size": 1, "timestamp": "2026-05-04T21:17:52.373941"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7190, "epoch": 0, "train_loss": 4.131802573800087, "train_ppl": 62.29010430681224, "lr": 0.00056, "grad_norm": 0.7168, "tokens_per_sec": 149898, "dt_s": 4.372, "eta_s": 37828, "world_size": 1, "timestamp": "2026-05-04T21:17:56.745971"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7200, "epoch": 0, "train_loss": 4.09711030125618, "train_ppl": 60.16617403312327, "lr": 0.00056, "grad_norm": 0.663, "tokens_per_sec": 145457, "dt_s": 4.506, "eta_s": 37809, "world_size": 1, "timestamp": "2026-05-04T21:18:01.251509"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7210, "epoch": 0, "train_loss": 3.9925974160432816, "train_ppl": 54.19547490070265, "lr": 0.00056, "grad_norm": 0.6735, "tokens_per_sec": 146195, "dt_s": 4.483, "eta_s": 37978, "world_size": 1, "timestamp": "2026-05-04T21:18:05.734320"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7220, "epoch": 0, "train_loss": 4.041500523686409, "train_ppl": 56.91167608515317, "lr": 0.00056, "grad_norm": 0.6786, "tokens_per_sec": 147966, "dt_s": 4.429, "eta_s": 37941, "world_size": 1, "timestamp": "2026-05-04T21:18:10.163433"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7230, "epoch": 0, "train_loss": 4.124978944659233, "train_ppl": 61.866506616275366, "lr": 0.00056, "grad_norm": 0.6859, "tokens_per_sec": 147209, "dt_s": 4.452, "eta_s": 38006, "world_size": 1, "timestamp": "2026-05-04T21:18:14.615364"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7240, "epoch": 0, "train_loss": 4.103708133101463, "train_ppl": 60.564452775451535, "lr": 0.00056, "grad_norm": 0.7146, "tokens_per_sec": 148727, "dt_s": 4.406, "eta_s": 38060, "world_size": 1, "timestamp": "2026-05-04T21:18:19.021792"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7250, "epoch": 0, "train_loss": 3.9469093531370163, "train_ppl": 51.77510074675972, "lr": 0.00056, "grad_norm": 0.6452, "tokens_per_sec": 148883, "dt_s": 4.402, "eta_s": 37879, "world_size": 1, "timestamp": "2026-05-04T21:18:23.423641"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7260, "epoch": 0, "train_loss": 3.949246495962143, "train_ppl": 51.89624806613975, "lr": 0.00056, "grad_norm": 0.6512, "tokens_per_sec": 148683, "dt_s": 4.408, "eta_s": 37746, "world_size": 1, "timestamp": "2026-05-04T21:18:27.831397"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7270, "epoch": 0, "train_loss": 4.073528990149498, "train_ppl": 58.76397459287987, "lr": 0.00056, "grad_norm": 0.6672, "tokens_per_sec": 149475, "dt_s": 4.384, "eta_s": 37665, "world_size": 1, "timestamp": "2026-05-04T21:18:32.215821"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7280, "epoch": 0, "train_loss": 4.119153380393982, "train_ppl": 61.507147058518854, "lr": 0.00056, "grad_norm": 0.681, "tokens_per_sec": 146649, "dt_s": 4.469, "eta_s": 37690, "world_size": 1, "timestamp": "2026-05-04T21:18:36.684716"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7290, "epoch": 0, "train_loss": 4.009276449680328, "train_ppl": 55.106983459542285, "lr": 0.00056, "grad_norm": 0.6218, "tokens_per_sec": 148931, "dt_s": 4.4, "eta_s": 37675, "world_size": 1, "timestamp": "2026-05-04T21:18:41.085133"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7300, "epoch": 0, "train_loss": 4.122070074081421, "train_ppl": 61.68680644441013, "lr": 0.00056, "grad_norm": 0.7503, "tokens_per_sec": 149158, "dt_s": 4.394, "eta_s": 37657, "world_size": 1, "timestamp": "2026-05-04T21:18:45.478841"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7310, "epoch": 0, "train_loss": 4.090831428766251, "train_ppl": 59.78958182263523, "lr": 0.00056, "grad_norm": 0.6885, "tokens_per_sec": 146018, "dt_s": 4.488, "eta_s": 37790, "world_size": 1, "timestamp": "2026-05-04T21:18:49.967058"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7320, "epoch": 0, "train_loss": 4.050970986485481, "train_ppl": 57.45321626688266, "lr": 0.00056, "grad_norm": 0.7202, "tokens_per_sec": 133892, "dt_s": 4.895, "eta_s": 38657, "world_size": 1, "timestamp": "2026-05-04T21:18:54.861773"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7330, "epoch": 0, "train_loss": 4.1449790596961975, "train_ppl": 63.11630021340545, "lr": 0.00056, "grad_norm": 0.7035, "tokens_per_sec": 148749, "dt_s": 4.406, "eta_s": 38545, "world_size": 1, "timestamp": "2026-05-04T21:18:59.267580"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7340, "epoch": 0, "train_loss": 4.04407899081707, "train_ppl": 57.058610322432564, "lr": 0.00056, "grad_norm": 0.687, "tokens_per_sec": 145980, "dt_s": 4.489, "eta_s": 38692, "world_size": 1, "timestamp": "2026-05-04T21:19:03.756963"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7350, "epoch": 0, "train_loss": 4.12068809568882, "train_ppl": 61.60161549038285, "lr": 0.00056, "grad_norm": 0.673, "tokens_per_sec": 146678, "dt_s": 4.468, "eta_s": 38814, "world_size": 1, "timestamp": "2026-05-04T21:19:08.224984"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7360, "epoch": 0, "train_loss": 4.231708377599716, "train_ppl": 68.8347274886123, "lr": 0.00056, "grad_norm": 0.6864, "tokens_per_sec": 144141, "dt_s": 4.547, "eta_s": 38909, "world_size": 1, "timestamp": "2026-05-04T21:19:12.771658"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7370, "epoch": 0, "train_loss": 4.08531554043293, "train_ppl": 59.46069704551837, "lr": 0.00056, "grad_norm": 0.6801, "tokens_per_sec": 146160, "dt_s": 4.484, "eta_s": 38204, "world_size": 1, "timestamp": "2026-05-04T21:19:17.255496"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7380, "epoch": 0, "train_loss": 4.234541043639183, "train_ppl": 69.0299897092706, "lr": 0.00056, "grad_norm": 0.681, "tokens_per_sec": 146990, "dt_s": 4.459, "eta_s": 38289, "world_size": 1, "timestamp": "2026-05-04T21:19:21.714050"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7390, "epoch": 0, "train_loss": 4.093827903270721, "train_ppl": 59.969008469632435, "lr": 0.00056, "grad_norm": 0.7008, "tokens_per_sec": 147504, "dt_s": 4.443, "eta_s": 38206, "world_size": 1, "timestamp": "2026-05-04T21:19:26.157033"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7400, "epoch": 0, "train_loss": 4.115313455462456, "train_ppl": 61.27141711336212, "lr": 0.00056, "grad_norm": 0.6767, "tokens_per_sec": 148426, "dt_s": 4.415, "eta_s": 38111, "world_size": 1, "timestamp": "2026-05-04T21:19:30.572424"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7410, "epoch": 0, "train_loss": 4.113937348127365, "train_ppl": 61.187159054204066, "lr": 0.00056, "grad_norm": 0.6175, "tokens_per_sec": 149452, "dt_s": 4.385, "eta_s": 37831, "world_size": 1, "timestamp": "2026-05-04T21:19:34.957495"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7420, "epoch": 0, "train_loss": 4.051473140716553, "train_ppl": 57.48207388740147, "lr": 0.00056, "grad_norm": 0.6893, "tokens_per_sec": 147363, "dt_s": 4.447, "eta_s": 37765, "world_size": 1, "timestamp": "2026-05-04T21:19:39.404746"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7430, "epoch": 0, "train_loss": 4.028184533119202, "train_ppl": 56.15886408572432, "lr": 0.00056, "grad_norm": 0.6822, "tokens_per_sec": 147857, "dt_s": 4.432, "eta_s": 37716, "world_size": 1, "timestamp": "2026-05-04T21:19:43.837144"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7440, "epoch": 0, "train_loss": 4.146955922245979, "train_ppl": 63.24119587368063, "lr": 0.00056, "grad_norm": 0.6651, "tokens_per_sec": 148771, "dt_s": 4.405, "eta_s": 37647, "world_size": 1, "timestamp": "2026-05-04T21:19:48.242312"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7450, "epoch": 0, "train_loss": 4.231070324778557, "train_ppl": 68.79082130526609, "lr": 0.00056, "grad_norm": 0.6924, "tokens_per_sec": 147721, "dt_s": 4.436, "eta_s": 37678, "world_size": 1, "timestamp": "2026-05-04T21:19:52.678793"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7460, "epoch": 0, "train_loss": 4.00545808672905, "train_ppl": 54.89696621175175, "lr": 0.00056, "grad_norm": 0.6627, "tokens_per_sec": 149308, "dt_s": 4.389, "eta_s": 37681, "world_size": 1, "timestamp": "2026-05-04T21:19:57.068070"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7470, "epoch": 0, "train_loss": 4.108527272939682, "train_ppl": 60.85702575141499, "lr": 0.00056, "grad_norm": 0.7321, "tokens_per_sec": 148256, "dt_s": 4.42, "eta_s": 37631, "world_size": 1, "timestamp": "2026-05-04T21:20:01.488530"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7480, "epoch": 0, "train_loss": 4.131551533937454, "train_ppl": 62.27446897021216, "lr": 0.00056, "grad_norm": 0.6229, "tokens_per_sec": 148475, "dt_s": 4.414, "eta_s": 37595, "world_size": 1, "timestamp": "2026-05-04T21:20:05.902493"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7490, "epoch": 0, "train_loss": 4.126749575138092, "train_ppl": 61.97614637561587, "lr": 0.00056, "grad_norm": 0.7036, "tokens_per_sec": 149502, "dt_s": 4.384, "eta_s": 37554, "world_size": 1, "timestamp": "2026-05-04T21:20:10.286124"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7500, "epoch": 0, "train_loss": 4.007262960076332, "train_ppl": 54.99613775202744, "lr": 0.00056, "grad_norm": 0.6919, "tokens_per_sec": 148408, "dt_s": 4.416, "eta_s": 37514, "world_size": 1, "timestamp": "2026-05-04T21:20:14.702043"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7510, "epoch": 0, "train_loss": 4.206713616847992, "train_ppl": 67.13554375027313, "lr": 0.00056, "grad_norm": 0.6994, "tokens_per_sec": 127053, "dt_s": 5.158, "eta_s": 37518, "world_size": 1, "timestamp": "2026-05-04T21:20:19.860216"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7520, "epoch": 0, "train_loss": 4.198196187615395, "train_ppl": 66.56614983586918, "lr": 0.00056, "grad_norm": 0.6825, "tokens_per_sec": 148206, "dt_s": 4.422, "eta_s": 37516, "world_size": 1, "timestamp": "2026-05-04T21:20:24.282192"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7530, "epoch": 0, "train_loss": 4.07646407186985, "train_ppl": 58.93670502548853, "lr": 0.00056, "grad_norm": 0.7287, "tokens_per_sec": 144593, "dt_s": 4.532, "eta_s": 37713, "world_size": 1, "timestamp": "2026-05-04T21:20:28.814617"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7540, "epoch": 0, "train_loss": 4.20784130692482, "train_ppl": 67.21129454044235, "lr": 0.00056, "grad_norm": 0.6511, "tokens_per_sec": 147861, "dt_s": 4.432, "eta_s": 37791, "world_size": 1, "timestamp": "2026-05-04T21:20:33.246884"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7550, "epoch": 0, "train_loss": 4.041189551353455, "train_ppl": 56.89398087997084, "lr": 0.00056, "grad_norm": 0.6381, "tokens_per_sec": 147095, "dt_s": 4.455, "eta_s": 37854, "world_size": 1, "timestamp": "2026-05-04T21:20:37.702254"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7560, "epoch": 0, "train_loss": 4.0829876363277435, "train_ppl": 59.32243923267041, "lr": 0.00056, "grad_norm": 0.629, "tokens_per_sec": 148389, "dt_s": 4.417, "eta_s": 37889, "world_size": 1, "timestamp": "2026-05-04T21:20:42.118759"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7570, "epoch": 0, "train_loss": 4.0983927845954895, "train_ppl": 60.24338564963374, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 149960, "dt_s": 4.37, "eta_s": 37796, "world_size": 1, "timestamp": "2026-05-04T21:20:46.488979"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7580, "epoch": 0, "train_loss": 4.121730417013168, "train_ppl": 61.66585764248908, "lr": 0.00056, "grad_norm": 0.6429, "tokens_per_sec": 147237, "dt_s": 4.451, "eta_s": 37653, "world_size": 1, "timestamp": "2026-05-04T21:20:50.940040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7590, "epoch": 0, "train_loss": 4.013166442513466, "train_ppl": 55.32176671206983, "lr": 0.00056, "grad_norm": 0.6831, "tokens_per_sec": 148018, "dt_s": 4.428, "eta_s": 37641, "world_size": 1, "timestamp": "2026-05-04T21:20:55.367614"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7600, "epoch": 0, "train_loss": 4.136623844504356, "train_ppl": 62.5911468826127, "lr": 0.00056, "grad_norm": 0.7049, "tokens_per_sec": 149735, "dt_s": 4.377, "eta_s": 37503, "world_size": 1, "timestamp": "2026-05-04T21:20:59.744404"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7610, "epoch": 0, "train_loss": 3.962911754846573, "train_ppl": 52.610291413061, "lr": 0.00056, "grad_norm": 0.6434, "tokens_per_sec": 146390, "dt_s": 4.477, "eta_s": 37601, "world_size": 1, "timestamp": "2026-05-04T21:21:04.221218"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7620, "epoch": 0, "train_loss": 4.056154012680054, "train_ppl": 57.75177083146282, "lr": 0.00056, "grad_norm": 0.6218, "tokens_per_sec": 133939, "dt_s": 4.893, "eta_s": 38486, "world_size": 1, "timestamp": "2026-05-04T21:21:09.114194"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7630, "epoch": 0, "train_loss": 4.123514994978905, "train_ppl": 61.776003425913224, "lr": 0.00056, "grad_norm": 0.7234, "tokens_per_sec": 148751, "dt_s": 4.406, "eta_s": 38404, "world_size": 1, "timestamp": "2026-05-04T21:21:13.519937"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7640, "epoch": 0, "train_loss": 4.016437292098999, "train_ppl": 55.50301214147262, "lr": 0.00056, "grad_norm": 0.6766, "tokens_per_sec": 149332, "dt_s": 4.389, "eta_s": 38333, "world_size": 1, "timestamp": "2026-05-04T21:21:17.908539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7650, "epoch": 0, "train_loss": 4.207393869757652, "train_ppl": 67.1812284360594, "lr": 0.00056, "grad_norm": 1.8258, "tokens_per_sec": 147771, "dt_s": 4.435, "eta_s": 38428, "world_size": 1, "timestamp": "2026-05-04T21:21:22.343509"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7660, "epoch": 0, "train_loss": 4.090605616569519, "train_ppl": 59.77608213007718, "lr": 0.00056, "grad_norm": 0.6464, "tokens_per_sec": 146926, "dt_s": 4.46, "eta_s": 38396, "world_size": 1, "timestamp": "2026-05-04T21:21:26.804006"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7670, "epoch": 0, "train_loss": 4.138023719191551, "train_ppl": 62.678828001747085, "lr": 0.00056, "grad_norm": 0.6434, "tokens_per_sec": 146044, "dt_s": 4.487, "eta_s": 37701, "world_size": 1, "timestamp": "2026-05-04T21:21:31.291406"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7680, "epoch": 0, "train_loss": 4.065116599202156, "train_ppl": 58.27170256048141, "lr": 0.00056, "grad_norm": 0.7329, "tokens_per_sec": 148802, "dt_s": 4.404, "eta_s": 37694, "world_size": 1, "timestamp": "2026-05-04T21:21:35.695653"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7690, "epoch": 0, "train_loss": 4.079849675297737, "train_ppl": 59.13657949297375, "lr": 0.00056, "grad_norm": 0.7171, "tokens_per_sec": 147309, "dt_s": 4.449, "eta_s": 37792, "world_size": 1, "timestamp": "2026-05-04T21:21:40.144513"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7700, "epoch": 0, "train_loss": 4.034673944115639, "train_ppl": 56.52448709174156, "lr": 0.00056, "grad_norm": 0.6472, "tokens_per_sec": 148402, "dt_s": 4.416, "eta_s": 37756, "world_size": 1, "timestamp": "2026-05-04T21:21:44.560640"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7710, "epoch": 0, "train_loss": 4.0820930898189545, "train_ppl": 59.26939627999318, "lr": 0.00056, "grad_norm": 0.6418, "tokens_per_sec": 147588, "dt_s": 4.44, "eta_s": 37717, "world_size": 1, "timestamp": "2026-05-04T21:21:49.001083"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7720, "epoch": 0, "train_loss": 4.151273101568222, "train_ppl": 63.51480965172429, "lr": 0.00056, "grad_norm": 0.6596, "tokens_per_sec": 144388, "dt_s": 4.539, "eta_s": 37800, "world_size": 1, "timestamp": "2026-05-04T21:21:53.540003"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7730, "epoch": 0, "train_loss": 4.0217446237802505, "train_ppl": 55.79836811899137, "lr": 0.00056, "grad_norm": 0.7631, "tokens_per_sec": 145916, "dt_s": 4.491, "eta_s": 37944, "world_size": 1, "timestamp": "2026-05-04T21:21:58.031378"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7740, "epoch": 0, "train_loss": 4.050182998180389, "train_ppl": 57.40796163677573, "lr": 0.00056, "grad_norm": 0.6738, "tokens_per_sec": 145702, "dt_s": 4.498, "eta_s": 38023, "world_size": 1, "timestamp": "2026-05-04T21:22:02.529312"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7750, "epoch": 0, "train_loss": 4.03191676735878, "train_ppl": 56.36885374265921, "lr": 0.00056, "grad_norm": 0.6669, "tokens_per_sec": 144185, "dt_s": 4.545, "eta_s": 38238, "world_size": 1, "timestamp": "2026-05-04T21:22:07.074555"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7760, "epoch": 0, "train_loss": 4.141759425401688, "train_ppl": 62.91341559109892, "lr": 0.00056, "grad_norm": 0.6846, "tokens_per_sec": 147801, "dt_s": 4.434, "eta_s": 38222, "world_size": 1, "timestamp": "2026-05-04T21:22:11.508642"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7770, "epoch": 0, "train_loss": 4.083533138036728, "train_ppl": 59.354808552609946, "lr": 0.00056, "grad_norm": 0.6434, "tokens_per_sec": 145265, "dt_s": 4.511, "eta_s": 38171, "world_size": 1, "timestamp": "2026-05-04T21:22:16.020136"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7780, "epoch": 0, "train_loss": 3.942975401878357, "train_ppl": 51.571820134184335, "lr": 0.00056, "grad_norm": 0.6113, "tokens_per_sec": 146084, "dt_s": 4.486, "eta_s": 38158, "world_size": 1, "timestamp": "2026-05-04T21:22:20.506325"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7790, "epoch": 0, "train_loss": 4.132710739970207, "train_ppl": 62.34669976742847, "lr": 0.00056, "grad_norm": 0.7424, "tokens_per_sec": 147890, "dt_s": 4.431, "eta_s": 38041, "world_size": 1, "timestamp": "2026-05-04T21:22:24.937710"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7800, "epoch": 0, "train_loss": 4.0742121785879135, "train_ppl": 58.80413517797871, "lr": 0.00056, "grad_norm": 0.6489, "tokens_per_sec": 144833, "dt_s": 4.525, "eta_s": 38002, "world_size": 1, "timestamp": "2026-05-04T21:22:29.462649"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7810, "epoch": 0, "train_loss": 4.172490760684013, "train_ppl": 64.87684372166612, "lr": 0.00056, "grad_norm": 0.7345, "tokens_per_sec": 148530, "dt_s": 4.412, "eta_s": 37960, "world_size": 1, "timestamp": "2026-05-04T21:22:33.874954"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7820, "epoch": 0, "train_loss": 4.114483281970024, "train_ppl": 61.220572314952946, "lr": 0.00056, "grad_norm": 0.7111, "tokens_per_sec": 149198, "dt_s": 4.393, "eta_s": 37754, "world_size": 1, "timestamp": "2026-05-04T21:22:38.267504"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7830, "epoch": 0, "train_loss": 3.9995914548635483, "train_ppl": 54.57584878033331, "lr": 0.00056, "grad_norm": 0.6692, "tokens_per_sec": 145177, "dt_s": 4.514, "eta_s": 37797, "world_size": 1, "timestamp": "2026-05-04T21:22:42.781735"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7840, "epoch": 0, "train_loss": 4.135554671287537, "train_ppl": 62.52426186696098, "lr": 0.00056, "grad_norm": 0.7435, "tokens_per_sec": 149224, "dt_s": 4.392, "eta_s": 37725, "world_size": 1, "timestamp": "2026-05-04T21:22:47.173528"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7850, "epoch": 0, "train_loss": 4.210496783256531, "train_ppl": 67.39000972427023, "lr": 0.00056, "grad_norm": 0.7055, "tokens_per_sec": 149397, "dt_s": 4.387, "eta_s": 37487, "world_size": 1, "timestamp": "2026-05-04T21:22:51.560251"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7860, "epoch": 0, "train_loss": 4.004154935479164, "train_ppl": 54.825473754477905, "lr": 0.00056, "grad_norm": 0.6502, "tokens_per_sec": 144766, "dt_s": 4.527, "eta_s": 37677, "world_size": 1, "timestamp": "2026-05-04T21:22:56.087307"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7870, "epoch": 0, "train_loss": 4.190493702888489, "train_ppl": 66.05539465061894, "lr": 0.00056, "grad_norm": 0.7886, "tokens_per_sec": 147838, "dt_s": 4.433, "eta_s": 37741, "world_size": 1, "timestamp": "2026-05-04T21:23:00.520271"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7880, "epoch": 0, "train_loss": 4.1480498015880585, "train_ppl": 63.31041196153772, "lr": 0.00056, "grad_norm": 0.7147, "tokens_per_sec": 146691, "dt_s": 4.468, "eta_s": 37657, "world_size": 1, "timestamp": "2026-05-04T21:23:04.987913"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7890, "epoch": 0, "train_loss": 4.0486636608839035, "train_ppl": 57.32080580586377, "lr": 0.00056, "grad_norm": 0.879, "tokens_per_sec": 146026, "dt_s": 4.488, "eta_s": 37816, "world_size": 1, "timestamp": "2026-05-04T21:23:09.475844"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7900, "epoch": 0, "train_loss": 4.101382061839104, "train_ppl": 60.423739260607036, "lr": 0.00056, "grad_norm": 0.6606, "tokens_per_sec": 149549, "dt_s": 4.382, "eta_s": 37804, "world_size": 1, "timestamp": "2026-05-04T21:23:13.858090"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7910, "epoch": 0, "train_loss": 4.070775985717773, "train_ppl": 58.60241959325077, "lr": 0.00056, "grad_norm": 0.6607, "tokens_per_sec": 130954, "dt_s": 5.004, "eta_s": 38609, "world_size": 1, "timestamp": "2026-05-04T21:23:18.862599"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7920, "epoch": 0, "train_loss": 4.14657661318779, "train_ppl": 63.21721246408482, "lr": 0.00056, "grad_norm": 0.6808, "tokens_per_sec": 148986, "dt_s": 4.399, "eta_s": 38546, "world_size": 1, "timestamp": "2026-05-04T21:23:23.261408"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7930, "epoch": 0, "train_loss": 4.082031413912773, "train_ppl": 59.26574089899442, "lr": 0.00056, "grad_norm": 0.641, "tokens_per_sec": 149152, "dt_s": 4.394, "eta_s": 38417, "world_size": 1, "timestamp": "2026-05-04T21:23:27.655317"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7940, "epoch": 0, "train_loss": 4.0393475741147995, "train_ppl": 56.789279920169015, "lr": 0.00056, "grad_norm": 0.6986, "tokens_per_sec": 145367, "dt_s": 4.508, "eta_s": 38447, "world_size": 1, "timestamp": "2026-05-04T21:23:32.163638"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7950, "epoch": 0, "train_loss": 4.084741413593292, "train_ppl": 59.42656886133168, "lr": 0.00056, "grad_norm": 0.6715, "tokens_per_sec": 147791, "dt_s": 4.434, "eta_s": 38531, "world_size": 1, "timestamp": "2026-05-04T21:23:36.598008"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7960, "epoch": 0, "train_loss": 4.142032787203789, "train_ppl": 62.93061606663063, "lr": 0.00056, "grad_norm": 1.0809, "tokens_per_sec": 148728, "dt_s": 4.406, "eta_s": 37513, "world_size": 1, "timestamp": "2026-05-04T21:23:41.004464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7970, "epoch": 0, "train_loss": 4.185123473405838, "train_ppl": 65.70161281782903, "lr": 0.00056, "grad_norm": 0.7966, "tokens_per_sec": 146216, "dt_s": 4.482, "eta_s": 37649, "world_size": 1, "timestamp": "2026-05-04T21:23:45.486599"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7980, "epoch": 0, "train_loss": 4.190883487462997, "train_ppl": 66.08114704313346, "lr": 0.00056, "grad_norm": 0.6716, "tokens_per_sec": 148429, "dt_s": 4.415, "eta_s": 37681, "world_size": 1, "timestamp": "2026-05-04T21:23:49.901911"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 7990, "epoch": 0, "train_loss": 4.04398238658905, "train_ppl": 57.053098485668144, "lr": 0.00056, "grad_norm": 0.6398, "tokens_per_sec": 149427, "dt_s": 4.386, "eta_s": 37469, "world_size": 1, "timestamp": "2026-05-04T21:23:54.287741"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8000, "epoch": 0, "train_loss": 4.094919130206108, "train_ppl": 60.03448398478858, "lr": 0.00056, "grad_norm": 0.6815, "tokens_per_sec": 146082, "dt_s": 4.486, "eta_s": 37553, "world_size": 1, "timestamp": "2026-05-04T21:23:58.773982"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8010, "epoch": 0, "train_loss": 4.171840220689774, "train_ppl": 64.83465246516296, "lr": 0.00056, "grad_norm": 0.7362, "tokens_per_sec": 126978, "dt_s": 5.161, "eta_s": 37526, "world_size": 1, "timestamp": "2026-05-04T21:24:03.935163"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8020, "epoch": 0, "train_loss": 4.127478003501892, "train_ppl": 62.021308005023876, "lr": 0.00056, "grad_norm": 0.716, "tokens_per_sec": 145084, "dt_s": 4.517, "eta_s": 37581, "world_size": 1, "timestamp": "2026-05-04T21:24:08.452304"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8030, "epoch": 0, "train_loss": 3.921751156449318, "train_ppl": 50.488781164829796, "lr": 0.00056, "grad_norm": 0.9228, "tokens_per_sec": 145861, "dt_s": 4.493, "eta_s": 37708, "world_size": 1, "timestamp": "2026-05-04T21:24:12.945329"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8040, "epoch": 0, "train_loss": 4.051957651972771, "train_ppl": 57.509931347312154, "lr": 0.00056, "grad_norm": 0.7158, "tokens_per_sec": 149143, "dt_s": 4.394, "eta_s": 37718, "world_size": 1, "timestamp": "2026-05-04T21:24:17.339489"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8050, "epoch": 0, "train_loss": 4.037545219063759, "train_ppl": 56.6870176587947, "lr": 0.00056, "grad_norm": 0.6549, "tokens_per_sec": 146236, "dt_s": 4.482, "eta_s": 37705, "world_size": 1, "timestamp": "2026-05-04T21:24:21.821006"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8060, "epoch": 0, "train_loss": 4.060873165726662, "train_ppl": 58.02495436719443, "lr": 0.00056, "grad_norm": 0.7015, "tokens_per_sec": 150701, "dt_s": 4.349, "eta_s": 37626, "world_size": 1, "timestamp": "2026-05-04T21:24:26.169785"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8070, "epoch": 0, "train_loss": 4.224006280303001, "train_ppl": 68.3065922056174, "lr": 0.00056, "grad_norm": 0.768, "tokens_per_sec": 149179, "dt_s": 4.393, "eta_s": 37411, "world_size": 1, "timestamp": "2026-05-04T21:24:30.562854"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8080, "epoch": 0, "train_loss": 4.10881282389164, "train_ppl": 60.87440601440884, "lr": 0.00056, "grad_norm": 0.7196, "tokens_per_sec": 145321, "dt_s": 4.51, "eta_s": 37435, "world_size": 1, "timestamp": "2026-05-04T21:24:35.072602"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8090, "epoch": 0, "train_loss": 4.04283595085144, "train_ppl": 56.98772825316791, "lr": 0.00056, "grad_norm": 0.7102, "tokens_per_sec": 148582, "dt_s": 4.411, "eta_s": 37459, "world_size": 1, "timestamp": "2026-05-04T21:24:39.483352"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8100, "epoch": 0, "train_loss": 3.994379699230194, "train_ppl": 54.292152712452435, "lr": 0.00056, "grad_norm": 0.6446, "tokens_per_sec": 148368, "dt_s": 4.417, "eta_s": 37345, "world_size": 1, "timestamp": "2026-05-04T21:24:43.900475"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8110, "epoch": 0, "train_loss": 4.117553234100342, "train_ppl": 61.408805326683044, "lr": 0.00056, "grad_norm": 0.6593, "tokens_per_sec": 147255, "dt_s": 4.451, "eta_s": 37513, "world_size": 1, "timestamp": "2026-05-04T21:24:48.350967"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8120, "epoch": 0, "train_loss": 4.021104395389557, "train_ppl": 55.762655852790424, "lr": 0.00056, "grad_norm": 0.8175, "tokens_per_sec": 149071, "dt_s": 4.396, "eta_s": 37514, "world_size": 1, "timestamp": "2026-05-04T21:24:52.747286"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8130, "epoch": 0, "train_loss": 4.0429124385118484, "train_ppl": 56.99208727787764, "lr": 0.00056, "grad_norm": 0.6965, "tokens_per_sec": 146734, "dt_s": 4.466, "eta_s": 37436, "world_size": 1, "timestamp": "2026-05-04T21:24:57.213614"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8140, "epoch": 0, "train_loss": 4.054911315441132, "train_ppl": 57.68004743976829, "lr": 0.00056, "grad_norm": 0.6589, "tokens_per_sec": 147666, "dt_s": 4.438, "eta_s": 37478, "world_size": 1, "timestamp": "2026-05-04T21:25:01.651708"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8150, "epoch": 0, "train_loss": 4.185966819524765, "train_ppl": 65.75704538914268, "lr": 0.00056, "grad_norm": 0.7105, "tokens_per_sec": 150077, "dt_s": 4.367, "eta_s": 37388, "world_size": 1, "timestamp": "2026-05-04T21:25:06.018533"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8160, "epoch": 0, "train_loss": 4.157423093914986, "train_ppl": 63.90662885257051, "lr": 0.00056, "grad_norm": 0.653, "tokens_per_sec": 146421, "dt_s": 4.476, "eta_s": 37427, "world_size": 1, "timestamp": "2026-05-04T21:25:10.494417"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8170, "epoch": 0, "train_loss": 4.098485022783279, "train_ppl": 60.248942646632116, "lr": 0.00056, "grad_norm": 0.693, "tokens_per_sec": 148506, "dt_s": 4.413, "eta_s": 37451, "world_size": 1, "timestamp": "2026-05-04T21:25:14.907443"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8180, "epoch": 0, "train_loss": 4.013581290841103, "train_ppl": 55.34472161554469, "lr": 0.00056, "grad_norm": 0.665, "tokens_per_sec": 150805, "dt_s": 4.346, "eta_s": 37243, "world_size": 1, "timestamp": "2026-05-04T21:25:19.253167"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8190, "epoch": 0, "train_loss": 4.108665198087692, "train_ppl": 60.865420044577974, "lr": 0.00056, "grad_norm": 0.7191, "tokens_per_sec": 146528, "dt_s": 4.473, "eta_s": 37297, "world_size": 1, "timestamp": "2026-05-04T21:25:23.725768"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8200, "epoch": 0, "train_loss": 3.981793522834778, "train_ppl": 53.61310437667831, "lr": 0.00056, "grad_norm": 0.6636, "tokens_per_sec": 149365, "dt_s": 4.388, "eta_s": 37327, "world_size": 1, "timestamp": "2026-05-04T21:25:28.113404"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8210, "epoch": 0, "train_loss": 4.116303503513336, "train_ppl": 61.33210879933511, "lr": 0.00056, "grad_norm": 0.67, "tokens_per_sec": 132952, "dt_s": 4.929, "eta_s": 38089, "world_size": 1, "timestamp": "2026-05-04T21:25:33.042703"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8220, "epoch": 0, "train_loss": 4.185468152165413, "train_ppl": 65.72426267147463, "lr": 0.00056, "grad_norm": 0.779, "tokens_per_sec": 147531, "dt_s": 4.442, "eta_s": 38133, "world_size": 1, "timestamp": "2026-05-04T21:25:37.484878"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8230, "epoch": 0, "train_loss": 4.079463362693787, "train_ppl": 59.11373869908844, "lr": 0.00056, "grad_norm": 0.6875, "tokens_per_sec": 148744, "dt_s": 4.406, "eta_s": 38230, "world_size": 1, "timestamp": "2026-05-04T21:25:41.890825"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8240, "epoch": 0, "train_loss": 4.150014355778694, "train_ppl": 63.43491094912982, "lr": 0.00056, "grad_norm": 0.6445, "tokens_per_sec": 147926, "dt_s": 4.43, "eta_s": 38155, "world_size": 1, "timestamp": "2026-05-04T21:25:46.321156"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8250, "epoch": 0, "train_loss": 3.928585097193718, "train_ppl": 50.835000176224284, "lr": 0.00056, "grad_norm": 0.7431, "tokens_per_sec": 148307, "dt_s": 4.419, "eta_s": 38203, "world_size": 1, "timestamp": "2026-05-04T21:25:50.740089"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8260, "epoch": 0, "train_loss": 4.072588488459587, "train_ppl": 58.70873295696732, "lr": 0.00056, "grad_norm": 0.7247, "tokens_per_sec": 151086, "dt_s": 4.338, "eta_s": 37200, "world_size": 1, "timestamp": "2026-05-04T21:25:55.077757"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8270, "epoch": 0, "train_loss": 4.077551528811455, "train_ppl": 59.00083101529076, "lr": 0.00056, "grad_norm": 0.7312, "tokens_per_sec": 148443, "dt_s": 4.415, "eta_s": 37149, "world_size": 1, "timestamp": "2026-05-04T21:25:59.492640"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8280, "epoch": 0, "train_loss": 4.076340243220329, "train_ppl": 58.92940742473336, "lr": 0.00056, "grad_norm": 0.6702, "tokens_per_sec": 147136, "dt_s": 4.454, "eta_s": 37226, "world_size": 1, "timestamp": "2026-05-04T21:26:03.946757"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8290, "epoch": 0, "train_loss": 4.099889829754829, "train_ppl": 60.33364025925986, "lr": 0.00056, "grad_norm": 0.6797, "tokens_per_sec": 150830, "dt_s": 4.345, "eta_s": 37078, "world_size": 1, "timestamp": "2026-05-04T21:26:08.291789"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8300, "epoch": 0, "train_loss": 4.129268556833267, "train_ppl": 62.13245994673142, "lr": 0.00056, "grad_norm": 0.6398, "tokens_per_sec": 146877, "dt_s": 4.462, "eta_s": 37146, "world_size": 1, "timestamp": "2026-05-04T21:26:12.753746"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8310, "epoch": 0, "train_loss": 4.090572744607925, "train_ppl": 59.77411720529682, "lr": 0.00056, "grad_norm": 0.6895, "tokens_per_sec": 146976, "dt_s": 4.459, "eta_s": 37346, "world_size": 1, "timestamp": "2026-05-04T21:26:17.212700"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8320, "epoch": 0, "train_loss": 3.929868996143341, "train_ppl": 50.900309095609884, "lr": 0.00056, "grad_norm": 0.656, "tokens_per_sec": 148932, "dt_s": 4.4, "eta_s": 37317, "world_size": 1, "timestamp": "2026-05-04T21:26:21.613096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8330, "epoch": 0, "train_loss": 4.034866616129875, "train_ppl": 56.53537882775588, "lr": 0.00056, "grad_norm": 0.6688, "tokens_per_sec": 147366, "dt_s": 4.447, "eta_s": 37301, "world_size": 1, "timestamp": "2026-05-04T21:26:26.060287"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8340, "epoch": 0, "train_loss": 4.077538847923279, "train_ppl": 59.000082837094155, "lr": 0.00056, "grad_norm": 0.6906, "tokens_per_sec": 148795, "dt_s": 4.404, "eta_s": 37397, "world_size": 1, "timestamp": "2026-05-04T21:26:30.464711"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8350, "epoch": 0, "train_loss": 4.064931392669678, "train_ppl": 58.26091125984911, "lr": 0.00056, "grad_norm": 0.7536, "tokens_per_sec": 147362, "dt_s": 4.447, "eta_s": 37368, "world_size": 1, "timestamp": "2026-05-04T21:26:34.911986"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8360, "epoch": 0, "train_loss": 4.07754984498024, "train_ppl": 59.00073166793343, "lr": 0.00056, "grad_norm": 0.6795, "tokens_per_sec": 149212, "dt_s": 4.392, "eta_s": 37251, "world_size": 1, "timestamp": "2026-05-04T21:26:39.304116"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8370, "epoch": 0, "train_loss": 4.105787500739098, "train_ppl": 60.690519562719004, "lr": 0.00056, "grad_norm": 0.6681, "tokens_per_sec": 148292, "dt_s": 4.419, "eta_s": 37278, "world_size": 1, "timestamp": "2026-05-04T21:26:43.723502"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8380, "epoch": 0, "train_loss": 3.9943300634622574, "train_ppl": 54.28945794663855, "lr": 0.00056, "grad_norm": 0.667, "tokens_per_sec": 145106, "dt_s": 4.516, "eta_s": 37391, "world_size": 1, "timestamp": "2026-05-04T21:26:48.239936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8390, "epoch": 0, "train_loss": 4.027742937207222, "train_ppl": 56.1340700358011, "lr": 0.00056, "grad_norm": 0.6976, "tokens_per_sec": 148230, "dt_s": 4.421, "eta_s": 37414, "world_size": 1, "timestamp": "2026-05-04T21:26:52.661160"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8400, "epoch": 0, "train_loss": 4.046273797750473, "train_ppl": 57.18398048730461, "lr": 0.00056, "grad_norm": 0.656, "tokens_per_sec": 149868, "dt_s": 4.373, "eta_s": 37285, "world_size": 1, "timestamp": "2026-05-04T21:26:57.034050"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8410, "epoch": 0, "train_loss": 4.035008043050766, "train_ppl": 56.543375017729815, "lr": 0.00056, "grad_norm": 0.6759, "tokens_per_sec": 146646, "dt_s": 4.469, "eta_s": 37410, "world_size": 1, "timestamp": "2026-05-04T21:27:01.503068"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8420, "epoch": 0, "train_loss": 4.087330594658852, "train_ppl": 59.580634373903166, "lr": 0.00056, "grad_norm": 0.7191, "tokens_per_sec": 148256, "dt_s": 4.42, "eta_s": 37407, "world_size": 1, "timestamp": "2026-05-04T21:27:05.923539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8430, "epoch": 0, "train_loss": 4.039724513888359, "train_ppl": 56.81069009340216, "lr": 0.00056, "grad_norm": 0.667, "tokens_per_sec": 148652, "dt_s": 4.409, "eta_s": 37221, "world_size": 1, "timestamp": "2026-05-04T21:27:10.332215"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8440, "epoch": 0, "train_loss": 4.13226905465126, "train_ppl": 62.31916822604281, "lr": 0.00056, "grad_norm": 0.7114, "tokens_per_sec": 146646, "dt_s": 4.469, "eta_s": 37297, "world_size": 1, "timestamp": "2026-05-04T21:27:14.801220"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8450, "epoch": 0, "train_loss": 4.099935919046402, "train_ppl": 60.33642105807944, "lr": 0.00056, "grad_norm": 0.7332, "tokens_per_sec": 148815, "dt_s": 4.404, "eta_s": 37345, "world_size": 1, "timestamp": "2026-05-04T21:27:19.205049"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8460, "epoch": 0, "train_loss": 3.999591141939163, "train_ppl": 54.57583170222207, "lr": 0.00056, "grad_norm": 0.6548, "tokens_per_sec": 149186, "dt_s": 4.393, "eta_s": 37212, "world_size": 1, "timestamp": "2026-05-04T21:27:23.597955"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8470, "epoch": 0, "train_loss": 4.105728656053543, "train_ppl": 60.68694835325358, "lr": 0.00056, "grad_norm": 0.7041, "tokens_per_sec": 147029, "dt_s": 4.457, "eta_s": 37270, "world_size": 1, "timestamp": "2026-05-04T21:27:28.055347"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8480, "epoch": 0, "train_loss": 4.036774188280106, "train_ppl": 56.643327068705695, "lr": 0.00056, "grad_norm": 0.7154, "tokens_per_sec": 147042, "dt_s": 4.457, "eta_s": 37347, "world_size": 1, "timestamp": "2026-05-04T21:27:32.512257"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8490, "epoch": 0, "train_loss": 3.9460630416870117, "train_ppl": 51.731301422721565, "lr": 0.00056, "grad_norm": 0.6444, "tokens_per_sec": 145814, "dt_s": 4.495, "eta_s": 37385, "world_size": 1, "timestamp": "2026-05-04T21:27:37.006763"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8500, "epoch": 0, "train_loss": 4.022583469748497, "train_ppl": 55.8451939922244, "lr": 0.00056, "grad_norm": 0.713, "tokens_per_sec": 149859, "dt_s": 4.373, "eta_s": 37329, "world_size": 1, "timestamp": "2026-05-04T21:27:41.379936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8510, "epoch": 0, "train_loss": 4.043201178312302, "train_ppl": 57.00854553774865, "lr": 0.00056, "grad_norm": 0.6438, "tokens_per_sec": 99556, "dt_s": 6.583, "eta_s": 38273, "world_size": 1, "timestamp": "2026-05-04T21:27:47.962791"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8520, "epoch": 0, "train_loss": 4.17759308218956, "train_ppl": 65.20871216637082, "lr": 0.00056, "grad_norm": 0.7749, "tokens_per_sec": 143215, "dt_s": 4.576, "eta_s": 38468, "world_size": 1, "timestamp": "2026-05-04T21:27:52.538838"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8530, "epoch": 0, "train_loss": 4.1489255130290985, "train_ppl": 63.365877896161564, "lr": 0.00056, "grad_norm": 0.6837, "tokens_per_sec": 148615, "dt_s": 4.41, "eta_s": 38384, "world_size": 1, "timestamp": "2026-05-04T21:27:56.948642"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8540, "epoch": 0, "train_loss": 4.174620181322098, "train_ppl": 65.01514100590813, "lr": 0.00056, "grad_norm": 0.6518, "tokens_per_sec": 147057, "dt_s": 4.456, "eta_s": 38316, "world_size": 1, "timestamp": "2026-05-04T21:28:01.405101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8550, "epoch": 0, "train_loss": 3.9680114537477493, "train_ppl": 52.87927333883719, "lr": 0.00056, "grad_norm": 0.6701, "tokens_per_sec": 149616, "dt_s": 4.38, "eta_s": 38323, "world_size": 1, "timestamp": "2026-05-04T21:28:05.785403"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8560, "epoch": 0, "train_loss": 3.992217496037483, "train_ppl": 54.17488886633489, "lr": 0.00056, "grad_norm": 0.7242, "tokens_per_sec": 149020, "dt_s": 4.398, "eta_s": 37379, "world_size": 1, "timestamp": "2026-05-04T21:28:10.183215"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8570, "epoch": 0, "train_loss": 4.043837487697601, "train_ppl": 57.04483215384946, "lr": 0.00056, "grad_norm": 0.7163, "tokens_per_sec": 146422, "dt_s": 4.476, "eta_s": 37206, "world_size": 1, "timestamp": "2026-05-04T21:28:14.659022"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8580, "epoch": 0, "train_loss": 4.099627465009689, "train_ppl": 60.31781291546917, "lr": 0.00056, "grad_norm": 0.7423, "tokens_per_sec": 149225, "dt_s": 4.392, "eta_s": 37172, "world_size": 1, "timestamp": "2026-05-04T21:28:19.050784"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8590, "epoch": 0, "train_loss": 4.010076075792313, "train_ppl": 55.151066064921515, "lr": 0.00056, "grad_norm": 0.6728, "tokens_per_sec": 148442, "dt_s": 4.415, "eta_s": 37097, "world_size": 1, "timestamp": "2026-05-04T21:28:23.465710"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8600, "epoch": 0, "train_loss": 4.043455436825752, "train_ppl": 57.023042288674134, "lr": 0.00056, "grad_norm": 0.6535, "tokens_per_sec": 145672, "dt_s": 4.499, "eta_s": 37292, "world_size": 1, "timestamp": "2026-05-04T21:28:27.964556"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8610, "epoch": 0, "train_loss": 4.0259018689394, "train_ppl": 56.030818456482166, "lr": 0.00056, "grad_norm": 0.7019, "tokens_per_sec": 148153, "dt_s": 4.424, "eta_s": 37331, "world_size": 1, "timestamp": "2026-05-04T21:28:32.388091"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8620, "epoch": 0, "train_loss": 4.047248438000679, "train_ppl": 57.23974146538729, "lr": 0.00056, "grad_norm": 0.7301, "tokens_per_sec": 147070, "dt_s": 4.456, "eta_s": 37294, "world_size": 1, "timestamp": "2026-05-04T21:28:36.844220"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8630, "epoch": 0, "train_loss": 3.9769138395786285, "train_ppl": 53.352126671062926, "lr": 0.00056, "grad_norm": 0.6594, "tokens_per_sec": 146484, "dt_s": 4.474, "eta_s": 37427, "world_size": 1, "timestamp": "2026-05-04T21:28:41.318139"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8640, "epoch": 0, "train_loss": 3.9912991374731064, "train_ppl": 54.12515973125394, "lr": 0.00056, "grad_norm": 0.7033, "tokens_per_sec": 148076, "dt_s": 4.426, "eta_s": 37441, "world_size": 1, "timestamp": "2026-05-04T21:28:45.743965"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8650, "epoch": 0, "train_loss": 3.9566355496644974, "train_ppl": 52.28113244455998, "lr": 0.00056, "grad_norm": 0.6773, "tokens_per_sec": 145794, "dt_s": 4.495, "eta_s": 37430, "world_size": 1, "timestamp": "2026-05-04T21:28:50.239067"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8660, "epoch": 0, "train_loss": 3.881150260567665, "train_ppl": 48.4799475837726, "lr": 0.00056, "grad_norm": 0.728, "tokens_per_sec": 148070, "dt_s": 4.426, "eta_s": 37430, "world_size": 1, "timestamp": "2026-05-04T21:28:54.665117"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8670, "epoch": 0, "train_loss": 3.957682877779007, "train_ppl": 52.33591662792759, "lr": 0.00056, "grad_norm": 0.6949, "tokens_per_sec": 148334, "dt_s": 4.418, "eta_s": 37362, "world_size": 1, "timestamp": "2026-05-04T21:28:59.083250"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8680, "epoch": 0, "train_loss": 3.8949692845344543, "train_ppl": 49.15454353480812, "lr": 0.00056, "grad_norm": 0.7845, "tokens_per_sec": 146668, "dt_s": 4.468, "eta_s": 37348, "world_size": 1, "timestamp": "2026-05-04T21:29:03.551542"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8690, "epoch": 0, "train_loss": 4.033250004053116, "train_ppl": 56.444056887543134, "lr": 0.00056, "grad_norm": 0.6575, "tokens_per_sec": 148464, "dt_s": 4.414, "eta_s": 37324, "world_size": 1, "timestamp": "2026-05-04T21:29:07.965826"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8700, "epoch": 0, "train_loss": 4.090938419103622, "train_ppl": 59.795979072381606, "lr": 0.00056, "grad_norm": 0.6893, "tokens_per_sec": 148024, "dt_s": 4.427, "eta_s": 37206, "world_size": 1, "timestamp": "2026-05-04T21:29:12.393219"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8710, "epoch": 0, "train_loss": 4.104903191328049, "train_ppl": 60.63687408820314, "lr": 0.00056, "grad_norm": 0.6965, "tokens_per_sec": 146726, "dt_s": 4.467, "eta_s": 37269, "world_size": 1, "timestamp": "2026-05-04T21:29:16.859806"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8720, "epoch": 0, "train_loss": 4.035644486546516, "train_ppl": 56.57937313515636, "lr": 0.00056, "grad_norm": 0.8505, "tokens_per_sec": 149061, "dt_s": 4.397, "eta_s": 37229, "world_size": 1, "timestamp": "2026-05-04T21:29:21.256398"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8730, "epoch": 0, "train_loss": 4.0780991315841675, "train_ppl": 59.033148881816146, "lr": 0.00056, "grad_norm": 0.6804, "tokens_per_sec": 147992, "dt_s": 4.428, "eta_s": 37157, "world_size": 1, "timestamp": "2026-05-04T21:29:25.684749"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8740, "epoch": 0, "train_loss": 4.001604348421097, "train_ppl": 54.68581479255028, "lr": 0.00056, "grad_norm": 0.7956, "tokens_per_sec": 147454, "dt_s": 4.445, "eta_s": 37204, "world_size": 1, "timestamp": "2026-05-04T21:29:30.129293"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8750, "epoch": 0, "train_loss": 4.052804544568062, "train_ppl": 57.5586567119889, "lr": 0.00056, "grad_norm": 0.6789, "tokens_per_sec": 149019, "dt_s": 4.398, "eta_s": 37150, "world_size": 1, "timestamp": "2026-05-04T21:29:34.527096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8760, "epoch": 0, "train_loss": 4.040338471531868, "train_ppl": 56.84558016023365, "lr": 0.00056, "grad_norm": 0.6815, "tokens_per_sec": 145915, "dt_s": 4.491, "eta_s": 37187, "world_size": 1, "timestamp": "2026-05-04T21:29:39.018482"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8770, "epoch": 0, "train_loss": 3.9927612096071243, "train_ppl": 54.20435249770756, "lr": 0.00056, "grad_norm": 0.6748, "tokens_per_sec": 148401, "dt_s": 4.416, "eta_s": 37215, "world_size": 1, "timestamp": "2026-05-04T21:29:43.434633"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8780, "epoch": 0, "train_loss": 4.117637515068054, "train_ppl": 61.41398113833018, "lr": 0.00056, "grad_norm": 0.6871, "tokens_per_sec": 149794, "dt_s": 4.375, "eta_s": 37121, "world_size": 1, "timestamp": "2026-05-04T21:29:47.809706"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8790, "epoch": 0, "train_loss": 3.9753317683935165, "train_ppl": 53.267786542427416, "lr": 0.00056, "grad_norm": 0.6628, "tokens_per_sec": 145043, "dt_s": 4.518, "eta_s": 37241, "world_size": 1, "timestamp": "2026-05-04T21:29:52.328088"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8800, "epoch": 0, "train_loss": 3.9050710201263428, "train_ppl": 49.65360619234239, "lr": 0.00056, "grad_norm": 0.7261, "tokens_per_sec": 133592, "dt_s": 4.906, "eta_s": 38088, "world_size": 1, "timestamp": "2026-05-04T21:29:57.233779"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8810, "epoch": 0, "train_loss": 4.018424481153488, "train_ppl": 55.61341678080516, "lr": 0.00056, "grad_norm": 0.6372, "tokens_per_sec": 149567, "dt_s": 4.382, "eta_s": 37900, "world_size": 1, "timestamp": "2026-05-04T21:30:01.615480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8820, "epoch": 0, "train_loss": 3.9810272455215454, "train_ppl": 53.57203760737879, "lr": 0.00056, "grad_norm": 0.6666, "tokens_per_sec": 147479, "dt_s": 4.444, "eta_s": 37942, "world_size": 1, "timestamp": "2026-05-04T21:30:06.059247"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8830, "epoch": 0, "train_loss": 4.065857574343681, "train_ppl": 58.31489644435391, "lr": 0.00056, "grad_norm": 0.7535, "tokens_per_sec": 149015, "dt_s": 4.398, "eta_s": 37976, "world_size": 1, "timestamp": "2026-05-04T21:30:10.457219"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8840, "epoch": 0, "train_loss": 4.014081373810768, "train_ppl": 55.372405489825596, "lr": 0.00056, "grad_norm": 0.6754, "tokens_per_sec": 148325, "dt_s": 4.418, "eta_s": 37803, "world_size": 1, "timestamp": "2026-05-04T21:30:14.875615"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8850, "epoch": 0, "train_loss": 4.041223928332329, "train_ppl": 56.895936756767995, "lr": 0.00056, "grad_norm": 0.6839, "tokens_per_sec": 147740, "dt_s": 4.436, "eta_s": 37011, "world_size": 1, "timestamp": "2026-05-04T21:30:19.311534"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8860, "epoch": 0, "train_loss": 4.017037272453308, "train_ppl": 55.536322850248645, "lr": 0.00056, "grad_norm": 0.6658, "tokens_per_sec": 150219, "dt_s": 4.363, "eta_s": 36975, "world_size": 1, "timestamp": "2026-05-04T21:30:23.674224"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8870, "epoch": 0, "train_loss": 4.1003038585186005, "train_ppl": 60.358625293654754, "lr": 0.00056, "grad_norm": 0.6448, "tokens_per_sec": 143842, "dt_s": 4.556, "eta_s": 37159, "world_size": 1, "timestamp": "2026-05-04T21:30:28.230375"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8880, "epoch": 0, "train_loss": 4.038287773728371, "train_ppl": 56.72912650030366, "lr": 0.00056, "grad_norm": 0.6593, "tokens_per_sec": 149763, "dt_s": 4.376, "eta_s": 37117, "world_size": 1, "timestamp": "2026-05-04T21:30:32.606331"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8890, "epoch": 0, "train_loss": 4.131215661764145, "train_ppl": 62.25355622117929, "lr": 0.00056, "grad_norm": 0.69, "tokens_per_sec": 149827, "dt_s": 4.374, "eta_s": 37039, "world_size": 1, "timestamp": "2026-05-04T21:30:36.980428"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8900, "epoch": 0, "train_loss": 3.998452588915825, "train_ppl": 54.51372958402211, "lr": 0.00056, "grad_norm": 0.7019, "tokens_per_sec": 146245, "dt_s": 4.481, "eta_s": 37110, "world_size": 1, "timestamp": "2026-05-04T21:30:41.461676"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8910, "epoch": 0, "train_loss": 4.009988471865654, "train_ppl": 55.14623482659563, "lr": 0.00056, "grad_norm": 0.6847, "tokens_per_sec": 150139, "dt_s": 4.365, "eta_s": 37110, "world_size": 1, "timestamp": "2026-05-04T21:30:45.826703"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8920, "epoch": 0, "train_loss": 4.011792868375778, "train_ppl": 55.24583032812862, "lr": 0.00056, "grad_norm": 0.6521, "tokens_per_sec": 150055, "dt_s": 4.367, "eta_s": 36789, "world_size": 1, "timestamp": "2026-05-04T21:30:50.194191"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8930, "epoch": 0, "train_loss": 3.9170302599668503, "train_ppl": 50.2509905894039, "lr": 0.00056, "grad_norm": 0.7092, "tokens_per_sec": 146980, "dt_s": 4.459, "eta_s": 36924, "world_size": 1, "timestamp": "2026-05-04T21:30:54.652985"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8940, "epoch": 0, "train_loss": 4.064791262149811, "train_ppl": 58.25274770006164, "lr": 0.00056, "grad_norm": 0.7624, "tokens_per_sec": 149320, "dt_s": 4.389, "eta_s": 36944, "world_size": 1, "timestamp": "2026-05-04T21:30:59.041934"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8950, "epoch": 0, "train_loss": 4.066685348749161, "train_ppl": 58.36318800759126, "lr": 0.00056, "grad_norm": 0.7449, "tokens_per_sec": 147399, "dt_s": 4.446, "eta_s": 36881, "world_size": 1, "timestamp": "2026-05-04T21:31:03.488114"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8960, "epoch": 0, "train_loss": 4.0164191126823425, "train_ppl": 55.502003138260775, "lr": 0.00056, "grad_norm": 0.677, "tokens_per_sec": 146550, "dt_s": 4.472, "eta_s": 37056, "world_size": 1, "timestamp": "2026-05-04T21:31:07.960031"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8970, "epoch": 0, "train_loss": 4.018677473068237, "train_ppl": 55.62748830551821, "lr": 0.00056, "grad_norm": 0.6638, "tokens_per_sec": 148389, "dt_s": 4.416, "eta_s": 37133, "world_size": 1, "timestamp": "2026-05-04T21:31:12.376514"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8980, "epoch": 0, "train_loss": 3.978628918528557, "train_ppl": 53.44370829285734, "lr": 0.00056, "grad_norm": 0.7079, "tokens_per_sec": 146126, "dt_s": 4.485, "eta_s": 37173, "world_size": 1, "timestamp": "2026-05-04T21:31:16.861411"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 8990, "epoch": 0, "train_loss": 4.063204497098923, "train_ppl": 58.16038757218336, "lr": 0.00056, "grad_norm": 0.6948, "tokens_per_sec": 147802, "dt_s": 4.434, "eta_s": 37244, "world_size": 1, "timestamp": "2026-05-04T21:31:21.295467"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9000, "epoch": 0, "train_loss": 4.010418564081192, "train_ppl": 55.169957894098424, "lr": 0.00056, "grad_norm": 0.656, "tokens_per_sec": 150175, "dt_s": 4.364, "eta_s": 37102, "world_size": 1, "timestamp": "2026-05-04T21:31:25.659455"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9010, "epoch": 0, "train_loss": 3.989252135157585, "train_ppl": 54.01447872474254, "lr": 0.00056, "grad_norm": 0.6408, "tokens_per_sec": 108480, "dt_s": 6.041, "eta_s": 37145, "world_size": 1, "timestamp": "2026-05-04T21:31:31.700784"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9020, "epoch": 0, "train_loss": 4.140399008989334, "train_ppl": 62.827885339541695, "lr": 0.00056, "grad_norm": 0.6674, "tokens_per_sec": 149207, "dt_s": 4.392, "eta_s": 37100, "world_size": 1, "timestamp": "2026-05-04T21:31:36.093055"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9030, "epoch": 0, "train_loss": 4.172608911991119, "train_ppl": 64.88450945840223, "lr": 0.00056, "grad_norm": 0.6913, "tokens_per_sec": 149032, "dt_s": 4.397, "eta_s": 36950, "world_size": 1, "timestamp": "2026-05-04T21:31:40.490509"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9040, "epoch": 0, "train_loss": 3.95819228887558, "train_ppl": 52.3625839163368, "lr": 0.00056, "grad_norm": 0.6819, "tokens_per_sec": 146965, "dt_s": 4.459, "eta_s": 36988, "world_size": 1, "timestamp": "2026-05-04T21:31:44.949808"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9050, "epoch": 0, "train_loss": 4.093452230095863, "train_ppl": 59.946483953020795, "lr": 0.00056, "grad_norm": 0.7014, "tokens_per_sec": 148841, "dt_s": 4.403, "eta_s": 37049, "world_size": 1, "timestamp": "2026-05-04T21:31:49.352898"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9060, "epoch": 0, "train_loss": 3.8976652026176453, "train_ppl": 49.28723894520393, "lr": 0.00056, "grad_norm": 0.63, "tokens_per_sec": 146808, "dt_s": 4.464, "eta_s": 36983, "world_size": 1, "timestamp": "2026-05-04T21:31:53.816962"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9070, "epoch": 0, "train_loss": 3.996950328350067, "train_ppl": 54.43189723991728, "lr": 0.00056, "grad_norm": 0.6802, "tokens_per_sec": 149014, "dt_s": 4.398, "eta_s": 36988, "world_size": 1, "timestamp": "2026-05-04T21:31:58.214941"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9080, "epoch": 0, "train_loss": 4.037318572402, "train_ppl": 56.67417119133677, "lr": 0.00056, "grad_norm": 0.661, "tokens_per_sec": 149600, "dt_s": 4.381, "eta_s": 36956, "world_size": 1, "timestamp": "2026-05-04T21:32:02.595701"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9090, "epoch": 0, "train_loss": 4.040605872869492, "train_ppl": 56.860782776915926, "lr": 0.00056, "grad_norm": 0.6889, "tokens_per_sec": 131914, "dt_s": 4.968, "eta_s": 37802, "world_size": 1, "timestamp": "2026-05-04T21:32:07.563773"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9100, "epoch": 0, "train_loss": 4.113004177808762, "train_ppl": 61.13008764630847, "lr": 0.00056, "grad_norm": 0.6521, "tokens_per_sec": 147782, "dt_s": 4.435, "eta_s": 37850, "world_size": 1, "timestamp": "2026-05-04T21:32:11.998419"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9110, "epoch": 0, "train_loss": 3.96631920337677, "train_ppl": 52.78986404170741, "lr": 0.00056, "grad_norm": 0.7282, "tokens_per_sec": 148859, "dt_s": 4.403, "eta_s": 37743, "world_size": 1, "timestamp": "2026-05-04T21:32:16.400956"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9120, "epoch": 0, "train_loss": 4.032929539680481, "train_ppl": 56.42597147628346, "lr": 0.00056, "grad_norm": 0.767, "tokens_per_sec": 146113, "dt_s": 4.485, "eta_s": 37884, "world_size": 1, "timestamp": "2026-05-04T21:32:20.886284"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9130, "epoch": 0, "train_loss": 4.034661442041397, "train_ppl": 56.52378042282489, "lr": 0.00056, "grad_norm": 0.7345, "tokens_per_sec": 148829, "dt_s": 4.403, "eta_s": 37917, "world_size": 1, "timestamp": "2026-05-04T21:32:25.289688"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9140, "epoch": 0, "train_loss": 4.090318456292152, "train_ppl": 59.758919278115414, "lr": 0.00056, "grad_norm": 0.6798, "tokens_per_sec": 149601, "dt_s": 4.381, "eta_s": 36932, "world_size": 1, "timestamp": "2026-05-04T21:32:29.670431"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9150, "epoch": 0, "train_loss": 3.93370421230793, "train_ppl": 51.095897606204005, "lr": 0.00056, "grad_norm": 0.6493, "tokens_per_sec": 146905, "dt_s": 4.461, "eta_s": 36971, "world_size": 1, "timestamp": "2026-05-04T21:32:34.131532"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9160, "epoch": 0, "train_loss": 4.017436519265175, "train_ppl": 55.558499976868376, "lr": 0.00056, "grad_norm": 0.7507, "tokens_per_sec": 150774, "dt_s": 4.347, "eta_s": 36874, "world_size": 1, "timestamp": "2026-05-04T21:32:38.478163"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9170, "epoch": 0, "train_loss": 4.02754120528698, "train_ppl": 56.1227471441945, "lr": 0.00056, "grad_norm": 0.8574, "tokens_per_sec": 147976, "dt_s": 4.429, "eta_s": 36775, "world_size": 1, "timestamp": "2026-05-04T21:32:42.906989"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9180, "epoch": 0, "train_loss": 4.001625895500183, "train_ppl": 54.68699312482128, "lr": 0.00056, "grad_norm": 0.713, "tokens_per_sec": 148074, "dt_s": 4.426, "eta_s": 36808, "world_size": 1, "timestamp": "2026-05-04T21:32:47.332897"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9190, "epoch": 0, "train_loss": 4.075676515698433, "train_ppl": 58.89030733251559, "lr": 0.00056, "grad_norm": 0.6806, "tokens_per_sec": 150697, "dt_s": 4.349, "eta_s": 36750, "world_size": 1, "timestamp": "2026-05-04T21:32:51.681766"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9200, "epoch": 0, "train_loss": 4.030822798609734, "train_ppl": 56.30722169615819, "lr": 0.00056, "grad_norm": 0.6716, "tokens_per_sec": 146417, "dt_s": 4.476, "eta_s": 36771, "world_size": 1, "timestamp": "2026-05-04T21:32:56.157736"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9210, "epoch": 0, "train_loss": 4.015696331858635, "train_ppl": 55.46190184868242, "lr": 0.00056, "grad_norm": 0.6697, "tokens_per_sec": 148119, "dt_s": 4.425, "eta_s": 36897, "world_size": 1, "timestamp": "2026-05-04T21:33:00.582324"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9220, "epoch": 0, "train_loss": 4.138973847031593, "train_ppl": 62.73840920160892, "lr": 0.00056, "grad_norm": 0.6981, "tokens_per_sec": 149615, "dt_s": 4.38, "eta_s": 36811, "world_size": 1, "timestamp": "2026-05-04T21:33:04.962639"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9230, "epoch": 0, "train_loss": 4.00191992521286, "train_ppl": 54.70307508986886, "lr": 0.00056, "grad_norm": 0.6834, "tokens_per_sec": 147068, "dt_s": 4.456, "eta_s": 36857, "world_size": 1, "timestamp": "2026-05-04T21:33:09.418778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9240, "epoch": 0, "train_loss": 3.9622559398412704, "train_ppl": 52.57580010571365, "lr": 0.00056, "grad_norm": 0.6385, "tokens_per_sec": 150074, "dt_s": 4.367, "eta_s": 36883, "world_size": 1, "timestamp": "2026-05-04T21:33:13.785668"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9250, "epoch": 0, "train_loss": 4.117833286523819, "train_ppl": 61.426005419789085, "lr": 0.00056, "grad_norm": 0.8302, "tokens_per_sec": 150294, "dt_s": 4.361, "eta_s": 36686, "world_size": 1, "timestamp": "2026-05-04T21:33:18.146214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9260, "epoch": 0, "train_loss": 4.053620979189873, "train_ppl": 57.60566878064236, "lr": 0.00056, "grad_norm": 0.7045, "tokens_per_sec": 147428, "dt_s": 4.445, "eta_s": 36716, "world_size": 1, "timestamp": "2026-05-04T21:33:22.591482"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9270, "epoch": 0, "train_loss": 3.962783560156822, "train_ppl": 52.603547485352706, "lr": 0.00056, "grad_norm": 0.6423, "tokens_per_sec": 147408, "dt_s": 4.446, "eta_s": 36821, "world_size": 1, "timestamp": "2026-05-04T21:33:27.037380"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9280, "epoch": 0, "train_loss": 4.1462216675281525, "train_ppl": 63.19477777069037, "lr": 0.00056, "grad_norm": 0.6797, "tokens_per_sec": 147511, "dt_s": 4.443, "eta_s": 36794, "world_size": 1, "timestamp": "2026-05-04T21:33:31.480146"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9290, "epoch": 0, "train_loss": 4.02026829123497, "train_ppl": 55.71605195013438, "lr": 0.00056, "grad_norm": 0.6373, "tokens_per_sec": 146742, "dt_s": 4.466, "eta_s": 36955, "world_size": 1, "timestamp": "2026-05-04T21:33:35.946230"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9300, "epoch": 0, "train_loss": 4.05271415412426, "train_ppl": 57.55345419459653, "lr": 0.00056, "grad_norm": 0.6888, "tokens_per_sec": 149453, "dt_s": 4.385, "eta_s": 36992, "world_size": 1, "timestamp": "2026-05-04T21:33:40.331291"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9310, "epoch": 0, "train_loss": 4.107957437634468, "train_ppl": 60.82235714820667, "lr": 0.00056, "grad_norm": 0.728, "tokens_per_sec": 147240, "dt_s": 4.451, "eta_s": 36997, "world_size": 1, "timestamp": "2026-05-04T21:33:44.782253"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9320, "epoch": 0, "train_loss": 4.051841840147972, "train_ppl": 57.50327140287638, "lr": 0.00056, "grad_norm": 0.7335, "tokens_per_sec": 147334, "dt_s": 4.448, "eta_s": 36996, "world_size": 1, "timestamp": "2026-05-04T21:33:49.230366"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9330, "epoch": 0, "train_loss": 3.8992394655942917, "train_ppl": 49.364891127136886, "lr": 0.00056, "grad_norm": 0.6882, "tokens_per_sec": 148347, "dt_s": 4.418, "eta_s": 36950, "world_size": 1, "timestamp": "2026-05-04T21:33:53.648087"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9340, "epoch": 0, "train_loss": 3.986390620470047, "train_ppl": 53.860136432216756, "lr": 0.00056, "grad_norm": 0.6831, "tokens_per_sec": 147247, "dt_s": 4.451, "eta_s": 36920, "world_size": 1, "timestamp": "2026-05-04T21:33:58.098847"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9350, "epoch": 0, "train_loss": 3.9272275418043137, "train_ppl": 50.76603566992483, "lr": 0.00056, "grad_norm": 0.6628, "tokens_per_sec": 147971, "dt_s": 4.429, "eta_s": 36989, "world_size": 1, "timestamp": "2026-05-04T21:34:02.527811"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9360, "epoch": 0, "train_loss": 4.009595811367035, "train_ppl": 55.12458532926103, "lr": 0.00056, "grad_norm": 0.6652, "tokens_per_sec": 148959, "dt_s": 4.4, "eta_s": 36899, "world_size": 1, "timestamp": "2026-05-04T21:34:06.927417"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9370, "epoch": 0, "train_loss": 4.0059897899627686, "train_ppl": 54.92616286749819, "lr": 0.00056, "grad_norm": 0.7393, "tokens_per_sec": 147522, "dt_s": 4.442, "eta_s": 36885, "world_size": 1, "timestamp": "2026-05-04T21:34:11.369864"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9380, "epoch": 0, "train_loss": 4.11470340192318, "train_ppl": 61.234049667725785, "lr": 0.00056, "grad_norm": 0.7057, "tokens_per_sec": 148137, "dt_s": 4.424, "eta_s": 36891, "world_size": 1, "timestamp": "2026-05-04T21:34:15.793896"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9390, "epoch": 0, "train_loss": 4.0874073058366776, "train_ppl": 59.58520504985033, "lr": 0.00056, "grad_norm": 0.7157, "tokens_per_sec": 132201, "dt_s": 4.957, "eta_s": 37730, "world_size": 1, "timestamp": "2026-05-04T21:34:20.751190"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9400, "epoch": 0, "train_loss": 4.110088303685188, "train_ppl": 60.95209962700799, "lr": 0.00056, "grad_norm": 0.7085, "tokens_per_sec": 148627, "dt_s": 4.409, "eta_s": 37693, "world_size": 1, "timestamp": "2026-05-04T21:34:25.160631"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9410, "epoch": 0, "train_loss": 3.993604391813278, "train_ppl": 54.250075917107594, "lr": 0.00056, "grad_norm": 0.6233, "tokens_per_sec": 149003, "dt_s": 4.398, "eta_s": 37686, "world_size": 1, "timestamp": "2026-05-04T21:34:29.558918"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9420, "epoch": 0, "train_loss": 4.029305279254913, "train_ppl": 56.22183919859978, "lr": 0.00056, "grad_norm": 0.7425, "tokens_per_sec": 147371, "dt_s": 4.447, "eta_s": 37689, "world_size": 1, "timestamp": "2026-05-04T21:34:34.005958"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9430, "epoch": 0, "train_loss": 4.066142231225967, "train_ppl": 58.331498543805424, "lr": 0.00056, "grad_norm": 0.6564, "tokens_per_sec": 147074, "dt_s": 4.456, "eta_s": 37738, "world_size": 1, "timestamp": "2026-05-04T21:34:38.461933"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9440, "epoch": 0, "train_loss": 4.062847688794136, "train_ppl": 58.13963916471017, "lr": 0.00056, "grad_norm": 0.7438, "tokens_per_sec": 148226, "dt_s": 4.421, "eta_s": 36841, "world_size": 1, "timestamp": "2026-05-04T21:34:42.883291"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9450, "epoch": 0, "train_loss": 4.0790791511535645, "train_ppl": 59.091030881076776, "lr": 0.00056, "grad_norm": 0.694, "tokens_per_sec": 148020, "dt_s": 4.428, "eta_s": 36867, "world_size": 1, "timestamp": "2026-05-04T21:34:47.310793"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9460, "epoch": 0, "train_loss": 4.018441304564476, "train_ppl": 55.6143523960422, "lr": 0.00056, "grad_norm": 0.6627, "tokens_per_sec": 149007, "dt_s": 4.398, "eta_s": 36862, "world_size": 1, "timestamp": "2026-05-04T21:34:51.708972"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9470, "epoch": 0, "train_loss": 4.004912570118904, "train_ppl": 54.867027171700926, "lr": 0.00056, "grad_norm": 0.7267, "tokens_per_sec": 149336, "dt_s": 4.388, "eta_s": 36761, "world_size": 1, "timestamp": "2026-05-04T21:34:56.097462"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9480, "epoch": 0, "train_loss": 4.084562376141548, "train_ppl": 59.41593023226179, "lr": 0.00056, "grad_norm": 0.808, "tokens_per_sec": 146599, "dt_s": 4.47, "eta_s": 36780, "world_size": 1, "timestamp": "2026-05-04T21:35:00.567917"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9490, "epoch": 0, "train_loss": 4.050844743847847, "train_ppl": 57.445963679123096, "lr": 0.00056, "grad_norm": 0.7163, "tokens_per_sec": 146594, "dt_s": 4.471, "eta_s": 36858, "world_size": 1, "timestamp": "2026-05-04T21:35:05.038524"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9500, "epoch": 0, "train_loss": 4.033564329147339, "train_ppl": 56.461801459678696, "lr": 0.00056, "grad_norm": 0.7333, "tokens_per_sec": 146537, "dt_s": 4.472, "eta_s": 36928, "world_size": 1, "timestamp": "2026-05-04T21:35:09.510796"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9510, "epoch": 0, "train_loss": 3.985409289598465, "train_ppl": 53.80730774303955, "lr": 0.00056, "grad_norm": 0.7859, "tokens_per_sec": 126070, "dt_s": 5.198, "eta_s": 36928, "world_size": 1, "timestamp": "2026-05-04T21:35:14.709209"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9520, "epoch": 0, "train_loss": 3.9583882838487625, "train_ppl": 52.37284772536189, "lr": 0.00056, "grad_norm": 0.6306, "tokens_per_sec": 146624, "dt_s": 4.47, "eta_s": 37059, "world_size": 1, "timestamp": "2026-05-04T21:35:19.178853"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9530, "epoch": 0, "train_loss": 4.0276587307453156, "train_ppl": 56.12934338338111, "lr": 0.00056, "grad_norm": 0.6358, "tokens_per_sec": 144459, "dt_s": 4.537, "eta_s": 37164, "world_size": 1, "timestamp": "2026-05-04T21:35:23.715493"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9540, "epoch": 0, "train_loss": 4.088327333331108, "train_ppl": 59.640050302458995, "lr": 0.00056, "grad_norm": 0.6886, "tokens_per_sec": 148214, "dt_s": 4.422, "eta_s": 37078, "world_size": 1, "timestamp": "2026-05-04T21:35:28.137246"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9550, "epoch": 0, "train_loss": 3.927583694458008, "train_ppl": 50.78411934833009, "lr": 0.00056, "grad_norm": 0.6645, "tokens_per_sec": 149703, "dt_s": 4.378, "eta_s": 36917, "world_size": 1, "timestamp": "2026-05-04T21:35:32.514931"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9560, "epoch": 0, "train_loss": 4.006900444626808, "train_ppl": 54.97620441571767, "lr": 0.00056, "grad_norm": 0.6862, "tokens_per_sec": 146029, "dt_s": 4.488, "eta_s": 37056, "world_size": 1, "timestamp": "2026-05-04T21:35:37.002807"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9570, "epoch": 0, "train_loss": 4.044712141156197, "train_ppl": 57.09474844013179, "lr": 0.00056, "grad_norm": 0.6695, "tokens_per_sec": 148198, "dt_s": 4.422, "eta_s": 36973, "world_size": 1, "timestamp": "2026-05-04T21:35:41.424993"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9580, "epoch": 0, "train_loss": 4.055739492177963, "train_ppl": 57.72783649939954, "lr": 0.00056, "grad_norm": 0.6913, "tokens_per_sec": 148217, "dt_s": 4.422, "eta_s": 36778, "world_size": 1, "timestamp": "2026-05-04T21:35:45.846620"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9590, "epoch": 0, "train_loss": 4.09851910173893, "train_ppl": 60.25099590266276, "lr": 0.00056, "grad_norm": 0.7541, "tokens_per_sec": 146939, "dt_s": 4.46, "eta_s": 36837, "world_size": 1, "timestamp": "2026-05-04T21:35:50.306733"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9600, "epoch": 0, "train_loss": 3.999233439564705, "train_ppl": 54.55631328873368, "lr": 0.00056, "grad_norm": 0.7078, "tokens_per_sec": 147993, "dt_s": 4.428, "eta_s": 36917, "world_size": 1, "timestamp": "2026-05-04T21:35:54.735041"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9610, "epoch": 0, "train_loss": 3.964860737323761, "train_ppl": 52.71292793504724, "lr": 0.00056, "grad_norm": 0.7125, "tokens_per_sec": 146878, "dt_s": 4.462, "eta_s": 36869, "world_size": 1, "timestamp": "2026-05-04T21:35:59.196977"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9620, "epoch": 0, "train_loss": 4.093203157186508, "train_ppl": 59.93155476716206, "lr": 0.00056, "grad_norm": 0.7933, "tokens_per_sec": 148272, "dt_s": 4.42, "eta_s": 36861, "world_size": 1, "timestamp": "2026-05-04T21:36:03.616964"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9630, "epoch": 0, "train_loss": 3.8798573315143585, "train_ppl": 48.41730695470956, "lr": 0.00056, "grad_norm": 0.6924, "tokens_per_sec": 148214, "dt_s": 4.422, "eta_s": 36857, "world_size": 1, "timestamp": "2026-05-04T21:36:08.038700"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9640, "epoch": 0, "train_loss": 3.884324759244919, "train_ppl": 48.63409164886786, "lr": 0.00056, "grad_norm": 0.8403, "tokens_per_sec": 147634, "dt_s": 4.439, "eta_s": 36818, "world_size": 1, "timestamp": "2026-05-04T21:36:12.477808"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9650, "epoch": 0, "train_loss": 4.107232764363289, "train_ppl": 60.778296778305645, "lr": 0.00056, "grad_norm": 0.7181, "tokens_per_sec": 149540, "dt_s": 4.382, "eta_s": 36737, "world_size": 1, "timestamp": "2026-05-04T21:36:16.860290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9660, "epoch": 0, "train_loss": 3.935351684689522, "train_ppl": 51.18014606577268, "lr": 0.00056, "grad_norm": 0.831, "tokens_per_sec": 148408, "dt_s": 4.416, "eta_s": 36656, "world_size": 1, "timestamp": "2026-05-04T21:36:21.276240"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9670, "epoch": 0, "train_loss": 3.8775624334812164, "train_ppl": 48.3063215709878, "lr": 0.00056, "grad_norm": 0.7333, "tokens_per_sec": 147697, "dt_s": 4.437, "eta_s": 36680, "world_size": 1, "timestamp": "2026-05-04T21:36:25.713414"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9680, "epoch": 0, "train_loss": 4.0610562562942505, "train_ppl": 58.03557916164398, "lr": 0.00056, "grad_norm": 0.6746, "tokens_per_sec": 134066, "dt_s": 4.888, "eta_s": 37450, "world_size": 1, "timestamp": "2026-05-04T21:36:30.601771"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9690, "epoch": 0, "train_loss": 4.1590739488601685, "train_ppl": 64.01221655785442, "lr": 0.00056, "grad_norm": 0.6727, "tokens_per_sec": 148814, "dt_s": 4.404, "eta_s": 37388, "world_size": 1, "timestamp": "2026-05-04T21:36:35.005645"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9700, "epoch": 0, "train_loss": 4.069063022732735, "train_ppl": 58.50212174542907, "lr": 0.00056, "grad_norm": 0.6724, "tokens_per_sec": 149676, "dt_s": 4.379, "eta_s": 37377, "world_size": 1, "timestamp": "2026-05-04T21:36:39.384165"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9710, "epoch": 0, "train_loss": 3.9518182575702667, "train_ppl": 52.02988461153483, "lr": 0.00056, "grad_norm": 0.7091, "tokens_per_sec": 150142, "dt_s": 4.365, "eta_s": 37287, "world_size": 1, "timestamp": "2026-05-04T21:36:43.749133"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9720, "epoch": 0, "train_loss": 4.069691494107246, "train_ppl": 58.53890021018987, "lr": 0.00056, "grad_norm": 0.6941, "tokens_per_sec": 147473, "dt_s": 4.444, "eta_s": 37294, "world_size": 1, "timestamp": "2026-05-04T21:36:48.193048"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9730, "epoch": 0, "train_loss": 3.9426473677158356, "train_ppl": 51.5549055897828, "lr": 0.00056, "grad_norm": 0.6833, "tokens_per_sec": 148511, "dt_s": 4.413, "eta_s": 36501, "world_size": 1, "timestamp": "2026-05-04T21:36:52.605927"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9740, "epoch": 0, "train_loss": 3.9809675365686417, "train_ppl": 53.56883897260285, "lr": 0.00056, "grad_norm": 0.7315, "tokens_per_sec": 148363, "dt_s": 4.417, "eta_s": 36519, "world_size": 1, "timestamp": "2026-05-04T21:36:57.023206"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9750, "epoch": 0, "train_loss": 4.024460479617119, "train_ppl": 55.95011440998835, "lr": 0.00056, "grad_norm": 0.6972, "tokens_per_sec": 147484, "dt_s": 4.444, "eta_s": 36622, "world_size": 1, "timestamp": "2026-05-04T21:37:01.466798"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9760, "epoch": 0, "train_loss": 3.927288994193077, "train_ppl": 50.76915545994309, "lr": 0.00056, "grad_norm": 0.739, "tokens_per_sec": 150000, "dt_s": 4.369, "eta_s": 36624, "world_size": 1, "timestamp": "2026-05-04T21:37:05.835879"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9770, "epoch": 0, "train_loss": 3.983719766139984, "train_ppl": 53.71647578733653, "lr": 0.00056, "grad_norm": 0.7484, "tokens_per_sec": 148509, "dt_s": 4.413, "eta_s": 36569, "world_size": 1, "timestamp": "2026-05-04T21:37:10.248769"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9780, "epoch": 0, "train_loss": 4.005570814013481, "train_ppl": 54.90315494648941, "lr": 0.00056, "grad_norm": 0.765, "tokens_per_sec": 148679, "dt_s": 4.408, "eta_s": 36556, "world_size": 1, "timestamp": "2026-05-04T21:37:14.656673"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9790, "epoch": 0, "train_loss": 4.019558444619179, "train_ppl": 55.676516133055415, "lr": 0.00056, "grad_norm": 0.685, "tokens_per_sec": 148390, "dt_s": 4.416, "eta_s": 36550, "world_size": 1, "timestamp": "2026-05-04T21:37:19.073143"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9800, "epoch": 0, "train_loss": 4.057546973228455, "train_ppl": 57.83227282486501, "lr": 0.00056, "grad_norm": 0.6964, "tokens_per_sec": 148047, "dt_s": 4.427, "eta_s": 36518, "world_size": 1, "timestamp": "2026-05-04T21:37:23.499876"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9810, "epoch": 0, "train_loss": 3.96940641105175, "train_ppl": 52.953089140387334, "lr": 0.00056, "grad_norm": 0.6736, "tokens_per_sec": 147957, "dt_s": 4.429, "eta_s": 36613, "world_size": 1, "timestamp": "2026-05-04T21:37:27.929268"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9820, "epoch": 0, "train_loss": 3.956906884908676, "train_ppl": 52.29532008311391, "lr": 0.00056, "grad_norm": 0.7683, "tokens_per_sec": 150524, "dt_s": 4.354, "eta_s": 36511, "world_size": 1, "timestamp": "2026-05-04T21:37:32.283112"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9830, "epoch": 0, "train_loss": 3.997519612312317, "train_ppl": 54.46289326798451, "lr": 0.00056, "grad_norm": 0.6351, "tokens_per_sec": 148011, "dt_s": 4.428, "eta_s": 36540, "world_size": 1, "timestamp": "2026-05-04T21:37:36.710882"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9840, "epoch": 0, "train_loss": 3.9434574246406555, "train_ppl": 51.59668491759654, "lr": 0.00056, "grad_norm": 0.6754, "tokens_per_sec": 147626, "dt_s": 4.439, "eta_s": 36573, "world_size": 1, "timestamp": "2026-05-04T21:37:41.150211"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9850, "epoch": 0, "train_loss": 4.001221224665642, "train_ppl": 54.664867370800515, "lr": 0.00056, "grad_norm": 0.7272, "tokens_per_sec": 149422, "dt_s": 4.386, "eta_s": 36501, "world_size": 1, "timestamp": "2026-05-04T21:37:45.536217"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9860, "epoch": 0, "train_loss": 4.020356237888336, "train_ppl": 55.72095220591955, "lr": 0.00056, "grad_norm": 0.7048, "tokens_per_sec": 143718, "dt_s": 4.56, "eta_s": 36713, "world_size": 1, "timestamp": "2026-05-04T21:37:50.096244"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9870, "epoch": 0, "train_loss": 3.9618344753980637, "train_ppl": 52.55364594431953, "lr": 0.00056, "grad_norm": 0.6565, "tokens_per_sec": 148633, "dt_s": 4.409, "eta_s": 36801, "world_size": 1, "timestamp": "2026-05-04T21:37:54.505473"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9880, "epoch": 0, "train_loss": 4.005932703614235, "train_ppl": 54.923027422917514, "lr": 0.00056, "grad_norm": 0.7464, "tokens_per_sec": 147862, "dt_s": 4.432, "eta_s": 36804, "world_size": 1, "timestamp": "2026-05-04T21:37:58.937733"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9890, "epoch": 0, "train_loss": 3.9730150997638702, "train_ppl": 53.14452556492179, "lr": 0.00056, "grad_norm": 0.7271, "tokens_per_sec": 146592, "dt_s": 4.471, "eta_s": 36851, "world_size": 1, "timestamp": "2026-05-04T21:38:03.408403"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9900, "epoch": 0, "train_loss": 4.043958321213722, "train_ppl": 57.051725497960255, "lr": 0.00056, "grad_norm": 0.6939, "tokens_per_sec": 147513, "dt_s": 4.443, "eta_s": 36940, "world_size": 1, "timestamp": "2026-05-04T21:38:07.851080"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9910, "epoch": 0, "train_loss": 3.9731370508670807, "train_ppl": 53.151006993644685, "lr": 0.00056, "grad_norm": 0.6965, "tokens_per_sec": 147501, "dt_s": 4.443, "eta_s": 36742, "world_size": 1, "timestamp": "2026-05-04T21:38:12.294211"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9920, "epoch": 0, "train_loss": 4.0547749400138855, "train_ppl": 57.67218183500491, "lr": 0.00056, "grad_norm": 0.7043, "tokens_per_sec": 146437, "dt_s": 4.475, "eta_s": 36847, "world_size": 1, "timestamp": "2026-05-04T21:38:16.769572"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9930, "epoch": 0, "train_loss": 4.0087375938892365, "train_ppl": 55.077296741523796, "lr": 0.00056, "grad_norm": 0.697, "tokens_per_sec": 149507, "dt_s": 4.383, "eta_s": 36762, "world_size": 1, "timestamp": "2026-05-04T21:38:21.153029"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9940, "epoch": 0, "train_loss": 3.944738507270813, "train_ppl": 51.662826892039604, "lr": 0.00056, "grad_norm": 0.6581, "tokens_per_sec": 147145, "dt_s": 4.454, "eta_s": 36730, "world_size": 1, "timestamp": "2026-05-04T21:38:25.606885"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9950, "epoch": 0, "train_loss": 4.086423724889755, "train_ppl": 59.526626990299, "lr": 0.00056, "grad_norm": 0.7161, "tokens_per_sec": 145955, "dt_s": 4.49, "eta_s": 36804, "world_size": 1, "timestamp": "2026-05-04T21:38:30.097044"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9960, "epoch": 0, "train_loss": 4.003547817468643, "train_ppl": 54.79219832400488, "lr": 0.00056, "grad_norm": 0.6827, "tokens_per_sec": 148364, "dt_s": 4.417, "eta_s": 36757, "world_size": 1, "timestamp": "2026-05-04T21:38:34.514321"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9970, "epoch": 0, "train_loss": 3.941699430346489, "train_ppl": 51.506057924122764, "lr": 0.00056, "grad_norm": 0.6553, "tokens_per_sec": 142924, "dt_s": 4.585, "eta_s": 36934, "world_size": 1, "timestamp": "2026-05-04T21:38:39.099654"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9980, "epoch": 0, "train_loss": 3.945828855037689, "train_ppl": 51.719188061025434, "lr": 0.00056, "grad_norm": 0.7163, "tokens_per_sec": 133873, "dt_s": 4.895, "eta_s": 37777, "world_size": 1, "timestamp": "2026-05-04T21:38:43.995048"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 9990, "epoch": 0, "train_loss": 4.027058392763138, "train_ppl": 56.09565691927705, "lr": 0.00056, "grad_norm": 0.6844, "tokens_per_sec": 148654, "dt_s": 4.409, "eta_s": 37697, "world_size": 1, "timestamp": "2026-05-04T21:38:48.403679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10000, "epoch": 0, "train_loss": 4.049401059746742, "train_ppl": 57.363089691010416, "lr": 0.00056, "grad_norm": 0.7323, "tokens_per_sec": 146242, "dt_s": 4.481, "eta_s": 37678, "world_size": 1, "timestamp": "2026-05-04T21:38:52.885013"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10010, "epoch": 0, "train_loss": 3.9670481234788895, "train_ppl": 52.82835766247817, "lr": 0.00056, "grad_norm": 0.6846, "tokens_per_sec": 126099, "dt_s": 5.197, "eta_s": 37705, "world_size": 1, "timestamp": "2026-05-04T21:38:58.082214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10020, "epoch": 0, "train_loss": 3.9575652182102203, "train_ppl": 52.32975916879419, "lr": 0.00056, "grad_norm": 0.7194, "tokens_per_sec": 145388, "dt_s": 4.508, "eta_s": 37572, "world_size": 1, "timestamp": "2026-05-04T21:39:02.589845"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10030, "epoch": 0, "train_loss": 4.01462422311306, "train_ppl": 55.40247252170725, "lr": 0.00056, "grad_norm": 0.746, "tokens_per_sec": 147054, "dt_s": 4.457, "eta_s": 36842, "world_size": 1, "timestamp": "2026-05-04T21:39:07.046440"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10040, "epoch": 0, "train_loss": 4.0223250687122345, "train_ppl": 55.830765400488346, "lr": 0.00056, "grad_norm": 0.6787, "tokens_per_sec": 148492, "dt_s": 4.413, "eta_s": 36846, "world_size": 1, "timestamp": "2026-05-04T21:39:11.459896"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10050, "epoch": 0, "train_loss": 3.9968535900115967, "train_ppl": 54.42663184330546, "lr": 0.00056, "grad_norm": 0.7418, "tokens_per_sec": 148134, "dt_s": 4.424, "eta_s": 36746, "world_size": 1, "timestamp": "2026-05-04T21:39:15.883983"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10060, "epoch": 0, "train_loss": 4.010733902454376, "train_ppl": 55.187357842160864, "lr": 0.00056, "grad_norm": 0.6909, "tokens_per_sec": 148916, "dt_s": 4.401, "eta_s": 36683, "world_size": 1, "timestamp": "2026-05-04T21:39:20.284840"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10070, "epoch": 0, "train_loss": 4.080427646636963, "train_ppl": 59.17076862023663, "lr": 0.00056, "grad_norm": 0.7314, "tokens_per_sec": 149536, "dt_s": 4.383, "eta_s": 36472, "world_size": 1, "timestamp": "2026-05-04T21:39:24.667459"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10080, "epoch": 0, "train_loss": 4.05645614862442, "train_ppl": 57.76922235351511, "lr": 0.00056, "grad_norm": 0.7092, "tokens_per_sec": 146747, "dt_s": 4.466, "eta_s": 36483, "world_size": 1, "timestamp": "2026-05-04T21:39:29.133394"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10090, "epoch": 0, "train_loss": 3.985407203435898, "train_ppl": 53.80719549236538, "lr": 0.00056, "grad_norm": 0.8391, "tokens_per_sec": 148262, "dt_s": 4.42, "eta_s": 36490, "world_size": 1, "timestamp": "2026-05-04T21:39:33.553663"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10100, "epoch": 0, "train_loss": 4.119279131293297, "train_ppl": 61.51488212391128, "lr": 0.00056, "grad_norm": 0.7919, "tokens_per_sec": 149225, "dt_s": 4.392, "eta_s": 36433, "world_size": 1, "timestamp": "2026-05-04T21:39:37.945396"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10110, "epoch": 0, "train_loss": 4.076767593622208, "train_ppl": 58.95459631253608, "lr": 0.00056, "grad_norm": 0.6909, "tokens_per_sec": 145769, "dt_s": 4.496, "eta_s": 36585, "world_size": 1, "timestamp": "2026-05-04T21:39:42.441310"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10120, "epoch": 0, "train_loss": 4.012940034270287, "train_ppl": 55.30924282586801, "lr": 0.00056, "grad_norm": 0.7323, "tokens_per_sec": 150952, "dt_s": 4.342, "eta_s": 36513, "world_size": 1, "timestamp": "2026-05-04T21:39:46.782799"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10130, "epoch": 0, "train_loss": 4.024938121438026, "train_ppl": 55.97684490780876, "lr": 0.00056, "grad_norm": 0.6889, "tokens_per_sec": 148065, "dt_s": 4.426, "eta_s": 36443, "world_size": 1, "timestamp": "2026-05-04T21:39:51.208979"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10140, "epoch": 0, "train_loss": 4.042018756270409, "train_ppl": 56.941177213673306, "lr": 0.00056, "grad_norm": 0.7721, "tokens_per_sec": 145158, "dt_s": 4.515, "eta_s": 36594, "world_size": 1, "timestamp": "2026-05-04T21:39:55.723797"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10150, "epoch": 0, "train_loss": 4.064731925725937, "train_ppl": 58.24929129287872, "lr": 0.00056, "grad_norm": 0.7025, "tokens_per_sec": 147139, "dt_s": 4.454, "eta_s": 36693, "world_size": 1, "timestamp": "2026-05-04T21:40:00.177851"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10160, "epoch": 0, "train_loss": 4.026163771748543, "train_ppl": 56.045495007065526, "lr": 0.00056, "grad_norm": 0.8048, "tokens_per_sec": 145090, "dt_s": 4.517, "eta_s": 36723, "world_size": 1, "timestamp": "2026-05-04T21:40:04.694745"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10170, "epoch": 0, "train_loss": 3.9690567404031754, "train_ppl": 52.934576236261805, "lr": 0.00056, "grad_norm": 0.7406, "tokens_per_sec": 145444, "dt_s": 4.506, "eta_s": 36990, "world_size": 1, "timestamp": "2026-05-04T21:40:09.200657"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10180, "epoch": 0, "train_loss": 3.9427245408296585, "train_ppl": 51.55888439590646, "lr": 0.00056, "grad_norm": 0.6922, "tokens_per_sec": 148143, "dt_s": 4.424, "eta_s": 36981, "world_size": 1, "timestamp": "2026-05-04T21:40:13.624494"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10190, "epoch": 0, "train_loss": 4.040069505572319, "train_ppl": 56.830292690216424, "lr": 0.00056, "grad_norm": 0.7339, "tokens_per_sec": 145301, "dt_s": 4.51, "eta_s": 36970, "world_size": 1, "timestamp": "2026-05-04T21:40:18.134852"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10200, "epoch": 0, "train_loss": 3.98714716732502, "train_ppl": 53.900899566709285, "lr": 0.00056, "grad_norm": 0.7396, "tokens_per_sec": 147351, "dt_s": 4.448, "eta_s": 36955, "world_size": 1, "timestamp": "2026-05-04T21:40:22.582448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10210, "epoch": 0, "train_loss": 4.081333085894585, "train_ppl": 59.22436841906823, "lr": 0.00056, "grad_norm": 0.756, "tokens_per_sec": 149106, "dt_s": 4.395, "eta_s": 36749, "world_size": 1, "timestamp": "2026-05-04T21:40:26.977711"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10220, "epoch": 0, "train_loss": 4.013905853033066, "train_ppl": 55.36268733504477, "lr": 0.00056, "grad_norm": 0.8789, "tokens_per_sec": 146308, "dt_s": 4.479, "eta_s": 36701, "world_size": 1, "timestamp": "2026-05-04T21:40:31.457027"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10230, "epoch": 0, "train_loss": 4.061732247471809, "train_ppl": 58.074823964217465, "lr": 0.00056, "grad_norm": 0.7069, "tokens_per_sec": 147600, "dt_s": 4.44, "eta_s": 36723, "world_size": 1, "timestamp": "2026-05-04T21:40:35.897105"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10240, "epoch": 0, "train_loss": 3.965735450387001, "train_ppl": 52.75905679353002, "lr": 0.00056, "grad_norm": 0.6977, "tokens_per_sec": 147067, "dt_s": 4.456, "eta_s": 36630, "world_size": 1, "timestamp": "2026-05-04T21:40:40.353351"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10250, "epoch": 0, "train_loss": 4.032384812831879, "train_ppl": 56.395243104706196, "lr": 0.00056, "grad_norm": 0.669, "tokens_per_sec": 143955, "dt_s": 4.553, "eta_s": 36798, "world_size": 1, "timestamp": "2026-05-04T21:40:44.905875"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10260, "epoch": 0, "train_loss": 4.094200909137726, "train_ppl": 59.99138143399413, "lr": 0.00056, "grad_norm": 0.7516, "tokens_per_sec": 148590, "dt_s": 4.411, "eta_s": 36819, "world_size": 1, "timestamp": "2026-05-04T21:40:49.316417"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10270, "epoch": 0, "train_loss": 4.022945284843445, "train_ppl": 55.86540328218369, "lr": 0.00056, "grad_norm": 0.6791, "tokens_per_sec": 131256, "dt_s": 4.993, "eta_s": 37661, "world_size": 1, "timestamp": "2026-05-04T21:40:54.309395"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10280, "epoch": 0, "train_loss": 4.126680091023445, "train_ppl": 61.971840167564, "lr": 0.00056, "grad_norm": 0.7148, "tokens_per_sec": 147887, "dt_s": 4.431, "eta_s": 37642, "world_size": 1, "timestamp": "2026-05-04T21:40:58.740873"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10290, "epoch": 0, "train_loss": 4.015561252832413, "train_ppl": 55.454410614953936, "lr": 0.00056, "grad_norm": 0.7437, "tokens_per_sec": 149314, "dt_s": 4.389, "eta_s": 37527, "world_size": 1, "timestamp": "2026-05-04T21:41:03.130031"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10300, "epoch": 0, "train_loss": 3.9589962363243103, "train_ppl": 52.404697608414644, "lr": 0.00056, "grad_norm": 0.7243, "tokens_per_sec": 146838, "dt_s": 4.463, "eta_s": 37376, "world_size": 1, "timestamp": "2026-05-04T21:41:07.593169"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10310, "epoch": 0, "train_loss": 4.0324388444423676, "train_ppl": 56.3982903128371, "lr": 0.00056, "grad_norm": 0.7173, "tokens_per_sec": 147169, "dt_s": 4.453, "eta_s": 37441, "world_size": 1, "timestamp": "2026-05-04T21:41:12.046270"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10320, "epoch": 0, "train_loss": 4.0425834357738495, "train_ppl": 56.9733398092723, "lr": 0.00056, "grad_norm": 0.7071, "tokens_per_sec": 150168, "dt_s": 4.364, "eta_s": 36401, "world_size": 1, "timestamp": "2026-05-04T21:41:16.410465"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10330, "epoch": 0, "train_loss": 3.9617882519960403, "train_ppl": 52.551216792157526, "lr": 0.00056, "grad_norm": 0.6395, "tokens_per_sec": 146069, "dt_s": 4.487, "eta_s": 36487, "world_size": 1, "timestamp": "2026-05-04T21:41:20.897105"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10340, "epoch": 0, "train_loss": 4.018030166625977, "train_ppl": 55.59149192557248, "lr": 0.00056, "grad_norm": 0.6607, "tokens_per_sec": 148522, "dt_s": 4.413, "eta_s": 36521, "world_size": 1, "timestamp": "2026-05-04T21:41:25.309650"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10350, "epoch": 0, "train_loss": 4.300735235214233, "train_ppl": 73.75400030806729, "lr": 0.00056, "grad_norm": 1.0236, "tokens_per_sec": 147610, "dt_s": 4.44, "eta_s": 36478, "world_size": 1, "timestamp": "2026-05-04T21:41:29.749446"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10360, "epoch": 0, "train_loss": 3.9705667942762375, "train_ppl": 53.01457068088604, "lr": 0.00056, "grad_norm": 0.6743, "tokens_per_sec": 145197, "dt_s": 4.514, "eta_s": 36574, "world_size": 1, "timestamp": "2026-05-04T21:41:34.263048"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10370, "epoch": 0, "train_loss": 4.018092185258865, "train_ppl": 55.594939740815235, "lr": 0.00056, "grad_norm": 0.8551, "tokens_per_sec": 148787, "dt_s": 4.405, "eta_s": 36636, "world_size": 1, "timestamp": "2026-05-04T21:41:38.667727"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10380, "epoch": 0, "train_loss": 3.943866118788719, "train_ppl": 51.617776490489476, "lr": 0.00056, "grad_norm": 0.7205, "tokens_per_sec": 147439, "dt_s": 4.445, "eta_s": 36563, "world_size": 1, "timestamp": "2026-05-04T21:41:43.112713"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10390, "epoch": 0, "train_loss": 4.101643577218056, "train_ppl": 60.439543064056224, "lr": 0.00056, "grad_norm": 0.7003, "tokens_per_sec": 146327, "dt_s": 4.479, "eta_s": 36667, "world_size": 1, "timestamp": "2026-05-04T21:41:47.591432"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10400, "epoch": 0, "train_loss": 4.028479993343353, "train_ppl": 56.175459247779614, "lr": 0.00056, "grad_norm": 0.7879, "tokens_per_sec": 148499, "dt_s": 4.413, "eta_s": 36619, "world_size": 1, "timestamp": "2026-05-04T21:41:52.004707"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10410, "epoch": 0, "train_loss": 3.891576737165451, "train_ppl": 48.98806696695579, "lr": 0.00056, "grad_norm": 0.6687, "tokens_per_sec": 145817, "dt_s": 4.494, "eta_s": 36583, "world_size": 1, "timestamp": "2026-05-04T21:41:56.499090"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10420, "epoch": 0, "train_loss": 4.02845224738121, "train_ppl": 56.173900627236776, "lr": 0.00056, "grad_norm": 0.6836, "tokens_per_sec": 147010, "dt_s": 4.458, "eta_s": 36666, "world_size": 1, "timestamp": "2026-05-04T21:42:00.957016"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10430, "epoch": 0, "train_loss": 3.996450424194336, "train_ppl": 54.404693308528415, "lr": 0.00056, "grad_norm": 0.7312, "tokens_per_sec": 149526, "dt_s": 4.383, "eta_s": 36560, "world_size": 1, "timestamp": "2026-05-04T21:42:05.339927"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10440, "epoch": 0, "train_loss": 4.026523798704147, "train_ppl": 56.06567652873618, "lr": 0.00056, "grad_norm": 0.8587, "tokens_per_sec": 145434, "dt_s": 4.506, "eta_s": 36600, "world_size": 1, "timestamp": "2026-05-04T21:42:09.846155"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10450, "epoch": 0, "train_loss": 4.003424674272537, "train_ppl": 54.78545145300575, "lr": 0.00056, "grad_norm": 0.7032, "tokens_per_sec": 147441, "dt_s": 4.445, "eta_s": 36648, "world_size": 1, "timestamp": "2026-05-04T21:42:14.291069"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10460, "epoch": 0, "train_loss": 4.078578278422356, "train_ppl": 59.06144120599872, "lr": 0.00056, "grad_norm": 0.7602, "tokens_per_sec": 149179, "dt_s": 4.393, "eta_s": 36477, "world_size": 1, "timestamp": "2026-05-04T21:42:18.684187"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10470, "epoch": 0, "train_loss": 4.060378655791283, "train_ppl": 57.996267544304, "lr": 0.00056, "grad_norm": 0.7341, "tokens_per_sec": 146821, "dt_s": 4.464, "eta_s": 36482, "world_size": 1, "timestamp": "2026-05-04T21:42:23.147859"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10480, "epoch": 0, "train_loss": 3.904686212539673, "train_ppl": 49.63450278377805, "lr": 0.00056, "grad_norm": 0.6248, "tokens_per_sec": 148501, "dt_s": 4.413, "eta_s": 36527, "world_size": 1, "timestamp": "2026-05-04T21:42:27.561034"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10490, "epoch": 0, "train_loss": 3.9301862120628357, "train_ppl": 50.916458045178736, "lr": 0.00056, "grad_norm": 0.6766, "tokens_per_sec": 147986, "dt_s": 4.429, "eta_s": 36395, "world_size": 1, "timestamp": "2026-05-04T21:42:31.989532"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10500, "epoch": 0, "train_loss": 4.060525298118591, "train_ppl": 58.00477287555745, "lr": 0.00056, "grad_norm": 0.7404, "tokens_per_sec": 144620, "dt_s": 4.532, "eta_s": 36533, "world_size": 1, "timestamp": "2026-05-04T21:42:36.521154"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10510, "epoch": 0, "train_loss": 4.002382516860962, "train_ppl": 54.72838612941731, "lr": 0.00056, "grad_norm": 0.6905, "tokens_per_sec": 125693, "dt_s": 5.214, "eta_s": 36616, "world_size": 1, "timestamp": "2026-05-04T21:42:41.735127"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10520, "epoch": 0, "train_loss": 3.9679760336875916, "train_ppl": 52.877400384964695, "lr": 0.00056, "grad_norm": 0.6766, "tokens_per_sec": 144180, "dt_s": 4.545, "eta_s": 36746, "world_size": 1, "timestamp": "2026-05-04T21:42:46.280567"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10530, "epoch": 0, "train_loss": 4.063274085521698, "train_ppl": 58.16443500264803, "lr": 0.00056, "grad_norm": 0.6657, "tokens_per_sec": 144310, "dt_s": 4.541, "eta_s": 36952, "world_size": 1, "timestamp": "2026-05-04T21:42:50.821896"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10540, "epoch": 0, "train_loss": 3.9075646847486496, "train_ppl": 49.77758014395989, "lr": 0.00056, "grad_norm": 0.7381, "tokens_per_sec": 148301, "dt_s": 4.419, "eta_s": 36932, "world_size": 1, "timestamp": "2026-05-04T21:42:55.241022"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10550, "epoch": 0, "train_loss": 4.127522960305214, "train_ppl": 62.02409634744663, "lr": 0.00056, "grad_norm": 0.8927, "tokens_per_sec": 145673, "dt_s": 4.499, "eta_s": 36874, "world_size": 1, "timestamp": "2026-05-04T21:42:59.739873"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10560, "epoch": 0, "train_loss": 3.997496485710144, "train_ppl": 54.461633740883066, "lr": 0.00056, "grad_norm": 0.6867, "tokens_per_sec": 148560, "dt_s": 4.411, "eta_s": 36812, "world_size": 1, "timestamp": "2026-05-04T21:43:04.151277"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10570, "epoch": 0, "train_loss": 4.071690574288368, "train_ppl": 58.656041213542736, "lr": 0.00056, "grad_norm": 0.7215, "tokens_per_sec": 133659, "dt_s": 4.903, "eta_s": 37395, "world_size": 1, "timestamp": "2026-05-04T21:43:09.054511"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10580, "epoch": 0, "train_loss": 3.917640060186386, "train_ppl": 50.28164299947055, "lr": 0.00056, "grad_norm": 0.6897, "tokens_per_sec": 145952, "dt_s": 4.49, "eta_s": 37307, "world_size": 1, "timestamp": "2026-05-04T21:43:13.544774"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10590, "epoch": 0, "train_loss": 3.9393438547849655, "train_ppl": 51.38487429743569, "lr": 0.00056, "grad_norm": 0.6799, "tokens_per_sec": 147992, "dt_s": 4.428, "eta_s": 37317, "world_size": 1, "timestamp": "2026-05-04T21:43:17.973113"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10600, "epoch": 0, "train_loss": 3.909147784113884, "train_ppl": 49.85644540879301, "lr": 0.00056, "grad_norm": 0.7248, "tokens_per_sec": 147183, "dt_s": 4.453, "eta_s": 37237, "world_size": 1, "timestamp": "2026-05-04T21:43:22.425799"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10610, "epoch": 0, "train_loss": 3.9954204857349396, "train_ppl": 54.3486886681355, "lr": 0.00056, "grad_norm": 0.6464, "tokens_per_sec": 146923, "dt_s": 4.461, "eta_s": 37313, "world_size": 1, "timestamp": "2026-05-04T21:43:26.886359"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10620, "epoch": 0, "train_loss": 3.9665722399950027, "train_ppl": 52.80322350052635, "lr": 0.00056, "grad_norm": 0.693, "tokens_per_sec": 148486, "dt_s": 4.414, "eta_s": 36505, "world_size": 1, "timestamp": "2026-05-04T21:43:31.299966"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10630, "epoch": 0, "train_loss": 4.0399807542562485, "train_ppl": 56.82524915076119, "lr": 0.00056, "grad_norm": 0.7143, "tokens_per_sec": 146654, "dt_s": 4.469, "eta_s": 36465, "world_size": 1, "timestamp": "2026-05-04T21:43:35.768729"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10640, "epoch": 0, "train_loss": 3.9848330318927765, "train_ppl": 53.77630979958995, "lr": 0.00056, "grad_norm": 0.7065, "tokens_per_sec": 148399, "dt_s": 4.416, "eta_s": 36441, "world_size": 1, "timestamp": "2026-05-04T21:43:40.184941"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10650, "epoch": 0, "train_loss": 3.9936883449554443, "train_ppl": 54.254630572629736, "lr": 0.00056, "grad_norm": 0.7058, "tokens_per_sec": 149175, "dt_s": 4.393, "eta_s": 36339, "world_size": 1, "timestamp": "2026-05-04T21:43:44.578170"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10660, "epoch": 0, "train_loss": 3.89459627866745, "train_ppl": 49.1362120207734, "lr": 0.00056, "grad_norm": 0.6396, "tokens_per_sec": 146214, "dt_s": 4.482, "eta_s": 36370, "world_size": 1, "timestamp": "2026-05-04T21:43:49.060373"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10670, "epoch": 0, "train_loss": 3.9707682728767395, "train_ppl": 53.02525305849215, "lr": 0.00056, "grad_norm": 0.684, "tokens_per_sec": 148582, "dt_s": 4.411, "eta_s": 36361, "world_size": 1, "timestamp": "2026-05-04T21:43:53.471128"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10680, "epoch": 0, "train_loss": 3.986580416560173, "train_ppl": 53.87035984567651, "lr": 0.00056, "grad_norm": 0.6513, "tokens_per_sec": 148737, "dt_s": 4.406, "eta_s": 36254, "world_size": 1, "timestamp": "2026-05-04T21:43:57.877293"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10690, "epoch": 0, "train_loss": 3.959776297211647, "train_ppl": 52.44559241147617, "lr": 0.00056, "grad_norm": 0.7016, "tokens_per_sec": 146386, "dt_s": 4.477, "eta_s": 36349, "world_size": 1, "timestamp": "2026-05-04T21:44:02.354211"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10700, "epoch": 0, "train_loss": 3.9417029172182083, "train_ppl": 51.506237519452625, "lr": 0.00056, "grad_norm": 0.6675, "tokens_per_sec": 147101, "dt_s": 4.455, "eta_s": 36446, "world_size": 1, "timestamp": "2026-05-04T21:44:06.809375"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10710, "epoch": 0, "train_loss": 4.023143097758293, "train_ppl": 55.87645527352339, "lr": 0.00056, "grad_norm": 0.6534, "tokens_per_sec": 147988, "dt_s": 4.428, "eta_s": 36353, "world_size": 1, "timestamp": "2026-05-04T21:44:11.237839"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10720, "epoch": 0, "train_loss": 4.092118427157402, "train_ppl": 59.86658045617917, "lr": 0.00056, "grad_norm": 0.7562, "tokens_per_sec": 146928, "dt_s": 4.46, "eta_s": 36430, "world_size": 1, "timestamp": "2026-05-04T21:44:15.698259"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10730, "epoch": 0, "train_loss": 3.938532307744026, "train_ppl": 51.34318997143408, "lr": 0.00056, "grad_norm": 0.6464, "tokens_per_sec": 149292, "dt_s": 4.39, "eta_s": 36399, "world_size": 1, "timestamp": "2026-05-04T21:44:20.088006"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10740, "epoch": 0, "train_loss": 4.06468740105629, "train_ppl": 58.24669782016391, "lr": 0.00056, "grad_norm": 0.7607, "tokens_per_sec": 148476, "dt_s": 4.414, "eta_s": 36291, "world_size": 1, "timestamp": "2026-05-04T21:44:24.501918"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10750, "epoch": 0, "train_loss": 4.099308982491493, "train_ppl": 60.29860580524653, "lr": 0.00056, "grad_norm": 0.7497, "tokens_per_sec": 146933, "dt_s": 4.46, "eta_s": 36295, "world_size": 1, "timestamp": "2026-05-04T21:44:28.962206"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10760, "epoch": 0, "train_loss": 3.9654100090265274, "train_ppl": 52.74188960791747, "lr": 0.00056, "grad_norm": 0.7865, "tokens_per_sec": 149376, "dt_s": 4.387, "eta_s": 36224, "world_size": 1, "timestamp": "2026-05-04T21:44:33.349493"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10770, "epoch": 0, "train_loss": 3.917743131518364, "train_ppl": 50.286825862486246, "lr": 0.00056, "grad_norm": 0.7312, "tokens_per_sec": 146882, "dt_s": 4.462, "eta_s": 36222, "world_size": 1, "timestamp": "2026-05-04T21:44:37.811336"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10780, "epoch": 0, "train_loss": 4.035061001777649, "train_ppl": 56.546369562177354, "lr": 0.00056, "grad_norm": 0.7126, "tokens_per_sec": 149568, "dt_s": 4.382, "eta_s": 36204, "world_size": 1, "timestamp": "2026-05-04T21:44:42.193019"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10790, "epoch": 0, "train_loss": 3.9541614204645157, "train_ppl": 52.15194205093465, "lr": 0.00056, "grad_norm": 0.6932, "tokens_per_sec": 148744, "dt_s": 4.406, "eta_s": 36187, "world_size": 1, "timestamp": "2026-05-04T21:44:46.598978"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10800, "epoch": 0, "train_loss": 3.9643268138170242, "train_ppl": 52.68479077592756, "lr": 0.00056, "grad_norm": 0.6689, "tokens_per_sec": 146875, "dt_s": 4.462, "eta_s": 36185, "world_size": 1, "timestamp": "2026-05-04T21:44:51.061005"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10810, "epoch": 0, "train_loss": 3.9017634838819504, "train_ppl": 49.48964639116281, "lr": 0.00056, "grad_norm": 0.7213, "tokens_per_sec": 148177, "dt_s": 4.423, "eta_s": 36239, "world_size": 1, "timestamp": "2026-05-04T21:44:55.483821"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10820, "epoch": 0, "train_loss": 4.020027115941048, "train_ppl": 55.70261623517548, "lr": 0.00056, "grad_norm": 0.8019, "tokens_per_sec": 147374, "dt_s": 4.447, "eta_s": 36210, "world_size": 1, "timestamp": "2026-05-04T21:44:59.930752"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10830, "epoch": 0, "train_loss": 3.931857466697693, "train_ppl": 51.00162355848374, "lr": 0.00056, "grad_norm": 0.7584, "tokens_per_sec": 146482, "dt_s": 4.474, "eta_s": 36357, "world_size": 1, "timestamp": "2026-05-04T21:45:04.404742"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10840, "epoch": 0, "train_loss": 3.951947495341301, "train_ppl": 52.03660927237992, "lr": 0.00056, "grad_norm": 0.7315, "tokens_per_sec": 146956, "dt_s": 4.46, "eta_s": 36440, "world_size": 1, "timestamp": "2026-05-04T21:45:08.864311"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10850, "epoch": 0, "train_loss": 3.985239088535309, "train_ppl": 53.79815046136776, "lr": 0.00056, "grad_norm": 0.6399, "tokens_per_sec": 147608, "dt_s": 4.44, "eta_s": 36399, "world_size": 1, "timestamp": "2026-05-04T21:45:13.304157"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10860, "epoch": 0, "train_loss": 4.0086492747068405, "train_ppl": 55.0724325745098, "lr": 0.00056, "grad_norm": 0.718, "tokens_per_sec": 131043, "dt_s": 5.001, "eta_s": 37341, "world_size": 1, "timestamp": "2026-05-04T21:45:18.305314"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10870, "epoch": 0, "train_loss": 3.9429864585399628, "train_ppl": 51.57239034950029, "lr": 0.00056, "grad_norm": 0.6459, "tokens_per_sec": 148632, "dt_s": 4.409, "eta_s": 37275, "world_size": 1, "timestamp": "2026-05-04T21:45:22.714563"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10880, "epoch": 0, "train_loss": 4.050894007086754, "train_ppl": 57.44879372306404, "lr": 0.00056, "grad_norm": 0.7227, "tokens_per_sec": 146954, "dt_s": 4.46, "eta_s": 37247, "world_size": 1, "timestamp": "2026-05-04T21:45:27.174211"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10890, "epoch": 0, "train_loss": 4.038168206810951, "train_ppl": 56.72234397901078, "lr": 0.00056, "grad_norm": 0.7603, "tokens_per_sec": 149138, "dt_s": 4.394, "eta_s": 37135, "world_size": 1, "timestamp": "2026-05-04T21:45:31.568501"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10900, "epoch": 0, "train_loss": 4.002171233296394, "train_ppl": 54.71682414238478, "lr": 0.00056, "grad_norm": 0.7781, "tokens_per_sec": 147382, "dt_s": 4.447, "eta_s": 37142, "world_size": 1, "timestamp": "2026-05-04T21:45:36.015210"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10910, "epoch": 0, "train_loss": 4.1356858760118484, "train_ppl": 62.53246588369314, "lr": 0.00056, "grad_norm": 0.702, "tokens_per_sec": 146174, "dt_s": 4.483, "eta_s": 36291, "world_size": 1, "timestamp": "2026-05-04T21:45:40.498620"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10920, "epoch": 0, "train_loss": 3.950319856405258, "train_ppl": 51.95198135156114, "lr": 0.00056, "grad_norm": 0.6959, "tokens_per_sec": 149288, "dt_s": 4.39, "eta_s": 36255, "world_size": 1, "timestamp": "2026-05-04T21:45:44.888528"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10930, "epoch": 0, "train_loss": 4.122489541769028, "train_ppl": 61.71268749421779, "lr": 0.00056, "grad_norm": 0.7407, "tokens_per_sec": 148072, "dt_s": 4.426, "eta_s": 36195, "world_size": 1, "timestamp": "2026-05-04T21:45:49.314474"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10940, "epoch": 0, "train_loss": 3.972072124481201, "train_ppl": 53.09443521159914, "lr": 0.00056, "grad_norm": 0.7086, "tokens_per_sec": 145229, "dt_s": 4.513, "eta_s": 36384, "world_size": 1, "timestamp": "2026-05-04T21:45:53.827079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10950, "epoch": 0, "train_loss": 4.002743631601334, "train_ppl": 54.74815292519125, "lr": 0.00056, "grad_norm": 0.6438, "tokens_per_sec": 148361, "dt_s": 4.417, "eta_s": 36332, "world_size": 1, "timestamp": "2026-05-04T21:45:58.244406"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10960, "epoch": 0, "train_loss": 3.960936278104782, "train_ppl": 52.50646359448178, "lr": 0.00056, "grad_norm": 0.6712, "tokens_per_sec": 147342, "dt_s": 4.448, "eta_s": 36269, "world_size": 1, "timestamp": "2026-05-04T21:46:02.692306"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10970, "epoch": 0, "train_loss": 3.8808231502771378, "train_ppl": 48.464091887455815, "lr": 0.00056, "grad_norm": 0.6446, "tokens_per_sec": 145994, "dt_s": 4.489, "eta_s": 36427, "world_size": 1, "timestamp": "2026-05-04T21:46:07.181277"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10980, "epoch": 0, "train_loss": 3.9406679570674896, "train_ppl": 51.452958191852815, "lr": 0.00056, "grad_norm": 0.8621, "tokens_per_sec": 148267, "dt_s": 4.42, "eta_s": 36413, "world_size": 1, "timestamp": "2026-05-04T21:46:11.601394"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 10990, "epoch": 0, "train_loss": 4.015889286994934, "train_ppl": 55.472604540049474, "lr": 0.00056, "grad_norm": 0.7158, "tokens_per_sec": 147244, "dt_s": 4.451, "eta_s": 36307, "world_size": 1, "timestamp": "2026-05-04T21:46:16.052251"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11000, "epoch": 0, "train_loss": 3.9229089319705963, "train_ppl": 50.547269691520476, "lr": 0.00056, "grad_norm": 0.7187, "tokens_per_sec": 147800, "dt_s": 4.434, "eta_s": 36330, "world_size": 1, "timestamp": "2026-05-04T21:46:20.486366"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11010, "epoch": 0, "train_loss": 3.9115265160799026, "train_ppl": 49.9751816941099, "lr": 0.00056, "grad_norm": 0.7091, "tokens_per_sec": 126654, "dt_s": 5.174, "eta_s": 36259, "world_size": 1, "timestamp": "2026-05-04T21:46:25.660738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11020, "epoch": 0, "train_loss": 3.940791666507721, "train_ppl": 51.45932380224391, "lr": 0.00056, "grad_norm": 0.7704, "tokens_per_sec": 144335, "dt_s": 4.541, "eta_s": 36338, "world_size": 1, "timestamp": "2026-05-04T21:46:30.201308"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11030, "epoch": 0, "train_loss": 4.054916962981224, "train_ppl": 57.68037319106859, "lr": 0.00056, "grad_norm": 0.7595, "tokens_per_sec": 148917, "dt_s": 4.401, "eta_s": 36302, "world_size": 1, "timestamp": "2026-05-04T21:46:34.602135"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11040, "epoch": 0, "train_loss": 3.999167636036873, "train_ppl": 54.55272340896845, "lr": 0.00056, "grad_norm": 0.6979, "tokens_per_sec": 146777, "dt_s": 4.465, "eta_s": 36321, "world_size": 1, "timestamp": "2026-05-04T21:46:39.067120"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11050, "epoch": 0, "train_loss": 3.8704789727926254, "train_ppl": 47.96535467959335, "lr": 0.00056, "grad_norm": 0.7337, "tokens_per_sec": 147520, "dt_s": 4.443, "eta_s": 36330, "world_size": 1, "timestamp": "2026-05-04T21:46:43.509675"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11060, "epoch": 0, "train_loss": 3.983319655060768, "train_ppl": 53.694987529369286, "lr": 0.00056, "grad_norm": 0.7511, "tokens_per_sec": 149416, "dt_s": 4.386, "eta_s": 36292, "world_size": 1, "timestamp": "2026-05-04T21:46:47.895786"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11070, "epoch": 0, "train_loss": 4.001488015055656, "train_ppl": 54.67945337770319, "lr": 0.00056, "grad_norm": 0.6784, "tokens_per_sec": 149196, "dt_s": 4.393, "eta_s": 36047, "world_size": 1, "timestamp": "2026-05-04T21:46:52.288426"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11080, "epoch": 0, "train_loss": 4.0457684099674225, "train_ppl": 57.155087703825, "lr": 0.00056, "grad_norm": 0.7331, "tokens_per_sec": 147172, "dt_s": 4.453, "eta_s": 36127, "world_size": 1, "timestamp": "2026-05-04T21:46:56.741441"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11090, "epoch": 0, "train_loss": 3.982156902551651, "train_ppl": 53.63258983146251, "lr": 0.00056, "grad_norm": 0.6923, "tokens_per_sec": 148434, "dt_s": 4.415, "eta_s": 36042, "world_size": 1, "timestamp": "2026-05-04T21:47:01.156614"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11100, "epoch": 0, "train_loss": 3.8801719397306442, "train_ppl": 48.43254183367132, "lr": 0.00056, "grad_norm": 0.6891, "tokens_per_sec": 147469, "dt_s": 4.444, "eta_s": 36040, "world_size": 1, "timestamp": "2026-05-04T21:47:05.600655"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11110, "epoch": 0, "train_loss": 3.963878110051155, "train_ppl": 52.66115621475721, "lr": 0.00056, "grad_norm": 0.753, "tokens_per_sec": 147909, "dt_s": 4.431, "eta_s": 36108, "world_size": 1, "timestamp": "2026-05-04T21:47:10.031503"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11120, "epoch": 0, "train_loss": 4.006717637181282, "train_ppl": 54.96615527478058, "lr": 0.00056, "grad_norm": 0.7516, "tokens_per_sec": 147673, "dt_s": 4.438, "eta_s": 36178, "world_size": 1, "timestamp": "2026-05-04T21:47:14.469404"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11130, "epoch": 0, "train_loss": 3.9329809695482254, "train_ppl": 51.058956228611926, "lr": 0.00056, "grad_norm": 0.6797, "tokens_per_sec": 144463, "dt_s": 4.537, "eta_s": 36309, "world_size": 1, "timestamp": "2026-05-04T21:47:19.005925"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11140, "epoch": 0, "train_loss": 3.983225166797638, "train_ppl": 53.68991422294657, "lr": 0.00056, "grad_norm": 0.7181, "tokens_per_sec": 150012, "dt_s": 4.369, "eta_s": 36229, "world_size": 1, "timestamp": "2026-05-04T21:47:23.374631"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11150, "epoch": 0, "train_loss": 4.017167374491692, "train_ppl": 55.54354870909508, "lr": 0.00056, "grad_norm": 0.667, "tokens_per_sec": 148198, "dt_s": 4.422, "eta_s": 36189, "world_size": 1, "timestamp": "2026-05-04T21:47:27.796844"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11160, "epoch": 0, "train_loss": 3.954201340675354, "train_ppl": 52.15402400901278, "lr": 0.00056, "grad_norm": 0.6473, "tokens_per_sec": 130968, "dt_s": 5.004, "eta_s": 37119, "world_size": 1, "timestamp": "2026-05-04T21:47:32.800832"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11170, "epoch": 0, "train_loss": 3.9212003350257874, "train_ppl": 50.460978520365316, "lr": 0.00056, "grad_norm": 0.6554, "tokens_per_sec": 148865, "dt_s": 4.402, "eta_s": 37057, "world_size": 1, "timestamp": "2026-05-04T21:47:37.203205"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11180, "epoch": 0, "train_loss": 3.9634056985378265, "train_ppl": 52.636284353595514, "lr": 0.00056, "grad_norm": 0.6764, "tokens_per_sec": 146281, "dt_s": 4.48, "eta_s": 36960, "world_size": 1, "timestamp": "2026-05-04T21:47:41.683359"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11190, "epoch": 0, "train_loss": 4.032616287469864, "train_ppl": 56.40829868414742, "lr": 0.00056, "grad_norm": 0.9962, "tokens_per_sec": 147342, "dt_s": 4.448, "eta_s": 37085, "world_size": 1, "timestamp": "2026-05-04T21:47:46.131233"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11200, "epoch": 0, "train_loss": 3.903870403766632, "train_ppl": 49.59402703344251, "lr": 0.00056, "grad_norm": 0.7517, "tokens_per_sec": 149863, "dt_s": 4.373, "eta_s": 37000, "world_size": 1, "timestamp": "2026-05-04T21:47:50.504274"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11210, "epoch": 0, "train_loss": 3.9863237887620926, "train_ppl": 53.85653698758819, "lr": 0.00056, "grad_norm": 0.7847, "tokens_per_sec": 146880, "dt_s": 4.462, "eta_s": 36112, "world_size": 1, "timestamp": "2026-05-04T21:47:54.966122"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11220, "epoch": 0, "train_loss": 3.9648062735795975, "train_ppl": 52.71005706980582, "lr": 0.00056, "grad_norm": 0.6345, "tokens_per_sec": 149106, "dt_s": 4.395, "eta_s": 36096, "world_size": 1, "timestamp": "2026-05-04T21:47:59.361389"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11230, "epoch": 0, "train_loss": 4.01753257215023, "train_ppl": 55.56383678738444, "lr": 0.00056, "grad_norm": 0.6527, "tokens_per_sec": 149014, "dt_s": 4.398, "eta_s": 35958, "world_size": 1, "timestamp": "2026-05-04T21:48:03.759389"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11240, "epoch": 0, "train_loss": 3.9417698979377747, "train_ppl": 51.50968755984564, "lr": 0.00056, "grad_norm": 0.7259, "tokens_per_sec": 147340, "dt_s": 4.448, "eta_s": 35954, "world_size": 1, "timestamp": "2026-05-04T21:48:08.207314"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11250, "epoch": 0, "train_loss": 3.939089372754097, "train_ppl": 51.371799433998014, "lr": 0.00056, "grad_norm": 0.6919, "tokens_per_sec": 149873, "dt_s": 4.373, "eta_s": 35949, "world_size": 1, "timestamp": "2026-05-04T21:48:12.580067"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11260, "epoch": 0, "train_loss": 3.9023744761943817, "train_ppl": 49.51989342406408, "lr": 0.00056, "grad_norm": 0.6655, "tokens_per_sec": 149141, "dt_s": 4.394, "eta_s": 35834, "world_size": 1, "timestamp": "2026-05-04T21:48:16.974323"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11270, "epoch": 0, "train_loss": 3.7613258361816406, "train_ppl": 43.005406320822885, "lr": 0.00056, "grad_norm": 1.0016, "tokens_per_sec": 147062, "dt_s": 4.456, "eta_s": 35929, "world_size": 1, "timestamp": "2026-05-04T21:48:21.430678"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11280, "epoch": 0, "train_loss": 4.134071007370949, "train_ppl": 62.43156565773212, "lr": 0.00056, "grad_norm": 0.6792, "tokens_per_sec": 148546, "dt_s": 4.412, "eta_s": 35947, "world_size": 1, "timestamp": "2026-05-04T21:48:25.842546"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11290, "epoch": 0, "train_loss": 4.004210531711578, "train_ppl": 54.828521928991705, "lr": 0.00056, "grad_norm": 0.7796, "tokens_per_sec": 147395, "dt_s": 4.446, "eta_s": 35940, "world_size": 1, "timestamp": "2026-05-04T21:48:30.288805"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11300, "epoch": 0, "train_loss": 4.029588133096695, "train_ppl": 56.23774401107175, "lr": 0.00056, "grad_norm": 0.7308, "tokens_per_sec": 149502, "dt_s": 4.384, "eta_s": 35953, "world_size": 1, "timestamp": "2026-05-04T21:48:34.672436"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11310, "epoch": 0, "train_loss": 4.078320130705833, "train_ppl": 59.04619659758196, "lr": 0.00056, "grad_norm": 0.7418, "tokens_per_sec": 147974, "dt_s": 4.429, "eta_s": 36005, "world_size": 1, "timestamp": "2026-05-04T21:48:39.101340"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11320, "epoch": 0, "train_loss": 3.9681220054626465, "train_ppl": 52.88511955633604, "lr": 0.00056, "grad_norm": 0.6976, "tokens_per_sec": 147883, "dt_s": 4.432, "eta_s": 35961, "world_size": 1, "timestamp": "2026-05-04T21:48:43.532957"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11330, "epoch": 0, "train_loss": 4.025619849562645, "train_ppl": 56.015018907976135, "lr": 0.00056, "grad_norm": 0.7109, "tokens_per_sec": 147404, "dt_s": 4.446, "eta_s": 36012, "world_size": 1, "timestamp": "2026-05-04T21:48:47.978965"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11340, "epoch": 0, "train_loss": 4.005285620689392, "train_ppl": 54.887499165795425, "lr": 0.00056, "grad_norm": 0.7096, "tokens_per_sec": 148133, "dt_s": 4.424, "eta_s": 35972, "world_size": 1, "timestamp": "2026-05-04T21:48:52.403099"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11350, "epoch": 0, "train_loss": 4.109952121973038, "train_ppl": 60.94379963088828, "lr": 0.00056, "grad_norm": 0.7788, "tokens_per_sec": 146388, "dt_s": 4.477, "eta_s": 36119, "world_size": 1, "timestamp": "2026-05-04T21:48:56.879990"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11360, "epoch": 0, "train_loss": 4.0056808441877365, "train_ppl": 54.90919628255388, "lr": 0.00056, "grad_norm": 0.6935, "tokens_per_sec": 146878, "dt_s": 4.462, "eta_s": 36168, "world_size": 1, "timestamp": "2026-05-04T21:49:01.341894"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11370, "epoch": 0, "train_loss": 3.9152621924877167, "train_ppl": 50.16222194475011, "lr": 0.00056, "grad_norm": 0.6714, "tokens_per_sec": 149690, "dt_s": 4.378, "eta_s": 36077, "world_size": 1, "timestamp": "2026-05-04T21:49:05.720031"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11380, "epoch": 0, "train_loss": 3.8814434111118317, "train_ppl": 48.494161590108, "lr": 0.00056, "grad_norm": 0.6928, "tokens_per_sec": 144955, "dt_s": 4.521, "eta_s": 36194, "world_size": 1, "timestamp": "2026-05-04T21:49:10.241120"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11390, "epoch": 0, "train_loss": 3.9400867968797684, "train_ppl": 51.423064468373575, "lr": 0.00056, "grad_norm": 0.6867, "tokens_per_sec": 149533, "dt_s": 4.383, "eta_s": 36123, "world_size": 1, "timestamp": "2026-05-04T21:49:14.623841"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11400, "epoch": 0, "train_loss": 3.974971354007721, "train_ppl": 53.248591525146, "lr": 0.00056, "grad_norm": 0.7744, "tokens_per_sec": 148192, "dt_s": 4.422, "eta_s": 36030, "world_size": 1, "timestamp": "2026-05-04T21:49:19.046247"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11410, "epoch": 0, "train_loss": 3.9655471444129944, "train_ppl": 52.74912288328952, "lr": 0.00056, "grad_norm": 0.7184, "tokens_per_sec": 144785, "dt_s": 4.526, "eta_s": 36130, "world_size": 1, "timestamp": "2026-05-04T21:49:23.572748"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11420, "epoch": 0, "train_loss": 3.9452233612537384, "train_ppl": 51.68788189294505, "lr": 0.00056, "grad_norm": 0.6816, "tokens_per_sec": 148671, "dt_s": 4.408, "eta_s": 36174, "world_size": 1, "timestamp": "2026-05-04T21:49:27.980787"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11430, "epoch": 0, "train_loss": 3.958859145641327, "train_ppl": 52.39751390504862, "lr": 0.00056, "grad_norm": 0.6452, "tokens_per_sec": 148676, "dt_s": 4.408, "eta_s": 35986, "world_size": 1, "timestamp": "2026-05-04T21:49:32.388762"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11440, "epoch": 0, "train_loss": 4.018353626132011, "train_ppl": 55.609476430563205, "lr": 0.00056, "grad_norm": 0.7348, "tokens_per_sec": 146068, "dt_s": 4.487, "eta_s": 36150, "world_size": 1, "timestamp": "2026-05-04T21:49:36.875457"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11450, "epoch": 0, "train_loss": 4.075584039092064, "train_ppl": 58.884861608550366, "lr": 0.00056, "grad_norm": 0.75, "tokens_per_sec": 148296, "dt_s": 4.419, "eta_s": 36141, "world_size": 1, "timestamp": "2026-05-04T21:49:41.294737"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11460, "epoch": 0, "train_loss": 4.041601851582527, "train_ppl": 56.91744311773114, "lr": 0.00056, "grad_norm": 0.7231, "tokens_per_sec": 132049, "dt_s": 4.963, "eta_s": 36845, "world_size": 1, "timestamp": "2026-05-04T21:49:46.257751"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11470, "epoch": 0, "train_loss": 3.9786304980516434, "train_ppl": 53.44379270849508, "lr": 0.00056, "grad_norm": 0.7018, "tokens_per_sec": 147025, "dt_s": 4.457, "eta_s": 36921, "world_size": 1, "timestamp": "2026-05-04T21:49:50.715256"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11480, "epoch": 0, "train_loss": 4.055212765932083, "train_ppl": 57.69743773941261, "lr": 0.00056, "grad_norm": 0.7162, "tokens_per_sec": 148399, "dt_s": 4.416, "eta_s": 36930, "world_size": 1, "timestamp": "2026-05-04T21:49:55.131416"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11490, "epoch": 0, "train_loss": 3.8842553049325943, "train_ppl": 48.630713918777175, "lr": 0.00056, "grad_norm": 0.7108, "tokens_per_sec": 145532, "dt_s": 4.503, "eta_s": 36952, "world_size": 1, "timestamp": "2026-05-04T21:49:59.634660"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11500, "epoch": 0, "train_loss": 3.907669886946678, "train_ppl": 49.782817130269976, "lr": 0.00056, "grad_norm": 0.6573, "tokens_per_sec": 148563, "dt_s": 4.411, "eta_s": 36935, "world_size": 1, "timestamp": "2026-05-04T21:50:04.045969"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11510, "epoch": 0, "train_loss": 3.8663619607686996, "train_ppl": 47.76828668165909, "lr": 0.00056, "grad_norm": 0.6524, "tokens_per_sec": 126730, "dt_s": 5.171, "eta_s": 36023, "world_size": 1, "timestamp": "2026-05-04T21:50:09.217299"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11520, "epoch": 0, "train_loss": 3.883140668272972, "train_ppl": 48.5765385407999, "lr": 0.00056, "grad_norm": 0.6974, "tokens_per_sec": 144408, "dt_s": 4.538, "eta_s": 36149, "world_size": 1, "timestamp": "2026-05-04T21:50:13.755500"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11530, "epoch": 0, "train_loss": 3.9448461532592773, "train_ppl": 51.66838848744361, "lr": 0.00056, "grad_norm": 0.673, "tokens_per_sec": 147740, "dt_s": 4.436, "eta_s": 36177, "world_size": 1, "timestamp": "2026-05-04T21:50:18.191422"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11540, "epoch": 0, "train_loss": 3.938236176967621, "train_ppl": 51.3279879237329, "lr": 0.00056, "grad_norm": 0.6703, "tokens_per_sec": 147709, "dt_s": 4.437, "eta_s": 36065, "world_size": 1, "timestamp": "2026-05-04T21:50:22.628266"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11550, "epoch": 0, "train_loss": 3.958360940217972, "train_ppl": 52.371415681128966, "lr": 0.00056, "grad_norm": 0.6758, "tokens_per_sec": 147315, "dt_s": 4.449, "eta_s": 36121, "world_size": 1, "timestamp": "2026-05-04T21:50:27.077023"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11560, "epoch": 0, "train_loss": 3.97540719807148, "train_ppl": 53.271804665953105, "lr": 0.00056, "grad_norm": 0.7666, "tokens_per_sec": 148169, "dt_s": 4.423, "eta_s": 36147, "world_size": 1, "timestamp": "2026-05-04T21:50:31.500021"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11570, "epoch": 0, "train_loss": 4.039828419685364, "train_ppl": 56.8165933601212, "lr": 0.00056, "grad_norm": 0.7745, "tokens_per_sec": 147370, "dt_s": 4.447, "eta_s": 35995, "world_size": 1, "timestamp": "2026-05-04T21:50:35.947059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11580, "epoch": 0, "train_loss": 3.9277941435575485, "train_ppl": 50.794807945181255, "lr": 0.00056, "grad_norm": 0.656, "tokens_per_sec": 149848, "dt_s": 4.374, "eta_s": 35889, "world_size": 1, "timestamp": "2026-05-04T21:50:40.320583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11590, "epoch": 0, "train_loss": 3.9921926707029343, "train_ppl": 54.17354397328841, "lr": 0.00056, "grad_norm": 0.7325, "tokens_per_sec": 150644, "dt_s": 4.35, "eta_s": 35745, "world_size": 1, "timestamp": "2026-05-04T21:50:44.670996"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11600, "epoch": 0, "train_loss": 4.015068233013153, "train_ppl": 55.42707722995932, "lr": 0.00056, "grad_norm": 0.6888, "tokens_per_sec": 145827, "dt_s": 4.494, "eta_s": 35814, "world_size": 1, "timestamp": "2026-05-04T21:50:49.165042"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11610, "epoch": 0, "train_loss": 4.050289556384087, "train_ppl": 57.414079251980915, "lr": 0.00056, "grad_norm": 0.6993, "tokens_per_sec": 148402, "dt_s": 4.416, "eta_s": 35798, "world_size": 1, "timestamp": "2026-05-04T21:50:53.581153"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11620, "epoch": 0, "train_loss": 3.9338190108537674, "train_ppl": 51.101763677649245, "lr": 0.00056, "grad_norm": 0.6814, "tokens_per_sec": 148923, "dt_s": 4.401, "eta_s": 35719, "world_size": 1, "timestamp": "2026-05-04T21:50:57.981803"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11630, "epoch": 0, "train_loss": 4.038326233625412, "train_ppl": 56.73130833862447, "lr": 0.00056, "grad_norm": 0.7163, "tokens_per_sec": 142609, "dt_s": 4.596, "eta_s": 36074, "world_size": 1, "timestamp": "2026-05-04T21:51:02.577328"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11640, "epoch": 0, "train_loss": 3.9369242936372757, "train_ppl": 51.260695741392766, "lr": 0.00056, "grad_norm": 0.6555, "tokens_per_sec": 149857, "dt_s": 4.373, "eta_s": 36107, "world_size": 1, "timestamp": "2026-05-04T21:51:06.950551"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11650, "epoch": 0, "train_loss": 3.9705635756254196, "train_ppl": 53.014400045769364, "lr": 0.00056, "grad_norm": 0.7088, "tokens_per_sec": 147146, "dt_s": 4.454, "eta_s": 36037, "world_size": 1, "timestamp": "2026-05-04T21:51:11.404381"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11660, "epoch": 0, "train_loss": 3.9690718948841095, "train_ppl": 52.93537843836659, "lr": 0.00056, "grad_norm": 0.6812, "tokens_per_sec": 146225, "dt_s": 4.482, "eta_s": 36139, "world_size": 1, "timestamp": "2026-05-04T21:51:15.886230"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11670, "epoch": 0, "train_loss": 3.9669318199157715, "train_ppl": 52.82221389352639, "lr": 0.00056, "grad_norm": 0.7446, "tokens_per_sec": 150259, "dt_s": 4.362, "eta_s": 36071, "world_size": 1, "timestamp": "2026-05-04T21:51:20.247738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11680, "epoch": 0, "train_loss": 4.052864626049995, "train_ppl": 57.56211502527152, "lr": 0.00056, "grad_norm": 0.7264, "tokens_per_sec": 148666, "dt_s": 4.408, "eta_s": 35763, "world_size": 1, "timestamp": "2026-05-04T21:51:24.656022"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11690, "epoch": 0, "train_loss": 3.9900617003440857, "train_ppl": 54.05822467149359, "lr": 0.00056, "grad_norm": 0.6941, "tokens_per_sec": 145335, "dt_s": 4.509, "eta_s": 35980, "world_size": 1, "timestamp": "2026-05-04T21:51:29.165341"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11700, "epoch": 0, "train_loss": 4.142557799816132, "train_ppl": 62.96366410832835, "lr": 0.00056, "grad_norm": 0.9134, "tokens_per_sec": 148129, "dt_s": 4.424, "eta_s": 35927, "world_size": 1, "timestamp": "2026-05-04T21:51:33.589551"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11710, "epoch": 0, "train_loss": 4.010743275284767, "train_ppl": 55.187875106329756, "lr": 0.00056, "grad_norm": 0.6778, "tokens_per_sec": 143906, "dt_s": 4.554, "eta_s": 36040, "world_size": 1, "timestamp": "2026-05-04T21:51:38.143653"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11720, "epoch": 0, "train_loss": 4.334562927484512, "train_ppl": 76.29160663719867, "lr": 0.00056, "grad_norm": 2.1916, "tokens_per_sec": 148745, "dt_s": 4.406, "eta_s": 36107, "world_size": 1, "timestamp": "2026-05-04T21:51:42.549601"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11730, "epoch": 0, "train_loss": 3.974778860807419, "train_ppl": 53.23834251981531, "lr": 0.00056, "grad_norm": 0.7048, "tokens_per_sec": 149242, "dt_s": 4.391, "eta_s": 36075, "world_size": 1, "timestamp": "2026-05-04T21:51:46.940837"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11740, "epoch": 0, "train_loss": 3.896773710846901, "train_ppl": 49.243319357166015, "lr": 0.00056, "grad_norm": 0.7016, "tokens_per_sec": 146325, "dt_s": 4.479, "eta_s": 36021, "world_size": 1, "timestamp": "2026-05-04T21:51:51.419626"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11750, "epoch": 0, "train_loss": 3.9261547178030014, "train_ppl": 50.71160185258042, "lr": 0.00056, "grad_norm": 0.7539, "tokens_per_sec": 132706, "dt_s": 4.938, "eta_s": 36849, "world_size": 1, "timestamp": "2026-05-04T21:51:56.358082"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11760, "epoch": 0, "train_loss": 3.994500517845154, "train_ppl": 54.29871261141731, "lr": 0.00056, "grad_norm": 0.7047, "tokens_per_sec": 148042, "dt_s": 4.427, "eta_s": 36639, "world_size": 1, "timestamp": "2026-05-04T21:52:00.784950"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11770, "epoch": 0, "train_loss": 3.964398145675659, "train_ppl": 52.68854901401486, "lr": 0.00056, "grad_norm": 0.7218, "tokens_per_sec": 143859, "dt_s": 4.556, "eta_s": 36876, "world_size": 1, "timestamp": "2026-05-04T21:52:05.340528"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11780, "epoch": 0, "train_loss": 4.004887834191322, "train_ppl": 54.865670001675674, "lr": 0.00056, "grad_norm": 0.731, "tokens_per_sec": 148173, "dt_s": 4.423, "eta_s": 36923, "world_size": 1, "timestamp": "2026-05-04T21:52:09.763487"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11790, "epoch": 0, "train_loss": 3.9658448100090027, "train_ppl": 52.76482681953631, "lr": 0.00056, "grad_norm": 0.7022, "tokens_per_sec": 148161, "dt_s": 4.423, "eta_s": 36829, "world_size": 1, "timestamp": "2026-05-04T21:52:14.186766"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11800, "epoch": 0, "train_loss": 4.053959935903549, "train_ppl": 57.62519791840058, "lr": 0.00056, "grad_norm": 0.7403, "tokens_per_sec": 145256, "dt_s": 4.512, "eta_s": 36134, "world_size": 1, "timestamp": "2026-05-04T21:52:18.698519"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11810, "epoch": 0, "train_loss": 3.9436657279729843, "train_ppl": 51.60743379847698, "lr": 0.00056, "grad_norm": 0.7528, "tokens_per_sec": 149413, "dt_s": 4.386, "eta_s": 36064, "world_size": 1, "timestamp": "2026-05-04T21:52:23.084749"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11820, "epoch": 0, "train_loss": 3.9638352394104004, "train_ppl": 52.65889864563947, "lr": 0.00056, "grad_norm": 0.7282, "tokens_per_sec": 146679, "dt_s": 4.468, "eta_s": 35918, "world_size": 1, "timestamp": "2026-05-04T21:52:27.552746"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11830, "epoch": 0, "train_loss": 3.9946519136428833, "train_ppl": 54.306933830642116, "lr": 0.00056, "grad_norm": 0.8407, "tokens_per_sec": 148221, "dt_s": 4.422, "eta_s": 35911, "world_size": 1, "timestamp": "2026-05-04T21:52:31.974263"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11840, "epoch": 0, "train_loss": 3.9565977454185486, "train_ppl": 52.27915603312917, "lr": 0.00056, "grad_norm": 0.7061, "tokens_per_sec": 148600, "dt_s": 4.41, "eta_s": 35885, "world_size": 1, "timestamp": "2026-05-04T21:52:36.384478"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11850, "epoch": 0, "train_loss": 4.0540149211883545, "train_ppl": 57.62836654343316, "lr": 0.00056, "grad_norm": 0.7094, "tokens_per_sec": 146957, "dt_s": 4.46, "eta_s": 35796, "world_size": 1, "timestamp": "2026-05-04T21:52:40.844035"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11860, "epoch": 0, "train_loss": 3.9370667934417725, "train_ppl": 51.26800090099383, "lr": 0.00056, "grad_norm": 0.7013, "tokens_per_sec": 148004, "dt_s": 4.428, "eta_s": 35859, "world_size": 1, "timestamp": "2026-05-04T21:52:45.272012"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11870, "epoch": 0, "train_loss": 3.9020859003067017, "train_ppl": 49.505605238573466, "lr": 0.00056, "grad_norm": 0.6541, "tokens_per_sec": 149760, "dt_s": 4.376, "eta_s": 35706, "world_size": 1, "timestamp": "2026-05-04T21:52:49.648071"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11880, "epoch": 0, "train_loss": 3.930772691965103, "train_ppl": 50.94632828280772, "lr": 0.00056, "grad_norm": 0.6753, "tokens_per_sec": 145964, "dt_s": 4.49, "eta_s": 35813, "world_size": 1, "timestamp": "2026-05-04T21:52:54.137952"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11890, "epoch": 0, "train_loss": 3.879641577601433, "train_ppl": 48.4068618581065, "lr": 0.00056, "grad_norm": 0.7334, "tokens_per_sec": 149421, "dt_s": 4.386, "eta_s": 35769, "world_size": 1, "timestamp": "2026-05-04T21:52:58.523968"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11900, "epoch": 0, "train_loss": 3.914431244134903, "train_ppl": 50.12055704214012, "lr": 0.00056, "grad_norm": 0.7146, "tokens_per_sec": 149517, "dt_s": 4.383, "eta_s": 35641, "world_size": 1, "timestamp": "2026-05-04T21:53:02.907143"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11910, "epoch": 0, "train_loss": 3.923284962773323, "train_ppl": 50.56628059603705, "lr": 0.00056, "grad_norm": 0.7307, "tokens_per_sec": 146102, "dt_s": 4.486, "eta_s": 35730, "world_size": 1, "timestamp": "2026-05-04T21:53:07.392781"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11920, "epoch": 0, "train_loss": 4.012816861271858, "train_ppl": 55.30243064013563, "lr": 0.00056, "grad_norm": 0.7045, "tokens_per_sec": 149390, "dt_s": 4.387, "eta_s": 35743, "world_size": 1, "timestamp": "2026-05-04T21:53:11.779697"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11930, "epoch": 0, "train_loss": 4.005835324525833, "train_ppl": 54.91767932897529, "lr": 0.00056, "grad_norm": 0.6491, "tokens_per_sec": 148049, "dt_s": 4.427, "eta_s": 35637, "world_size": 1, "timestamp": "2026-05-04T21:53:16.206331"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11940, "epoch": 0, "train_loss": 3.9047907292842865, "train_ppl": 49.63969069153642, "lr": 0.00056, "grad_norm": 0.755, "tokens_per_sec": 144053, "dt_s": 4.549, "eta_s": 35896, "world_size": 1, "timestamp": "2026-05-04T21:53:20.755760"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11950, "epoch": 0, "train_loss": 3.9111236929893494, "train_ppl": 49.955054591072006, "lr": 0.00056, "grad_norm": 0.7516, "tokens_per_sec": 149134, "dt_s": 4.394, "eta_s": 35910, "world_size": 1, "timestamp": "2026-05-04T21:53:25.150213"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11960, "epoch": 0, "train_loss": 3.992674872279167, "train_ppl": 54.19967284076671, "lr": 0.00056, "grad_norm": 0.6872, "tokens_per_sec": 146151, "dt_s": 4.484, "eta_s": 35903, "world_size": 1, "timestamp": "2026-05-04T21:53:29.634370"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11970, "epoch": 0, "train_loss": 3.9935528934001923, "train_ppl": 54.24728219622479, "lr": 0.00056, "grad_norm": 0.7984, "tokens_per_sec": 146381, "dt_s": 4.477, "eta_s": 36044, "world_size": 1, "timestamp": "2026-05-04T21:53:34.111448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11980, "epoch": 0, "train_loss": 4.047691389918327, "train_ppl": 57.265101534866126, "lr": 0.00056, "grad_norm": 0.6865, "tokens_per_sec": 148999, "dt_s": 4.398, "eta_s": 35994, "world_size": 1, "timestamp": "2026-05-04T21:53:38.509868"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 11990, "epoch": 0, "train_loss": 4.250053808093071, "train_ppl": 70.10918468673049, "lr": 0.00056, "grad_norm": 1.5413, "tokens_per_sec": 144734, "dt_s": 4.528, "eta_s": 35955, "world_size": 1, "timestamp": "2026-05-04T21:53:43.037914"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12000, "epoch": 0, "train_loss": 3.852963298559189, "train_ppl": 47.13252423839338, "lr": 0.00056, "grad_norm": 0.7598, "tokens_per_sec": 146665, "dt_s": 4.468, "eta_s": 36070, "world_size": 1, "timestamp": "2026-05-04T21:53:47.506338"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12010, "epoch": 0, "train_loss": 3.894448235630989, "train_ppl": 49.1289382851719, "lr": 0.00056, "grad_norm": 0.6921, "tokens_per_sec": 107320, "dt_s": 6.107, "eta_s": 36042, "world_size": 1, "timestamp": "2026-05-04T21:53:53.612962"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12020, "epoch": 0, "train_loss": 3.8553807586431503, "train_ppl": 47.246603069348716, "lr": 0.00056, "grad_norm": 0.74, "tokens_per_sec": 140513, "dt_s": 4.664, "eta_s": 36339, "world_size": 1, "timestamp": "2026-05-04T21:53:58.276980"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12030, "epoch": 0, "train_loss": 3.9261447340250015, "train_ppl": 50.71109556173286, "lr": 0.00056, "grad_norm": 0.6841, "tokens_per_sec": 147799, "dt_s": 4.434, "eta_s": 36392, "world_size": 1, "timestamp": "2026-05-04T21:54:02.711133"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12040, "epoch": 0, "train_loss": 3.929743692278862, "train_ppl": 50.8939314897547, "lr": 0.00056, "grad_norm": 0.7539, "tokens_per_sec": 131692, "dt_s": 4.976, "eta_s": 37111, "world_size": 1, "timestamp": "2026-05-04T21:54:07.687609"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12050, "epoch": 0, "train_loss": 3.9467050433158875, "train_ppl": 51.76452366572478, "lr": 0.00056, "grad_norm": 0.7114, "tokens_per_sec": 147704, "dt_s": 4.437, "eta_s": 37056, "world_size": 1, "timestamp": "2026-05-04T21:54:12.124581"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12060, "epoch": 0, "train_loss": 4.018014416098595, "train_ppl": 55.590616337152206, "lr": 0.00056, "grad_norm": 0.7831, "tokens_per_sec": 148948, "dt_s": 4.4, "eta_s": 36939, "world_size": 1, "timestamp": "2026-05-04T21:54:16.524468"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12070, "epoch": 0, "train_loss": 3.9578649699687958, "train_ppl": 52.34544745730943, "lr": 0.00056, "grad_norm": 0.7549, "tokens_per_sec": 145304, "dt_s": 4.51, "eta_s": 36686, "world_size": 1, "timestamp": "2026-05-04T21:54:21.034772"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12080, "epoch": 0, "train_loss": 3.9465450942516327, "train_ppl": 51.75624464073166, "lr": 0.00056, "grad_norm": 0.679, "tokens_per_sec": 149634, "dt_s": 4.38, "eta_s": 36594, "world_size": 1, "timestamp": "2026-05-04T21:54:25.414539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12090, "epoch": 0, "train_loss": 3.8885503113269806, "train_ppl": 48.840032336216545, "lr": 0.00056, "grad_norm": 0.7091, "tokens_per_sec": 149247, "dt_s": 4.391, "eta_s": 35646, "world_size": 1, "timestamp": "2026-05-04T21:54:29.805680"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12100, "epoch": 0, "train_loss": 3.969362825155258, "train_ppl": 52.950781182822695, "lr": 0.00056, "grad_norm": 0.6465, "tokens_per_sec": 147320, "dt_s": 4.449, "eta_s": 35660, "world_size": 1, "timestamp": "2026-05-04T21:54:34.254210"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12110, "epoch": 0, "train_loss": 4.058127015829086, "train_ppl": 57.86582773749338, "lr": 0.00056, "grad_norm": 0.8378, "tokens_per_sec": 149183, "dt_s": 4.393, "eta_s": 35645, "world_size": 1, "timestamp": "2026-05-04T21:54:38.647217"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12120, "epoch": 0, "train_loss": 3.9991624802351, "train_ppl": 54.55244214666544, "lr": 0.00056, "grad_norm": 0.7195, "tokens_per_sec": 148943, "dt_s": 4.4, "eta_s": 35463, "world_size": 1, "timestamp": "2026-05-04T21:54:43.047297"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12130, "epoch": 0, "train_loss": 4.003195524215698, "train_ppl": 54.77289880196662, "lr": 0.00056, "grad_norm": 0.7201, "tokens_per_sec": 145306, "dt_s": 4.51, "eta_s": 35668, "world_size": 1, "timestamp": "2026-05-04T21:54:47.557469"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12140, "epoch": 0, "train_loss": 3.974497139453888, "train_ppl": 53.22334625438417, "lr": 0.00056, "grad_norm": 0.7606, "tokens_per_sec": 147018, "dt_s": 4.458, "eta_s": 35771, "world_size": 1, "timestamp": "2026-05-04T21:54:52.015198"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12150, "epoch": 0, "train_loss": 3.9609109312295914, "train_ppl": 52.50513273656896, "lr": 0.00056, "grad_norm": 0.7224, "tokens_per_sec": 145770, "dt_s": 4.496, "eta_s": 35843, "world_size": 1, "timestamp": "2026-05-04T21:54:56.511011"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12160, "epoch": 0, "train_loss": 3.797502502799034, "train_ppl": 44.58968270580765, "lr": 0.00056, "grad_norm": 0.6898, "tokens_per_sec": 147553, "dt_s": 4.442, "eta_s": 35917, "world_size": 1, "timestamp": "2026-05-04T21:55:00.952554"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12170, "epoch": 0, "train_loss": 3.9775769263505936, "train_ppl": 53.38751549214762, "lr": 0.00056, "grad_norm": 0.7006, "tokens_per_sec": 150009, "dt_s": 4.369, "eta_s": 35862, "world_size": 1, "timestamp": "2026-05-04T21:55:05.321357"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12180, "epoch": 0, "train_loss": 4.031990796327591, "train_ppl": 56.37302682523743, "lr": 0.00056, "grad_norm": 0.6779, "tokens_per_sec": 144136, "dt_s": 4.547, "eta_s": 35916, "world_size": 1, "timestamp": "2026-05-04T21:55:09.868169"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12190, "epoch": 0, "train_loss": 4.023803994059563, "train_ppl": 55.91339602179907, "lr": 0.00056, "grad_norm": 0.6932, "tokens_per_sec": 146713, "dt_s": 4.467, "eta_s": 35927, "world_size": 1, "timestamp": "2026-05-04T21:55:14.335123"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12200, "epoch": 0, "train_loss": 3.9819676280021667, "train_ppl": 53.62243950781376, "lr": 0.00056, "grad_norm": 0.6935, "tokens_per_sec": 147899, "dt_s": 4.431, "eta_s": 35818, "world_size": 1, "timestamp": "2026-05-04T21:55:18.766258"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12210, "epoch": 0, "train_loss": 3.9057530909776688, "train_ppl": 49.68748502235719, "lr": 0.00056, "grad_norm": 0.6648, "tokens_per_sec": 144270, "dt_s": 4.543, "eta_s": 35976, "world_size": 1, "timestamp": "2026-05-04T21:55:23.308847"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12220, "epoch": 0, "train_loss": 4.043822169303894, "train_ppl": 57.043958325344406, "lr": 0.00056, "grad_norm": 0.7129, "tokens_per_sec": 145609, "dt_s": 4.501, "eta_s": 36184, "world_size": 1, "timestamp": "2026-05-04T21:55:27.809659"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12230, "epoch": 0, "train_loss": 3.9444384425878525, "train_ppl": 51.64732702786461, "lr": 0.00056, "grad_norm": 0.7293, "tokens_per_sec": 148405, "dt_s": 4.416, "eta_s": 35969, "world_size": 1, "timestamp": "2026-05-04T21:55:32.225688"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12240, "epoch": 0, "train_loss": 4.00796765089035, "train_ppl": 55.034906683558376, "lr": 0.00056, "grad_norm": 0.7279, "tokens_per_sec": 146858, "dt_s": 4.463, "eta_s": 35958, "world_size": 1, "timestamp": "2026-05-04T21:55:36.688246"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12250, "epoch": 0, "train_loss": 3.8821288347244263, "train_ppl": 48.52741202755055, "lr": 0.00056, "grad_norm": 0.6788, "tokens_per_sec": 146915, "dt_s": 4.461, "eta_s": 36001, "world_size": 1, "timestamp": "2026-05-04T21:55:41.149058"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12260, "epoch": 0, "train_loss": 4.113355591893196, "train_ppl": 61.15157339506635, "lr": 0.00056, "grad_norm": 0.808, "tokens_per_sec": 148006, "dt_s": 4.428, "eta_s": 35812, "world_size": 1, "timestamp": "2026-05-04T21:55:45.576975"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12270, "epoch": 0, "train_loss": 3.865590885281563, "train_ppl": 47.73146792358149, "lr": 0.00056, "grad_norm": 0.6943, "tokens_per_sec": 147253, "dt_s": 4.451, "eta_s": 35727, "world_size": 1, "timestamp": "2026-05-04T21:55:50.027550"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12280, "epoch": 0, "train_loss": 3.975672125816345, "train_ppl": 53.28591971467962, "lr": 0.00056, "grad_norm": 0.7044, "tokens_per_sec": 147860, "dt_s": 4.432, "eta_s": 35749, "world_size": 1, "timestamp": "2026-05-04T21:55:54.459855"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12290, "epoch": 0, "train_loss": 4.029494300484657, "train_ppl": 56.23246732422261, "lr": 0.00056, "grad_norm": 0.7418, "tokens_per_sec": 143471, "dt_s": 4.568, "eta_s": 35914, "world_size": 1, "timestamp": "2026-05-04T21:55:59.027740"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12300, "epoch": 0, "train_loss": 3.976783439517021, "train_ppl": 53.3451700040429, "lr": 0.00056, "grad_norm": 0.7207, "tokens_per_sec": 148538, "dt_s": 4.412, "eta_s": 35831, "world_size": 1, "timestamp": "2026-05-04T21:56:03.439826"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12310, "epoch": 0, "train_loss": 4.021275043487549, "train_ppl": 55.7721724559227, "lr": 0.00056, "grad_norm": 0.7597, "tokens_per_sec": 149042, "dt_s": 4.397, "eta_s": 35777, "world_size": 1, "timestamp": "2026-05-04T21:56:07.836935"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12320, "epoch": 0, "train_loss": 3.9955794662237167, "train_ppl": 54.35732973608677, "lr": 0.00056, "grad_norm": 0.7032, "tokens_per_sec": 146602, "dt_s": 4.47, "eta_s": 35804, "world_size": 1, "timestamp": "2026-05-04T21:56:12.307302"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12330, "epoch": 0, "train_loss": 3.937968283891678, "train_ppl": 51.314239352821744, "lr": 0.00056, "grad_norm": 0.7264, "tokens_per_sec": 148774, "dt_s": 4.405, "eta_s": 35756, "world_size": 1, "timestamp": "2026-05-04T21:56:16.712389"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12340, "epoch": 0, "train_loss": 3.9905535727739334, "train_ppl": 54.084820962270875, "lr": 0.00056, "grad_norm": 0.6971, "tokens_per_sec": 132044, "dt_s": 4.963, "eta_s": 36387, "world_size": 1, "timestamp": "2026-05-04T21:56:21.675551"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12350, "epoch": 0, "train_loss": 3.9330511689186096, "train_ppl": 51.06254066100262, "lr": 0.00056, "grad_norm": 0.6922, "tokens_per_sec": 146614, "dt_s": 4.47, "eta_s": 36475, "world_size": 1, "timestamp": "2026-05-04T21:56:26.145500"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12360, "epoch": 0, "train_loss": 3.9642070531845093, "train_ppl": 52.678481589863914, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 146580, "dt_s": 4.471, "eta_s": 36589, "world_size": 1, "timestamp": "2026-05-04T21:56:30.616508"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12370, "epoch": 0, "train_loss": 4.010615780949593, "train_ppl": 55.1808394133984, "lr": 0.00056, "grad_norm": 0.7671, "tokens_per_sec": 145568, "dt_s": 4.502, "eta_s": 36635, "world_size": 1, "timestamp": "2026-05-04T21:56:35.118576"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12380, "epoch": 0, "train_loss": 3.969640076160431, "train_ppl": 52.9654638754319, "lr": 0.00056, "grad_norm": 0.6882, "tokens_per_sec": 147516, "dt_s": 4.443, "eta_s": 36691, "world_size": 1, "timestamp": "2026-05-04T21:56:39.561214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12390, "epoch": 0, "train_loss": 3.8667216300964355, "train_ppl": 47.78547055928859, "lr": 0.00056, "grad_norm": 0.7519, "tokens_per_sec": 146275, "dt_s": 4.48, "eta_s": 35911, "world_size": 1, "timestamp": "2026-05-04T21:56:44.041524"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12400, "epoch": 0, "train_loss": 4.010816231369972, "train_ppl": 55.1919015445231, "lr": 0.00056, "grad_norm": 0.7635, "tokens_per_sec": 145287, "dt_s": 4.511, "eta_s": 35972, "world_size": 1, "timestamp": "2026-05-04T21:56:48.552334"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12410, "epoch": 0, "train_loss": 3.952836662530899, "train_ppl": 52.082899094648646, "lr": 0.00056, "grad_norm": 0.6844, "tokens_per_sec": 147836, "dt_s": 4.433, "eta_s": 35907, "world_size": 1, "timestamp": "2026-05-04T21:56:52.985360"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12420, "epoch": 0, "train_loss": 3.885645091533661, "train_ppl": 48.69834722042382, "lr": 0.00056, "grad_norm": 0.7565, "tokens_per_sec": 148143, "dt_s": 4.424, "eta_s": 35777, "world_size": 1, "timestamp": "2026-05-04T21:56:57.409204"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12430, "epoch": 0, "train_loss": 3.9298422932624817, "train_ppl": 50.89894992886731, "lr": 0.00056, "grad_norm": 0.6368, "tokens_per_sec": 147471, "dt_s": 4.444, "eta_s": 35775, "world_size": 1, "timestamp": "2026-05-04T21:57:01.853159"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12440, "epoch": 0, "train_loss": 3.9812070429325104, "train_ppl": 53.58167058700716, "lr": 0.00056, "grad_norm": 0.7026, "tokens_per_sec": 148116, "dt_s": 4.425, "eta_s": 35681, "world_size": 1, "timestamp": "2026-05-04T21:57:06.277804"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12450, "epoch": 0, "train_loss": 4.005399018526077, "train_ppl": 54.89372364237631, "lr": 0.00056, "grad_norm": 0.6919, "tokens_per_sec": 147105, "dt_s": 4.455, "eta_s": 35587, "world_size": 1, "timestamp": "2026-05-04T21:57:10.732840"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12460, "epoch": 0, "train_loss": 3.96388678252697, "train_ppl": 52.661612919341245, "lr": 0.00056, "grad_norm": 0.724, "tokens_per_sec": 146331, "dt_s": 4.479, "eta_s": 35656, "world_size": 1, "timestamp": "2026-05-04T21:57:15.211451"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12470, "epoch": 0, "train_loss": 4.030362516641617, "train_ppl": 56.2813104610313, "lr": 0.00056, "grad_norm": 0.7418, "tokens_per_sec": 148808, "dt_s": 4.404, "eta_s": 35619, "world_size": 1, "timestamp": "2026-05-04T21:57:19.615501"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12480, "epoch": 0, "train_loss": 4.033889889717102, "train_ppl": 56.48018618844077, "lr": 0.00056, "grad_norm": 0.7323, "tokens_per_sec": 146459, "dt_s": 4.475, "eta_s": 35664, "world_size": 1, "timestamp": "2026-05-04T21:57:24.090231"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12490, "epoch": 0, "train_loss": 3.9239334613084793, "train_ppl": 50.59908339006422, "lr": 0.00056, "grad_norm": 0.7164, "tokens_per_sec": 149524, "dt_s": 4.383, "eta_s": 35593, "world_size": 1, "timestamp": "2026-05-04T21:57:28.473201"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12500, "epoch": 0, "train_loss": 3.937643274664879, "train_ppl": 51.297564461459466, "lr": 0.00056, "grad_norm": 0.7037, "tokens_per_sec": 147895, "dt_s": 4.431, "eta_s": 35550, "world_size": 1, "timestamp": "2026-05-04T21:57:32.904435"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12510, "epoch": 0, "train_loss": 4.006634384393692, "train_ppl": 54.96157937961151, "lr": 0.00056, "grad_norm": 0.7141, "tokens_per_sec": 108510, "dt_s": 6.04, "eta_s": 35491, "world_size": 1, "timestamp": "2026-05-04T21:57:38.944095"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12520, "epoch": 0, "train_loss": 3.9220881164073944, "train_ppl": 50.50579672903533, "lr": 0.00056, "grad_norm": 0.7328, "tokens_per_sec": 145989, "dt_s": 4.489, "eta_s": 35623, "world_size": 1, "timestamp": "2026-05-04T21:57:43.433215"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12530, "epoch": 0, "train_loss": 3.9017993956804276, "train_ppl": 49.491423685283436, "lr": 0.00056, "grad_norm": 0.7204, "tokens_per_sec": 148202, "dt_s": 4.422, "eta_s": 35534, "world_size": 1, "timestamp": "2026-05-04T21:57:47.855269"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12540, "epoch": 0, "train_loss": 4.090869888663292, "train_ppl": 59.79188136801613, "lr": 0.00056, "grad_norm": 0.716, "tokens_per_sec": 147820, "dt_s": 4.433, "eta_s": 35610, "world_size": 1, "timestamp": "2026-05-04T21:57:52.288748"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12550, "epoch": 0, "train_loss": 3.928205206990242, "train_ppl": 50.81569212536553, "lr": 0.00056, "grad_norm": 0.6803, "tokens_per_sec": 149260, "dt_s": 4.391, "eta_s": 35541, "world_size": 1, "timestamp": "2026-05-04T21:57:56.679461"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12560, "epoch": 0, "train_loss": 3.930032402276993, "train_ppl": 50.908627197916836, "lr": 0.00056, "grad_norm": 0.7118, "tokens_per_sec": 148634, "dt_s": 4.409, "eta_s": 35480, "world_size": 1, "timestamp": "2026-05-04T21:58:01.088672"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12570, "epoch": 0, "train_loss": 3.876208409667015, "train_ppl": 48.240957923158454, "lr": 0.00056, "grad_norm": 0.7138, "tokens_per_sec": 148104, "dt_s": 4.425, "eta_s": 35373, "world_size": 1, "timestamp": "2026-05-04T21:58:05.513681"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12580, "epoch": 0, "train_loss": 3.935856908559799, "train_ppl": 51.20601003024536, "lr": 0.00056, "grad_norm": 0.6527, "tokens_per_sec": 150045, "dt_s": 4.368, "eta_s": 35282, "world_size": 1, "timestamp": "2026-05-04T21:58:09.881454"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12590, "epoch": 0, "train_loss": 3.937631368637085, "train_ppl": 51.296953714867016, "lr": 0.00056, "grad_norm": 0.7396, "tokens_per_sec": 147785, "dt_s": 4.435, "eta_s": 35279, "world_size": 1, "timestamp": "2026-05-04T21:58:14.316009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12600, "epoch": 0, "train_loss": 3.9368505626916885, "train_ppl": 51.256916381153914, "lr": 0.00056, "grad_norm": 0.6945, "tokens_per_sec": 150397, "dt_s": 4.358, "eta_s": 35222, "world_size": 1, "timestamp": "2026-05-04T21:58:18.673548"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12610, "epoch": 0, "train_loss": 3.9291264712810516, "train_ppl": 50.862528378904756, "lr": 0.00056, "grad_norm": 0.7447, "tokens_per_sec": 147260, "dt_s": 4.45, "eta_s": 35283, "world_size": 1, "timestamp": "2026-05-04T21:58:23.123910"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12620, "epoch": 0, "train_loss": 3.9744101613759995, "train_ppl": 53.21871719134459, "lr": 0.00056, "grad_norm": 0.7234, "tokens_per_sec": 146860, "dt_s": 4.462, "eta_s": 35339, "world_size": 1, "timestamp": "2026-05-04T21:58:27.586392"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12630, "epoch": 0, "train_loss": 3.8713392168283463, "train_ppl": 48.00663434262401, "lr": 0.00056, "grad_norm": 0.6831, "tokens_per_sec": 149609, "dt_s": 4.38, "eta_s": 35355, "world_size": 1, "timestamp": "2026-05-04T21:58:31.966871"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12640, "epoch": 0, "train_loss": 3.9323823153972626, "train_ppl": 51.0283987201241, "lr": 0.00056, "grad_norm": 0.6801, "tokens_per_sec": 133130, "dt_s": 4.923, "eta_s": 36132, "world_size": 1, "timestamp": "2026-05-04T21:58:36.889632"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12650, "epoch": 0, "train_loss": 3.924806982278824, "train_ppl": 50.64330206064196, "lr": 0.00056, "grad_norm": 0.702, "tokens_per_sec": 146966, "dt_s": 4.459, "eta_s": 36290, "world_size": 1, "timestamp": "2026-05-04T21:58:41.348856"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12660, "epoch": 0, "train_loss": 3.9142933785915375, "train_ppl": 50.113647620606244, "lr": 0.00056, "grad_norm": 0.6709, "tokens_per_sec": 148067, "dt_s": 4.426, "eta_s": 36247, "world_size": 1, "timestamp": "2026-05-04T21:58:45.774954"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12670, "epoch": 0, "train_loss": 3.976682588458061, "train_ppl": 53.33979035843363, "lr": 0.00056, "grad_norm": 0.7611, "tokens_per_sec": 147835, "dt_s": 4.433, "eta_s": 36195, "world_size": 1, "timestamp": "2026-05-04T21:58:50.208018"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12680, "epoch": 0, "train_loss": 3.918450638651848, "train_ppl": 50.32241673941924, "lr": 0.00056, "grad_norm": 0.7312, "tokens_per_sec": 147970, "dt_s": 4.429, "eta_s": 36268, "world_size": 1, "timestamp": "2026-05-04T21:58:54.637024"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12690, "epoch": 0, "train_loss": 4.059654593467712, "train_ppl": 57.954289831141686, "lr": 0.00056, "grad_norm": 0.7054, "tokens_per_sec": 149511, "dt_s": 4.383, "eta_s": 35401, "world_size": 1, "timestamp": "2026-05-04T21:58:59.020399"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12700, "epoch": 0, "train_loss": 3.995281308889389, "train_ppl": 54.34112511543478, "lr": 0.00056, "grad_norm": 0.6769, "tokens_per_sec": 148531, "dt_s": 4.412, "eta_s": 35321, "world_size": 1, "timestamp": "2026-05-04T21:59:03.432653"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12710, "epoch": 0, "train_loss": 4.116898775100708, "train_ppl": 61.368628929716536, "lr": 0.00056, "grad_norm": 0.7257, "tokens_per_sec": 148346, "dt_s": 4.418, "eta_s": 35304, "world_size": 1, "timestamp": "2026-05-04T21:59:07.850447"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12720, "epoch": 0, "train_loss": 3.877665564417839, "train_ppl": 48.311303704077815, "lr": 0.00056, "grad_norm": 0.7292, "tokens_per_sec": 148515, "dt_s": 4.413, "eta_s": 35267, "world_size": 1, "timestamp": "2026-05-04T21:59:12.263209"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12730, "epoch": 0, "train_loss": 3.958487033843994, "train_ppl": 52.37801979919197, "lr": 0.00056, "grad_norm": 0.683, "tokens_per_sec": 146215, "dt_s": 4.482, "eta_s": 35347, "world_size": 1, "timestamp": "2026-05-04T21:59:16.745384"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12740, "epoch": 0, "train_loss": 4.197986364364624, "train_ppl": 66.55218417513075, "lr": 0.00056, "grad_norm": 0.8289, "tokens_per_sec": 150629, "dt_s": 4.351, "eta_s": 35291, "world_size": 1, "timestamp": "2026-05-04T21:59:21.096196"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12750, "epoch": 0, "train_loss": 3.9651558846235275, "train_ppl": 52.72848830957836, "lr": 0.00056, "grad_norm": 0.714, "tokens_per_sec": 148692, "dt_s": 4.408, "eta_s": 35279, "world_size": 1, "timestamp": "2026-05-04T21:59:25.503702"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12760, "epoch": 0, "train_loss": 3.9154964685440063, "train_ppl": 50.17397512897312, "lr": 0.00056, "grad_norm": 0.7713, "tokens_per_sec": 148062, "dt_s": 4.426, "eta_s": 35288, "world_size": 1, "timestamp": "2026-05-04T21:59:29.929946"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12770, "epoch": 0, "train_loss": 3.8658025562763214, "train_ppl": 47.74157236024852, "lr": 0.00056, "grad_norm": 0.6874, "tokens_per_sec": 150419, "dt_s": 4.357, "eta_s": 35194, "world_size": 1, "timestamp": "2026-05-04T21:59:34.286843"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12780, "epoch": 0, "train_loss": 3.9149539321660995, "train_ppl": 50.14676130515369, "lr": 0.00056, "grad_norm": 0.8327, "tokens_per_sec": 148147, "dt_s": 4.424, "eta_s": 35096, "world_size": 1, "timestamp": "2026-05-04T21:59:38.710548"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12790, "epoch": 0, "train_loss": 3.9121655970811844, "train_ppl": 50.00713009098556, "lr": 0.00056, "grad_norm": 0.7037, "tokens_per_sec": 147094, "dt_s": 4.455, "eta_s": 35259, "world_size": 1, "timestamp": "2026-05-04T21:59:43.165949"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12800, "epoch": 0, "train_loss": 3.9697408080101013, "train_ppl": 52.97079945330353, "lr": 0.00056, "grad_norm": 0.6727, "tokens_per_sec": 145859, "dt_s": 4.493, "eta_s": 35391, "world_size": 1, "timestamp": "2026-05-04T21:59:47.659063"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12810, "epoch": 0, "train_loss": 3.939514845609665, "train_ppl": 51.393661390703635, "lr": 0.00056, "grad_norm": 0.6826, "tokens_per_sec": 147024, "dt_s": 4.458, "eta_s": 35437, "world_size": 1, "timestamp": "2026-05-04T21:59:52.116560"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12820, "epoch": 0, "train_loss": 4.0748914778232574, "train_ppl": 58.84409435262197, "lr": 0.00056, "grad_norm": 0.6931, "tokens_per_sec": 147993, "dt_s": 4.428, "eta_s": 35547, "world_size": 1, "timestamp": "2026-05-04T21:59:56.544873"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12830, "epoch": 0, "train_loss": 3.9361152350902557, "train_ppl": 51.21923960985713, "lr": 0.00056, "grad_norm": 0.6583, "tokens_per_sec": 149443, "dt_s": 4.385, "eta_s": 35481, "world_size": 1, "timestamp": "2026-05-04T22:00:00.930240"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12840, "epoch": 0, "train_loss": 3.9276502430438995, "train_ppl": 50.787499072115146, "lr": 0.00056, "grad_norm": 0.6425, "tokens_per_sec": 147446, "dt_s": 4.445, "eta_s": 35459, "world_size": 1, "timestamp": "2026-05-04T22:00:05.374979"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12850, "epoch": 0, "train_loss": 3.9513362646102905, "train_ppl": 52.00481261619171, "lr": 0.00056, "grad_norm": 0.7373, "tokens_per_sec": 148192, "dt_s": 4.422, "eta_s": 35342, "world_size": 1, "timestamp": "2026-05-04T22:00:09.797374"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12860, "epoch": 0, "train_loss": 3.830575332045555, "train_ppl": 46.089047113532885, "lr": 0.00056, "grad_norm": 0.6916, "tokens_per_sec": 148677, "dt_s": 4.408, "eta_s": 35259, "world_size": 1, "timestamp": "2026-05-04T22:00:14.205321"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12870, "epoch": 0, "train_loss": 3.9614191949367523, "train_ppl": 52.53182597300564, "lr": 0.00056, "grad_norm": 0.792, "tokens_per_sec": 144350, "dt_s": 4.54, "eta_s": 35432, "world_size": 1, "timestamp": "2026-05-04T22:00:18.745382"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12880, "epoch": 0, "train_loss": 3.95242477953434, "train_ppl": 52.06145145136306, "lr": 0.00056, "grad_norm": 0.7455, "tokens_per_sec": 149506, "dt_s": 4.383, "eta_s": 35425, "world_size": 1, "timestamp": "2026-05-04T22:00:23.128899"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12890, "epoch": 0, "train_loss": 3.993119850754738, "train_ppl": 54.22379589528579, "lr": 0.00056, "grad_norm": 0.763, "tokens_per_sec": 147775, "dt_s": 4.435, "eta_s": 35405, "world_size": 1, "timestamp": "2026-05-04T22:00:27.563740"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12900, "epoch": 0, "train_loss": 3.897579848766327, "train_ppl": 49.28303226906967, "lr": 0.00056, "grad_norm": 0.7375, "tokens_per_sec": 146975, "dt_s": 4.459, "eta_s": 35459, "world_size": 1, "timestamp": "2026-05-04T22:00:32.022718"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12910, "epoch": 0, "train_loss": 3.8516277223825455, "train_ppl": 47.069617179810855, "lr": 0.00056, "grad_norm": 0.7197, "tokens_per_sec": 148771, "dt_s": 4.405, "eta_s": 35450, "world_size": 1, "timestamp": "2026-05-04T22:00:36.427884"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12920, "epoch": 0, "train_loss": 4.0105646550655365, "train_ppl": 55.17801831631659, "lr": 0.00056, "grad_norm": 0.7098, "tokens_per_sec": 148155, "dt_s": 4.423, "eta_s": 35260, "world_size": 1, "timestamp": "2026-05-04T22:00:40.851367"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12930, "epoch": 0, "train_loss": 3.879178985953331, "train_ppl": 48.384474426618766, "lr": 0.00056, "grad_norm": 0.6658, "tokens_per_sec": 130941, "dt_s": 5.005, "eta_s": 36246, "world_size": 1, "timestamp": "2026-05-04T22:00:45.856368"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12940, "epoch": 0, "train_loss": 3.7930065244436264, "train_ppl": 44.38965844679515, "lr": 0.00056, "grad_norm": 0.6997, "tokens_per_sec": 149995, "dt_s": 4.369, "eta_s": 36137, "world_size": 1, "timestamp": "2026-05-04T22:00:50.225578"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12950, "epoch": 0, "train_loss": 3.9560161232948303, "train_ppl": 52.248758160265595, "lr": 0.00056, "grad_norm": 0.6953, "tokens_per_sec": 148565, "dt_s": 4.411, "eta_s": 36056, "world_size": 1, "timestamp": "2026-05-04T22:00:54.636837"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12960, "epoch": 0, "train_loss": 4.122368901968002, "train_ppl": 61.70524293694172, "lr": 0.00056, "grad_norm": 0.7733, "tokens_per_sec": 144888, "dt_s": 4.523, "eta_s": 36240, "world_size": 1, "timestamp": "2026-05-04T22:00:59.160067"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12970, "epoch": 0, "train_loss": 3.8296459317207336, "train_ppl": 46.046231837522505, "lr": 0.00056, "grad_norm": 0.7759, "tokens_per_sec": 149113, "dt_s": 4.395, "eta_s": 36190, "world_size": 1, "timestamp": "2026-05-04T22:01:03.555138"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12980, "epoch": 0, "train_loss": 4.034689337015152, "train_ppl": 56.52535717418793, "lr": 0.00056, "grad_norm": 0.7369, "tokens_per_sec": 146134, "dt_s": 4.485, "eta_s": 35356, "world_size": 1, "timestamp": "2026-05-04T22:01:08.039774"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 12990, "epoch": 0, "train_loss": 3.9644061774015427, "train_ppl": 52.688972195697175, "lr": 0.00056, "grad_norm": 0.6985, "tokens_per_sec": 150066, "dt_s": 4.367, "eta_s": 35349, "world_size": 1, "timestamp": "2026-05-04T22:01:12.406919"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13000, "epoch": 0, "train_loss": 3.891470119357109, "train_ppl": 48.982844245043395, "lr": 0.00056, "grad_norm": 0.7543, "tokens_per_sec": 149601, "dt_s": 4.381, "eta_s": 35296, "world_size": 1, "timestamp": "2026-05-04T22:01:16.787654"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13010, "epoch": 0, "train_loss": 3.9920611828565598, "train_ppl": 54.166421278945016, "lr": 0.00056, "grad_norm": 0.7387, "tokens_per_sec": 106831, "dt_s": 6.135, "eta_s": 35371, "world_size": 1, "timestamp": "2026-05-04T22:01:22.922217"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13020, "epoch": 0, "train_loss": 3.9220619797706604, "train_ppl": 50.50447669462397, "lr": 0.00056, "grad_norm": 0.6731, "tokens_per_sec": 144157, "dt_s": 4.546, "eta_s": 35607, "world_size": 1, "timestamp": "2026-05-04T22:01:27.468356"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13030, "epoch": 0, "train_loss": 3.9458473920822144, "train_ppl": 51.72014679080331, "lr": 0.00056, "grad_norm": 0.7042, "tokens_per_sec": 148597, "dt_s": 4.41, "eta_s": 35484, "world_size": 1, "timestamp": "2026-05-04T22:01:31.878667"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13040, "epoch": 0, "train_loss": 3.947669267654419, "train_ppl": 51.81446035053024, "lr": 0.00056, "grad_norm": 0.6863, "tokens_per_sec": 148333, "dt_s": 4.418, "eta_s": 35561, "world_size": 1, "timestamp": "2026-05-04T22:01:36.296866"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13050, "epoch": 0, "train_loss": 4.004419103264809, "train_ppl": 54.8399587916324, "lr": 0.00056, "grad_norm": 0.7447, "tokens_per_sec": 149270, "dt_s": 4.39, "eta_s": 35572, "world_size": 1, "timestamp": "2026-05-04T22:01:40.687337"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13060, "epoch": 0, "train_loss": 4.044986292719841, "train_ppl": 57.110403200483, "lr": 0.00056, "grad_norm": 0.6923, "tokens_per_sec": 147034, "dt_s": 4.457, "eta_s": 35383, "world_size": 1, "timestamp": "2026-05-04T22:01:45.144493"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13070, "epoch": 0, "train_loss": 3.895170509815216, "train_ppl": 49.164435666871775, "lr": 0.00056, "grad_norm": 0.6686, "tokens_per_sec": 147194, "dt_s": 4.452, "eta_s": 35229, "world_size": 1, "timestamp": "2026-05-04T22:01:49.596852"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13080, "epoch": 0, "train_loss": 3.978650003671646, "train_ppl": 53.44483517297409, "lr": 0.00056, "grad_norm": 0.6698, "tokens_per_sec": 148389, "dt_s": 4.417, "eta_s": 35234, "world_size": 1, "timestamp": "2026-05-04T22:01:54.013375"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13090, "epoch": 0, "train_loss": 3.966913729906082, "train_ppl": 52.82125834780818, "lr": 0.00056, "grad_norm": 0.7303, "tokens_per_sec": 145002, "dt_s": 4.52, "eta_s": 35392, "world_size": 1, "timestamp": "2026-05-04T22:01:58.533030"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13100, "epoch": 0, "train_loss": 3.91097728908062, "train_ppl": 49.947741511163876, "lr": 0.00056, "grad_norm": 0.6911, "tokens_per_sec": 148146, "dt_s": 4.424, "eta_s": 35440, "world_size": 1, "timestamp": "2026-05-04T22:02:02.956775"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13110, "epoch": 0, "train_loss": 4.016825243830681, "train_ppl": 55.524548808469966, "lr": 0.00056, "grad_norm": 0.7236, "tokens_per_sec": 149009, "dt_s": 4.398, "eta_s": 35342, "world_size": 1, "timestamp": "2026-05-04T22:02:07.354884"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13120, "epoch": 0, "train_loss": 3.9316384196281433, "train_ppl": 50.99045302578188, "lr": 0.00056, "grad_norm": 0.7515, "tokens_per_sec": 144275, "dt_s": 4.542, "eta_s": 35481, "world_size": 1, "timestamp": "2026-05-04T22:02:11.897345"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13130, "epoch": 0, "train_loss": 3.8057702630758286, "train_ppl": 44.959867704962704, "lr": 0.00056, "grad_norm": 0.8435, "tokens_per_sec": 148906, "dt_s": 4.401, "eta_s": 35452, "world_size": 1, "timestamp": "2026-05-04T22:02:16.298480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13140, "epoch": 0, "train_loss": 3.897820830345154, "train_ppl": 49.29491000309532, "lr": 0.00056, "grad_norm": 0.7009, "tokens_per_sec": 148325, "dt_s": 4.418, "eta_s": 35286, "world_size": 1, "timestamp": "2026-05-04T22:02:20.716901"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13150, "epoch": 0, "train_loss": 3.8224907219409943, "train_ppl": 45.717937297949454, "lr": 0.00056, "grad_norm": 0.6928, "tokens_per_sec": 145783, "dt_s": 4.495, "eta_s": 35396, "world_size": 1, "timestamp": "2026-05-04T22:02:25.212399"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13160, "epoch": 0, "train_loss": 3.8903843760490417, "train_ppl": 48.92969031068016, "lr": 0.00056, "grad_norm": 0.6586, "tokens_per_sec": 147778, "dt_s": 4.435, "eta_s": 35450, "world_size": 1, "timestamp": "2026-05-04T22:02:29.647143"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13170, "epoch": 0, "train_loss": 3.9124237447977066, "train_ppl": 50.020040983815434, "lr": 0.00056, "grad_norm": 0.7382, "tokens_per_sec": 146312, "dt_s": 4.479, "eta_s": 35345, "world_size": 1, "timestamp": "2026-05-04T22:02:34.126341"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13180, "epoch": 0, "train_loss": 4.0886359214782715, "train_ppl": 59.658457355031615, "lr": 0.00056, "grad_norm": 0.7498, "tokens_per_sec": 147165, "dt_s": 4.453, "eta_s": 35423, "world_size": 1, "timestamp": "2026-05-04T22:02:38.579572"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13190, "epoch": 0, "train_loss": 3.995294898748398, "train_ppl": 54.34186360868147, "lr": 0.00056, "grad_norm": 0.7524, "tokens_per_sec": 149919, "dt_s": 4.371, "eta_s": 35344, "world_size": 1, "timestamp": "2026-05-04T22:02:42.951011"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13200, "epoch": 0, "train_loss": 3.938026115298271, "train_ppl": 51.31720701327295, "lr": 0.00056, "grad_norm": 0.7665, "tokens_per_sec": 144915, "dt_s": 4.522, "eta_s": 35382, "world_size": 1, "timestamp": "2026-05-04T22:02:47.473393"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13210, "epoch": 0, "train_loss": 4.022690996527672, "train_ppl": 55.85119916891965, "lr": 0.00056, "grad_norm": 0.7576, "tokens_per_sec": 148503, "dt_s": 4.413, "eta_s": 35343, "world_size": 1, "timestamp": "2026-05-04T22:02:51.886473"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13220, "epoch": 0, "train_loss": 3.9447725117206573, "train_ppl": 51.66458368791474, "lr": 0.00056, "grad_norm": 0.686, "tokens_per_sec": 133311, "dt_s": 4.916, "eta_s": 36033, "world_size": 1, "timestamp": "2026-05-04T22:02:56.802513"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13230, "epoch": 0, "train_loss": 3.7754180133342743, "train_ppl": 43.61573646507992, "lr": 0.00056, "grad_norm": 1.1434, "tokens_per_sec": 143875, "dt_s": 4.555, "eta_s": 36190, "world_size": 1, "timestamp": "2026-05-04T22:03:01.357570"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13240, "epoch": 0, "train_loss": 3.864996910095215, "train_ppl": 47.70312503434667, "lr": 0.00056, "grad_norm": 0.6727, "tokens_per_sec": 147547, "dt_s": 4.442, "eta_s": 36297, "world_size": 1, "timestamp": "2026-05-04T22:03:05.799317"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13250, "epoch": 0, "train_loss": 4.087014377117157, "train_ppl": 59.561796910693964, "lr": 0.00056, "grad_norm": 0.8919, "tokens_per_sec": 146496, "dt_s": 4.474, "eta_s": 36215, "world_size": 1, "timestamp": "2026-05-04T22:03:10.272855"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13260, "epoch": 0, "train_loss": 3.9586561769247055, "train_ppl": 52.386879928115995, "lr": 0.00056, "grad_norm": 0.6965, "tokens_per_sec": 146122, "dt_s": 4.485, "eta_s": 36325, "world_size": 1, "timestamp": "2026-05-04T22:03:14.757868"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13270, "epoch": 0, "train_loss": 3.990997329354286, "train_ppl": 54.10882678344671, "lr": 0.00056, "grad_norm": 0.6884, "tokens_per_sec": 148336, "dt_s": 4.418, "eta_s": 35529, "world_size": 1, "timestamp": "2026-05-04T22:03:19.175932"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13280, "epoch": 0, "train_loss": 3.969978779554367, "train_ppl": 52.983406496249664, "lr": 0.00056, "grad_norm": 0.67, "tokens_per_sec": 147973, "dt_s": 4.429, "eta_s": 35325, "world_size": 1, "timestamp": "2026-05-04T22:03:23.604850"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13290, "epoch": 0, "train_loss": 4.046950086951256, "train_ppl": 57.22266647574958, "lr": 0.00056, "grad_norm": 0.6973, "tokens_per_sec": 146740, "dt_s": 4.466, "eta_s": 35359, "world_size": 1, "timestamp": "2026-05-04T22:03:28.070974"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13300, "epoch": 0, "train_loss": 3.8865728229284286, "train_ppl": 48.74354716947749, "lr": 0.00056, "grad_norm": 0.6983, "tokens_per_sec": 149186, "dt_s": 4.393, "eta_s": 35227, "world_size": 1, "timestamp": "2026-05-04T22:03:32.463905"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13310, "epoch": 0, "train_loss": 3.9184237122535706, "train_ppl": 50.321061756226335, "lr": 0.00056, "grad_norm": 0.6481, "tokens_per_sec": 144499, "dt_s": 4.535, "eta_s": 35302, "world_size": 1, "timestamp": "2026-05-04T22:03:36.999310"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13320, "epoch": 0, "train_loss": 3.9677498787641525, "train_ppl": 52.86544325266211, "lr": 0.00056, "grad_norm": 0.862, "tokens_per_sec": 147436, "dt_s": 4.445, "eta_s": 35340, "world_size": 1, "timestamp": "2026-05-04T22:03:41.444357"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13330, "epoch": 0, "train_loss": 4.043430894613266, "train_ppl": 57.0216428342266, "lr": 0.00056, "grad_norm": 0.671, "tokens_per_sec": 148368, "dt_s": 4.417, "eta_s": 35317, "world_size": 1, "timestamp": "2026-05-04T22:03:45.861460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13340, "epoch": 0, "train_loss": 3.9169005006551743, "train_ppl": 50.24447047848568, "lr": 0.00056, "grad_norm": 0.8269, "tokens_per_sec": 144813, "dt_s": 4.526, "eta_s": 35407, "world_size": 1, "timestamp": "2026-05-04T22:03:50.387026"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13350, "epoch": 0, "train_loss": 3.974358096718788, "train_ppl": 53.21594644920629, "lr": 0.00056, "grad_norm": 0.6844, "tokens_per_sec": 148277, "dt_s": 4.42, "eta_s": 35445, "world_size": 1, "timestamp": "2026-05-04T22:03:54.806843"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13360, "epoch": 0, "train_loss": 3.8766874819993973, "train_ppl": 48.264074368169545, "lr": 0.00056, "grad_norm": 0.6998, "tokens_per_sec": 150067, "dt_s": 4.367, "eta_s": 35174, "world_size": 1, "timestamp": "2026-05-04T22:03:59.173971"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13370, "epoch": 0, "train_loss": 3.9303221255540848, "train_ppl": 50.92337874905155, "lr": 0.00056, "grad_norm": 0.7266, "tokens_per_sec": 145180, "dt_s": 4.514, "eta_s": 35279, "world_size": 1, "timestamp": "2026-05-04T22:04:03.688095"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13380, "epoch": 0, "train_loss": 4.002837717533112, "train_ppl": 54.75330419849971, "lr": 0.00056, "grad_norm": 0.7858, "tokens_per_sec": 149804, "dt_s": 4.375, "eta_s": 35207, "world_size": 1, "timestamp": "2026-05-04T22:04:08.062868"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13390, "epoch": 0, "train_loss": 3.925227478146553, "train_ppl": 50.66460183780691, "lr": 0.00056, "grad_norm": 0.6931, "tokens_per_sec": 147504, "dt_s": 4.443, "eta_s": 35072, "world_size": 1, "timestamp": "2026-05-04T22:04:12.505859"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13400, "epoch": 0, "train_loss": 3.992165118455887, "train_ppl": 54.17205139098352, "lr": 0.00056, "grad_norm": 0.7105, "tokens_per_sec": 146566, "dt_s": 4.471, "eta_s": 35150, "world_size": 1, "timestamp": "2026-05-04T22:04:16.977294"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13410, "epoch": 0, "train_loss": 3.9867579489946365, "train_ppl": 53.87992443079226, "lr": 0.00056, "grad_norm": 0.7515, "tokens_per_sec": 149268, "dt_s": 4.39, "eta_s": 35182, "world_size": 1, "timestamp": "2026-05-04T22:04:21.367767"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13420, "epoch": 0, "train_loss": 3.9668538123369217, "train_ppl": 52.81809352122329, "lr": 0.00056, "grad_norm": 0.7655, "tokens_per_sec": 149454, "dt_s": 4.385, "eta_s": 34973, "world_size": 1, "timestamp": "2026-05-04T22:04:25.752785"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13430, "epoch": 0, "train_loss": 3.894136503338814, "train_ppl": 49.113625595482056, "lr": 0.00056, "grad_norm": 0.6636, "tokens_per_sec": 145919, "dt_s": 4.491, "eta_s": 35153, "world_size": 1, "timestamp": "2026-05-04T22:04:30.244057"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13440, "epoch": 0, "train_loss": 3.8576157838106155, "train_ppl": 47.352318510614836, "lr": 0.00056, "grad_norm": 0.7232, "tokens_per_sec": 149333, "dt_s": 4.389, "eta_s": 35063, "world_size": 1, "timestamp": "2026-05-04T22:04:34.632649"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13450, "epoch": 0, "train_loss": 3.8798474818468094, "train_ppl": 48.41683006268105, "lr": 0.00056, "grad_norm": 0.6895, "tokens_per_sec": 144651, "dt_s": 4.531, "eta_s": 35152, "world_size": 1, "timestamp": "2026-05-04T22:04:39.163273"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13460, "epoch": 0, "train_loss": 3.9425127059221268, "train_ppl": 51.54796358114375, "lr": 0.00056, "grad_norm": 0.7123, "tokens_per_sec": 145523, "dt_s": 4.503, "eta_s": 35327, "world_size": 1, "timestamp": "2026-05-04T22:04:43.666796"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13470, "epoch": 0, "train_loss": 3.948266923427582, "train_ppl": 51.84543681760219, "lr": 0.00056, "grad_norm": 0.6767, "tokens_per_sec": 145465, "dt_s": 4.505, "eta_s": 35513, "world_size": 1, "timestamp": "2026-05-04T22:04:48.172019"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13480, "epoch": 0, "train_loss": 3.890175759792328, "train_ppl": 48.91948384649957, "lr": 0.00056, "grad_norm": 0.6515, "tokens_per_sec": 141158, "dt_s": 4.643, "eta_s": 35748, "world_size": 1, "timestamp": "2026-05-04T22:04:52.814778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13490, "epoch": 0, "train_loss": 3.977329134941101, "train_ppl": 53.374288163311434, "lr": 0.00056, "grad_norm": 0.7025, "tokens_per_sec": 147801, "dt_s": 4.434, "eta_s": 35816, "world_size": 1, "timestamp": "2026-05-04T22:04:57.248825"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13500, "epoch": 0, "train_loss": 3.763040065765381, "train_ppl": 43.07919068419962, "lr": 0.00056, "grad_norm": 0.7966, "tokens_per_sec": 147315, "dt_s": 4.449, "eta_s": 35681, "world_size": 1, "timestamp": "2026-05-04T22:05:01.697543"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13510, "epoch": 0, "train_loss": 3.919341340661049, "train_ppl": 50.36725898469124, "lr": 0.00056, "grad_norm": 0.6874, "tokens_per_sec": 124431, "dt_s": 5.267, "eta_s": 35649, "world_size": 1, "timestamp": "2026-05-04T22:05:06.964384"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13520, "epoch": 0, "train_loss": 4.03847736120224, "train_ppl": 56.73988265167523, "lr": 0.00056, "grad_norm": 0.7285, "tokens_per_sec": 132783, "dt_s": 4.936, "eta_s": 36326, "world_size": 1, "timestamp": "2026-05-04T22:05:11.899969"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13530, "epoch": 0, "train_loss": 3.923997685313225, "train_ppl": 50.60233317019181, "lr": 0.00056, "grad_norm": 0.7279, "tokens_per_sec": 146709, "dt_s": 4.467, "eta_s": 36043, "world_size": 1, "timestamp": "2026-05-04T22:05:16.367021"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13540, "epoch": 0, "train_loss": 3.9032066464424133, "train_ppl": 49.561119557258905, "lr": 0.00056, "grad_norm": 0.772, "tokens_per_sec": 146911, "dt_s": 4.461, "eta_s": 36081, "world_size": 1, "timestamp": "2026-05-04T22:05:20.827949"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13550, "epoch": 0, "train_loss": 3.963904693722725, "train_ppl": 52.662556160246275, "lr": 0.00056, "grad_norm": 0.68, "tokens_per_sec": 150135, "dt_s": 4.365, "eta_s": 35944, "world_size": 1, "timestamp": "2026-05-04T22:05:25.193097"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13560, "epoch": 0, "train_loss": 3.8781345933675766, "train_ppl": 48.33396841890279, "lr": 0.00056, "grad_norm": 0.7066, "tokens_per_sec": 145845, "dt_s": 4.494, "eta_s": 35952, "world_size": 1, "timestamp": "2026-05-04T22:05:29.686646"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13570, "epoch": 0, "train_loss": 3.9656015187501907, "train_ppl": 52.7519911598636, "lr": 0.00056, "grad_norm": 0.6768, "tokens_per_sec": 148200, "dt_s": 4.422, "eta_s": 35135, "world_size": 1, "timestamp": "2026-05-04T22:05:34.108775"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13580, "epoch": 0, "train_loss": 4.018314436078072, "train_ppl": 55.60729713488597, "lr": 0.00056, "grad_norm": 0.7679, "tokens_per_sec": 148722, "dt_s": 4.407, "eta_s": 35035, "world_size": 1, "timestamp": "2026-05-04T22:05:38.515394"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13590, "epoch": 0, "train_loss": 3.8667097240686417, "train_ppl": 47.784901627534836, "lr": 0.00056, "grad_norm": 0.6834, "tokens_per_sec": 145974, "dt_s": 4.49, "eta_s": 35076, "world_size": 1, "timestamp": "2026-05-04T22:05:43.005007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13600, "epoch": 0, "train_loss": 3.976972058415413, "train_ppl": 53.35523286023617, "lr": 0.00056, "grad_norm": 0.7633, "tokens_per_sec": 149398, "dt_s": 4.387, "eta_s": 35105, "world_size": 1, "timestamp": "2026-05-04T22:05:47.391628"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13610, "epoch": 0, "train_loss": 3.9964932948350906, "train_ppl": 54.4070257225863, "lr": 0.00056, "grad_norm": 0.7605, "tokens_per_sec": 147787, "dt_s": 4.434, "eta_s": 35007, "world_size": 1, "timestamp": "2026-05-04T22:05:51.826096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13620, "epoch": 0, "train_loss": 3.9292021095752716, "train_ppl": 50.86637567929083, "lr": 0.00056, "grad_norm": 0.7312, "tokens_per_sec": 144841, "dt_s": 4.525, "eta_s": 35165, "world_size": 1, "timestamp": "2026-05-04T22:05:56.350788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13630, "epoch": 0, "train_loss": 4.0055131316185, "train_ppl": 54.89998809235681, "lr": 0.00056, "grad_norm": 0.6927, "tokens_per_sec": 149872, "dt_s": 4.373, "eta_s": 35107, "world_size": 1, "timestamp": "2026-05-04T22:06:00.723563"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13640, "epoch": 0, "train_loss": 3.9203790575265884, "train_ppl": 50.41955306734366, "lr": 0.00056, "grad_norm": 0.6516, "tokens_per_sec": 145314, "dt_s": 4.51, "eta_s": 35135, "world_size": 1, "timestamp": "2026-05-04T22:06:05.233582"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13650, "epoch": 0, "train_loss": 3.982457086443901, "train_ppl": 53.64869188769761, "lr": 0.00056, "grad_norm": 0.7083, "tokens_per_sec": 147161, "dt_s": 4.453, "eta_s": 35236, "world_size": 1, "timestamp": "2026-05-04T22:06:09.686887"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13660, "epoch": 0, "train_loss": 3.878819078207016, "train_ppl": 48.36706361280257, "lr": 0.00056, "grad_norm": 0.7571, "tokens_per_sec": 148226, "dt_s": 4.421, "eta_s": 35211, "world_size": 1, "timestamp": "2026-05-04T22:06:14.108268"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13670, "epoch": 0, "train_loss": 3.965099036693573, "train_ppl": 52.725490889367705, "lr": 0.00056, "grad_norm": 0.6746, "tokens_per_sec": 146388, "dt_s": 4.477, "eta_s": 35130, "world_size": 1, "timestamp": "2026-05-04T22:06:18.585096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13680, "epoch": 0, "train_loss": 3.9514556229114532, "train_ppl": 52.01102019273339, "lr": 0.00056, "grad_norm": 0.6815, "tokens_per_sec": 147616, "dt_s": 4.44, "eta_s": 35232, "world_size": 1, "timestamp": "2026-05-04T22:06:23.024733"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13690, "epoch": 0, "train_loss": 3.8851214200258255, "train_ppl": 48.67285195965927, "lr": 0.00056, "grad_norm": 0.7049, "tokens_per_sec": 148375, "dt_s": 4.417, "eta_s": 35080, "world_size": 1, "timestamp": "2026-05-04T22:06:27.441641"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13700, "epoch": 0, "train_loss": 3.9355209469795227, "train_ppl": 51.18880966768822, "lr": 0.00056, "grad_norm": 0.747, "tokens_per_sec": 143396, "dt_s": 4.57, "eta_s": 35261, "world_size": 1, "timestamp": "2026-05-04T22:06:32.011936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13710, "epoch": 0, "train_loss": 3.882045105099678, "train_ppl": 48.52334901565108, "lr": 0.00056, "grad_norm": 0.6657, "tokens_per_sec": 144415, "dt_s": 4.538, "eta_s": 35441, "world_size": 1, "timestamp": "2026-05-04T22:06:36.549950"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13720, "epoch": 0, "train_loss": 3.960214227437973, "train_ppl": 52.4685649514484, "lr": 0.00056, "grad_norm": 0.6833, "tokens_per_sec": 145849, "dt_s": 4.493, "eta_s": 35462, "world_size": 1, "timestamp": "2026-05-04T22:06:41.043394"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13730, "epoch": 0, "train_loss": 3.948211833834648, "train_ppl": 51.842580752262904, "lr": 0.00056, "grad_norm": 0.712, "tokens_per_sec": 143993, "dt_s": 4.551, "eta_s": 35634, "world_size": 1, "timestamp": "2026-05-04T22:06:45.594713"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13740, "epoch": 0, "train_loss": 4.03032922744751, "train_ppl": 56.27943693274706, "lr": 0.00056, "grad_norm": 0.746, "tokens_per_sec": 145689, "dt_s": 4.498, "eta_s": 35758, "world_size": 1, "timestamp": "2026-05-04T22:06:50.093029"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13750, "epoch": 0, "train_loss": 4.0746365040540695, "train_ppl": 58.82909256470291, "lr": 0.00056, "grad_norm": 0.8411, "tokens_per_sec": 145095, "dt_s": 4.517, "eta_s": 35669, "world_size": 1, "timestamp": "2026-05-04T22:06:54.609810"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13760, "epoch": 0, "train_loss": 3.9606389701366425, "train_ppl": 52.490855324822746, "lr": 0.00056, "grad_norm": 0.7502, "tokens_per_sec": 144522, "dt_s": 4.535, "eta_s": 35659, "world_size": 1, "timestamp": "2026-05-04T22:06:59.144494"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13770, "epoch": 0, "train_loss": 3.950707972049713, "train_ppl": 51.97214864165135, "lr": 0.00056, "grad_norm": 0.7025, "tokens_per_sec": 146643, "dt_s": 4.469, "eta_s": 35616, "world_size": 1, "timestamp": "2026-05-04T22:07:03.613596"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13780, "epoch": 0, "train_loss": 3.952140361070633, "train_ppl": 52.046646318849234, "lr": 0.00056, "grad_norm": 0.6971, "tokens_per_sec": 144066, "dt_s": 4.549, "eta_s": 35608, "world_size": 1, "timestamp": "2026-05-04T22:07:08.162591"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13790, "epoch": 0, "train_loss": 3.9555416107177734, "train_ppl": 52.22397134867496, "lr": 0.00056, "grad_norm": 0.7729, "tokens_per_sec": 147489, "dt_s": 4.443, "eta_s": 35517, "world_size": 1, "timestamp": "2026-05-04T22:07:12.606052"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13800, "epoch": 0, "train_loss": 4.0107012540102005, "train_ppl": 55.18555609020151, "lr": 0.00056, "grad_norm": 0.6921, "tokens_per_sec": 148844, "dt_s": 4.403, "eta_s": 35333, "world_size": 1, "timestamp": "2026-05-04T22:07:17.009070"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13810, "epoch": 0, "train_loss": 3.9190917015075684, "train_ppl": 50.35468691410099, "lr": 0.00056, "grad_norm": 0.7286, "tokens_per_sec": 128720, "dt_s": 5.091, "eta_s": 36207, "world_size": 1, "timestamp": "2026-05-04T22:07:22.100472"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13820, "epoch": 0, "train_loss": 3.9615135192871094, "train_ppl": 52.53678123706097, "lr": 0.00056, "grad_norm": 0.6788, "tokens_per_sec": 147166, "dt_s": 4.453, "eta_s": 36177, "world_size": 1, "timestamp": "2026-05-04T22:07:26.553647"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13830, "epoch": 0, "train_loss": 3.846334308385849, "train_ppl": 46.82111649815894, "lr": 0.00056, "grad_norm": 0.6874, "tokens_per_sec": 145396, "dt_s": 4.507, "eta_s": 36107, "world_size": 1, "timestamp": "2026-05-04T22:07:31.061047"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13840, "epoch": 0, "train_loss": 3.9182637482881546, "train_ppl": 50.313012843429036, "lr": 0.00056, "grad_norm": 0.7206, "tokens_per_sec": 144052, "dt_s": 4.549, "eta_s": 36270, "world_size": 1, "timestamp": "2026-05-04T22:07:35.610504"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13850, "epoch": 0, "train_loss": 3.8893337845802307, "train_ppl": 48.878312188902086, "lr": 0.00056, "grad_norm": 0.7009, "tokens_per_sec": 146503, "dt_s": 4.473, "eta_s": 36376, "world_size": 1, "timestamp": "2026-05-04T22:07:40.083878"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13860, "epoch": 0, "train_loss": 3.9467233419418335, "train_ppl": 51.76547089404708, "lr": 0.00056, "grad_norm": 0.7092, "tokens_per_sec": 140968, "dt_s": 4.649, "eta_s": 35674, "world_size": 1, "timestamp": "2026-05-04T22:07:44.732878"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13870, "epoch": 0, "train_loss": 3.927375867962837, "train_ppl": 50.773566159449715, "lr": 0.00056, "grad_norm": 0.6587, "tokens_per_sec": 144300, "dt_s": 4.542, "eta_s": 35809, "world_size": 1, "timestamp": "2026-05-04T22:07:49.274540"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13880, "epoch": 0, "train_loss": 4.016508623957634, "train_ppl": 55.50697141569802, "lr": 0.00056, "grad_norm": 0.7298, "tokens_per_sec": 145077, "dt_s": 4.517, "eta_s": 35820, "world_size": 1, "timestamp": "2026-05-04T22:07:53.791858"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13890, "epoch": 0, "train_loss": 4.00049102306366, "train_ppl": 54.62496556702769, "lr": 0.00056, "grad_norm": 0.6961, "tokens_per_sec": 143131, "dt_s": 4.579, "eta_s": 35861, "world_size": 1, "timestamp": "2026-05-04T22:07:58.370611"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13900, "epoch": 0, "train_loss": 3.9233588576316833, "train_ppl": 50.57001732224023, "lr": 0.00056, "grad_norm": 0.8277, "tokens_per_sec": 144764, "dt_s": 4.527, "eta_s": 35941, "world_size": 1, "timestamp": "2026-05-04T22:08:02.897716"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13910, "epoch": 0, "train_loss": 4.0243891179561615, "train_ppl": 55.94612185935252, "lr": 0.00056, "grad_norm": 0.7771, "tokens_per_sec": 145543, "dt_s": 4.503, "eta_s": 35707, "world_size": 1, "timestamp": "2026-05-04T22:08:07.400579"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13920, "epoch": 0, "train_loss": 3.8520190119743347, "train_ppl": 47.08803863492961, "lr": 0.00056, "grad_norm": 0.6721, "tokens_per_sec": 144423, "dt_s": 4.538, "eta_s": 35696, "world_size": 1, "timestamp": "2026-05-04T22:08:11.938410"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13930, "epoch": 0, "train_loss": 3.9136589020490646, "train_ppl": 50.08186177148736, "lr": 0.00056, "grad_norm": 0.7199, "tokens_per_sec": 146616, "dt_s": 4.47, "eta_s": 35617, "world_size": 1, "timestamp": "2026-05-04T22:08:16.408275"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13940, "epoch": 0, "train_loss": 4.031685769557953, "train_ppl": 56.35583416521429, "lr": 0.00056, "grad_norm": 0.7452, "tokens_per_sec": 144853, "dt_s": 4.524, "eta_s": 35527, "world_size": 1, "timestamp": "2026-05-04T22:08:20.932570"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13950, "epoch": 0, "train_loss": 3.8938004821538925, "train_ppl": 49.09712514921869, "lr": 0.00056, "grad_norm": 0.6633, "tokens_per_sec": 144992, "dt_s": 4.52, "eta_s": 35511, "world_size": 1, "timestamp": "2026-05-04T22:08:25.452574"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13960, "epoch": 0, "train_loss": 3.9582835286855698, "train_ppl": 52.367361686502015, "lr": 0.00056, "grad_norm": 0.7836, "tokens_per_sec": 146182, "dt_s": 4.483, "eta_s": 35475, "world_size": 1, "timestamp": "2026-05-04T22:08:29.935760"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13970, "epoch": 0, "train_loss": 3.935987278819084, "train_ppl": 51.21268620622789, "lr": 0.00056, "grad_norm": 0.6924, "tokens_per_sec": 143630, "dt_s": 4.563, "eta_s": 35510, "world_size": 1, "timestamp": "2026-05-04T22:08:34.498621"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13980, "epoch": 0, "train_loss": 3.934271365404129, "train_ppl": 51.12488502210876, "lr": 0.00056, "grad_norm": 0.7131, "tokens_per_sec": 144714, "dt_s": 4.529, "eta_s": 35598, "world_size": 1, "timestamp": "2026-05-04T22:08:39.027283"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 13990, "epoch": 0, "train_loss": 3.9350163340568542, "train_ppl": 51.16298564894865, "lr": 0.00056, "grad_norm": 0.6748, "tokens_per_sec": 146403, "dt_s": 4.476, "eta_s": 35519, "world_size": 1, "timestamp": "2026-05-04T22:08:43.503702"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14000, "epoch": 0, "train_loss": 3.9478912353515625, "train_ppl": 51.82596276350781, "lr": 0.00056, "grad_norm": 0.684, "tokens_per_sec": 142936, "dt_s": 4.585, "eta_s": 35616, "world_size": 1, "timestamp": "2026-05-04T22:08:48.088741"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14010, "epoch": 0, "train_loss": 3.8567505478858948, "train_ppl": 47.3113653031687, "lr": 0.00056, "grad_norm": 0.7651, "tokens_per_sec": 107440, "dt_s": 6.1, "eta_s": 35628, "world_size": 1, "timestamp": "2026-05-04T22:08:54.188511"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14020, "epoch": 0, "train_loss": 3.9167238771915436, "train_ppl": 50.23559690974477, "lr": 0.00056, "grad_norm": 0.6491, "tokens_per_sec": 139157, "dt_s": 4.71, "eta_s": 35854, "world_size": 1, "timestamp": "2026-05-04T22:08:58.897976"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14030, "epoch": 0, "train_loss": 3.8896889239549637, "train_ppl": 48.895673884859306, "lr": 0.00056, "grad_norm": 0.7298, "tokens_per_sec": 145649, "dt_s": 4.5, "eta_s": 35803, "world_size": 1, "timestamp": "2026-05-04T22:09:03.397584"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14040, "epoch": 0, "train_loss": 3.8691647797822952, "train_ppl": 47.90236034815692, "lr": 0.00056, "grad_norm": 0.6608, "tokens_per_sec": 145811, "dt_s": 4.495, "eta_s": 35827, "world_size": 1, "timestamp": "2026-05-04T22:09:07.892169"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14050, "epoch": 0, "train_loss": 4.029394716024399, "train_ppl": 56.22686772313638, "lr": 0.00056, "grad_norm": 0.6831, "tokens_per_sec": 144876, "dt_s": 4.524, "eta_s": 35726, "world_size": 1, "timestamp": "2026-05-04T22:09:12.415773"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14060, "epoch": 0, "train_loss": 4.035022661089897, "train_ppl": 56.54420157703976, "lr": 0.00056, "grad_norm": 0.7723, "tokens_per_sec": 145827, "dt_s": 4.494, "eta_s": 35723, "world_size": 1, "timestamp": "2026-05-04T22:09:16.909881"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14070, "epoch": 0, "train_loss": 3.9766389280557632, "train_ppl": 53.337461572566326, "lr": 0.00056, "grad_norm": 0.7344, "tokens_per_sec": 146654, "dt_s": 4.469, "eta_s": 35340, "world_size": 1, "timestamp": "2026-05-04T22:09:21.378700"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14080, "epoch": 0, "train_loss": 3.892329841852188, "train_ppl": 49.02497400547026, "lr": 0.00056, "grad_norm": 0.69, "tokens_per_sec": 144908, "dt_s": 4.523, "eta_s": 35372, "world_size": 1, "timestamp": "2026-05-04T22:09:25.901223"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14090, "epoch": 0, "train_loss": 3.9317772686481476, "train_ppl": 50.99753349176059, "lr": 0.00056, "grad_norm": 0.8107, "tokens_per_sec": 145819, "dt_s": 4.494, "eta_s": 35367, "world_size": 1, "timestamp": "2026-05-04T22:09:30.395540"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14100, "epoch": 0, "train_loss": 3.94793102145195, "train_ppl": 51.82802475748408, "lr": 0.00056, "grad_norm": 0.6845, "tokens_per_sec": 147007, "dt_s": 4.458, "eta_s": 35259, "world_size": 1, "timestamp": "2026-05-04T22:09:34.853582"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14110, "epoch": 0, "train_loss": 4.053053945302963, "train_ppl": 57.57301367351677, "lr": 0.00056, "grad_norm": 0.7984, "tokens_per_sec": 131739, "dt_s": 4.975, "eta_s": 36010, "world_size": 1, "timestamp": "2026-05-04T22:09:39.828279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14120, "epoch": 0, "train_loss": 3.990958273410797, "train_ppl": 54.10671355343296, "lr": 0.00056, "grad_norm": 0.7554, "tokens_per_sec": 147296, "dt_s": 4.449, "eta_s": 35975, "world_size": 1, "timestamp": "2026-05-04T22:09:44.277519"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14130, "epoch": 0, "train_loss": 3.8868191689252853, "train_ppl": 48.75555642635073, "lr": 0.00056, "grad_norm": 0.737, "tokens_per_sec": 145627, "dt_s": 4.5, "eta_s": 35935, "world_size": 1, "timestamp": "2026-05-04T22:09:48.777777"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14140, "epoch": 0, "train_loss": 3.840263679623604, "train_ppl": 46.537743876906674, "lr": 0.00056, "grad_norm": 0.7131, "tokens_per_sec": 148624, "dt_s": 4.41, "eta_s": 35797, "world_size": 1, "timestamp": "2026-05-04T22:09:53.187291"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14150, "epoch": 0, "train_loss": 3.944755434989929, "train_ppl": 51.66370143326395, "lr": 0.00056, "grad_norm": 0.7015, "tokens_per_sec": 147876, "dt_s": 4.432, "eta_s": 35752, "world_size": 1, "timestamp": "2026-05-04T22:09:57.619101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14160, "epoch": 0, "train_loss": 3.997742012143135, "train_ppl": 54.47500715324627, "lr": 0.00056, "grad_norm": 0.7131, "tokens_per_sec": 144135, "dt_s": 4.547, "eta_s": 35075, "world_size": 1, "timestamp": "2026-05-04T22:10:02.165950"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14170, "epoch": 0, "train_loss": 3.9396505653858185, "train_ppl": 51.400637000276674, "lr": 0.00056, "grad_norm": 0.7147, "tokens_per_sec": 147579, "dt_s": 4.441, "eta_s": 35057, "world_size": 1, "timestamp": "2026-05-04T22:10:06.606721"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14180, "epoch": 0, "train_loss": 3.856157585978508, "train_ppl": 47.283319781542204, "lr": 0.00056, "grad_norm": 0.666, "tokens_per_sec": 145744, "dt_s": 4.497, "eta_s": 35047, "world_size": 1, "timestamp": "2026-05-04T22:10:11.103356"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14190, "epoch": 0, "train_loss": 3.9901818186044693, "train_ppl": 54.064718441402924, "lr": 0.00056, "grad_norm": 0.7368, "tokens_per_sec": 144233, "dt_s": 4.544, "eta_s": 35254, "world_size": 1, "timestamp": "2026-05-04T22:10:15.647126"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14200, "epoch": 0, "train_loss": 3.833644151687622, "train_ppl": 46.23070333412271, "lr": 0.00056, "grad_norm": 0.7188, "tokens_per_sec": 146552, "dt_s": 4.472, "eta_s": 35312, "world_size": 1, "timestamp": "2026-05-04T22:10:20.119008"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14210, "epoch": 0, "train_loss": 3.897346630692482, "train_ppl": 49.27153991537502, "lr": 0.00056, "grad_norm": 0.743, "tokens_per_sec": 147705, "dt_s": 4.437, "eta_s": 35135, "world_size": 1, "timestamp": "2026-05-04T22:10:24.555946"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14220, "epoch": 0, "train_loss": 3.873427912592888, "train_ppl": 48.10701038746308, "lr": 0.00056, "grad_norm": 0.7018, "tokens_per_sec": 145209, "dt_s": 4.513, "eta_s": 35244, "world_size": 1, "timestamp": "2026-05-04T22:10:29.069163"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14230, "epoch": 0, "train_loss": 4.001139208674431, "train_ppl": 54.66038416136863, "lr": 0.00056, "grad_norm": 0.7315, "tokens_per_sec": 146786, "dt_s": 4.465, "eta_s": 35190, "world_size": 1, "timestamp": "2026-05-04T22:10:33.533904"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14240, "epoch": 0, "train_loss": 3.926063910126686, "train_ppl": 50.70699705893243, "lr": 0.00056, "grad_norm": 0.6753, "tokens_per_sec": 145532, "dt_s": 4.503, "eta_s": 35121, "world_size": 1, "timestamp": "2026-05-04T22:10:38.037198"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14250, "epoch": 0, "train_loss": 3.985158532857895, "train_ppl": 53.79381688946298, "lr": 0.00056, "grad_norm": 0.7138, "tokens_per_sec": 148214, "dt_s": 4.422, "eta_s": 35038, "world_size": 1, "timestamp": "2026-05-04T22:10:42.458836"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14260, "epoch": 0, "train_loss": 3.868104636669159, "train_ppl": 47.85160390003409, "lr": 0.00056, "grad_norm": 0.6763, "tokens_per_sec": 147934, "dt_s": 4.43, "eta_s": 35023, "world_size": 1, "timestamp": "2026-05-04T22:10:46.888906"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14270, "epoch": 0, "train_loss": 3.9523743987083435, "train_ppl": 52.058828618507164, "lr": 0.00056, "grad_norm": 0.6831, "tokens_per_sec": 144512, "dt_s": 4.535, "eta_s": 35053, "world_size": 1, "timestamp": "2026-05-04T22:10:51.423887"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14280, "epoch": 0, "train_loss": 4.059276208281517, "train_ppl": 57.93236493468272, "lr": 0.00056, "grad_norm": 1.086, "tokens_per_sec": 147691, "dt_s": 4.437, "eta_s": 35005, "world_size": 1, "timestamp": "2026-05-04T22:10:55.861273"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14290, "epoch": 0, "train_loss": 3.9582763463258743, "train_ppl": 52.3669855666248, "lr": 0.00056, "grad_norm": 0.6856, "tokens_per_sec": 146861, "dt_s": 4.462, "eta_s": 34937, "world_size": 1, "timestamp": "2026-05-04T22:11:00.323736"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14300, "epoch": 0, "train_loss": 3.8556020706892014, "train_ppl": 47.25706046887451, "lr": 0.00056, "grad_norm": 0.6959, "tokens_per_sec": 143744, "dt_s": 4.559, "eta_s": 35148, "world_size": 1, "timestamp": "2026-05-04T22:11:04.882977"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14310, "epoch": 0, "train_loss": 3.9354559034109116, "train_ppl": 51.185480273113505, "lr": 0.00056, "grad_norm": 0.6883, "tokens_per_sec": 146760, "dt_s": 4.466, "eta_s": 35199, "world_size": 1, "timestamp": "2026-05-04T22:11:09.348452"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14320, "epoch": 0, "train_loss": 3.8426997363567352, "train_ppl": 46.65125065957122, "lr": 0.00056, "grad_norm": 0.6484, "tokens_per_sec": 146109, "dt_s": 4.485, "eta_s": 35117, "world_size": 1, "timestamp": "2026-05-04T22:11:13.833887"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14330, "epoch": 0, "train_loss": 3.94184772670269, "train_ppl": 51.51369665121889, "lr": 0.00056, "grad_norm": 0.6962, "tokens_per_sec": 143774, "dt_s": 4.558, "eta_s": 35302, "world_size": 1, "timestamp": "2026-05-04T22:11:18.392144"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14340, "epoch": 0, "train_loss": 3.9757883548736572, "train_ppl": 53.29211344683491, "lr": 0.00056, "grad_norm": 0.7262, "tokens_per_sec": 147381, "dt_s": 4.447, "eta_s": 35273, "world_size": 1, "timestamp": "2026-05-04T22:11:22.838868"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14350, "epoch": 0, "train_loss": 3.925262525677681, "train_ppl": 50.66637753813368, "lr": 0.00056, "grad_norm": 0.7013, "tokens_per_sec": 146336, "dt_s": 4.478, "eta_s": 35142, "world_size": 1, "timestamp": "2026-05-04T22:11:27.317335"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14360, "epoch": 0, "train_loss": 3.9082425087690353, "train_ppl": 49.8113320210824, "lr": 0.00056, "grad_norm": 0.6961, "tokens_per_sec": 147783, "dt_s": 4.435, "eta_s": 35089, "world_size": 1, "timestamp": "2026-05-04T22:11:31.751930"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14370, "epoch": 0, "train_loss": 3.955196291208267, "train_ppl": 52.205940505884435, "lr": 0.00056, "grad_norm": 0.7076, "tokens_per_sec": 147655, "dt_s": 4.438, "eta_s": 35011, "world_size": 1, "timestamp": "2026-05-04T22:11:36.190404"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14380, "epoch": 0, "train_loss": 3.980953112244606, "train_ppl": 53.568066283884065, "lr": 0.00056, "grad_norm": 0.7044, "tokens_per_sec": 145339, "dt_s": 4.509, "eta_s": 34930, "world_size": 1, "timestamp": "2026-05-04T22:11:40.699565"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14390, "epoch": 0, "train_loss": 4.017083689570427, "train_ppl": 55.53890074607952, "lr": 0.00056, "grad_norm": 0.6984, "tokens_per_sec": 148516, "dt_s": 4.413, "eta_s": 34872, "world_size": 1, "timestamp": "2026-05-04T22:11:45.112305"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14400, "epoch": 0, "train_loss": 3.8986308723688126, "train_ppl": 49.33485712899131, "lr": 0.00056, "grad_norm": 0.6654, "tokens_per_sec": 132721, "dt_s": 4.938, "eta_s": 35587, "world_size": 1, "timestamp": "2026-05-04T22:11:50.050166"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14410, "epoch": 0, "train_loss": 4.078933283686638, "train_ppl": 59.08241205070283, "lr": 0.00056, "grad_norm": 0.98, "tokens_per_sec": 145131, "dt_s": 4.516, "eta_s": 35709, "world_size": 1, "timestamp": "2026-05-04T22:11:54.565823"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14420, "epoch": 0, "train_loss": 4.095381334424019, "train_ppl": 60.06223859016017, "lr": 0.00056, "grad_norm": 0.7484, "tokens_per_sec": 149532, "dt_s": 4.383, "eta_s": 35617, "world_size": 1, "timestamp": "2026-05-04T22:11:58.948572"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14430, "epoch": 0, "train_loss": 3.9544721096754074, "train_ppl": 52.16814761397337, "lr": 0.00056, "grad_norm": 0.7126, "tokens_per_sec": 147409, "dt_s": 4.446, "eta_s": 35514, "world_size": 1, "timestamp": "2026-05-04T22:12:03.394428"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14440, "epoch": 0, "train_loss": 3.8899723887443542, "train_ppl": 48.90953605138442, "lr": 0.00056, "grad_norm": 0.6832, "tokens_per_sec": 146730, "dt_s": 4.466, "eta_s": 35593, "world_size": 1, "timestamp": "2026-05-04T22:12:07.860873"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14450, "epoch": 0, "train_loss": 3.9814315885305405, "train_ppl": 53.59370346618704, "lr": 0.00056, "grad_norm": 0.6987, "tokens_per_sec": 146817, "dt_s": 4.464, "eta_s": 34847, "world_size": 1, "timestamp": "2026-05-04T22:12:12.324702"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14460, "epoch": 0, "train_loss": 3.9632547050714493, "train_ppl": 52.628337218561576, "lr": 0.00056, "grad_norm": 0.6995, "tokens_per_sec": 146628, "dt_s": 4.47, "eta_s": 34770, "world_size": 1, "timestamp": "2026-05-04T22:12:16.794256"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14470, "epoch": 0, "train_loss": 4.0531996339559555, "train_ppl": 57.58140201935613, "lr": 0.00056, "grad_norm": 0.7332, "tokens_per_sec": 146558, "dt_s": 4.472, "eta_s": 34905, "world_size": 1, "timestamp": "2026-05-04T22:12:21.265905"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14480, "epoch": 0, "train_loss": 3.9981608241796494, "train_ppl": 54.49782672015234, "lr": 0.00056, "grad_norm": 0.7389, "tokens_per_sec": 148706, "dt_s": 4.407, "eta_s": 34840, "world_size": 1, "timestamp": "2026-05-04T22:12:25.673005"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14490, "epoch": 0, "train_loss": 3.8284497410058975, "train_ppl": 45.9911846925509, "lr": 0.00056, "grad_norm": 0.6999, "tokens_per_sec": 144863, "dt_s": 4.524, "eta_s": 34925, "world_size": 1, "timestamp": "2026-05-04T22:12:30.197022"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14500, "epoch": 0, "train_loss": 3.886729896068573, "train_ppl": 48.75120407282436, "lr": 0.00056, "grad_norm": 0.7614, "tokens_per_sec": 148586, "dt_s": 4.411, "eta_s": 34838, "world_size": 1, "timestamp": "2026-05-04T22:12:34.607692"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14510, "epoch": 0, "train_loss": 3.8698637038469315, "train_ppl": 47.93585216331153, "lr": 0.00056, "grad_norm": 0.8347, "tokens_per_sec": 124382, "dt_s": 5.269, "eta_s": 34880, "world_size": 1, "timestamp": "2026-05-04T22:12:39.876603"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14520, "epoch": 0, "train_loss": 3.9337909519672394, "train_ppl": 51.10032983917689, "lr": 0.00056, "grad_norm": 0.7011, "tokens_per_sec": 144949, "dt_s": 4.521, "eta_s": 34954, "world_size": 1, "timestamp": "2026-05-04T22:12:44.397903"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14530, "epoch": 0, "train_loss": 3.8331862837076187, "train_ppl": 46.20954062060714, "lr": 0.00056, "grad_norm": 0.7679, "tokens_per_sec": 144688, "dt_s": 4.529, "eta_s": 35140, "world_size": 1, "timestamp": "2026-05-04T22:12:48.927399"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14540, "epoch": 0, "train_loss": 3.7881689965724945, "train_ppl": 44.1754407965497, "lr": 0.00056, "grad_norm": 0.7322, "tokens_per_sec": 147146, "dt_s": 4.454, "eta_s": 35026, "world_size": 1, "timestamp": "2026-05-04T22:12:53.381196"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14550, "epoch": 0, "train_loss": 3.9104566872119904, "train_ppl": 49.92174539100085, "lr": 0.00056, "grad_norm": 0.6913, "tokens_per_sec": 148214, "dt_s": 4.422, "eta_s": 35039, "world_size": 1, "timestamp": "2026-05-04T22:12:57.802912"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14560, "epoch": 0, "train_loss": 3.9880732595920563, "train_ppl": 53.95083989409056, "lr": 0.00056, "grad_norm": 0.715, "tokens_per_sec": 147022, "dt_s": 4.458, "eta_s": 34969, "world_size": 1, "timestamp": "2026-05-04T22:13:02.260497"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14570, "epoch": 0, "train_loss": 3.9641302675008774, "train_ppl": 52.67443679193562, "lr": 0.00056, "grad_norm": 0.7047, "tokens_per_sec": 147245, "dt_s": 4.451, "eta_s": 34854, "world_size": 1, "timestamp": "2026-05-04T22:13:06.711321"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14580, "epoch": 0, "train_loss": 3.9405729174613953, "train_ppl": 51.44806835534162, "lr": 0.00056, "grad_norm": 0.6802, "tokens_per_sec": 149228, "dt_s": 4.392, "eta_s": 34635, "world_size": 1, "timestamp": "2026-05-04T22:13:11.102987"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14590, "epoch": 0, "train_loss": 3.974954977631569, "train_ppl": 53.24771951332183, "lr": 0.00056, "grad_norm": 0.7397, "tokens_per_sec": 150580, "dt_s": 4.352, "eta_s": 34472, "world_size": 1, "timestamp": "2026-05-04T22:13:15.455257"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14600, "epoch": 0, "train_loss": 3.945709526538849, "train_ppl": 51.7130168561605, "lr": 0.00056, "grad_norm": 0.7317, "tokens_per_sec": 146437, "dt_s": 4.475, "eta_s": 34551, "world_size": 1, "timestamp": "2026-05-04T22:13:19.930592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14610, "epoch": 0, "train_loss": 3.9911180585622787, "train_ppl": 54.11535969359828, "lr": 0.00056, "grad_norm": 0.6925, "tokens_per_sec": 148212, "dt_s": 4.422, "eta_s": 34490, "world_size": 1, "timestamp": "2026-05-04T22:13:24.352363"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14620, "epoch": 0, "train_loss": 3.8903805017471313, "train_ppl": 48.92950074265474, "lr": 0.00056, "grad_norm": 0.7797, "tokens_per_sec": 146299, "dt_s": 4.48, "eta_s": 34531, "world_size": 1, "timestamp": "2026-05-04T22:13:28.831969"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14630, "epoch": 0, "train_loss": 3.872986987233162, "train_ppl": 48.08580346228178, "lr": 0.00056, "grad_norm": 0.7177, "tokens_per_sec": 144646, "dt_s": 4.531, "eta_s": 34744, "world_size": 1, "timestamp": "2026-05-04T22:13:33.362756"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14640, "epoch": 0, "train_loss": 3.914980337023735, "train_ppl": 50.14808544072856, "lr": 0.00056, "grad_norm": 0.7317, "tokens_per_sec": 148692, "dt_s": 4.407, "eta_s": 34825, "world_size": 1, "timestamp": "2026-05-04T22:13:37.770262"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14650, "epoch": 0, "train_loss": 3.771519511938095, "train_ppl": 43.446031468095015, "lr": 0.00056, "grad_norm": 1.0929, "tokens_per_sec": 147474, "dt_s": 4.444, "eta_s": 34772, "world_size": 1, "timestamp": "2026-05-04T22:13:42.214139"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14660, "epoch": 0, "train_loss": 3.8666119426488876, "train_ppl": 47.78022938044409, "lr": 0.00056, "grad_norm": 0.6701, "tokens_per_sec": 146862, "dt_s": 4.462, "eta_s": 34831, "world_size": 1, "timestamp": "2026-05-04T22:13:46.676588"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14670, "epoch": 0, "train_loss": 3.8110849261283875, "train_ppl": 45.199450339259265, "lr": 0.00056, "grad_norm": 0.7722, "tokens_per_sec": 148812, "dt_s": 4.404, "eta_s": 34708, "world_size": 1, "timestamp": "2026-05-04T22:13:51.080576"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14680, "epoch": 0, "train_loss": 4.005711525678635, "train_ppl": 54.9108810044046, "lr": 0.00056, "grad_norm": 0.8483, "tokens_per_sec": 147297, "dt_s": 4.449, "eta_s": 34577, "world_size": 1, "timestamp": "2026-05-04T22:13:55.529787"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14690, "epoch": 0, "train_loss": 4.069752246141434, "train_ppl": 58.54245667548688, "lr": 0.00056, "grad_norm": 0.7017, "tokens_per_sec": 149525, "dt_s": 4.383, "eta_s": 34534, "world_size": 1, "timestamp": "2026-05-04T22:13:59.912741"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14700, "epoch": 0, "train_loss": 3.963781550526619, "train_ppl": 52.656071524043206, "lr": 0.00056, "grad_norm": 0.6746, "tokens_per_sec": 134048, "dt_s": 4.889, "eta_s": 35224, "world_size": 1, "timestamp": "2026-05-04T22:14:04.801720"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14710, "epoch": 0, "train_loss": 3.889343112707138, "train_ppl": 48.87876813412776, "lr": 0.00056, "grad_norm": 0.8223, "tokens_per_sec": 145538, "dt_s": 4.503, "eta_s": 35283, "world_size": 1, "timestamp": "2026-05-04T22:14:09.304740"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14720, "epoch": 0, "train_loss": 3.9066957384347916, "train_ppl": 49.7343448864603, "lr": 0.00056, "grad_norm": 0.713, "tokens_per_sec": 149009, "dt_s": 4.398, "eta_s": 35269, "world_size": 1, "timestamp": "2026-05-04T22:14:13.702872"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14730, "epoch": 0, "train_loss": 3.9194997251033783, "train_ppl": 50.3752370066978, "lr": 0.00056, "grad_norm": 0.6754, "tokens_per_sec": 148944, "dt_s": 4.4, "eta_s": 35188, "world_size": 1, "timestamp": "2026-05-04T22:14:18.102922"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14740, "epoch": 0, "train_loss": 3.9372789561748505, "train_ppl": 51.27887921412995, "lr": 0.00056, "grad_norm": 0.6591, "tokens_per_sec": 145871, "dt_s": 4.493, "eta_s": 35354, "world_size": 1, "timestamp": "2026-05-04T22:14:22.595614"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14750, "epoch": 0, "train_loss": 3.8960166722536087, "train_ppl": 49.20605437124682, "lr": 0.00056, "grad_norm": 0.6585, "tokens_per_sec": 149918, "dt_s": 4.371, "eta_s": 34543, "world_size": 1, "timestamp": "2026-05-04T22:14:26.967082"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14760, "epoch": 0, "train_loss": 3.9774491786956787, "train_ppl": 53.380695797851025, "lr": 0.00056, "grad_norm": 0.7107, "tokens_per_sec": 148371, "dt_s": 4.417, "eta_s": 34405, "world_size": 1, "timestamp": "2026-05-04T22:14:31.384101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14770, "epoch": 0, "train_loss": 3.860431417822838, "train_ppl": 47.48583318519394, "lr": 0.00056, "grad_norm": 0.7099, "tokens_per_sec": 147463, "dt_s": 4.444, "eta_s": 34472, "world_size": 1, "timestamp": "2026-05-04T22:14:35.828372"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14780, "epoch": 0, "train_loss": 4.0121049284935, "train_ppl": 55.26307303868566, "lr": 0.00056, "grad_norm": 0.6978, "tokens_per_sec": 148936, "dt_s": 4.4, "eta_s": 34468, "world_size": 1, "timestamp": "2026-05-04T22:14:40.228656"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14790, "epoch": 0, "train_loss": 3.833706855773926, "train_ppl": 46.23360227902135, "lr": 0.00056, "grad_norm": 0.8174, "tokens_per_sec": 147938, "dt_s": 4.43, "eta_s": 34366, "world_size": 1, "timestamp": "2026-05-04T22:14:44.658605"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14800, "epoch": 0, "train_loss": 3.9401872754096985, "train_ppl": 51.42823164188651, "lr": 0.00056, "grad_norm": 0.741, "tokens_per_sec": 146880, "dt_s": 4.462, "eta_s": 34502, "world_size": 1, "timestamp": "2026-05-04T22:14:49.120491"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14810, "epoch": 0, "train_loss": 3.9605255722999573, "train_ppl": 52.48490331286218, "lr": 0.00056, "grad_norm": 0.7111, "tokens_per_sec": 150609, "dt_s": 4.351, "eta_s": 34396, "world_size": 1, "timestamp": "2026-05-04T22:14:53.471874"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14820, "epoch": 0, "train_loss": 3.902310386300087, "train_ppl": 49.516719801028714, "lr": 0.00056, "grad_norm": 0.7369, "tokens_per_sec": 145780, "dt_s": 4.496, "eta_s": 34471, "world_size": 1, "timestamp": "2026-05-04T22:14:57.967466"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14830, "epoch": 0, "train_loss": 3.90611232817173, "train_ppl": 49.70533782155933, "lr": 0.00056, "grad_norm": 0.6832, "tokens_per_sec": 148431, "dt_s": 4.415, "eta_s": 34490, "world_size": 1, "timestamp": "2026-05-04T22:15:02.382677"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14840, "epoch": 0, "train_loss": 3.968477338552475, "train_ppl": 52.903914728349456, "lr": 0.00056, "grad_norm": 0.763, "tokens_per_sec": 150078, "dt_s": 4.367, "eta_s": 34387, "world_size": 1, "timestamp": "2026-05-04T22:15:06.749459"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14850, "epoch": 0, "train_loss": 3.8987663984298706, "train_ppl": 49.341543740945724, "lr": 0.00056, "grad_norm": 0.6846, "tokens_per_sec": 146530, "dt_s": 4.473, "eta_s": 34399, "world_size": 1, "timestamp": "2026-05-04T22:15:11.222007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14860, "epoch": 0, "train_loss": 3.852256029844284, "train_ppl": 47.099200664294905, "lr": 0.00056, "grad_norm": 0.7503, "tokens_per_sec": 148760, "dt_s": 4.405, "eta_s": 34479, "world_size": 1, "timestamp": "2026-05-04T22:15:15.627517"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14870, "epoch": 0, "train_loss": 3.9221490174531937, "train_ppl": 50.508872678538374, "lr": 0.00056, "grad_norm": 0.8869, "tokens_per_sec": 148436, "dt_s": 4.415, "eta_s": 34349, "world_size": 1, "timestamp": "2026-05-04T22:15:20.042576"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14880, "epoch": 0, "train_loss": 3.97379270195961, "train_ppl": 53.18586693618001, "lr": 0.00056, "grad_norm": 0.7541, "tokens_per_sec": 145394, "dt_s": 4.507, "eta_s": 34488, "world_size": 1, "timestamp": "2026-05-04T22:15:24.550043"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14890, "epoch": 0, "train_loss": 3.978630319237709, "train_ppl": 53.443783152001096, "lr": 0.00056, "grad_norm": 0.6495, "tokens_per_sec": 148872, "dt_s": 4.402, "eta_s": 34539, "world_size": 1, "timestamp": "2026-05-04T22:15:28.952263"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14900, "epoch": 0, "train_loss": 3.841463103890419, "train_ppl": 46.59359586465438, "lr": 0.00056, "grad_norm": 0.647, "tokens_per_sec": 148280, "dt_s": 4.42, "eta_s": 34453, "world_size": 1, "timestamp": "2026-05-04T22:15:33.371986"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14910, "epoch": 0, "train_loss": 3.878639355301857, "train_ppl": 48.35837172470515, "lr": 0.00056, "grad_norm": 0.717, "tokens_per_sec": 145351, "dt_s": 4.509, "eta_s": 34609, "world_size": 1, "timestamp": "2026-05-04T22:15:37.880804"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14920, "epoch": 0, "train_loss": 3.965249225497246, "train_ppl": 52.73341026245314, "lr": 0.00056, "grad_norm": 0.7521, "tokens_per_sec": 150265, "dt_s": 4.361, "eta_s": 34521, "world_size": 1, "timestamp": "2026-05-04T22:15:42.242171"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14930, "epoch": 0, "train_loss": 3.878967270255089, "train_ppl": 48.37423175813657, "lr": 0.00056, "grad_norm": 0.8367, "tokens_per_sec": 147689, "dt_s": 4.437, "eta_s": 34408, "world_size": 1, "timestamp": "2026-05-04T22:15:46.679603"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14940, "epoch": 0, "train_loss": 4.117104575037956, "train_ppl": 61.381259889380054, "lr": 0.00056, "grad_norm": 0.8121, "tokens_per_sec": 147465, "dt_s": 4.444, "eta_s": 34469, "world_size": 1, "timestamp": "2026-05-04T22:15:51.123792"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14950, "epoch": 0, "train_loss": 3.9348307996988297, "train_ppl": 51.15349403778887, "lr": 0.00056, "grad_norm": 0.6958, "tokens_per_sec": 149888, "dt_s": 4.372, "eta_s": 34390, "world_size": 1, "timestamp": "2026-05-04T22:15:55.496101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14960, "epoch": 0, "train_loss": 3.9510652273893356, "train_ppl": 51.9907192862987, "lr": 0.00056, "grad_norm": 0.7824, "tokens_per_sec": 148108, "dt_s": 4.425, "eta_s": 34256, "world_size": 1, "timestamp": "2026-05-04T22:15:59.921025"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14970, "epoch": 0, "train_loss": 3.916160210967064, "train_ppl": 50.20728877941815, "lr": 0.00056, "grad_norm": 0.6955, "tokens_per_sec": 149643, "dt_s": 4.38, "eta_s": 34279, "world_size": 1, "timestamp": "2026-05-04T22:16:04.300526"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14980, "epoch": 0, "train_loss": 3.9232922792434692, "train_ppl": 50.566650564072866, "lr": 0.00056, "grad_norm": 0.7615, "tokens_per_sec": 149553, "dt_s": 4.382, "eta_s": 34189, "world_size": 1, "timestamp": "2026-05-04T22:16:08.682641"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 14990, "epoch": 0, "train_loss": 3.9287306368350983, "train_ppl": 50.8423992223337, "lr": 0.00056, "grad_norm": 0.7367, "tokens_per_sec": 130872, "dt_s": 5.008, "eta_s": 35060, "world_size": 1, "timestamp": "2026-05-04T22:16:13.690307"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15000, "epoch": 0, "train_loss": 3.9735725224018097, "train_ppl": 53.17415778462199, "lr": 0.00056, "grad_norm": 0.7617, "tokens_per_sec": 148089, "dt_s": 4.425, "eta_s": 35138, "world_size": 1, "timestamp": "2026-05-04T22:16:18.115724"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15010, "epoch": 0, "train_loss": 3.9060704112052917, "train_ppl": 49.70325436824839, "lr": 0.00056, "grad_norm": 0.7041, "tokens_per_sec": 126563, "dt_s": 5.178, "eta_s": 35106, "world_size": 1, "timestamp": "2026-05-04T22:16:23.293869"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15020, "epoch": 0, "train_loss": 4.008197128772736, "train_ppl": 55.04753742658404, "lr": 0.00056, "grad_norm": 0.9112, "tokens_per_sec": 146145, "dt_s": 4.484, "eta_s": 35264, "world_size": 1, "timestamp": "2026-05-04T22:16:27.778205"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15030, "epoch": 0, "train_loss": 3.8331568241119385, "train_ppl": 46.20817932627558, "lr": 0.00056, "grad_norm": 0.7054, "tokens_per_sec": 148863, "dt_s": 4.402, "eta_s": 35291, "world_size": 1, "timestamp": "2026-05-04T22:16:32.180627"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15040, "epoch": 0, "train_loss": 3.9179300367832184, "train_ppl": 50.296225613396864, "lr": 0.00056, "grad_norm": 0.7246, "tokens_per_sec": 145612, "dt_s": 4.501, "eta_s": 34500, "world_size": 1, "timestamp": "2026-05-04T22:16:36.681376"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15050, "epoch": 0, "train_loss": 3.9849673956632614, "train_ppl": 53.783535872787745, "lr": 0.00056, "grad_norm": 0.7513, "tokens_per_sec": 147758, "dt_s": 4.435, "eta_s": 34511, "world_size": 1, "timestamp": "2026-05-04T22:16:41.116699"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15060, "epoch": 0, "train_loss": 3.9101971238851547, "train_ppl": 49.908789218232116, "lr": 0.00056, "grad_norm": 0.6662, "tokens_per_sec": 149871, "dt_s": 4.373, "eta_s": 34453, "world_size": 1, "timestamp": "2026-05-04T22:16:45.489556"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15070, "epoch": 0, "train_loss": 3.8994046598672867, "train_ppl": 49.37304659803811, "lr": 0.00056, "grad_norm": 0.6973, "tokens_per_sec": 144870, "dt_s": 4.524, "eta_s": 34510, "world_size": 1, "timestamp": "2026-05-04T22:16:50.013336"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15080, "epoch": 0, "train_loss": 3.9626923948526382, "train_ppl": 52.59875208553541, "lr": 0.00056, "grad_norm": 0.703, "tokens_per_sec": 149162, "dt_s": 4.394, "eta_s": 34492, "world_size": 1, "timestamp": "2026-05-04T22:16:54.406924"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15090, "epoch": 0, "train_loss": 3.89572536945343, "train_ppl": 49.19172259736702, "lr": 0.00056, "grad_norm": 0.6998, "tokens_per_sec": 149203, "dt_s": 4.392, "eta_s": 34319, "world_size": 1, "timestamp": "2026-05-04T22:16:58.799339"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15100, "epoch": 0, "train_loss": 4.01191945374012, "train_ppl": 55.25282408433332, "lr": 0.00056, "grad_norm": 0.8284, "tokens_per_sec": 147308, "dt_s": 4.449, "eta_s": 34336, "world_size": 1, "timestamp": "2026-05-04T22:17:03.248300"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15110, "epoch": 0, "train_loss": 3.875326544046402, "train_ppl": 48.19843463352999, "lr": 0.00056, "grad_norm": 0.6748, "tokens_per_sec": 149200, "dt_s": 4.393, "eta_s": 34362, "world_size": 1, "timestamp": "2026-05-04T22:17:07.640744"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15120, "epoch": 0, "train_loss": 3.918136805295944, "train_ppl": 50.30662636439952, "lr": 0.00056, "grad_norm": 0.7227, "tokens_per_sec": 149070, "dt_s": 4.396, "eta_s": 34159, "world_size": 1, "timestamp": "2026-05-04T22:17:12.037108"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15130, "epoch": 0, "train_loss": 3.8715613931417465, "train_ppl": 48.017301464608046, "lr": 0.00056, "grad_norm": 0.6669, "tokens_per_sec": 146584, "dt_s": 4.471, "eta_s": 34274, "world_size": 1, "timestamp": "2026-05-04T22:17:16.507935"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15140, "epoch": 0, "train_loss": 3.917255476117134, "train_ppl": 50.262309198571344, "lr": 0.00056, "grad_norm": 0.6825, "tokens_per_sec": 148431, "dt_s": 4.415, "eta_s": 34305, "world_size": 1, "timestamp": "2026-05-04T22:17:20.923168"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15150, "epoch": 0, "train_loss": 4.017207786440849, "train_ppl": 55.54579337751696, "lr": 0.00056, "grad_norm": 0.7152, "tokens_per_sec": 148227, "dt_s": 4.421, "eta_s": 34258, "world_size": 1, "timestamp": "2026-05-04T22:17:25.344482"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15160, "epoch": 0, "train_loss": 4.069322049617767, "train_ppl": 58.51727733055986, "lr": 0.00056, "grad_norm": 0.7332, "tokens_per_sec": 144979, "dt_s": 4.52, "eta_s": 34452, "world_size": 1, "timestamp": "2026-05-04T22:17:29.864850"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15170, "epoch": 0, "train_loss": 3.927838012576103, "train_ppl": 50.79703631243127, "lr": 0.00056, "grad_norm": 0.7014, "tokens_per_sec": 148467, "dt_s": 4.414, "eta_s": 34476, "world_size": 1, "timestamp": "2026-05-04T22:17:34.279045"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15180, "epoch": 0, "train_loss": 3.976572796702385, "train_ppl": 53.33393441067605, "lr": 0.00056, "grad_norm": 0.7032, "tokens_per_sec": 146315, "dt_s": 4.479, "eta_s": 34484, "world_size": 1, "timestamp": "2026-05-04T22:17:38.758136"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15190, "epoch": 0, "train_loss": 3.815178871154785, "train_ppl": 45.384873701717325, "lr": 0.00056, "grad_norm": 0.7156, "tokens_per_sec": 147610, "dt_s": 4.44, "eta_s": 34517, "world_size": 1, "timestamp": "2026-05-04T22:17:43.197928"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15200, "epoch": 0, "train_loss": 3.9851220846176147, "train_ppl": 53.791856235230824, "lr": 0.00056, "grad_norm": 1.5942, "tokens_per_sec": 148800, "dt_s": 4.404, "eta_s": 34487, "world_size": 1, "timestamp": "2026-05-04T22:17:47.602292"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15210, "epoch": 0, "train_loss": 4.043900936841965, "train_ppl": 57.048451714467774, "lr": 0.00056, "grad_norm": 0.7206, "tokens_per_sec": 144803, "dt_s": 4.526, "eta_s": 34491, "world_size": 1, "timestamp": "2026-05-04T22:17:52.128125"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15220, "epoch": 0, "train_loss": 3.920412912964821, "train_ppl": 50.421260072303774, "lr": 0.00056, "grad_norm": 0.7476, "tokens_per_sec": 148678, "dt_s": 4.408, "eta_s": 34477, "world_size": 1, "timestamp": "2026-05-04T22:17:56.536070"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15230, "epoch": 0, "train_loss": 3.8649374693632126, "train_ppl": 47.70028960994652, "lr": 0.00056, "grad_norm": 0.6722, "tokens_per_sec": 148469, "dt_s": 4.414, "eta_s": 34371, "world_size": 1, "timestamp": "2026-05-04T22:18:00.950230"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15240, "epoch": 0, "train_loss": 3.9873417913913727, "train_ppl": 53.91139099987261, "lr": 0.00056, "grad_norm": 0.6904, "tokens_per_sec": 145612, "dt_s": 4.501, "eta_s": 34462, "world_size": 1, "timestamp": "2026-05-04T22:18:05.450947"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15250, "epoch": 0, "train_loss": 3.9974756687879562, "train_ppl": 54.4605000290915, "lr": 0.00056, "grad_norm": 0.794, "tokens_per_sec": 149758, "dt_s": 4.376, "eta_s": 34413, "world_size": 1, "timestamp": "2026-05-04T22:18:09.827061"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15260, "epoch": 0, "train_loss": 3.9961937963962555, "train_ppl": 54.3907333432136, "lr": 0.00056, "grad_norm": 0.6846, "tokens_per_sec": 149588, "dt_s": 4.381, "eta_s": 34185, "world_size": 1, "timestamp": "2026-05-04T22:18:14.208161"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15270, "epoch": 0, "train_loss": 3.9925164729356766, "train_ppl": 54.19108832807986, "lr": 0.00056, "grad_norm": 0.6769, "tokens_per_sec": 145973, "dt_s": 4.49, "eta_s": 34307, "world_size": 1, "timestamp": "2026-05-04T22:18:18.697761"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15280, "epoch": 0, "train_loss": 3.8680621534585953, "train_ppl": 47.84957105345103, "lr": 0.00056, "grad_norm": 0.7173, "tokens_per_sec": 149901, "dt_s": 4.372, "eta_s": 34237, "world_size": 1, "timestamp": "2026-05-04T22:18:23.069710"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15290, "epoch": 0, "train_loss": 3.9091377705335617, "train_ppl": 49.85594616977191, "lr": 0.00056, "grad_norm": 0.7354, "tokens_per_sec": 132284, "dt_s": 4.954, "eta_s": 34935, "world_size": 1, "timestamp": "2026-05-04T22:18:28.023910"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15300, "epoch": 0, "train_loss": 3.9746962189674377, "train_ppl": 53.23394298702718, "lr": 0.00056, "grad_norm": 0.7191, "tokens_per_sec": 144996, "dt_s": 4.52, "eta_s": 35153, "world_size": 1, "timestamp": "2026-05-04T22:18:32.543781"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15310, "epoch": 0, "train_loss": 3.889457553625107, "train_ppl": 48.88436218531028, "lr": 0.00056, "grad_norm": 0.7059, "tokens_per_sec": 150579, "dt_s": 4.352, "eta_s": 35104, "world_size": 1, "timestamp": "2026-05-04T22:18:36.896060"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15320, "epoch": 0, "train_loss": 3.9949750900268555, "train_ppl": 54.32448738543652, "lr": 0.00056, "grad_norm": 0.7054, "tokens_per_sec": 143984, "dt_s": 4.552, "eta_s": 35195, "world_size": 1, "timestamp": "2026-05-04T22:18:41.447659"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15330, "epoch": 0, "train_loss": 3.858788713812828, "train_ppl": 47.40789205123548, "lr": 0.00056, "grad_norm": 0.6721, "tokens_per_sec": 148148, "dt_s": 4.424, "eta_s": 35271, "world_size": 1, "timestamp": "2026-05-04T22:18:45.871350"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15340, "epoch": 0, "train_loss": 3.773521140217781, "train_ppl": 43.53308136502551, "lr": 0.00056, "grad_norm": 0.7247, "tokens_per_sec": 146872, "dt_s": 4.462, "eta_s": 34505, "world_size": 1, "timestamp": "2026-05-04T22:18:50.333462"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15350, "epoch": 0, "train_loss": 3.773990884423256, "train_ppl": 43.55353558149191, "lr": 0.00056, "grad_norm": 0.6425, "tokens_per_sec": 145676, "dt_s": 4.499, "eta_s": 34468, "world_size": 1, "timestamp": "2026-05-04T22:18:54.832215"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15360, "epoch": 0, "train_loss": 3.9166527688503265, "train_ppl": 50.232024866781, "lr": 0.00056, "grad_norm": 0.7145, "tokens_per_sec": 148809, "dt_s": 4.404, "eta_s": 34543, "world_size": 1, "timestamp": "2026-05-04T22:18:59.236233"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15370, "epoch": 0, "train_loss": 3.8887289315462112, "train_ppl": 48.848756932671144, "lr": 0.00056, "grad_norm": 0.7481, "tokens_per_sec": 148533, "dt_s": 4.412, "eta_s": 34323, "world_size": 1, "timestamp": "2026-05-04T22:19:03.648472"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15380, "epoch": 0, "train_loss": 3.9790024161338806, "train_ppl": 53.46367311809946, "lr": 0.00056, "grad_norm": 0.7077, "tokens_per_sec": 146753, "dt_s": 4.466, "eta_s": 34384, "world_size": 1, "timestamp": "2026-05-04T22:19:08.114223"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15390, "epoch": 0, "train_loss": 3.903345003724098, "train_ppl": 49.567977173427735, "lr": 0.00056, "grad_norm": 0.7116, "tokens_per_sec": 148061, "dt_s": 4.426, "eta_s": 34324, "world_size": 1, "timestamp": "2026-05-04T22:19:12.540480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15400, "epoch": 0, "train_loss": 3.950350373983383, "train_ppl": 51.953566824403055, "lr": 0.00056, "grad_norm": 0.7008, "tokens_per_sec": 148097, "dt_s": 4.425, "eta_s": 34206, "world_size": 1, "timestamp": "2026-05-04T22:19:16.965695"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15410, "epoch": 0, "train_loss": 3.9500049352645874, "train_ppl": 51.9356231502407, "lr": 0.00056, "grad_norm": 0.7075, "tokens_per_sec": 145430, "dt_s": 4.506, "eta_s": 34359, "world_size": 1, "timestamp": "2026-05-04T22:19:21.472049"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15420, "epoch": 0, "train_loss": 3.838676393032074, "train_ppl": 46.463933734472434, "lr": 0.00056, "grad_norm": 0.7447, "tokens_per_sec": 148606, "dt_s": 4.41, "eta_s": 34352, "world_size": 1, "timestamp": "2026-05-04T22:19:25.882104"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15430, "epoch": 0, "train_loss": 3.8856987208127975, "train_ppl": 48.7009589477123, "lr": 0.00056, "grad_norm": 0.7021, "tokens_per_sec": 145687, "dt_s": 4.498, "eta_s": 34398, "world_size": 1, "timestamp": "2026-05-04T22:19:30.380544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15440, "epoch": 0, "train_loss": 3.9103180170059204, "train_ppl": 49.914823212241174, "lr": 0.00056, "grad_norm": 0.7383, "tokens_per_sec": 146352, "dt_s": 4.478, "eta_s": 34473, "world_size": 1, "timestamp": "2026-05-04T22:19:34.858515"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15450, "epoch": 0, "train_loss": 3.9494513869285583, "train_ppl": 51.906882227943996, "lr": 0.00056, "grad_norm": 0.7329, "tokens_per_sec": 148063, "dt_s": 4.426, "eta_s": 34470, "world_size": 1, "timestamp": "2026-05-04T22:19:39.284713"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15460, "epoch": 0, "train_loss": 3.932895064353943, "train_ppl": 51.0545701874518, "lr": 0.00056, "grad_norm": 0.6798, "tokens_per_sec": 143693, "dt_s": 4.561, "eta_s": 34550, "world_size": 1, "timestamp": "2026-05-04T22:19:43.845574"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15470, "epoch": 0, "train_loss": 3.87519508600235, "train_ppl": 48.192098978032256, "lr": 0.00056, "grad_norm": 0.672, "tokens_per_sec": 147801, "dt_s": 4.434, "eta_s": 34583, "world_size": 1, "timestamp": "2026-05-04T22:19:48.279651"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15480, "epoch": 0, "train_loss": 3.849758058786392, "train_ppl": 46.98169504810139, "lr": 0.00056, "grad_norm": 0.7395, "tokens_per_sec": 148344, "dt_s": 4.418, "eta_s": 34454, "world_size": 1, "timestamp": "2026-05-04T22:19:52.697495"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15490, "epoch": 0, "train_loss": 3.913269892334938, "train_ppl": 50.06238322967334, "lr": 0.00056, "grad_norm": 0.7234, "tokens_per_sec": 144224, "dt_s": 4.544, "eta_s": 34551, "world_size": 1, "timestamp": "2026-05-04T22:19:57.241549"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15500, "epoch": 0, "train_loss": 3.859106183052063, "train_ppl": 47.42294498795447, "lr": 0.00056, "grad_norm": 0.7181, "tokens_per_sec": 146808, "dt_s": 4.464, "eta_s": 34605, "world_size": 1, "timestamp": "2026-05-04T22:20:01.705594"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15510, "epoch": 0, "train_loss": 3.933687761425972, "train_ppl": 51.09505704053799, "lr": 0.00056, "grad_norm": 0.7259, "tokens_per_sec": 126474, "dt_s": 5.182, "eta_s": 34374, "world_size": 1, "timestamp": "2026-05-04T22:20:06.887400"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15520, "epoch": 0, "train_loss": 3.910299703478813, "train_ppl": 49.91390910414352, "lr": 0.00056, "grad_norm": Infinity, "tokens_per_sec": 145007, "dt_s": 4.52, "eta_s": 34502, "world_size": 1, "timestamp": "2026-05-04T22:20:11.406810"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15530, "epoch": 0, "train_loss": 3.912321701645851, "train_ppl": 50.014937041593114, "lr": 0.00056, "grad_norm": 0.7232, "tokens_per_sec": 145733, "dt_s": 4.497, "eta_s": 34619, "world_size": 1, "timestamp": "2026-05-04T22:20:15.903879"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15540, "epoch": 0, "train_loss": 3.8633776009082794, "train_ppl": 47.62594143465471, "lr": 0.00056, "grad_norm": 0.7707, "tokens_per_sec": 146881, "dt_s": 4.462, "eta_s": 34488, "world_size": 1, "timestamp": "2026-05-04T22:20:20.365732"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15550, "epoch": 0, "train_loss": 3.950342059135437, "train_ppl": 51.95313484019059, "lr": 0.00056, "grad_norm": 0.7419, "tokens_per_sec": 149611, "dt_s": 4.38, "eta_s": 34355, "world_size": 1, "timestamp": "2026-05-04T22:20:24.746133"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15560, "epoch": 0, "train_loss": 3.8070006668567657, "train_ppl": 45.01522054236435, "lr": 0.00056, "grad_norm": 0.7831, "tokens_per_sec": 149857, "dt_s": 4.373, "eta_s": 34287, "world_size": 1, "timestamp": "2026-05-04T22:20:29.119389"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15570, "epoch": 0, "train_loss": 3.8362106680870056, "train_ppl": 46.34950758369851, "lr": 0.00056, "grad_norm": 0.7033, "tokens_per_sec": 147254, "dt_s": 4.451, "eta_s": 34176, "world_size": 1, "timestamp": "2026-05-04T22:20:33.569927"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15580, "epoch": 0, "train_loss": 3.864385709166527, "train_ppl": 47.67397774835586, "lr": 0.00056, "grad_norm": 0.7013, "tokens_per_sec": 133304, "dt_s": 4.916, "eta_s": 34818, "world_size": 1, "timestamp": "2026-05-04T22:20:38.486220"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15590, "epoch": 0, "train_loss": 3.947830557823181, "train_ppl": 51.82281818758485, "lr": 0.00056, "grad_norm": 0.6653, "tokens_per_sec": 148954, "dt_s": 4.4, "eta_s": 34718, "world_size": 1, "timestamp": "2026-05-04T22:20:42.885951"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15600, "epoch": 0, "train_loss": 3.856568828225136, "train_ppl": 47.30276867902723, "lr": 0.00056, "grad_norm": 0.7013, "tokens_per_sec": 148019, "dt_s": 4.428, "eta_s": 34786, "world_size": 1, "timestamp": "2026-05-04T22:20:47.313507"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15610, "epoch": 0, "train_loss": 3.9164092391729355, "train_ppl": 50.2197933673975, "lr": 0.00056, "grad_norm": 0.8466, "tokens_per_sec": 148284, "dt_s": 4.42, "eta_s": 34853, "world_size": 1, "timestamp": "2026-05-04T22:20:51.733131"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15620, "epoch": 0, "train_loss": 3.986030101776123, "train_ppl": 53.840722345956436, "lr": 0.00056, "grad_norm": 0.7695, "tokens_per_sec": 149007, "dt_s": 4.398, "eta_s": 34768, "world_size": 1, "timestamp": "2026-05-04T22:20:56.131311"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15630, "epoch": 0, "train_loss": 3.8734166771173477, "train_ppl": 48.106469885360966, "lr": 0.00056, "grad_norm": 0.7134, "tokens_per_sec": 147586, "dt_s": 4.441, "eta_s": 34030, "world_size": 1, "timestamp": "2026-05-04T22:21:00.571818"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15640, "epoch": 0, "train_loss": 3.8055368959903717, "train_ppl": 44.949376775840264, "lr": 0.00056, "grad_norm": 0.717, "tokens_per_sec": 148306, "dt_s": 4.419, "eta_s": 34055, "world_size": 1, "timestamp": "2026-05-04T22:21:04.990800"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15650, "epoch": 0, "train_loss": 3.955348163843155, "train_ppl": 52.2138697617291, "lr": 0.00056, "grad_norm": 0.7261, "tokens_per_sec": 147193, "dt_s": 4.452, "eta_s": 34089, "world_size": 1, "timestamp": "2026-05-04T22:21:09.443189"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15660, "epoch": 0, "train_loss": 4.048704564571381, "train_ppl": 57.3231504861431, "lr": 0.00056, "grad_norm": 0.7261, "tokens_per_sec": 148186, "dt_s": 4.423, "eta_s": 34089, "world_size": 1, "timestamp": "2026-05-04T22:21:13.865740"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15670, "epoch": 0, "train_loss": 3.892428144812584, "train_ppl": 49.029793542431825, "lr": 0.00056, "grad_norm": 0.705, "tokens_per_sec": 146965, "dt_s": 4.459, "eta_s": 34179, "world_size": 1, "timestamp": "2026-05-04T22:21:18.325035"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15680, "epoch": 0, "train_loss": 3.9421932995319366, "train_ppl": 51.531501461367725, "lr": 0.00056, "grad_norm": 0.7174, "tokens_per_sec": 146661, "dt_s": 4.469, "eta_s": 34218, "world_size": 1, "timestamp": "2026-05-04T22:21:22.793566"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15690, "epoch": 0, "train_loss": 3.8490212708711624, "train_ppl": 46.94709225197931, "lr": 0.00056, "grad_norm": 0.743, "tokens_per_sec": 148435, "dt_s": 4.415, "eta_s": 34207, "world_size": 1, "timestamp": "2026-05-04T22:21:27.208735"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15700, "epoch": 0, "train_loss": 3.87990865111351, "train_ppl": 48.41979177525393, "lr": 0.00056, "grad_norm": 0.6868, "tokens_per_sec": 148696, "dt_s": 4.407, "eta_s": 34134, "world_size": 1, "timestamp": "2026-05-04T22:21:31.616103"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15710, "epoch": 0, "train_loss": 3.8909375816583633, "train_ppl": 48.95676597833749, "lr": 0.00056, "grad_norm": 0.7399, "tokens_per_sec": 147943, "dt_s": 4.43, "eta_s": 34140, "world_size": 1, "timestamp": "2026-05-04T22:21:36.045918"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15720, "epoch": 0, "train_loss": 3.806063264608383, "train_ppl": 44.97304294519225, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 146031, "dt_s": 4.488, "eta_s": 34180, "world_size": 1, "timestamp": "2026-05-04T22:21:40.533740"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15730, "epoch": 0, "train_loss": 3.8890698701143265, "train_ppl": 48.8654141973046, "lr": 0.00056, "grad_norm": 0.728, "tokens_per_sec": 145803, "dt_s": 4.495, "eta_s": 34216, "world_size": 1, "timestamp": "2026-05-04T22:21:45.028561"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15740, "epoch": 0, "train_loss": 3.9264841079711914, "train_ppl": 50.728308506996626, "lr": 0.00056, "grad_norm": 0.7112, "tokens_per_sec": 143182, "dt_s": 4.577, "eta_s": 34461, "world_size": 1, "timestamp": "2026-05-04T22:21:49.605677"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15750, "epoch": 0, "train_loss": 3.8079348653554916, "train_ppl": 45.05729334292857, "lr": 0.00056, "grad_norm": 0.7021, "tokens_per_sec": 145942, "dt_s": 4.491, "eta_s": 34584, "world_size": 1, "timestamp": "2026-05-04T22:21:54.096223"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15760, "epoch": 0, "train_loss": 3.971219301223755, "train_ppl": 53.049174344912814, "lr": 0.00056, "grad_norm": 0.7078, "tokens_per_sec": 147525, "dt_s": 4.442, "eta_s": 34599, "world_size": 1, "timestamp": "2026-05-04T22:21:58.538568"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15770, "epoch": 0, "train_loss": 3.9750648885965347, "train_ppl": 53.25357234319497, "lr": 0.00056, "grad_norm": 0.7734, "tokens_per_sec": 147023, "dt_s": 4.458, "eta_s": 34548, "world_size": 1, "timestamp": "2026-05-04T22:22:02.996112"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15780, "epoch": 0, "train_loss": 3.9451567977666855, "train_ppl": 51.68444148179203, "lr": 0.00056, "grad_norm": 0.7116, "tokens_per_sec": 147437, "dt_s": 4.445, "eta_s": 34467, "world_size": 1, "timestamp": "2026-05-04T22:22:07.441127"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15790, "epoch": 0, "train_loss": 3.8755369931459427, "train_ppl": 48.208579018098746, "lr": 0.00056, "grad_norm": 0.6661, "tokens_per_sec": 146806, "dt_s": 4.464, "eta_s": 34289, "world_size": 1, "timestamp": "2026-05-04T22:22:11.905287"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15800, "epoch": 0, "train_loss": 3.9317818880081177, "train_ppl": 50.99776906826948, "lr": 0.00056, "grad_norm": 0.7793, "tokens_per_sec": 146788, "dt_s": 4.465, "eta_s": 34244, "world_size": 1, "timestamp": "2026-05-04T22:22:16.369929"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15810, "epoch": 0, "train_loss": 3.848289042711258, "train_ppl": 46.91272885146689, "lr": 0.00056, "grad_norm": 0.7181, "tokens_per_sec": 144743, "dt_s": 4.528, "eta_s": 34371, "world_size": 1, "timestamp": "2026-05-04T22:22:20.897686"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15820, "epoch": 0, "train_loss": 3.948408469557762, "train_ppl": 51.85277585794536, "lr": 0.00056, "grad_norm": 0.7112, "tokens_per_sec": 144920, "dt_s": 4.522, "eta_s": 34466, "world_size": 1, "timestamp": "2026-05-04T22:22:25.419904"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15830, "epoch": 0, "train_loss": 3.985312581062317, "train_ppl": 53.80210436868306, "lr": 0.00056, "grad_norm": 0.7337, "tokens_per_sec": 148815, "dt_s": 4.404, "eta_s": 34398, "world_size": 1, "timestamp": "2026-05-04T22:22:29.823787"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15840, "epoch": 0, "train_loss": 3.925987184047699, "train_ppl": 50.70310665912036, "lr": 0.00056, "grad_norm": 0.7198, "tokens_per_sec": 148012, "dt_s": 4.428, "eta_s": 34338, "world_size": 1, "timestamp": "2026-05-04T22:22:34.251501"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15850, "epoch": 0, "train_loss": 3.8694624602794647, "train_ppl": 47.916622069213744, "lr": 0.00056, "grad_norm": 0.756, "tokens_per_sec": 146438, "dt_s": 4.475, "eta_s": 34350, "world_size": 1, "timestamp": "2026-05-04T22:22:38.726863"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15860, "epoch": 0, "train_loss": 3.920247331261635, "train_ppl": 50.412911925353484, "lr": 0.00056, "grad_norm": 0.7082, "tokens_per_sec": 148777, "dt_s": 4.405, "eta_s": 34157, "world_size": 1, "timestamp": "2026-05-04T22:22:43.131845"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15870, "epoch": 0, "train_loss": 3.9881591796875, "train_ppl": 53.95547555454887, "lr": 0.00056, "grad_norm": 0.7704, "tokens_per_sec": 146973, "dt_s": 4.459, "eta_s": 34055, "world_size": 1, "timestamp": "2026-05-04T22:22:47.590899"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15880, "epoch": 0, "train_loss": 3.890250086784363, "train_ppl": 48.92312001971701, "lr": 0.00056, "grad_norm": 0.6731, "tokens_per_sec": 131067, "dt_s": 5.0, "eta_s": 34967, "world_size": 1, "timestamp": "2026-05-04T22:22:52.591104"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15890, "epoch": 0, "train_loss": 3.9200395047664642, "train_ppl": 50.40243587519479, "lr": 0.00056, "grad_norm": 0.7809, "tokens_per_sec": 145362, "dt_s": 4.508, "eta_s": 35087, "world_size": 1, "timestamp": "2026-05-04T22:22:57.099563"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15900, "epoch": 0, "train_loss": 3.759903147816658, "train_ppl": 42.94426653136491, "lr": 0.00056, "grad_norm": 0.6492, "tokens_per_sec": 144916, "dt_s": 4.522, "eta_s": 35154, "world_size": 1, "timestamp": "2026-05-04T22:23:01.621889"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15910, "epoch": 0, "train_loss": 3.841778039932251, "train_ppl": 46.60827217823947, "lr": 0.00056, "grad_norm": 0.7178, "tokens_per_sec": 149868, "dt_s": 4.373, "eta_s": 35100, "world_size": 1, "timestamp": "2026-05-04T22:23:05.994825"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15920, "epoch": 0, "train_loss": 3.8594729155302048, "train_ppl": 47.44033971150076, "lr": 0.00056, "grad_norm": 0.6883, "tokens_per_sec": 148481, "dt_s": 4.414, "eta_s": 35026, "world_size": 1, "timestamp": "2026-05-04T22:23:10.408592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15930, "epoch": 0, "train_loss": 3.8978483378887177, "train_ppl": 49.29626600362974, "lr": 0.00056, "grad_norm": 0.6681, "tokens_per_sec": 146963, "dt_s": 4.459, "eta_s": 34191, "world_size": 1, "timestamp": "2026-05-04T22:23:14.867913"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15940, "epoch": 0, "train_loss": 3.983495742082596, "train_ppl": 53.70444335230982, "lr": 0.00056, "grad_norm": 0.77, "tokens_per_sec": 148726, "dt_s": 4.406, "eta_s": 34030, "world_size": 1, "timestamp": "2026-05-04T22:23:19.274417"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15950, "epoch": 0, "train_loss": 3.905750200152397, "train_ppl": 49.68734138472742, "lr": 0.00056, "grad_norm": 0.6569, "tokens_per_sec": 148498, "dt_s": 4.413, "eta_s": 33859, "world_size": 1, "timestamp": "2026-05-04T22:23:23.687695"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15960, "epoch": 0, "train_loss": 3.7845008224248886, "train_ppl": 44.01369442495492, "lr": 0.00056, "grad_norm": 0.6773, "tokens_per_sec": 144009, "dt_s": 4.551, "eta_s": 34127, "world_size": 1, "timestamp": "2026-05-04T22:23:28.238512"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15970, "epoch": 0, "train_loss": 3.922354996204376, "train_ppl": 50.519277504606094, "lr": 0.00056, "grad_norm": 0.7936, "tokens_per_sec": 148625, "dt_s": 4.409, "eta_s": 34116, "world_size": 1, "timestamp": "2026-05-04T22:23:32.647989"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15980, "epoch": 0, "train_loss": 3.888039529323578, "train_ppl": 48.81509209670859, "lr": 0.00056, "grad_norm": 0.7296, "tokens_per_sec": 147121, "dt_s": 4.455, "eta_s": 34104, "world_size": 1, "timestamp": "2026-05-04T22:23:37.102577"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 15990, "epoch": 0, "train_loss": 3.9703092873096466, "train_ppl": 53.000920817147815, "lr": 0.00056, "grad_norm": 0.6933, "tokens_per_sec": 146656, "dt_s": 4.469, "eta_s": 34195, "world_size": 1, "timestamp": "2026-05-04T22:23:41.571253"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16000, "epoch": 0, "train_loss": 3.9403934478759766, "train_ppl": 51.43883582034769, "lr": 0.00056, "grad_norm": 0.7066, "tokens_per_sec": 148224, "dt_s": 4.421, "eta_s": 34203, "world_size": 1, "timestamp": "2026-05-04T22:23:45.992681"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16010, "epoch": 0, "train_loss": 3.8785578310489655, "train_ppl": 48.35442950527467, "lr": 0.00056, "grad_norm": 0.6474, "tokens_per_sec": 123574, "dt_s": 5.303, "eta_s": 34164, "world_size": 1, "timestamp": "2026-05-04T22:23:51.296085"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16020, "epoch": 0, "train_loss": 3.9545410126447678, "train_ppl": 52.17174227809009, "lr": 0.00056, "grad_norm": 0.7132, "tokens_per_sec": 148561, "dt_s": 4.411, "eta_s": 34162, "world_size": 1, "timestamp": "2026-05-04T22:23:55.707445"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16030, "epoch": 0, "train_loss": 3.8894317895174026, "train_ppl": 48.883102739562204, "lr": 0.00056, "grad_norm": 0.6806, "tokens_per_sec": 148352, "dt_s": 4.418, "eta_s": 34101, "world_size": 1, "timestamp": "2026-05-04T22:24:00.125079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16040, "epoch": 0, "train_loss": 3.897954538464546, "train_ppl": 49.301501573470865, "lr": 0.00056, "grad_norm": 0.7439, "tokens_per_sec": 146121, "dt_s": 4.485, "eta_s": 34122, "world_size": 1, "timestamp": "2026-05-04T22:24:04.610101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16050, "epoch": 0, "train_loss": 3.808121830224991, "train_ppl": 45.065718261455764, "lr": 0.00056, "grad_norm": 0.716, "tokens_per_sec": 148171, "dt_s": 4.423, "eta_s": 34120, "world_size": 1, "timestamp": "2026-05-04T22:24:09.033099"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16060, "epoch": 0, "train_loss": 3.913206160068512, "train_ppl": 50.05919274219699, "lr": 0.00056, "grad_norm": 0.7589, "tokens_per_sec": 147692, "dt_s": 4.437, "eta_s": 33976, "world_size": 1, "timestamp": "2026-05-04T22:24:13.470448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16070, "epoch": 0, "train_loss": 3.9148483872413635, "train_ppl": 50.14146884830687, "lr": 0.00056, "grad_norm": 0.7308, "tokens_per_sec": 146063, "dt_s": 4.487, "eta_s": 34088, "world_size": 1, "timestamp": "2026-05-04T22:24:17.957292"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16080, "epoch": 0, "train_loss": 3.8636023849248886, "train_ppl": 47.63664818837366, "lr": 0.00056, "grad_norm": 0.7322, "tokens_per_sec": 149658, "dt_s": 4.379, "eta_s": 34024, "world_size": 1, "timestamp": "2026-05-04T22:24:22.336345"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16090, "epoch": 0, "train_loss": 3.9144485741853714, "train_ppl": 50.121425641449584, "lr": 0.00056, "grad_norm": 0.6865, "tokens_per_sec": 147433, "dt_s": 4.445, "eta_s": 33958, "world_size": 1, "timestamp": "2026-05-04T22:24:26.781487"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16100, "epoch": 0, "train_loss": 3.9196547716856003, "train_ppl": 50.38304812055192, "lr": 0.00056, "grad_norm": 0.7385, "tokens_per_sec": 147059, "dt_s": 4.456, "eta_s": 34005, "world_size": 1, "timestamp": "2026-05-04T22:24:31.237929"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16110, "epoch": 0, "train_loss": 3.968933269381523, "train_ppl": 52.92804075353285, "lr": 0.00056, "grad_norm": 0.7568, "tokens_per_sec": 149063, "dt_s": 4.397, "eta_s": 33938, "world_size": 1, "timestamp": "2026-05-04T22:24:35.634474"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16120, "epoch": 0, "train_loss": 3.8537335246801376, "train_ppl": 47.16884092394256, "lr": 0.00056, "grad_norm": 0.699, "tokens_per_sec": 145313, "dt_s": 4.51, "eta_s": 33969, "world_size": 1, "timestamp": "2026-05-04T22:24:40.144463"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16130, "epoch": 0, "train_loss": 3.914083108305931, "train_ppl": 50.10311131738285, "lr": 0.00056, "grad_norm": 0.7442, "tokens_per_sec": 147991, "dt_s": 4.428, "eta_s": 34040, "world_size": 1, "timestamp": "2026-05-04T22:24:44.572841"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16140, "epoch": 0, "train_loss": 3.977212205529213, "train_ppl": 53.368047504051965, "lr": 0.00056, "grad_norm": 0.6993, "tokens_per_sec": 147465, "dt_s": 4.444, "eta_s": 34034, "world_size": 1, "timestamp": "2026-05-04T22:24:49.016991"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16150, "epoch": 0, "train_loss": 3.945328488945961, "train_ppl": 51.693316006317225, "lr": 0.00056, "grad_norm": 0.7598, "tokens_per_sec": 147004, "dt_s": 4.458, "eta_s": 34033, "world_size": 1, "timestamp": "2026-05-04T22:24:53.475096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16160, "epoch": 0, "train_loss": 3.9151224344968796, "train_ppl": 50.15521186326405, "lr": 0.00056, "grad_norm": 0.7141, "tokens_per_sec": 149671, "dt_s": 4.379, "eta_s": 34001, "world_size": 1, "timestamp": "2026-05-04T22:24:57.853779"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16170, "epoch": 0, "train_loss": 3.8234209418296814, "train_ppl": 45.760484818700704, "lr": 0.00056, "grad_norm": 0.7196, "tokens_per_sec": 132910, "dt_s": 4.931, "eta_s": 34640, "world_size": 1, "timestamp": "2026-05-04T22:25:02.784646"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16180, "epoch": 0, "train_loss": 3.9183536171913147, "train_ppl": 50.31753462188854, "lr": 0.00056, "grad_norm": 0.7695, "tokens_per_sec": 145150, "dt_s": 4.515, "eta_s": 34769, "world_size": 1, "timestamp": "2026-05-04T22:25:07.299731"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16190, "epoch": 0, "train_loss": 3.8536677807569504, "train_ppl": 47.16573996122387, "lr": 0.00056, "grad_norm": 0.6635, "tokens_per_sec": 147746, "dt_s": 4.436, "eta_s": 34751, "world_size": 1, "timestamp": "2026-05-04T22:25:11.735420"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16200, "epoch": 0, "train_loss": 3.964336484670639, "train_ppl": 52.68530028529057, "lr": 0.00056, "grad_norm": 0.6771, "tokens_per_sec": 147733, "dt_s": 4.436, "eta_s": 34713, "world_size": 1, "timestamp": "2026-05-04T22:25:16.171552"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16210, "epoch": 0, "train_loss": 3.9701984226703644, "train_ppl": 52.99504521488408, "lr": 0.00056, "grad_norm": 0.7915, "tokens_per_sec": 145731, "dt_s": 4.497, "eta_s": 34889, "world_size": 1, "timestamp": "2026-05-04T22:25:20.668609"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16220, "epoch": 0, "train_loss": 3.8388720750808716, "train_ppl": 46.47302678186485, "lr": 0.00056, "grad_norm": 0.6579, "tokens_per_sec": 148873, "dt_s": 4.402, "eta_s": 34076, "world_size": 1, "timestamp": "2026-05-04T22:25:25.070767"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16230, "epoch": 0, "train_loss": 3.92235204577446, "train_ppl": 50.519128451238274, "lr": 0.00056, "grad_norm": 0.7626, "tokens_per_sec": 146061, "dt_s": 4.487, "eta_s": 34029, "world_size": 1, "timestamp": "2026-05-04T22:25:29.557632"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16240, "epoch": 0, "train_loss": 3.84021957218647, "train_ppl": 46.53569126156241, "lr": 0.00056, "grad_norm": 0.7416, "tokens_per_sec": 148221, "dt_s": 4.422, "eta_s": 34003, "world_size": 1, "timestamp": "2026-05-04T22:25:33.979127"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16250, "epoch": 0, "train_loss": 3.7954466938972473, "train_ppl": 44.49810900044441, "lr": 0.00056, "grad_norm": 0.709, "tokens_per_sec": 148805, "dt_s": 4.404, "eta_s": 33949, "world_size": 1, "timestamp": "2026-05-04T22:25:38.383315"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16260, "epoch": 0, "train_loss": 3.8061920553445816, "train_ppl": 44.97883542950339, "lr": 0.00056, "grad_norm": 0.6799, "tokens_per_sec": 145354, "dt_s": 4.509, "eta_s": 33963, "world_size": 1, "timestamp": "2026-05-04T22:25:42.892007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16270, "epoch": 0, "train_loss": 3.995367407798767, "train_ppl": 54.345804028463355, "lr": 0.00056, "grad_norm": 0.9389, "tokens_per_sec": 149167, "dt_s": 4.393, "eta_s": 33945, "world_size": 1, "timestamp": "2026-05-04T22:25:47.285485"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16280, "epoch": 0, "train_loss": 3.93983793258667, "train_ppl": 51.41026869605716, "lr": 0.00056, "grad_norm": 0.6679, "tokens_per_sec": 148185, "dt_s": 4.423, "eta_s": 33842, "world_size": 1, "timestamp": "2026-05-04T22:25:51.708054"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16290, "epoch": 0, "train_loss": 3.8134548515081406, "train_ppl": 45.30669669650496, "lr": 0.00056, "grad_norm": 0.7007, "tokens_per_sec": 146820, "dt_s": 4.464, "eta_s": 33902, "world_size": 1, "timestamp": "2026-05-04T22:25:56.171741"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16300, "epoch": 0, "train_loss": 3.848532661795616, "train_ppl": 46.92415907976878, "lr": 0.00056, "grad_norm": 0.6875, "tokens_per_sec": 149286, "dt_s": 4.39, "eta_s": 33876, "world_size": 1, "timestamp": "2026-05-04T22:26:00.561698"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16310, "epoch": 0, "train_loss": 3.8344606757164, "train_ppl": 46.26846722972829, "lr": 0.00056, "grad_norm": 0.6515, "tokens_per_sec": 148669, "dt_s": 4.408, "eta_s": 33718, "world_size": 1, "timestamp": "2026-05-04T22:26:04.969871"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16320, "epoch": 0, "train_loss": 3.8983910530805588, "train_ppl": 49.32302709726013, "lr": 0.00056, "grad_norm": 0.7031, "tokens_per_sec": 146721, "dt_s": 4.467, "eta_s": 33826, "world_size": 1, "timestamp": "2026-05-04T22:26:09.436594"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16330, "epoch": 0, "train_loss": 3.887086033821106, "train_ppl": 48.76856930910069, "lr": 0.00056, "grad_norm": 0.6744, "tokens_per_sec": 148565, "dt_s": 4.411, "eta_s": 33804, "world_size": 1, "timestamp": "2026-05-04T22:26:13.847893"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16340, "epoch": 0, "train_loss": 3.8766712844371796, "train_ppl": 48.26329261415336, "lr": 0.00056, "grad_norm": 0.7578, "tokens_per_sec": 147921, "dt_s": 4.43, "eta_s": 33749, "world_size": 1, "timestamp": "2026-05-04T22:26:18.278328"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16350, "epoch": 0, "train_loss": 3.9049886614084244, "train_ppl": 49.64951695339091, "lr": 0.00056, "grad_norm": 0.7237, "tokens_per_sec": 145225, "dt_s": 4.513, "eta_s": 33932, "world_size": 1, "timestamp": "2026-05-04T22:26:22.791039"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16360, "epoch": 0, "train_loss": 3.8925327360630035, "train_ppl": 49.03492189803213, "lr": 0.00056, "grad_norm": 0.7126, "tokens_per_sec": 148530, "dt_s": 4.412, "eta_s": 33933, "world_size": 1, "timestamp": "2026-05-04T22:26:27.203364"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16370, "epoch": 0, "train_loss": 3.8630745857954025, "train_ppl": 47.611512240877325, "lr": 0.00056, "grad_norm": 0.6825, "tokens_per_sec": 144061, "dt_s": 4.549, "eta_s": 34055, "world_size": 1, "timestamp": "2026-05-04T22:26:31.752534"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16380, "epoch": 0, "train_loss": 3.921002224087715, "train_ppl": 50.45098263875404, "lr": 0.00056, "grad_norm": 0.7142, "tokens_per_sec": 149451, "dt_s": 4.385, "eta_s": 34011, "world_size": 1, "timestamp": "2026-05-04T22:26:36.137651"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16390, "epoch": 0, "train_loss": 3.808152064681053, "train_ppl": 45.06708081953246, "lr": 0.00056, "grad_norm": 0.7292, "tokens_per_sec": 148725, "dt_s": 4.407, "eta_s": 33970, "world_size": 1, "timestamp": "2026-05-04T22:26:40.544193"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16400, "epoch": 0, "train_loss": 3.9792351871728897, "train_ppl": 53.47611936134669, "lr": 0.00056, "grad_norm": 0.7562, "tokens_per_sec": 143552, "dt_s": 4.565, "eta_s": 34045, "world_size": 1, "timestamp": "2026-05-04T22:26:45.109512"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16410, "epoch": 0, "train_loss": 3.947651192545891, "train_ppl": 51.813523807000166, "lr": 0.00056, "grad_norm": 0.6956, "tokens_per_sec": 149105, "dt_s": 4.395, "eta_s": 34015, "world_size": 1, "timestamp": "2026-05-04T22:26:49.504783"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16420, "epoch": 0, "train_loss": 3.939784198999405, "train_ppl": 51.40750631211492, "lr": 0.00056, "grad_norm": 0.703, "tokens_per_sec": 149642, "dt_s": 4.38, "eta_s": 33752, "world_size": 1, "timestamp": "2026-05-04T22:26:53.884306"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16430, "epoch": 0, "train_loss": 3.9182043224573135, "train_ppl": 50.31002303967536, "lr": 0.00056, "grad_norm": 0.6916, "tokens_per_sec": 143154, "dt_s": 4.578, "eta_s": 34041, "world_size": 1, "timestamp": "2026-05-04T22:26:58.462384"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16440, "epoch": 0, "train_loss": 3.8793456703424454, "train_ppl": 48.39254003536804, "lr": 0.00056, "grad_norm": 0.6629, "tokens_per_sec": 148707, "dt_s": 4.407, "eta_s": 34038, "world_size": 1, "timestamp": "2026-05-04T22:27:02.869384"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16450, "epoch": 0, "train_loss": 3.919923484325409, "train_ppl": 50.39658850156828, "lr": 0.00056, "grad_norm": 0.6831, "tokens_per_sec": 147875, "dt_s": 4.432, "eta_s": 33830, "world_size": 1, "timestamp": "2026-05-04T22:27:07.301238"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16460, "epoch": 0, "train_loss": 4.021667718887329, "train_ppl": 55.79407711646765, "lr": 0.00056, "grad_norm": 0.7529, "tokens_per_sec": 146638, "dt_s": 4.469, "eta_s": 33938, "world_size": 1, "timestamp": "2026-05-04T22:27:11.770482"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16470, "epoch": 0, "train_loss": 3.9256933331489563, "train_ppl": 50.68820969450994, "lr": 0.00056, "grad_norm": 0.7047, "tokens_per_sec": 134146, "dt_s": 4.885, "eta_s": 34705, "world_size": 1, "timestamp": "2026-05-04T22:27:16.655911"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16480, "epoch": 0, "train_loss": 3.863893538713455, "train_ppl": 47.65051979825621, "lr": 0.00056, "grad_norm": 0.7026, "tokens_per_sec": 144885, "dt_s": 4.523, "eta_s": 34617, "world_size": 1, "timestamp": "2026-05-04T22:27:21.179201"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16490, "epoch": 0, "train_loss": 3.911473721265793, "train_ppl": 49.97254333332875, "lr": 0.00056, "grad_norm": 0.6869, "tokens_per_sec": 149522, "dt_s": 4.383, "eta_s": 34576, "world_size": 1, "timestamp": "2026-05-04T22:27:25.562242"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16500, "epoch": 0, "train_loss": 3.8471760153770447, "train_ppl": 46.86054274959907, "lr": 0.00056, "grad_norm": 0.7293, "tokens_per_sec": 148901, "dt_s": 4.401, "eta_s": 34525, "world_size": 1, "timestamp": "2026-05-04T22:27:29.963554"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16510, "epoch": 0, "train_loss": 3.97986002266407, "train_ppl": 53.50954357988581, "lr": 0.00056, "grad_norm": 0.7256, "tokens_per_sec": 122801, "dt_s": 5.337, "eta_s": 34673, "world_size": 1, "timestamp": "2026-05-04T22:27:35.300288"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16520, "epoch": 0, "train_loss": 3.915615275502205, "train_ppl": 50.17993650045812, "lr": 0.00056, "grad_norm": 0.7334, "tokens_per_sec": 149288, "dt_s": 4.39, "eta_s": 33914, "world_size": 1, "timestamp": "2026-05-04T22:27:39.690214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16530, "epoch": 0, "train_loss": 3.9134874790906906, "train_ppl": 50.07327732638812, "lr": 0.00056, "grad_norm": 0.6939, "tokens_per_sec": 148039, "dt_s": 4.427, "eta_s": 33762, "world_size": 1, "timestamp": "2026-05-04T22:27:44.117146"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16540, "epoch": 0, "train_loss": 3.829877629876137, "train_ppl": 46.05690190057175, "lr": 0.00056, "grad_norm": 0.7011, "tokens_per_sec": 146282, "dt_s": 4.48, "eta_s": 33906, "world_size": 1, "timestamp": "2026-05-04T22:27:48.597264"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16550, "epoch": 0, "train_loss": 3.907262936234474, "train_ppl": 49.76256209906252, "lr": 0.00056, "grad_norm": 0.7735, "tokens_per_sec": 148935, "dt_s": 4.4, "eta_s": 33900, "world_size": 1, "timestamp": "2026-05-04T22:27:52.997576"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16560, "epoch": 0, "train_loss": 3.9145237803459167, "train_ppl": 50.12519522317924, "lr": 0.00056, "grad_norm": 0.6758, "tokens_per_sec": 148663, "dt_s": 4.408, "eta_s": 33650, "world_size": 1, "timestamp": "2026-05-04T22:27:57.405939"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16570, "epoch": 0, "train_loss": 3.9962000101804733, "train_ppl": 54.39107131654409, "lr": 0.00056, "grad_norm": 0.6932, "tokens_per_sec": 146660, "dt_s": 4.469, "eta_s": 33765, "world_size": 1, "timestamp": "2026-05-04T22:28:01.874496"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16580, "epoch": 0, "train_loss": 3.854306936264038, "train_ppl": 47.195895839787276, "lr": 0.00056, "grad_norm": 0.783, "tokens_per_sec": 149713, "dt_s": 4.377, "eta_s": 33686, "world_size": 1, "timestamp": "2026-05-04T22:28:06.251942"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16590, "epoch": 0, "train_loss": 3.8961073011159897, "train_ppl": 49.210514062062046, "lr": 0.00056, "grad_norm": 0.8501, "tokens_per_sec": 147051, "dt_s": 4.457, "eta_s": 33645, "world_size": 1, "timestamp": "2026-05-04T22:28:10.708625"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16600, "epoch": 0, "train_loss": 3.9129786491394043, "train_ppl": 50.047805024210106, "lr": 0.00056, "grad_norm": 0.6667, "tokens_per_sec": 147363, "dt_s": 4.447, "eta_s": 33713, "world_size": 1, "timestamp": "2026-05-04T22:28:15.155889"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16610, "epoch": 0, "train_loss": 3.9867369681596756, "train_ppl": 53.878793996848835, "lr": 0.00056, "grad_norm": 0.6653, "tokens_per_sec": 149088, "dt_s": 4.396, "eta_s": 33689, "world_size": 1, "timestamp": "2026-05-04T22:28:19.551670"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16620, "epoch": 0, "train_loss": 3.858221113681793, "train_ppl": 47.380990960748576, "lr": 0.00056, "grad_norm": 0.682, "tokens_per_sec": 144908, "dt_s": 4.523, "eta_s": 33767, "world_size": 1, "timestamp": "2026-05-04T22:28:24.074282"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16630, "epoch": 0, "train_loss": 3.844427317380905, "train_ppl": 46.731914131244075, "lr": 0.00056, "grad_norm": 0.6757, "tokens_per_sec": 147963, "dt_s": 4.429, "eta_s": 33841, "world_size": 1, "timestamp": "2026-05-04T22:28:28.503470"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16640, "epoch": 0, "train_loss": 3.9300628900527954, "train_ppl": 50.9101793123894, "lr": 0.00056, "grad_norm": 0.7094, "tokens_per_sec": 148404, "dt_s": 4.416, "eta_s": 33775, "world_size": 1, "timestamp": "2026-05-04T22:28:32.919541"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16650, "epoch": 0, "train_loss": 3.9230326265096664, "train_ppl": 50.553522499457344, "lr": 0.00056, "grad_norm": 0.7328, "tokens_per_sec": 145420, "dt_s": 4.507, "eta_s": 33861, "world_size": 1, "timestamp": "2026-05-04T22:28:37.426205"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16660, "epoch": 0, "train_loss": 3.869057536125183, "train_ppl": 47.8972233993064, "lr": 0.00056, "grad_norm": 0.6985, "tokens_per_sec": 149385, "dt_s": 4.387, "eta_s": 33843, "world_size": 1, "timestamp": "2026-05-04T22:28:41.813277"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16670, "epoch": 0, "train_loss": 3.901932790875435, "train_ppl": 49.49802604374878, "lr": 0.00056, "grad_norm": 0.6805, "tokens_per_sec": 148193, "dt_s": 4.422, "eta_s": 33686, "world_size": 1, "timestamp": "2026-05-04T22:28:46.235597"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16680, "epoch": 0, "train_loss": 3.871585726737976, "train_ppl": 48.01846991245013, "lr": 0.00056, "grad_norm": 0.7847, "tokens_per_sec": 144778, "dt_s": 4.527, "eta_s": 33830, "world_size": 1, "timestamp": "2026-05-04T22:28:50.762287"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16690, "epoch": 0, "train_loss": 3.9411796182394028, "train_ppl": 51.47929140900754, "lr": 0.00056, "grad_norm": 0.6647, "tokens_per_sec": 149473, "dt_s": 4.384, "eta_s": 33777, "world_size": 1, "timestamp": "2026-05-04T22:28:55.146749"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16700, "epoch": 0, "train_loss": 3.797944039106369, "train_ppl": 44.60937501677002, "lr": 0.00056, "grad_norm": 0.742, "tokens_per_sec": 147986, "dt_s": 4.429, "eta_s": 33654, "world_size": 1, "timestamp": "2026-05-04T22:28:59.575272"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16710, "epoch": 0, "train_loss": 3.8775519281625748, "train_ppl": 48.30581410035286, "lr": 0.00056, "grad_norm": 0.6847, "tokens_per_sec": 146948, "dt_s": 4.46, "eta_s": 33760, "world_size": 1, "timestamp": "2026-05-04T22:29:04.035084"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16720, "epoch": 0, "train_loss": 3.8977984488010406, "train_ppl": 49.29380671923917, "lr": 0.00056, "grad_norm": 0.6751, "tokens_per_sec": 148708, "dt_s": 4.407, "eta_s": 33732, "world_size": 1, "timestamp": "2026-05-04T22:29:08.442104"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16730, "epoch": 0, "train_loss": 3.8357138335704803, "train_ppl": 46.32648526812107, "lr": 0.00056, "grad_norm": 0.7028, "tokens_per_sec": 147847, "dt_s": 4.433, "eta_s": 33585, "world_size": 1, "timestamp": "2026-05-04T22:29:12.874788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16740, "epoch": 0, "train_loss": 4.044257387518883, "train_ppl": 57.06879029833417, "lr": 0.00056, "grad_norm": 0.7888, "tokens_per_sec": 150005, "dt_s": 4.369, "eta_s": 33557, "world_size": 1, "timestamp": "2026-05-04T22:29:17.243722"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16750, "epoch": 0, "train_loss": 3.9794012010097504, "train_ppl": 53.48499787405987, "lr": 0.00056, "grad_norm": 0.7187, "tokens_per_sec": 149041, "dt_s": 4.397, "eta_s": 33505, "world_size": 1, "timestamp": "2026-05-04T22:29:21.640902"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16760, "epoch": 0, "train_loss": 3.814863994717598, "train_ppl": 45.37058532403872, "lr": 0.00056, "grad_norm": 0.6978, "tokens_per_sec": 131017, "dt_s": 5.002, "eta_s": 34324, "world_size": 1, "timestamp": "2026-05-04T22:29:26.643000"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16770, "epoch": 0, "train_loss": 3.8269353806972504, "train_ppl": 45.92159017679104, "lr": 0.00056, "grad_norm": 0.7042, "tokens_per_sec": 148688, "dt_s": 4.408, "eta_s": 34321, "world_size": 1, "timestamp": "2026-05-04T22:29:31.050609"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16780, "epoch": 0, "train_loss": 3.8820708245038986, "train_ppl": 48.52459702332749, "lr": 0.00056, "grad_norm": 0.7047, "tokens_per_sec": 149649, "dt_s": 4.379, "eta_s": 34235, "world_size": 1, "timestamp": "2026-05-04T22:29:35.429935"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16790, "epoch": 0, "train_loss": 3.8806147277355194, "train_ppl": 48.45399193081329, "lr": 0.00056, "grad_norm": 0.6961, "tokens_per_sec": 147394, "dt_s": 4.446, "eta_s": 34348, "world_size": 1, "timestamp": "2026-05-04T22:29:39.876263"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16800, "epoch": 0, "train_loss": 3.8295453935861588, "train_ppl": 46.041602667977294, "lr": 0.00056, "grad_norm": 0.7179, "tokens_per_sec": 150648, "dt_s": 4.35, "eta_s": 34272, "world_size": 1, "timestamp": "2026-05-04T22:29:44.226534"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16810, "epoch": 0, "train_loss": 3.8117586970329285, "train_ppl": 45.22991467563836, "lr": 0.00056, "grad_norm": 0.6946, "tokens_per_sec": 149045, "dt_s": 4.397, "eta_s": 33350, "world_size": 1, "timestamp": "2026-05-04T22:29:48.623566"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16820, "epoch": 0, "train_loss": 3.9056664407253265, "train_ppl": 49.683179775769794, "lr": 0.00056, "grad_norm": 0.7338, "tokens_per_sec": 146722, "dt_s": 4.467, "eta_s": 33435, "world_size": 1, "timestamp": "2026-05-04T22:29:53.090245"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16830, "epoch": 0, "train_loss": 3.9734856635332108, "train_ppl": 53.16953933801754, "lr": 0.00056, "grad_norm": 0.701, "tokens_per_sec": 147641, "dt_s": 4.439, "eta_s": 33521, "world_size": 1, "timestamp": "2026-05-04T22:29:57.529130"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16840, "epoch": 0, "train_loss": 3.808408573269844, "train_ppl": 45.07864239559279, "lr": 0.00056, "grad_norm": 0.6653, "tokens_per_sec": 147337, "dt_s": 4.448, "eta_s": 33519, "world_size": 1, "timestamp": "2026-05-04T22:30:01.977156"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16850, "epoch": 0, "train_loss": 3.9325647056102753, "train_ppl": 51.03770664944817, "lr": 0.00056, "grad_norm": 0.6728, "tokens_per_sec": 146813, "dt_s": 4.464, "eta_s": 33687, "world_size": 1, "timestamp": "2026-05-04T22:30:06.441079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16860, "epoch": 0, "train_loss": 3.967977210879326, "train_ppl": 52.87746263184, "lr": 0.00056, "grad_norm": 0.7082, "tokens_per_sec": 150141, "dt_s": 4.365, "eta_s": 33634, "world_size": 1, "timestamp": "2026-05-04T22:30:10.806029"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16870, "epoch": 0, "train_loss": 3.8702943921089172, "train_ppl": 47.956502018672516, "lr": 0.00056, "grad_norm": 0.685, "tokens_per_sec": 146834, "dt_s": 4.463, "eta_s": 33624, "world_size": 1, "timestamp": "2026-05-04T22:30:15.269306"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16880, "epoch": 0, "train_loss": 3.808332845568657, "train_ppl": 45.07522882288417, "lr": 0.00056, "grad_norm": 0.7082, "tokens_per_sec": 148143, "dt_s": 4.424, "eta_s": 33597, "world_size": 1, "timestamp": "2026-05-04T22:30:19.693130"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16890, "epoch": 0, "train_loss": 4.010426625609398, "train_ppl": 55.17040265006281, "lr": 0.00056, "grad_norm": 0.7103, "tokens_per_sec": 148814, "dt_s": 4.404, "eta_s": 33526, "world_size": 1, "timestamp": "2026-05-04T22:30:24.096988"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16900, "epoch": 0, "train_loss": 3.9543102085590363, "train_ppl": 52.15970221631375, "lr": 0.00056, "grad_norm": 0.7033, "tokens_per_sec": 146398, "dt_s": 4.477, "eta_s": 33540, "world_size": 1, "timestamp": "2026-05-04T22:30:28.573559"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16910, "epoch": 0, "train_loss": 3.9837965816259384, "train_ppl": 53.72060220301216, "lr": 0.00056, "grad_norm": 0.7641, "tokens_per_sec": 148394, "dt_s": 4.416, "eta_s": 33614, "world_size": 1, "timestamp": "2026-05-04T22:30:32.989908"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16920, "epoch": 0, "train_loss": 3.848731443285942, "train_ppl": 46.93348766118688, "lr": 0.00056, "grad_norm": 0.7041, "tokens_per_sec": 147682, "dt_s": 4.438, "eta_s": 33571, "world_size": 1, "timestamp": "2026-05-04T22:30:37.427554"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16930, "epoch": 0, "train_loss": 3.9517469704151154, "train_ppl": 52.026175681279135, "lr": 0.00056, "grad_norm": 0.6777, "tokens_per_sec": 147649, "dt_s": 4.439, "eta_s": 33589, "world_size": 1, "timestamp": "2026-05-04T22:30:41.866205"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16940, "epoch": 0, "train_loss": 3.809594765305519, "train_ppl": 45.132146048711554, "lr": 0.00056, "grad_norm": 0.6994, "tokens_per_sec": 149948, "dt_s": 4.371, "eta_s": 33534, "world_size": 1, "timestamp": "2026-05-04T22:30:46.236779"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16950, "epoch": 0, "train_loss": 3.9372610598802567, "train_ppl": 51.277961520412774, "lr": 0.00056, "grad_norm": 0.6738, "tokens_per_sec": 150422, "dt_s": 4.357, "eta_s": 33348, "world_size": 1, "timestamp": "2026-05-04T22:30:50.593577"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16960, "epoch": 0, "train_loss": 3.8484587520360947, "train_ppl": 46.92069105461747, "lr": 0.00056, "grad_norm": 0.7562, "tokens_per_sec": 146300, "dt_s": 4.48, "eta_s": 33439, "world_size": 1, "timestamp": "2026-05-04T22:30:55.073151"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16970, "epoch": 0, "train_loss": 3.893007293343544, "train_ppl": 49.058197299538385, "lr": 0.00056, "grad_norm": 0.723, "tokens_per_sec": 150092, "dt_s": 4.366, "eta_s": 33327, "world_size": 1, "timestamp": "2026-05-04T22:30:59.439526"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16980, "epoch": 0, "train_loss": 3.8332951366901398, "train_ppl": 46.21457094070233, "lr": 0.00056, "grad_norm": 0.7464, "tokens_per_sec": 147206, "dt_s": 4.452, "eta_s": 33343, "world_size": 1, "timestamp": "2026-05-04T22:31:03.891505"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 16990, "epoch": 0, "train_loss": 3.8137439489364624, "train_ppl": 45.31979663949435, "lr": 0.00056, "grad_norm": 0.6906, "tokens_per_sec": 148371, "dt_s": 4.417, "eta_s": 33409, "world_size": 1, "timestamp": "2026-05-04T22:31:08.308563"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17000, "epoch": 0, "train_loss": 3.6120780631899834, "train_ppl": 37.04295047988699, "lr": 0.00056, "grad_norm": 0.8306, "tokens_per_sec": 149626, "dt_s": 4.38, "eta_s": 33439, "world_size": 1, "timestamp": "2026-05-04T22:31:12.688562"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17010, "epoch": 0, "train_loss": 4.010932192206383, "train_ppl": 55.19830201468405, "lr": 0.00056, "grad_norm": 0.7065, "tokens_per_sec": 124997, "dt_s": 5.243, "eta_s": 33420, "world_size": 1, "timestamp": "2026-05-04T22:31:17.931554"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17020, "epoch": 0, "train_loss": 3.8126151263713837, "train_ppl": 45.2686674936989, "lr": 0.00056, "grad_norm": 0.709, "tokens_per_sec": 147510, "dt_s": 4.443, "eta_s": 33531, "world_size": 1, "timestamp": "2026-05-04T22:31:22.374369"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17030, "epoch": 0, "train_loss": 3.8615878522396088, "train_ppl": 47.5407792016071, "lr": 0.00056, "grad_norm": 0.7501, "tokens_per_sec": 145736, "dt_s": 4.497, "eta_s": 33595, "world_size": 1, "timestamp": "2026-05-04T22:31:26.871259"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17040, "epoch": 0, "train_loss": 3.949060544371605, "train_ppl": 51.88659877344736, "lr": 0.00056, "grad_norm": 0.745, "tokens_per_sec": 144543, "dt_s": 4.534, "eta_s": 33767, "world_size": 1, "timestamp": "2026-05-04T22:31:31.405273"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17050, "epoch": 0, "train_loss": 3.8249263912439346, "train_ppl": 45.82942679507522, "lr": 0.00056, "grad_norm": 0.6878, "tokens_per_sec": 147123, "dt_s": 4.455, "eta_s": 33876, "world_size": 1, "timestamp": "2026-05-04T22:31:35.859765"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17060, "epoch": 0, "train_loss": 3.8800756335258484, "train_ppl": 48.42787770397463, "lr": 0.00056, "grad_norm": 0.6955, "tokens_per_sec": 131308, "dt_s": 4.991, "eta_s": 34660, "world_size": 1, "timestamp": "2026-05-04T22:31:40.850768"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17070, "epoch": 0, "train_loss": 3.8676028549671173, "train_ppl": 47.82759886393132, "lr": 0.00056, "grad_norm": 1.1066, "tokens_per_sec": 144225, "dt_s": 4.544, "eta_s": 34808, "world_size": 1, "timestamp": "2026-05-04T22:31:45.394776"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17080, "epoch": 0, "train_loss": 3.8325654417276382, "train_ppl": 46.180860701684296, "lr": 0.00056, "grad_norm": 0.7276, "tokens_per_sec": 147311, "dt_s": 4.449, "eta_s": 34731, "world_size": 1, "timestamp": "2026-05-04T22:31:49.843590"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17090, "epoch": 0, "train_loss": 3.8058497458696365, "train_ppl": 44.9634413828782, "lr": 0.00056, "grad_norm": 0.7046, "tokens_per_sec": 144826, "dt_s": 4.525, "eta_s": 34713, "world_size": 1, "timestamp": "2026-05-04T22:31:54.368772"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17100, "epoch": 0, "train_loss": 3.9043892323970795, "train_ppl": 49.61976451065932, "lr": 0.00056, "grad_norm": 0.6877, "tokens_per_sec": 148256, "dt_s": 4.42, "eta_s": 34657, "world_size": 1, "timestamp": "2026-05-04T22:31:58.789223"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17110, "epoch": 0, "train_loss": 3.848334953188896, "train_ppl": 46.9148826866972, "lr": 0.00056, "grad_norm": 0.7606, "tokens_per_sec": 149331, "dt_s": 4.389, "eta_s": 33742, "world_size": 1, "timestamp": "2026-05-04T22:32:03.177850"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17120, "epoch": 0, "train_loss": 3.7859496772289276, "train_ppl": 44.077510096223584, "lr": 0.00056, "grad_norm": 0.6908, "tokens_per_sec": 146834, "dt_s": 4.463, "eta_s": 33615, "world_size": 1, "timestamp": "2026-05-04T22:32:07.641121"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17130, "epoch": 0, "train_loss": 4.012616962194443, "train_ppl": 55.29137684013086, "lr": 0.00056, "grad_norm": 0.6616, "tokens_per_sec": 148115, "dt_s": 4.425, "eta_s": 33574, "world_size": 1, "timestamp": "2026-05-04T22:32:12.065812"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17140, "epoch": 0, "train_loss": 3.9450184106826782, "train_ppl": 51.677289517527974, "lr": 0.00056, "grad_norm": 0.7364, "tokens_per_sec": 147456, "dt_s": 4.444, "eta_s": 33448, "world_size": 1, "timestamp": "2026-05-04T22:32:16.510265"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17150, "epoch": 0, "train_loss": 3.9042628407478333, "train_ppl": 49.61349338310504, "lr": 0.00056, "grad_norm": 0.7154, "tokens_per_sec": 145497, "dt_s": 4.504, "eta_s": 33570, "world_size": 1, "timestamp": "2026-05-04T22:32:21.014550"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17160, "epoch": 0, "train_loss": 3.90338534116745, "train_ppl": 49.56997665922583, "lr": 0.00056, "grad_norm": 0.6838, "tokens_per_sec": 148547, "dt_s": 4.412, "eta_s": 33601, "world_size": 1, "timestamp": "2026-05-04T22:32:25.426366"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17170, "epoch": 0, "train_loss": 3.970648929476738, "train_ppl": 53.018925222106475, "lr": 0.00056, "grad_norm": 0.7206, "tokens_per_sec": 147654, "dt_s": 4.438, "eta_s": 33559, "world_size": 1, "timestamp": "2026-05-04T22:32:29.864828"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17180, "epoch": 0, "train_loss": 3.7399917244911194, "train_ppl": 42.09764178414695, "lr": 0.00056, "grad_norm": 0.6658, "tokens_per_sec": 147542, "dt_s": 4.442, "eta_s": 33580, "world_size": 1, "timestamp": "2026-05-04T22:32:34.306679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17190, "epoch": 0, "train_loss": 3.8697746247053146, "train_ppl": 47.93158226893024, "lr": 0.00056, "grad_norm": 0.6977, "tokens_per_sec": 147253, "dt_s": 4.451, "eta_s": 33585, "world_size": 1, "timestamp": "2026-05-04T22:32:38.757256"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17200, "epoch": 0, "train_loss": 3.8307457864284515, "train_ppl": 46.0969038632066, "lr": 0.00056, "grad_norm": 0.6848, "tokens_per_sec": 147402, "dt_s": 4.446, "eta_s": 33493, "world_size": 1, "timestamp": "2026-05-04T22:32:43.203332"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17210, "epoch": 0, "train_loss": 3.884342059493065, "train_ppl": 48.63493303799984, "lr": 0.00056, "grad_norm": 0.7884, "tokens_per_sec": 147194, "dt_s": 4.452, "eta_s": 33549, "world_size": 1, "timestamp": "2026-05-04T22:32:47.655722"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17220, "epoch": 0, "train_loss": 3.869917333126068, "train_ppl": 47.938422997443226, "lr": 0.00056, "grad_norm": 0.6746, "tokens_per_sec": 149142, "dt_s": 4.394, "eta_s": 33478, "world_size": 1, "timestamp": "2026-05-04T22:32:52.049894"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17230, "epoch": 0, "train_loss": 3.918353497982025, "train_ppl": 50.31752862357135, "lr": 0.00056, "grad_norm": 0.7607, "tokens_per_sec": 144813, "dt_s": 4.526, "eta_s": 33600, "world_size": 1, "timestamp": "2026-05-04T22:32:56.575463"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17240, "epoch": 0, "train_loss": 3.96490915119648, "train_ppl": 52.715480033808916, "lr": 0.00056, "grad_norm": 0.7113, "tokens_per_sec": 147426, "dt_s": 4.445, "eta_s": 33588, "world_size": 1, "timestamp": "2026-05-04T22:33:01.020855"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17250, "epoch": 0, "train_loss": 3.8567578345537186, "train_ppl": 47.311710046627965, "lr": 0.00056, "grad_norm": 0.6993, "tokens_per_sec": 148867, "dt_s": 4.402, "eta_s": 33517, "world_size": 1, "timestamp": "2026-05-04T22:33:05.423148"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17260, "epoch": 0, "train_loss": 3.960305318236351, "train_ppl": 52.473344572606024, "lr": 0.00056, "grad_norm": 0.7335, "tokens_per_sec": 145028, "dt_s": 4.519, "eta_s": 33613, "world_size": 1, "timestamp": "2026-05-04T22:33:09.942017"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17270, "epoch": 0, "train_loss": 3.8450171798467636, "train_ppl": 46.759487664844585, "lr": 0.00056, "grad_norm": 0.6889, "tokens_per_sec": 149075, "dt_s": 4.396, "eta_s": 33611, "world_size": 1, "timestamp": "2026-05-04T22:33:14.338165"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17280, "epoch": 0, "train_loss": 3.8285044878721237, "train_ppl": 45.99370263471094, "lr": 0.00056, "grad_norm": 0.684, "tokens_per_sec": 149033, "dt_s": 4.397, "eta_s": 33414, "world_size": 1, "timestamp": "2026-05-04T22:33:18.735572"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17290, "epoch": 0, "train_loss": 3.8886848539114, "train_ppl": 48.84660384245398, "lr": 0.00056, "grad_norm": 0.6714, "tokens_per_sec": 145194, "dt_s": 4.514, "eta_s": 33512, "world_size": 1, "timestamp": "2026-05-04T22:33:23.249286"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17300, "epoch": 0, "train_loss": 3.97802671790123, "train_ppl": 53.41153414681448, "lr": 0.00056, "grad_norm": 0.7966, "tokens_per_sec": 148517, "dt_s": 4.413, "eta_s": 33524, "world_size": 1, "timestamp": "2026-05-04T22:33:27.661974"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17310, "epoch": 0, "train_loss": 4.134346142411232, "train_ppl": 62.448745132293645, "lr": 0.00056, "grad_norm": 0.7309, "tokens_per_sec": 148009, "dt_s": 4.428, "eta_s": 33382, "world_size": 1, "timestamp": "2026-05-04T22:33:32.089803"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17320, "epoch": 0, "train_loss": 3.8572562783956528, "train_ppl": 47.33529815533748, "lr": 0.00056, "grad_norm": 0.7077, "tokens_per_sec": 148940, "dt_s": 4.4, "eta_s": 33384, "world_size": 1, "timestamp": "2026-05-04T22:33:36.489989"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17330, "epoch": 0, "train_loss": 3.8930690735578537, "train_ppl": 49.061228219105665, "lr": 0.00056, "grad_norm": 0.7843, "tokens_per_sec": 147374, "dt_s": 4.447, "eta_s": 33454, "world_size": 1, "timestamp": "2026-05-04T22:33:40.936870"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17340, "epoch": 0, "train_loss": 3.8461032807826996, "train_ppl": 46.81030077725074, "lr": 0.00056, "grad_norm": 0.7257, "tokens_per_sec": 146547, "dt_s": 4.472, "eta_s": 33386, "world_size": 1, "timestamp": "2026-05-04T22:33:45.408878"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17350, "epoch": 0, "train_loss": 3.7887022495269775, "train_ppl": 44.199003762822386, "lr": 0.00056, "grad_norm": 0.6923, "tokens_per_sec": 133099, "dt_s": 4.924, "eta_s": 34152, "world_size": 1, "timestamp": "2026-05-04T22:33:50.332727"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17360, "epoch": 0, "train_loss": 3.788687452673912, "train_ppl": 44.19834976149666, "lr": 0.00056, "grad_norm": 0.7131, "tokens_per_sec": 147482, "dt_s": 4.444, "eta_s": 34172, "world_size": 1, "timestamp": "2026-05-04T22:33:54.776386"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17370, "epoch": 0, "train_loss": 3.8112493455410004, "train_ppl": 45.20688261732313, "lr": 0.00056, "grad_norm": 0.7263, "tokens_per_sec": 147045, "dt_s": 4.457, "eta_s": 34252, "world_size": 1, "timestamp": "2026-05-04T22:33:59.233277"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17380, "epoch": 0, "train_loss": 3.8728990256786346, "train_ppl": 48.081573946278674, "lr": 0.00056, "grad_norm": 0.8014, "tokens_per_sec": 149071, "dt_s": 4.396, "eta_s": 34172, "world_size": 1, "timestamp": "2026-05-04T22:34:03.629530"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17390, "epoch": 0, "train_loss": 4.0730694234371185, "train_ppl": 58.736974830841, "lr": 0.00056, "grad_norm": 0.748, "tokens_per_sec": 149526, "dt_s": 4.383, "eta_s": 34033, "world_size": 1, "timestamp": "2026-05-04T22:34:08.012460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17400, "epoch": 0, "train_loss": 3.828721582889557, "train_ppl": 46.00368872231195, "lr": 0.00056, "grad_norm": 0.6737, "tokens_per_sec": 145478, "dt_s": 4.505, "eta_s": 33398, "world_size": 1, "timestamp": "2026-05-04T22:34:12.517346"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17410, "epoch": 0, "train_loss": 3.806944876909256, "train_ppl": 45.0127092156272, "lr": 0.00056, "grad_norm": 0.7943, "tokens_per_sec": 149557, "dt_s": 4.382, "eta_s": 33300, "world_size": 1, "timestamp": "2026-05-04T22:34:16.899359"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17420, "epoch": 0, "train_loss": 3.9913168102502823, "train_ppl": 54.1261162815939, "lr": 0.00056, "grad_norm": 0.712, "tokens_per_sec": 148165, "dt_s": 4.423, "eta_s": 33245, "world_size": 1, "timestamp": "2026-05-04T22:34:21.322532"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17430, "epoch": 0, "train_loss": 3.8178178519010544, "train_ppl": 45.50480168380546, "lr": 0.00056, "grad_norm": 0.6647, "tokens_per_sec": 146385, "dt_s": 4.477, "eta_s": 33362, "world_size": 1, "timestamp": "2026-05-04T22:34:25.799485"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17440, "epoch": 0, "train_loss": 3.737695813179016, "train_ppl": 42.001100200026954, "lr": 0.00056, "grad_norm": 0.7179, "tokens_per_sec": 146916, "dt_s": 4.461, "eta_s": 33475, "world_size": 1, "timestamp": "2026-05-04T22:34:30.260282"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17450, "epoch": 0, "train_loss": 3.8455292731523514, "train_ppl": 46.78343901759075, "lr": 0.00056, "grad_norm": 1.0621, "tokens_per_sec": 148007, "dt_s": 4.428, "eta_s": 33355, "world_size": 1, "timestamp": "2026-05-04T22:34:34.688154"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17460, "epoch": 0, "train_loss": 3.9434862434864044, "train_ppl": 51.59817189592673, "lr": 0.00056, "grad_norm": 0.7728, "tokens_per_sec": 149216, "dt_s": 4.392, "eta_s": 33365, "world_size": 1, "timestamp": "2026-05-04T22:34:39.080191"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17470, "epoch": 0, "train_loss": 3.960460603237152, "train_ppl": 52.48149352864896, "lr": 0.00056, "grad_norm": 0.7373, "tokens_per_sec": 148184, "dt_s": 4.423, "eta_s": 33360, "world_size": 1, "timestamp": "2026-05-04T22:34:43.502814"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17480, "epoch": 0, "train_loss": 3.8997141420841217, "train_ppl": 49.388329042651606, "lr": 0.00056, "grad_norm": 0.658, "tokens_per_sec": 147265, "dt_s": 4.45, "eta_s": 33315, "world_size": 1, "timestamp": "2026-05-04T22:34:47.953035"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17490, "epoch": 0, "train_loss": 4.045931652188301, "train_ppl": 57.164418588852755, "lr": 0.00056, "grad_norm": 0.7314, "tokens_per_sec": 149642, "dt_s": 4.38, "eta_s": 33189, "world_size": 1, "timestamp": "2026-05-04T22:34:52.332516"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17500, "epoch": 0, "train_loss": 3.811848297715187, "train_ppl": 45.23396748841648, "lr": 0.00056, "grad_norm": 0.7333, "tokens_per_sec": 148371, "dt_s": 4.417, "eta_s": 33168, "world_size": 1, "timestamp": "2026-05-04T22:34:56.749532"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17510, "epoch": 0, "train_loss": 3.929090678691864, "train_ppl": 50.860707909901286, "lr": 0.00056, "grad_norm": 0.6815, "tokens_per_sec": 107022, "dt_s": 6.124, "eta_s": 33336, "world_size": 1, "timestamp": "2026-05-04T22:35:02.873142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17520, "epoch": 0, "train_loss": 4.03249229490757, "train_ppl": 56.40130490825598, "lr": 0.00056, "grad_norm": 0.7136, "tokens_per_sec": 143170, "dt_s": 4.577, "eta_s": 33564, "world_size": 1, "timestamp": "2026-05-04T22:35:07.450651"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17530, "epoch": 0, "train_loss": 3.945536822080612, "train_ppl": 51.70408655877386, "lr": 0.00056, "grad_norm": 0.713, "tokens_per_sec": 145302, "dt_s": 4.51, "eta_s": 33650, "world_size": 1, "timestamp": "2026-05-04T22:35:11.960974"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17540, "epoch": 0, "train_loss": 3.9805732369422913, "train_ppl": 53.54772096309716, "lr": 0.00056, "grad_norm": 0.8752, "tokens_per_sec": 148491, "dt_s": 4.413, "eta_s": 33697, "world_size": 1, "timestamp": "2026-05-04T22:35:16.374422"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17550, "epoch": 0, "train_loss": 3.825395181775093, "train_ppl": 45.850916233034134, "lr": 0.00056, "grad_norm": 0.6611, "tokens_per_sec": 147841, "dt_s": 4.433, "eta_s": 33716, "world_size": 1, "timestamp": "2026-05-04T22:35:20.807324"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17560, "epoch": 0, "train_loss": 3.9224807024002075, "train_ppl": 50.525628489968085, "lr": 0.00056, "grad_norm": 0.7045, "tokens_per_sec": 146725, "dt_s": 4.467, "eta_s": 33651, "world_size": 1, "timestamp": "2026-05-04T22:35:25.273896"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17570, "epoch": 0, "train_loss": 3.829851657152176, "train_ppl": 46.05570569290665, "lr": 0.00056, "grad_norm": 0.6718, "tokens_per_sec": 149086, "dt_s": 4.396, "eta_s": 33374, "world_size": 1, "timestamp": "2026-05-04T22:35:29.669779"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17580, "epoch": 0, "train_loss": 3.920384392142296, "train_ppl": 50.419822037000834, "lr": 0.00056, "grad_norm": 0.7023, "tokens_per_sec": 146531, "dt_s": 4.472, "eta_s": 33313, "world_size": 1, "timestamp": "2026-05-04T22:35:34.142276"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17590, "epoch": 0, "train_loss": 3.847725123167038, "train_ppl": 46.88628130464001, "lr": 0.00056, "grad_norm": 0.7012, "tokens_per_sec": 144988, "dt_s": 4.52, "eta_s": 33469, "world_size": 1, "timestamp": "2026-05-04T22:35:38.662376"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17600, "epoch": 0, "train_loss": 3.9357682168483734, "train_ppl": 51.2014686829734, "lr": 0.00056, "grad_norm": 0.6673, "tokens_per_sec": 148232, "dt_s": 4.421, "eta_s": 33446, "world_size": 1, "timestamp": "2026-05-04T22:35:43.083563"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17610, "epoch": 0, "train_loss": 3.9466234147548676, "train_ppl": 51.76029837460063, "lr": 0.00056, "grad_norm": 0.7774, "tokens_per_sec": 146288, "dt_s": 4.48, "eta_s": 33462, "world_size": 1, "timestamp": "2026-05-04T22:35:47.563469"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17620, "epoch": 0, "train_loss": 3.8001265078783035, "train_ppl": 44.706839903028126, "lr": 0.00056, "grad_norm": 0.6556, "tokens_per_sec": 145402, "dt_s": 4.507, "eta_s": 33625, "world_size": 1, "timestamp": "2026-05-04T22:35:52.070709"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17630, "epoch": 0, "train_loss": 3.9239415377378464, "train_ppl": 50.59949205163752, "lr": 0.00056, "grad_norm": 0.7176, "tokens_per_sec": 147550, "dt_s": 4.442, "eta_s": 33574, "world_size": 1, "timestamp": "2026-05-04T22:35:56.512327"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17640, "epoch": 0, "train_loss": 3.833761692047119, "train_ppl": 46.236137626980515, "lr": 0.00056, "grad_norm": 0.6682, "tokens_per_sec": 146678, "dt_s": 4.468, "eta_s": 33491, "world_size": 1, "timestamp": "2026-05-04T22:36:00.980332"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17650, "epoch": 0, "train_loss": 3.8745741695165634, "train_ppl": 48.16218499729512, "lr": 0.00056, "grad_norm": 0.6639, "tokens_per_sec": 132051, "dt_s": 4.963, "eta_s": 34300, "world_size": 1, "timestamp": "2026-05-04T22:36:05.943275"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17660, "epoch": 0, "train_loss": 3.9134204536676407, "train_ppl": 50.06992125626409, "lr": 0.00056, "grad_norm": 0.6852, "tokens_per_sec": 147110, "dt_s": 4.455, "eta_s": 34258, "world_size": 1, "timestamp": "2026-05-04T22:36:10.398160"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17670, "epoch": 0, "train_loss": 3.8133187890052795, "train_ppl": 45.30053257331859, "lr": 0.00056, "grad_norm": 0.7256, "tokens_per_sec": 146501, "dt_s": 4.473, "eta_s": 34202, "world_size": 1, "timestamp": "2026-05-04T22:36:14.871593"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17680, "epoch": 0, "train_loss": 3.766104057431221, "train_ppl": 43.211387386802286, "lr": 0.00056, "grad_norm": 0.6712, "tokens_per_sec": 149593, "dt_s": 4.381, "eta_s": 34107, "world_size": 1, "timestamp": "2026-05-04T22:36:19.252534"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17690, "epoch": 0, "train_loss": 3.9702237099409103, "train_ppl": 52.9963853318739, "lr": 0.00056, "grad_norm": 0.7219, "tokens_per_sec": 148070, "dt_s": 4.426, "eta_s": 34039, "world_size": 1, "timestamp": "2026-05-04T22:36:23.678554"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17700, "epoch": 0, "train_loss": 3.7718621641397476, "train_ppl": 43.46092089723278, "lr": 0.00056, "grad_norm": 0.6514, "tokens_per_sec": 147107, "dt_s": 4.455, "eta_s": 33273, "world_size": 1, "timestamp": "2026-05-04T22:36:28.133589"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17710, "epoch": 0, "train_loss": 3.881209507584572, "train_ppl": 48.48281996113579, "lr": 0.00056, "grad_norm": 0.7981, "tokens_per_sec": 149234, "dt_s": 4.391, "eta_s": 33173, "world_size": 1, "timestamp": "2026-05-04T22:36:32.525018"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17720, "epoch": 0, "train_loss": 3.898788869380951, "train_ppl": 49.342652504818794, "lr": 0.00056, "grad_norm": 0.7919, "tokens_per_sec": 147371, "dt_s": 4.447, "eta_s": 33129, "world_size": 1, "timestamp": "2026-05-04T22:36:36.972013"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17730, "epoch": 0, "train_loss": 3.7915693819522858, "train_ppl": 44.325910001237155, "lr": 0.00056, "grad_norm": 0.6706, "tokens_per_sec": 146005, "dt_s": 4.489, "eta_s": 33286, "world_size": 1, "timestamp": "2026-05-04T22:36:41.460658"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17740, "epoch": 0, "train_loss": 3.9752707332372665, "train_ppl": 53.26453543396967, "lr": 0.00056, "grad_norm": 0.7613, "tokens_per_sec": 148312, "dt_s": 4.419, "eta_s": 33271, "world_size": 1, "timestamp": "2026-05-04T22:36:45.879441"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17750, "epoch": 0, "train_loss": 3.865003526210785, "train_ppl": 47.70344064477901, "lr": 0.00056, "grad_norm": 0.6845, "tokens_per_sec": 147322, "dt_s": 4.448, "eta_s": 33257, "world_size": 1, "timestamp": "2026-05-04T22:36:50.327901"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17760, "epoch": 0, "train_loss": 3.878524512052536, "train_ppl": 48.352818411050805, "lr": 0.00056, "grad_norm": 0.6612, "tokens_per_sec": 147888, "dt_s": 4.431, "eta_s": 33312, "world_size": 1, "timestamp": "2026-05-04T22:36:54.759399"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17770, "epoch": 0, "train_loss": 3.8745872229337692, "train_ppl": 48.16281368249267, "lr": 0.00056, "grad_norm": 0.7046, "tokens_per_sec": 149854, "dt_s": 4.373, "eta_s": 33198, "world_size": 1, "timestamp": "2026-05-04T22:36:59.132704"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17780, "epoch": 0, "train_loss": 3.8294326066970825, "train_ppl": 46.03641007167818, "lr": 0.00056, "grad_norm": 0.7174, "tokens_per_sec": 145572, "dt_s": 4.502, "eta_s": 33213, "world_size": 1, "timestamp": "2026-05-04T22:37:03.634668"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17790, "epoch": 0, "train_loss": 3.976505860686302, "train_ppl": 53.3303645690614, "lr": 0.00056, "grad_norm": 0.699, "tokens_per_sec": 148627, "dt_s": 4.409, "eta_s": 33195, "world_size": 1, "timestamp": "2026-05-04T22:37:08.044095"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17800, "epoch": 0, "train_loss": 3.8058874905109406, "train_ppl": 44.96513854387416, "lr": 0.00056, "grad_norm": 0.6789, "tokens_per_sec": 147622, "dt_s": 4.439, "eta_s": 33177, "world_size": 1, "timestamp": "2026-05-04T22:37:12.483535"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17810, "epoch": 0, "train_loss": 3.9710620641708374, "train_ppl": 53.040833704825125, "lr": 0.00056, "grad_norm": 0.7283, "tokens_per_sec": 145464, "dt_s": 4.505, "eta_s": 33283, "world_size": 1, "timestamp": "2026-05-04T22:37:16.988872"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17820, "epoch": 0, "train_loss": 3.8172745406627655, "train_ppl": 45.480085128653634, "lr": 0.00056, "grad_norm": 0.7053, "tokens_per_sec": 148830, "dt_s": 4.403, "eta_s": 33324, "world_size": 1, "timestamp": "2026-05-04T22:37:21.392288"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17830, "epoch": 0, "train_loss": 3.921143025159836, "train_ppl": 50.45808669131649, "lr": 0.00056, "grad_norm": 0.6956, "tokens_per_sec": 147294, "dt_s": 4.449, "eta_s": 33240, "world_size": 1, "timestamp": "2026-05-04T22:37:25.841608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17840, "epoch": 0, "train_loss": 3.87259541451931, "train_ppl": 48.06697805971983, "lr": 0.00056, "grad_norm": 0.6945, "tokens_per_sec": 143635, "dt_s": 4.563, "eta_s": 33465, "world_size": 1, "timestamp": "2026-05-04T22:37:30.404307"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17850, "epoch": 0, "train_loss": 3.865501806139946, "train_ppl": 47.727216234761954, "lr": 0.00056, "grad_norm": 0.7148, "tokens_per_sec": 148728, "dt_s": 4.406, "eta_s": 33411, "world_size": 1, "timestamp": "2026-05-04T22:37:34.810717"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17860, "epoch": 0, "train_loss": 3.984243616461754, "train_ppl": 53.744622552164294, "lr": 0.00056, "grad_norm": 0.7319, "tokens_per_sec": 147739, "dt_s": 4.436, "eta_s": 33303, "world_size": 1, "timestamp": "2026-05-04T22:37:39.246642"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17870, "epoch": 0, "train_loss": 3.943801134824753, "train_ppl": 51.61442227174839, "lr": 0.00056, "grad_norm": 0.7122, "tokens_per_sec": 146926, "dt_s": 4.46, "eta_s": 33384, "world_size": 1, "timestamp": "2026-05-04T22:37:43.707131"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17880, "epoch": 0, "train_loss": 3.962913855910301, "train_ppl": 52.61040195075215, "lr": 0.00056, "grad_norm": 0.6856, "tokens_per_sec": 149282, "dt_s": 4.39, "eta_s": 33291, "world_size": 1, "timestamp": "2026-05-04T22:37:48.097234"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17890, "epoch": 0, "train_loss": 3.9206640869379044, "train_ppl": 50.4339261711546, "lr": 0.00056, "grad_norm": 0.6991, "tokens_per_sec": 146621, "dt_s": 4.47, "eta_s": 33147, "world_size": 1, "timestamp": "2026-05-04T22:37:52.566956"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17900, "epoch": 0, "train_loss": 3.9173459708690643, "train_ppl": 50.26685787958648, "lr": 0.00056, "grad_norm": 0.7484, "tokens_per_sec": 148448, "dt_s": 4.415, "eta_s": 33155, "world_size": 1, "timestamp": "2026-05-04T22:37:56.981732"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17910, "epoch": 0, "train_loss": 3.9605198204517365, "train_ppl": 52.484601428532635, "lr": 0.00056, "grad_norm": 0.6774, "tokens_per_sec": 149781, "dt_s": 4.375, "eta_s": 33061, "world_size": 1, "timestamp": "2026-05-04T22:38:01.357197"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17920, "epoch": 0, "train_loss": 3.8138908743858337, "train_ppl": 45.3264557601662, "lr": 0.00056, "grad_norm": 0.6801, "tokens_per_sec": 144992, "dt_s": 4.52, "eta_s": 33145, "world_size": 1, "timestamp": "2026-05-04T22:38:05.877156"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17930, "epoch": 0, "train_loss": 3.811489000916481, "train_ppl": 45.21771798807666, "lr": 0.00056, "grad_norm": 0.7172, "tokens_per_sec": 150008, "dt_s": 4.369, "eta_s": 33109, "world_size": 1, "timestamp": "2026-05-04T22:38:10.245988"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17940, "epoch": 0, "train_loss": 3.8636959344148636, "train_ppl": 47.64110478096853, "lr": 0.00056, "grad_norm": 0.6796, "tokens_per_sec": 134063, "dt_s": 4.888, "eta_s": 33730, "world_size": 1, "timestamp": "2026-05-04T22:38:15.134428"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17950, "epoch": 0, "train_loss": 3.8075502067804337, "train_ppl": 45.0399650016365, "lr": 0.00056, "grad_norm": 0.7024, "tokens_per_sec": 144074, "dt_s": 4.549, "eta_s": 33926, "world_size": 1, "timestamp": "2026-05-04T22:38:19.683225"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17960, "epoch": 0, "train_loss": 3.8311124444007874, "train_ppl": 46.11380875947614, "lr": 0.00056, "grad_norm": 0.7043, "tokens_per_sec": 148829, "dt_s": 4.403, "eta_s": 33963, "world_size": 1, "timestamp": "2026-05-04T22:38:24.086659"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17970, "epoch": 0, "train_loss": 3.876241236925125, "train_ppl": 48.24254156752888, "lr": 0.00056, "grad_norm": 0.6949, "tokens_per_sec": 147181, "dt_s": 4.453, "eta_s": 33858, "world_size": 1, "timestamp": "2026-05-04T22:38:28.539418"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17980, "epoch": 0, "train_loss": 4.036571830511093, "train_ppl": 56.63186601106676, "lr": 0.00056, "grad_norm": 0.7166, "tokens_per_sec": 147871, "dt_s": 4.432, "eta_s": 33948, "world_size": 1, "timestamp": "2026-05-04T22:38:32.971396"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 17990, "epoch": 0, "train_loss": 3.8810167759656906, "train_ppl": 48.47347668915764, "lr": 0.00056, "grad_norm": 0.6839, "tokens_per_sec": 148865, "dt_s": 4.402, "eta_s": 33218, "world_size": 1, "timestamp": "2026-05-04T22:38:37.373755"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18000, "epoch": 0, "train_loss": 3.873235821723938, "train_ppl": 48.09777035752627, "lr": 0.00056, "grad_norm": 0.7094, "tokens_per_sec": 146213, "dt_s": 4.482, "eta_s": 33114, "world_size": 1, "timestamp": "2026-05-04T22:38:41.855997"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18010, "epoch": 0, "train_loss": 3.8144625425338745, "train_ppl": 45.352374859043614, "lr": 0.00056, "grad_norm": 0.6723, "tokens_per_sec": 125160, "dt_s": 5.236, "eta_s": 33178, "world_size": 1, "timestamp": "2026-05-04T22:38:47.092185"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18020, "epoch": 0, "train_loss": 3.7303274273872375, "train_ppl": 41.69275727994307, "lr": 0.00056, "grad_norm": 0.738, "tokens_per_sec": 146762, "dt_s": 4.465, "eta_s": 33193, "world_size": 1, "timestamp": "2026-05-04T22:38:51.557618"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18030, "epoch": 0, "train_loss": 3.8323487043380737, "train_ppl": 46.17085266708479, "lr": 0.00056, "grad_norm": 0.6878, "tokens_per_sec": 142852, "dt_s": 4.588, "eta_s": 33421, "world_size": 1, "timestamp": "2026-05-04T22:38:56.145347"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18040, "epoch": 0, "train_loss": 3.771453395485878, "train_ppl": 43.443159065589015, "lr": 0.00056, "grad_norm": 0.7309, "tokens_per_sec": 146722, "dt_s": 4.467, "eta_s": 33513, "world_size": 1, "timestamp": "2026-05-04T22:39:00.612040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18050, "epoch": 0, "train_loss": 3.870547339320183, "train_ppl": 47.96863401643301, "lr": 0.00056, "grad_norm": 0.6633, "tokens_per_sec": 148330, "dt_s": 4.418, "eta_s": 33412, "world_size": 1, "timestamp": "2026-05-04T22:39:05.030288"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18060, "epoch": 0, "train_loss": 3.8322082608938217, "train_ppl": 46.16436872883614, "lr": 0.00056, "grad_norm": 0.6684, "tokens_per_sec": 143486, "dt_s": 4.567, "eta_s": 33584, "world_size": 1, "timestamp": "2026-05-04T22:39:09.597724"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18070, "epoch": 0, "train_loss": 3.7775981575250626, "train_ppl": 43.71092878835219, "lr": 0.00056, "grad_norm": 0.7097, "tokens_per_sec": 148518, "dt_s": 4.413, "eta_s": 33500, "world_size": 1, "timestamp": "2026-05-04T22:39:14.010396"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18080, "epoch": 0, "train_loss": 3.778411254286766, "train_ppl": 43.746484456141324, "lr": 0.00056, "grad_norm": 0.6591, "tokens_per_sec": 146260, "dt_s": 4.481, "eta_s": 33336, "world_size": 1, "timestamp": "2026-05-04T22:39:18.491154"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18090, "epoch": 0, "train_loss": 3.860493913292885, "train_ppl": 47.48880092739368, "lr": 0.00056, "grad_norm": 0.7306, "tokens_per_sec": 147168, "dt_s": 4.453, "eta_s": 33312, "world_size": 1, "timestamp": "2026-05-04T22:39:22.944301"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18100, "epoch": 0, "train_loss": 3.8970936089754105, "train_ppl": 49.259074722791595, "lr": 0.00056, "grad_norm": 0.755, "tokens_per_sec": 149669, "dt_s": 4.379, "eta_s": 33248, "world_size": 1, "timestamp": "2026-05-04T22:39:27.323010"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18110, "epoch": 0, "train_loss": 3.911503881216049, "train_ppl": 49.97405052547818, "lr": 0.00056, "grad_norm": 0.7211, "tokens_per_sec": 148330, "dt_s": 4.418, "eta_s": 33021, "world_size": 1, "timestamp": "2026-05-04T22:39:31.741288"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18120, "epoch": 0, "train_loss": 3.841188460588455, "train_ppl": 46.58080100272774, "lr": 0.00056, "grad_norm": 0.6925, "tokens_per_sec": 148918, "dt_s": 4.401, "eta_s": 32999, "world_size": 1, "timestamp": "2026-05-04T22:39:36.142061"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18130, "epoch": 0, "train_loss": 3.849613770842552, "train_ppl": 46.97491664495747, "lr": 0.00056, "grad_norm": 0.7037, "tokens_per_sec": 148661, "dt_s": 4.408, "eta_s": 32887, "world_size": 1, "timestamp": "2026-05-04T22:39:40.550495"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18140, "epoch": 0, "train_loss": 3.8381724804639816, "train_ppl": 46.440525872553856, "lr": 0.00056, "grad_norm": 0.7004, "tokens_per_sec": 145563, "dt_s": 4.502, "eta_s": 32956, "world_size": 1, "timestamp": "2026-05-04T22:39:45.052738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18150, "epoch": 0, "train_loss": 3.9362227469682693, "train_ppl": 51.22474658252519, "lr": 0.00056, "grad_norm": 0.6975, "tokens_per_sec": 149447, "dt_s": 4.385, "eta_s": 32961, "world_size": 1, "timestamp": "2026-05-04T22:39:49.437960"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18160, "epoch": 0, "train_loss": 3.821165919303894, "train_ppl": 45.65741015616548, "lr": 0.00056, "grad_norm": 0.6485, "tokens_per_sec": 149925, "dt_s": 4.371, "eta_s": 32887, "world_size": 1, "timestamp": "2026-05-04T22:39:53.809209"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18170, "epoch": 0, "train_loss": 3.901496469974518, "train_ppl": 49.47643373136291, "lr": 0.00056, "grad_norm": 0.9128, "tokens_per_sec": 143439, "dt_s": 4.569, "eta_s": 33133, "world_size": 1, "timestamp": "2026-05-04T22:39:58.378089"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18180, "epoch": 0, "train_loss": 3.868901878595352, "train_ppl": 47.88976841605349, "lr": 0.00056, "grad_norm": 0.7176, "tokens_per_sec": 148557, "dt_s": 4.412, "eta_s": 33133, "world_size": 1, "timestamp": "2026-05-04T22:40:02.789639"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18190, "epoch": 0, "train_loss": 3.8933074921369553, "train_ppl": 49.072926721941435, "lr": 0.00056, "grad_norm": 0.7477, "tokens_per_sec": 149230, "dt_s": 4.392, "eta_s": 32964, "world_size": 1, "timestamp": "2026-05-04T22:40:07.181243"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18200, "epoch": 0, "train_loss": 3.768119513988495, "train_ppl": 43.29856588358603, "lr": 0.00056, "grad_norm": 0.6903, "tokens_per_sec": 144723, "dt_s": 4.528, "eta_s": 33172, "world_size": 1, "timestamp": "2026-05-04T22:40:11.709620"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18210, "epoch": 0, "train_loss": 3.8958422243595123, "train_ppl": 49.19747122736242, "lr": 0.00056, "grad_norm": 0.7693, "tokens_per_sec": 148998, "dt_s": 4.398, "eta_s": 33209, "world_size": 1, "timestamp": "2026-05-04T22:40:16.108068"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18220, "epoch": 0, "train_loss": 3.7901205718517303, "train_ppl": 44.26173667381878, "lr": 0.00056, "grad_norm": 0.7441, "tokens_per_sec": 149334, "dt_s": 4.389, "eta_s": 32935, "world_size": 1, "timestamp": "2026-05-04T22:40:20.496629"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18230, "epoch": 0, "train_loss": 3.8082129806280136, "train_ppl": 45.069826207055414, "lr": 0.00056, "grad_norm": 0.8182, "tokens_per_sec": 146062, "dt_s": 4.487, "eta_s": 33043, "world_size": 1, "timestamp": "2026-05-04T22:40:24.983495"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18240, "epoch": 0, "train_loss": 3.7940901070833206, "train_ppl": 44.43778437957377, "lr": 0.00056, "grad_norm": 0.6837, "tokens_per_sec": 133636, "dt_s": 4.904, "eta_s": 33802, "world_size": 1, "timestamp": "2026-05-04T22:40:29.887535"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18250, "epoch": 0, "train_loss": 3.8306564688682556, "train_ppl": 46.092786784087416, "lr": 0.00056, "grad_norm": 0.6518, "tokens_per_sec": 147440, "dt_s": 4.445, "eta_s": 33673, "world_size": 1, "timestamp": "2026-05-04T22:40:34.332500"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18260, "epoch": 0, "train_loss": 4.022323191165924, "train_ppl": 55.83066057573917, "lr": 0.00056, "grad_norm": 0.7549, "tokens_per_sec": 147275, "dt_s": 4.45, "eta_s": 33745, "world_size": 1, "timestamp": "2026-05-04T22:40:38.782374"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18270, "epoch": 0, "train_loss": 3.828789070248604, "train_ppl": 46.0067934945355, "lr": 0.00056, "grad_norm": 0.6669, "tokens_per_sec": 147607, "dt_s": 4.44, "eta_s": 33817, "world_size": 1, "timestamp": "2026-05-04T22:40:43.222281"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18280, "epoch": 0, "train_loss": 3.987417832016945, "train_ppl": 53.91549061163625, "lr": 0.00056, "grad_norm": 0.6676, "tokens_per_sec": 145623, "dt_s": 4.5, "eta_s": 33832, "world_size": 1, "timestamp": "2026-05-04T22:40:47.722673"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18290, "epoch": 0, "train_loss": 3.955550640821457, "train_ppl": 52.22444293868025, "lr": 0.00056, "grad_norm": 0.6809, "tokens_per_sec": 149977, "dt_s": 4.37, "eta_s": 33033, "world_size": 1, "timestamp": "2026-05-04T22:40:52.092420"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18300, "epoch": 0, "train_loss": 3.8752264976501465, "train_ppl": 48.19361279504756, "lr": 0.00056, "grad_norm": 0.7414, "tokens_per_sec": 148874, "dt_s": 4.402, "eta_s": 32965, "world_size": 1, "timestamp": "2026-05-04T22:40:56.494516"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18310, "epoch": 0, "train_loss": 3.868522420525551, "train_ppl": 47.871599704317255, "lr": 0.00056, "grad_norm": 0.641, "tokens_per_sec": 145626, "dt_s": 4.5, "eta_s": 33035, "world_size": 1, "timestamp": "2026-05-04T22:41:00.994788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18320, "epoch": 0, "train_loss": 3.952384740114212, "train_ppl": 52.05936698276667, "lr": 0.00056, "grad_norm": 0.8251, "tokens_per_sec": 149183, "dt_s": 4.393, "eta_s": 32961, "world_size": 1, "timestamp": "2026-05-04T22:41:05.387796"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18330, "epoch": 0, "train_loss": 3.926438719034195, "train_ppl": 50.72600605525118, "lr": 0.00056, "grad_norm": 1.4309, "tokens_per_sec": 148206, "dt_s": 4.422, "eta_s": 32840, "world_size": 1, "timestamp": "2026-05-04T22:41:09.809781"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18340, "epoch": 0, "train_loss": 3.9147935658693314, "train_ppl": 50.13872009953468, "lr": 0.00056, "grad_norm": 0.7349, "tokens_per_sec": 145620, "dt_s": 4.5, "eta_s": 33030, "world_size": 1, "timestamp": "2026-05-04T22:41:14.310255"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18350, "epoch": 0, "train_loss": 3.849332422018051, "train_ppl": 46.96170216640455, "lr": 0.00056, "grad_norm": 0.6685, "tokens_per_sec": 149273, "dt_s": 4.39, "eta_s": 33008, "world_size": 1, "timestamp": "2026-05-04T22:41:18.700623"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18360, "epoch": 0, "train_loss": 3.7233540415763855, "train_ppl": 41.403028965647174, "lr": 0.00056, "grad_norm": 0.6945, "tokens_per_sec": 149510, "dt_s": 4.383, "eta_s": 32830, "world_size": 1, "timestamp": "2026-05-04T22:41:23.083974"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18370, "epoch": 0, "train_loss": 3.8629549890756607, "train_ppl": 47.605818400680455, "lr": 0.00056, "grad_norm": 0.7549, "tokens_per_sec": 147601, "dt_s": 4.44, "eta_s": 32895, "world_size": 1, "timestamp": "2026-05-04T22:41:27.524074"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18380, "epoch": 0, "train_loss": 3.8439541906118393, "train_ppl": 46.709809241318816, "lr": 0.00056, "grad_norm": 0.6882, "tokens_per_sec": 151090, "dt_s": 4.338, "eta_s": 32765, "world_size": 1, "timestamp": "2026-05-04T22:41:31.861608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18390, "epoch": 0, "train_loss": 3.84215484559536, "train_ppl": 46.62583774833926, "lr": 0.00056, "grad_norm": 0.7143, "tokens_per_sec": 147447, "dt_s": 4.445, "eta_s": 32678, "world_size": 1, "timestamp": "2026-05-04T22:41:36.306342"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18400, "epoch": 0, "train_loss": 3.87589929997921, "train_ppl": 48.22604848016075, "lr": 0.00056, "grad_norm": 0.6791, "tokens_per_sec": 148376, "dt_s": 4.417, "eta_s": 32713, "world_size": 1, "timestamp": "2026-05-04T22:41:40.723210"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18410, "epoch": 0, "train_loss": 3.9354350566864014, "train_ppl": 51.184413234629496, "lr": 0.00056, "grad_norm": 0.686, "tokens_per_sec": 150733, "dt_s": 4.348, "eta_s": 32656, "world_size": 1, "timestamp": "2026-05-04T22:41:45.071011"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18420, "epoch": 0, "train_loss": 3.911680907011032, "train_ppl": 49.98289800459387, "lr": 0.00056, "grad_norm": 0.7193, "tokens_per_sec": 145622, "dt_s": 4.5, "eta_s": 32741, "world_size": 1, "timestamp": "2026-05-04T22:41:49.571449"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18430, "epoch": 0, "train_loss": 3.8281928300857544, "train_ppl": 45.97937057262626, "lr": 0.00056, "grad_norm": 0.706, "tokens_per_sec": 147595, "dt_s": 4.44, "eta_s": 32889, "world_size": 1, "timestamp": "2026-05-04T22:41:54.011709"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18440, "epoch": 0, "train_loss": 3.7465979158878326, "train_ppl": 42.37666749699132, "lr": 0.00056, "grad_norm": 0.6845, "tokens_per_sec": 148394, "dt_s": 4.416, "eta_s": 32843, "world_size": 1, "timestamp": "2026-05-04T22:41:58.428037"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18450, "epoch": 0, "train_loss": 3.8187958002090454, "train_ppl": 45.54932479471967, "lr": 0.00056, "grad_norm": 0.7019, "tokens_per_sec": 142930, "dt_s": 4.585, "eta_s": 33088, "world_size": 1, "timestamp": "2026-05-04T22:42:03.013241"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18460, "epoch": 0, "train_loss": 3.894308477640152, "train_ppl": 49.12207260324333, "lr": 0.00056, "grad_norm": 0.8066, "tokens_per_sec": 147149, "dt_s": 4.454, "eta_s": 33241, "world_size": 1, "timestamp": "2026-05-04T22:42:07.466927"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18470, "epoch": 0, "train_loss": 3.8515254110097885, "train_ppl": 47.064801669007004, "lr": 0.00056, "grad_norm": 0.6954, "tokens_per_sec": 146472, "dt_s": 4.474, "eta_s": 33198, "world_size": 1, "timestamp": "2026-05-04T22:42:11.941231"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18480, "epoch": 0, "train_loss": 3.9176322668790817, "train_ppl": 50.281251140701826, "lr": 0.00056, "grad_norm": 0.7563, "tokens_per_sec": 146436, "dt_s": 4.475, "eta_s": 33245, "world_size": 1, "timestamp": "2026-05-04T22:42:16.416608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18490, "epoch": 0, "train_loss": 4.054557293653488, "train_ppl": 57.6596310603971, "lr": 0.00056, "grad_norm": 0.7508, "tokens_per_sec": 147223, "dt_s": 4.451, "eta_s": 33293, "world_size": 1, "timestamp": "2026-05-04T22:42:20.868118"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18500, "epoch": 0, "train_loss": 3.8716863095760345, "train_ppl": 48.0232999893405, "lr": 0.00056, "grad_norm": 0.6852, "tokens_per_sec": 145403, "dt_s": 4.507, "eta_s": 33173, "world_size": 1, "timestamp": "2026-05-04T22:42:25.375312"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18510, "epoch": 0, "train_loss": 3.846043035387993, "train_ppl": 46.80748075715154, "lr": 0.00056, "grad_norm": 0.7278, "tokens_per_sec": 106757, "dt_s": 6.139, "eta_s": 33273, "world_size": 1, "timestamp": "2026-05-04T22:42:31.514102"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18520, "epoch": 0, "train_loss": 3.8557567447423935, "train_ppl": 47.26437047527877, "lr": 0.00056, "grad_norm": 0.6779, "tokens_per_sec": 148138, "dt_s": 4.424, "eta_s": 33194, "world_size": 1, "timestamp": "2026-05-04T22:42:35.938098"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18530, "epoch": 0, "train_loss": 3.7889851927757263, "train_ppl": 44.21151134192259, "lr": 0.00056, "grad_norm": 0.7602, "tokens_per_sec": 145627, "dt_s": 4.5, "eta_s": 33226, "world_size": 1, "timestamp": "2026-05-04T22:42:40.438358"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18540, "epoch": 0, "train_loss": 3.937540039420128, "train_ppl": 51.29226901818005, "lr": 0.00056, "grad_norm": 0.7501, "tokens_per_sec": 133738, "dt_s": 4.9, "eta_s": 33887, "world_size": 1, "timestamp": "2026-05-04T22:42:45.338686"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18550, "epoch": 0, "train_loss": 3.8932467997074127, "train_ppl": 49.06994845717389, "lr": 0.00056, "grad_norm": 0.6988, "tokens_per_sec": 148789, "dt_s": 4.405, "eta_s": 33731, "world_size": 1, "timestamp": "2026-05-04T22:42:49.743312"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18560, "epoch": 0, "train_loss": 3.866799473762512, "train_ppl": 47.78919050028717, "lr": 0.00056, "grad_norm": 0.7446, "tokens_per_sec": 144840, "dt_s": 4.525, "eta_s": 33727, "world_size": 1, "timestamp": "2026-05-04T22:42:54.268026"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18570, "epoch": 0, "train_loss": 3.827821969985962, "train_ppl": 45.96232182021372, "lr": 0.00056, "grad_norm": 0.684, "tokens_per_sec": 149029, "dt_s": 4.398, "eta_s": 33683, "world_size": 1, "timestamp": "2026-05-04T22:42:58.665562"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18580, "epoch": 0, "train_loss": 3.8462982922792435, "train_ppl": 46.819430214202626, "lr": 0.00056, "grad_norm": 0.6901, "tokens_per_sec": 144629, "dt_s": 4.531, "eta_s": 33725, "world_size": 1, "timestamp": "2026-05-04T22:43:03.196888"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18590, "epoch": 0, "train_loss": 3.741009831428528, "train_ppl": 42.14052351068351, "lr": 0.00056, "grad_norm": 0.7733, "tokens_per_sec": 147835, "dt_s": 4.433, "eta_s": 33028, "world_size": 1, "timestamp": "2026-05-04T22:43:07.629937"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18600, "epoch": 0, "train_loss": 3.8041581213474274, "train_ppl": 44.887444420117625, "lr": 0.00056, "grad_norm": 0.6513, "tokens_per_sec": 148619, "dt_s": 4.41, "eta_s": 33031, "world_size": 1, "timestamp": "2026-05-04T22:43:12.039612"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18610, "epoch": 0, "train_loss": 3.789432853460312, "train_ppl": 44.231307528012984, "lr": 0.00056, "grad_norm": 0.6706, "tokens_per_sec": 144999, "dt_s": 4.52, "eta_s": 33019, "world_size": 1, "timestamp": "2026-05-04T22:43:16.559395"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18620, "epoch": 0, "train_loss": 3.970202714204788, "train_ppl": 52.99527264543293, "lr": 0.00056, "grad_norm": 0.752, "tokens_per_sec": 147350, "dt_s": 4.448, "eta_s": 33089, "world_size": 1, "timestamp": "2026-05-04T22:43:21.007059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18630, "epoch": 0, "train_loss": 3.815434902906418, "train_ppl": 45.396495158096236, "lr": 0.00056, "grad_norm": 0.6702, "tokens_per_sec": 149452, "dt_s": 4.385, "eta_s": 32868, "world_size": 1, "timestamp": "2026-05-04T22:43:25.392110"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18640, "epoch": 0, "train_loss": 3.9404190480709076, "train_ppl": 51.4401526814276, "lr": 0.00056, "grad_norm": 0.6904, "tokens_per_sec": 146468, "dt_s": 4.474, "eta_s": 32924, "world_size": 1, "timestamp": "2026-05-04T22:43:29.866531"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18650, "epoch": 0, "train_loss": 3.8842210322618484, "train_ppl": 48.62904724289178, "lr": 0.00056, "grad_norm": 0.6776, "tokens_per_sec": 148527, "dt_s": 4.412, "eta_s": 32924, "world_size": 1, "timestamp": "2026-05-04T22:43:34.278929"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18660, "epoch": 0, "train_loss": 3.923578202724457, "train_ppl": 50.58111082398944, "lr": 0.00056, "grad_norm": 0.7947, "tokens_per_sec": 148160, "dt_s": 4.423, "eta_s": 32777, "world_size": 1, "timestamp": "2026-05-04T22:43:38.702272"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18670, "epoch": 0, "train_loss": 3.806202471256256, "train_ppl": 44.97930392752036, "lr": 0.00056, "grad_norm": 1.0959, "tokens_per_sec": 145738, "dt_s": 4.497, "eta_s": 32845, "world_size": 1, "timestamp": "2026-05-04T22:43:43.199091"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18680, "epoch": 0, "train_loss": 3.8967792838811874, "train_ppl": 49.24359379263789, "lr": 0.00056, "grad_norm": 0.7016, "tokens_per_sec": 147669, "dt_s": 4.438, "eta_s": 32919, "world_size": 1, "timestamp": "2026-05-04T22:43:47.637141"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18690, "epoch": 0, "train_loss": 3.8399918377399445, "train_ppl": 46.52509468831744, "lr": 0.00056, "grad_norm": 0.6905, "tokens_per_sec": 145179, "dt_s": 4.514, "eta_s": 32973, "world_size": 1, "timestamp": "2026-05-04T22:43:52.151312"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18700, "epoch": 0, "train_loss": 3.8065893203020096, "train_ppl": 44.99670749438295, "lr": 0.00056, "grad_norm": 0.6945, "tokens_per_sec": 147707, "dt_s": 4.437, "eta_s": 33005, "world_size": 1, "timestamp": "2026-05-04T22:43:56.588201"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18710, "epoch": 0, "train_loss": 3.7951027303934097, "train_ppl": 44.48280590696217, "lr": 0.00056, "grad_norm": 0.6719, "tokens_per_sec": 148251, "dt_s": 4.421, "eta_s": 32997, "world_size": 1, "timestamp": "2026-05-04T22:44:01.008817"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18720, "epoch": 0, "train_loss": 3.9354740232229233, "train_ppl": 51.18640775279663, "lr": 0.00056, "grad_norm": 0.6964, "tokens_per_sec": 147065, "dt_s": 4.456, "eta_s": 32932, "world_size": 1, "timestamp": "2026-05-04T22:44:05.465062"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18730, "epoch": 0, "train_loss": 3.9231755286455154, "train_ppl": 50.56074722199906, "lr": 0.00056, "grad_norm": 0.7362, "tokens_per_sec": 149914, "dt_s": 4.372, "eta_s": 32830, "world_size": 1, "timestamp": "2026-05-04T22:44:09.836626"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18740, "epoch": 0, "train_loss": 3.9206753075122833, "train_ppl": 50.43449207194929, "lr": 0.00056, "grad_norm": 0.6791, "tokens_per_sec": 149917, "dt_s": 4.371, "eta_s": 32614, "world_size": 1, "timestamp": "2026-05-04T22:44:14.208111"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18750, "epoch": 0, "train_loss": 3.877800092101097, "train_ppl": 48.31780334902159, "lr": 0.00056, "grad_norm": 0.6665, "tokens_per_sec": 147554, "dt_s": 4.441, "eta_s": 32617, "world_size": 1, "timestamp": "2026-05-04T22:44:18.649602"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18760, "epoch": 0, "train_loss": 3.8611783534288406, "train_ppl": 47.521315294556445, "lr": 0.00056, "grad_norm": 0.7267, "tokens_per_sec": 148384, "dt_s": 4.417, "eta_s": 32606, "world_size": 1, "timestamp": "2026-05-04T22:44:23.066245"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18770, "epoch": 0, "train_loss": 3.81782865524292, "train_ppl": 45.50529329039007, "lr": 0.00056, "grad_norm": 0.6744, "tokens_per_sec": 149251, "dt_s": 4.391, "eta_s": 32505, "world_size": 1, "timestamp": "2026-05-04T22:44:27.457251"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18780, "epoch": 0, "train_loss": 3.8701994121074677, "train_ppl": 47.95194732634703, "lr": 0.00056, "grad_norm": 0.6896, "tokens_per_sec": 147666, "dt_s": 4.438, "eta_s": 32599, "world_size": 1, "timestamp": "2026-05-04T22:44:31.895348"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18790, "epoch": 0, "train_loss": 3.7861422896385193, "train_ppl": 44.086000789332196, "lr": 0.00056, "grad_norm": 0.6793, "tokens_per_sec": 149245, "dt_s": 4.391, "eta_s": 32624, "world_size": 1, "timestamp": "2026-05-04T22:44:36.286518"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18800, "epoch": 0, "train_loss": 3.8629695773124695, "train_ppl": 47.60651289069844, "lr": 0.00056, "grad_norm": 0.706, "tokens_per_sec": 147281, "dt_s": 4.45, "eta_s": 32632, "world_size": 1, "timestamp": "2026-05-04T22:44:40.736261"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18810, "epoch": 0, "train_loss": 3.825262501835823, "train_ppl": 45.84483313981401, "lr": 0.00056, "grad_norm": 0.6794, "tokens_per_sec": 147481, "dt_s": 4.444, "eta_s": 32667, "world_size": 1, "timestamp": "2026-05-04T22:44:45.179934"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18820, "epoch": 0, "train_loss": 3.911928802728653, "train_ppl": 49.995290086872096, "lr": 0.00056, "grad_norm": 0.7274, "tokens_per_sec": 148747, "dt_s": 4.406, "eta_s": 32685, "world_size": 1, "timestamp": "2026-05-04T22:44:49.585804"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18830, "epoch": 0, "train_loss": 3.973243623971939, "train_ppl": 53.15667176333704, "lr": 0.00056, "grad_norm": 0.7245, "tokens_per_sec": 131045, "dt_s": 5.001, "eta_s": 33512, "world_size": 1, "timestamp": "2026-05-04T22:44:54.586853"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18840, "epoch": 0, "train_loss": 3.7168539464473724, "train_ppl": 41.13477811127066, "lr": 0.00056, "grad_norm": 0.6614, "tokens_per_sec": 148114, "dt_s": 4.425, "eta_s": 33557, "world_size": 1, "timestamp": "2026-05-04T22:44:59.011512"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18850, "epoch": 0, "train_loss": 3.808912068605423, "train_ppl": 45.101344996620455, "lr": 0.00056, "grad_norm": 0.7252, "tokens_per_sec": 148743, "dt_s": 4.406, "eta_s": 33488, "world_size": 1, "timestamp": "2026-05-04T22:45:03.417507"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18860, "epoch": 0, "train_loss": 3.9151674658060074, "train_ppl": 50.15747046896744, "lr": 0.00056, "grad_norm": 0.7335, "tokens_per_sec": 146313, "dt_s": 4.479, "eta_s": 33536, "world_size": 1, "timestamp": "2026-05-04T22:45:07.896674"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18870, "epoch": 0, "train_loss": 3.810652695596218, "train_ppl": 45.17991797833084, "lr": 0.00056, "grad_norm": 0.7847, "tokens_per_sec": 148218, "dt_s": 4.422, "eta_s": 33554, "world_size": 1, "timestamp": "2026-05-04T22:45:12.318277"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18880, "epoch": 0, "train_loss": 3.921813875436783, "train_ppl": 50.491947869368005, "lr": 0.00056, "grad_norm": 1.0134, "tokens_per_sec": 149208, "dt_s": 4.392, "eta_s": 32651, "world_size": 1, "timestamp": "2026-05-04T22:45:16.710517"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18890, "epoch": 0, "train_loss": 3.793006867170334, "train_ppl": 44.38967366031925, "lr": 0.00056, "grad_norm": 0.6965, "tokens_per_sec": 144688, "dt_s": 4.529, "eta_s": 32801, "world_size": 1, "timestamp": "2026-05-04T22:45:21.239971"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18900, "epoch": 0, "train_loss": 3.8900383710861206, "train_ppl": 48.91276332357812, "lr": 0.00056, "grad_norm": 0.7427, "tokens_per_sec": 146187, "dt_s": 4.483, "eta_s": 32911, "world_size": 1, "timestamp": "2026-05-04T22:45:25.723019"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18910, "epoch": 0, "train_loss": 3.9036931097507477, "train_ppl": 49.5852350886285, "lr": 0.00056, "grad_norm": 0.7427, "tokens_per_sec": 147700, "dt_s": 4.437, "eta_s": 32844, "world_size": 1, "timestamp": "2026-05-04T22:45:30.160101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18920, "epoch": 0, "train_loss": 3.911791741847992, "train_ppl": 49.98843815796032, "lr": 0.00056, "grad_norm": 0.7036, "tokens_per_sec": 145780, "dt_s": 4.496, "eta_s": 32949, "world_size": 1, "timestamp": "2026-05-04T22:45:34.655668"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18930, "epoch": 0, "train_loss": 3.8996186703443527, "train_ppl": 49.38361407803014, "lr": 0.00056, "grad_norm": 0.7285, "tokens_per_sec": 147936, "dt_s": 4.43, "eta_s": 33000, "world_size": 1, "timestamp": "2026-05-04T22:45:39.085706"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18940, "epoch": 0, "train_loss": 3.9270922392606735, "train_ppl": 50.75916736082849, "lr": 0.00056, "grad_norm": 1.3675, "tokens_per_sec": 145983, "dt_s": 4.489, "eta_s": 32936, "world_size": 1, "timestamp": "2026-05-04T22:45:43.574996"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18950, "epoch": 0, "train_loss": 3.884316459298134, "train_ppl": 48.633687990170415, "lr": 0.00056, "grad_norm": 0.8085, "tokens_per_sec": 148455, "dt_s": 4.415, "eta_s": 32831, "world_size": 1, "timestamp": "2026-05-04T22:45:47.989530"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18960, "epoch": 0, "train_loss": 3.805874288082123, "train_ppl": 44.96454489875204, "lr": 0.00056, "grad_norm": 0.7048, "tokens_per_sec": 149754, "dt_s": 4.376, "eta_s": 32737, "world_size": 1, "timestamp": "2026-05-04T22:45:52.365771"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18970, "epoch": 0, "train_loss": 3.803909122943878, "train_ppl": 44.8762689095175, "lr": 0.00056, "grad_norm": 0.6694, "tokens_per_sec": 146933, "dt_s": 4.46, "eta_s": 32680, "world_size": 1, "timestamp": "2026-05-04T22:45:56.826042"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18980, "epoch": 0, "train_loss": 3.892578586935997, "train_ppl": 49.03717024355223, "lr": 0.00056, "grad_norm": 0.6521, "tokens_per_sec": 146922, "dt_s": 4.461, "eta_s": 32721, "world_size": 1, "timestamp": "2026-05-04T22:46:01.286655"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 18990, "epoch": 0, "train_loss": 3.8938813358545303, "train_ppl": 49.101094993963834, "lr": 0.00056, "grad_norm": 0.6753, "tokens_per_sec": 148414, "dt_s": 4.416, "eta_s": 32608, "world_size": 1, "timestamp": "2026-05-04T22:46:05.702424"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19000, "epoch": 0, "train_loss": 3.899813190102577, "train_ppl": 49.393221101048454, "lr": 0.00056, "grad_norm": 0.7389, "tokens_per_sec": 145677, "dt_s": 4.499, "eta_s": 32728, "world_size": 1, "timestamp": "2026-05-04T22:46:10.201136"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19010, "epoch": 0, "train_loss": 3.8476175367832184, "train_ppl": 46.88123725052433, "lr": 0.00056, "grad_norm": 0.7231, "tokens_per_sec": 126220, "dt_s": 5.192, "eta_s": 32792, "world_size": 1, "timestamp": "2026-05-04T22:46:15.393363"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19020, "epoch": 0, "train_loss": 3.8293011635541916, "train_ppl": 46.03035929892595, "lr": 0.00056, "grad_norm": 0.6737, "tokens_per_sec": 149247, "dt_s": 4.391, "eta_s": 32686, "world_size": 1, "timestamp": "2026-05-04T22:46:19.784471"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19030, "epoch": 0, "train_loss": 3.9969066083431244, "train_ppl": 54.429517529012905, "lr": 0.00056, "grad_norm": 0.706, "tokens_per_sec": 145704, "dt_s": 4.498, "eta_s": 32736, "world_size": 1, "timestamp": "2026-05-04T22:46:24.282340"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19040, "epoch": 0, "train_loss": 3.890306442975998, "train_ppl": 48.92587721813611, "lr": 0.00056, "grad_norm": 0.6758, "tokens_per_sec": 147245, "dt_s": 4.451, "eta_s": 32783, "world_size": 1, "timestamp": "2026-05-04T22:46:28.733162"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19050, "epoch": 0, "train_loss": 3.88872030377388, "train_ppl": 48.848335478535766, "lr": 0.00056, "grad_norm": 0.9285, "tokens_per_sec": 147266, "dt_s": 4.45, "eta_s": 32707, "world_size": 1, "timestamp": "2026-05-04T22:46:33.183311"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19060, "epoch": 0, "train_loss": 3.8190419375896454, "train_ppl": 45.56053756609722, "lr": 0.00056, "grad_norm": 0.666, "tokens_per_sec": 149066, "dt_s": 4.396, "eta_s": 32664, "world_size": 1, "timestamp": "2026-05-04T22:46:37.579775"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19070, "epoch": 0, "train_loss": 3.84204538166523, "train_ppl": 46.620734180227124, "lr": 0.00056, "grad_norm": 0.737, "tokens_per_sec": 148754, "dt_s": 4.406, "eta_s": 32681, "world_size": 1, "timestamp": "2026-05-04T22:46:41.985432"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19080, "epoch": 0, "train_loss": 3.7905193269252777, "train_ppl": 44.27938978528145, "lr": 0.00056, "grad_norm": 0.6459, "tokens_per_sec": 146518, "dt_s": 4.473, "eta_s": 32640, "world_size": 1, "timestamp": "2026-05-04T22:46:46.458327"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19090, "epoch": 0, "train_loss": 3.896821141242981, "train_ppl": 49.245655042698225, "lr": 0.00056, "grad_norm": 0.7284, "tokens_per_sec": 148513, "dt_s": 4.413, "eta_s": 32579, "world_size": 1, "timestamp": "2026-05-04T22:46:50.871124"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19100, "epoch": 0, "train_loss": 3.8991551995277405, "train_ppl": 49.36073151719532, "lr": 0.00056, "grad_norm": 0.6541, "tokens_per_sec": 149499, "dt_s": 4.384, "eta_s": 32477, "world_size": 1, "timestamp": "2026-05-04T22:46:55.254847"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19110, "epoch": 0, "train_loss": 3.84631310403347, "train_ppl": 46.82012369723183, "lr": 0.00056, "grad_norm": 0.6738, "tokens_per_sec": 147017, "dt_s": 4.458, "eta_s": 32563, "world_size": 1, "timestamp": "2026-05-04T22:46:59.712545"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19120, "epoch": 0, "train_loss": 3.9467369318008423, "train_ppl": 51.76617438427821, "lr": 0.00056, "grad_norm": 0.6815, "tokens_per_sec": 148252, "dt_s": 4.421, "eta_s": 32580, "world_size": 1, "timestamp": "2026-05-04T22:47:04.133137"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19130, "epoch": 0, "train_loss": 3.879327654838562, "train_ppl": 48.391668227228166, "lr": 0.00056, "grad_norm": 0.6935, "tokens_per_sec": 131333, "dt_s": 4.99, "eta_s": 33337, "world_size": 1, "timestamp": "2026-05-04T22:47:09.123226"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19140, "epoch": 0, "train_loss": 3.790933594107628, "train_ppl": 44.29773708340334, "lr": 0.00056, "grad_norm": 0.6344, "tokens_per_sec": 147457, "dt_s": 4.444, "eta_s": 33379, "world_size": 1, "timestamp": "2026-05-04T22:47:13.567628"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19150, "epoch": 0, "train_loss": 3.779984802007675, "train_ppl": 43.81537582476823, "lr": 0.00056, "grad_norm": 0.6356, "tokens_per_sec": 148318, "dt_s": 4.419, "eta_s": 33425, "world_size": 1, "timestamp": "2026-05-04T22:47:17.986257"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19160, "epoch": 0, "train_loss": 3.8520812690258026, "train_ppl": 47.09097028863154, "lr": 0.00056, "grad_norm": 0.7573, "tokens_per_sec": 146487, "dt_s": 4.474, "eta_s": 33445, "world_size": 1, "timestamp": "2026-05-04T22:47:22.460073"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19170, "epoch": 0, "train_loss": 3.9343331903219223, "train_ppl": 51.128045911632306, "lr": 0.00056, "grad_norm": 0.6951, "tokens_per_sec": 149490, "dt_s": 4.384, "eta_s": 33386, "world_size": 1, "timestamp": "2026-05-04T22:47:26.844036"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19180, "epoch": 0, "train_loss": 3.932164803147316, "train_ppl": 51.01730062533686, "lr": 0.00056, "grad_norm": 0.727, "tokens_per_sec": 148352, "dt_s": 4.418, "eta_s": 32540, "world_size": 1, "timestamp": "2026-05-04T22:47:31.261626"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19190, "epoch": 0, "train_loss": 3.8886139541864395, "train_ppl": 48.84314075444375, "lr": 0.00056, "grad_norm": 0.6971, "tokens_per_sec": 147238, "dt_s": 4.451, "eta_s": 32545, "world_size": 1, "timestamp": "2026-05-04T22:47:35.712671"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19200, "epoch": 0, "train_loss": 3.8026592433452606, "train_ppl": 44.8202140147775, "lr": 0.00056, "grad_norm": 0.7425, "tokens_per_sec": 149001, "dt_s": 4.398, "eta_s": 32511, "world_size": 1, "timestamp": "2026-05-04T22:47:40.111048"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19210, "epoch": 0, "train_loss": 3.9556224644184113, "train_ppl": 52.22819402072753, "lr": 0.00056, "grad_norm": 0.67, "tokens_per_sec": 149247, "dt_s": 4.391, "eta_s": 32385, "world_size": 1, "timestamp": "2026-05-04T22:47:44.502152"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19220, "epoch": 0, "train_loss": 3.8488208204507828, "train_ppl": 46.937682630714576, "lr": 0.00056, "grad_norm": 0.6745, "tokens_per_sec": 146055, "dt_s": 4.487, "eta_s": 32532, "world_size": 1, "timestamp": "2026-05-04T22:47:48.989245"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19230, "epoch": 0, "train_loss": 3.9616958647966385, "train_ppl": 52.546361956678716, "lr": 0.00056, "grad_norm": 0.6964, "tokens_per_sec": 149729, "dt_s": 4.377, "eta_s": 32468, "world_size": 1, "timestamp": "2026-05-04T22:47:53.366214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19240, "epoch": 0, "train_loss": 3.8815364688634872, "train_ppl": 48.498674557734056, "lr": 0.00056, "grad_norm": 0.692, "tokens_per_sec": 147626, "dt_s": 4.439, "eta_s": 32447, "world_size": 1, "timestamp": "2026-05-04T22:47:57.805550"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19250, "epoch": 0, "train_loss": 3.8038452118635178, "train_ppl": 44.87340091033839, "lr": 0.00056, "grad_norm": 0.7543, "tokens_per_sec": 146952, "dt_s": 4.46, "eta_s": 32532, "world_size": 1, "timestamp": "2026-05-04T22:48:02.265237"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19260, "epoch": 0, "train_loss": 3.8869254142045975, "train_ppl": 48.76073674924882, "lr": 0.00056, "grad_norm": 0.6825, "tokens_per_sec": 148720, "dt_s": 4.407, "eta_s": 32551, "world_size": 1, "timestamp": "2026-05-04T22:48:06.671905"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19270, "epoch": 0, "train_loss": 3.8419328927993774, "train_ppl": 46.615490161666365, "lr": 0.00056, "grad_norm": 0.7147, "tokens_per_sec": 145251, "dt_s": 4.512, "eta_s": 32583, "world_size": 1, "timestamp": "2026-05-04T22:48:11.183821"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19280, "epoch": 0, "train_loss": 3.9200475960969925, "train_ppl": 50.40284369961281, "lr": 0.00056, "grad_norm": 0.6861, "tokens_per_sec": 148889, "dt_s": 4.402, "eta_s": 32614, "world_size": 1, "timestamp": "2026-05-04T22:48:15.585472"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19290, "epoch": 0, "train_loss": 3.83978271484375, "train_ppl": 46.51536624302628, "lr": 0.00056, "grad_norm": 0.6786, "tokens_per_sec": 147833, "dt_s": 4.433, "eta_s": 32601, "world_size": 1, "timestamp": "2026-05-04T22:48:20.018580"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19300, "epoch": 0, "train_loss": 3.8000259697437286, "train_ppl": 44.70234538668047, "lr": 0.00056, "grad_norm": 0.6943, "tokens_per_sec": 146262, "dt_s": 4.481, "eta_s": 32627, "world_size": 1, "timestamp": "2026-05-04T22:48:24.499291"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19310, "epoch": 0, "train_loss": 3.822510287165642, "train_ppl": 45.71883178841353, "lr": 0.00056, "grad_norm": 0.6728, "tokens_per_sec": 149859, "dt_s": 4.373, "eta_s": 32574, "world_size": 1, "timestamp": "2026-05-04T22:48:28.872448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19320, "epoch": 0, "train_loss": 3.8118882328271866, "train_ppl": 45.23577394804466, "lr": 0.00056, "grad_norm": 0.7463, "tokens_per_sec": 149128, "dt_s": 4.395, "eta_s": 32397, "world_size": 1, "timestamp": "2026-05-04T22:48:33.267100"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19330, "epoch": 0, "train_loss": 3.848095417022705, "train_ppl": 46.90364622138476, "lr": 0.00056, "grad_norm": 0.6748, "tokens_per_sec": 148006, "dt_s": 4.428, "eta_s": 32431, "world_size": 1, "timestamp": "2026-05-04T22:48:37.695036"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19340, "epoch": 0, "train_loss": 3.777096763253212, "train_ppl": 43.68901787250332, "lr": 0.00056, "grad_norm": 0.747, "tokens_per_sec": 149574, "dt_s": 4.382, "eta_s": 32351, "world_size": 1, "timestamp": "2026-05-04T22:48:42.076543"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19350, "epoch": 0, "train_loss": 3.811616897583008, "train_ppl": 45.22350155331675, "lr": 0.00056, "grad_norm": 0.6749, "tokens_per_sec": 146961, "dt_s": 4.459, "eta_s": 32316, "world_size": 1, "timestamp": "2026-05-04T22:48:46.535976"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19360, "epoch": 0, "train_loss": 3.88635016977787, "train_ppl": 48.73269547325781, "lr": 0.00056, "grad_norm": 0.6413, "tokens_per_sec": 147494, "dt_s": 4.443, "eta_s": 32414, "world_size": 1, "timestamp": "2026-05-04T22:48:50.979265"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19370, "epoch": 0, "train_loss": 3.9137159883975983, "train_ppl": 50.08472084370988, "lr": 0.00056, "grad_norm": 0.6678, "tokens_per_sec": 148878, "dt_s": 4.402, "eta_s": 32420, "world_size": 1, "timestamp": "2026-05-04T22:48:55.381242"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19380, "epoch": 0, "train_loss": 3.777434855699539, "train_ppl": 43.70379129668431, "lr": 0.00056, "grad_norm": 0.7052, "tokens_per_sec": 145457, "dt_s": 4.506, "eta_s": 32530, "world_size": 1, "timestamp": "2026-05-04T22:48:59.886780"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19390, "epoch": 0, "train_loss": 3.9061663150787354, "train_ppl": 49.708021331446524, "lr": 0.00056, "grad_norm": 0.731, "tokens_per_sec": 150061, "dt_s": 4.367, "eta_s": 32504, "world_size": 1, "timestamp": "2026-05-04T22:49:04.254086"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19400, "epoch": 0, "train_loss": 4.032412812113762, "train_ppl": 56.396822153120766, "lr": 0.00056, "grad_norm": 0.8527, "tokens_per_sec": 150547, "dt_s": 4.353, "eta_s": 32344, "world_size": 1, "timestamp": "2026-05-04T22:49:08.607294"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19410, "epoch": 0, "train_loss": 3.9352346807718277, "train_ppl": 51.17415813848714, "lr": 0.00056, "grad_norm": 0.7966, "tokens_per_sec": 146214, "dt_s": 4.482, "eta_s": 32397, "world_size": 1, "timestamp": "2026-05-04T22:49:13.089480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19420, "epoch": 0, "train_loss": 3.840124636888504, "train_ppl": 46.53127359154617, "lr": 0.00056, "grad_norm": 0.7178, "tokens_per_sec": 133302, "dt_s": 4.916, "eta_s": 33146, "world_size": 1, "timestamp": "2026-05-04T22:49:18.005861"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19430, "epoch": 0, "train_loss": 3.8085516691207886, "train_ppl": 45.08509342383249, "lr": 0.00056, "grad_norm": 0.7256, "tokens_per_sec": 148729, "dt_s": 4.406, "eta_s": 32996, "world_size": 1, "timestamp": "2026-05-04T22:49:22.412235"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19440, "epoch": 0, "train_loss": 3.9429518282413483, "train_ppl": 51.57060441314616, "lr": 0.00056, "grad_norm": 0.6901, "tokens_per_sec": 147621, "dt_s": 4.439, "eta_s": 33098, "world_size": 1, "timestamp": "2026-05-04T22:49:26.851702"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19450, "epoch": 0, "train_loss": 3.9002788066864014, "train_ppl": 49.4162247589483, "lr": 0.00056, "grad_norm": 0.6942, "tokens_per_sec": 149432, "dt_s": 4.386, "eta_s": 33141, "world_size": 1, "timestamp": "2026-05-04T22:49:31.237395"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19460, "epoch": 0, "train_loss": 3.8028819113969803, "train_ppl": 44.83019515570919, "lr": 0.00056, "grad_norm": 0.6463, "tokens_per_sec": 146200, "dt_s": 4.483, "eta_s": 33137, "world_size": 1, "timestamp": "2026-05-04T22:49:35.720007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19470, "epoch": 0, "train_loss": 3.840569794178009, "train_ppl": 46.5519919382948, "lr": 0.00056, "grad_norm": 0.7004, "tokens_per_sec": 148727, "dt_s": 4.406, "eta_s": 32386, "world_size": 1, "timestamp": "2026-05-04T22:49:40.126486"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19480, "epoch": 0, "train_loss": 3.813926801085472, "train_ppl": 45.32808421938037, "lr": 0.00056, "grad_norm": 0.6737, "tokens_per_sec": 149922, "dt_s": 4.371, "eta_s": 32330, "world_size": 1, "timestamp": "2026-05-04T22:49:44.497841"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19490, "epoch": 0, "train_loss": 3.8469095677137375, "train_ppl": 46.848058530751466, "lr": 0.00056, "grad_norm": 0.6889, "tokens_per_sec": 145577, "dt_s": 4.502, "eta_s": 32417, "world_size": 1, "timestamp": "2026-05-04T22:49:48.999667"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19500, "epoch": 0, "train_loss": 3.903227686882019, "train_ppl": 49.56216235597218, "lr": 0.00056, "grad_norm": 0.753, "tokens_per_sec": 149847, "dt_s": 4.374, "eta_s": 32394, "world_size": 1, "timestamp": "2026-05-04T22:49:53.373196"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19510, "epoch": 0, "train_loss": 3.8416979610919952, "train_ppl": 46.604539991293706, "lr": 0.00056, "grad_norm": 0.689, "tokens_per_sec": 125418, "dt_s": 5.225, "eta_s": 32353, "world_size": 1, "timestamp": "2026-05-04T22:49:58.598595"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19520, "epoch": 0, "train_loss": 3.8097384572029114, "train_ppl": 45.13863163836286, "lr": 0.00056, "grad_norm": 0.6992, "tokens_per_sec": 145401, "dt_s": 4.507, "eta_s": 32496, "world_size": 1, "timestamp": "2026-05-04T22:50:03.105849"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19530, "epoch": 0, "train_loss": 3.8810865432024, "train_ppl": 48.47685866765423, "lr": 0.00056, "grad_norm": 0.6841, "tokens_per_sec": 147717, "dt_s": 4.437, "eta_s": 32587, "world_size": 1, "timestamp": "2026-05-04T22:50:07.542441"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19540, "epoch": 0, "train_loss": 3.8685585856437683, "train_ppl": 47.873331017686205, "lr": 0.00056, "grad_norm": 0.7365, "tokens_per_sec": 147862, "dt_s": 4.432, "eta_s": 32481, "world_size": 1, "timestamp": "2026-05-04T22:50:11.974660"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19550, "epoch": 0, "train_loss": 4.083410426974297, "train_ppl": 59.34752550785627, "lr": 0.00056, "grad_norm": 0.7397, "tokens_per_sec": 149374, "dt_s": 4.387, "eta_s": 32497, "world_size": 1, "timestamp": "2026-05-04T22:50:16.362059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19560, "epoch": 0, "train_loss": 3.9108947217464447, "train_ppl": 49.94361762955053, "lr": 0.00056, "grad_norm": 0.7377, "tokens_per_sec": 150450, "dt_s": 4.356, "eta_s": 32344, "world_size": 1, "timestamp": "2026-05-04T22:50:20.718049"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19570, "epoch": 0, "train_loss": 3.862011954188347, "train_ppl": 47.56094561471632, "lr": 0.00056, "grad_norm": 0.6915, "tokens_per_sec": 147791, "dt_s": 4.434, "eta_s": 32233, "world_size": 1, "timestamp": "2026-05-04T22:50:25.152441"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19580, "epoch": 0, "train_loss": 3.8676075488328934, "train_ppl": 47.82782336078766, "lr": 0.00056, "grad_norm": 0.7516, "tokens_per_sec": 148630, "dt_s": 4.409, "eta_s": 32189, "world_size": 1, "timestamp": "2026-05-04T22:50:29.561777"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19590, "epoch": 0, "train_loss": 3.8690570890903473, "train_ppl": 47.897201987583784, "lr": 0.00056, "grad_norm": 0.7074, "tokens_per_sec": 149079, "dt_s": 4.396, "eta_s": 32132, "world_size": 1, "timestamp": "2026-05-04T22:50:33.957828"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19600, "epoch": 0, "train_loss": 3.751948133111, "train_ppl": 42.603999468731566, "lr": 0.00056, "grad_norm": 0.6907, "tokens_per_sec": 147641, "dt_s": 4.439, "eta_s": 32203, "world_size": 1, "timestamp": "2026-05-04T22:50:38.396736"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19610, "epoch": 0, "train_loss": 3.797409012913704, "train_ppl": 44.58551421634341, "lr": 0.00056, "grad_norm": 0.6945, "tokens_per_sec": 147708, "dt_s": 4.437, "eta_s": 32316, "world_size": 1, "timestamp": "2026-05-04T22:50:42.833608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19620, "epoch": 0, "train_loss": 3.6945961117744446, "train_ppl": 40.22932115717432, "lr": 0.00056, "grad_norm": 0.6426, "tokens_per_sec": 148828, "dt_s": 4.403, "eta_s": 32267, "world_size": 1, "timestamp": "2026-05-04T22:50:47.237063"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19630, "epoch": 0, "train_loss": 3.86714206635952, "train_ppl": 47.805565527991035, "lr": 0.00056, "grad_norm": 0.6591, "tokens_per_sec": 147229, "dt_s": 4.451, "eta_s": 32324, "world_size": 1, "timestamp": "2026-05-04T22:50:51.688418"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19640, "epoch": 0, "train_loss": 3.96162086725235, "train_ppl": 52.542421256343935, "lr": 0.00056, "grad_norm": 0.7314, "tokens_per_sec": 148245, "dt_s": 4.421, "eta_s": 32355, "world_size": 1, "timestamp": "2026-05-04T22:50:56.109149"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19650, "epoch": 0, "train_loss": 3.8738337606191635, "train_ppl": 48.12653848513076, "lr": 0.00056, "grad_norm": 0.6941, "tokens_per_sec": 147371, "dt_s": 4.447, "eta_s": 32363, "world_size": 1, "timestamp": "2026-05-04T22:51:00.556154"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19660, "epoch": 0, "train_loss": 3.856094479560852, "train_ppl": 47.28033599476495, "lr": 0.00056, "grad_norm": 0.6879, "tokens_per_sec": 146460, "dt_s": 4.475, "eta_s": 32413, "world_size": 1, "timestamp": "2026-05-04T22:51:05.030827"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19670, "epoch": 0, "train_loss": 3.8166605681180954, "train_ppl": 45.45217017543972, "lr": 0.00056, "grad_norm": 0.676, "tokens_per_sec": 147360, "dt_s": 4.447, "eta_s": 32473, "world_size": 1, "timestamp": "2026-05-04T22:51:09.478148"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19680, "epoch": 0, "train_loss": 3.8981216698884964, "train_ppl": 49.30974209223736, "lr": 0.00056, "grad_norm": 0.6291, "tokens_per_sec": 147439, "dt_s": 4.445, "eta_s": 32459, "world_size": 1, "timestamp": "2026-05-04T22:51:13.923101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19690, "epoch": 0, "train_loss": 3.8450026363134384, "train_ppl": 46.75880762162259, "lr": 0.00056, "grad_norm": 0.7326, "tokens_per_sec": 149247, "dt_s": 4.391, "eta_s": 32411, "world_size": 1, "timestamp": "2026-05-04T22:51:18.314204"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19700, "epoch": 0, "train_loss": 3.8103679418563843, "train_ppl": 45.16705465924715, "lr": 0.00056, "grad_norm": 0.6411, "tokens_per_sec": 148927, "dt_s": 4.401, "eta_s": 32339, "world_size": 1, "timestamp": "2026-05-04T22:51:22.714759"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19710, "epoch": 0, "train_loss": 3.875455155968666, "train_ppl": 48.20463392550121, "lr": 0.00056, "grad_norm": 0.6574, "tokens_per_sec": 148126, "dt_s": 4.424, "eta_s": 32261, "world_size": 1, "timestamp": "2026-05-04T22:51:27.139083"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19720, "epoch": 0, "train_loss": 3.8534993082284927, "train_ppl": 47.15779449907087, "lr": 0.00056, "grad_norm": 0.7422, "tokens_per_sec": 133457, "dt_s": 4.911, "eta_s": 32933, "world_size": 1, "timestamp": "2026-05-04T22:51:32.049760"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19730, "epoch": 0, "train_loss": 3.8081667125225067, "train_ppl": 45.067740959821876, "lr": 0.00056, "grad_norm": 0.6894, "tokens_per_sec": 147576, "dt_s": 4.441, "eta_s": 32923, "world_size": 1, "timestamp": "2026-05-04T22:51:36.490604"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19740, "epoch": 0, "train_loss": 3.9724256694316864, "train_ppl": 53.11320979970192, "lr": 0.00056, "grad_norm": 0.6728, "tokens_per_sec": 149562, "dt_s": 4.382, "eta_s": 32905, "world_size": 1, "timestamp": "2026-05-04T22:51:40.872451"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19750, "epoch": 0, "train_loss": 3.864304780960083, "train_ppl": 47.670119734955804, "lr": 0.00056, "grad_norm": 0.6922, "tokens_per_sec": 149734, "dt_s": 4.377, "eta_s": 32866, "world_size": 1, "timestamp": "2026-05-04T22:51:45.249287"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19760, "epoch": 0, "train_loss": 3.810967281460762, "train_ppl": 45.194133177721234, "lr": 0.00056, "grad_norm": 0.6515, "tokens_per_sec": 149391, "dt_s": 4.387, "eta_s": 32806, "world_size": 1, "timestamp": "2026-05-04T22:51:49.636129"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19770, "epoch": 0, "train_loss": 3.9248887598514557, "train_ppl": 50.6474437162995, "lr": 0.00056, "grad_norm": 0.6754, "tokens_per_sec": 150436, "dt_s": 4.356, "eta_s": 31994, "world_size": 1, "timestamp": "2026-05-04T22:51:53.992552"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19780, "epoch": 0, "train_loss": 3.8174548149108887, "train_ppl": 45.488284755873394, "lr": 0.00056, "grad_norm": 0.693, "tokens_per_sec": 148754, "dt_s": 4.406, "eta_s": 31938, "world_size": 1, "timestamp": "2026-05-04T22:51:58.398218"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19790, "epoch": 0, "train_loss": 3.8036806881427765, "train_ppl": 44.866018778742834, "lr": 0.00056, "grad_norm": 0.7731, "tokens_per_sec": 146830, "dt_s": 4.463, "eta_s": 32052, "world_size": 1, "timestamp": "2026-05-04T22:52:02.861585"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19800, "epoch": 0, "train_loss": 3.836443915963173, "train_ppl": 46.36031976881439, "lr": 0.00056, "grad_norm": 0.6747, "tokens_per_sec": 149038, "dt_s": 4.397, "eta_s": 32078, "world_size": 1, "timestamp": "2026-05-04T22:52:07.258846"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19810, "epoch": 0, "train_loss": 3.921842262148857, "train_ppl": 50.49338119009778, "lr": 0.00056, "grad_norm": 0.7131, "tokens_per_sec": 147568, "dt_s": 4.441, "eta_s": 32152, "world_size": 1, "timestamp": "2026-05-04T22:52:11.699919"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19820, "epoch": 0, "train_loss": 3.797105610370636, "train_ppl": 44.571988909856124, "lr": 0.00056, "grad_norm": 0.6582, "tokens_per_sec": 148610, "dt_s": 4.41, "eta_s": 32226, "world_size": 1, "timestamp": "2026-05-04T22:52:16.109839"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19830, "epoch": 0, "train_loss": 3.6966511756181717, "train_ppl": 40.31207998875719, "lr": 0.00056, "grad_norm": 0.7079, "tokens_per_sec": 147935, "dt_s": 4.43, "eta_s": 32257, "world_size": 1, "timestamp": "2026-05-04T22:52:20.539888"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19840, "epoch": 0, "train_loss": 3.857869014143944, "train_ppl": 47.36431107239127, "lr": 0.00056, "grad_norm": 0.6456, "tokens_per_sec": 146741, "dt_s": 4.466, "eta_s": 32256, "world_size": 1, "timestamp": "2026-05-04T22:52:25.005983"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19850, "epoch": 0, "train_loss": 3.882115602493286, "train_ppl": 48.526769905866324, "lr": 0.00056, "grad_norm": 0.6956, "tokens_per_sec": 148776, "dt_s": 4.405, "eta_s": 32263, "world_size": 1, "timestamp": "2026-05-04T22:52:29.410970"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19860, "epoch": 0, "train_loss": 3.855669289827347, "train_ppl": 47.26023715451635, "lr": 0.00056, "grad_norm": 0.6635, "tokens_per_sec": 147721, "dt_s": 4.436, "eta_s": 32252, "world_size": 1, "timestamp": "2026-05-04T22:52:33.847441"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19870, "epoch": 0, "train_loss": 3.883698284626007, "train_ppl": 48.60363316656523, "lr": 0.00056, "grad_norm": 0.7008, "tokens_per_sec": 147565, "dt_s": 4.441, "eta_s": 32293, "world_size": 1, "timestamp": "2026-05-04T22:52:38.288620"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19880, "epoch": 0, "train_loss": 3.8590229898691177, "train_ppl": 47.418999886321345, "lr": 0.00056, "grad_norm": 0.6729, "tokens_per_sec": 150174, "dt_s": 4.364, "eta_s": 32192, "world_size": 1, "timestamp": "2026-05-04T22:52:42.652653"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19890, "epoch": 0, "train_loss": 3.7484264224767685, "train_ppl": 42.45422439776988, "lr": 0.00056, "grad_norm": 0.6809, "tokens_per_sec": 148838, "dt_s": 4.403, "eta_s": 32097, "world_size": 1, "timestamp": "2026-05-04T22:52:47.055838"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19900, "epoch": 0, "train_loss": 3.9240132719278336, "train_ppl": 50.603121895404, "lr": 0.00056, "grad_norm": 0.7429, "tokens_per_sec": 148798, "dt_s": 4.404, "eta_s": 32091, "world_size": 1, "timestamp": "2026-05-04T22:52:51.460197"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19910, "epoch": 0, "train_loss": 3.8729382902383804, "train_ppl": 48.083461885175865, "lr": 0.00056, "grad_norm": 0.6532, "tokens_per_sec": 150004, "dt_s": 4.369, "eta_s": 31989, "world_size": 1, "timestamp": "2026-05-04T22:52:55.829142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19920, "epoch": 0, "train_loss": 4.002552375197411, "train_ppl": 54.73768299159419, "lr": 0.00056, "grad_norm": 0.7134, "tokens_per_sec": 147649, "dt_s": 4.439, "eta_s": 31981, "world_size": 1, "timestamp": "2026-05-04T22:53:00.267789"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19930, "epoch": 0, "train_loss": 3.8327818363904953, "train_ppl": 46.190855074791784, "lr": 0.00056, "grad_norm": 0.7277, "tokens_per_sec": 148277, "dt_s": 4.42, "eta_s": 32058, "world_size": 1, "timestamp": "2026-05-04T22:53:04.687620"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19940, "epoch": 0, "train_loss": 3.8646824210882187, "train_ppl": 47.68812528467583, "lr": 0.00056, "grad_norm": 0.6565, "tokens_per_sec": 148533, "dt_s": 4.412, "eta_s": 32066, "world_size": 1, "timestamp": "2026-05-04T22:53:09.099874"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19950, "epoch": 0, "train_loss": 3.8738948106765747, "train_ppl": 48.12947670275654, "lr": 0.00056, "grad_norm": 0.6767, "tokens_per_sec": 146605, "dt_s": 4.47, "eta_s": 32158, "world_size": 1, "timestamp": "2026-05-04T22:53:13.570099"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19960, "epoch": 0, "train_loss": 3.799182802438736, "train_ppl": 44.66466971627273, "lr": 0.00056, "grad_norm": 0.6693, "tokens_per_sec": 149536, "dt_s": 4.383, "eta_s": 32173, "world_size": 1, "timestamp": "2026-05-04T22:53:17.952708"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19970, "epoch": 0, "train_loss": 3.790256842970848, "train_ppl": 44.26776868119428, "lr": 0.00056, "grad_norm": 0.7436, "tokens_per_sec": 149637, "dt_s": 4.38, "eta_s": 32083, "world_size": 1, "timestamp": "2026-05-04T22:53:22.332402"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19980, "epoch": 0, "train_loss": 3.8058812469244003, "train_ppl": 44.96485780101679, "lr": 0.00056, "grad_norm": 0.628, "tokens_per_sec": 146158, "dt_s": 4.484, "eta_s": 32172, "world_size": 1, "timestamp": "2026-05-04T22:53:26.816316"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 19990, "epoch": 0, "train_loss": 3.8509745448827744, "train_ppl": 47.038882403671, "lr": 0.00056, "grad_norm": 0.6604, "tokens_per_sec": 148927, "dt_s": 4.401, "eta_s": 32150, "world_size": 1, "timestamp": "2026-05-04T22:53:31.216851"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20000, "epoch": 0, "train_loss": 3.8063357919454575, "train_ppl": 44.98530099907775, "lr": 0.00056, "grad_norm": 0.6733, "tokens_per_sec": 147992, "dt_s": 4.428, "eta_s": 32085, "world_size": 1, "timestamp": "2026-05-04T22:53:35.645203"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20010, "epoch": 0, "train_loss": 3.830800473690033, "train_ppl": 46.09942484557844, "lr": 0.00056, "grad_norm": 0.6815, "tokens_per_sec": 113442, "dt_s": 5.777, "eta_s": 32964, "world_size": 1, "timestamp": "2026-05-04T22:53:41.422284"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20020, "epoch": 0, "train_loss": 3.7643633782863617, "train_ppl": 43.136235652461565, "lr": 0.00056, "grad_norm": 0.6897, "tokens_per_sec": 147135, "dt_s": 4.454, "eta_s": 33067, "world_size": 1, "timestamp": "2026-05-04T22:53:45.876397"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20030, "epoch": 0, "train_loss": 3.7801619321107864, "train_ppl": 43.823137534201834, "lr": 0.00056, "grad_norm": 0.6523, "tokens_per_sec": 143218, "dt_s": 4.576, "eta_s": 33197, "world_size": 1, "timestamp": "2026-05-04T22:53:50.452401"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20040, "epoch": 0, "train_loss": 3.8100436627864838, "train_ppl": 45.15241030332949, "lr": 0.00056, "grad_norm": 0.672, "tokens_per_sec": 149379, "dt_s": 4.387, "eta_s": 33173, "world_size": 1, "timestamp": "2026-05-04T22:53:54.839657"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20050, "epoch": 0, "train_loss": 3.9075951278209686, "train_ppl": 49.779095549498756, "lr": 0.00056, "grad_norm": 0.6724, "tokens_per_sec": 148757, "dt_s": 4.406, "eta_s": 33135, "world_size": 1, "timestamp": "2026-05-04T22:53:59.245217"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20060, "epoch": 0, "train_loss": 3.878635808825493, "train_ppl": 48.358200223186934, "lr": 0.00056, "grad_norm": 0.8041, "tokens_per_sec": 145300, "dt_s": 4.51, "eta_s": 32434, "world_size": 1, "timestamp": "2026-05-04T22:54:03.755608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20070, "epoch": 0, "train_loss": 3.836929589509964, "train_ppl": 46.382841218340324, "lr": 0.00056, "grad_norm": 0.771, "tokens_per_sec": 149590, "dt_s": 4.381, "eta_s": 32323, "world_size": 1, "timestamp": "2026-05-04T22:54:08.136676"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20080, "epoch": 0, "train_loss": 3.9391648024320602, "train_ppl": 51.37567453843282, "lr": 0.00056, "grad_norm": 0.6817, "tokens_per_sec": 147741, "dt_s": 4.436, "eta_s": 32115, "world_size": 1, "timestamp": "2026-05-04T22:54:12.572537"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20090, "epoch": 0, "train_loss": 3.7913805842399597, "train_ppl": 44.31754216077157, "lr": 0.00056, "grad_norm": 0.6938, "tokens_per_sec": 147117, "dt_s": 4.455, "eta_s": 32209, "world_size": 1, "timestamp": "2026-05-04T22:54:17.027244"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20100, "epoch": 0, "train_loss": 3.8153284788131714, "train_ppl": 45.39166413433569, "lr": 0.00056, "grad_norm": 0.6399, "tokens_per_sec": 150014, "dt_s": 4.369, "eta_s": 32151, "world_size": 1, "timestamp": "2026-05-04T22:54:21.395892"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20110, "epoch": 0, "train_loss": 3.849601998925209, "train_ppl": 46.97436366337636, "lr": 0.00056, "grad_norm": 0.7177, "tokens_per_sec": 148909, "dt_s": 4.401, "eta_s": 31988, "world_size": 1, "timestamp": "2026-05-04T22:54:25.796969"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20120, "epoch": 0, "train_loss": 3.8248369842767715, "train_ppl": 45.825329508184375, "lr": 0.00056, "grad_norm": 0.677, "tokens_per_sec": 149392, "dt_s": 4.387, "eta_s": 31992, "world_size": 1, "timestamp": "2026-05-04T22:54:30.183801"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20130, "epoch": 0, "train_loss": 3.8678525537252426, "train_ppl": 47.83954284710858, "lr": 0.00056, "grad_norm": 0.6892, "tokens_per_sec": 149587, "dt_s": 4.381, "eta_s": 31908, "world_size": 1, "timestamp": "2026-05-04T22:54:34.564931"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20140, "epoch": 0, "train_loss": 3.811628669500351, "train_ppl": 45.22403392377251, "lr": 0.00056, "grad_norm": 0.7097, "tokens_per_sec": 148343, "dt_s": 4.418, "eta_s": 31850, "world_size": 1, "timestamp": "2026-05-04T22:54:38.982810"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20150, "epoch": 0, "train_loss": 3.8363232612609863, "train_ppl": 46.35472651567235, "lr": 0.00056, "grad_norm": 0.6868, "tokens_per_sec": 150778, "dt_s": 4.347, "eta_s": 31813, "world_size": 1, "timestamp": "2026-05-04T22:54:43.329356"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20160, "epoch": 0, "train_loss": 3.9241959005594254, "train_ppl": 50.61236431824986, "lr": 0.00056, "grad_norm": 0.6831, "tokens_per_sec": 148638, "dt_s": 4.409, "eta_s": 31821, "world_size": 1, "timestamp": "2026-05-04T22:54:47.738417"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20170, "epoch": 0, "train_loss": 3.9573196470737457, "train_ppl": 52.31691006811237, "lr": 0.00056, "grad_norm": 0.7148, "tokens_per_sec": 146658, "dt_s": 4.469, "eta_s": 31935, "world_size": 1, "timestamp": "2026-05-04T22:54:52.207060"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20180, "epoch": 0, "train_loss": 3.7341541051864624, "train_ppl": 41.85260768154263, "lr": 0.00056, "grad_norm": 0.6865, "tokens_per_sec": 149869, "dt_s": 4.373, "eta_s": 31918, "world_size": 1, "timestamp": "2026-05-04T22:54:56.579934"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20190, "epoch": 0, "train_loss": 3.8522152453660965, "train_ppl": 47.09727978714402, "lr": 0.00056, "grad_norm": 0.6536, "tokens_per_sec": 145739, "dt_s": 4.497, "eta_s": 32029, "world_size": 1, "timestamp": "2026-05-04T22:55:01.076761"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20200, "epoch": 0, "train_loss": 3.886360004544258, "train_ppl": 48.73317475029003, "lr": 0.00056, "grad_norm": 0.7065, "tokens_per_sec": 142663, "dt_s": 4.594, "eta_s": 32382, "world_size": 1, "timestamp": "2026-05-04T22:55:05.670516"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20210, "epoch": 0, "train_loss": 3.8897548019886017, "train_ppl": 48.89889514181212, "lr": 0.00056, "grad_norm": 0.7153, "tokens_per_sec": 146843, "dt_s": 4.463, "eta_s": 32456, "world_size": 1, "timestamp": "2026-05-04T22:55:10.133528"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20220, "epoch": 0, "train_loss": 3.8433416336774826, "train_ppl": 46.681205585343875, "lr": 0.00056, "grad_norm": 0.6568, "tokens_per_sec": 148107, "dt_s": 4.425, "eta_s": 32388, "world_size": 1, "timestamp": "2026-05-04T22:55:14.558440"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20230, "epoch": 0, "train_loss": 3.8649065792560577, "train_ppl": 47.6988161656467, "lr": 0.00056, "grad_norm": 0.7032, "tokens_per_sec": 147165, "dt_s": 4.453, "eta_s": 32500, "world_size": 1, "timestamp": "2026-05-04T22:55:19.011669"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20240, "epoch": 0, "train_loss": 3.770083948969841, "train_ppl": 43.383706700469475, "lr": 0.00056, "grad_norm": 0.7249, "tokens_per_sec": 149012, "dt_s": 4.398, "eta_s": 32353, "world_size": 1, "timestamp": "2026-05-04T22:55:23.409697"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20250, "epoch": 0, "train_loss": 3.9294351637363434, "train_ppl": 50.87823168129317, "lr": 0.00056, "grad_norm": 0.695, "tokens_per_sec": 145536, "dt_s": 4.503, "eta_s": 32217, "world_size": 1, "timestamp": "2026-05-04T22:55:27.912787"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20260, "epoch": 0, "train_loss": 3.8675881773233414, "train_ppl": 47.82689687262434, "lr": 0.00056, "grad_norm": 0.6796, "tokens_per_sec": 148016, "dt_s": 4.428, "eta_s": 32161, "world_size": 1, "timestamp": "2026-05-04T22:55:32.340448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20270, "epoch": 0, "train_loss": 3.893860846757889, "train_ppl": 49.10008896718963, "lr": 0.00056, "grad_norm": 0.656, "tokens_per_sec": 149157, "dt_s": 4.394, "eta_s": 32112, "world_size": 1, "timestamp": "2026-05-04T22:55:36.734188"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20280, "epoch": 0, "train_loss": 3.844491347670555, "train_ppl": 46.734906485041414, "lr": 0.00056, "grad_norm": 0.7161, "tokens_per_sec": 147944, "dt_s": 4.43, "eta_s": 32073, "world_size": 1, "timestamp": "2026-05-04T22:55:41.163968"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20290, "epoch": 0, "train_loss": 3.871149569749832, "train_ppl": 47.99753088792072, "lr": 0.00056, "grad_norm": 0.696, "tokens_per_sec": 149914, "dt_s": 4.372, "eta_s": 32030, "world_size": 1, "timestamp": "2026-05-04T22:55:45.535536"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20300, "epoch": 0, "train_loss": 3.8326271772384644, "train_ppl": 46.183711788715854, "lr": 0.00056, "grad_norm": 0.6855, "tokens_per_sec": 146937, "dt_s": 4.46, "eta_s": 31964, "world_size": 1, "timestamp": "2026-05-04T22:55:49.995658"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20310, "epoch": 0, "train_loss": 3.871094822883606, "train_ppl": 47.99490324544626, "lr": 0.00056, "grad_norm": 0.6596, "tokens_per_sec": 132896, "dt_s": 4.931, "eta_s": 32689, "world_size": 1, "timestamp": "2026-05-04T22:55:54.927025"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20320, "epoch": 0, "train_loss": 3.817547231912613, "train_ppl": 45.49248884102559, "lr": 0.00056, "grad_norm": 0.6662, "tokens_per_sec": 148026, "dt_s": 4.427, "eta_s": 32733, "world_size": 1, "timestamp": "2026-05-04T22:55:59.354351"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20330, "epoch": 0, "train_loss": 3.881337270140648, "train_ppl": 48.48901464585575, "lr": 0.00056, "grad_norm": 0.6772, "tokens_per_sec": 147525, "dt_s": 4.442, "eta_s": 32746, "world_size": 1, "timestamp": "2026-05-04T22:56:03.796714"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20340, "epoch": 0, "train_loss": 3.8875223249197006, "train_ppl": 48.789851244003245, "lr": 0.00056, "grad_norm": 0.7157, "tokens_per_sec": 149510, "dt_s": 4.383, "eta_s": 32759, "world_size": 1, "timestamp": "2026-05-04T22:56:08.180106"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20350, "epoch": 0, "train_loss": 3.758042573928833, "train_ppl": 42.864439835186325, "lr": 0.00056, "grad_norm": 0.6806, "tokens_per_sec": 149612, "dt_s": 4.38, "eta_s": 32639, "world_size": 1, "timestamp": "2026-05-04T22:56:12.560526"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20360, "epoch": 0, "train_loss": 3.8295892030000687, "train_ppl": 46.04361976778931, "lr": 0.00056, "grad_norm": 0.674, "tokens_per_sec": 150316, "dt_s": 4.36, "eta_s": 31808, "world_size": 1, "timestamp": "2026-05-04T22:56:16.920406"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20370, "epoch": 0, "train_loss": 3.864946961402893, "train_ppl": 47.70074238513715, "lr": 0.00056, "grad_norm": 0.733, "tokens_per_sec": 149985, "dt_s": 4.369, "eta_s": 31720, "world_size": 1, "timestamp": "2026-05-04T22:56:21.289894"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20380, "epoch": 0, "train_loss": 3.817733481526375, "train_ppl": 45.500962588592905, "lr": 0.00056, "grad_norm": 0.6361, "tokens_per_sec": 147691, "dt_s": 4.437, "eta_s": 31708, "world_size": 1, "timestamp": "2026-05-04T22:56:25.727303"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20390, "epoch": 0, "train_loss": 3.871533378958702, "train_ppl": 48.0159563179772, "lr": 0.00056, "grad_norm": 0.6711, "tokens_per_sec": 145767, "dt_s": 4.496, "eta_s": 31867, "world_size": 1, "timestamp": "2026-05-04T22:56:30.223264"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20400, "epoch": 0, "train_loss": 3.842451199889183, "train_ppl": 46.63965756323868, "lr": 0.00056, "grad_norm": 0.6609, "tokens_per_sec": 147577, "dt_s": 4.441, "eta_s": 31949, "world_size": 1, "timestamp": "2026-05-04T22:56:34.664012"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20410, "epoch": 0, "train_loss": 3.9491173923015594, "train_ppl": 51.889548503022226, "lr": 0.00056, "grad_norm": 0.7021, "tokens_per_sec": 145769, "dt_s": 4.496, "eta_s": 32142, "world_size": 1, "timestamp": "2026-05-04T22:56:39.159881"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20420, "epoch": 0, "train_loss": 3.911720246076584, "train_ppl": 49.98486432377127, "lr": 0.00056, "grad_norm": 0.748, "tokens_per_sec": 149657, "dt_s": 4.379, "eta_s": 32151, "world_size": 1, "timestamp": "2026-05-04T22:56:43.538986"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20430, "epoch": 0, "train_loss": 3.8499162197113037, "train_ppl": 46.98912630409578, "lr": 0.00056, "grad_norm": 0.7412, "tokens_per_sec": 149258, "dt_s": 4.391, "eta_s": 32079, "world_size": 1, "timestamp": "2026-05-04T22:56:47.929754"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20440, "epoch": 0, "train_loss": 3.876400962471962, "train_ppl": 48.25024774928222, "lr": 0.00056, "grad_norm": 0.8428, "tokens_per_sec": 147010, "dt_s": 4.458, "eta_s": 32020, "world_size": 1, "timestamp": "2026-05-04T22:56:52.387676"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20450, "epoch": 0, "train_loss": 3.9480549544095993, "train_ppl": 51.83444835592095, "lr": 0.00056, "grad_norm": 0.6447, "tokens_per_sec": 147851, "dt_s": 4.433, "eta_s": 32004, "world_size": 1, "timestamp": "2026-05-04T22:56:56.820279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20460, "epoch": 0, "train_loss": 3.937415733933449, "train_ppl": 51.28589350398078, "lr": 0.00056, "grad_norm": 0.6878, "tokens_per_sec": 146578, "dt_s": 4.471, "eta_s": 31963, "world_size": 1, "timestamp": "2026-05-04T22:57:01.291385"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20470, "epoch": 0, "train_loss": 3.9548581391572952, "train_ppl": 52.18828994448441, "lr": 0.00056, "grad_norm": 0.7285, "tokens_per_sec": 147715, "dt_s": 4.437, "eta_s": 32042, "world_size": 1, "timestamp": "2026-05-04T22:57:05.727987"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20480, "epoch": 0, "train_loss": 3.8312118500471115, "train_ppl": 46.1183929602843, "lr": 0.00056, "grad_norm": 0.6834, "tokens_per_sec": 147205, "dt_s": 4.452, "eta_s": 32126, "world_size": 1, "timestamp": "2026-05-04T22:57:10.180009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20490, "epoch": 0, "train_loss": 4.004450246691704, "train_ppl": 54.84166672247523, "lr": 0.00056, "grad_norm": 0.7055, "tokens_per_sec": 147019, "dt_s": 4.458, "eta_s": 32121, "world_size": 1, "timestamp": "2026-05-04T22:57:14.637645"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20500, "epoch": 0, "train_loss": 3.9274518936872482, "train_ppl": 50.77742640333497, "lr": 0.00056, "grad_norm": 0.6958, "tokens_per_sec": 150489, "dt_s": 4.355, "eta_s": 32004, "world_size": 1, "timestamp": "2026-05-04T22:57:18.992518"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20510, "epoch": 0, "train_loss": 3.8872328996658325, "train_ppl": 48.77573227221282, "lr": 0.00056, "grad_norm": 0.6853, "tokens_per_sec": 126514, "dt_s": 5.18, "eta_s": 31922, "world_size": 1, "timestamp": "2026-05-04T22:57:24.172656"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20520, "epoch": 0, "train_loss": 3.7779377698898315, "train_ppl": 43.72577608126368, "lr": 0.00056, "grad_norm": 0.6722, "tokens_per_sec": 148532, "dt_s": 4.412, "eta_s": 31883, "world_size": 1, "timestamp": "2026-05-04T22:57:28.584900"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20530, "epoch": 0, "train_loss": 3.861259788274765, "train_ppl": 47.52518534312184, "lr": 0.00056, "grad_norm": 0.7273, "tokens_per_sec": 146583, "dt_s": 4.471, "eta_s": 31905, "world_size": 1, "timestamp": "2026-05-04T22:57:33.055812"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20540, "epoch": 0, "train_loss": 3.809243366122246, "train_ppl": 45.11628943561428, "lr": 0.00056, "grad_norm": 0.6816, "tokens_per_sec": 147533, "dt_s": 4.442, "eta_s": 31879, "world_size": 1, "timestamp": "2026-05-04T22:57:37.497946"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20550, "epoch": 0, "train_loss": 3.7978348284959793, "train_ppl": 44.6045034657127, "lr": 0.00056, "grad_norm": 0.6705, "tokens_per_sec": 148244, "dt_s": 4.421, "eta_s": 31969, "world_size": 1, "timestamp": "2026-05-04T22:57:41.918748"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20560, "epoch": 0, "train_loss": 3.902535155415535, "train_ppl": 49.52785088125294, "lr": 0.00056, "grad_norm": 0.716, "tokens_per_sec": 148837, "dt_s": 4.403, "eta_s": 31945, "world_size": 1, "timestamp": "2026-05-04T22:57:46.321950"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20570, "epoch": 0, "train_loss": 3.8500216752290726, "train_ppl": 46.99408182802876, "lr": 0.00056, "grad_norm": 0.619, "tokens_per_sec": 145525, "dt_s": 4.503, "eta_s": 32072, "world_size": 1, "timestamp": "2026-05-04T22:57:50.825370"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20580, "epoch": 0, "train_loss": 3.736899197101593, "train_ppl": 41.96765477168982, "lr": 0.00056, "grad_norm": 0.6455, "tokens_per_sec": 149119, "dt_s": 4.395, "eta_s": 31958, "world_size": 1, "timestamp": "2026-05-04T22:57:55.220251"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20590, "epoch": 0, "train_loss": 3.8064122796058655, "train_ppl": 44.988741951097424, "lr": 0.00056, "grad_norm": 0.6446, "tokens_per_sec": 148769, "dt_s": 4.405, "eta_s": 31900, "world_size": 1, "timestamp": "2026-05-04T22:57:59.625455"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20600, "epoch": 0, "train_loss": 3.948079466819763, "train_ppl": 51.83571895875237, "lr": 0.00056, "grad_norm": 0.6933, "tokens_per_sec": 130446, "dt_s": 5.024, "eta_s": 32765, "world_size": 1, "timestamp": "2026-05-04T22:58:04.649464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20610, "epoch": 0, "train_loss": 3.947409838438034, "train_ppl": 51.80101990918106, "lr": 0.00056, "grad_norm": 0.848, "tokens_per_sec": 148908, "dt_s": 4.401, "eta_s": 32758, "world_size": 1, "timestamp": "2026-05-04T22:58:09.050561"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20620, "epoch": 0, "train_loss": 3.916774868965149, "train_ppl": 50.23815857724075, "lr": 0.00056, "grad_norm": 0.7887, "tokens_per_sec": 148173, "dt_s": 4.423, "eta_s": 32637, "world_size": 1, "timestamp": "2026-05-04T22:58:13.473508"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20630, "epoch": 0, "train_loss": 3.8344424962997437, "train_ppl": 46.26762610363007, "lr": 0.00056, "grad_norm": 0.6943, "tokens_per_sec": 147084, "dt_s": 4.456, "eta_s": 32720, "world_size": 1, "timestamp": "2026-05-04T22:58:17.929222"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20640, "epoch": 0, "train_loss": 3.821068614721298, "train_ppl": 45.65296769706715, "lr": 0.00056, "grad_norm": 0.6682, "tokens_per_sec": 148587, "dt_s": 4.411, "eta_s": 32724, "world_size": 1, "timestamp": "2026-05-04T22:58:22.339812"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20650, "epoch": 0, "train_loss": 3.848126143217087, "train_ppl": 46.90508741407684, "lr": 0.00056, "grad_norm": 0.6762, "tokens_per_sec": 147086, "dt_s": 4.456, "eta_s": 31900, "world_size": 1, "timestamp": "2026-05-04T22:58:26.795423"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20660, "epoch": 0, "train_loss": 3.755059838294983, "train_ppl": 42.736777029912346, "lr": 0.00056, "grad_norm": 0.7073, "tokens_per_sec": 149733, "dt_s": 4.377, "eta_s": 31861, "world_size": 1, "timestamp": "2026-05-04T22:58:31.172275"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20670, "epoch": 0, "train_loss": 3.806041434407234, "train_ppl": 44.97206118533452, "lr": 0.00056, "grad_norm": 0.7132, "tokens_per_sec": 150247, "dt_s": 4.362, "eta_s": 31768, "world_size": 1, "timestamp": "2026-05-04T22:58:35.534171"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20680, "epoch": 0, "train_loss": 3.8664979934692383, "train_ppl": 47.77478517269012, "lr": 0.00056, "grad_norm": 0.7104, "tokens_per_sec": 144093, "dt_s": 4.548, "eta_s": 31897, "world_size": 1, "timestamp": "2026-05-04T22:58:40.082334"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20690, "epoch": 0, "train_loss": 3.9196259826421738, "train_ppl": 50.38159766167038, "lr": 0.00056, "grad_norm": 0.6659, "tokens_per_sec": 147527, "dt_s": 4.442, "eta_s": 31938, "world_size": 1, "timestamp": "2026-05-04T22:58:44.524615"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20700, "epoch": 0, "train_loss": 3.8375285118818283, "train_ppl": 46.410629260225946, "lr": 0.00056, "grad_norm": 0.675, "tokens_per_sec": 147840, "dt_s": 4.433, "eta_s": 31901, "world_size": 1, "timestamp": "2026-05-04T22:58:48.957501"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20710, "epoch": 0, "train_loss": 3.842821344733238, "train_ppl": 46.656924187393024, "lr": 0.00056, "grad_norm": 0.6627, "tokens_per_sec": 145719, "dt_s": 4.497, "eta_s": 32070, "world_size": 1, "timestamp": "2026-05-04T22:58:53.454926"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20720, "epoch": 0, "train_loss": 3.8518802523612976, "train_ppl": 47.081505170210946, "lr": 0.00056, "grad_norm": 0.6992, "tokens_per_sec": 149691, "dt_s": 4.378, "eta_s": 32089, "world_size": 1, "timestamp": "2026-05-04T22:58:57.832999"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20730, "epoch": 0, "train_loss": 3.8756970018148422, "train_ppl": 48.21629342582657, "lr": 0.00056, "grad_norm": 0.6804, "tokens_per_sec": 147324, "dt_s": 4.448, "eta_s": 31941, "world_size": 1, "timestamp": "2026-05-04T22:59:02.281463"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20740, "epoch": 0, "train_loss": 3.8907152265310287, "train_ppl": 48.94588140057018, "lr": 0.00056, "grad_norm": 0.6878, "tokens_per_sec": 146990, "dt_s": 4.459, "eta_s": 31960, "world_size": 1, "timestamp": "2026-05-04T22:59:06.739958"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20750, "epoch": 0, "train_loss": 3.8555430471897125, "train_ppl": 47.25427127410491, "lr": 0.00056, "grad_norm": 0.7125, "tokens_per_sec": 147863, "dt_s": 4.432, "eta_s": 31955, "world_size": 1, "timestamp": "2026-05-04T22:59:11.172200"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20760, "epoch": 0, "train_loss": 3.8995185643434525, "train_ppl": 49.37867072934835, "lr": 0.00056, "grad_norm": 0.7103, "tokens_per_sec": 146280, "dt_s": 4.48, "eta_s": 31926, "world_size": 1, "timestamp": "2026-05-04T22:59:15.652366"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20770, "epoch": 0, "train_loss": 3.891171395778656, "train_ppl": 48.96821409982117, "lr": 0.00056, "grad_norm": 0.6762, "tokens_per_sec": 147680, "dt_s": 4.438, "eta_s": 32007, "world_size": 1, "timestamp": "2026-05-04T22:59:20.090068"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20780, "epoch": 0, "train_loss": 3.8335532397031784, "train_ppl": 46.22650060018271, "lr": 0.00056, "grad_norm": 0.6452, "tokens_per_sec": 149800, "dt_s": 4.375, "eta_s": 31897, "world_size": 1, "timestamp": "2026-05-04T22:59:24.464955"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20790, "epoch": 0, "train_loss": 3.9363673627376556, "train_ppl": 51.23215502433969, "lr": 0.00056, "grad_norm": 0.7286, "tokens_per_sec": 145762, "dt_s": 4.496, "eta_s": 31946, "world_size": 1, "timestamp": "2026-05-04T22:59:28.961047"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20800, "epoch": 0, "train_loss": 3.8560822159051895, "train_ppl": 47.2797561685601, "lr": 0.00056, "grad_norm": 0.7072, "tokens_per_sec": 150212, "dt_s": 4.363, "eta_s": 31842, "world_size": 1, "timestamp": "2026-05-04T22:59:33.323947"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20810, "epoch": 0, "train_loss": 3.7862167954444885, "train_ppl": 44.08928557471924, "lr": 0.00056, "grad_norm": 0.6957, "tokens_per_sec": 149044, "dt_s": 4.397, "eta_s": 31718, "world_size": 1, "timestamp": "2026-05-04T22:59:37.721050"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20820, "epoch": 0, "train_loss": 3.8060117810964584, "train_ppl": 44.97072763460015, "lr": 0.00056, "grad_norm": 0.7462, "tokens_per_sec": 145875, "dt_s": 4.493, "eta_s": 31793, "world_size": 1, "timestamp": "2026-05-04T22:59:42.213667"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20830, "epoch": 0, "train_loss": 3.854127272963524, "train_ppl": 47.18741723104019, "lr": 0.00056, "grad_norm": 0.6543, "tokens_per_sec": 150267, "dt_s": 4.361, "eta_s": 31769, "world_size": 1, "timestamp": "2026-05-04T22:59:46.574958"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20840, "epoch": 0, "train_loss": 3.7913889288902283, "train_ppl": 44.31791197670466, "lr": 0.00056, "grad_norm": 0.6882, "tokens_per_sec": 148750, "dt_s": 4.406, "eta_s": 31635, "world_size": 1, "timestamp": "2026-05-04T22:59:50.980741"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20850, "epoch": 0, "train_loss": 3.831867516040802, "train_ppl": 46.148641137498956, "lr": 0.00056, "grad_norm": 0.6315, "tokens_per_sec": 148258, "dt_s": 4.42, "eta_s": 31713, "world_size": 1, "timestamp": "2026-05-04T22:59:55.401157"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20860, "epoch": 0, "train_loss": 3.8679725974798203, "train_ppl": 47.84528603015896, "lr": 0.00056, "grad_norm": 0.6697, "tokens_per_sec": 149367, "dt_s": 4.388, "eta_s": 31695, "world_size": 1, "timestamp": "2026-05-04T22:59:59.788748"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20870, "epoch": 0, "train_loss": 3.9295923709869385, "train_ppl": 50.88623073694923, "lr": 0.00056, "grad_norm": 0.6821, "tokens_per_sec": 146619, "dt_s": 4.47, "eta_s": 31658, "world_size": 1, "timestamp": "2026-05-04T23:00:04.258576"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20880, "epoch": 0, "train_loss": 3.8194386959075928, "train_ppl": 45.57861767482631, "lr": 0.00056, "grad_norm": 0.6751, "tokens_per_sec": 149664, "dt_s": 4.379, "eta_s": 31679, "world_size": 1, "timestamp": "2026-05-04T23:00:08.637454"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20890, "epoch": 0, "train_loss": 3.749262586236, "train_ppl": 42.48973792713152, "lr": 0.00056, "grad_norm": 0.7251, "tokens_per_sec": 150588, "dt_s": 4.352, "eta_s": 31597, "world_size": 1, "timestamp": "2026-05-04T23:00:12.989456"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20900, "epoch": 0, "train_loss": 3.9544361382722855, "train_ppl": 52.16627108625629, "lr": 0.00056, "grad_norm": 0.6452, "tokens_per_sec": 131104, "dt_s": 4.999, "eta_s": 32423, "world_size": 1, "timestamp": "2026-05-04T23:00:17.988279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20910, "epoch": 0, "train_loss": 3.9025394916534424, "train_ppl": 49.52806564626304, "lr": 0.00056, "grad_norm": 0.6964, "tokens_per_sec": 149499, "dt_s": 4.384, "eta_s": 32413, "world_size": 1, "timestamp": "2026-05-04T23:00:22.371938"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20920, "epoch": 0, "train_loss": 3.9823212027549744, "train_ppl": 53.64140240081022, "lr": 0.00056, "grad_norm": 0.692, "tokens_per_sec": 147861, "dt_s": 4.432, "eta_s": 32354, "world_size": 1, "timestamp": "2026-05-04T23:00:26.804219"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20930, "epoch": 0, "train_loss": 3.7331976890563965, "train_ppl": 41.81259830832545, "lr": 0.00056, "grad_norm": 0.6684, "tokens_per_sec": 146576, "dt_s": 4.471, "eta_s": 32482, "world_size": 1, "timestamp": "2026-05-04T23:00:31.275349"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20940, "epoch": 0, "train_loss": 3.7476911544799805, "train_ppl": 42.42302063821041, "lr": 0.00056, "grad_norm": 0.6702, "tokens_per_sec": 149073, "dt_s": 4.396, "eta_s": 32541, "world_size": 1, "timestamp": "2026-05-04T23:00:35.671564"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20950, "epoch": 0, "train_loss": 3.8347494453191757, "train_ppl": 46.28183008593173, "lr": 0.00056, "grad_norm": 0.6519, "tokens_per_sec": 148033, "dt_s": 4.427, "eta_s": 31716, "world_size": 1, "timestamp": "2026-05-04T23:00:40.098690"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20960, "epoch": 0, "train_loss": 3.8779627978801727, "train_ppl": 48.325665574456124, "lr": 0.00056, "grad_norm": 0.7084, "tokens_per_sec": 149169, "dt_s": 4.393, "eta_s": 31726, "world_size": 1, "timestamp": "2026-05-04T23:00:44.492098"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20970, "epoch": 0, "train_loss": 3.8370906114578247, "train_ppl": 46.390310475121616, "lr": 0.00056, "grad_norm": 0.7054, "tokens_per_sec": 149442, "dt_s": 4.385, "eta_s": 31654, "world_size": 1, "timestamp": "2026-05-04T23:00:48.877496"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20980, "epoch": 0, "train_loss": 3.831310212612152, "train_ppl": 46.12292950682094, "lr": 0.00056, "grad_norm": 0.6971, "tokens_per_sec": 147932, "dt_s": 4.43, "eta_s": 31591, "world_size": 1, "timestamp": "2026-05-04T23:00:53.307624"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 20990, "epoch": 0, "train_loss": 3.8778097182512283, "train_ppl": 48.31826846568927, "lr": 0.00056, "grad_norm": 0.6899, "tokens_per_sec": 148099, "dt_s": 4.425, "eta_s": 31628, "world_size": 1, "timestamp": "2026-05-04T23:00:57.732812"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21000, "epoch": 0, "train_loss": 3.799220383167267, "train_ppl": 44.66634827864087, "lr": 0.00056, "grad_norm": 1.1803, "tokens_per_sec": 148851, "dt_s": 4.403, "eta_s": 31589, "world_size": 1, "timestamp": "2026-05-04T23:01:02.135567"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21010, "epoch": 0, "train_loss": 3.9693707078695297, "train_ppl": 52.95119858034632, "lr": 0.00056, "grad_norm": 0.76, "tokens_per_sec": 127082, "dt_s": 5.157, "eta_s": 31579, "world_size": 1, "timestamp": "2026-05-04T23:01:07.292558"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21020, "epoch": 0, "train_loss": 3.7465348541736603, "train_ppl": 42.373995235957615, "lr": 0.00056, "grad_norm": 0.6475, "tokens_per_sec": 145095, "dt_s": 4.517, "eta_s": 31763, "world_size": 1, "timestamp": "2026-05-04T23:01:11.809337"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21030, "epoch": 0, "train_loss": 3.919333189725876, "train_ppl": 50.36684844610156, "lr": 0.00056, "grad_norm": 0.7092, "tokens_per_sec": 149411, "dt_s": 4.386, "eta_s": 31695, "world_size": 1, "timestamp": "2026-05-04T23:01:16.195567"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21040, "epoch": 0, "train_loss": 3.8144029676914215, "train_ppl": 45.34967307893638, "lr": 0.00056, "grad_norm": 0.7592, "tokens_per_sec": 148571, "dt_s": 4.411, "eta_s": 31671, "world_size": 1, "timestamp": "2026-05-04T23:01:20.606684"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21050, "epoch": 0, "train_loss": 3.895746812224388, "train_ppl": 49.19277741551677, "lr": 0.00056, "grad_norm": 0.641, "tokens_per_sec": 149020, "dt_s": 4.398, "eta_s": 31659, "world_size": 1, "timestamp": "2026-05-04T23:01:25.004466"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21060, "epoch": 0, "train_loss": 3.873661518096924, "train_ppl": 48.11824976261114, "lr": 0.00056, "grad_norm": 0.658, "tokens_per_sec": 147124, "dt_s": 4.454, "eta_s": 31748, "world_size": 1, "timestamp": "2026-05-04T23:01:29.458964"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21070, "epoch": 0, "train_loss": 3.9407248347997665, "train_ppl": 51.45588480266241, "lr": 0.00056, "grad_norm": 0.733, "tokens_per_sec": 149675, "dt_s": 4.379, "eta_s": 31546, "world_size": 1, "timestamp": "2026-05-04T23:01:33.837519"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21080, "epoch": 0, "train_loss": 3.84017550945282, "train_ppl": 46.53364081696755, "lr": 0.00056, "grad_norm": 0.67, "tokens_per_sec": 148858, "dt_s": 4.403, "eta_s": 31565, "world_size": 1, "timestamp": "2026-05-04T23:01:38.240070"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21090, "epoch": 0, "train_loss": 3.8618125915527344, "train_ppl": 47.551464684349384, "lr": 0.00056, "grad_norm": 0.659, "tokens_per_sec": 146733, "dt_s": 4.466, "eta_s": 31639, "world_size": 1, "timestamp": "2026-05-04T23:01:42.706411"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21100, "epoch": 0, "train_loss": 3.858332797884941, "train_ppl": 47.3862829644795, "lr": 0.00056, "grad_norm": 0.6417, "tokens_per_sec": 147860, "dt_s": 4.432, "eta_s": 31684, "world_size": 1, "timestamp": "2026-05-04T23:01:47.138716"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21110, "epoch": 0, "train_loss": 3.9575137346982956, "train_ppl": 52.32706511836419, "lr": 0.00056, "grad_norm": 0.6648, "tokens_per_sec": 145914, "dt_s": 4.491, "eta_s": 31733, "world_size": 1, "timestamp": "2026-05-04T23:01:51.630122"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21120, "epoch": 0, "train_loss": 3.810145065188408, "train_ppl": 45.15698909833339, "lr": 0.00056, "grad_norm": 0.6531, "tokens_per_sec": 148147, "dt_s": 4.424, "eta_s": 31793, "world_size": 1, "timestamp": "2026-05-04T23:01:56.053849"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21130, "epoch": 0, "train_loss": 3.8501473367214203, "train_ppl": 46.99998754553565, "lr": 0.00056, "grad_norm": 0.7104, "tokens_per_sec": 148560, "dt_s": 4.411, "eta_s": 31801, "world_size": 1, "timestamp": "2026-05-04T23:02:00.465280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21140, "epoch": 0, "train_loss": 3.8457091599702835, "train_ppl": 46.79185549855184, "lr": 0.00056, "grad_norm": 0.7426, "tokens_per_sec": 147857, "dt_s": 4.432, "eta_s": 31748, "world_size": 1, "timestamp": "2026-05-04T23:02:04.897676"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21150, "epoch": 0, "train_loss": 3.931579664349556, "train_ppl": 50.98745715552151, "lr": 0.00056, "grad_norm": 0.7015, "tokens_per_sec": 150113, "dt_s": 4.366, "eta_s": 31648, "world_size": 1, "timestamp": "2026-05-04T23:02:09.263435"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21160, "epoch": 0, "train_loss": 3.8749888241291046, "train_ppl": 48.18215981049232, "lr": 0.00056, "grad_norm": 0.713, "tokens_per_sec": 148154, "dt_s": 4.424, "eta_s": 31547, "world_size": 1, "timestamp": "2026-05-04T23:02:13.686943"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21170, "epoch": 0, "train_loss": 3.7222491055727005, "train_ppl": 41.357306533114176, "lr": 0.00056, "grad_norm": 0.6874, "tokens_per_sec": 145236, "dt_s": 4.512, "eta_s": 31669, "world_size": 1, "timestamp": "2026-05-04T23:02:18.199338"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21180, "epoch": 0, "train_loss": 3.724331110715866, "train_ppl": 41.44350235696147, "lr": 0.00056, "grad_norm": 0.6826, "tokens_per_sec": 149604, "dt_s": 4.381, "eta_s": 31621, "world_size": 1, "timestamp": "2026-05-04T23:02:22.579953"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21190, "epoch": 0, "train_loss": 3.7399652153253555, "train_ppl": 42.09652582557426, "lr": 0.00056, "grad_norm": 0.733, "tokens_per_sec": 132032, "dt_s": 4.964, "eta_s": 32376, "world_size": 1, "timestamp": "2026-05-04T23:02:27.543586"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21200, "epoch": 0, "train_loss": 3.8728798627853394, "train_ppl": 48.08065257303579, "lr": 0.00056, "grad_norm": 0.6933, "tokens_per_sec": 145906, "dt_s": 4.492, "eta_s": 32551, "world_size": 1, "timestamp": "2026-05-04T23:02:32.035299"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21210, "epoch": 0, "train_loss": 3.8788509368896484, "train_ppl": 48.36860454827803, "lr": 0.00056, "grad_norm": 0.6866, "tokens_per_sec": 149814, "dt_s": 4.374, "eta_s": 32477, "world_size": 1, "timestamp": "2026-05-04T23:02:36.409757"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21220, "epoch": 0, "train_loss": 3.973112404346466, "train_ppl": 53.149697022398314, "lr": 0.00056, "grad_norm": 0.7008, "tokens_per_sec": 144350, "dt_s": 4.54, "eta_s": 32512, "world_size": 1, "timestamp": "2026-05-04T23:02:40.949821"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21230, "epoch": 0, "train_loss": 3.792186424136162, "train_ppl": 44.35326939762939, "lr": 0.00056, "grad_norm": 0.7126, "tokens_per_sec": 149788, "dt_s": 4.375, "eta_s": 32500, "world_size": 1, "timestamp": "2026-05-04T23:02:45.325073"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21240, "epoch": 0, "train_loss": 3.8882334381341934, "train_ppl": 48.8245586909555, "lr": 0.00056, "grad_norm": 0.6891, "tokens_per_sec": 148755, "dt_s": 4.406, "eta_s": 31698, "world_size": 1, "timestamp": "2026-05-04T23:02:49.730714"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21250, "epoch": 0, "train_loss": 3.852562576532364, "train_ppl": 47.11364098147238, "lr": 0.00056, "grad_norm": 0.6844, "tokens_per_sec": 145814, "dt_s": 4.494, "eta_s": 31697, "world_size": 1, "timestamp": "2026-05-04T23:02:54.225210"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21260, "epoch": 0, "train_loss": 3.8515454083681107, "train_ppl": 47.06574285012088, "lr": 0.00056, "grad_norm": 0.6556, "tokens_per_sec": 149724, "dt_s": 4.377, "eta_s": 31697, "world_size": 1, "timestamp": "2026-05-04T23:02:58.602335"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21270, "epoch": 0, "train_loss": 3.7792909145355225, "train_ppl": 43.78498343006618, "lr": 0.00056, "grad_norm": 0.6591, "tokens_per_sec": 147011, "dt_s": 4.458, "eta_s": 31575, "world_size": 1, "timestamp": "2026-05-04T23:03:03.060237"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21280, "epoch": 0, "train_loss": 3.9091638773679733, "train_ppl": 49.85724776769323, "lr": 0.00056, "grad_norm": 0.6284, "tokens_per_sec": 147766, "dt_s": 4.435, "eta_s": 31656, "world_size": 1, "timestamp": "2026-05-04T23:03:07.495417"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21290, "epoch": 0, "train_loss": 3.843727543950081, "train_ppl": 46.69922381860357, "lr": 0.00056, "grad_norm": 0.6404, "tokens_per_sec": 150415, "dt_s": 4.357, "eta_s": 31582, "world_size": 1, "timestamp": "2026-05-04T23:03:11.852386"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21300, "epoch": 0, "train_loss": 3.798666164278984, "train_ppl": 44.64160020331309, "lr": 0.00056, "grad_norm": 0.6884, "tokens_per_sec": 147526, "dt_s": 4.442, "eta_s": 31503, "world_size": 1, "timestamp": "2026-05-04T23:03:16.294712"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21310, "epoch": 0, "train_loss": 3.8503421545028687, "train_ppl": 47.00914487081331, "lr": 0.00056, "grad_norm": 0.752, "tokens_per_sec": 148765, "dt_s": 4.405, "eta_s": 31539, "world_size": 1, "timestamp": "2026-05-04T23:03:20.700047"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21320, "epoch": 0, "train_loss": 3.8106998950242996, "train_ppl": 45.18205049494657, "lr": 0.00056, "grad_norm": 0.6756, "tokens_per_sec": 149860, "dt_s": 4.373, "eta_s": 31414, "world_size": 1, "timestamp": "2026-05-04T23:03:25.073209"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21330, "epoch": 0, "train_loss": 3.9697809666395187, "train_ppl": 52.972926730722705, "lr": 0.00056, "grad_norm": 0.7077, "tokens_per_sec": 147075, "dt_s": 4.456, "eta_s": 31439, "world_size": 1, "timestamp": "2026-05-04T23:03:29.529158"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21340, "epoch": 0, "train_loss": 3.8691966980695724, "train_ppl": 47.903889333856945, "lr": 0.00056, "grad_norm": 0.7054, "tokens_per_sec": 149103, "dt_s": 4.395, "eta_s": 31489, "world_size": 1, "timestamp": "2026-05-04T23:03:33.924506"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21350, "epoch": 0, "train_loss": 3.858737364411354, "train_ppl": 47.405457746854076, "lr": 0.00056, "grad_norm": 0.6673, "tokens_per_sec": 151549, "dt_s": 4.324, "eta_s": 31317, "world_size": 1, "timestamp": "2026-05-04T23:03:38.248920"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21360, "epoch": 0, "train_loss": 3.922602742910385, "train_ppl": 50.53179503972283, "lr": 0.00056, "grad_norm": 0.7121, "tokens_per_sec": 145166, "dt_s": 4.515, "eta_s": 31468, "world_size": 1, "timestamp": "2026-05-04T23:03:42.763470"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21370, "epoch": 0, "train_loss": 3.892711579799652, "train_ppl": 49.0436922709304, "lr": 0.00056, "grad_norm": 0.684, "tokens_per_sec": 147093, "dt_s": 4.455, "eta_s": 31581, "world_size": 1, "timestamp": "2026-05-04T23:03:47.218876"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21380, "epoch": 0, "train_loss": 3.882983475923538, "train_ppl": 48.56890328069727, "lr": 0.00056, "grad_norm": 0.7042, "tokens_per_sec": 147786, "dt_s": 4.435, "eta_s": 31546, "world_size": 1, "timestamp": "2026-05-04T23:03:51.653403"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21390, "epoch": 0, "train_loss": 3.9023692905902863, "train_ppl": 49.51963663416774, "lr": 0.00056, "grad_norm": 0.7467, "tokens_per_sec": 146897, "dt_s": 4.461, "eta_s": 31636, "world_size": 1, "timestamp": "2026-05-04T23:03:56.114760"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21400, "epoch": 0, "train_loss": 3.9064409881830215, "train_ppl": 49.72167666326389, "lr": 0.00056, "grad_norm": 0.7544, "tokens_per_sec": 148617, "dt_s": 4.41, "eta_s": 31753, "world_size": 1, "timestamp": "2026-05-04T23:04:00.524473"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21410, "epoch": 0, "train_loss": 3.888448804616928, "train_ppl": 48.8350749968211, "lr": 0.00056, "grad_norm": 0.7586, "tokens_per_sec": 147237, "dt_s": 4.451, "eta_s": 31658, "world_size": 1, "timestamp": "2026-05-04T23:04:04.975518"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21420, "epoch": 0, "train_loss": 3.874169275164604, "train_ppl": 48.142688347919666, "lr": 0.00056, "grad_norm": 0.6473, "tokens_per_sec": 148240, "dt_s": 4.421, "eta_s": 31604, "world_size": 1, "timestamp": "2026-05-04T23:04:09.396473"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21430, "epoch": 0, "train_loss": 3.8342661261558533, "train_ppl": 46.25946659532469, "lr": 0.00056, "grad_norm": 0.6747, "tokens_per_sec": 152324, "dt_s": 4.302, "eta_s": 31412, "world_size": 1, "timestamp": "2026-05-04T23:04:13.698892"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21440, "epoch": 0, "train_loss": 3.756628632545471, "train_ppl": 42.80387485758312, "lr": 0.00056, "grad_norm": 0.6681, "tokens_per_sec": 147029, "dt_s": 4.457, "eta_s": 31402, "world_size": 1, "timestamp": "2026-05-04T23:04:18.156262"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21450, "epoch": 0, "train_loss": 3.8790977895259857, "train_ppl": 48.38054593964806, "lr": 0.00056, "grad_norm": 0.7052, "tokens_per_sec": 149051, "dt_s": 4.397, "eta_s": 31379, "world_size": 1, "timestamp": "2026-05-04T23:04:22.553121"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21460, "epoch": 0, "train_loss": 3.9575323909521103, "train_ppl": 52.32804135447884, "lr": 0.00056, "grad_norm": 0.6395, "tokens_per_sec": 150378, "dt_s": 4.358, "eta_s": 31242, "world_size": 1, "timestamp": "2026-05-04T23:04:26.911246"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21470, "epoch": 0, "train_loss": 3.8469984382390976, "train_ppl": 46.85222212733296, "lr": 0.00056, "grad_norm": 0.6272, "tokens_per_sec": 146999, "dt_s": 4.458, "eta_s": 31291, "world_size": 1, "timestamp": "2026-05-04T23:04:31.369470"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21480, "epoch": 0, "train_loss": 3.8548343032598495, "train_ppl": 47.22079196171045, "lr": 0.00056, "grad_norm": 0.719, "tokens_per_sec": 145903, "dt_s": 4.492, "eta_s": 31556, "world_size": 1, "timestamp": "2026-05-04T23:04:35.861221"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21490, "epoch": 0, "train_loss": 3.7874725610017776, "train_ppl": 44.1446861587818, "lr": 0.00056, "grad_norm": 0.7541, "tokens_per_sec": 130416, "dt_s": 5.025, "eta_s": 32360, "world_size": 1, "timestamp": "2026-05-04T23:04:40.886385"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21500, "epoch": 0, "train_loss": 3.704431653022766, "train_ppl": 40.626950549472504, "lr": 0.00056, "grad_norm": 0.7363, "tokens_per_sec": 146356, "dt_s": 4.478, "eta_s": 32471, "world_size": 1, "timestamp": "2026-05-04T23:04:45.364240"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21510, "epoch": 0, "train_loss": 3.9531924426555634, "train_ppl": 52.101432451685966, "lr": 0.00056, "grad_norm": 0.6882, "tokens_per_sec": 126407, "dt_s": 5.185, "eta_s": 32547, "world_size": 1, "timestamp": "2026-05-04T23:04:50.548768"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21520, "epoch": 0, "train_loss": 3.822036400437355, "train_ppl": 45.69717137349316, "lr": 0.00056, "grad_norm": 0.7367, "tokens_per_sec": 146431, "dt_s": 4.476, "eta_s": 32567, "world_size": 1, "timestamp": "2026-05-04T23:04:55.024330"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21530, "epoch": 0, "train_loss": 3.8699752390384674, "train_ppl": 47.941198995938464, "lr": 0.00056, "grad_norm": 0.7275, "tokens_per_sec": 145957, "dt_s": 4.49, "eta_s": 32560, "world_size": 1, "timestamp": "2026-05-04T23:04:59.514398"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21540, "epoch": 0, "train_loss": 3.794536828994751, "train_ppl": 44.45764014622436, "lr": 0.00056, "grad_norm": 0.723, "tokens_per_sec": 145278, "dt_s": 4.511, "eta_s": 31823, "world_size": 1, "timestamp": "2026-05-04T23:05:04.025455"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21550, "epoch": 0, "train_loss": 3.995890587568283, "train_ppl": 54.374244092673536, "lr": 0.00056, "grad_norm": 0.6455, "tokens_per_sec": 148091, "dt_s": 4.425, "eta_s": 31744, "world_size": 1, "timestamp": "2026-05-04T23:05:08.450817"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21560, "epoch": 0, "train_loss": 3.8825346678495407, "train_ppl": 48.54711005561319, "lr": 0.00056, "grad_norm": 0.7366, "tokens_per_sec": 148866, "dt_s": 4.402, "eta_s": 31722, "world_size": 1, "timestamp": "2026-05-04T23:05:12.853170"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21570, "epoch": 0, "train_loss": 3.8654345870018005, "train_ppl": 47.724008160243784, "lr": 0.00056, "grad_norm": 1.3281, "tokens_per_sec": 148618, "dt_s": 4.41, "eta_s": 31624, "world_size": 1, "timestamp": "2026-05-04T23:05:17.262878"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21580, "epoch": 0, "train_loss": 3.8366747349500656, "train_ppl": 46.371021845929235, "lr": 0.00056, "grad_norm": 0.7153, "tokens_per_sec": 147500, "dt_s": 4.443, "eta_s": 31553, "world_size": 1, "timestamp": "2026-05-04T23:05:21.706013"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21590, "epoch": 0, "train_loss": 3.8283381313085556, "train_ppl": 45.986051916786074, "lr": 0.00056, "grad_norm": 0.717, "tokens_per_sec": 148857, "dt_s": 4.403, "eta_s": 31394, "world_size": 1, "timestamp": "2026-05-04T23:05:26.108615"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21600, "epoch": 0, "train_loss": 3.7861877977848053, "train_ppl": 44.08800710715684, "lr": 0.00056, "grad_norm": 0.7786, "tokens_per_sec": 148169, "dt_s": 4.423, "eta_s": 31387, "world_size": 1, "timestamp": "2026-05-04T23:05:30.531672"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21610, "epoch": 0, "train_loss": 3.873696893453598, "train_ppl": 48.11995199296736, "lr": 0.00056, "grad_norm": 0.7147, "tokens_per_sec": 148694, "dt_s": 4.407, "eta_s": 31390, "world_size": 1, "timestamp": "2026-05-04T23:05:34.939125"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21620, "epoch": 0, "train_loss": 3.842931866645813, "train_ppl": 46.66208108485891, "lr": 0.00056, "grad_norm": 0.6564, "tokens_per_sec": 148963, "dt_s": 4.399, "eta_s": 31371, "world_size": 1, "timestamp": "2026-05-04T23:05:39.338615"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21630, "epoch": 0, "train_loss": 3.8974156975746155, "train_ppl": 49.274943064536, "lr": 0.00056, "grad_norm": 0.6951, "tokens_per_sec": 148764, "dt_s": 4.405, "eta_s": 31313, "world_size": 1, "timestamp": "2026-05-04T23:05:43.743988"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21640, "epoch": 0, "train_loss": 3.9682531505823135, "train_ppl": 52.892055636475455, "lr": 0.00056, "grad_norm": 0.6726, "tokens_per_sec": 147999, "dt_s": 4.428, "eta_s": 31345, "world_size": 1, "timestamp": "2026-05-04T23:05:48.172119"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21650, "epoch": 0, "train_loss": 3.948202207684517, "train_ppl": 51.842081710199324, "lr": 0.00056, "grad_norm": 0.6516, "tokens_per_sec": 149617, "dt_s": 4.38, "eta_s": 31279, "world_size": 1, "timestamp": "2026-05-04T23:05:52.552368"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21660, "epoch": 0, "train_loss": 3.8897277861833572, "train_ppl": 48.897574116628654, "lr": 0.00056, "grad_norm": 0.6792, "tokens_per_sec": 147104, "dt_s": 4.455, "eta_s": 31343, "world_size": 1, "timestamp": "2026-05-04T23:05:57.007426"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21670, "epoch": 0, "train_loss": 3.8665060251951218, "train_ppl": 47.77516888820972, "lr": 0.00056, "grad_norm": 0.6669, "tokens_per_sec": 148373, "dt_s": 4.417, "eta_s": 31363, "world_size": 1, "timestamp": "2026-05-04T23:06:01.424396"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21680, "epoch": 0, "train_loss": 3.7246697694063187, "train_ppl": 41.45753993603736, "lr": 0.00056, "grad_norm": 0.6882, "tokens_per_sec": 149788, "dt_s": 4.375, "eta_s": 31316, "world_size": 1, "timestamp": "2026-05-04T23:06:05.799641"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21690, "epoch": 0, "train_loss": 3.7551109939813614, "train_ppl": 42.7389633149949, "lr": 0.00056, "grad_norm": 0.7163, "tokens_per_sec": 148715, "dt_s": 4.407, "eta_s": 31281, "world_size": 1, "timestamp": "2026-05-04T23:06:10.206480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21700, "epoch": 0, "train_loss": 3.7958036810159683, "train_ppl": 44.51399708791756, "lr": 0.00056, "grad_norm": 0.7346, "tokens_per_sec": 148542, "dt_s": 4.412, "eta_s": 31322, "world_size": 1, "timestamp": "2026-05-04T23:06:14.618415"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21710, "epoch": 0, "train_loss": 3.8490846008062363, "train_ppl": 46.95006550243042, "lr": 0.00056, "grad_norm": 0.6589, "tokens_per_sec": 146926, "dt_s": 4.46, "eta_s": 31325, "world_size": 1, "timestamp": "2026-05-04T23:06:19.078925"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21720, "epoch": 0, "train_loss": 3.932429850101471, "train_ppl": 51.030824397614964, "lr": 0.00056, "grad_norm": 0.9131, "tokens_per_sec": 148748, "dt_s": 4.406, "eta_s": 31305, "world_size": 1, "timestamp": "2026-05-04T23:06:23.484754"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21730, "epoch": 0, "train_loss": 3.763342797756195, "train_ppl": 43.092234107589256, "lr": 0.00056, "grad_norm": 0.6294, "tokens_per_sec": 148704, "dt_s": 4.407, "eta_s": 31346, "world_size": 1, "timestamp": "2026-05-04T23:06:27.891898"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21740, "epoch": 0, "train_loss": 3.9756458401679993, "train_ppl": 53.28451907814061, "lr": 0.00056, "grad_norm": 0.7584, "tokens_per_sec": 146983, "dt_s": 4.459, "eta_s": 31415, "world_size": 1, "timestamp": "2026-05-04T23:06:32.350627"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21750, "epoch": 0, "train_loss": 3.868102788925171, "train_ppl": 47.85151548260235, "lr": 0.00056, "grad_norm": 0.9008, "tokens_per_sec": 148088, "dt_s": 4.425, "eta_s": 31430, "world_size": 1, "timestamp": "2026-05-04T23:06:36.776126"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21760, "epoch": 0, "train_loss": 3.812122330069542, "train_ppl": 45.24636475757299, "lr": 0.00056, "grad_norm": 0.6917, "tokens_per_sec": 148606, "dt_s": 4.41, "eta_s": 31354, "world_size": 1, "timestamp": "2026-05-04T23:06:41.186157"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21770, "epoch": 0, "train_loss": 3.8987279683351517, "train_ppl": 49.33964757718129, "lr": 0.00056, "grad_norm": 0.676, "tokens_per_sec": 148304, "dt_s": 4.419, "eta_s": 31368, "world_size": 1, "timestamp": "2026-05-04T23:06:45.605218"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21780, "epoch": 0, "train_loss": 3.816354885697365, "train_ppl": 45.438278369381656, "lr": 0.00056, "grad_norm": 0.6739, "tokens_per_sec": 149035, "dt_s": 4.397, "eta_s": 31350, "world_size": 1, "timestamp": "2026-05-04T23:06:50.002583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21790, "epoch": 0, "train_loss": 3.816569969058037, "train_ppl": 45.448052438078584, "lr": 0.00056, "grad_norm": 0.694, "tokens_per_sec": 131662, "dt_s": 4.978, "eta_s": 32081, "world_size": 1, "timestamp": "2026-05-04T23:06:54.980189"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21800, "epoch": 0, "train_loss": 3.8140024095773697, "train_ppl": 45.33151153703433, "lr": 0.00056, "grad_norm": 0.6429, "tokens_per_sec": 148864, "dt_s": 4.402, "eta_s": 32044, "world_size": 1, "timestamp": "2026-05-04T23:06:59.382551"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21810, "epoch": 0, "train_loss": 3.7693575620651245, "train_ppl": 43.35220478671814, "lr": 0.00056, "grad_norm": 0.6351, "tokens_per_sec": 147608, "dt_s": 4.44, "eta_s": 32081, "world_size": 1, "timestamp": "2026-05-04T23:07:03.822427"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21820, "epoch": 0, "train_loss": 3.8871671557426453, "train_ppl": 48.77252566962539, "lr": 0.00056, "grad_norm": 0.6862, "tokens_per_sec": 146570, "dt_s": 4.471, "eta_s": 32151, "world_size": 1, "timestamp": "2026-05-04T23:07:08.293737"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21830, "epoch": 0, "train_loss": 3.8706239461898804, "train_ppl": 47.97230888408693, "lr": 0.00056, "grad_norm": 0.6631, "tokens_per_sec": 149852, "dt_s": 4.373, "eta_s": 32113, "world_size": 1, "timestamp": "2026-05-04T23:07:12.667124"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21840, "epoch": 0, "train_loss": 3.965021938085556, "train_ppl": 52.721425984114404, "lr": 0.00056, "grad_norm": 1.093, "tokens_per_sec": 145847, "dt_s": 4.493, "eta_s": 31422, "world_size": 1, "timestamp": "2026-05-04T23:07:17.160596"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21850, "epoch": 0, "train_loss": 3.8019226789474487, "train_ppl": 44.787213195938016, "lr": 0.00056, "grad_norm": 0.6911, "tokens_per_sec": 146361, "dt_s": 4.478, "eta_s": 31524, "world_size": 1, "timestamp": "2026-05-04T23:07:21.638299"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21860, "epoch": 0, "train_loss": 3.7762923538684845, "train_ppl": 43.65388814775159, "lr": 0.00056, "grad_norm": 0.6969, "tokens_per_sec": 147819, "dt_s": 4.434, "eta_s": 31511, "world_size": 1, "timestamp": "2026-05-04T23:07:26.071829"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21870, "epoch": 0, "train_loss": 3.7666504979133606, "train_ppl": 43.235006290734766, "lr": 0.00056, "grad_norm": 0.6978, "tokens_per_sec": 146193, "dt_s": 4.483, "eta_s": 31523, "world_size": 1, "timestamp": "2026-05-04T23:07:30.554665"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21880, "epoch": 0, "train_loss": 3.8769951313734055, "train_ppl": 48.278925064722614, "lr": 0.00056, "grad_norm": 0.8144, "tokens_per_sec": 150801, "dt_s": 4.346, "eta_s": 31479, "world_size": 1, "timestamp": "2026-05-04T23:07:34.900550"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21890, "epoch": 0, "train_loss": 3.836813375353813, "train_ppl": 46.37745118879319, "lr": 0.00056, "grad_norm": 0.7281, "tokens_per_sec": 149536, "dt_s": 4.383, "eta_s": 31318, "world_size": 1, "timestamp": "2026-05-04T23:07:39.283154"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21900, "epoch": 0, "train_loss": 3.886334329843521, "train_ppl": 48.73192355667444, "lr": 0.00056, "grad_norm": 0.711, "tokens_per_sec": 145087, "dt_s": 4.517, "eta_s": 31369, "world_size": 1, "timestamp": "2026-05-04T23:07:43.800212"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21910, "epoch": 0, "train_loss": 3.851325884461403, "train_ppl": 47.055411928361956, "lr": 0.00056, "grad_norm": 0.6723, "tokens_per_sec": 150340, "dt_s": 4.359, "eta_s": 31259, "world_size": 1, "timestamp": "2026-05-04T23:07:48.159384"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21920, "epoch": 0, "train_loss": 3.7730388790369034, "train_ppl": 43.51209211135709, "lr": 0.00056, "grad_norm": 0.7234, "tokens_per_sec": 151254, "dt_s": 4.333, "eta_s": 31043, "world_size": 1, "timestamp": "2026-05-04T23:07:52.492234"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21930, "epoch": 0, "train_loss": 3.8872864693403244, "train_ppl": 48.7783452423011, "lr": 0.00056, "grad_norm": 0.7112, "tokens_per_sec": 148366, "dt_s": 4.417, "eta_s": 31139, "world_size": 1, "timestamp": "2026-05-04T23:07:56.909422"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21940, "epoch": 0, "train_loss": 3.966595619916916, "train_ppl": 52.80445805020033, "lr": 0.00056, "grad_norm": 0.788, "tokens_per_sec": 148082, "dt_s": 4.426, "eta_s": 31196, "world_size": 1, "timestamp": "2026-05-04T23:08:01.335089"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21950, "epoch": 0, "train_loss": 3.8463467061519623, "train_ppl": 46.82169697900878, "lr": 0.00056, "grad_norm": 0.6905, "tokens_per_sec": 149302, "dt_s": 4.39, "eta_s": 31011, "world_size": 1, "timestamp": "2026-05-04T23:08:05.724591"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21960, "epoch": 0, "train_loss": 3.828629821538925, "train_ppl": 45.999467555373705, "lr": 0.00056, "grad_norm": 0.793, "tokens_per_sec": 148542, "dt_s": 4.412, "eta_s": 31081, "world_size": 1, "timestamp": "2026-05-04T23:08:10.136541"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21970, "epoch": 0, "train_loss": 3.7462832033634186, "train_ppl": 42.36333312734371, "lr": 0.00056, "grad_norm": 0.6605, "tokens_per_sec": 151272, "dt_s": 4.332, "eta_s": 31076, "world_size": 1, "timestamp": "2026-05-04T23:08:14.468863"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21980, "epoch": 0, "train_loss": 3.6807666569948196, "train_ppl": 39.67680091222653, "lr": 0.00056, "grad_norm": 0.6805, "tokens_per_sec": 148157, "dt_s": 4.423, "eta_s": 31080, "world_size": 1, "timestamp": "2026-05-04T23:08:18.892284"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 21990, "epoch": 0, "train_loss": 3.9123119115829468, "train_ppl": 50.01444739461016, "lr": 0.00056, "grad_norm": 0.7034, "tokens_per_sec": 151871, "dt_s": 4.315, "eta_s": 30920, "world_size": 1, "timestamp": "2026-05-04T23:08:23.207495"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22000, "epoch": 0, "train_loss": 3.8835301101207733, "train_ppl": 48.59545996188647, "lr": 0.00056, "grad_norm": 0.7086, "tokens_per_sec": 151971, "dt_s": 4.312, "eta_s": 30807, "world_size": 1, "timestamp": "2026-05-04T23:08:27.519893"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22010, "epoch": 0, "train_loss": 3.7772320955991745, "train_ppl": 43.694930809881626, "lr": 0.00056, "grad_norm": 0.6534, "tokens_per_sec": 125662, "dt_s": 5.215, "eta_s": 30849, "world_size": 1, "timestamp": "2026-05-04T23:08:32.735167"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22020, "epoch": 0, "train_loss": 3.9313692301511765, "train_ppl": 50.976728779695755, "lr": 0.00056, "grad_norm": 0.664, "tokens_per_sec": 148773, "dt_s": 4.405, "eta_s": 30948, "world_size": 1, "timestamp": "2026-05-04T23:08:37.140287"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22030, "epoch": 0, "train_loss": 3.815975546836853, "train_ppl": 45.42104513346609, "lr": 0.00056, "grad_norm": 0.7232, "tokens_per_sec": 149876, "dt_s": 4.373, "eta_s": 30872, "world_size": 1, "timestamp": "2026-05-04T23:08:41.512935"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22040, "epoch": 0, "train_loss": 3.8172623068094254, "train_ppl": 45.4795287353657, "lr": 0.00056, "grad_norm": 0.6639, "tokens_per_sec": 147802, "dt_s": 4.434, "eta_s": 31035, "world_size": 1, "timestamp": "2026-05-04T23:08:45.946970"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22050, "epoch": 0, "train_loss": 4.001337304711342, "train_ppl": 54.67121323941034, "lr": 0.00056, "grad_norm": 0.6992, "tokens_per_sec": 150319, "dt_s": 4.36, "eta_s": 31098, "world_size": 1, "timestamp": "2026-05-04T23:08:50.306762"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22060, "epoch": 0, "train_loss": 3.753176838159561, "train_ppl": 42.65637939111533, "lr": 0.00056, "grad_norm": 0.6581, "tokens_per_sec": 149117, "dt_s": 4.395, "eta_s": 31022, "world_size": 1, "timestamp": "2026-05-04T23:08:54.701692"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22070, "epoch": 0, "train_loss": 3.789137601852417, "train_ppl": 44.21825009105553, "lr": 0.00056, "grad_norm": 0.6576, "tokens_per_sec": 147859, "dt_s": 4.432, "eta_s": 31056, "world_size": 1, "timestamp": "2026-05-04T23:08:59.134043"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22080, "epoch": 0, "train_loss": 3.9089150577783585, "train_ppl": 49.84484385099707, "lr": 0.00056, "grad_norm": 0.6937, "tokens_per_sec": 134139, "dt_s": 4.886, "eta_s": 31776, "world_size": 1, "timestamp": "2026-05-04T23:09:04.019698"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22090, "epoch": 0, "train_loss": 3.9060905128717422, "train_ppl": 49.704253496531244, "lr": 0.00056, "grad_norm": 0.658, "tokens_per_sec": 147477, "dt_s": 4.444, "eta_s": 31786, "world_size": 1, "timestamp": "2026-05-04T23:09:08.463504"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22100, "epoch": 0, "train_loss": 3.826330006122589, "train_ppl": 45.893798826607146, "lr": 0.00056, "grad_norm": 0.6632, "tokens_per_sec": 150085, "dt_s": 4.367, "eta_s": 31791, "world_size": 1, "timestamp": "2026-05-04T23:09:12.830081"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22110, "epoch": 0, "train_loss": 3.726368024945259, "train_ppl": 41.52800524998385, "lr": 0.00056, "grad_norm": 0.6693, "tokens_per_sec": 150189, "dt_s": 4.364, "eta_s": 31742, "world_size": 1, "timestamp": "2026-05-04T23:09:17.193704"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22120, "epoch": 0, "train_loss": 3.7706324458122253, "train_ppl": 43.40750905376668, "lr": 0.00056, "grad_norm": 0.7036, "tokens_per_sec": 148367, "dt_s": 4.417, "eta_s": 31716, "world_size": 1, "timestamp": "2026-05-04T23:09:21.610850"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22130, "epoch": 0, "train_loss": 3.7408080101013184, "train_ppl": 42.13201951247225, "lr": 0.00056, "grad_norm": 0.6863, "tokens_per_sec": 150392, "dt_s": 4.358, "eta_s": 30967, "world_size": 1, "timestamp": "2026-05-04T23:09:25.968521"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22140, "epoch": 0, "train_loss": 3.863314688205719, "train_ppl": 47.622945252217036, "lr": 0.00056, "grad_norm": 0.7165, "tokens_per_sec": 149934, "dt_s": 4.371, "eta_s": 30859, "world_size": 1, "timestamp": "2026-05-04T23:09:30.339501"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22150, "epoch": 0, "train_loss": 3.8417493253946304, "train_ppl": 46.606933862469226, "lr": 0.00056, "grad_norm": 0.6563, "tokens_per_sec": 149212, "dt_s": 4.392, "eta_s": 30891, "world_size": 1, "timestamp": "2026-05-04T23:09:34.731669"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22160, "epoch": 0, "train_loss": 3.8507852256298065, "train_ppl": 47.02997788051933, "lr": 0.00056, "grad_norm": 0.7958, "tokens_per_sec": 150695, "dt_s": 4.349, "eta_s": 30866, "world_size": 1, "timestamp": "2026-05-04T23:09:39.080598"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22170, "epoch": 0, "train_loss": 3.967169314622879, "train_ppl": 52.83476037954684, "lr": 0.00056, "grad_norm": 0.7435, "tokens_per_sec": 148685, "dt_s": 4.408, "eta_s": 30848, "world_size": 1, "timestamp": "2026-05-04T23:09:43.488279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22180, "epoch": 0, "train_loss": 3.86302088201046, "train_ppl": 47.60895539112001, "lr": 0.00056, "grad_norm": 0.7134, "tokens_per_sec": 150332, "dt_s": 4.359, "eta_s": 30847, "world_size": 1, "timestamp": "2026-05-04T23:09:47.847687"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22190, "epoch": 0, "train_loss": 3.842569127678871, "train_ppl": 46.64515799928768, "lr": 0.00056, "grad_norm": 0.6796, "tokens_per_sec": 150757, "dt_s": 4.347, "eta_s": 30809, "world_size": 1, "timestamp": "2026-05-04T23:09:52.194818"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22200, "epoch": 0, "train_loss": 3.6904586404561996, "train_ppl": 40.063217356545785, "lr": 0.00056, "grad_norm": 0.6704, "tokens_per_sec": 149381, "dt_s": 4.387, "eta_s": 30797, "world_size": 1, "timestamp": "2026-05-04T23:09:56.581974"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22210, "epoch": 0, "train_loss": 3.917008027434349, "train_ppl": 50.249873395041476, "lr": 0.00056, "grad_norm": 0.689, "tokens_per_sec": 150215, "dt_s": 4.363, "eta_s": 30812, "world_size": 1, "timestamp": "2026-05-04T23:10:00.944808"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22220, "epoch": 0, "train_loss": 3.8144408613443375, "train_ppl": 45.35139157626775, "lr": 0.00056, "grad_norm": 0.6725, "tokens_per_sec": 151477, "dt_s": 4.326, "eta_s": 30693, "world_size": 1, "timestamp": "2026-05-04T23:10:05.271280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22230, "epoch": 0, "train_loss": 3.902720555663109, "train_ppl": 49.53703420833738, "lr": 0.00056, "grad_norm": 0.7672, "tokens_per_sec": 145398, "dt_s": 4.507, "eta_s": 30897, "world_size": 1, "timestamp": "2026-05-04T23:10:09.778622"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22240, "epoch": 0, "train_loss": 3.8166278153657913, "train_ppl": 45.45068151614726, "lr": 0.00056, "grad_norm": 0.6979, "tokens_per_sec": 149724, "dt_s": 4.377, "eta_s": 30935, "world_size": 1, "timestamp": "2026-05-04T23:10:14.155749"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22250, "epoch": 0, "train_loss": 3.9293815791606903, "train_ppl": 50.87550546588075, "lr": 0.00056, "grad_norm": 0.7686, "tokens_per_sec": 150319, "dt_s": 4.36, "eta_s": 30892, "world_size": 1, "timestamp": "2026-05-04T23:10:18.515596"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22260, "epoch": 0, "train_loss": 3.9963658303022385, "train_ppl": 54.40009119843102, "lr": 0.00056, "grad_norm": 0.6995, "tokens_per_sec": 149661, "dt_s": 4.379, "eta_s": 30910, "world_size": 1, "timestamp": "2026-05-04T23:10:22.894530"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22270, "epoch": 0, "train_loss": 3.841364562511444, "train_ppl": 46.58900469368036, "lr": 0.00056, "grad_norm": 0.6608, "tokens_per_sec": 151665, "dt_s": 4.321, "eta_s": 30899, "world_size": 1, "timestamp": "2026-05-04T23:10:27.215619"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22280, "epoch": 0, "train_loss": 3.8512776494026184, "train_ppl": 47.05314226254063, "lr": 0.00056, "grad_norm": 0.6741, "tokens_per_sec": 149482, "dt_s": 4.384, "eta_s": 30721, "world_size": 1, "timestamp": "2026-05-04T23:10:31.599838"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22290, "epoch": 0, "train_loss": 3.8102164268493652, "train_ppl": 45.1602116910627, "lr": 0.00056, "grad_norm": 0.674, "tokens_per_sec": 151096, "dt_s": 4.337, "eta_s": 30660, "world_size": 1, "timestamp": "2026-05-04T23:10:35.937210"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22300, "epoch": 0, "train_loss": 3.8457994014024734, "train_ppl": 46.7960782531377, "lr": 0.00056, "grad_norm": 0.7514, "tokens_per_sec": 150677, "dt_s": 4.349, "eta_s": 30642, "world_size": 1, "timestamp": "2026-05-04T23:10:40.286637"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22310, "epoch": 0, "train_loss": 3.87824609875679, "train_ppl": 48.33935821735378, "lr": 0.00056, "grad_norm": 0.6993, "tokens_per_sec": 146492, "dt_s": 4.474, "eta_s": 30771, "world_size": 1, "timestamp": "2026-05-04T23:10:44.760341"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22320, "epoch": 0, "train_loss": 3.8240807056427, "train_ppl": 45.790685892342516, "lr": 0.00056, "grad_norm": 0.7039, "tokens_per_sec": 151085, "dt_s": 4.338, "eta_s": 30790, "world_size": 1, "timestamp": "2026-05-04T23:10:49.098030"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22330, "epoch": 0, "train_loss": 3.778142735362053, "train_ppl": 43.73473927414757, "lr": 0.00056, "grad_norm": 0.666, "tokens_per_sec": 150995, "dt_s": 4.34, "eta_s": 30724, "world_size": 1, "timestamp": "2026-05-04T23:10:53.438309"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22340, "epoch": 0, "train_loss": 3.863637387752533, "train_ppl": 47.63831563494226, "lr": 0.00056, "grad_norm": 0.7228, "tokens_per_sec": 148790, "dt_s": 4.405, "eta_s": 30814, "world_size": 1, "timestamp": "2026-05-04T23:10:57.842888"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22350, "epoch": 0, "train_loss": 3.8790165930986404, "train_ppl": 48.3766177716435, "lr": 0.00056, "grad_norm": 0.8122, "tokens_per_sec": 148994, "dt_s": 4.399, "eta_s": 30879, "world_size": 1, "timestamp": "2026-05-04T23:11:02.241454"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22360, "epoch": 0, "train_loss": 3.8131267726421356, "train_ppl": 45.29183496487367, "lr": 0.00056, "grad_norm": 0.6545, "tokens_per_sec": 151055, "dt_s": 4.339, "eta_s": 30684, "world_size": 1, "timestamp": "2026-05-04T23:11:06.579992"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22370, "epoch": 0, "train_loss": 3.8177473545074463, "train_ppl": 45.5015938269642, "lr": 0.00056, "grad_norm": 0.7378, "tokens_per_sec": 149990, "dt_s": 4.369, "eta_s": 30724, "world_size": 1, "timestamp": "2026-05-04T23:11:10.949384"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22380, "epoch": 0, "train_loss": 3.8209919035434723, "train_ppl": 45.64946573846526, "lr": 0.00056, "grad_norm": 0.7171, "tokens_per_sec": 134467, "dt_s": 4.874, "eta_s": 31470, "world_size": 1, "timestamp": "2026-05-04T23:11:15.823134"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22390, "epoch": 0, "train_loss": 3.797018140554428, "train_ppl": 44.56809037668264, "lr": 0.00056, "grad_norm": 0.6524, "tokens_per_sec": 147050, "dt_s": 4.457, "eta_s": 31539, "world_size": 1, "timestamp": "2026-05-04T23:11:20.279849"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22400, "epoch": 0, "train_loss": 3.8145577907562256, "train_ppl": 45.35669479785832, "lr": 0.00056, "grad_norm": 0.673, "tokens_per_sec": 152248, "dt_s": 4.305, "eta_s": 31402, "world_size": 1, "timestamp": "2026-05-04T23:11:24.584410"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22410, "epoch": 0, "train_loss": 3.9166113883256912, "train_ppl": 50.22994628224528, "lr": 0.00056, "grad_norm": 0.6476, "tokens_per_sec": 150532, "dt_s": 4.354, "eta_s": 31419, "world_size": 1, "timestamp": "2026-05-04T23:11:28.938027"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22420, "epoch": 0, "train_loss": 3.872860699892044, "train_ppl": 48.07973121744891, "lr": 0.00056, "grad_norm": 0.6733, "tokens_per_sec": 149554, "dt_s": 4.382, "eta_s": 31432, "world_size": 1, "timestamp": "2026-05-04T23:11:33.320108"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22430, "epoch": 0, "train_loss": 3.7795926332473755, "train_ppl": 43.79819617203071, "lr": 0.00056, "grad_norm": 0.7136, "tokens_per_sec": 150183, "dt_s": 4.364, "eta_s": 30711, "world_size": 1, "timestamp": "2026-05-04T23:11:37.683866"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22440, "epoch": 0, "train_loss": 3.8623715937137604, "train_ppl": 47.578053486774515, "lr": 0.00056, "grad_norm": 0.6746, "tokens_per_sec": 149741, "dt_s": 4.377, "eta_s": 30594, "world_size": 1, "timestamp": "2026-05-04T23:11:42.060495"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22450, "epoch": 0, "train_loss": 3.7591789215803146, "train_ppl": 42.91317642633616, "lr": 0.00056, "grad_norm": 0.7725, "tokens_per_sec": 148847, "dt_s": 4.403, "eta_s": 30728, "world_size": 1, "timestamp": "2026-05-04T23:11:46.463394"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22460, "epoch": 0, "train_loss": 3.7752080112695694, "train_ppl": 43.60657803204721, "lr": 0.00056, "grad_norm": 0.6644, "tokens_per_sec": 150415, "dt_s": 4.357, "eta_s": 30728, "world_size": 1, "timestamp": "2026-05-04T23:11:50.820389"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22470, "epoch": 0, "train_loss": 3.771758571267128, "train_ppl": 43.456418888782366, "lr": 0.00056, "grad_norm": 0.6902, "tokens_per_sec": 149144, "dt_s": 4.394, "eta_s": 30741, "world_size": 1, "timestamp": "2026-05-04T23:11:55.214541"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22480, "epoch": 0, "train_loss": 3.8085729777812958, "train_ppl": 45.08605413701791, "lr": 0.00056, "grad_norm": 0.7971, "tokens_per_sec": 151054, "dt_s": 4.339, "eta_s": 30701, "world_size": 1, "timestamp": "2026-05-04T23:11:59.553111"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22490, "epoch": 0, "train_loss": 3.812568500638008, "train_ppl": 45.26655685808342, "lr": 0.00056, "grad_norm": 0.7011, "tokens_per_sec": 150698, "dt_s": 4.349, "eta_s": 30658, "world_size": 1, "timestamp": "2026-05-04T23:12:03.901936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22500, "epoch": 0, "train_loss": 3.344360202550888, "train_ppl": 28.34243644601899, "lr": 0.00056, "grad_norm": 2.0504, "tokens_per_sec": 148849, "dt_s": 4.403, "eta_s": 30653, "world_size": 1, "timestamp": "2026-05-04T23:12:08.304787"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22510, "epoch": 0, "train_loss": 3.7385562360286713, "train_ppl": 42.0372544580941, "lr": 0.00056, "grad_norm": 0.6895, "tokens_per_sec": 128041, "dt_s": 5.118, "eta_s": 30624, "world_size": 1, "timestamp": "2026-05-04T23:12:13.423144"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22520, "epoch": 0, "train_loss": 3.9335216879844666, "train_ppl": 51.08657221314309, "lr": 0.00056, "grad_norm": 0.7395, "tokens_per_sec": 150336, "dt_s": 4.359, "eta_s": 30570, "world_size": 1, "timestamp": "2026-05-04T23:12:17.782453"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22530, "epoch": 0, "train_loss": 3.790401354432106, "train_ppl": 44.27416634338965, "lr": 0.00056, "grad_norm": 0.7449, "tokens_per_sec": 145844, "dt_s": 4.494, "eta_s": 30783, "world_size": 1, "timestamp": "2026-05-04T23:12:22.276033"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22540, "epoch": 0, "train_loss": 3.7999503165483475, "train_ppl": 44.69896363933254, "lr": 0.00056, "grad_norm": 0.6682, "tokens_per_sec": 150181, "dt_s": 4.364, "eta_s": 30800, "world_size": 1, "timestamp": "2026-05-04T23:12:26.639825"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22550, "epoch": 0, "train_loss": 3.8692727833986282, "train_ppl": 47.90753425570074, "lr": 0.00056, "grad_norm": 0.7293, "tokens_per_sec": 149259, "dt_s": 4.391, "eta_s": 30779, "world_size": 1, "timestamp": "2026-05-04T23:12:31.030585"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22560, "epoch": 0, "train_loss": 3.7907990515232086, "train_ppl": 44.29177755228657, "lr": 0.00056, "grad_norm": 0.9038, "tokens_per_sec": 148717, "dt_s": 4.407, "eta_s": 30870, "world_size": 1, "timestamp": "2026-05-04T23:12:35.437353"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22570, "epoch": 0, "train_loss": 3.830810159444809, "train_ppl": 46.0998713554652, "lr": 0.00056, "grad_norm": 0.6915, "tokens_per_sec": 150394, "dt_s": 4.358, "eta_s": 30863, "world_size": 1, "timestamp": "2026-05-04T23:12:39.794946"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22580, "epoch": 0, "train_loss": 3.7364724576473236, "train_ppl": 41.94974933834473, "lr": 0.00056, "grad_norm": 0.6476, "tokens_per_sec": 146643, "dt_s": 4.469, "eta_s": 30824, "world_size": 1, "timestamp": "2026-05-04T23:12:44.264047"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22590, "epoch": 0, "train_loss": 3.865510016679764, "train_ppl": 47.727608102579964, "lr": 0.00056, "grad_norm": 0.691, "tokens_per_sec": 149367, "dt_s": 4.388, "eta_s": 30853, "world_size": 1, "timestamp": "2026-05-04T23:12:48.651637"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22600, "epoch": 0, "train_loss": 3.887059211730957, "train_ppl": 48.76726125168074, "lr": 0.00056, "grad_norm": 0.7213, "tokens_per_sec": 151704, "dt_s": 4.32, "eta_s": 30750, "world_size": 1, "timestamp": "2026-05-04T23:12:52.971631"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22610, "epoch": 0, "train_loss": 3.811966449022293, "train_ppl": 45.23931225654028, "lr": 0.00056, "grad_norm": 0.7214, "tokens_per_sec": 146418, "dt_s": 4.476, "eta_s": 30842, "world_size": 1, "timestamp": "2026-05-04T23:12:57.447589"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22620, "epoch": 0, "train_loss": 3.7339770644903183, "train_ppl": 41.84519872260621, "lr": 0.00056, "grad_norm": 0.6533, "tokens_per_sec": 150944, "dt_s": 4.342, "eta_s": 30816, "world_size": 1, "timestamp": "2026-05-04T23:13:01.789338"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22630, "epoch": 0, "train_loss": 3.768982544541359, "train_ppl": 43.335949998334996, "lr": 0.00056, "grad_norm": 0.6967, "tokens_per_sec": 149837, "dt_s": 4.374, "eta_s": 30678, "world_size": 1, "timestamp": "2026-05-04T23:13:06.163141"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22640, "epoch": 0, "train_loss": 3.9317071586847305, "train_ppl": 50.99395818188701, "lr": 0.00056, "grad_norm": 0.7655, "tokens_per_sec": 145596, "dt_s": 4.501, "eta_s": 30832, "world_size": 1, "timestamp": "2026-05-04T23:13:10.664367"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22650, "epoch": 0, "train_loss": 3.8402698189020157, "train_ppl": 46.53802958595002, "lr": 0.00056, "grad_norm": 0.6767, "tokens_per_sec": 151116, "dt_s": 4.337, "eta_s": 30852, "world_size": 1, "timestamp": "2026-05-04T23:13:15.001211"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22660, "epoch": 0, "train_loss": 3.8427650034427643, "train_ppl": 46.654295550125866, "lr": 0.00056, "grad_norm": 0.6728, "tokens_per_sec": 152071, "dt_s": 4.31, "eta_s": 30614, "world_size": 1, "timestamp": "2026-05-04T23:13:19.310744"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22670, "epoch": 0, "train_loss": 3.8710893392562866, "train_ppl": 47.99464006000524, "lr": 0.00056, "grad_norm": 0.6757, "tokens_per_sec": 131731, "dt_s": 4.975, "eta_s": 31496, "world_size": 1, "timestamp": "2026-05-04T23:13:24.285747"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22680, "epoch": 0, "train_loss": 3.784693405032158, "train_ppl": 44.02217151322651, "lr": 0.00056, "grad_norm": 0.7384, "tokens_per_sec": 150092, "dt_s": 4.366, "eta_s": 31482, "world_size": 1, "timestamp": "2026-05-04T23:13:28.652118"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22690, "epoch": 0, "train_loss": 3.8274912536144257, "train_ppl": 45.947123841162714, "lr": 0.00056, "grad_norm": 0.7251, "tokens_per_sec": 146207, "dt_s": 4.482, "eta_s": 31451, "world_size": 1, "timestamp": "2026-05-04T23:13:33.134549"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22700, "epoch": 0, "train_loss": 3.8284576386213303, "train_ppl": 45.991547914675195, "lr": 0.00056, "grad_norm": 0.6521, "tokens_per_sec": 151856, "dt_s": 4.316, "eta_s": 31417, "world_size": 1, "timestamp": "2026-05-04T23:13:37.450223"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22710, "epoch": 0, "train_loss": 3.8112953454256058, "train_ppl": 45.20896217653627, "lr": 0.00056, "grad_norm": 0.7102, "tokens_per_sec": 151474, "dt_s": 4.327, "eta_s": 31436, "world_size": 1, "timestamp": "2026-05-04T23:13:41.776745"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22720, "epoch": 0, "train_loss": 3.976942092180252, "train_ppl": 53.35363402873682, "lr": 0.00056, "grad_norm": 0.7014, "tokens_per_sec": 147902, "dt_s": 4.431, "eta_s": 30670, "world_size": 1, "timestamp": "2026-05-04T23:13:46.207782"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22730, "epoch": 0, "train_loss": 3.808062583208084, "train_ppl": 45.0630483311775, "lr": 0.00056, "grad_norm": 0.6709, "tokens_per_sec": 151011, "dt_s": 4.34, "eta_s": 30629, "world_size": 1, "timestamp": "2026-05-04T23:13:50.547583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22740, "epoch": 0, "train_loss": 3.9565613865852356, "train_ppl": 52.2772552585644, "lr": 0.00056, "grad_norm": 0.6885, "tokens_per_sec": 151281, "dt_s": 4.332, "eta_s": 30414, "world_size": 1, "timestamp": "2026-05-04T23:13:54.879673"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22750, "epoch": 0, "train_loss": 3.969729021191597, "train_ppl": 52.97017509978392, "lr": 0.00056, "grad_norm": 0.8174, "tokens_per_sec": 149765, "dt_s": 4.376, "eta_s": 30494, "world_size": 1, "timestamp": "2026-05-04T23:13:59.255583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22760, "epoch": 0, "train_loss": 3.8751248866319656, "train_ppl": 48.18871604176789, "lr": 0.00056, "grad_norm": 0.6898, "tokens_per_sec": 150978, "dt_s": 4.341, "eta_s": 30509, "world_size": 1, "timestamp": "2026-05-04T23:14:03.596363"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22770, "epoch": 0, "train_loss": 3.9837068766355515, "train_ppl": 53.71578341304595, "lr": 0.00056, "grad_norm": 0.829, "tokens_per_sec": 151173, "dt_s": 4.335, "eta_s": 30371, "world_size": 1, "timestamp": "2026-05-04T23:14:07.931520"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22780, "epoch": 0, "train_loss": 3.775862619280815, "train_ppl": 43.63513259237231, "lr": 0.00056, "grad_norm": 0.6902, "tokens_per_sec": 148844, "dt_s": 4.403, "eta_s": 30455, "world_size": 1, "timestamp": "2026-05-04T23:14:12.334521"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22790, "epoch": 0, "train_loss": 3.800622671842575, "train_ppl": 44.7290273297901, "lr": 0.00056, "grad_norm": 0.6777, "tokens_per_sec": 152921, "dt_s": 4.286, "eta_s": 30386, "world_size": 1, "timestamp": "2026-05-04T23:14:16.620156"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22800, "epoch": 0, "train_loss": 3.897341027855873, "train_ppl": 49.27126385576077, "lr": 0.00056, "grad_norm": 0.8502, "tokens_per_sec": 151063, "dt_s": 4.338, "eta_s": 30329, "world_size": 1, "timestamp": "2026-05-04T23:14:20.958465"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22810, "epoch": 0, "train_loss": 3.795531302690506, "train_ppl": 44.50187409102686, "lr": 0.00056, "grad_norm": 0.668, "tokens_per_sec": 150627, "dt_s": 4.351, "eta_s": 30339, "world_size": 1, "timestamp": "2026-05-04T23:14:25.309355"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22820, "epoch": 0, "train_loss": 3.781025469303131, "train_ppl": 43.86099678743185, "lr": 0.00056, "grad_norm": 0.7147, "tokens_per_sec": 151925, "dt_s": 4.314, "eta_s": 30304, "world_size": 1, "timestamp": "2026-05-04T23:14:29.623057"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22830, "epoch": 0, "train_loss": 3.942739799618721, "train_ppl": 51.559671128050034, "lr": 0.00056, "grad_norm": 0.6978, "tokens_per_sec": 149284, "dt_s": 4.39, "eta_s": 30282, "world_size": 1, "timestamp": "2026-05-04T23:14:34.013075"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22840, "epoch": 0, "train_loss": 3.9586770683526993, "train_ppl": 52.38797437627808, "lr": 0.00056, "grad_norm": 0.7133, "tokens_per_sec": 150064, "dt_s": 4.367, "eta_s": 30391, "world_size": 1, "timestamp": "2026-05-04T23:14:38.380267"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22850, "epoch": 0, "train_loss": 3.7689600437879562, "train_ppl": 43.33497491778068, "lr": 0.00056, "grad_norm": 0.6807, "tokens_per_sec": 152009, "dt_s": 4.311, "eta_s": 30349, "world_size": 1, "timestamp": "2026-05-04T23:14:42.691602"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22860, "epoch": 0, "train_loss": 3.9231439530849457, "train_ppl": 50.55915076326737, "lr": 0.00056, "grad_norm": 0.7786, "tokens_per_sec": 149675, "dt_s": 4.379, "eta_s": 30384, "world_size": 1, "timestamp": "2026-05-04T23:14:47.070158"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22870, "epoch": 0, "train_loss": 3.800243005156517, "train_ppl": 44.71204843157816, "lr": 0.00056, "grad_norm": 0.6774, "tokens_per_sec": 151078, "dt_s": 4.338, "eta_s": 30413, "world_size": 1, "timestamp": "2026-05-04T23:14:51.408061"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22880, "epoch": 0, "train_loss": 3.854113608598709, "train_ppl": 47.18677244936176, "lr": 0.00056, "grad_norm": 0.6849, "tokens_per_sec": 151609, "dt_s": 4.323, "eta_s": 30315, "world_size": 1, "timestamp": "2026-05-04T23:14:55.730749"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22890, "epoch": 0, "train_loss": 3.766241744160652, "train_ppl": 43.21733743101743, "lr": 0.00056, "grad_norm": 0.7184, "tokens_per_sec": 149935, "dt_s": 4.371, "eta_s": 30316, "world_size": 1, "timestamp": "2026-05-04T23:15:00.101699"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22900, "epoch": 0, "train_loss": 3.7912821918725967, "train_ppl": 44.31318186739598, "lr": 0.00056, "grad_norm": 0.6313, "tokens_per_sec": 151152, "dt_s": 4.336, "eta_s": 30345, "world_size": 1, "timestamp": "2026-05-04T23:15:04.437456"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22910, "epoch": 0, "train_loss": 3.93090283870697, "train_ppl": 50.95295921293143, "lr": 0.00056, "grad_norm": 0.7313, "tokens_per_sec": 149130, "dt_s": 4.395, "eta_s": 30363, "world_size": 1, "timestamp": "2026-05-04T23:15:08.832037"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22920, "epoch": 0, "train_loss": 3.7937325686216354, "train_ppl": 44.42189900249186, "lr": 0.00056, "grad_norm": 0.6487, "tokens_per_sec": 148535, "dt_s": 4.412, "eta_s": 30463, "world_size": 1, "timestamp": "2026-05-04T23:15:13.244195"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22930, "epoch": 0, "train_loss": 3.8148861825466156, "train_ppl": 45.37159200999637, "lr": 0.00056, "grad_norm": 0.7123, "tokens_per_sec": 151394, "dt_s": 4.329, "eta_s": 30467, "world_size": 1, "timestamp": "2026-05-04T23:15:17.573031"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22940, "epoch": 0, "train_loss": 3.865713119506836, "train_ppl": 47.73730269918165, "lr": 0.00056, "grad_norm": 0.6607, "tokens_per_sec": 148945, "dt_s": 4.4, "eta_s": 30503, "world_size": 1, "timestamp": "2026-05-04T23:15:21.973051"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22950, "epoch": 0, "train_loss": 3.847637414932251, "train_ppl": 46.88216917200763, "lr": 0.00056, "grad_norm": 0.7642, "tokens_per_sec": 150672, "dt_s": 4.35, "eta_s": 30518, "world_size": 1, "timestamp": "2026-05-04T23:15:26.322673"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22960, "epoch": 0, "train_loss": 3.889607325196266, "train_ppl": 48.89168422134263, "lr": 0.00056, "grad_norm": 0.7235, "tokens_per_sec": 150256, "dt_s": 4.362, "eta_s": 30468, "world_size": 1, "timestamp": "2026-05-04T23:15:30.684273"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22970, "epoch": 0, "train_loss": 3.814982607960701, "train_ppl": 45.37596719547979, "lr": 0.00056, "grad_norm": 0.7344, "tokens_per_sec": 134980, "dt_s": 4.855, "eta_s": 31081, "world_size": 1, "timestamp": "2026-05-04T23:15:35.539504"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22980, "epoch": 0, "train_loss": 3.835586369037628, "train_ppl": 46.3205806606397, "lr": 0.00056, "grad_norm": 0.7034, "tokens_per_sec": 149589, "dt_s": 4.381, "eta_s": 31149, "world_size": 1, "timestamp": "2026-05-04T23:15:39.920594"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 22990, "epoch": 0, "train_loss": 3.7015612721443176, "train_ppl": 40.51050293192482, "lr": 0.00056, "grad_norm": 0.7092, "tokens_per_sec": 152767, "dt_s": 4.29, "eta_s": 30991, "world_size": 1, "timestamp": "2026-05-04T23:15:44.210505"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23000, "epoch": 0, "train_loss": 3.8642287850379944, "train_ppl": 47.66649713790352, "lr": 0.00056, "grad_norm": 0.7349, "tokens_per_sec": 150301, "dt_s": 4.36, "eta_s": 31002, "world_size": 1, "timestamp": "2026-05-04T23:15:48.570826"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23010, "epoch": 0, "train_loss": 3.6736167818307877, "train_ppl": 39.39412847930021, "lr": 0.00056, "grad_norm": 0.676, "tokens_per_sec": 127899, "dt_s": 5.124, "eta_s": 30992, "world_size": 1, "timestamp": "2026-05-04T23:15:53.694856"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23020, "epoch": 0, "train_loss": 3.8266028463840485, "train_ppl": 45.90632221104241, "lr": 0.00056, "grad_norm": 0.6744, "tokens_per_sec": 149348, "dt_s": 4.388, "eta_s": 30337, "world_size": 1, "timestamp": "2026-05-04T23:15:58.083000"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23030, "epoch": 0, "train_loss": 3.770909398794174, "train_ppl": 43.41953255773388, "lr": 0.00056, "grad_norm": 0.6746, "tokens_per_sec": 151212, "dt_s": 4.334, "eta_s": 30267, "world_size": 1, "timestamp": "2026-05-04T23:16:02.417048"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23040, "epoch": 0, "train_loss": 3.8200999945402145, "train_ppl": 45.60876872069537, "lr": 0.00056, "grad_norm": 0.7654, "tokens_per_sec": 150660, "dt_s": 4.35, "eta_s": 30346, "world_size": 1, "timestamp": "2026-05-04T23:16:06.766991"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23050, "epoch": 0, "train_loss": 3.848203122615814, "train_ppl": 46.90869827848253, "lr": 0.00056, "grad_norm": 0.6779, "tokens_per_sec": 150083, "dt_s": 4.367, "eta_s": 30350, "world_size": 1, "timestamp": "2026-05-04T23:16:11.133619"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23060, "epoch": 0, "train_loss": 3.786210834980011, "train_ppl": 44.08902278288191, "lr": 0.00056, "grad_norm": 0.7342, "tokens_per_sec": 151494, "dt_s": 4.326, "eta_s": 30302, "world_size": 1, "timestamp": "2026-05-04T23:16:15.459595"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23070, "epoch": 0, "train_loss": 3.8723506182432175, "train_ppl": 48.05521288258232, "lr": 0.00056, "grad_norm": 0.7046, "tokens_per_sec": 151553, "dt_s": 4.324, "eta_s": 30209, "world_size": 1, "timestamp": "2026-05-04T23:16:19.783891"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23080, "epoch": 0, "train_loss": 3.8436901718378067, "train_ppl": 46.69747860257932, "lr": 0.00056, "grad_norm": 0.7531, "tokens_per_sec": 151174, "dt_s": 4.335, "eta_s": 30206, "world_size": 1, "timestamp": "2026-05-04T23:16:24.119019"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23090, "epoch": 0, "train_loss": 3.944233149290085, "train_ppl": 51.63672526605074, "lr": 0.00056, "grad_norm": 0.6746, "tokens_per_sec": 153173, "dt_s": 4.279, "eta_s": 30102, "world_size": 1, "timestamp": "2026-05-04T23:16:28.397585"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23100, "epoch": 0, "train_loss": 3.804399773478508, "train_ppl": 44.898292877444135, "lr": 0.00056, "grad_norm": 0.6707, "tokens_per_sec": 151141, "dt_s": 4.336, "eta_s": 30056, "world_size": 1, "timestamp": "2026-05-04T23:16:32.733694"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23110, "epoch": 0, "train_loss": 3.7853106558322906, "train_ppl": 44.04935262173412, "lr": 0.00056, "grad_norm": 0.6685, "tokens_per_sec": 151620, "dt_s": 4.322, "eta_s": 30046, "world_size": 1, "timestamp": "2026-05-04T23:16:37.056081"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23120, "epoch": 0, "train_loss": 3.7936753630638123, "train_ppl": 44.41935789566327, "lr": 0.00056, "grad_norm": 0.6677, "tokens_per_sec": 151387, "dt_s": 4.329, "eta_s": 30049, "world_size": 1, "timestamp": "2026-05-04T23:16:41.385113"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23130, "epoch": 0, "train_loss": 3.837591752409935, "train_ppl": 46.41356438573859, "lr": 0.00056, "grad_norm": 0.6543, "tokens_per_sec": 150508, "dt_s": 4.354, "eta_s": 30071, "world_size": 1, "timestamp": "2026-05-04T23:16:45.739421"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23140, "epoch": 0, "train_loss": 3.804565951228142, "train_ppl": 44.90575459468542, "lr": 0.00056, "grad_norm": 0.6617, "tokens_per_sec": 151528, "dt_s": 4.325, "eta_s": 30131, "world_size": 1, "timestamp": "2026-05-04T23:16:50.064426"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23150, "epoch": 0, "train_loss": 3.885992720723152, "train_ppl": 48.71527913024015, "lr": 0.00056, "grad_norm": 0.6875, "tokens_per_sec": 152212, "dt_s": 4.306, "eta_s": 30084, "world_size": 1, "timestamp": "2026-05-04T23:16:54.369998"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23160, "epoch": 0, "train_loss": 3.858036622405052, "train_ppl": 47.37225038753752, "lr": 0.00056, "grad_norm": 0.6793, "tokens_per_sec": 149959, "dt_s": 4.37, "eta_s": 30147, "world_size": 1, "timestamp": "2026-05-04T23:16:58.740265"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23170, "epoch": 0, "train_loss": 3.8348717093467712, "train_ppl": 46.28748903481842, "lr": 0.00056, "grad_norm": 0.6965, "tokens_per_sec": 152891, "dt_s": 4.286, "eta_s": 30083, "world_size": 1, "timestamp": "2026-05-04T23:17:03.026720"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23180, "epoch": 0, "train_loss": 3.7678480446338654, "train_ppl": 43.28681324516199, "lr": 0.00056, "grad_norm": 0.6984, "tokens_per_sec": 152152, "dt_s": 4.307, "eta_s": 30013, "world_size": 1, "timestamp": "2026-05-04T23:17:07.333981"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23190, "epoch": 0, "train_loss": 3.8538605868816376, "train_ppl": 47.17483468149451, "lr": 0.00056, "grad_norm": 0.9456, "tokens_per_sec": 150165, "dt_s": 4.364, "eta_s": 30064, "world_size": 1, "timestamp": "2026-05-04T23:17:11.698253"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23200, "epoch": 0, "train_loss": 3.870093137025833, "train_ppl": 47.946851500014866, "lr": 0.00056, "grad_norm": 0.6721, "tokens_per_sec": 153302, "dt_s": 4.275, "eta_s": 30017, "world_size": 1, "timestamp": "2026-05-04T23:17:15.973214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23210, "epoch": 0, "train_loss": 3.805621460080147, "train_ppl": 44.95317803969621, "lr": 0.00056, "grad_norm": 0.8244, "tokens_per_sec": 151120, "dt_s": 4.337, "eta_s": 29966, "world_size": 1, "timestamp": "2026-05-04T23:17:20.309871"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23220, "epoch": 0, "train_loss": 3.8993883281946182, "train_ppl": 49.372240260186864, "lr": 0.00056, "grad_norm": 0.6748, "tokens_per_sec": 150980, "dt_s": 4.341, "eta_s": 30037, "world_size": 1, "timestamp": "2026-05-04T23:17:24.650595"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23230, "epoch": 0, "train_loss": 3.9131271839141846, "train_ppl": 50.055239415776796, "lr": 0.00056, "grad_norm": 0.6646, "tokens_per_sec": 152723, "dt_s": 4.291, "eta_s": 30010, "world_size": 1, "timestamp": "2026-05-04T23:17:28.941767"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23240, "epoch": 0, "train_loss": 3.732647553086281, "train_ppl": 41.789602020114685, "lr": 0.00056, "grad_norm": 0.6444, "tokens_per_sec": 149947, "dt_s": 4.371, "eta_s": 30015, "world_size": 1, "timestamp": "2026-05-04T23:17:33.312376"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23250, "epoch": 0, "train_loss": 3.861284554004669, "train_ppl": 47.5263623536004, "lr": 0.00056, "grad_norm": 0.651, "tokens_per_sec": 151698, "dt_s": 4.32, "eta_s": 30073, "world_size": 1, "timestamp": "2026-05-04T23:17:37.632531"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23260, "epoch": 0, "train_loss": 3.770476207137108, "train_ppl": 43.400727651834394, "lr": 0.00056, "grad_norm": 0.6539, "tokens_per_sec": 135641, "dt_s": 4.832, "eta_s": 30756, "world_size": 1, "timestamp": "2026-05-04T23:17:42.464102"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23270, "epoch": 0, "train_loss": 3.819208398461342, "train_ppl": 45.568122244153976, "lr": 0.00056, "grad_norm": 0.7108, "tokens_per_sec": 149636, "dt_s": 4.38, "eta_s": 30806, "world_size": 1, "timestamp": "2026-05-04T23:17:46.843813"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23280, "epoch": 0, "train_loss": 3.848158225417137, "train_ppl": 46.90659225661384, "lr": 0.00056, "grad_norm": 0.6708, "tokens_per_sec": 152024, "dt_s": 4.311, "eta_s": 30829, "world_size": 1, "timestamp": "2026-05-04T23:17:51.154697"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23290, "epoch": 0, "train_loss": 3.8042861074209213, "train_ppl": 44.89318975553175, "lr": 0.00056, "grad_norm": 0.7126, "tokens_per_sec": 152857, "dt_s": 4.287, "eta_s": 30709, "world_size": 1, "timestamp": "2026-05-04T23:17:55.442110"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23300, "epoch": 0, "train_loss": 3.782310426235199, "train_ppl": 43.91739250457546, "lr": 0.00056, "grad_norm": 0.6691, "tokens_per_sec": 149091, "dt_s": 4.396, "eta_s": 30809, "world_size": 1, "timestamp": "2026-05-04T23:17:59.837814"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23310, "epoch": 0, "train_loss": 3.812122479081154, "train_ppl": 45.24637149980724, "lr": 0.00056, "grad_norm": 0.6898, "tokens_per_sec": 152259, "dt_s": 4.304, "eta_s": 30073, "world_size": 1, "timestamp": "2026-05-04T23:18:04.142056"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23320, "epoch": 0, "train_loss": 3.687075674533844, "train_ppl": 39.927913850103984, "lr": 0.00056, "grad_norm": 0.681, "tokens_per_sec": 152454, "dt_s": 4.299, "eta_s": 29956, "world_size": 1, "timestamp": "2026-05-04T23:18:08.440790"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23330, "epoch": 0, "train_loss": 3.844333365559578, "train_ppl": 46.72752378904089, "lr": 0.00056, "grad_norm": 0.6925, "tokens_per_sec": 150056, "dt_s": 4.367, "eta_s": 30030, "world_size": 1, "timestamp": "2026-05-04T23:18:12.808242"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23340, "epoch": 0, "train_loss": 3.7572406828403473, "train_ppl": 42.830081000732285, "lr": 0.00056, "grad_norm": 0.76, "tokens_per_sec": 152168, "dt_s": 4.307, "eta_s": 30053, "world_size": 1, "timestamp": "2026-05-04T23:18:17.115037"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23350, "epoch": 0, "train_loss": 3.781250312924385, "train_ppl": 43.8708597615532, "lr": 0.00056, "grad_norm": 0.7447, "tokens_per_sec": 150449, "dt_s": 4.356, "eta_s": 29994, "world_size": 1, "timestamp": "2026-05-04T23:18:21.471096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23360, "epoch": 0, "train_loss": 3.9315787702798843, "train_ppl": 50.98741156920281, "lr": 0.00056, "grad_norm": 0.7111, "tokens_per_sec": 148877, "dt_s": 4.402, "eta_s": 30125, "world_size": 1, "timestamp": "2026-05-04T23:18:25.873117"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23370, "epoch": 0, "train_loss": 3.8289719372987747, "train_ppl": 46.015207390438384, "lr": 0.00056, "grad_norm": 0.6586, "tokens_per_sec": 151340, "dt_s": 4.33, "eta_s": 30164, "world_size": 1, "timestamp": "2026-05-04T23:18:30.203495"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23380, "epoch": 0, "train_loss": 3.8274777233600616, "train_ppl": 45.94650216909555, "lr": 0.00056, "grad_norm": 0.7569, "tokens_per_sec": 150159, "dt_s": 4.364, "eta_s": 30156, "world_size": 1, "timestamp": "2026-05-04T23:18:34.567919"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23390, "epoch": 0, "train_loss": 3.8646674156188965, "train_ppl": 47.68740970734363, "lr": 0.00056, "grad_norm": 1.0044, "tokens_per_sec": 150123, "dt_s": 4.365, "eta_s": 30233, "world_size": 1, "timestamp": "2026-05-04T23:18:38.933409"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23400, "epoch": 0, "train_loss": 3.744122952222824, "train_ppl": 42.27191646566088, "lr": 0.00056, "grad_norm": 0.6751, "tokens_per_sec": 151565, "dt_s": 4.324, "eta_s": 30184, "world_size": 1, "timestamp": "2026-05-04T23:18:43.257360"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23410, "epoch": 0, "train_loss": 3.7489782571792603, "train_ppl": 42.47765857736202, "lr": 0.00056, "grad_norm": 0.7175, "tokens_per_sec": 148072, "dt_s": 4.426, "eta_s": 30213, "world_size": 1, "timestamp": "2026-05-04T23:18:47.683334"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23420, "epoch": 0, "train_loss": 3.787309765815735, "train_ppl": 44.137500201321295, "lr": 0.00056, "grad_norm": 0.6519, "tokens_per_sec": 150958, "dt_s": 4.341, "eta_s": 30224, "world_size": 1, "timestamp": "2026-05-04T23:18:52.024662"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23430, "epoch": 0, "train_loss": 3.741704061627388, "train_ppl": 42.16978889198075, "lr": 0.00056, "grad_norm": 0.6809, "tokens_per_sec": 151402, "dt_s": 4.329, "eta_s": 30170, "world_size": 1, "timestamp": "2026-05-04T23:18:56.353268"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23440, "epoch": 0, "train_loss": 3.896906316280365, "train_ppl": 49.249849721845926, "lr": 0.00056, "grad_norm": 0.6693, "tokens_per_sec": 148458, "dt_s": 4.414, "eta_s": 30233, "world_size": 1, "timestamp": "2026-05-04T23:19:00.767698"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23450, "epoch": 0, "train_loss": 3.82585708796978, "train_ppl": 45.872099947342036, "lr": 0.00056, "grad_norm": 0.7568, "tokens_per_sec": 152557, "dt_s": 4.296, "eta_s": 30190, "world_size": 1, "timestamp": "2026-05-04T23:19:05.063531"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23460, "epoch": 0, "train_loss": 3.801862046122551, "train_ppl": 44.78449770300748, "lr": 0.00056, "grad_norm": 0.6735, "tokens_per_sec": 151317, "dt_s": 4.331, "eta_s": 30054, "world_size": 1, "timestamp": "2026-05-04T23:19:09.394573"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23470, "epoch": 0, "train_loss": 3.819288283586502, "train_ppl": 45.57176260470616, "lr": 0.00056, "grad_norm": 0.7601, "tokens_per_sec": 147957, "dt_s": 4.429, "eta_s": 30172, "world_size": 1, "timestamp": "2026-05-04T23:19:13.823961"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23480, "epoch": 0, "train_loss": 3.8764753490686417, "train_ppl": 48.25383705449768, "lr": 0.00056, "grad_norm": 0.65, "tokens_per_sec": 152771, "dt_s": 4.29, "eta_s": 30114, "world_size": 1, "timestamp": "2026-05-04T23:19:18.113836"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23490, "epoch": 0, "train_loss": 3.886197730898857, "train_ppl": 48.72526728197555, "lr": 0.00056, "grad_norm": 0.7097, "tokens_per_sec": 151182, "dt_s": 4.335, "eta_s": 29999, "world_size": 1, "timestamp": "2026-05-04T23:19:22.448723"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23500, "epoch": 0, "train_loss": 3.8317866772413254, "train_ppl": 46.1449106875365, "lr": 0.00056, "grad_norm": 0.7625, "tokens_per_sec": 149173, "dt_s": 4.393, "eta_s": 30130, "world_size": 1, "timestamp": "2026-05-04T23:19:26.841990"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23510, "epoch": 0, "train_loss": 3.7581865042448044, "train_ppl": 42.87060977156553, "lr": 0.00056, "grad_norm": 0.6603, "tokens_per_sec": 108082, "dt_s": 6.064, "eta_s": 30227, "world_size": 1, "timestamp": "2026-05-04T23:19:32.905544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23520, "epoch": 0, "train_loss": 3.819727346301079, "train_ppl": 45.59177585971896, "lr": 0.00056, "grad_norm": 0.7026, "tokens_per_sec": 147374, "dt_s": 4.447, "eta_s": 30247, "world_size": 1, "timestamp": "2026-05-04T23:19:37.352485"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23530, "epoch": 0, "train_loss": 3.826471596956253, "train_ppl": 45.90029742790334, "lr": 0.00056, "grad_norm": 0.6937, "tokens_per_sec": 148714, "dt_s": 4.407, "eta_s": 30404, "world_size": 1, "timestamp": "2026-05-04T23:19:41.759360"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23540, "epoch": 0, "train_loss": 3.9686036556959152, "train_ppl": 52.9105978218205, "lr": 0.00056, "grad_norm": 0.9963, "tokens_per_sec": 149349, "dt_s": 4.388, "eta_s": 30473, "world_size": 1, "timestamp": "2026-05-04T23:19:46.147415"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23550, "epoch": 0, "train_loss": 3.7619836777448654, "train_ppl": 43.03370637199846, "lr": 0.00056, "grad_norm": 0.7547, "tokens_per_sec": 149436, "dt_s": 4.386, "eta_s": 30458, "world_size": 1, "timestamp": "2026-05-04T23:19:50.533000"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23560, "epoch": 0, "train_loss": 3.715726360678673, "train_ppl": 41.08842126144754, "lr": 0.00056, "grad_norm": 0.7157, "tokens_per_sec": 135454, "dt_s": 4.838, "eta_s": 31053, "world_size": 1, "timestamp": "2026-05-04T23:19:55.371254"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23570, "epoch": 0, "train_loss": 3.7430724799633026, "train_ppl": 42.227534305252334, "lr": 0.00056, "grad_norm": 0.8073, "tokens_per_sec": 149841, "dt_s": 4.374, "eta_s": 30948, "world_size": 1, "timestamp": "2026-05-04T23:19:59.744952"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23580, "epoch": 0, "train_loss": 3.779631122946739, "train_ppl": 43.79988198387703, "lr": 0.00056, "grad_norm": 0.6884, "tokens_per_sec": 148653, "dt_s": 4.409, "eta_s": 30946, "world_size": 1, "timestamp": "2026-05-04T23:20:04.153594"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23590, "epoch": 0, "train_loss": 3.7939858585596085, "train_ppl": 44.43315204761624, "lr": 0.00056, "grad_norm": 0.6482, "tokens_per_sec": 151541, "dt_s": 4.325, "eta_s": 30854, "world_size": 1, "timestamp": "2026-05-04T23:20:08.478258"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23600, "epoch": 0, "train_loss": 3.8278507590293884, "train_ppl": 45.96364505053975, "lr": 0.00056, "grad_norm": 0.7097, "tokens_per_sec": 148366, "dt_s": 4.417, "eta_s": 30893, "world_size": 1, "timestamp": "2026-05-04T23:20:12.895404"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23610, "epoch": 0, "train_loss": 3.85537426173687, "train_ppl": 47.24629611359363, "lr": 0.00056, "grad_norm": 0.6955, "tokens_per_sec": 150569, "dt_s": 4.353, "eta_s": 30217, "world_size": 1, "timestamp": "2026-05-04T23:20:17.247980"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23620, "epoch": 0, "train_loss": 3.7517721951007843, "train_ppl": 42.59650446518514, "lr": 0.00056, "grad_norm": 0.7684, "tokens_per_sec": 150491, "dt_s": 4.355, "eta_s": 30187, "world_size": 1, "timestamp": "2026-05-04T23:20:21.602783"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23630, "epoch": 0, "train_loss": 3.8264275789260864, "train_ppl": 45.89827703169376, "lr": 0.00056, "grad_norm": 0.6769, "tokens_per_sec": 149600, "dt_s": 4.381, "eta_s": 30144, "world_size": 1, "timestamp": "2026-05-04T23:20:25.983524"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23640, "epoch": 0, "train_loss": 3.7889961302280426, "train_ppl": 44.211994905864195, "lr": 0.00056, "grad_norm": 0.7057, "tokens_per_sec": 150757, "dt_s": 4.347, "eta_s": 30171, "world_size": 1, "timestamp": "2026-05-04T23:20:30.330668"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23650, "epoch": 0, "train_loss": 3.849313899874687, "train_ppl": 46.96083234307993, "lr": 0.00056, "grad_norm": 0.7577, "tokens_per_sec": 151348, "dt_s": 4.33, "eta_s": 30046, "world_size": 1, "timestamp": "2026-05-04T23:20:34.660826"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23660, "epoch": 0, "train_loss": 3.823236644268036, "train_ppl": 45.75205205002212, "lr": 0.00056, "grad_norm": 0.6965, "tokens_per_sec": 150297, "dt_s": 4.36, "eta_s": 30053, "world_size": 1, "timestamp": "2026-05-04T23:20:39.021282"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23670, "epoch": 0, "train_loss": 3.8265687078237534, "train_ppl": 45.90475506204394, "lr": 0.00056, "grad_norm": 0.7211, "tokens_per_sec": 151544, "dt_s": 4.325, "eta_s": 30007, "world_size": 1, "timestamp": "2026-05-04T23:20:43.345812"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23680, "epoch": 0, "train_loss": 3.7576322704553604, "train_ppl": 42.846856014231584, "lr": 0.00056, "grad_norm": 0.6733, "tokens_per_sec": 150392, "dt_s": 4.358, "eta_s": 29970, "world_size": 1, "timestamp": "2026-05-04T23:20:47.703490"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23690, "epoch": 0, "train_loss": 3.76797154545784, "train_ppl": 43.29215953239364, "lr": 0.00056, "grad_norm": 0.731, "tokens_per_sec": 152680, "dt_s": 4.292, "eta_s": 29891, "world_size": 1, "timestamp": "2026-05-04T23:20:51.995854"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23700, "epoch": 0, "train_loss": 3.8428369909524918, "train_ppl": 46.65765419756949, "lr": 0.00056, "grad_norm": 0.6472, "tokens_per_sec": 150475, "dt_s": 4.355, "eta_s": 29921, "world_size": 1, "timestamp": "2026-05-04T23:20:56.351134"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23710, "epoch": 0, "train_loss": 3.826880380511284, "train_ppl": 45.91906455024704, "lr": 0.00056, "grad_norm": 0.7074, "tokens_per_sec": 148791, "dt_s": 4.405, "eta_s": 29977, "world_size": 1, "timestamp": "2026-05-04T23:21:00.755717"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23720, "epoch": 0, "train_loss": 3.751675933599472, "train_ppl": 42.592404259063834, "lr": 0.00056, "grad_norm": 0.6761, "tokens_per_sec": 150386, "dt_s": 4.358, "eta_s": 30019, "world_size": 1, "timestamp": "2026-05-04T23:21:05.113571"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23730, "epoch": 0, "train_loss": 3.863908290863037, "train_ppl": 47.651222751036954, "lr": 0.00056, "grad_norm": 0.6975, "tokens_per_sec": 150344, "dt_s": 4.359, "eta_s": 30017, "world_size": 1, "timestamp": "2026-05-04T23:21:09.472636"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23740, "epoch": 0, "train_loss": 3.8841254711151123, "train_ppl": 48.62440041740409, "lr": 0.00056, "grad_norm": 0.7009, "tokens_per_sec": 148312, "dt_s": 4.419, "eta_s": 30187, "world_size": 1, "timestamp": "2026-05-04T23:21:13.891428"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23750, "epoch": 0, "train_loss": 3.715332016348839, "train_ppl": 41.07222146985917, "lr": 0.00056, "grad_norm": 0.6752, "tokens_per_sec": 150991, "dt_s": 4.34, "eta_s": 30162, "world_size": 1, "timestamp": "2026-05-04T23:21:18.231807"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23760, "epoch": 0, "train_loss": 3.8887680172920227, "train_ppl": 48.85066626008132, "lr": 0.00056, "grad_norm": 0.6978, "tokens_per_sec": 151179, "dt_s": 4.335, "eta_s": 30061, "world_size": 1, "timestamp": "2026-05-04T23:21:22.566809"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23770, "epoch": 0, "train_loss": 3.832892507314682, "train_ppl": 46.19596734229635, "lr": 0.00056, "grad_norm": 0.7174, "tokens_per_sec": 151381, "dt_s": 4.329, "eta_s": 30018, "world_size": 1, "timestamp": "2026-05-04T23:21:26.896017"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23780, "epoch": 0, "train_loss": 3.8802498131990433, "train_ppl": 48.43631359054528, "lr": 0.00056, "grad_norm": 0.6849, "tokens_per_sec": 151281, "dt_s": 4.332, "eta_s": 29976, "world_size": 1, "timestamp": "2026-05-04T23:21:31.228084"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23790, "epoch": 0, "train_loss": 3.878371924161911, "train_ppl": 48.34544091935595, "lr": 0.00056, "grad_norm": 0.6875, "tokens_per_sec": 150577, "dt_s": 4.352, "eta_s": 29880, "world_size": 1, "timestamp": "2026-05-04T23:21:35.580488"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23800, "epoch": 0, "train_loss": 3.7911813259124756, "train_ppl": 44.308712401173196, "lr": 0.00056, "grad_norm": 0.6896, "tokens_per_sec": 151085, "dt_s": 4.338, "eta_s": 29872, "world_size": 1, "timestamp": "2026-05-04T23:21:39.918128"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23810, "epoch": 0, "train_loss": 3.9133536368608475, "train_ppl": 50.06657585577509, "lr": 0.00056, "grad_norm": 0.7242, "tokens_per_sec": 150413, "dt_s": 4.357, "eta_s": 29898, "world_size": 1, "timestamp": "2026-05-04T23:21:44.275203"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23820, "epoch": 0, "train_loss": 3.8139811605215073, "train_ppl": 45.33054829544737, "lr": 0.00056, "grad_norm": 0.646, "tokens_per_sec": 146947, "dt_s": 4.46, "eta_s": 30074, "world_size": 1, "timestamp": "2026-05-04T23:21:48.735027"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23830, "epoch": 0, "train_loss": 3.779503673315048, "train_ppl": 43.794300060764655, "lr": 0.00056, "grad_norm": 0.6842, "tokens_per_sec": 151966, "dt_s": 4.313, "eta_s": 30042, "world_size": 1, "timestamp": "2026-05-04T23:21:53.047569"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23840, "epoch": 0, "train_loss": 3.721672758460045, "train_ppl": 41.333477236539366, "lr": 0.00056, "grad_norm": 0.9053, "tokens_per_sec": 150983, "dt_s": 4.341, "eta_s": 30022, "world_size": 1, "timestamp": "2026-05-04T23:21:57.388205"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23850, "epoch": 0, "train_loss": 3.911373868584633, "train_ppl": 49.967553690011314, "lr": 0.00056, "grad_norm": 0.708, "tokens_per_sec": 148960, "dt_s": 4.4, "eta_s": 30103, "world_size": 1, "timestamp": "2026-05-04T23:22:01.787786"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23860, "epoch": 0, "train_loss": 3.894042521715164, "train_ppl": 49.10901003409764, "lr": 0.00056, "grad_norm": 0.8924, "tokens_per_sec": 135851, "dt_s": 4.824, "eta_s": 30741, "world_size": 1, "timestamp": "2026-05-04T23:22:06.611890"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23870, "epoch": 0, "train_loss": 3.812517747282982, "train_ppl": 45.26425948675253, "lr": 0.00056, "grad_norm": 0.7183, "tokens_per_sec": 149432, "dt_s": 4.386, "eta_s": 30635, "world_size": 1, "timestamp": "2026-05-04T23:22:10.997568"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23880, "epoch": 0, "train_loss": 3.758345440030098, "train_ppl": 42.8774239870926, "lr": 0.00056, "grad_norm": 0.6616, "tokens_per_sec": 150467, "dt_s": 4.355, "eta_s": 30689, "world_size": 1, "timestamp": "2026-05-04T23:22:15.353061"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23890, "epoch": 0, "train_loss": 3.806913197040558, "train_ppl": 45.011283241496955, "lr": 0.00056, "grad_norm": 0.6661, "tokens_per_sec": 150009, "dt_s": 4.369, "eta_s": 30724, "world_size": 1, "timestamp": "2026-05-04T23:22:19.721881"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23900, "epoch": 0, "train_loss": 3.806563913822174, "train_ppl": 44.99556430096366, "lr": 0.00056, "grad_norm": 0.6864, "tokens_per_sec": 148890, "dt_s": 4.402, "eta_s": 30722, "world_size": 1, "timestamp": "2026-05-04T23:22:24.123496"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23910, "epoch": 0, "train_loss": 3.7525572776794434, "train_ppl": 42.62995936946545, "lr": 0.00056, "grad_norm": 0.7316, "tokens_per_sec": 152137, "dt_s": 4.308, "eta_s": 30007, "world_size": 1, "timestamp": "2026-05-04T23:22:28.431206"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23920, "epoch": 0, "train_loss": 3.7003778368234634, "train_ppl": 40.46258972857015, "lr": 0.00056, "grad_norm": 0.7681, "tokens_per_sec": 152344, "dt_s": 4.302, "eta_s": 29888, "world_size": 1, "timestamp": "2026-05-04T23:22:32.733019"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23930, "epoch": 0, "train_loss": 3.9228680431842804, "train_ppl": 50.54520291726544, "lr": 0.00056, "grad_norm": 0.6547, "tokens_per_sec": 149314, "dt_s": 4.389, "eta_s": 29929, "world_size": 1, "timestamp": "2026-05-04T23:22:37.122195"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23940, "epoch": 0, "train_loss": 3.881328731775284, "train_ppl": 48.48860063070007, "lr": 0.00056, "grad_norm": 0.7178, "tokens_per_sec": 150637, "dt_s": 4.351, "eta_s": 29900, "world_size": 1, "timestamp": "2026-05-04T23:22:41.472763"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23950, "epoch": 0, "train_loss": 3.8982432782649994, "train_ppl": 49.31573893454471, "lr": 0.00056, "grad_norm": 0.7518, "tokens_per_sec": 151398, "dt_s": 4.329, "eta_s": 29795, "world_size": 1, "timestamp": "2026-05-04T23:22:45.801481"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23960, "epoch": 0, "train_loss": 3.768437147140503, "train_ppl": 43.31232112799023, "lr": 0.00056, "grad_norm": 0.6856, "tokens_per_sec": 149140, "dt_s": 4.394, "eta_s": 29910, "world_size": 1, "timestamp": "2026-05-04T23:22:50.195741"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23970, "epoch": 0, "train_loss": 3.9029482305049896, "train_ppl": 49.54831382876218, "lr": 0.00056, "grad_norm": 0.7151, "tokens_per_sec": 152551, "dt_s": 4.296, "eta_s": 29898, "world_size": 1, "timestamp": "2026-05-04T23:22:54.491727"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23980, "epoch": 0, "train_loss": 3.896756172180176, "train_ppl": 49.24245570257307, "lr": 0.00056, "grad_norm": 0.7076, "tokens_per_sec": 150750, "dt_s": 4.347, "eta_s": 29836, "world_size": 1, "timestamp": "2026-05-04T23:22:58.839067"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 23990, "epoch": 0, "train_loss": 3.818587601184845, "train_ppl": 45.53984245688542, "lr": 0.00056, "grad_norm": 0.6697, "tokens_per_sec": 149650, "dt_s": 4.379, "eta_s": 29871, "world_size": 1, "timestamp": "2026-05-04T23:23:03.218364"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24000, "epoch": 0, "train_loss": 3.7081493735313416, "train_ppl": 40.77827130658294, "lr": 0.00056, "grad_norm": 0.6712, "tokens_per_sec": 151866, "dt_s": 4.315, "eta_s": 29848, "world_size": 1, "timestamp": "2026-05-04T23:23:07.533739"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24010, "epoch": 0, "train_loss": 3.9486889839172363, "train_ppl": 51.867323346446426, "lr": 0.00056, "grad_norm": 0.6971, "tokens_per_sec": 124645, "dt_s": 5.258, "eta_s": 29939, "world_size": 1, "timestamp": "2026-05-04T23:23:12.791549"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24020, "epoch": 0, "train_loss": 3.8474944084882736, "train_ppl": 46.875465199075485, "lr": 0.00056, "grad_norm": 0.7669, "tokens_per_sec": 152088, "dt_s": 4.309, "eta_s": 29953, "world_size": 1, "timestamp": "2026-05-04T23:23:17.100656"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24030, "epoch": 0, "train_loss": 3.7074658274650574, "train_ppl": 40.750407003993594, "lr": 0.00056, "grad_norm": 0.7425, "tokens_per_sec": 150800, "dt_s": 4.346, "eta_s": 29947, "world_size": 1, "timestamp": "2026-05-04T23:23:21.446525"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24040, "epoch": 0, "train_loss": 3.8411395996809006, "train_ppl": 46.578525078118446, "lr": 0.00056, "grad_norm": 0.6427, "tokens_per_sec": 145581, "dt_s": 4.502, "eta_s": 30111, "world_size": 1, "timestamp": "2026-05-04T23:23:25.948256"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24050, "epoch": 0, "train_loss": 3.797520413994789, "train_ppl": 44.59048136749572, "lr": 0.00056, "grad_norm": 0.7034, "tokens_per_sec": 150870, "dt_s": 4.344, "eta_s": 30145, "world_size": 1, "timestamp": "2026-05-04T23:23:30.292081"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24060, "epoch": 0, "train_loss": 3.8744451254606247, "train_ppl": 48.155970354590174, "lr": 0.00056, "grad_norm": 0.7045, "tokens_per_sec": 151083, "dt_s": 4.338, "eta_s": 29968, "world_size": 1, "timestamp": "2026-05-04T23:23:34.629836"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24070, "epoch": 0, "train_loss": 3.839422583580017, "train_ppl": 46.49861762143033, "lr": 0.00056, "grad_norm": 0.6837, "tokens_per_sec": 148622, "dt_s": 4.41, "eta_s": 30101, "world_size": 1, "timestamp": "2026-05-04T23:23:39.039422"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24080, "epoch": 0, "train_loss": 3.816503420472145, "train_ppl": 45.44502803509329, "lr": 0.00056, "grad_norm": 0.723, "tokens_per_sec": 151792, "dt_s": 4.317, "eta_s": 30058, "world_size": 1, "timestamp": "2026-05-04T23:23:43.356928"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24090, "epoch": 0, "train_loss": 3.9966113418340683, "train_ppl": 54.41344868779458, "lr": 0.00056, "grad_norm": 0.7708, "tokens_per_sec": 150626, "dt_s": 4.351, "eta_s": 29847, "world_size": 1, "timestamp": "2026-05-04T23:23:47.707855"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24100, "epoch": 0, "train_loss": 3.8531337082386017, "train_ppl": 47.14055676112914, "lr": 0.00056, "grad_norm": 0.7105, "tokens_per_sec": 148466, "dt_s": 4.414, "eta_s": 29939, "world_size": 1, "timestamp": "2026-05-04T23:23:52.122043"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24110, "epoch": 0, "train_loss": 3.8705607503652573, "train_ppl": 47.96927733025971, "lr": 0.00056, "grad_norm": 0.6709, "tokens_per_sec": 151116, "dt_s": 4.337, "eta_s": 29933, "world_size": 1, "timestamp": "2026-05-04T23:23:56.458826"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24120, "epoch": 0, "train_loss": 3.876923516392708, "train_ppl": 48.275467694237236, "lr": 0.00056, "grad_norm": 0.7153, "tokens_per_sec": 150039, "dt_s": 4.368, "eta_s": 29872, "world_size": 1, "timestamp": "2026-05-04T23:24:00.826783"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24130, "epoch": 0, "train_loss": 3.7542490363121033, "train_ppl": 42.7021400101343, "lr": 0.00056, "grad_norm": 0.6478, "tokens_per_sec": 150969, "dt_s": 4.341, "eta_s": 29900, "world_size": 1, "timestamp": "2026-05-04T23:24:05.167808"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24140, "epoch": 0, "train_loss": 3.7702482491731644, "train_ppl": 43.390835237895324, "lr": 0.00056, "grad_norm": 0.6945, "tokens_per_sec": 151684, "dt_s": 4.321, "eta_s": 29854, "world_size": 1, "timestamp": "2026-05-04T23:24:09.488351"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24150, "epoch": 0, "train_loss": 3.7596780955791473, "train_ppl": 42.93460291554374, "lr": 0.00056, "grad_norm": 0.7311, "tokens_per_sec": 133997, "dt_s": 4.891, "eta_s": 30503, "world_size": 1, "timestamp": "2026-05-04T23:24:14.379203"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24160, "epoch": 0, "train_loss": 3.717821925878525, "train_ppl": 41.17461500792512, "lr": 0.00056, "grad_norm": 0.6606, "tokens_per_sec": 151051, "dt_s": 4.339, "eta_s": 30501, "world_size": 1, "timestamp": "2026-05-04T23:24:18.717877"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24170, "epoch": 0, "train_loss": 3.8326592445373535, "train_ppl": 46.18519279935147, "lr": 0.00056, "grad_norm": 0.6744, "tokens_per_sec": 151735, "dt_s": 4.319, "eta_s": 30429, "world_size": 1, "timestamp": "2026-05-04T23:24:23.036995"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24180, "epoch": 0, "train_loss": 4.051156252622604, "train_ppl": 57.463861388385546, "lr": 0.00056, "grad_norm": 0.7146, "tokens_per_sec": 148276, "dt_s": 4.42, "eta_s": 30533, "world_size": 1, "timestamp": "2026-05-04T23:24:27.456849"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24190, "epoch": 0, "train_loss": 3.8748848885297775, "train_ppl": 48.177152229073045, "lr": 0.00056, "grad_norm": 0.6846, "tokens_per_sec": 150515, "dt_s": 4.354, "eta_s": 30575, "world_size": 1, "timestamp": "2026-05-04T23:24:31.810975"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24200, "epoch": 0, "train_loss": 3.823368862271309, "train_ppl": 45.75810169491699, "lr": 0.00056, "grad_norm": 0.7215, "tokens_per_sec": 149935, "dt_s": 4.371, "eta_s": 29858, "world_size": 1, "timestamp": "2026-05-04T23:24:36.181926"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24210, "epoch": 0, "train_loss": 3.7579829692840576, "train_ppl": 42.86188499161748, "lr": 0.00056, "grad_norm": 0.8239, "tokens_per_sec": 148612, "dt_s": 4.41, "eta_s": 29951, "world_size": 1, "timestamp": "2026-05-04T23:24:40.591835"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24220, "epoch": 0, "train_loss": 3.8663633465766907, "train_ppl": 47.76835287937836, "lr": 0.00056, "grad_norm": 0.6831, "tokens_per_sec": 152262, "dt_s": 4.304, "eta_s": 29926, "world_size": 1, "timestamp": "2026-05-04T23:24:44.895988"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24230, "epoch": 0, "train_loss": 3.8483141660690308, "train_ppl": 46.91390747154331, "lr": 0.00056, "grad_norm": 0.6941, "tokens_per_sec": 150171, "dt_s": 4.364, "eta_s": 29846, "world_size": 1, "timestamp": "2026-05-04T23:24:49.260066"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24240, "epoch": 0, "train_loss": 3.8534425795078278, "train_ppl": 47.155119373598495, "lr": 0.00056, "grad_norm": 0.6902, "tokens_per_sec": 151330, "dt_s": 4.331, "eta_s": 29809, "world_size": 1, "timestamp": "2026-05-04T23:24:53.590747"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24250, "epoch": 0, "train_loss": 3.857274651527405, "train_ppl": 47.336167860996596, "lr": 0.00056, "grad_norm": 0.669, "tokens_per_sec": 151435, "dt_s": 4.328, "eta_s": 29746, "world_size": 1, "timestamp": "2026-05-04T23:24:57.918417"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24260, "epoch": 0, "train_loss": 3.788023665547371, "train_ppl": 44.16902120094796, "lr": 0.00056, "grad_norm": 0.6815, "tokens_per_sec": 147578, "dt_s": 4.441, "eta_s": 29783, "world_size": 1, "timestamp": "2026-05-04T23:25:02.359168"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24270, "epoch": 0, "train_loss": 3.892840266227722, "train_ppl": 49.05000393461213, "lr": 0.00056, "grad_norm": 0.6985, "tokens_per_sec": 150334, "dt_s": 4.359, "eta_s": 29855, "world_size": 1, "timestamp": "2026-05-04T23:25:06.718549"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24280, "epoch": 0, "train_loss": 3.883039638400078, "train_ppl": 48.57163110718835, "lr": 0.00056, "grad_norm": 0.6756, "tokens_per_sec": 153019, "dt_s": 4.283, "eta_s": 29739, "world_size": 1, "timestamp": "2026-05-04T23:25:11.001413"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24290, "epoch": 0, "train_loss": 3.853895738720894, "train_ppl": 47.17649299284635, "lr": 0.00056, "grad_norm": 0.6755, "tokens_per_sec": 148417, "dt_s": 4.416, "eta_s": 29851, "world_size": 1, "timestamp": "2026-05-04T23:25:15.417080"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24300, "epoch": 0, "train_loss": 3.8407514095306396, "train_ppl": 46.56044726251143, "lr": 0.00056, "grad_norm": 0.6556, "tokens_per_sec": 152312, "dt_s": 4.303, "eta_s": 29813, "world_size": 1, "timestamp": "2026-05-04T23:25:19.719884"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24310, "epoch": 0, "train_loss": 3.839555099606514, "train_ppl": 46.50477984176258, "lr": 0.00056, "grad_norm": 0.6531, "tokens_per_sec": 151747, "dt_s": 4.319, "eta_s": 29641, "world_size": 1, "timestamp": "2026-05-04T23:25:24.038614"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24320, "epoch": 0, "train_loss": 4.039649918675423, "train_ppl": 56.806452445933644, "lr": 0.00056, "grad_norm": 0.8403, "tokens_per_sec": 149921, "dt_s": 4.371, "eta_s": 29654, "world_size": 1, "timestamp": "2026-05-04T23:25:28.409987"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24330, "epoch": 0, "train_loss": 3.757771670818329, "train_ppl": 42.8528292978413, "lr": 0.00056, "grad_norm": 0.6662, "tokens_per_sec": 150152, "dt_s": 4.365, "eta_s": 29761, "world_size": 1, "timestamp": "2026-05-04T23:25:32.774647"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24340, "epoch": 0, "train_loss": 3.911706566810608, "train_ppl": 49.98418057219403, "lr": 0.00056, "grad_norm": 0.7231, "tokens_per_sec": 149024, "dt_s": 4.398, "eta_s": 29732, "world_size": 1, "timestamp": "2026-05-04T23:25:37.172337"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24350, "epoch": 0, "train_loss": 3.9835106432437897, "train_ppl": 53.70524361683947, "lr": 0.00056, "grad_norm": 0.7206, "tokens_per_sec": 150771, "dt_s": 4.347, "eta_s": 29788, "world_size": 1, "timestamp": "2026-05-04T23:25:41.519054"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24360, "epoch": 0, "train_loss": 3.8113396763801575, "train_ppl": 45.21096637740759, "lr": 0.00056, "grad_norm": 0.6613, "tokens_per_sec": 150861, "dt_s": 4.344, "eta_s": 29818, "world_size": 1, "timestamp": "2026-05-04T23:25:45.863173"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24370, "epoch": 0, "train_loss": 3.863637253642082, "train_ppl": 47.638309246146704, "lr": 0.00056, "grad_norm": 0.7501, "tokens_per_sec": 149066, "dt_s": 4.396, "eta_s": 29848, "world_size": 1, "timestamp": "2026-05-04T23:25:50.259634"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24380, "epoch": 0, "train_loss": 3.8170639872550964, "train_ppl": 45.47051014980631, "lr": 0.00056, "grad_norm": 0.8169, "tokens_per_sec": 152241, "dt_s": 4.305, "eta_s": 29762, "world_size": 1, "timestamp": "2026-05-04T23:25:54.564397"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24390, "epoch": 0, "train_loss": 3.6977615505456924, "train_ppl": 40.35686637188983, "lr": 0.00056, "grad_norm": 0.6956, "tokens_per_sec": 150557, "dt_s": 4.353, "eta_s": 29696, "world_size": 1, "timestamp": "2026-05-04T23:25:58.917309"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24400, "epoch": 0, "train_loss": 3.8543187975883484, "train_ppl": 47.19645564893398, "lr": 0.00056, "grad_norm": 0.7015, "tokens_per_sec": 148307, "dt_s": 4.419, "eta_s": 29791, "world_size": 1, "timestamp": "2026-05-04T23:26:03.336258"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24410, "epoch": 0, "train_loss": 3.6564143300056458, "train_ppl": 38.722248442646475, "lr": 0.00056, "grad_norm": 0.6745, "tokens_per_sec": 150070, "dt_s": 4.367, "eta_s": 29818, "world_size": 1, "timestamp": "2026-05-04T23:26:07.703283"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24420, "epoch": 0, "train_loss": 3.822411596775055, "train_ppl": 45.71432000168582, "lr": 0.00056, "grad_norm": 0.6469, "tokens_per_sec": 150840, "dt_s": 4.345, "eta_s": 29742, "world_size": 1, "timestamp": "2026-05-04T23:26:12.047989"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24430, "epoch": 0, "train_loss": 3.8858631253242493, "train_ppl": 48.70896626327669, "lr": 0.00056, "grad_norm": 0.6656, "tokens_per_sec": 149278, "dt_s": 4.39, "eta_s": 29855, "world_size": 1, "timestamp": "2026-05-04T23:26:16.438170"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24440, "epoch": 0, "train_loss": 3.877113699913025, "train_ppl": 48.28464976573994, "lr": 0.00056, "grad_norm": 0.6399, "tokens_per_sec": 149896, "dt_s": 4.372, "eta_s": 29877, "world_size": 1, "timestamp": "2026-05-04T23:26:20.810289"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24450, "epoch": 0, "train_loss": 3.778474524617195, "train_ppl": 43.749252398231356, "lr": 0.00056, "grad_norm": 0.7284, "tokens_per_sec": 132662, "dt_s": 4.94, "eta_s": 30583, "world_size": 1, "timestamp": "2026-05-04T23:26:25.750370"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24460, "epoch": 0, "train_loss": 3.8688744455575943, "train_ppl": 47.88845467224839, "lr": 0.00056, "grad_norm": 0.7173, "tokens_per_sec": 151063, "dt_s": 4.338, "eta_s": 30540, "world_size": 1, "timestamp": "2026-05-04T23:26:30.088698"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24470, "epoch": 0, "train_loss": 3.828881159424782, "train_ppl": 46.01103041733137, "lr": 0.00056, "grad_norm": 0.7039, "tokens_per_sec": 150079, "dt_s": 4.367, "eta_s": 30565, "world_size": 1, "timestamp": "2026-05-04T23:26:34.455472"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24480, "epoch": 0, "train_loss": 3.7919735461473465, "train_ppl": 44.343828567750734, "lr": 0.00056, "grad_norm": 0.6585, "tokens_per_sec": 149826, "dt_s": 4.374, "eta_s": 30539, "world_size": 1, "timestamp": "2026-05-04T23:26:38.829622"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24490, "epoch": 0, "train_loss": 3.7188905775547028, "train_ppl": 41.218639848689506, "lr": 0.00056, "grad_norm": 0.68, "tokens_per_sec": 149604, "dt_s": 4.381, "eta_s": 30546, "world_size": 1, "timestamp": "2026-05-04T23:26:43.210252"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24500, "epoch": 0, "train_loss": 3.8513534516096115, "train_ppl": 47.05670912975659, "lr": 0.00056, "grad_norm": 0.6747, "tokens_per_sec": 151268, "dt_s": 4.332, "eta_s": 29713, "world_size": 1, "timestamp": "2026-05-04T23:26:47.542667"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24510, "epoch": 0, "train_loss": 3.838323175907135, "train_ppl": 46.447524775518644, "lr": 0.00056, "grad_norm": 0.6853, "tokens_per_sec": 128526, "dt_s": 5.099, "eta_s": 29692, "world_size": 1, "timestamp": "2026-05-04T23:26:52.641705"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24520, "epoch": 0, "train_loss": 3.771880567073822, "train_ppl": 43.46172071305432, "lr": 0.00056, "grad_norm": 0.725, "tokens_per_sec": 151209, "dt_s": 4.334, "eta_s": 29643, "world_size": 1, "timestamp": "2026-05-04T23:26:56.975848"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24530, "epoch": 0, "train_loss": 3.877855747938156, "train_ppl": 48.32049259164716, "lr": 0.00056, "grad_norm": 0.7614, "tokens_per_sec": 147982, "dt_s": 4.429, "eta_s": 29713, "world_size": 1, "timestamp": "2026-05-04T23:27:01.404503"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24540, "epoch": 0, "train_loss": 3.7847415804862976, "train_ppl": 44.02429235241716, "lr": 0.00056, "grad_norm": 0.6977, "tokens_per_sec": 149695, "dt_s": 4.378, "eta_s": 29705, "world_size": 1, "timestamp": "2026-05-04T23:27:05.782455"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24550, "epoch": 0, "train_loss": 3.9771046191453934, "train_ppl": 53.362306137661456, "lr": 0.00056, "grad_norm": 0.7115, "tokens_per_sec": 150581, "dt_s": 4.352, "eta_s": 29728, "world_size": 1, "timestamp": "2026-05-04T23:27:10.134680"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24560, "epoch": 0, "train_loss": 3.6505919247865677, "train_ppl": 38.497446899131226, "lr": 0.00056, "grad_norm": 0.7347, "tokens_per_sec": 148973, "dt_s": 4.399, "eta_s": 29823, "world_size": 1, "timestamp": "2026-05-04T23:27:14.533901"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24570, "epoch": 0, "train_loss": 3.8488072752952576, "train_ppl": 46.93704685680919, "lr": 0.00056, "grad_norm": 0.7256, "tokens_per_sec": 152760, "dt_s": 4.29, "eta_s": 29759, "world_size": 1, "timestamp": "2026-05-04T23:27:18.823998"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24580, "epoch": 0, "train_loss": 3.9904442131519318, "train_ppl": 54.07890659009706, "lr": 0.00056, "grad_norm": 0.7653, "tokens_per_sec": 151659, "dt_s": 4.321, "eta_s": 29608, "world_size": 1, "timestamp": "2026-05-04T23:27:23.145290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24590, "epoch": 0, "train_loss": 3.768305853009224, "train_ppl": 43.30663484770984, "lr": 0.00056, "grad_norm": 0.674, "tokens_per_sec": 148326, "dt_s": 4.418, "eta_s": 29659, "world_size": 1, "timestamp": "2026-05-04T23:27:27.563639"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24600, "epoch": 0, "train_loss": 3.9190842658281326, "train_ppl": 50.35431249418305, "lr": 0.00056, "grad_norm": 0.921, "tokens_per_sec": 152228, "dt_s": 4.305, "eta_s": 29590, "world_size": 1, "timestamp": "2026-05-04T23:27:31.868784"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24610, "epoch": 0, "train_loss": 3.8243700563907623, "train_ppl": 45.80393737863102, "lr": 0.00056, "grad_norm": 0.6567, "tokens_per_sec": 151775, "dt_s": 4.318, "eta_s": 29475, "world_size": 1, "timestamp": "2026-05-04T23:27:36.186751"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24620, "epoch": 0, "train_loss": 3.7593407034873962, "train_ppl": 42.92011956347918, "lr": 0.00056, "grad_norm": 0.6768, "tokens_per_sec": 150137, "dt_s": 4.365, "eta_s": 29573, "world_size": 1, "timestamp": "2026-05-04T23:27:40.551860"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24630, "epoch": 0, "train_loss": 3.8950047343969345, "train_ppl": 49.15628608750315, "lr": 0.00056, "grad_norm": 0.7411, "tokens_per_sec": 151040, "dt_s": 4.339, "eta_s": 29593, "world_size": 1, "timestamp": "2026-05-04T23:27:44.890843"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24640, "epoch": 0, "train_loss": 3.757826089859009, "train_ppl": 42.855161371156136, "lr": 0.00056, "grad_norm": 0.6489, "tokens_per_sec": 149285, "dt_s": 4.39, "eta_s": 29550, "world_size": 1, "timestamp": "2026-05-04T23:27:49.280829"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24650, "epoch": 0, "train_loss": 3.7845754474401474, "train_ppl": 44.0169790701298, "lr": 0.00056, "grad_norm": 0.6726, "tokens_per_sec": 150662, "dt_s": 4.35, "eta_s": 29606, "world_size": 1, "timestamp": "2026-05-04T23:27:53.630714"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24660, "epoch": 0, "train_loss": 3.8605813533067703, "train_ppl": 47.49295353035536, "lr": 0.00056, "grad_norm": 0.7415, "tokens_per_sec": 150537, "dt_s": 4.353, "eta_s": 29650, "world_size": 1, "timestamp": "2026-05-04T23:27:57.984208"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24670, "epoch": 0, "train_loss": 3.8341639935970306, "train_ppl": 46.25474223889095, "lr": 0.00056, "grad_norm": 0.7594, "tokens_per_sec": 148606, "dt_s": 4.41, "eta_s": 29707, "world_size": 1, "timestamp": "2026-05-04T23:28:02.394247"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24680, "epoch": 0, "train_loss": 3.882928490638733, "train_ppl": 48.566232779137515, "lr": 0.00056, "grad_norm": 0.7037, "tokens_per_sec": 150605, "dt_s": 4.352, "eta_s": 29720, "world_size": 1, "timestamp": "2026-05-04T23:28:06.745743"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24690, "epoch": 0, "train_loss": 3.8995466828346252, "train_ppl": 49.380059202586175, "lr": 0.00056, "grad_norm": 0.7664, "tokens_per_sec": 152050, "dt_s": 4.31, "eta_s": 29607, "world_size": 1, "timestamp": "2026-05-04T23:28:11.055897"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24700, "epoch": 0, "train_loss": 3.7760270684957504, "train_ppl": 43.642308945727386, "lr": 0.00056, "grad_norm": 0.674, "tokens_per_sec": 149327, "dt_s": 4.389, "eta_s": 29655, "world_size": 1, "timestamp": "2026-05-04T23:28:15.444686"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24710, "epoch": 0, "train_loss": 3.7388895601034164, "train_ppl": 42.051268822573945, "lr": 0.00056, "grad_norm": 0.6362, "tokens_per_sec": 151940, "dt_s": 4.313, "eta_s": 29596, "world_size": 1, "timestamp": "2026-05-04T23:28:19.757964"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24720, "epoch": 0, "train_loss": 3.8483720421791077, "train_ppl": 46.916622744590235, "lr": 0.00056, "grad_norm": 0.7108, "tokens_per_sec": 152751, "dt_s": 4.29, "eta_s": 29429, "world_size": 1, "timestamp": "2026-05-04T23:28:24.048351"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24730, "epoch": 0, "train_loss": 3.7703239619731903, "train_ppl": 43.39412060389721, "lr": 0.00056, "grad_norm": 0.6635, "tokens_per_sec": 149960, "dt_s": 4.37, "eta_s": 29450, "world_size": 1, "timestamp": "2026-05-04T23:28:28.418553"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24740, "epoch": 0, "train_loss": 3.7940058559179306, "train_ppl": 44.434040602163456, "lr": 0.00056, "grad_norm": 0.6824, "tokens_per_sec": 137706, "dt_s": 4.759, "eta_s": 30056, "world_size": 1, "timestamp": "2026-05-04T23:28:33.177671"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24750, "epoch": 0, "train_loss": 3.8578763008117676, "train_ppl": 47.36465620165017, "lr": 0.00056, "grad_norm": 0.6628, "tokens_per_sec": 150336, "dt_s": 4.359, "eta_s": 30012, "world_size": 1, "timestamp": "2026-05-04T23:28:37.536986"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24760, "epoch": 0, "train_loss": 3.7687263935804367, "train_ppl": 43.324850874686724, "lr": 0.00056, "grad_norm": 0.6847, "tokens_per_sec": 148542, "dt_s": 4.412, "eta_s": 30141, "world_size": 1, "timestamp": "2026-05-04T23:28:41.948919"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24770, "epoch": 0, "train_loss": 3.920313686132431, "train_ppl": 50.416257178596446, "lr": 0.00056, "grad_norm": 0.6762, "tokens_per_sec": 151766, "dt_s": 4.318, "eta_s": 30175, "world_size": 1, "timestamp": "2026-05-04T23:28:46.267145"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24780, "epoch": 0, "train_loss": 3.9102879017591476, "train_ppl": 49.913320037656874, "lr": 0.00056, "grad_norm": 0.7625, "tokens_per_sec": 147722, "dt_s": 4.436, "eta_s": 30260, "world_size": 1, "timestamp": "2026-05-04T23:28:50.703581"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24790, "epoch": 0, "train_loss": 3.791080817580223, "train_ppl": 44.30425923017964, "lr": 0.00056, "grad_norm": 0.6789, "tokens_per_sec": 151099, "dt_s": 4.337, "eta_s": 29683, "world_size": 1, "timestamp": "2026-05-04T23:28:55.040849"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24800, "epoch": 0, "train_loss": 3.6882101595401764, "train_ppl": 39.973237174155706, "lr": 0.00056, "grad_norm": 0.7502, "tokens_per_sec": 149943, "dt_s": 4.371, "eta_s": 29694, "world_size": 1, "timestamp": "2026-05-04T23:28:59.411551"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24810, "epoch": 0, "train_loss": 3.76373091340065, "train_ppl": 43.10896212379334, "lr": 0.00056, "grad_norm": 0.6397, "tokens_per_sec": 147399, "dt_s": 4.446, "eta_s": 29736, "world_size": 1, "timestamp": "2026-05-04T23:29:03.857761"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24820, "epoch": 0, "train_loss": 3.7740218341350555, "train_ppl": 43.55488357172585, "lr": 0.00056, "grad_norm": 0.6883, "tokens_per_sec": 148430, "dt_s": 4.415, "eta_s": 29863, "world_size": 1, "timestamp": "2026-05-04T23:29:08.273023"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24830, "epoch": 0, "train_loss": 3.8122515231370926, "train_ppl": 45.25221065184791, "lr": 0.00056, "grad_norm": 0.6468, "tokens_per_sec": 148353, "dt_s": 4.418, "eta_s": 29833, "world_size": 1, "timestamp": "2026-05-04T23:29:12.690601"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24840, "epoch": 0, "train_loss": 3.769859567284584, "train_ppl": 43.37397328328844, "lr": 0.00056, "grad_norm": 0.7007, "tokens_per_sec": 148725, "dt_s": 4.407, "eta_s": 29923, "world_size": 1, "timestamp": "2026-05-04T23:29:17.097136"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24850, "epoch": 0, "train_loss": 3.8369209617376328, "train_ppl": 46.382441039472546, "lr": 0.00056, "grad_norm": 0.7397, "tokens_per_sec": 150927, "dt_s": 4.342, "eta_s": 29880, "world_size": 1, "timestamp": "2026-05-04T23:29:21.439381"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24860, "epoch": 0, "train_loss": 3.7785289585590363, "train_ppl": 43.75163390730887, "lr": 0.00056, "grad_norm": 0.6725, "tokens_per_sec": 150072, "dt_s": 4.367, "eta_s": 29768, "world_size": 1, "timestamp": "2026-05-04T23:29:25.806353"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24870, "epoch": 0, "train_loss": 3.911391854286194, "train_ppl": 49.96845239960165, "lr": 0.00056, "grad_norm": 0.7115, "tokens_per_sec": 149287, "dt_s": 4.39, "eta_s": 29729, "world_size": 1, "timestamp": "2026-05-04T23:29:30.196274"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24880, "epoch": 0, "train_loss": 3.780169889330864, "train_ppl": 43.82348624593907, "lr": 0.00056, "grad_norm": 0.6952, "tokens_per_sec": 152667, "dt_s": 4.293, "eta_s": 29556, "world_size": 1, "timestamp": "2026-05-04T23:29:34.489011"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24890, "epoch": 0, "train_loss": 3.9074224680662155, "train_ppl": 49.77050144501869, "lr": 0.00056, "grad_norm": 0.6965, "tokens_per_sec": 149318, "dt_s": 4.389, "eta_s": 29528, "world_size": 1, "timestamp": "2026-05-04T23:29:38.878007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24900, "epoch": 0, "train_loss": 3.822590619325638, "train_ppl": 45.7225046284452, "lr": 0.00056, "grad_norm": 0.7216, "tokens_per_sec": 149002, "dt_s": 4.398, "eta_s": 29599, "world_size": 1, "timestamp": "2026-05-04T23:29:43.276398"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24910, "epoch": 0, "train_loss": 3.86435866355896, "train_ppl": 47.672688394098294, "lr": 0.00056, "grad_norm": 0.7436, "tokens_per_sec": 151273, "dt_s": 4.332, "eta_s": 29548, "world_size": 1, "timestamp": "2026-05-04T23:29:47.608699"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24920, "epoch": 0, "train_loss": 3.877487227320671, "train_ppl": 48.30268877461842, "lr": 0.00056, "grad_norm": 0.783, "tokens_per_sec": 148798, "dt_s": 4.404, "eta_s": 29563, "world_size": 1, "timestamp": "2026-05-04T23:29:52.013013"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24930, "epoch": 0, "train_loss": 3.787688747048378, "train_ppl": 44.154230655617226, "lr": 0.00056, "grad_norm": 0.7221, "tokens_per_sec": 150461, "dt_s": 4.356, "eta_s": 29644, "world_size": 1, "timestamp": "2026-05-04T23:29:56.368688"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24940, "epoch": 0, "train_loss": 3.9562118351459503, "train_ppl": 52.2589848621546, "lr": 0.00056, "grad_norm": 0.6893, "tokens_per_sec": 150113, "dt_s": 4.366, "eta_s": 29608, "world_size": 1, "timestamp": "2026-05-04T23:30:00.734449"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24950, "epoch": 0, "train_loss": 3.737433686852455, "train_ppl": 41.99009204874616, "lr": 0.00056, "grad_norm": 0.7094, "tokens_per_sec": 149059, "dt_s": 4.397, "eta_s": 29601, "world_size": 1, "timestamp": "2026-05-04T23:30:05.131111"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24960, "epoch": 0, "train_loss": 3.7405565828084946, "train_ppl": 42.121427704455634, "lr": 0.00056, "grad_norm": 0.6794, "tokens_per_sec": 151155, "dt_s": 4.336, "eta_s": 29602, "world_size": 1, "timestamp": "2026-05-04T23:30:09.466800"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24970, "epoch": 0, "train_loss": 3.8771476298570633, "train_ppl": 48.28628808899836, "lr": 0.00056, "grad_norm": 0.7199, "tokens_per_sec": 151514, "dt_s": 4.325, "eta_s": 29491, "world_size": 1, "timestamp": "2026-05-04T23:30:13.792220"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24980, "epoch": 0, "train_loss": 3.761601760983467, "train_ppl": 43.01727421628748, "lr": 0.00056, "grad_norm": 0.6549, "tokens_per_sec": 150092, "dt_s": 4.366, "eta_s": 29501, "world_size": 1, "timestamp": "2026-05-04T23:30:18.158591"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 24990, "epoch": 0, "train_loss": 3.878999277949333, "train_ppl": 48.37578013053577, "lr": 0.00056, "grad_norm": 0.7121, "tokens_per_sec": 150922, "dt_s": 4.342, "eta_s": 29465, "world_size": 1, "timestamp": "2026-05-04T23:30:22.500962"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25000, "epoch": 0, "train_loss": 3.8468701243400574, "train_ppl": 46.8462107217148, "lr": 0.00056, "grad_norm": 0.6917, "tokens_per_sec": 150206, "dt_s": 4.363, "eta_s": 29415, "world_size": 1, "timestamp": "2026-05-04T23:30:26.864050"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25010, "epoch": 0, "train_loss": 3.8270821571350098, "train_ppl": 45.92833087888946, "lr": 0.00056, "grad_norm": 0.6802, "tokens_per_sec": 126231, "dt_s": 5.192, "eta_s": 29522, "world_size": 1, "timestamp": "2026-05-04T23:30:32.055816"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25020, "epoch": 0, "train_loss": 3.942462593317032, "train_ppl": 51.54538044312579, "lr": 0.00056, "grad_norm": 0.6955, "tokens_per_sec": 147888, "dt_s": 4.431, "eta_s": 29661, "world_size": 1, "timestamp": "2026-05-04T23:30:36.487279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25030, "epoch": 0, "train_loss": 3.766140893101692, "train_ppl": 43.21297913654507, "lr": 0.00056, "grad_norm": 0.758, "tokens_per_sec": 147781, "dt_s": 4.435, "eta_s": 29749, "world_size": 1, "timestamp": "2026-05-04T23:30:40.921936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25040, "epoch": 0, "train_loss": 3.970204159617424, "train_ppl": 52.995349245525006, "lr": 0.00056, "grad_norm": 0.7151, "tokens_per_sec": 135781, "dt_s": 4.827, "eta_s": 30399, "world_size": 1, "timestamp": "2026-05-04T23:30:45.748528"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25050, "epoch": 0, "train_loss": 3.857564613223076, "train_ppl": 47.34989552664857, "lr": 0.00056, "grad_norm": 0.7258, "tokens_per_sec": 151999, "dt_s": 4.312, "eta_s": 30325, "world_size": 1, "timestamp": "2026-05-04T23:30:50.060127"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25060, "epoch": 0, "train_loss": 3.898305594921112, "train_ppl": 49.31881222224635, "lr": 0.00056, "grad_norm": 0.7694, "tokens_per_sec": 150051, "dt_s": 4.368, "eta_s": 30253, "world_size": 1, "timestamp": "2026-05-04T23:30:54.427719"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25070, "epoch": 0, "train_loss": 3.972588464617729, "train_ppl": 53.121857078421165, "lr": 0.00056, "grad_norm": 0.7339, "tokens_per_sec": 149549, "dt_s": 4.382, "eta_s": 30182, "world_size": 1, "timestamp": "2026-05-04T23:30:58.809972"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25080, "epoch": 0, "train_loss": 3.834734618663788, "train_ppl": 46.28114388627336, "lr": 0.00056, "grad_norm": 0.7233, "tokens_per_sec": 147733, "dt_s": 4.436, "eta_s": 30179, "world_size": 1, "timestamp": "2026-05-04T23:31:03.246070"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25090, "epoch": 0, "train_loss": 3.873189613223076, "train_ppl": 48.095547883012244, "lr": 0.00056, "grad_norm": 0.6988, "tokens_per_sec": 150660, "dt_s": 4.35, "eta_s": 29531, "world_size": 1, "timestamp": "2026-05-04T23:31:07.595992"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25100, "epoch": 0, "train_loss": 3.8466990292072296, "train_ppl": 46.8381962487071, "lr": 0.00056, "grad_norm": 0.6701, "tokens_per_sec": 150525, "dt_s": 4.354, "eta_s": 29583, "world_size": 1, "timestamp": "2026-05-04T23:31:11.949837"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25110, "epoch": 0, "train_loss": 3.851992592215538, "train_ppl": 47.08679459674033, "lr": 0.00056, "grad_norm": 0.682, "tokens_per_sec": 149180, "dt_s": 4.393, "eta_s": 29613, "world_size": 1, "timestamp": "2026-05-04T23:31:16.342892"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25120, "epoch": 0, "train_loss": 3.8403356224298477, "train_ppl": 46.54109205323459, "lr": 0.00056, "grad_norm": 0.7016, "tokens_per_sec": 151751, "dt_s": 4.319, "eta_s": 29523, "world_size": 1, "timestamp": "2026-05-04T23:31:20.661559"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25130, "epoch": 0, "train_loss": 3.890516072511673, "train_ppl": 48.9361346021476, "lr": 0.00056, "grad_norm": 0.7461, "tokens_per_sec": 150717, "dt_s": 4.348, "eta_s": 29400, "world_size": 1, "timestamp": "2026-05-04T23:31:25.009843"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25140, "epoch": 0, "train_loss": 3.94794137775898, "train_ppl": 51.82856150720058, "lr": 0.00056, "grad_norm": 0.7898, "tokens_per_sec": 148636, "dt_s": 4.409, "eta_s": 29476, "world_size": 1, "timestamp": "2026-05-04T23:31:29.418996"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25150, "epoch": 0, "train_loss": 3.7678834348917007, "train_ppl": 43.288345203751646, "lr": 0.00056, "grad_norm": 0.7082, "tokens_per_sec": 150055, "dt_s": 4.367, "eta_s": 29490, "world_size": 1, "timestamp": "2026-05-04T23:31:33.786471"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25160, "epoch": 0, "train_loss": 3.8823702335357666, "train_ppl": 48.53912790117351, "lr": 0.00056, "grad_norm": 0.7062, "tokens_per_sec": 148292, "dt_s": 4.419, "eta_s": 29521, "world_size": 1, "timestamp": "2026-05-04T23:31:38.205865"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25170, "epoch": 0, "train_loss": 3.8395705074071884, "train_ppl": 46.50549638366094, "lr": 0.00056, "grad_norm": 0.6536, "tokens_per_sec": 149098, "dt_s": 4.395, "eta_s": 29620, "world_size": 1, "timestamp": "2026-05-04T23:31:42.601379"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25180, "epoch": 0, "train_loss": 3.8588214218616486, "train_ppl": 47.409442696242316, "lr": 0.00056, "grad_norm": 0.6597, "tokens_per_sec": 149212, "dt_s": 4.392, "eta_s": 29675, "world_size": 1, "timestamp": "2026-05-04T23:31:46.993549"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25190, "epoch": 0, "train_loss": 3.826661080121994, "train_ppl": 45.908995585619635, "lr": 0.00056, "grad_norm": 0.6669, "tokens_per_sec": 147259, "dt_s": 4.45, "eta_s": 29726, "world_size": 1, "timestamp": "2026-05-04T23:31:51.443895"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25200, "epoch": 0, "train_loss": 3.8203338384628296, "train_ppl": 45.619435301187565, "lr": 0.00056, "grad_norm": 0.6783, "tokens_per_sec": 152073, "dt_s": 4.31, "eta_s": 29644, "world_size": 1, "timestamp": "2026-05-04T23:31:55.753399"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25210, "epoch": 0, "train_loss": 3.7806542366743088, "train_ppl": 43.844717176240074, "lr": 0.00056, "grad_norm": 0.7403, "tokens_per_sec": 151717, "dt_s": 4.32, "eta_s": 29505, "world_size": 1, "timestamp": "2026-05-04T23:32:00.073022"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25220, "epoch": 0, "train_loss": 3.7146252542734146, "train_ppl": 41.043203437001324, "lr": 0.00056, "grad_norm": 0.6708, "tokens_per_sec": 150893, "dt_s": 4.343, "eta_s": 29430, "world_size": 1, "timestamp": "2026-05-04T23:32:04.416247"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25230, "epoch": 0, "train_loss": 3.698082610964775, "train_ppl": 40.36982544453138, "lr": 0.00056, "grad_norm": 0.7165, "tokens_per_sec": 151767, "dt_s": 4.318, "eta_s": 29326, "world_size": 1, "timestamp": "2026-05-04T23:32:08.734444"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25240, "epoch": 0, "train_loss": 3.8315079510211945, "train_ppl": 46.13205068329573, "lr": 0.00056, "grad_norm": 0.6979, "tokens_per_sec": 151549, "dt_s": 4.324, "eta_s": 29151, "world_size": 1, "timestamp": "2026-05-04T23:32:13.058869"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25250, "epoch": 0, "train_loss": 3.7652834057807922, "train_ppl": 43.1759404372142, "lr": 0.00056, "grad_norm": 0.6763, "tokens_per_sec": 150444, "dt_s": 4.356, "eta_s": 29210, "world_size": 1, "timestamp": "2026-05-04T23:32:17.415033"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25260, "epoch": 0, "train_loss": 3.777546316385269, "train_ppl": 43.708662822718004, "lr": 0.00056, "grad_norm": 0.7414, "tokens_per_sec": 153591, "dt_s": 4.267, "eta_s": 29135, "world_size": 1, "timestamp": "2026-05-04T23:32:21.681954"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25270, "epoch": 0, "train_loss": 3.8309029191732407, "train_ppl": 46.10414776534911, "lr": 0.00056, "grad_norm": 0.6664, "tokens_per_sec": 147865, "dt_s": 4.432, "eta_s": 29250, "world_size": 1, "timestamp": "2026-05-04T23:32:26.114107"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25280, "epoch": 0, "train_loss": 3.8295145630836487, "train_ppl": 46.040183204112175, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 148193, "dt_s": 4.422, "eta_s": 29386, "world_size": 1, "timestamp": "2026-05-04T23:32:30.536453"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25290, "epoch": 0, "train_loss": 3.773076206445694, "train_ppl": 43.51371633532052, "lr": 0.00056, "grad_norm": 0.6508, "tokens_per_sec": 151434, "dt_s": 4.328, "eta_s": 29386, "world_size": 1, "timestamp": "2026-05-04T23:32:34.864141"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25300, "epoch": 0, "train_loss": 3.9746421724557877, "train_ppl": 53.231065955854795, "lr": 0.00056, "grad_norm": 0.6875, "tokens_per_sec": 146010, "dt_s": 4.488, "eta_s": 29560, "world_size": 1, "timestamp": "2026-05-04T23:32:39.352603"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25310, "epoch": 0, "train_loss": 3.8988465070724487, "train_ppl": 49.345496583363825, "lr": 0.00056, "grad_norm": 0.6957, "tokens_per_sec": 146661, "dt_s": 4.469, "eta_s": 29827, "world_size": 1, "timestamp": "2026-05-04T23:32:43.821142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25320, "epoch": 0, "train_loss": 3.887552171945572, "train_ppl": 48.791307497687896, "lr": 0.00056, "grad_norm": 0.6918, "tokens_per_sec": 148591, "dt_s": 4.41, "eta_s": 29794, "world_size": 1, "timestamp": "2026-05-04T23:32:48.231670"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25330, "epoch": 0, "train_loss": 3.696962431073189, "train_ppl": 40.3246292964757, "lr": 0.00056, "grad_norm": 0.6828, "tokens_per_sec": 132429, "dt_s": 4.949, "eta_s": 30498, "world_size": 1, "timestamp": "2026-05-04T23:32:53.180399"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25340, "epoch": 0, "train_loss": 3.794617623090744, "train_ppl": 44.46123220617662, "lr": 0.00056, "grad_norm": 0.7612, "tokens_per_sec": 150490, "dt_s": 4.355, "eta_s": 30530, "world_size": 1, "timestamp": "2026-05-04T23:32:57.535246"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25350, "epoch": 0, "train_loss": 3.8305508494377136, "train_ppl": 46.08791874727935, "lr": 0.00056, "grad_norm": 0.7932, "tokens_per_sec": 148874, "dt_s": 4.402, "eta_s": 30409, "world_size": 1, "timestamp": "2026-05-04T23:33:01.937381"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25360, "epoch": 0, "train_loss": 3.8997821360826492, "train_ppl": 49.39168726679206, "lr": 0.00056, "grad_norm": 0.7282, "tokens_per_sec": 146594, "dt_s": 4.471, "eta_s": 30408, "world_size": 1, "timestamp": "2026-05-04T23:33:06.407907"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25370, "epoch": 0, "train_loss": 3.792794942855835, "train_ppl": 44.38026740590016, "lr": 0.00056, "grad_norm": 0.6718, "tokens_per_sec": 148897, "dt_s": 4.401, "eta_s": 30391, "world_size": 1, "timestamp": "2026-05-04T23:33:10.809353"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25380, "epoch": 0, "train_loss": 3.8193231523036957, "train_ppl": 45.57335166131246, "lr": 0.00056, "grad_norm": 0.6847, "tokens_per_sec": 148787, "dt_s": 4.405, "eta_s": 29654, "world_size": 1, "timestamp": "2026-05-04T23:33:15.214045"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25390, "epoch": 0, "train_loss": 3.9364716708660126, "train_ppl": 51.237499233259335, "lr": 0.00056, "grad_norm": 0.7077, "tokens_per_sec": 148923, "dt_s": 4.401, "eta_s": 29712, "world_size": 1, "timestamp": "2026-05-04T23:33:19.614689"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25400, "epoch": 0, "train_loss": 3.9159467965364456, "train_ppl": 50.19657496275261, "lr": 0.00056, "grad_norm": 0.6908, "tokens_per_sec": 150806, "dt_s": 4.346, "eta_s": 29631, "world_size": 1, "timestamp": "2026-05-04T23:33:23.960428"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25410, "epoch": 0, "train_loss": 3.826608717441559, "train_ppl": 45.906591730491385, "lr": 0.00056, "grad_norm": 0.6712, "tokens_per_sec": 148735, "dt_s": 4.406, "eta_s": 29540, "world_size": 1, "timestamp": "2026-05-04T23:33:28.366627"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25420, "epoch": 0, "train_loss": 3.7284490317106247, "train_ppl": 41.614515292633264, "lr": 0.00056, "grad_norm": 0.7115, "tokens_per_sec": 150657, "dt_s": 4.35, "eta_s": 29467, "world_size": 1, "timestamp": "2026-05-04T23:33:32.716650"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25430, "epoch": 0, "train_loss": 3.67036409676075, "train_ppl": 39.26619995414021, "lr": 0.00056, "grad_norm": 0.6539, "tokens_per_sec": 150757, "dt_s": 4.347, "eta_s": 29385, "world_size": 1, "timestamp": "2026-05-04T23:33:37.063780"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25440, "epoch": 0, "train_loss": 3.8163763731718063, "train_ppl": 45.43925473371655, "lr": 0.00056, "grad_norm": 0.698, "tokens_per_sec": 147733, "dt_s": 4.436, "eta_s": 29428, "world_size": 1, "timestamp": "2026-05-04T23:33:41.499878"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25450, "epoch": 0, "train_loss": 3.879493683576584, "train_ppl": 48.3997033018429, "lr": 0.00056, "grad_norm": 0.6943, "tokens_per_sec": 152682, "dt_s": 4.292, "eta_s": 29352, "world_size": 1, "timestamp": "2026-05-04T23:33:45.792216"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25460, "epoch": 0, "train_loss": 3.762918457388878, "train_ppl": 43.07395221228693, "lr": 0.00056, "grad_norm": 0.7517, "tokens_per_sec": 149892, "dt_s": 4.372, "eta_s": 29302, "world_size": 1, "timestamp": "2026-05-04T23:33:50.164419"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25470, "epoch": 0, "train_loss": 3.827799931168556, "train_ppl": 45.96130887615766, "lr": 0.00056, "grad_norm": 0.6909, "tokens_per_sec": 149094, "dt_s": 4.396, "eta_s": 29359, "world_size": 1, "timestamp": "2026-05-04T23:33:54.560043"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25480, "epoch": 0, "train_loss": 3.875696063041687, "train_ppl": 48.216248161685904, "lr": 0.00056, "grad_norm": 0.7411, "tokens_per_sec": 152157, "dt_s": 4.307, "eta_s": 29301, "world_size": 1, "timestamp": "2026-05-04T23:33:58.867165"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25490, "epoch": 0, "train_loss": 3.791418954730034, "train_ppl": 44.31924267920782, "lr": 0.00056, "grad_norm": 0.6402, "tokens_per_sec": 150801, "dt_s": 4.346, "eta_s": 29175, "world_size": 1, "timestamp": "2026-05-04T23:34:03.213017"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25500, "epoch": 0, "train_loss": 3.778485342860222, "train_ppl": 43.74972569083614, "lr": 0.00056, "grad_norm": 0.6921, "tokens_per_sec": 149816, "dt_s": 4.374, "eta_s": 29281, "world_size": 1, "timestamp": "2026-05-04T23:34:07.587449"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25510, "epoch": 0, "train_loss": 3.7976147681474686, "train_ppl": 44.59468886307694, "lr": 0.00056, "grad_norm": 0.6372, "tokens_per_sec": 127822, "dt_s": 5.127, "eta_s": 29240, "world_size": 1, "timestamp": "2026-05-04T23:34:12.714602"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25520, "epoch": 0, "train_loss": 3.7882553339004517, "train_ppl": 44.17925495071898, "lr": 0.00056, "grad_norm": 0.7389, "tokens_per_sec": 147161, "dt_s": 4.453, "eta_s": 29313, "world_size": 1, "timestamp": "2026-05-04T23:34:17.167973"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25530, "epoch": 0, "train_loss": 3.8161761462688446, "train_ppl": 45.43015748325587, "lr": 0.00056, "grad_norm": 0.6801, "tokens_per_sec": 149354, "dt_s": 4.388, "eta_s": 29417, "world_size": 1, "timestamp": "2026-05-04T23:34:21.555926"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25540, "epoch": 0, "train_loss": 3.841621309518814, "train_ppl": 46.60096781689413, "lr": 0.00056, "grad_norm": 0.6465, "tokens_per_sec": 152096, "dt_s": 4.309, "eta_s": 29363, "world_size": 1, "timestamp": "2026-05-04T23:34:25.864772"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25550, "epoch": 0, "train_loss": 3.746844455599785, "train_ppl": 42.38711631636106, "lr": 0.00056, "grad_norm": 0.6986, "tokens_per_sec": 148183, "dt_s": 4.423, "eta_s": 29424, "world_size": 1, "timestamp": "2026-05-04T23:34:30.287423"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25560, "epoch": 0, "train_loss": 3.848997473716736, "train_ppl": 46.94597505806818, "lr": 0.00056, "grad_norm": 0.6802, "tokens_per_sec": 151348, "dt_s": 4.33, "eta_s": 29399, "world_size": 1, "timestamp": "2026-05-04T23:34:34.617567"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25570, "epoch": 0, "train_loss": 3.8024206906557083, "train_ppl": 44.80952330737716, "lr": 0.00056, "grad_norm": 0.6705, "tokens_per_sec": 150048, "dt_s": 4.368, "eta_s": 29280, "world_size": 1, "timestamp": "2026-05-04T23:34:38.985245"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25580, "epoch": 0, "train_loss": 3.8033296167850494, "train_ppl": 44.85027036918389, "lr": 0.00056, "grad_norm": 0.7322, "tokens_per_sec": 148043, "dt_s": 4.427, "eta_s": 29328, "world_size": 1, "timestamp": "2026-05-04T23:34:43.412041"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25590, "epoch": 0, "train_loss": 3.8451197147369385, "train_ppl": 46.76428238958602, "lr": 0.00056, "grad_norm": 0.6865, "tokens_per_sec": 151879, "dt_s": 4.315, "eta_s": 29332, "world_size": 1, "timestamp": "2026-05-04T23:34:47.727052"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25600, "epoch": 0, "train_loss": 3.8112004846334457, "train_ppl": 45.204673821973046, "lr": 0.00056, "grad_norm": 0.6838, "tokens_per_sec": 149107, "dt_s": 4.395, "eta_s": 29291, "world_size": 1, "timestamp": "2026-05-04T23:34:52.122308"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25610, "epoch": 0, "train_loss": 3.8982768952846527, "train_ppl": 49.317396810575964, "lr": 0.00056, "grad_norm": 0.6937, "tokens_per_sec": 150650, "dt_s": 4.35, "eta_s": 29313, "world_size": 1, "timestamp": "2026-05-04T23:34:56.472512"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25620, "epoch": 0, "train_loss": 3.855748549103737, "train_ppl": 47.26398311516435, "lr": 0.00056, "grad_norm": 0.7412, "tokens_per_sec": 150198, "dt_s": 4.363, "eta_s": 29303, "world_size": 1, "timestamp": "2026-05-04T23:35:00.835810"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25630, "epoch": 0, "train_loss": 3.82811838388443, "train_ppl": 45.975947710558955, "lr": 0.00056, "grad_norm": 0.679, "tokens_per_sec": 133867, "dt_s": 4.896, "eta_s": 29927, "world_size": 1, "timestamp": "2026-05-04T23:35:05.731412"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25640, "epoch": 0, "train_loss": 3.7574707716703415, "train_ppl": 42.839936857776124, "lr": 0.00056, "grad_norm": 0.6312, "tokens_per_sec": 149748, "dt_s": 4.376, "eta_s": 30005, "world_size": 1, "timestamp": "2026-05-04T23:35:10.107834"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25650, "epoch": 0, "train_loss": 3.810172066092491, "train_ppl": 45.1582083943257, "lr": 0.00056, "grad_norm": 0.7033, "tokens_per_sec": 151785, "dt_s": 4.318, "eta_s": 29897, "world_size": 1, "timestamp": "2026-05-04T23:35:14.425499"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25660, "epoch": 0, "train_loss": 3.8008035719394684, "train_ppl": 44.73711954708738, "lr": 0.00056, "grad_norm": 0.7248, "tokens_per_sec": 148473, "dt_s": 4.414, "eta_s": 29978, "world_size": 1, "timestamp": "2026-05-04T23:35:18.839515"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25670, "epoch": 0, "train_loss": 3.8399978280067444, "train_ppl": 46.52537338688226, "lr": 0.00056, "grad_norm": 0.7541, "tokens_per_sec": 151132, "dt_s": 4.336, "eta_s": 29937, "world_size": 1, "timestamp": "2026-05-04T23:35:23.175849"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25680, "epoch": 0, "train_loss": 3.699318915605545, "train_ppl": 40.41976571140645, "lr": 0.00056, "grad_norm": 0.7011, "tokens_per_sec": 148940, "dt_s": 4.4, "eta_s": 29269, "world_size": 1, "timestamp": "2026-05-04T23:35:27.575999"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25690, "epoch": 0, "train_loss": 3.8683254718780518, "train_ppl": 47.86217238588115, "lr": 0.00056, "grad_norm": 0.6795, "tokens_per_sec": 148914, "dt_s": 4.401, "eta_s": 29297, "world_size": 1, "timestamp": "2026-05-04T23:35:31.976940"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25700, "epoch": 0, "train_loss": 3.890416517853737, "train_ppl": 48.931263024504716, "lr": 0.00056, "grad_norm": 0.6842, "tokens_per_sec": 150278, "dt_s": 4.361, "eta_s": 29351, "world_size": 1, "timestamp": "2026-05-04T23:35:36.337944"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25710, "epoch": 0, "train_loss": 3.869872272014618, "train_ppl": 47.936262887490635, "lr": 0.00056, "grad_norm": 0.673, "tokens_per_sec": 149027, "dt_s": 4.398, "eta_s": 29324, "world_size": 1, "timestamp": "2026-05-04T23:35:40.735556"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25720, "epoch": 0, "train_loss": 3.868141397833824, "train_ppl": 47.85336301305788, "lr": 0.00056, "grad_norm": 0.7035, "tokens_per_sec": 151286, "dt_s": 4.332, "eta_s": 29314, "world_size": 1, "timestamp": "2026-05-04T23:35:45.067429"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25730, "epoch": 0, "train_loss": 3.8128832578659058, "train_ppl": 45.28080707659853, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 151214, "dt_s": 4.334, "eta_s": 29221, "world_size": 1, "timestamp": "2026-05-04T23:35:49.401433"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25740, "epoch": 0, "train_loss": 3.729975998401642, "train_ppl": 41.6781078108203, "lr": 0.00056, "grad_norm": 0.8298, "tokens_per_sec": 149792, "dt_s": 4.375, "eta_s": 29182, "world_size": 1, "timestamp": "2026-05-04T23:35:53.776575"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25750, "epoch": 0, "train_loss": 3.7914423644542694, "train_ppl": 44.320280192601174, "lr": 0.00056, "grad_norm": 0.722, "tokens_per_sec": 151623, "dt_s": 4.322, "eta_s": 29126, "world_size": 1, "timestamp": "2026-05-04T23:35:58.098897"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25760, "epoch": 0, "train_loss": 3.6707373708486557, "train_ppl": 39.28085974500344, "lr": 0.00056, "grad_norm": 0.6511, "tokens_per_sec": 149818, "dt_s": 4.374, "eta_s": 29091, "world_size": 1, "timestamp": "2026-05-04T23:36:02.473268"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25770, "epoch": 0, "train_loss": 3.8628183603286743, "train_ppl": 47.59931452168165, "lr": 0.00056, "grad_norm": 0.7357, "tokens_per_sec": 150287, "dt_s": 4.361, "eta_s": 29125, "world_size": 1, "timestamp": "2026-05-04T23:36:06.833987"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25780, "epoch": 0, "train_loss": 3.9242033064365387, "train_ppl": 50.61273914858839, "lr": 0.00056, "grad_norm": 0.7492, "tokens_per_sec": 150818, "dt_s": 4.345, "eta_s": 29136, "world_size": 1, "timestamp": "2026-05-04T23:36:11.179386"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25790, "epoch": 0, "train_loss": 3.7760924994945526, "train_ppl": 43.64516459901485, "lr": 0.00056, "grad_norm": 0.681, "tokens_per_sec": 151292, "dt_s": 4.332, "eta_s": 29074, "world_size": 1, "timestamp": "2026-05-04T23:36:15.511107"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25800, "epoch": 0, "train_loss": 3.898803234100342, "train_ppl": 49.34336130326686, "lr": 0.00056, "grad_norm": 0.686, "tokens_per_sec": 149427, "dt_s": 4.386, "eta_s": 29154, "world_size": 1, "timestamp": "2026-05-04T23:36:19.896921"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25810, "epoch": 0, "train_loss": 3.8182533979415894, "train_ppl": 45.52462543676935, "lr": 0.00056, "grad_norm": 0.7398, "tokens_per_sec": 151054, "dt_s": 4.339, "eta_s": 29102, "world_size": 1, "timestamp": "2026-05-04T23:36:24.235500"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25820, "epoch": 0, "train_loss": 3.6725503504276276, "train_ppl": 39.352139736626036, "lr": 0.00056, "grad_norm": 0.7034, "tokens_per_sec": 149423, "dt_s": 4.386, "eta_s": 29131, "world_size": 1, "timestamp": "2026-05-04T23:36:28.621434"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25830, "epoch": 0, "train_loss": 3.7724143117666245, "train_ppl": 43.484924367687874, "lr": 0.00056, "grad_norm": 0.7313, "tokens_per_sec": 148158, "dt_s": 4.423, "eta_s": 29231, "world_size": 1, "timestamp": "2026-05-04T23:36:33.044839"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25840, "epoch": 0, "train_loss": 3.786239132285118, "train_ppl": 44.090270401063506, "lr": 0.00056, "grad_norm": 0.6822, "tokens_per_sec": 149748, "dt_s": 4.376, "eta_s": 29287, "world_size": 1, "timestamp": "2026-05-04T23:36:37.421265"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25850, "epoch": 0, "train_loss": 3.9153843224048615, "train_ppl": 50.16834862687799, "lr": 0.00056, "grad_norm": 0.7097, "tokens_per_sec": 148376, "dt_s": 4.417, "eta_s": 29324, "world_size": 1, "timestamp": "2026-05-04T23:36:41.838128"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25860, "epoch": 0, "train_loss": 3.860897660255432, "train_ppl": 47.50797825765704, "lr": 0.00056, "grad_norm": 0.6852, "tokens_per_sec": 150251, "dt_s": 4.362, "eta_s": 29350, "world_size": 1, "timestamp": "2026-05-04T23:36:46.199899"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25870, "epoch": 0, "train_loss": 3.8378767669200897, "train_ppl": 46.42679481039847, "lr": 0.00056, "grad_norm": 0.6855, "tokens_per_sec": 148834, "dt_s": 4.403, "eta_s": 29369, "world_size": 1, "timestamp": "2026-05-04T23:36:50.603214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25880, "epoch": 0, "train_loss": 3.8549757599830627, "train_ppl": 47.22747213267529, "lr": 0.00056, "grad_norm": 0.6597, "tokens_per_sec": 148122, "dt_s": 4.424, "eta_s": 29366, "world_size": 1, "timestamp": "2026-05-04T23:36:55.027677"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25890, "epoch": 0, "train_loss": 3.7927155196666718, "train_ppl": 44.37674272349948, "lr": 0.00056, "grad_norm": 0.6607, "tokens_per_sec": 151903, "dt_s": 4.314, "eta_s": 29279, "world_size": 1, "timestamp": "2026-05-04T23:36:59.342011"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25900, "epoch": 0, "train_loss": 3.8259288370609283, "train_ppl": 45.875391346898375, "lr": 0.00056, "grad_norm": 0.6974, "tokens_per_sec": 149013, "dt_s": 4.398, "eta_s": 29249, "world_size": 1, "timestamp": "2026-05-04T23:37:03.740018"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25910, "epoch": 0, "train_loss": 3.844638764858246, "train_ppl": 46.74179652136484, "lr": 0.00056, "grad_norm": 0.6639, "tokens_per_sec": 148206, "dt_s": 4.422, "eta_s": 29325, "world_size": 1, "timestamp": "2026-05-04T23:37:08.161978"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25920, "epoch": 0, "train_loss": 3.957999810576439, "train_ppl": 52.352506225145504, "lr": 0.00056, "grad_norm": 0.6731, "tokens_per_sec": 135166, "dt_s": 4.849, "eta_s": 29916, "world_size": 1, "timestamp": "2026-05-04T23:37:13.010557"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25930, "epoch": 0, "train_loss": 3.751712754368782, "train_ppl": 42.5939725730285, "lr": 0.00056, "grad_norm": 0.6671, "tokens_per_sec": 147877, "dt_s": 4.432, "eta_s": 29921, "world_size": 1, "timestamp": "2026-05-04T23:37:17.442380"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25940, "epoch": 0, "train_loss": 3.792806550860405, "train_ppl": 44.38078257523707, "lr": 0.00056, "grad_norm": 0.7259, "tokens_per_sec": 152858, "dt_s": 4.287, "eta_s": 29880, "world_size": 1, "timestamp": "2026-05-04T23:37:21.729750"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25950, "epoch": 0, "train_loss": 3.8182560950517654, "train_ppl": 45.52474822186546, "lr": 0.00056, "grad_norm": 0.7312, "tokens_per_sec": 152666, "dt_s": 4.293, "eta_s": 29735, "world_size": 1, "timestamp": "2026-05-04T23:37:26.022506"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25960, "epoch": 0, "train_loss": 3.830229327082634, "train_ppl": 46.073102833054456, "lr": 0.00056, "grad_norm": 0.6561, "tokens_per_sec": 148761, "dt_s": 4.405, "eta_s": 29709, "world_size": 1, "timestamp": "2026-05-04T23:37:30.427972"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25970, "epoch": 0, "train_loss": 3.838663101196289, "train_ppl": 46.463316147599755, "lr": 0.00056, "grad_norm": 0.6938, "tokens_per_sec": 150183, "dt_s": 4.364, "eta_s": 29058, "world_size": 1, "timestamp": "2026-05-04T23:37:34.791726"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25980, "epoch": 0, "train_loss": 3.5583741441369057, "train_ppl": 35.10607330501173, "lr": 0.00056, "grad_norm": 1.4781, "tokens_per_sec": 150604, "dt_s": 4.352, "eta_s": 28946, "world_size": 1, "timestamp": "2026-05-04T23:37:39.143291"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 25990, "epoch": 0, "train_loss": 3.776700347661972, "train_ppl": 43.67170229696151, "lr": 0.00056, "grad_norm": 0.6461, "tokens_per_sec": 148774, "dt_s": 4.405, "eta_s": 29099, "world_size": 1, "timestamp": "2026-05-04T23:37:43.548343"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26000, "epoch": 0, "train_loss": 3.728828728199005, "train_ppl": 41.630319178105566, "lr": 0.00056, "grad_norm": 0.7183, "tokens_per_sec": 151474, "dt_s": 4.327, "eta_s": 29140, "world_size": 1, "timestamp": "2026-05-04T23:37:47.874867"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26010, "epoch": 0, "train_loss": 3.8683563619852066, "train_ppl": 47.863650876350064, "lr": 0.00056, "grad_norm": 0.6548, "tokens_per_sec": 126903, "dt_s": 5.164, "eta_s": 29123, "world_size": 1, "timestamp": "2026-05-04T23:37:53.039149"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26020, "epoch": 0, "train_loss": 3.7788168489933014, "train_ppl": 43.7642313974557, "lr": 0.00056, "grad_norm": 0.7447, "tokens_per_sec": 149654, "dt_s": 4.379, "eta_s": 29139, "world_size": 1, "timestamp": "2026-05-04T23:37:57.418316"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26030, "epoch": 0, "train_loss": 3.8159100115299225, "train_ppl": 45.41806854886892, "lr": 0.00056, "grad_norm": 0.6757, "tokens_per_sec": 150097, "dt_s": 4.366, "eta_s": 29154, "world_size": 1, "timestamp": "2026-05-04T23:38:01.784550"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26040, "epoch": 0, "train_loss": 3.8863354921340942, "train_ppl": 48.73198019736271, "lr": 0.00056, "grad_norm": 0.7486, "tokens_per_sec": 146590, "dt_s": 4.471, "eta_s": 29237, "world_size": 1, "timestamp": "2026-05-04T23:38:06.255259"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26050, "epoch": 0, "train_loss": 3.7288634926080704, "train_ppl": 41.63176645670774, "lr": 0.00056, "grad_norm": 0.6514, "tokens_per_sec": 150957, "dt_s": 4.341, "eta_s": 29252, "world_size": 1, "timestamp": "2026-05-04T23:38:10.596633"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26060, "epoch": 0, "train_loss": 3.7989154905080795, "train_ppl": 44.65273191280848, "lr": 0.00056, "grad_norm": 0.7008, "tokens_per_sec": 152084, "dt_s": 4.309, "eta_s": 29132, "world_size": 1, "timestamp": "2026-05-04T23:38:14.905847"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26070, "epoch": 0, "train_loss": 3.8498872220516205, "train_ppl": 46.98776374915795, "lr": 0.00056, "grad_norm": 0.6772, "tokens_per_sec": 148598, "dt_s": 4.41, "eta_s": 29169, "world_size": 1, "timestamp": "2026-05-04T23:38:19.316098"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26080, "epoch": 0, "train_loss": 3.8015824407339096, "train_ppl": 44.77197746656648, "lr": 0.00056, "grad_norm": 0.6302, "tokens_per_sec": 152611, "dt_s": 4.294, "eta_s": 29069, "world_size": 1, "timestamp": "2026-05-04T23:38:23.610419"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26090, "epoch": 0, "train_loss": 3.8403687328100204, "train_ppl": 46.54263307199785, "lr": 0.00056, "grad_norm": 0.6745, "tokens_per_sec": 151213, "dt_s": 4.334, "eta_s": 28883, "world_size": 1, "timestamp": "2026-05-04T23:38:27.944427"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26100, "epoch": 0, "train_loss": 3.881094127893448, "train_ppl": 48.477226351044564, "lr": 0.00056, "grad_norm": 0.6585, "tokens_per_sec": 148150, "dt_s": 4.424, "eta_s": 28988, "world_size": 1, "timestamp": "2026-05-04T23:38:32.368038"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26110, "epoch": 0, "train_loss": 3.7491316199302673, "train_ppl": 42.48417356750342, "lr": 0.00056, "grad_norm": 0.8027, "tokens_per_sec": 152050, "dt_s": 4.31, "eta_s": 28985, "world_size": 1, "timestamp": "2026-05-04T23:38:36.678221"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26120, "epoch": 0, "train_loss": 3.847222685813904, "train_ppl": 46.86272980263561, "lr": 0.00056, "grad_norm": 0.7102, "tokens_per_sec": 149455, "dt_s": 4.385, "eta_s": 28947, "world_size": 1, "timestamp": "2026-05-04T23:38:41.063237"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26130, "epoch": 0, "train_loss": 3.7059613168239594, "train_ppl": 40.689143680244314, "lr": 0.00056, "grad_norm": 0.709, "tokens_per_sec": 149958, "dt_s": 4.37, "eta_s": 29044, "world_size": 1, "timestamp": "2026-05-04T23:38:45.433487"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26140, "epoch": 0, "train_loss": 3.8882421106100082, "train_ppl": 48.82498212259602, "lr": 0.00056, "grad_norm": 0.6786, "tokens_per_sec": 152619, "dt_s": 4.294, "eta_s": 28986, "world_size": 1, "timestamp": "2026-05-04T23:38:49.727587"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26150, "epoch": 0, "train_loss": 3.861868992447853, "train_ppl": 47.55414670515526, "lr": 0.00056, "grad_norm": 0.6784, "tokens_per_sec": 149476, "dt_s": 4.384, "eta_s": 28930, "world_size": 1, "timestamp": "2026-05-04T23:38:54.111965"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26160, "epoch": 0, "train_loss": 3.897830307483673, "train_ppl": 49.295377179999456, "lr": 0.00056, "grad_norm": 0.6779, "tokens_per_sec": 151427, "dt_s": 4.328, "eta_s": 28949, "world_size": 1, "timestamp": "2026-05-04T23:38:58.439863"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26170, "epoch": 0, "train_loss": 3.7026432305574417, "train_ppl": 40.55435733143203, "lr": 0.00056, "grad_norm": 0.6564, "tokens_per_sec": 151511, "dt_s": 4.325, "eta_s": 28865, "world_size": 1, "timestamp": "2026-05-04T23:39:02.765361"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26180, "epoch": 0, "train_loss": 3.8001523166894913, "train_ppl": 44.70799374830761, "lr": 0.00056, "grad_norm": 0.6609, "tokens_per_sec": 149764, "dt_s": 4.376, "eta_s": 28868, "world_size": 1, "timestamp": "2026-05-04T23:39:07.141330"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26190, "epoch": 0, "train_loss": 3.9219702780246735, "train_ppl": 50.49984555827581, "lr": 0.00056, "grad_norm": 0.678, "tokens_per_sec": 150044, "dt_s": 4.368, "eta_s": 28962, "world_size": 1, "timestamp": "2026-05-04T23:39:11.509084"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26200, "epoch": 0, "train_loss": 3.7654277682304382, "train_ppl": 43.182173871667594, "lr": 0.00056, "grad_norm": 0.6604, "tokens_per_sec": 152380, "dt_s": 4.301, "eta_s": 28847, "world_size": 1, "timestamp": "2026-05-04T23:39:15.809904"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26210, "epoch": 0, "train_loss": 3.737558364868164, "train_ppl": 41.99532761647556, "lr": 0.00056, "grad_norm": 0.6458, "tokens_per_sec": 148720, "dt_s": 4.407, "eta_s": 28947, "world_size": 1, "timestamp": "2026-05-04T23:39:20.216579"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26220, "epoch": 0, "train_loss": 3.8100178241729736, "train_ppl": 45.151243642723124, "lr": 0.00056, "grad_norm": 0.6658, "tokens_per_sec": 134168, "dt_s": 4.885, "eta_s": 29686, "world_size": 1, "timestamp": "2026-05-04T23:39:25.101231"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26230, "epoch": 0, "train_loss": 3.9024577289819717, "train_ppl": 49.52401626484941, "lr": 0.00056, "grad_norm": 0.6655, "tokens_per_sec": 149950, "dt_s": 4.371, "eta_s": 29674, "world_size": 1, "timestamp": "2026-05-04T23:39:29.471724"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26240, "epoch": 0, "train_loss": 3.805028408765793, "train_ppl": 44.92652640205089, "lr": 0.00056, "grad_norm": 0.6648, "tokens_per_sec": 148692, "dt_s": 4.407, "eta_s": 29723, "world_size": 1, "timestamp": "2026-05-04T23:39:33.879238"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26250, "epoch": 0, "train_loss": 3.8847987055778503, "train_ppl": 48.657147061342826, "lr": 0.00056, "grad_norm": 0.703, "tokens_per_sec": 151420, "dt_s": 4.328, "eta_s": 29754, "world_size": 1, "timestamp": "2026-05-04T23:39:38.207330"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26260, "epoch": 0, "train_loss": 3.846722334623337, "train_ppl": 46.83928784508039, "lr": 0.00056, "grad_norm": 0.7378, "tokens_per_sec": 148617, "dt_s": 4.41, "eta_s": 29754, "world_size": 1, "timestamp": "2026-05-04T23:39:42.617044"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26270, "epoch": 0, "train_loss": 3.7344004809856415, "train_ppl": 41.86292042156051, "lr": 0.00056, "grad_norm": 0.6671, "tokens_per_sec": 150616, "dt_s": 4.351, "eta_s": 29041, "world_size": 1, "timestamp": "2026-05-04T23:39:46.968268"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26280, "epoch": 0, "train_loss": 3.8728531897068024, "train_ppl": 48.07937013111701, "lr": 0.00056, "grad_norm": 0.7012, "tokens_per_sec": 150133, "dt_s": 4.365, "eta_s": 29030, "world_size": 1, "timestamp": "2026-05-04T23:39:51.333447"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26290, "epoch": 0, "train_loss": 3.7379631251096725, "train_ppl": 42.012329095953184, "lr": 0.00056, "grad_norm": 0.6793, "tokens_per_sec": 147580, "dt_s": 4.441, "eta_s": 29069, "world_size": 1, "timestamp": "2026-05-04T23:39:55.774202"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26300, "epoch": 0, "train_loss": 3.9087467044591904, "train_ppl": 49.83645301242392, "lr": 0.00056, "grad_norm": 0.6793, "tokens_per_sec": 151279, "dt_s": 4.332, "eta_s": 29070, "world_size": 1, "timestamp": "2026-05-04T23:40:00.106333"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26310, "epoch": 0, "train_loss": 3.917875364422798, "train_ppl": 50.293475875190374, "lr": 0.00056, "grad_norm": 0.7359, "tokens_per_sec": 151376, "dt_s": 4.329, "eta_s": 28959, "world_size": 1, "timestamp": "2026-05-04T23:40:04.435648"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26320, "epoch": 0, "train_loss": 3.9550780057907104, "train_ppl": 52.1997656696165, "lr": 0.00056, "grad_norm": 0.7101, "tokens_per_sec": 147302, "dt_s": 4.449, "eta_s": 29085, "world_size": 1, "timestamp": "2026-05-04T23:40:08.884742"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26330, "epoch": 0, "train_loss": 3.7980430722236633, "train_ppl": 44.61379304100022, "lr": 0.00056, "grad_norm": 0.7259, "tokens_per_sec": 150295, "dt_s": 4.36, "eta_s": 29074, "world_size": 1, "timestamp": "2026-05-04T23:40:13.245251"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26340, "epoch": 0, "train_loss": 3.7840777039527893, "train_ppl": 43.99507535713193, "lr": 0.00056, "grad_norm": 0.6449, "tokens_per_sec": 150094, "dt_s": 4.366, "eta_s": 28971, "world_size": 1, "timestamp": "2026-05-04T23:40:17.611571"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26350, "epoch": 0, "train_loss": 3.7971784323453903, "train_ppl": 44.57523484829366, "lr": 0.00056, "grad_norm": 0.7244, "tokens_per_sec": 149957, "dt_s": 4.37, "eta_s": 29017, "world_size": 1, "timestamp": "2026-05-04T23:40:21.981878"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26360, "epoch": 0, "train_loss": 3.768440470099449, "train_ppl": 43.31246505329433, "lr": 0.00056, "grad_norm": 0.6748, "tokens_per_sec": 149787, "dt_s": 4.375, "eta_s": 29074, "world_size": 1, "timestamp": "2026-05-04T23:40:26.357156"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26370, "epoch": 0, "train_loss": 3.8175989538431168, "train_ppl": 45.49484186122272, "lr": 0.00056, "grad_norm": 0.6873, "tokens_per_sec": 148673, "dt_s": 4.408, "eta_s": 29015, "world_size": 1, "timestamp": "2026-05-04T23:40:30.765245"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26380, "epoch": 0, "train_loss": 3.8221424221992493, "train_ppl": 45.702016524956974, "lr": 0.00056, "grad_norm": 0.6911, "tokens_per_sec": 152099, "dt_s": 4.309, "eta_s": 28942, "world_size": 1, "timestamp": "2026-05-04T23:40:35.074002"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26390, "epoch": 0, "train_loss": 3.864299923181534, "train_ppl": 47.66988816463318, "lr": 0.00056, "grad_norm": 0.6972, "tokens_per_sec": 152016, "dt_s": 4.311, "eta_s": 28865, "world_size": 1, "timestamp": "2026-05-04T23:40:39.385112"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26400, "epoch": 0, "train_loss": 3.8282528817653656, "train_ppl": 45.982131793963774, "lr": 0.00056, "grad_norm": 0.6985, "tokens_per_sec": 148758, "dt_s": 4.406, "eta_s": 28907, "world_size": 1, "timestamp": "2026-05-04T23:40:43.790667"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26410, "epoch": 0, "train_loss": 3.6477699279785156, "train_ppl": 38.38896037317457, "lr": 0.00056, "grad_norm": 0.6859, "tokens_per_sec": 149945, "dt_s": 4.371, "eta_s": 28897, "world_size": 1, "timestamp": "2026-05-04T23:40:48.161359"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26420, "epoch": 0, "train_loss": 3.8294571936130524, "train_ppl": 46.03754197893917, "lr": 0.00056, "grad_norm": 0.6791, "tokens_per_sec": 151280, "dt_s": 4.332, "eta_s": 28791, "world_size": 1, "timestamp": "2026-05-04T23:40:52.493443"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26430, "epoch": 0, "train_loss": 3.646031141281128, "train_ppl": 38.32226815812231, "lr": 0.00056, "grad_norm": 1.0165, "tokens_per_sec": 148777, "dt_s": 4.405, "eta_s": 28914, "world_size": 1, "timestamp": "2026-05-04T23:40:56.898401"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26440, "epoch": 0, "train_loss": 3.791917771100998, "train_ppl": 44.34135535762944, "lr": 0.00056, "grad_norm": 0.6606, "tokens_per_sec": 150352, "dt_s": 4.359, "eta_s": 28973, "world_size": 1, "timestamp": "2026-05-04T23:41:01.257258"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26450, "epoch": 0, "train_loss": 3.837764248251915, "train_ppl": 46.42157122315983, "lr": 0.00056, "grad_norm": 0.7053, "tokens_per_sec": 152504, "dt_s": 4.297, "eta_s": 28826, "world_size": 1, "timestamp": "2026-05-04T23:41:05.554565"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26460, "epoch": 0, "train_loss": 3.7669764012098312, "train_ppl": 43.24909901811627, "lr": 0.00056, "grad_norm": 0.6768, "tokens_per_sec": 150206, "dt_s": 4.363, "eta_s": 28811, "world_size": 1, "timestamp": "2026-05-04T23:41:09.917623"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26470, "epoch": 0, "train_loss": 3.813972994685173, "train_ppl": 45.33017813512039, "lr": 0.00056, "grad_norm": 0.7081, "tokens_per_sec": 150707, "dt_s": 4.349, "eta_s": 28829, "world_size": 1, "timestamp": "2026-05-04T23:41:14.266207"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26480, "epoch": 0, "train_loss": 3.8077510595321655, "train_ppl": 45.04901231110307, "lr": 0.00056, "grad_norm": 0.6912, "tokens_per_sec": 150587, "dt_s": 4.352, "eta_s": 28754, "world_size": 1, "timestamp": "2026-05-04T23:41:18.618253"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26490, "epoch": 0, "train_loss": 3.8420004844665527, "train_ppl": 46.61864108684953, "lr": 0.00056, "grad_norm": 0.6504, "tokens_per_sec": 152355, "dt_s": 4.302, "eta_s": 28674, "world_size": 1, "timestamp": "2026-05-04T23:41:22.919770"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26500, "epoch": 0, "train_loss": 3.8991248309612274, "train_ppl": 49.35923252529834, "lr": 0.00056, "grad_norm": 0.7067, "tokens_per_sec": 149269, "dt_s": 4.39, "eta_s": 28793, "world_size": 1, "timestamp": "2026-05-04T23:41:27.310251"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26510, "epoch": 0, "train_loss": 4.0415051728487015, "train_ppl": 56.91194067738669, "lr": 0.00056, "grad_norm": 2.8893, "tokens_per_sec": 124996, "dt_s": 5.243, "eta_s": 28926, "world_size": 1, "timestamp": "2026-05-04T23:41:32.553295"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26520, "epoch": 0, "train_loss": 3.974648043513298, "train_ppl": 53.231378479421785, "lr": 0.00056, "grad_norm": 0.7089, "tokens_per_sec": 135870, "dt_s": 4.823, "eta_s": 29550, "world_size": 1, "timestamp": "2026-05-04T23:41:37.376713"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26530, "epoch": 0, "train_loss": 3.8751745223999023, "train_ppl": 48.19110798505699, "lr": 0.00056, "grad_norm": 0.7201, "tokens_per_sec": 148539, "dt_s": 4.412, "eta_s": 29625, "world_size": 1, "timestamp": "2026-05-04T23:41:41.788763"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26540, "epoch": 0, "train_loss": 3.8697709888219833, "train_ppl": 47.931407995606044, "lr": 0.00056, "grad_norm": 0.7027, "tokens_per_sec": 151477, "dt_s": 4.326, "eta_s": 29653, "world_size": 1, "timestamp": "2026-05-04T23:41:46.115255"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26550, "epoch": 0, "train_loss": 3.73453925549984, "train_ppl": 41.8687303311293, "lr": 0.00056, "grad_norm": 0.6493, "tokens_per_sec": 151952, "dt_s": 4.313, "eta_s": 29546, "world_size": 1, "timestamp": "2026-05-04T23:41:50.428188"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26560, "epoch": 0, "train_loss": 3.8783787935972214, "train_ppl": 48.345773026375596, "lr": 0.00056, "grad_norm": 0.7759, "tokens_per_sec": 150970, "dt_s": 4.341, "eta_s": 29376, "world_size": 1, "timestamp": "2026-05-04T23:41:54.769164"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26570, "epoch": 0, "train_loss": 3.670651435852051, "train_ppl": 39.277484289491575, "lr": 0.00056, "grad_norm": 0.652, "tokens_per_sec": 151456, "dt_s": 4.327, "eta_s": 28715, "world_size": 1, "timestamp": "2026-05-04T23:41:59.096252"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26580, "epoch": 0, "train_loss": 3.8565559685230255, "train_ppl": 47.30216038342429, "lr": 0.00056, "grad_norm": 0.8592, "tokens_per_sec": 150271, "dt_s": 4.361, "eta_s": 28643, "world_size": 1, "timestamp": "2026-05-04T23:42:03.457422"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26590, "epoch": 0, "train_loss": 3.787693127989769, "train_ppl": 44.154424093137614, "lr": 0.00056, "grad_norm": 0.7492, "tokens_per_sec": 149874, "dt_s": 4.373, "eta_s": 28700, "world_size": 1, "timestamp": "2026-05-04T23:42:07.830221"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26600, "epoch": 0, "train_loss": 3.7618436366319656, "train_ppl": 43.02768030582428, "lr": 0.00056, "grad_norm": 0.7384, "tokens_per_sec": 152183, "dt_s": 4.306, "eta_s": 28687, "world_size": 1, "timestamp": "2026-05-04T23:42:12.136576"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26610, "epoch": 0, "train_loss": 3.8049192428588867, "train_ppl": 44.92162222474142, "lr": 0.00056, "grad_norm": 0.6404, "tokens_per_sec": 151125, "dt_s": 4.337, "eta_s": 28677, "world_size": 1, "timestamp": "2026-05-04T23:42:16.473120"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26620, "epoch": 0, "train_loss": 3.8296784162521362, "train_ppl": 46.04772765208189, "lr": 0.00056, "grad_norm": 0.7038, "tokens_per_sec": 149174, "dt_s": 4.393, "eta_s": 28760, "world_size": 1, "timestamp": "2026-05-04T23:42:20.866385"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26630, "epoch": 0, "train_loss": 3.7896725684404373, "train_ppl": 44.24191170595736, "lr": 0.00056, "grad_norm": 0.668, "tokens_per_sec": 152807, "dt_s": 4.289, "eta_s": 28660, "world_size": 1, "timestamp": "2026-05-04T23:42:25.155201"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26640, "epoch": 0, "train_loss": 3.7977240830659866, "train_ppl": 44.599563994313776, "lr": 0.00056, "grad_norm": 0.6839, "tokens_per_sec": 150257, "dt_s": 4.362, "eta_s": 28641, "world_size": 1, "timestamp": "2026-05-04T23:42:29.516787"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26650, "epoch": 0, "train_loss": 3.8666961938142776, "train_ppl": 47.784255090034975, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 151114, "dt_s": 4.337, "eta_s": 28677, "world_size": 1, "timestamp": "2026-05-04T23:42:33.853664"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26660, "epoch": 0, "train_loss": 3.747841954231262, "train_ppl": 42.42941850155725, "lr": 0.00056, "grad_norm": 0.7198, "tokens_per_sec": 151588, "dt_s": 4.323, "eta_s": 28655, "world_size": 1, "timestamp": "2026-05-04T23:42:38.176970"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26670, "epoch": 0, "train_loss": 3.8300458043813705, "train_ppl": 46.06464814860394, "lr": 0.00056, "grad_norm": 0.7222, "tokens_per_sec": 150256, "dt_s": 4.362, "eta_s": 28609, "world_size": 1, "timestamp": "2026-05-04T23:42:42.538583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26680, "epoch": 0, "train_loss": 3.886336788535118, "train_ppl": 48.73204337359269, "lr": 0.00056, "grad_norm": 0.6792, "tokens_per_sec": 151532, "dt_s": 4.325, "eta_s": 28652, "world_size": 1, "timestamp": "2026-05-04T23:42:46.863488"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26690, "epoch": 0, "train_loss": 3.8625082969665527, "train_ppl": 47.584558006032026, "lr": 0.00056, "grad_norm": 0.6739, "tokens_per_sec": 151942, "dt_s": 4.313, "eta_s": 28584, "world_size": 1, "timestamp": "2026-05-04T23:42:51.176703"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26700, "epoch": 0, "train_loss": 3.7461902052164078, "train_ppl": 42.35939359904892, "lr": 0.00056, "grad_norm": 0.6894, "tokens_per_sec": 149337, "dt_s": 4.388, "eta_s": 28648, "world_size": 1, "timestamp": "2026-05-04T23:42:55.565162"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26710, "epoch": 0, "train_loss": 3.8062409311532974, "train_ppl": 44.98103386018471, "lr": 0.00056, "grad_norm": 0.6875, "tokens_per_sec": 153555, "dt_s": 4.268, "eta_s": 28570, "world_size": 1, "timestamp": "2026-05-04T23:42:59.833073"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26720, "epoch": 0, "train_loss": 3.910830020904541, "train_ppl": 49.94038633997689, "lr": 0.00056, "grad_norm": 0.6895, "tokens_per_sec": 152726, "dt_s": 4.291, "eta_s": 28473, "world_size": 1, "timestamp": "2026-05-04T23:43:04.124161"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26730, "epoch": 0, "train_loss": 3.8829261660575867, "train_ppl": 48.56611988311967, "lr": 0.00056, "grad_norm": 0.6816, "tokens_per_sec": 149347, "dt_s": 4.388, "eta_s": 28552, "world_size": 1, "timestamp": "2026-05-04T23:43:08.512338"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26740, "epoch": 0, "train_loss": 3.741639718413353, "train_ppl": 42.16707563951889, "lr": 0.00056, "grad_norm": 0.6726, "tokens_per_sec": 150654, "dt_s": 4.35, "eta_s": 28596, "world_size": 1, "timestamp": "2026-05-04T23:43:12.862430"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26750, "epoch": 0, "train_loss": 3.8673764914274216, "train_ppl": 47.81677366461878, "lr": 0.00056, "grad_norm": 0.6875, "tokens_per_sec": 152430, "dt_s": 4.299, "eta_s": 28475, "world_size": 1, "timestamp": "2026-05-04T23:43:17.161842"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26760, "epoch": 0, "train_loss": 3.8501689434051514, "train_ppl": 47.00100307037293, "lr": 0.00056, "grad_norm": 0.6547, "tokens_per_sec": 149086, "dt_s": 4.396, "eta_s": 28639, "world_size": 1, "timestamp": "2026-05-04T23:43:21.557678"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26770, "epoch": 0, "train_loss": 3.8638176172971725, "train_ppl": 47.64690224063331, "lr": 0.00056, "grad_norm": 0.7145, "tokens_per_sec": 152682, "dt_s": 4.292, "eta_s": 28636, "world_size": 1, "timestamp": "2026-05-04T23:43:25.850009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26780, "epoch": 0, "train_loss": 3.7712822556495667, "train_ppl": 43.43572484662133, "lr": 0.00056, "grad_norm": 0.7115, "tokens_per_sec": 151135, "dt_s": 4.336, "eta_s": 28563, "world_size": 1, "timestamp": "2026-05-04T23:43:30.186291"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26790, "epoch": 0, "train_loss": 3.8115601539611816, "train_ppl": 45.220935480851765, "lr": 0.00056, "grad_norm": 0.6661, "tokens_per_sec": 148599, "dt_s": 4.41, "eta_s": 28638, "world_size": 1, "timestamp": "2026-05-04T23:43:34.596505"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26800, "epoch": 0, "train_loss": 3.732960283756256, "train_ppl": 41.80267295408688, "lr": 0.00056, "grad_norm": 0.6762, "tokens_per_sec": 152043, "dt_s": 4.31, "eta_s": 28648, "world_size": 1, "timestamp": "2026-05-04T23:43:38.906870"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26810, "epoch": 0, "train_loss": 3.8373696357011795, "train_ppl": 46.40325630241677, "lr": 0.00056, "grad_norm": 0.6937, "tokens_per_sec": 132808, "dt_s": 4.935, "eta_s": 29354, "world_size": 1, "timestamp": "2026-05-04T23:43:43.841529"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26820, "epoch": 0, "train_loss": 3.850406274199486, "train_ppl": 47.012159179557905, "lr": 0.00056, "grad_norm": 0.6655, "tokens_per_sec": 150855, "dt_s": 4.344, "eta_s": 29418, "world_size": 1, "timestamp": "2026-05-04T23:43:48.185839"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26830, "epoch": 0, "train_loss": 3.750425949692726, "train_ppl": 42.53919769978956, "lr": 0.00056, "grad_norm": 0.7259, "tokens_per_sec": 150603, "dt_s": 4.352, "eta_s": 29434, "world_size": 1, "timestamp": "2026-05-04T23:43:52.537402"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26840, "epoch": 0, "train_loss": 3.5977587401866913, "train_ppl": 36.51630014390603, "lr": 0.00056, "grad_norm": 0.8391, "tokens_per_sec": 147489, "dt_s": 4.443, "eta_s": 29473, "world_size": 1, "timestamp": "2026-05-04T23:43:56.980860"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26850, "epoch": 0, "train_loss": 3.744816407561302, "train_ppl": 42.30124031801756, "lr": 0.00056, "grad_norm": 0.7894, "tokens_per_sec": 150819, "dt_s": 4.345, "eta_s": 29515, "world_size": 1, "timestamp": "2026-05-04T23:44:01.326218"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26860, "epoch": 0, "train_loss": 3.8258192390203476, "train_ppl": 45.870363769407234, "lr": 0.00056, "grad_norm": 0.7129, "tokens_per_sec": 153289, "dt_s": 4.275, "eta_s": 28642, "world_size": 1, "timestamp": "2026-05-04T23:44:05.601521"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26870, "epoch": 0, "train_loss": 3.7313130497932434, "train_ppl": 41.73387085358772, "lr": 0.00056, "grad_norm": 0.6463, "tokens_per_sec": 147750, "dt_s": 4.436, "eta_s": 28758, "world_size": 1, "timestamp": "2026-05-04T23:44:10.037096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26880, "epoch": 0, "train_loss": 3.8179157227277756, "train_ppl": 45.509255494311546, "lr": 0.00056, "grad_norm": 0.6546, "tokens_per_sec": 149497, "dt_s": 4.384, "eta_s": 28796, "world_size": 1, "timestamp": "2026-05-04T23:44:14.420854"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26890, "epoch": 0, "train_loss": 3.877740517258644, "train_ppl": 48.31492490924154, "lr": 0.00056, "grad_norm": 0.657, "tokens_per_sec": 150378, "dt_s": 4.358, "eta_s": 28679, "world_size": 1, "timestamp": "2026-05-04T23:44:18.778938"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26900, "epoch": 0, "train_loss": 3.9265109300613403, "train_ppl": 50.729669164508266, "lr": 0.00056, "grad_norm": 0.6826, "tokens_per_sec": 149916, "dt_s": 4.372, "eta_s": 28709, "world_size": 1, "timestamp": "2026-05-04T23:44:23.150449"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26910, "epoch": 0, "train_loss": 3.756004273891449, "train_ppl": 42.77715822913709, "lr": 0.00056, "grad_norm": 0.747, "tokens_per_sec": 153182, "dt_s": 4.278, "eta_s": 28709, "world_size": 1, "timestamp": "2026-05-04T23:44:27.428780"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26920, "epoch": 0, "train_loss": 3.7722227871418, "train_ppl": 43.476596731361994, "lr": 0.00056, "grad_norm": 0.7337, "tokens_per_sec": 150575, "dt_s": 4.352, "eta_s": 28595, "world_size": 1, "timestamp": "2026-05-04T23:44:31.781159"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26930, "epoch": 0, "train_loss": 3.8982188552618027, "train_ppl": 49.31453451080295, "lr": 0.00056, "grad_norm": 0.7186, "tokens_per_sec": 149869, "dt_s": 4.373, "eta_s": 28576, "world_size": 1, "timestamp": "2026-05-04T23:44:36.154029"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26940, "epoch": 0, "train_loss": 3.83853317797184, "train_ppl": 46.457279875881696, "lr": 0.00056, "grad_norm": 0.6669, "tokens_per_sec": 151639, "dt_s": 4.322, "eta_s": 28524, "world_size": 1, "timestamp": "2026-05-04T23:44:40.475882"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26950, "epoch": 0, "train_loss": 3.8825392723083496, "train_ppl": 48.54733358929636, "lr": 0.00056, "grad_norm": 0.662, "tokens_per_sec": 148820, "dt_s": 4.404, "eta_s": 28562, "world_size": 1, "timestamp": "2026-05-04T23:44:44.879580"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26960, "epoch": 0, "train_loss": 3.943678095936775, "train_ppl": 51.60807208129666, "lr": 0.00056, "grad_norm": 0.6742, "tokens_per_sec": 149755, "dt_s": 4.376, "eta_s": 28687, "world_size": 1, "timestamp": "2026-05-04T23:44:49.255791"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26970, "epoch": 0, "train_loss": 3.8210980743169785, "train_ppl": 45.65431263484767, "lr": 0.00056, "grad_norm": 0.7542, "tokens_per_sec": 152241, "dt_s": 4.305, "eta_s": 28620, "world_size": 1, "timestamp": "2026-05-04T23:44:53.560578"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26980, "epoch": 0, "train_loss": 3.793569952249527, "train_ppl": 44.4146758617503, "lr": 0.00056, "grad_norm": 0.6758, "tokens_per_sec": 148273, "dt_s": 4.42, "eta_s": 28677, "world_size": 1, "timestamp": "2026-05-04T23:44:57.980522"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 26990, "epoch": 0, "train_loss": 3.822311222553253, "train_ppl": 45.70973169266828, "lr": 0.00056, "grad_norm": 0.684, "tokens_per_sec": 150685, "dt_s": 4.349, "eta_s": 28709, "world_size": 1, "timestamp": "2026-05-04T23:45:02.329698"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27000, "epoch": 0, "train_loss": 3.8728712648153305, "train_ppl": 48.08023917880414, "lr": 0.00056, "grad_norm": 0.6721, "tokens_per_sec": 151457, "dt_s": 4.327, "eta_s": 28604, "world_size": 1, "timestamp": "2026-05-04T23:45:06.656728"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27010, "epoch": 0, "train_loss": 3.835470363497734, "train_ppl": 46.31520752833453, "lr": 0.00056, "grad_norm": 0.6368, "tokens_per_sec": 125975, "dt_s": 5.202, "eta_s": 28672, "world_size": 1, "timestamp": "2026-05-04T23:45:11.859012"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27020, "epoch": 0, "train_loss": 3.8062091022729874, "train_ppl": 44.979602187026146, "lr": 0.00056, "grad_norm": 0.6532, "tokens_per_sec": 151038, "dt_s": 4.339, "eta_s": 28713, "world_size": 1, "timestamp": "2026-05-04T23:45:16.198040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27030, "epoch": 0, "train_loss": 3.780942842364311, "train_ppl": 43.857372837253735, "lr": 0.00056, "grad_norm": 0.6801, "tokens_per_sec": 146041, "dt_s": 4.488, "eta_s": 28797, "world_size": 1, "timestamp": "2026-05-04T23:45:20.685556"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27040, "epoch": 0, "train_loss": 3.8957000076770782, "train_ppl": 49.190475023720545, "lr": 0.00056, "grad_norm": 2.1093, "tokens_per_sec": 151052, "dt_s": 4.339, "eta_s": 28779, "world_size": 1, "timestamp": "2026-05-04T23:45:25.024195"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27050, "epoch": 0, "train_loss": 3.9156167805194855, "train_ppl": 50.18001202218652, "lr": 0.00056, "grad_norm": 0.7127, "tokens_per_sec": 150795, "dt_s": 4.346, "eta_s": 28799, "world_size": 1, "timestamp": "2026-05-04T23:45:29.370205"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27060, "epoch": 0, "train_loss": 3.7522736340761185, "train_ppl": 42.61786936888702, "lr": 0.00056, "grad_norm": 0.6753, "tokens_per_sec": 148108, "dt_s": 4.425, "eta_s": 28786, "world_size": 1, "timestamp": "2026-05-04T23:45:33.795073"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27070, "epoch": 0, "train_loss": 3.7411992847919464, "train_ppl": 42.14850793091258, "lr": 0.00056, "grad_norm": 0.6641, "tokens_per_sec": 151054, "dt_s": 4.339, "eta_s": 28781, "world_size": 1, "timestamp": "2026-05-04T23:45:38.133671"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27080, "epoch": 0, "train_loss": 3.8268485218286514, "train_ppl": 45.91760165264588, "lr": 0.00056, "grad_norm": 0.6907, "tokens_per_sec": 151114, "dt_s": 4.337, "eta_s": 28579, "world_size": 1, "timestamp": "2026-05-04T23:45:42.470500"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27090, "epoch": 0, "train_loss": 3.762937605381012, "train_ppl": 43.07477699988157, "lr": 0.00056, "grad_norm": 0.682, "tokens_per_sec": 146601, "dt_s": 4.47, "eta_s": 28748, "world_size": 1, "timestamp": "2026-05-04T23:45:46.940901"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27100, "epoch": 0, "train_loss": 3.663004294037819, "train_ppl": 38.9782693247999, "lr": 0.00056, "grad_norm": 0.7902, "tokens_per_sec": 149973, "dt_s": 4.37, "eta_s": 28774, "world_size": 1, "timestamp": "2026-05-04T23:45:51.310704"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27110, "epoch": 0, "train_loss": 3.7941471934318542, "train_ppl": 44.440321242830365, "lr": 0.00056, "grad_norm": 0.6908, "tokens_per_sec": 135212, "dt_s": 4.847, "eta_s": 29324, "world_size": 1, "timestamp": "2026-05-04T23:45:56.157642"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27120, "epoch": 0, "train_loss": 3.7493910789489746, "train_ppl": 42.495197899607184, "lr": 0.00056, "grad_norm": 0.7177, "tokens_per_sec": 150950, "dt_s": 4.342, "eta_s": 29323, "world_size": 1, "timestamp": "2026-05-04T23:46:00.499241"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27130, "epoch": 0, "train_loss": 3.68470698595047, "train_ppl": 39.83344897950728, "lr": 0.00056, "grad_norm": 0.7273, "tokens_per_sec": 150919, "dt_s": 4.342, "eta_s": 29326, "world_size": 1, "timestamp": "2026-05-04T23:46:04.841685"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27140, "epoch": 0, "train_loss": 3.8911606669425964, "train_ppl": 48.967688730698264, "lr": 0.00056, "grad_norm": 0.678, "tokens_per_sec": 150846, "dt_s": 4.345, "eta_s": 29157, "world_size": 1, "timestamp": "2026-05-04T23:46:09.186257"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27150, "epoch": 0, "train_loss": 3.8671732544898987, "train_ppl": 47.80705651745202, "lr": 0.00056, "grad_norm": 0.7211, "tokens_per_sec": 151355, "dt_s": 4.33, "eta_s": 29100, "world_size": 1, "timestamp": "2026-05-04T23:46:13.516220"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27160, "epoch": 0, "train_loss": 3.7221461683511734, "train_ppl": 41.353049545994786, "lr": 0.00056, "grad_norm": 0.7104, "tokens_per_sec": 151818, "dt_s": 4.317, "eta_s": 28401, "world_size": 1, "timestamp": "2026-05-04T23:46:17.832953"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27170, "epoch": 0, "train_loss": 3.6738473773002625, "train_ppl": 39.403213634308976, "lr": 0.00056, "grad_norm": 0.6713, "tokens_per_sec": 149561, "dt_s": 4.382, "eta_s": 28449, "world_size": 1, "timestamp": "2026-05-04T23:46:22.214847"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27180, "epoch": 0, "train_loss": 3.7281276136636734, "train_ppl": 41.60114178576144, "lr": 0.00056, "grad_norm": 0.618, "tokens_per_sec": 151517, "dt_s": 4.325, "eta_s": 28422, "world_size": 1, "timestamp": "2026-05-04T23:46:26.540195"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27190, "epoch": 0, "train_loss": 3.7130865156650543, "train_ppl": 40.98009723968461, "lr": 0.00056, "grad_norm": 0.65, "tokens_per_sec": 150647, "dt_s": 4.35, "eta_s": 28425, "world_size": 1, "timestamp": "2026-05-04T23:46:30.890461"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27200, "epoch": 0, "train_loss": 3.883436366915703, "train_ppl": 48.59090468123441, "lr": 0.00056, "grad_norm": 0.6843, "tokens_per_sec": 151565, "dt_s": 4.324, "eta_s": 28413, "world_size": 1, "timestamp": "2026-05-04T23:46:35.214429"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27210, "epoch": 0, "train_loss": 3.861807629466057, "train_ppl": 47.55122873044539, "lr": 0.00056, "grad_norm": 0.7435, "tokens_per_sec": 151590, "dt_s": 4.323, "eta_s": 28417, "world_size": 1, "timestamp": "2026-05-04T23:46:39.537680"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27220, "epoch": 0, "train_loss": 3.8830703496932983, "train_ppl": 48.5731228276997, "lr": 0.00056, "grad_norm": 0.7231, "tokens_per_sec": 150013, "dt_s": 4.369, "eta_s": 28396, "world_size": 1, "timestamp": "2026-05-04T23:46:43.906411"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27230, "epoch": 0, "train_loss": 3.7695246040821075, "train_ppl": 43.3594470313091, "lr": 0.00056, "grad_norm": 0.6411, "tokens_per_sec": 149366, "dt_s": 4.388, "eta_s": 28473, "world_size": 1, "timestamp": "2026-05-04T23:46:48.293991"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27240, "epoch": 0, "train_loss": 4.024950101971626, "train_ppl": 55.977515544297276, "lr": 0.00056, "grad_norm": 0.7538, "tokens_per_sec": 150207, "dt_s": 4.363, "eta_s": 28485, "world_size": 1, "timestamp": "2026-05-04T23:46:52.657029"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27250, "epoch": 0, "train_loss": 3.84275421500206, "train_ppl": 46.65379222573977, "lr": 0.00056, "grad_norm": 0.7293, "tokens_per_sec": 148119, "dt_s": 4.425, "eta_s": 28613, "world_size": 1, "timestamp": "2026-05-04T23:46:57.081564"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27260, "epoch": 0, "train_loss": 3.8555061519145966, "train_ppl": 47.25252784692816, "lr": 0.00056, "grad_norm": 0.6679, "tokens_per_sec": 151375, "dt_s": 4.329, "eta_s": 28616, "world_size": 1, "timestamp": "2026-05-04T23:47:01.410961"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27270, "epoch": 0, "train_loss": 3.76548470556736, "train_ppl": 43.184632619646955, "lr": 0.00056, "grad_norm": 0.6813, "tokens_per_sec": 151306, "dt_s": 4.331, "eta_s": 28563, "world_size": 1, "timestamp": "2026-05-04T23:47:05.742336"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27280, "epoch": 0, "train_loss": 3.888159230351448, "train_ppl": 48.82093566314162, "lr": 0.00056, "grad_norm": 0.6855, "tokens_per_sec": 149159, "dt_s": 4.394, "eta_s": 28567, "world_size": 1, "timestamp": "2026-05-04T23:47:10.136009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27290, "epoch": 0, "train_loss": 4.089883998036385, "train_ppl": 59.73296216132941, "lr": 0.00056, "grad_norm": 0.7432, "tokens_per_sec": 151241, "dt_s": 4.333, "eta_s": 28523, "world_size": 1, "timestamp": "2026-05-04T23:47:14.469220"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27300, "epoch": 0, "train_loss": 3.8621175438165665, "train_ppl": 47.5659678224234, "lr": 0.00056, "grad_norm": 0.7578, "tokens_per_sec": 151507, "dt_s": 4.326, "eta_s": 28390, "world_size": 1, "timestamp": "2026-05-04T23:47:18.794814"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27310, "epoch": 0, "train_loss": 3.9429610073566437, "train_ppl": 51.5710777878425, "lr": 0.00056, "grad_norm": 0.6875, "tokens_per_sec": 150126, "dt_s": 4.365, "eta_s": 28432, "world_size": 1, "timestamp": "2026-05-04T23:47:23.160226"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27320, "epoch": 0, "train_loss": 3.7698008567094803, "train_ppl": 43.371426847124546, "lr": 0.00056, "grad_norm": 0.6673, "tokens_per_sec": 152613, "dt_s": 4.294, "eta_s": 28379, "world_size": 1, "timestamp": "2026-05-04T23:47:27.454477"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27330, "epoch": 0, "train_loss": 3.80305752158165, "train_ppl": 44.8380684858575, "lr": 0.00056, "grad_norm": 0.6653, "tokens_per_sec": 149629, "dt_s": 4.38, "eta_s": 28357, "world_size": 1, "timestamp": "2026-05-04T23:47:31.834383"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27340, "epoch": 0, "train_loss": 3.7805119156837463, "train_ppl": 43.83847759668273, "lr": 0.00056, "grad_norm": 0.6955, "tokens_per_sec": 150275, "dt_s": 4.361, "eta_s": 28389, "world_size": 1, "timestamp": "2026-05-04T23:47:36.195446"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27350, "epoch": 0, "train_loss": 3.8667313158512115, "train_ppl": 47.78593339987977, "lr": 0.00056, "grad_norm": 0.7507, "tokens_per_sec": 150549, "dt_s": 4.353, "eta_s": 28421, "world_size": 1, "timestamp": "2026-05-04T23:47:40.548591"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27360, "epoch": 0, "train_loss": 3.663209244608879, "train_ppl": 38.986258762048855, "lr": 0.00056, "grad_norm": 0.7396, "tokens_per_sec": 147031, "dt_s": 4.457, "eta_s": 28536, "world_size": 1, "timestamp": "2026-05-04T23:47:45.005893"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27370, "epoch": 0, "train_loss": 3.7022745460271835, "train_ppl": 40.53940832315275, "lr": 0.00056, "grad_norm": 0.7199, "tokens_per_sec": 151009, "dt_s": 4.34, "eta_s": 28592, "world_size": 1, "timestamp": "2026-05-04T23:47:49.345750"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27380, "epoch": 0, "train_loss": 3.922583296895027, "train_ppl": 50.5308124072146, "lr": 0.00056, "grad_norm": 0.7199, "tokens_per_sec": 152222, "dt_s": 4.305, "eta_s": 28490, "world_size": 1, "timestamp": "2026-05-04T23:47:53.651043"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27390, "epoch": 0, "train_loss": 3.6494646966457367, "train_ppl": 38.454075942710084, "lr": 0.00056, "grad_norm": 1.1767, "tokens_per_sec": 148185, "dt_s": 4.423, "eta_s": 28566, "world_size": 1, "timestamp": "2026-05-04T23:47:58.073638"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27400, "epoch": 0, "train_loss": 3.750845894217491, "train_ppl": 42.55706555444276, "lr": 0.00056, "grad_norm": 0.6825, "tokens_per_sec": 136510, "dt_s": 4.801, "eta_s": 29146, "world_size": 1, "timestamp": "2026-05-04T23:48:02.874442"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27410, "epoch": 0, "train_loss": 3.8726489692926407, "train_ppl": 48.06955234476652, "lr": 0.00056, "grad_norm": 0.6343, "tokens_per_sec": 152553, "dt_s": 4.296, "eta_s": 28931, "world_size": 1, "timestamp": "2026-05-04T23:48:07.170414"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27420, "epoch": 0, "train_loss": 3.9146310538053513, "train_ppl": 50.1305726146962, "lr": 0.00056, "grad_norm": 0.6613, "tokens_per_sec": 149528, "dt_s": 4.383, "eta_s": 28983, "world_size": 1, "timestamp": "2026-05-04T23:48:11.553287"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27430, "epoch": 0, "train_loss": 3.75407512485981, "train_ppl": 42.69471426467891, "lr": 0.00056, "grad_norm": 0.8256, "tokens_per_sec": 151447, "dt_s": 4.327, "eta_s": 29007, "world_size": 1, "timestamp": "2026-05-04T23:48:15.880623"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27440, "epoch": 0, "train_loss": 3.812781736254692, "train_ppl": 45.2762103294456, "lr": 0.00056, "grad_norm": 0.7123, "tokens_per_sec": 150977, "dt_s": 4.341, "eta_s": 28896, "world_size": 1, "timestamp": "2026-05-04T23:48:20.221404"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27450, "epoch": 0, "train_loss": 3.7709833830595016, "train_ppl": 43.42274503878608, "lr": 0.00056, "grad_norm": 0.7144, "tokens_per_sec": 150999, "dt_s": 4.34, "eta_s": 28290, "world_size": 1, "timestamp": "2026-05-04T23:48:24.561528"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27460, "epoch": 0, "train_loss": 3.7902027517557144, "train_ppl": 44.26537424855453, "lr": 0.00056, "grad_norm": 0.6276, "tokens_per_sec": 152606, "dt_s": 4.294, "eta_s": 28284, "world_size": 1, "timestamp": "2026-05-04T23:48:28.855996"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27470, "epoch": 0, "train_loss": 3.8400867730379105, "train_ppl": 46.52951177170982, "lr": 0.00056, "grad_norm": 0.6833, "tokens_per_sec": 149778, "dt_s": 4.376, "eta_s": 28270, "world_size": 1, "timestamp": "2026-05-04T23:48:33.231504"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27480, "epoch": 0, "train_loss": 3.7878499627113342, "train_ppl": 44.161349583010576, "lr": 0.00056, "grad_norm": 0.7148, "tokens_per_sec": 148047, "dt_s": 4.427, "eta_s": 28395, "world_size": 1, "timestamp": "2026-05-04T23:48:37.658218"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27490, "epoch": 0, "train_loss": 3.844904199242592, "train_ppl": 46.754205048099685, "lr": 0.00056, "grad_norm": 0.7034, "tokens_per_sec": 151473, "dt_s": 4.327, "eta_s": 28372, "world_size": 1, "timestamp": "2026-05-04T23:48:41.984785"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27500, "epoch": 0, "train_loss": 3.8614393770694733, "train_ppl": 47.533721100316136, "lr": 0.00056, "grad_norm": 0.6866, "tokens_per_sec": 146432, "dt_s": 4.476, "eta_s": 28545, "world_size": 1, "timestamp": "2026-05-04T23:48:46.460336"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27510, "epoch": 0, "train_loss": 3.777071952819824, "train_ppl": 43.68793394248206, "lr": 0.00056, "grad_norm": 0.725, "tokens_per_sec": 127254, "dt_s": 5.15, "eta_s": 28645, "world_size": 1, "timestamp": "2026-05-04T23:48:51.610309"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27520, "epoch": 0, "train_loss": 3.737163409590721, "train_ppl": 41.97874461519298, "lr": 0.00056, "grad_norm": 0.6649, "tokens_per_sec": 149812, "dt_s": 4.375, "eta_s": 28640, "world_size": 1, "timestamp": "2026-05-04T23:48:55.984852"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27530, "epoch": 0, "train_loss": 3.839229568839073, "train_ppl": 46.48964356888595, "lr": 0.00056, "grad_norm": 0.7058, "tokens_per_sec": 145226, "dt_s": 4.513, "eta_s": 28747, "world_size": 1, "timestamp": "2026-05-04T23:49:00.497528"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27540, "epoch": 0, "train_loss": 3.8106409162282944, "train_ppl": 45.17938579058863, "lr": 0.00056, "grad_norm": 0.6362, "tokens_per_sec": 152325, "dt_s": 4.302, "eta_s": 28711, "world_size": 1, "timestamp": "2026-05-04T23:49:04.799953"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27550, "epoch": 0, "train_loss": 3.821387752890587, "train_ppl": 45.66753962670638, "lr": 0.00056, "grad_norm": 0.7118, "tokens_per_sec": 151083, "dt_s": 4.338, "eta_s": 28527, "world_size": 1, "timestamp": "2026-05-04T23:49:09.137665"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27560, "epoch": 0, "train_loss": 3.7209471315145493, "train_ppl": 41.30349543082409, "lr": 0.00056, "grad_norm": 0.6353, "tokens_per_sec": 148830, "dt_s": 4.403, "eta_s": 28560, "world_size": 1, "timestamp": "2026-05-04T23:49:13.541108"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27570, "epoch": 0, "train_loss": 3.819361060857773, "train_ppl": 45.57507931392459, "lr": 0.00056, "grad_norm": 0.7146, "tokens_per_sec": 153256, "dt_s": 4.276, "eta_s": 28427, "world_size": 1, "timestamp": "2026-05-04T23:49:17.817344"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27580, "epoch": 0, "train_loss": 3.7813344448804855, "train_ppl": 43.874550858068154, "lr": 0.00056, "grad_norm": 0.7166, "tokens_per_sec": 150559, "dt_s": 4.353, "eta_s": 28215, "world_size": 1, "timestamp": "2026-05-04T23:49:22.170203"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27590, "epoch": 0, "train_loss": 3.8038632422685623, "train_ppl": 44.87421000322664, "lr": 0.00056, "grad_norm": 0.7166, "tokens_per_sec": 148627, "dt_s": 4.409, "eta_s": 28350, "world_size": 1, "timestamp": "2026-05-04T23:49:26.579604"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27600, "epoch": 0, "train_loss": 3.857920318841934, "train_ppl": 47.36674114640293, "lr": 0.00056, "grad_norm": 0.689, "tokens_per_sec": 148852, "dt_s": 4.403, "eta_s": 28430, "world_size": 1, "timestamp": "2026-05-04T23:49:30.982394"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27610, "epoch": 0, "train_loss": 3.792079269886017, "train_ppl": 44.34851701092938, "lr": 0.00056, "grad_norm": 0.7181, "tokens_per_sec": 145337, "dt_s": 4.509, "eta_s": 28564, "world_size": 1, "timestamp": "2026-05-04T23:49:35.491635"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27620, "epoch": 0, "train_loss": 3.8826687186956406, "train_ppl": 48.55361827299813, "lr": 0.00056, "grad_norm": 0.6422, "tokens_per_sec": 148545, "dt_s": 4.412, "eta_s": 28736, "world_size": 1, "timestamp": "2026-05-04T23:49:39.903488"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27630, "epoch": 0, "train_loss": 3.861101418733597, "train_ppl": 47.51765939728114, "lr": 0.00056, "grad_norm": 0.9261, "tokens_per_sec": 147833, "dt_s": 4.433, "eta_s": 28836, "world_size": 1, "timestamp": "2026-05-04T23:49:44.336581"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27640, "epoch": 0, "train_loss": 3.8000338971614838, "train_ppl": 44.70269976225162, "lr": 0.00056, "grad_norm": 0.6989, "tokens_per_sec": 148342, "dt_s": 4.418, "eta_s": 28842, "world_size": 1, "timestamp": "2026-05-04T23:49:48.754484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27650, "epoch": 0, "train_loss": 3.8087622672319412, "train_ppl": 45.094589259216086, "lr": 0.00056, "grad_norm": 0.6794, "tokens_per_sec": 150599, "dt_s": 4.352, "eta_s": 28771, "world_size": 1, "timestamp": "2026-05-04T23:49:53.106163"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27660, "epoch": 0, "train_loss": 3.747675433754921, "train_ppl": 42.42235372280898, "lr": 0.00056, "grad_norm": 0.6676, "tokens_per_sec": 149773, "dt_s": 4.376, "eta_s": 28593, "world_size": 1, "timestamp": "2026-05-04T23:49:57.481877"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27670, "epoch": 0, "train_loss": 3.8655065447092056, "train_ppl": 47.72744239401749, "lr": 0.00056, "grad_norm": 0.6892, "tokens_per_sec": 149512, "dt_s": 4.383, "eta_s": 28552, "world_size": 1, "timestamp": "2026-05-04T23:50:01.865205"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27680, "epoch": 0, "train_loss": 3.7590431571006775, "train_ppl": 42.90735073673886, "lr": 0.00056, "grad_norm": 0.7154, "tokens_per_sec": 152216, "dt_s": 4.305, "eta_s": 28382, "world_size": 1, "timestamp": "2026-05-04T23:50:06.170646"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27690, "epoch": 0, "train_loss": 3.9200644940137863, "train_ppl": 50.40369540986786, "lr": 0.00056, "grad_norm": 0.7049, "tokens_per_sec": 148748, "dt_s": 4.406, "eta_s": 28362, "world_size": 1, "timestamp": "2026-05-04T23:50:10.576504"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27700, "epoch": 0, "train_loss": 3.799461245536804, "train_ppl": 44.677108016881846, "lr": 0.00056, "grad_norm": 0.6415, "tokens_per_sec": 133786, "dt_s": 4.899, "eta_s": 29068, "world_size": 1, "timestamp": "2026-05-04T23:50:15.475074"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27710, "epoch": 0, "train_loss": 3.7585262954235077, "train_ppl": 42.885179301750284, "lr": 0.00056, "grad_norm": 0.6624, "tokens_per_sec": 150564, "dt_s": 4.353, "eta_s": 29033, "world_size": 1, "timestamp": "2026-05-04T23:50:19.827780"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27720, "epoch": 0, "train_loss": 3.8971146792173386, "train_ppl": 49.26011263434764, "lr": 0.00056, "grad_norm": 0.6698, "tokens_per_sec": 148023, "dt_s": 4.427, "eta_s": 29086, "world_size": 1, "timestamp": "2026-05-04T23:50:24.255194"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27730, "epoch": 0, "train_loss": 3.977542459964752, "train_ppl": 53.38567544914956, "lr": 0.00056, "grad_norm": 0.7799, "tokens_per_sec": 149342, "dt_s": 4.388, "eta_s": 29189, "world_size": 1, "timestamp": "2026-05-04T23:50:28.643568"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27740, "epoch": 0, "train_loss": 3.7130157500505447, "train_ppl": 40.97719736052785, "lr": 0.00056, "grad_norm": 0.6555, "tokens_per_sec": 150000, "dt_s": 4.369, "eta_s": 29137, "world_size": 1, "timestamp": "2026-05-04T23:50:33.012568"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27750, "epoch": 0, "train_loss": 3.793926015496254, "train_ppl": 44.43049311124346, "lr": 0.00056, "grad_norm": 0.6623, "tokens_per_sec": 148162, "dt_s": 4.423, "eta_s": 28515, "world_size": 1, "timestamp": "2026-05-04T23:50:37.435826"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27760, "epoch": 0, "train_loss": 3.888167902827263, "train_ppl": 48.82135906336138, "lr": 0.00056, "grad_norm": 0.7986, "tokens_per_sec": 150202, "dt_s": 4.363, "eta_s": 28524, "world_size": 1, "timestamp": "2026-05-04T23:50:41.799022"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27770, "epoch": 0, "train_loss": 3.8340003341436386, "train_ppl": 46.2471728324787, "lr": 0.00056, "grad_norm": 0.6947, "tokens_per_sec": 148826, "dt_s": 4.404, "eta_s": 28489, "world_size": 1, "timestamp": "2026-05-04T23:50:46.202574"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27780, "epoch": 0, "train_loss": 3.753363072872162, "train_ppl": 42.664324229451196, "lr": 0.00056, "grad_norm": 0.6453, "tokens_per_sec": 147746, "dt_s": 4.436, "eta_s": 28546, "world_size": 1, "timestamp": "2026-05-04T23:50:50.638303"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27790, "epoch": 0, "train_loss": 3.7198000848293304, "train_ppl": 41.256145554753, "lr": 0.00056, "grad_norm": 0.6494, "tokens_per_sec": 148578, "dt_s": 4.411, "eta_s": 28596, "world_size": 1, "timestamp": "2026-05-04T23:50:55.049167"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27800, "epoch": 0, "train_loss": 3.774757757782936, "train_ppl": 43.5869484377264, "lr": 0.00056, "grad_norm": 0.7071, "tokens_per_sec": 146523, "dt_s": 4.473, "eta_s": 28656, "world_size": 1, "timestamp": "2026-05-04T23:50:59.521911"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27810, "epoch": 0, "train_loss": 3.711117148399353, "train_ppl": 40.89947179425022, "lr": 0.00056, "grad_norm": 0.7112, "tokens_per_sec": 150439, "dt_s": 4.356, "eta_s": 28643, "world_size": 1, "timestamp": "2026-05-04T23:51:03.878265"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27820, "epoch": 0, "train_loss": 3.8026283383369446, "train_ppl": 44.81882886709475, "lr": 0.00056, "grad_norm": 0.7328, "tokens_per_sec": 151004, "dt_s": 4.34, "eta_s": 28556, "world_size": 1, "timestamp": "2026-05-04T23:51:08.218254"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27830, "epoch": 0, "train_loss": 3.8465163707733154, "train_ppl": 46.82964163844259, "lr": 0.00056, "grad_norm": 0.9462, "tokens_per_sec": 148275, "dt_s": 4.42, "eta_s": 28531, "world_size": 1, "timestamp": "2026-05-04T23:51:12.638121"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27840, "epoch": 0, "train_loss": 3.859036698937416, "train_ppl": 47.41964996108537, "lr": 0.00056, "grad_norm": 0.6892, "tokens_per_sec": 152029, "dt_s": 4.311, "eta_s": 28397, "world_size": 1, "timestamp": "2026-05-04T23:51:16.948867"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27850, "epoch": 0, "train_loss": 3.652818113565445, "train_ppl": 38.5832449493278, "lr": 0.00056, "grad_norm": 0.779, "tokens_per_sec": 149734, "dt_s": 4.377, "eta_s": 28268, "world_size": 1, "timestamp": "2026-05-04T23:51:21.325685"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27860, "epoch": 0, "train_loss": 3.823444828391075, "train_ppl": 45.76157789238557, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 151176, "dt_s": 4.335, "eta_s": 28236, "world_size": 1, "timestamp": "2026-05-04T23:51:25.660783"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27870, "epoch": 0, "train_loss": 3.7910461723804474, "train_ppl": 44.302724326856364, "lr": 0.00056, "grad_norm": 0.6531, "tokens_per_sec": 150958, "dt_s": 4.341, "eta_s": 28234, "world_size": 1, "timestamp": "2026-05-04T23:51:30.002101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27880, "epoch": 0, "train_loss": 3.800393968820572, "train_ppl": 44.71879883575685, "lr": 0.00056, "grad_norm": 0.6921, "tokens_per_sec": 150086, "dt_s": 4.367, "eta_s": 28160, "world_size": 1, "timestamp": "2026-05-04T23:51:34.368645"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27890, "epoch": 0, "train_loss": 3.894733354449272, "train_ppl": 49.14294786710208, "lr": 0.00056, "grad_norm": 0.6885, "tokens_per_sec": 153170, "dt_s": 4.279, "eta_s": 28114, "world_size": 1, "timestamp": "2026-05-04T23:51:38.647301"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27900, "epoch": 0, "train_loss": 3.906917229294777, "train_ppl": 49.745361809308825, "lr": 0.00056, "grad_norm": 0.7388, "tokens_per_sec": 151426, "dt_s": 4.328, "eta_s": 28046, "world_size": 1, "timestamp": "2026-05-04T23:51:42.975225"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27910, "epoch": 0, "train_loss": 3.8930739760398865, "train_ppl": 49.06146874148509, "lr": 0.00056, "grad_norm": 0.6748, "tokens_per_sec": 149413, "dt_s": 4.386, "eta_s": 28108, "world_size": 1, "timestamp": "2026-05-04T23:51:47.361433"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27920, "epoch": 0, "train_loss": 3.7095059752464294, "train_ppl": 40.83362871986683, "lr": 0.00056, "grad_norm": 0.636, "tokens_per_sec": 151109, "dt_s": 4.337, "eta_s": 28098, "world_size": 1, "timestamp": "2026-05-04T23:51:51.698436"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27930, "epoch": 0, "train_loss": 3.8530904352664948, "train_ppl": 47.1385168932672, "lr": 0.00056, "grad_norm": 0.7027, "tokens_per_sec": 150614, "dt_s": 4.351, "eta_s": 28074, "world_size": 1, "timestamp": "2026-05-04T23:51:56.049712"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27940, "epoch": 0, "train_loss": 3.949894815683365, "train_ppl": 51.92990433605136, "lr": 0.00056, "grad_norm": 0.6823, "tokens_per_sec": 149337, "dt_s": 4.388, "eta_s": 28212, "world_size": 1, "timestamp": "2026-05-04T23:52:00.438172"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27950, "epoch": 0, "train_loss": 3.7483865916728973, "train_ppl": 42.452533445560604, "lr": 0.00056, "grad_norm": 0.6329, "tokens_per_sec": 151756, "dt_s": 4.319, "eta_s": 28195, "world_size": 1, "timestamp": "2026-05-04T23:52:04.756705"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27960, "epoch": 0, "train_loss": 3.7036151438951492, "train_ppl": 40.5937918125695, "lr": 0.00056, "grad_norm": 0.7358, "tokens_per_sec": 149415, "dt_s": 4.386, "eta_s": 28191, "world_size": 1, "timestamp": "2026-05-04T23:52:09.142875"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27970, "epoch": 0, "train_loss": 3.739505097270012, "train_ppl": 42.0771609093901, "lr": 0.00056, "grad_norm": 0.7413, "tokens_per_sec": 148220, "dt_s": 4.422, "eta_s": 28296, "world_size": 1, "timestamp": "2026-05-04T23:52:13.564390"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27980, "epoch": 0, "train_loss": 3.768044799566269, "train_ppl": 43.29533097710141, "lr": 0.00056, "grad_norm": 0.6816, "tokens_per_sec": 152161, "dt_s": 4.307, "eta_s": 28235, "world_size": 1, "timestamp": "2026-05-04T23:52:17.871455"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 27990, "epoch": 0, "train_loss": 3.8014439940452576, "train_ppl": 44.76577936360515, "lr": 0.00056, "grad_norm": 0.6676, "tokens_per_sec": 133469, "dt_s": 4.91, "eta_s": 28905, "world_size": 1, "timestamp": "2026-05-04T23:52:22.781637"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28000, "epoch": 0, "train_loss": 3.7859901934862137, "train_ppl": 44.07929598814176, "lr": 0.00056, "grad_norm": 0.6766, "tokens_per_sec": 151919, "dt_s": 4.314, "eta_s": 28895, "world_size": 1, "timestamp": "2026-05-04T23:52:27.095490"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28010, "epoch": 0, "train_loss": 3.820066586136818, "train_ppl": 45.6072450300037, "lr": 0.00056, "grad_norm": 0.6859, "tokens_per_sec": 127319, "dt_s": 5.147, "eta_s": 28882, "world_size": 1, "timestamp": "2026-05-04T23:52:32.242906"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28020, "epoch": 0, "train_loss": 3.7073093354701996, "train_ppl": 40.74403039046777, "lr": 0.00056, "grad_norm": 0.6942, "tokens_per_sec": 149059, "dt_s": 4.397, "eta_s": 28845, "world_size": 1, "timestamp": "2026-05-04T23:52:36.639547"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28030, "epoch": 0, "train_loss": 3.7646822184324265, "train_ppl": 43.14999140896485, "lr": 0.00056, "grad_norm": 0.6987, "tokens_per_sec": 150954, "dt_s": 4.341, "eta_s": 28885, "world_size": 1, "timestamp": "2026-05-04T23:52:40.981008"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28040, "epoch": 0, "train_loss": 3.8383291214704514, "train_ppl": 46.44780093303904, "lr": 0.00056, "grad_norm": 0.7142, "tokens_per_sec": 150816, "dt_s": 4.345, "eta_s": 28150, "world_size": 1, "timestamp": "2026-05-04T23:52:45.326435"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28050, "epoch": 0, "train_loss": 3.7295910865068436, "train_ppl": 41.66206849843032, "lr": 0.00056, "grad_norm": 0.6935, "tokens_per_sec": 147739, "dt_s": 4.436, "eta_s": 28304, "world_size": 1, "timestamp": "2026-05-04T23:52:49.762383"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28060, "epoch": 0, "train_loss": 3.7913801074028015, "train_ppl": 44.31752102852575, "lr": 0.00056, "grad_norm": 0.6476, "tokens_per_sec": 150589, "dt_s": 4.352, "eta_s": 28264, "world_size": 1, "timestamp": "2026-05-04T23:52:54.114348"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28070, "epoch": 0, "train_loss": 3.732450857758522, "train_ppl": 41.781383008995476, "lr": 0.00056, "grad_norm": 0.7155, "tokens_per_sec": 150847, "dt_s": 4.345, "eta_s": 28192, "world_size": 1, "timestamp": "2026-05-04T23:52:58.458891"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28080, "epoch": 0, "train_loss": 3.8033855706453323, "train_ppl": 44.852779985156474, "lr": 0.00056, "grad_norm": 0.6855, "tokens_per_sec": 152267, "dt_s": 4.304, "eta_s": 28140, "world_size": 1, "timestamp": "2026-05-04T23:53:02.762889"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28090, "epoch": 0, "train_loss": 3.8880578130483627, "train_ppl": 48.81598462657719, "lr": 0.00056, "grad_norm": 0.6573, "tokens_per_sec": 152893, "dt_s": 4.286, "eta_s": 28059, "world_size": 1, "timestamp": "2026-05-04T23:53:07.049297"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28100, "epoch": 0, "train_loss": 3.9137766510248184, "train_ppl": 50.087759206616454, "lr": 0.00056, "grad_norm": 0.7166, "tokens_per_sec": 149568, "dt_s": 4.382, "eta_s": 27985, "world_size": 1, "timestamp": "2026-05-04T23:53:11.430987"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28110, "epoch": 0, "train_loss": 3.8956572264432907, "train_ppl": 49.18837063952283, "lr": 0.00056, "grad_norm": 0.7079, "tokens_per_sec": 152619, "dt_s": 4.294, "eta_s": 27906, "world_size": 1, "timestamp": "2026-05-04T23:53:15.725045"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28120, "epoch": 0, "train_loss": 3.7797635942697525, "train_ppl": 43.80568459652266, "lr": 0.00056, "grad_norm": 0.6298, "tokens_per_sec": 152493, "dt_s": 4.298, "eta_s": 27841, "world_size": 1, "timestamp": "2026-05-04T23:53:20.022716"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28130, "epoch": 0, "train_loss": 3.820854514837265, "train_ppl": 45.643194448240486, "lr": 0.00056, "grad_norm": 0.6376, "tokens_per_sec": 149231, "dt_s": 4.392, "eta_s": 27949, "world_size": 1, "timestamp": "2026-05-04T23:53:24.414289"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28140, "epoch": 0, "train_loss": 3.837637275457382, "train_ppl": 46.41567732072557, "lr": 0.00056, "grad_norm": 0.6483, "tokens_per_sec": 151515, "dt_s": 4.325, "eta_s": 27995, "world_size": 1, "timestamp": "2026-05-04T23:53:28.739688"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28150, "epoch": 0, "train_loss": 3.8513134568929672, "train_ppl": 47.05482714764366, "lr": 0.00056, "grad_norm": 0.6691, "tokens_per_sec": 151411, "dt_s": 4.328, "eta_s": 27922, "world_size": 1, "timestamp": "2026-05-04T23:53:33.068024"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28160, "epoch": 0, "train_loss": 3.8054968863725662, "train_ppl": 44.947578404431184, "lr": 0.00056, "grad_norm": 0.6779, "tokens_per_sec": 147586, "dt_s": 4.441, "eta_s": 28107, "world_size": 1, "timestamp": "2026-05-04T23:53:37.508555"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28170, "epoch": 0, "train_loss": 3.6537461653351784, "train_ppl": 38.61906881871657, "lr": 0.00056, "grad_norm": 0.8764, "tokens_per_sec": 152363, "dt_s": 4.301, "eta_s": 28107, "world_size": 1, "timestamp": "2026-05-04T23:53:41.809857"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28180, "epoch": 0, "train_loss": 3.8827863931655884, "train_ppl": 48.55933213047343, "lr": 0.00056, "grad_norm": 0.7053, "tokens_per_sec": 150957, "dt_s": 4.341, "eta_s": 28038, "world_size": 1, "timestamp": "2026-05-04T23:53:46.151234"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28190, "epoch": 0, "train_loss": 3.8420013189315796, "train_ppl": 46.61867998849135, "lr": 0.00056, "grad_norm": 0.6795, "tokens_per_sec": 147676, "dt_s": 4.438, "eta_s": 28179, "world_size": 1, "timestamp": "2026-05-04T23:53:50.589085"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28200, "epoch": 0, "train_loss": 3.7935866862535477, "train_ppl": 44.415419103333434, "lr": 0.00056, "grad_norm": 0.7087, "tokens_per_sec": 152113, "dt_s": 4.308, "eta_s": 28149, "world_size": 1, "timestamp": "2026-05-04T23:53:54.897427"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28210, "epoch": 0, "train_loss": 3.890970304608345, "train_ppl": 48.9583680143534, "lr": 0.00056, "grad_norm": 0.6854, "tokens_per_sec": 150904, "dt_s": 4.343, "eta_s": 28018, "world_size": 1, "timestamp": "2026-05-04T23:53:59.240349"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28220, "epoch": 0, "train_loss": 3.8805888146162033, "train_ppl": 48.45273635300708, "lr": 0.00056, "grad_norm": 0.6953, "tokens_per_sec": 150162, "dt_s": 4.364, "eta_s": 28095, "world_size": 1, "timestamp": "2026-05-04T23:54:03.604691"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28230, "epoch": 0, "train_loss": 3.694062054157257, "train_ppl": 40.20784211780885, "lr": 0.00056, "grad_norm": 0.6857, "tokens_per_sec": 150947, "dt_s": 4.342, "eta_s": 28091, "world_size": 1, "timestamp": "2026-05-04T23:54:07.946348"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28240, "epoch": 0, "train_loss": 4.000732704997063, "train_ppl": 54.63816902977293, "lr": 0.00056, "grad_norm": 0.9281, "tokens_per_sec": 148112, "dt_s": 4.425, "eta_s": 28070, "world_size": 1, "timestamp": "2026-05-04T23:54:12.371093"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28250, "epoch": 0, "train_loss": 3.7630551904439926, "train_ppl": 43.079842248040904, "lr": 0.00056, "grad_norm": 0.6447, "tokens_per_sec": 150719, "dt_s": 4.348, "eta_s": 28117, "world_size": 1, "timestamp": "2026-05-04T23:54:16.719332"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28260, "epoch": 0, "train_loss": 3.770535498857498, "train_ppl": 43.40330103193237, "lr": 0.00056, "grad_norm": 0.667, "tokens_per_sec": 151527, "dt_s": 4.325, "eta_s": 28090, "world_size": 1, "timestamp": "2026-05-04T23:54:21.044363"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28270, "epoch": 0, "train_loss": 3.823016569018364, "train_ppl": 45.74198426362015, "lr": 0.00056, "grad_norm": 0.6822, "tokens_per_sec": 149904, "dt_s": 4.372, "eta_s": 28095, "world_size": 1, "timestamp": "2026-05-04T23:54:25.416237"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28280, "epoch": 0, "train_loss": 3.6605827510356903, "train_ppl": 38.88399595905314, "lr": 0.00056, "grad_norm": 0.7208, "tokens_per_sec": 151339, "dt_s": 4.33, "eta_s": 28076, "world_size": 1, "timestamp": "2026-05-04T23:54:29.746636"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28290, "epoch": 0, "train_loss": 3.8638947159051895, "train_ppl": 47.650575892087275, "lr": 0.00056, "grad_norm": 0.7287, "tokens_per_sec": 134021, "dt_s": 4.89, "eta_s": 28671, "world_size": 1, "timestamp": "2026-05-04T23:54:34.636592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28300, "epoch": 0, "train_loss": 3.6568167209625244, "train_ppl": 38.737833060594184, "lr": 0.00056, "grad_norm": 0.66, "tokens_per_sec": 145654, "dt_s": 4.499, "eta_s": 28861, "world_size": 1, "timestamp": "2026-05-04T23:54:39.136037"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28310, "epoch": 0, "train_loss": 3.8188358396291733, "train_ppl": 45.551148599783474, "lr": 0.00056, "grad_norm": 0.7043, "tokens_per_sec": 149836, "dt_s": 4.374, "eta_s": 28919, "world_size": 1, "timestamp": "2026-05-04T23:54:43.509880"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28320, "epoch": 0, "train_loss": 3.846482753753662, "train_ppl": 46.82806739192016, "lr": 0.00056, "grad_norm": 0.6633, "tokens_per_sec": 147349, "dt_s": 4.448, "eta_s": 29013, "world_size": 1, "timestamp": "2026-05-04T23:54:47.957547"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28330, "epoch": 0, "train_loss": 3.8457738757133484, "train_ppl": 46.79488376623706, "lr": 0.00056, "grad_norm": 0.7003, "tokens_per_sec": 148972, "dt_s": 4.399, "eta_s": 29097, "world_size": 1, "timestamp": "2026-05-04T23:54:52.356786"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28340, "epoch": 0, "train_loss": 3.787277266383171, "train_ppl": 44.13606578091902, "lr": 0.00056, "grad_norm": 0.6361, "tokens_per_sec": 150543, "dt_s": 4.353, "eta_s": 28402, "world_size": 1, "timestamp": "2026-05-04T23:54:56.710102"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28350, "epoch": 0, "train_loss": 3.7222010493278503, "train_ppl": 41.35531910401965, "lr": 0.00056, "grad_norm": 0.6729, "tokens_per_sec": 148308, "dt_s": 4.419, "eta_s": 28293, "world_size": 1, "timestamp": "2026-05-04T23:55:01.129003"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28360, "epoch": 0, "train_loss": 3.8848481625318527, "train_ppl": 48.65955355513536, "lr": 0.00056, "grad_norm": 0.6718, "tokens_per_sec": 149750, "dt_s": 4.376, "eta_s": 28292, "world_size": 1, "timestamp": "2026-05-04T23:55:05.505404"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28370, "epoch": 0, "train_loss": 3.9005332440137863, "train_ppl": 49.428799690803615, "lr": 0.00056, "grad_norm": 0.6907, "tokens_per_sec": 150756, "dt_s": 4.347, "eta_s": 28159, "world_size": 1, "timestamp": "2026-05-04T23:55:09.852531"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28380, "epoch": 0, "train_loss": 3.886134773492813, "train_ppl": 48.72219976210123, "lr": 0.00056, "grad_norm": 0.7171, "tokens_per_sec": 147192, "dt_s": 4.452, "eta_s": 28223, "world_size": 1, "timestamp": "2026-05-04T23:55:14.304962"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28390, "epoch": 0, "train_loss": 3.765514239668846, "train_ppl": 43.18590805780376, "lr": 0.00056, "grad_norm": 0.6657, "tokens_per_sec": 149656, "dt_s": 4.379, "eta_s": 28251, "world_size": 1, "timestamp": "2026-05-04T23:55:18.684078"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28400, "epoch": 0, "train_loss": 3.8409291207790375, "train_ppl": 46.56872231298341, "lr": 0.00056, "grad_norm": 0.7295, "tokens_per_sec": 151386, "dt_s": 4.329, "eta_s": 28132, "world_size": 1, "timestamp": "2026-05-04T23:55:23.013123"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28410, "epoch": 0, "train_loss": 3.8269163221120834, "train_ppl": 45.92071498459365, "lr": 0.00056, "grad_norm": 0.6292, "tokens_per_sec": 149548, "dt_s": 4.382, "eta_s": 28135, "world_size": 1, "timestamp": "2026-05-04T23:55:27.395400"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28420, "epoch": 0, "train_loss": 3.7837698459625244, "train_ppl": 43.98153320628756, "lr": 0.00056, "grad_norm": 0.7106, "tokens_per_sec": 149329, "dt_s": 4.389, "eta_s": 28184, "world_size": 1, "timestamp": "2026-05-04T23:55:31.784122"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28430, "epoch": 0, "train_loss": 3.81024831533432, "train_ppl": 45.16165180475517, "lr": 0.00056, "grad_norm": 0.6582, "tokens_per_sec": 147709, "dt_s": 4.437, "eta_s": 28159, "world_size": 1, "timestamp": "2026-05-04T23:55:36.220940"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28440, "epoch": 0, "train_loss": 3.7306834757328033, "train_ppl": 41.70760456021261, "lr": 0.00056, "grad_norm": 0.6866, "tokens_per_sec": 151339, "dt_s": 4.33, "eta_s": 28092, "world_size": 1, "timestamp": "2026-05-04T23:55:40.551369"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28450, "epoch": 0, "train_loss": 3.8477723449468613, "train_ppl": 46.88849541056911, "lr": 0.00056, "grad_norm": 0.7194, "tokens_per_sec": 150171, "dt_s": 4.364, "eta_s": 28133, "world_size": 1, "timestamp": "2026-05-04T23:55:44.915460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28460, "epoch": 0, "train_loss": 3.8456585705280304, "train_ppl": 46.78948838455617, "lr": 0.00056, "grad_norm": 0.6875, "tokens_per_sec": 148823, "dt_s": 4.404, "eta_s": 28156, "world_size": 1, "timestamp": "2026-05-04T23:55:49.319061"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28470, "epoch": 0, "train_loss": 3.7921704798936844, "train_ppl": 44.35256222398514, "lr": 0.00056, "grad_norm": 0.6823, "tokens_per_sec": 151259, "dt_s": 4.333, "eta_s": 28080, "world_size": 1, "timestamp": "2026-05-04T23:55:53.651762"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28480, "epoch": 0, "train_loss": 3.9407282918691635, "train_ppl": 51.45606268953454, "lr": 0.00056, "grad_norm": 0.8047, "tokens_per_sec": 150559, "dt_s": 4.353, "eta_s": 27968, "world_size": 1, "timestamp": "2026-05-04T23:55:58.004602"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28490, "epoch": 0, "train_loss": 3.887188211083412, "train_ppl": 48.773552602584594, "lr": 0.00056, "grad_norm": 0.6489, "tokens_per_sec": 150039, "dt_s": 4.368, "eta_s": 28011, "world_size": 1, "timestamp": "2026-05-04T23:56:02.372529"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28500, "epoch": 0, "train_loss": 3.7943213880062103, "train_ppl": 44.44806317995565, "lr": 0.00056, "grad_norm": 0.712, "tokens_per_sec": 152642, "dt_s": 4.293, "eta_s": 27916, "world_size": 1, "timestamp": "2026-05-04T23:56:06.665987"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28510, "epoch": 0, "train_loss": 3.660668596625328, "train_ppl": 38.887334121894945, "lr": 0.00056, "grad_norm": 0.7179, "tokens_per_sec": 127934, "dt_s": 5.123, "eta_s": 27848, "world_size": 1, "timestamp": "2026-05-04T23:56:11.788615"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28520, "epoch": 0, "train_loss": 3.8629986494779587, "train_ppl": 47.60789693523805, "lr": 0.00056, "grad_norm": 0.6722, "tokens_per_sec": 148872, "dt_s": 4.402, "eta_s": 27933, "world_size": 1, "timestamp": "2026-05-04T23:56:16.190778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28530, "epoch": 0, "train_loss": 3.831507086753845, "train_ppl": 46.1320108128878, "lr": 0.00056, "grad_norm": 0.8252, "tokens_per_sec": 150186, "dt_s": 4.364, "eta_s": 27942, "world_size": 1, "timestamp": "2026-05-04T23:56:20.554447"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28540, "epoch": 0, "train_loss": 3.924685314297676, "train_ppl": 50.637140767145254, "lr": 0.00056, "grad_norm": 0.8309, "tokens_per_sec": 149395, "dt_s": 4.387, "eta_s": 27962, "world_size": 1, "timestamp": "2026-05-04T23:56:24.941212"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28550, "epoch": 0, "train_loss": 3.760737255215645, "train_ppl": 42.98010160489513, "lr": 0.00056, "grad_norm": 0.7384, "tokens_per_sec": 152785, "dt_s": 4.289, "eta_s": 27953, "world_size": 1, "timestamp": "2026-05-04T23:56:29.230629"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28560, "epoch": 0, "train_loss": 3.7533318400382996, "train_ppl": 42.66299172250979, "lr": 0.00056, "grad_norm": 0.6663, "tokens_per_sec": 150773, "dt_s": 4.347, "eta_s": 27939, "world_size": 1, "timestamp": "2026-05-04T23:56:33.577307"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28570, "epoch": 0, "train_loss": 3.7056921422481537, "train_ppl": 40.67819267118702, "lr": 0.00056, "grad_norm": 0.6705, "tokens_per_sec": 150119, "dt_s": 4.366, "eta_s": 27888, "world_size": 1, "timestamp": "2026-05-04T23:56:37.942907"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28580, "epoch": 0, "train_loss": 3.794870525598526, "train_ppl": 44.47247798528357, "lr": 0.00056, "grad_norm": 0.7186, "tokens_per_sec": 136496, "dt_s": 4.801, "eta_s": 28445, "world_size": 1, "timestamp": "2026-05-04T23:56:42.744237"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28590, "epoch": 0, "train_loss": 3.713033825159073, "train_ppl": 40.97793803451118, "lr": 0.00056, "grad_norm": 0.7495, "tokens_per_sec": 149883, "dt_s": 4.372, "eta_s": 28422, "world_size": 1, "timestamp": "2026-05-04T23:56:47.116711"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28600, "epoch": 0, "train_loss": 3.9184176474809647, "train_ppl": 50.320756571354934, "lr": 0.00056, "grad_norm": 0.6889, "tokens_per_sec": 149752, "dt_s": 4.376, "eta_s": 28529, "world_size": 1, "timestamp": "2026-05-04T23:56:51.493007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28610, "epoch": 0, "train_loss": 3.86348195374012, "train_ppl": 47.630911595833176, "lr": 0.00056, "grad_norm": 0.6708, "tokens_per_sec": 150954, "dt_s": 4.341, "eta_s": 28518, "world_size": 1, "timestamp": "2026-05-04T23:56:55.834467"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28620, "epoch": 0, "train_loss": 3.7322570383548737, "train_ppl": 41.77328575098527, "lr": 0.00056, "grad_norm": 0.7027, "tokens_per_sec": 150852, "dt_s": 4.344, "eta_s": 28486, "world_size": 1, "timestamp": "2026-05-04T23:57:00.178852"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28630, "epoch": 0, "train_loss": 3.8501047492027283, "train_ppl": 46.99798597530879, "lr": 0.00056, "grad_norm": 0.7311, "tokens_per_sec": 150573, "dt_s": 4.352, "eta_s": 27907, "world_size": 1, "timestamp": "2026-05-04T23:57:04.531302"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28640, "epoch": 0, "train_loss": 3.6884837299585342, "train_ppl": 39.98417416532297, "lr": 0.00056, "grad_norm": 0.6677, "tokens_per_sec": 150952, "dt_s": 4.342, "eta_s": 27863, "world_size": 1, "timestamp": "2026-05-04T23:57:08.872822"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28650, "epoch": 0, "train_loss": 3.840881109237671, "train_ppl": 46.56648653051781, "lr": 0.00056, "grad_norm": 0.6769, "tokens_per_sec": 148330, "dt_s": 4.418, "eta_s": 27912, "world_size": 1, "timestamp": "2026-05-04T23:57:13.291079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28660, "epoch": 0, "train_loss": 3.732619032263756, "train_ppl": 41.78841016328853, "lr": 0.00056, "grad_norm": 0.6632, "tokens_per_sec": 149756, "dt_s": 4.376, "eta_s": 27952, "world_size": 1, "timestamp": "2026-05-04T23:57:17.667267"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28670, "epoch": 0, "train_loss": 3.7520706951618195, "train_ppl": 42.609221422279695, "lr": 0.00056, "grad_norm": 0.6897, "tokens_per_sec": 150125, "dt_s": 4.365, "eta_s": 27975, "world_size": 1, "timestamp": "2026-05-04T23:57:22.032689"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28680, "epoch": 0, "train_loss": 3.897154226899147, "train_ppl": 49.26206079613034, "lr": 0.00056, "grad_norm": 0.6697, "tokens_per_sec": 146867, "dt_s": 4.462, "eta_s": 28111, "world_size": 1, "timestamp": "2026-05-04T23:57:26.494933"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28690, "epoch": 0, "train_loss": 3.773274317383766, "train_ppl": 43.52233773245106, "lr": 0.00056, "grad_norm": 0.6702, "tokens_per_sec": 152016, "dt_s": 4.311, "eta_s": 28068, "world_size": 1, "timestamp": "2026-05-04T23:57:30.806046"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28700, "epoch": 0, "train_loss": 3.7907444834709167, "train_ppl": 44.28936070219499, "lr": 0.00056, "grad_norm": 0.6664, "tokens_per_sec": 151098, "dt_s": 4.337, "eta_s": 27960, "world_size": 1, "timestamp": "2026-05-04T23:57:35.143374"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28710, "epoch": 0, "train_loss": 3.8560605198144913, "train_ppl": 47.27873039380977, "lr": 0.00056, "grad_norm": 0.6929, "tokens_per_sec": 148829, "dt_s": 4.403, "eta_s": 27990, "world_size": 1, "timestamp": "2026-05-04T23:57:39.546823"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28720, "epoch": 0, "train_loss": 3.70341756939888, "train_ppl": 40.58577230685146, "lr": 0.00056, "grad_norm": 0.7131, "tokens_per_sec": 150373, "dt_s": 4.358, "eta_s": 27977, "world_size": 1, "timestamp": "2026-05-04T23:57:43.905073"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28730, "epoch": 0, "train_loss": 3.7811423242092133, "train_ppl": 43.86612245956625, "lr": 0.00056, "grad_norm": 0.6719, "tokens_per_sec": 146837, "dt_s": 4.463, "eta_s": 27973, "world_size": 1, "timestamp": "2026-05-04T23:57:48.368243"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28740, "epoch": 0, "train_loss": 3.7571472078561783, "train_ppl": 42.82607764669842, "lr": 0.00056, "grad_norm": 0.689, "tokens_per_sec": 148982, "dt_s": 4.399, "eta_s": 28081, "world_size": 1, "timestamp": "2026-05-04T23:57:52.767133"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28750, "epoch": 0, "train_loss": 3.847236081957817, "train_ppl": 46.86335758671315, "lr": 0.00056, "grad_norm": 0.7786, "tokens_per_sec": 150383, "dt_s": 4.358, "eta_s": 28103, "world_size": 1, "timestamp": "2026-05-04T23:57:57.125088"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28760, "epoch": 0, "train_loss": 3.7581702172756195, "train_ppl": 42.869911544951265, "lr": 0.00056, "grad_norm": 0.6596, "tokens_per_sec": 147624, "dt_s": 4.439, "eta_s": 28145, "world_size": 1, "timestamp": "2026-05-04T23:58:01.564461"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28770, "epoch": 0, "train_loss": 3.7731398344039917, "train_ppl": 43.51648511233376, "lr": 0.00056, "grad_norm": 0.6833, "tokens_per_sec": 152964, "dt_s": 4.284, "eta_s": 28046, "world_size": 1, "timestamp": "2026-05-04T23:58:05.848894"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28780, "epoch": 0, "train_loss": 3.917993113398552, "train_ppl": 50.29939822913049, "lr": 0.00056, "grad_norm": 0.7115, "tokens_per_sec": 151814, "dt_s": 4.317, "eta_s": 27855, "world_size": 1, "timestamp": "2026-05-04T23:58:10.165736"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28790, "epoch": 0, "train_loss": 3.721174046397209, "train_ppl": 41.312868872089666, "lr": 0.00056, "grad_norm": 0.679, "tokens_per_sec": 146890, "dt_s": 4.462, "eta_s": 27930, "world_size": 1, "timestamp": "2026-05-04T23:58:14.627302"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28800, "epoch": 0, "train_loss": 3.6829527616500854, "train_ppl": 39.76363342931028, "lr": 0.00056, "grad_norm": 0.6272, "tokens_per_sec": 152687, "dt_s": 4.292, "eta_s": 27842, "world_size": 1, "timestamp": "2026-05-04T23:58:18.919462"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28810, "epoch": 0, "train_loss": 3.90946264564991, "train_ppl": 49.87214575736344, "lr": 0.00056, "grad_norm": 0.7329, "tokens_per_sec": 151953, "dt_s": 4.313, "eta_s": 27676, "world_size": 1, "timestamp": "2026-05-04T23:58:23.232457"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28820, "epoch": 0, "train_loss": 3.7875655442476273, "train_ppl": 44.14879106582869, "lr": 0.00056, "grad_norm": 0.6864, "tokens_per_sec": 148202, "dt_s": 4.422, "eta_s": 27847, "world_size": 1, "timestamp": "2026-05-04T23:58:27.654448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28830, "epoch": 0, "train_loss": 3.891791269183159, "train_ppl": 48.99857760319951, "lr": 0.00056, "grad_norm": 0.6905, "tokens_per_sec": 150839, "dt_s": 4.345, "eta_s": 27879, "world_size": 1, "timestamp": "2026-05-04T23:58:31.999249"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28840, "epoch": 0, "train_loss": 3.821982190012932, "train_ppl": 45.69469417758359, "lr": 0.00056, "grad_norm": 0.6925, "tokens_per_sec": 149861, "dt_s": 4.373, "eta_s": 27761, "world_size": 1, "timestamp": "2026-05-04T23:58:36.372376"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28850, "epoch": 0, "train_loss": 3.863246813416481, "train_ppl": 47.61971296454181, "lr": 0.00056, "grad_norm": 0.6278, "tokens_per_sec": 150642, "dt_s": 4.35, "eta_s": 27831, "world_size": 1, "timestamp": "2026-05-04T23:58:40.722829"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28860, "epoch": 0, "train_loss": 3.749804139137268, "train_ppl": 42.512754599790476, "lr": 0.00056, "grad_norm": 0.7091, "tokens_per_sec": 150543, "dt_s": 4.353, "eta_s": 27879, "world_size": 1, "timestamp": "2026-05-04T23:58:45.076119"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28870, "epoch": 0, "train_loss": 3.821169450879097, "train_ppl": 45.65757139902774, "lr": 0.00056, "grad_norm": 0.6886, "tokens_per_sec": 144605, "dt_s": 4.532, "eta_s": 28015, "world_size": 1, "timestamp": "2026-05-04T23:58:49.608211"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28880, "epoch": 0, "train_loss": 3.87494894862175, "train_ppl": 48.18023856073009, "lr": 0.00056, "grad_norm": 0.6581, "tokens_per_sec": 135248, "dt_s": 4.846, "eta_s": 28649, "world_size": 1, "timestamp": "2026-05-04T23:58:54.453821"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28890, "epoch": 0, "train_loss": 3.7735741287469864, "train_ppl": 43.535388180095644, "lr": 0.00056, "grad_norm": 0.661, "tokens_per_sec": 149946, "dt_s": 4.371, "eta_s": 28642, "world_size": 1, "timestamp": "2026-05-04T23:58:58.824471"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28900, "epoch": 0, "train_loss": 3.8809116780757904, "train_ppl": 48.46838249674061, "lr": 0.00056, "grad_norm": 0.6764, "tokens_per_sec": 145853, "dt_s": 4.493, "eta_s": 28820, "world_size": 1, "timestamp": "2026-05-04T23:59:03.317762"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28910, "epoch": 0, "train_loss": 3.860875815153122, "train_ppl": 47.506940452346974, "lr": 0.00056, "grad_norm": 0.682, "tokens_per_sec": 152112, "dt_s": 4.308, "eta_s": 28758, "world_size": 1, "timestamp": "2026-05-04T23:59:07.626159"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28920, "epoch": 0, "train_loss": 3.699905961751938, "train_ppl": 40.44350094526257, "lr": 0.00056, "grad_norm": 0.6885, "tokens_per_sec": 151244, "dt_s": 4.333, "eta_s": 28500, "world_size": 1, "timestamp": "2026-05-04T23:59:11.959292"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28930, "epoch": 0, "train_loss": 3.802253469824791, "train_ppl": 44.80203084812178, "lr": 0.00056, "grad_norm": 0.6659, "tokens_per_sec": 147541, "dt_s": 4.442, "eta_s": 27980, "world_size": 1, "timestamp": "2026-05-04T23:59:16.401210"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28940, "epoch": 0, "train_loss": 3.751729741692543, "train_ppl": 42.594696136776555, "lr": 0.00056, "grad_norm": 0.7289, "tokens_per_sec": 151968, "dt_s": 4.312, "eta_s": 27902, "world_size": 1, "timestamp": "2026-05-04T23:59:20.713673"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28950, "epoch": 0, "train_loss": 3.7163277566432953, "train_ppl": 41.11313910404729, "lr": 0.00056, "grad_norm": 0.6905, "tokens_per_sec": 150395, "dt_s": 4.358, "eta_s": 27724, "world_size": 1, "timestamp": "2026-05-04T23:59:25.071286"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28960, "epoch": 0, "train_loss": 3.7250164449214935, "train_ppl": 41.471914741605154, "lr": 0.00056, "grad_norm": 0.6431, "tokens_per_sec": 151057, "dt_s": 4.339, "eta_s": 27758, "world_size": 1, "timestamp": "2026-05-04T23:59:29.409754"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28970, "epoch": 0, "train_loss": 3.745208501815796, "train_ppl": 42.317829643381266, "lr": 0.00056, "grad_norm": 0.6901, "tokens_per_sec": 152494, "dt_s": 4.298, "eta_s": 27709, "world_size": 1, "timestamp": "2026-05-04T23:59:33.707391"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28980, "epoch": 0, "train_loss": 3.7948516458272934, "train_ppl": 44.471638362999016, "lr": 0.00056, "grad_norm": 0.6693, "tokens_per_sec": 147438, "dt_s": 4.445, "eta_s": 27708, "world_size": 1, "timestamp": "2026-05-04T23:59:38.152374"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 28990, "epoch": 0, "train_loss": 3.813356801867485, "train_ppl": 45.302254608950676, "lr": 0.00056, "grad_norm": 0.663, "tokens_per_sec": 151051, "dt_s": 4.339, "eta_s": 27737, "world_size": 1, "timestamp": "2026-05-04T23:59:42.491039"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29000, "epoch": 0, "train_loss": 3.782519370317459, "train_ppl": 43.926569742578984, "lr": 0.00056, "grad_norm": 0.6374, "tokens_per_sec": 151228, "dt_s": 4.334, "eta_s": 27703, "world_size": 1, "timestamp": "2026-05-04T23:59:46.824634"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29010, "epoch": 0, "train_loss": 3.7448307424783707, "train_ppl": 42.301846707135674, "lr": 0.00056, "grad_norm": 0.6603, "tokens_per_sec": 126102, "dt_s": 5.197, "eta_s": 27812, "world_size": 1, "timestamp": "2026-05-04T23:59:52.021673"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29020, "epoch": 0, "train_loss": 3.8208197355270386, "train_ppl": 45.64160703702574, "lr": 0.00056, "grad_norm": 0.6739, "tokens_per_sec": 151579, "dt_s": 4.324, "eta_s": 27840, "world_size": 1, "timestamp": "2026-05-04T23:59:56.345256"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29030, "epoch": 0, "train_loss": 3.6170248687267303, "train_ppl": 37.22664873759273, "lr": 0.00056, "grad_norm": 0.7729, "tokens_per_sec": 148493, "dt_s": 4.413, "eta_s": 27796, "world_size": 1, "timestamp": "2026-05-05T00:00:00.758656"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29040, "epoch": 0, "train_loss": 3.7763087898492813, "train_ppl": 43.654605648115286, "lr": 0.00056, "grad_norm": 0.6952, "tokens_per_sec": 151093, "dt_s": 4.337, "eta_s": 27790, "world_size": 1, "timestamp": "2026-05-05T00:00:05.096118"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29050, "epoch": 0, "train_loss": 3.858004868030548, "train_ppl": 47.370746135241035, "lr": 0.00056, "grad_norm": 0.6848, "tokens_per_sec": 150902, "dt_s": 4.343, "eta_s": 27797, "world_size": 1, "timestamp": "2026-05-05T00:00:09.439056"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29060, "epoch": 0, "train_loss": 3.7171405851840973, "train_ppl": 41.146570622118396, "lr": 0.00056, "grad_norm": 0.6773, "tokens_per_sec": 149206, "dt_s": 4.392, "eta_s": 27748, "world_size": 1, "timestamp": "2026-05-05T00:00:13.831377"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29070, "epoch": 0, "train_loss": 3.8754189014434814, "train_ppl": 48.202886321066025, "lr": 0.00056, "grad_norm": 0.6785, "tokens_per_sec": 150203, "dt_s": 4.363, "eta_s": 27794, "world_size": 1, "timestamp": "2026-05-05T00:00:18.194520"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29080, "epoch": 0, "train_loss": 3.823486015200615, "train_ppl": 45.7634627045929, "lr": 0.00056, "grad_norm": 0.7909, "tokens_per_sec": 151221, "dt_s": 4.334, "eta_s": 27689, "world_size": 1, "timestamp": "2026-05-05T00:00:22.528339"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29090, "epoch": 0, "train_loss": 3.8326420336961746, "train_ppl": 46.18439792017367, "lr": 0.00056, "grad_norm": 0.6636, "tokens_per_sec": 150251, "dt_s": 4.362, "eta_s": 27715, "world_size": 1, "timestamp": "2026-05-05T00:00:26.890185"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29100, "epoch": 0, "train_loss": 3.72979199886322, "train_ppl": 41.67043976370102, "lr": 0.00056, "grad_norm": 0.656, "tokens_per_sec": 150990, "dt_s": 4.34, "eta_s": 27707, "world_size": 1, "timestamp": "2026-05-05T00:00:31.230494"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29110, "epoch": 0, "train_loss": 3.8130677342414856, "train_ppl": 45.28916108630637, "lr": 0.00056, "grad_norm": 0.7044, "tokens_per_sec": 153492, "dt_s": 4.27, "eta_s": 27547, "world_size": 1, "timestamp": "2026-05-05T00:00:35.500210"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29120, "epoch": 0, "train_loss": 3.8337028473615646, "train_ppl": 46.2334169560499, "lr": 0.00056, "grad_norm": 0.7176, "tokens_per_sec": 151876, "dt_s": 4.315, "eta_s": 27482, "world_size": 1, "timestamp": "2026-05-05T00:00:39.815326"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29130, "epoch": 0, "train_loss": 3.6785635203123093, "train_ppl": 39.58948371786752, "lr": 0.00056, "grad_norm": 0.643, "tokens_per_sec": 150434, "dt_s": 4.356, "eta_s": 27506, "world_size": 1, "timestamp": "2026-05-05T00:00:44.171768"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29140, "epoch": 0, "train_loss": 3.73226535320282, "train_ppl": 41.77363309094854, "lr": 0.00056, "grad_norm": 0.6735, "tokens_per_sec": 151008, "dt_s": 4.34, "eta_s": 27474, "world_size": 1, "timestamp": "2026-05-05T00:00:48.511670"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29150, "epoch": 0, "train_loss": 3.820552334189415, "train_ppl": 45.629404041874075, "lr": 0.00056, "grad_norm": 0.6961, "tokens_per_sec": 150672, "dt_s": 4.35, "eta_s": 27481, "world_size": 1, "timestamp": "2026-05-05T00:00:52.861270"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29160, "epoch": 0, "train_loss": 3.780162990093231, "train_ppl": 43.82318389833655, "lr": 0.00056, "grad_norm": 0.6593, "tokens_per_sec": 152004, "dt_s": 4.311, "eta_s": 27530, "world_size": 1, "timestamp": "2026-05-05T00:00:57.172730"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29170, "epoch": 0, "train_loss": 3.799701601266861, "train_ppl": 44.68784770641732, "lr": 0.00056, "grad_norm": 0.6378, "tokens_per_sec": 133746, "dt_s": 4.9, "eta_s": 28269, "world_size": 1, "timestamp": "2026-05-05T00:01:02.072760"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29180, "epoch": 0, "train_loss": 3.8627189993858337, "train_ppl": 47.59458524386885, "lr": 0.00056, "grad_norm": 0.6936, "tokens_per_sec": 151915, "dt_s": 4.314, "eta_s": 28211, "world_size": 1, "timestamp": "2026-05-05T00:01:06.386762"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29190, "epoch": 0, "train_loss": 3.916049972176552, "train_ppl": 50.20175429369133, "lr": 0.00056, "grad_norm": 0.6802, "tokens_per_sec": 150547, "dt_s": 4.353, "eta_s": 28223, "world_size": 1, "timestamp": "2026-05-05T00:01:10.739970"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29200, "epoch": 0, "train_loss": 3.973177671432495, "train_ppl": 53.1531660614521, "lr": 0.00056, "grad_norm": 0.7807, "tokens_per_sec": 149266, "dt_s": 4.391, "eta_s": 28271, "world_size": 1, "timestamp": "2026-05-05T00:01:15.130507"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29210, "epoch": 0, "train_loss": 3.890812709927559, "train_ppl": 48.95065304390949, "lr": 0.00056, "grad_norm": 0.6821, "tokens_per_sec": 150327, "dt_s": 4.36, "eta_s": 28327, "world_size": 1, "timestamp": "2026-05-05T00:01:19.490059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29220, "epoch": 0, "train_loss": 3.8680388182401657, "train_ppl": 47.84845448628646, "lr": 0.00056, "grad_norm": 0.6843, "tokens_per_sec": 151673, "dt_s": 4.321, "eta_s": 27588, "world_size": 1, "timestamp": "2026-05-05T00:01:23.810913"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29230, "epoch": 0, "train_loss": 3.8960303366184235, "train_ppl": 49.20672674531862, "lr": 0.00056, "grad_norm": 0.6892, "tokens_per_sec": 149148, "dt_s": 4.394, "eta_s": 27685, "world_size": 1, "timestamp": "2026-05-05T00:01:28.204922"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29240, "epoch": 0, "train_loss": 3.7974192649126053, "train_ppl": 44.58597130932923, "lr": 0.00056, "grad_norm": 0.6677, "tokens_per_sec": 151511, "dt_s": 4.325, "eta_s": 27645, "world_size": 1, "timestamp": "2026-05-05T00:01:32.530421"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29250, "epoch": 0, "train_loss": 3.7555469423532486, "train_ppl": 42.757599358349445, "lr": 0.00056, "grad_norm": 0.7415, "tokens_per_sec": 150082, "dt_s": 4.367, "eta_s": 27611, "world_size": 1, "timestamp": "2026-05-05T00:01:36.897102"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29260, "epoch": 0, "train_loss": 3.818548336625099, "train_ppl": 45.53805439012451, "lr": 0.00056, "grad_norm": 0.7955, "tokens_per_sec": 148525, "dt_s": 4.412, "eta_s": 27673, "world_size": 1, "timestamp": "2026-05-05T00:01:41.309546"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29270, "epoch": 0, "train_loss": 3.7144419699907303, "train_ppl": 41.03568155224299, "lr": 0.00056, "grad_norm": 0.7165, "tokens_per_sec": 151241, "dt_s": 4.333, "eta_s": 27685, "world_size": 1, "timestamp": "2026-05-05T00:01:45.642774"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29280, "epoch": 0, "train_loss": 3.7452321648597717, "train_ppl": 42.31883102389288, "lr": 0.00056, "grad_norm": 0.7124, "tokens_per_sec": 146271, "dt_s": 4.48, "eta_s": 27790, "world_size": 1, "timestamp": "2026-05-05T00:01:50.123236"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29290, "epoch": 0, "train_loss": 3.777731731534004, "train_ppl": 43.71676782230792, "lr": 0.00056, "grad_norm": 0.6536, "tokens_per_sec": 150367, "dt_s": 4.358, "eta_s": 27827, "world_size": 1, "timestamp": "2026-05-05T00:01:54.481605"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29300, "epoch": 0, "train_loss": 3.7880431413650513, "train_ppl": 44.169881437128865, "lr": 0.00056, "grad_norm": 0.7042, "tokens_per_sec": 150347, "dt_s": 4.359, "eta_s": 27813, "world_size": 1, "timestamp": "2026-05-05T00:01:58.840594"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29310, "epoch": 0, "train_loss": 3.6862626373767853, "train_ppl": 39.89546416572709, "lr": 0.00056, "grad_norm": 0.722, "tokens_per_sec": 150806, "dt_s": 4.346, "eta_s": 27724, "world_size": 1, "timestamp": "2026-05-05T00:02:03.186313"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29320, "epoch": 0, "train_loss": 3.8140674084424973, "train_ppl": 45.3344581296003, "lr": 0.00056, "grad_norm": 0.6746, "tokens_per_sec": 153687, "dt_s": 4.264, "eta_s": 27632, "world_size": 1, "timestamp": "2026-05-05T00:02:07.450552"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29330, "epoch": 0, "train_loss": 3.7617805004119873, "train_ppl": 43.02496378649162, "lr": 0.00056, "grad_norm": 0.6798, "tokens_per_sec": 151821, "dt_s": 4.317, "eta_s": 27420, "world_size": 1, "timestamp": "2026-05-05T00:02:11.767241"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29340, "epoch": 0, "train_loss": 3.7715917974710464, "train_ppl": 43.44917210114411, "lr": 0.00056, "grad_norm": 0.6446, "tokens_per_sec": 149175, "dt_s": 4.393, "eta_s": 27460, "world_size": 1, "timestamp": "2026-05-05T00:02:16.160472"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29350, "epoch": 0, "train_loss": 3.7683277130126953, "train_ppl": 43.30758154124526, "lr": 0.00056, "grad_norm": 0.6629, "tokens_per_sec": 150615, "dt_s": 4.351, "eta_s": 27446, "world_size": 1, "timestamp": "2026-05-05T00:02:20.511667"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29360, "epoch": 0, "train_loss": 3.656973272562027, "train_ppl": 38.74389800504707, "lr": 0.00056, "grad_norm": 0.6913, "tokens_per_sec": 149033, "dt_s": 4.397, "eta_s": 27507, "world_size": 1, "timestamp": "2026-05-05T00:02:24.909089"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29370, "epoch": 0, "train_loss": 3.7305617928504944, "train_ppl": 41.702529767439486, "lr": 0.00056, "grad_norm": 0.6988, "tokens_per_sec": 147586, "dt_s": 4.441, "eta_s": 27726, "world_size": 1, "timestamp": "2026-05-05T00:02:29.349651"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29380, "epoch": 0, "train_loss": 3.8445129841566086, "train_ppl": 46.735917675133045, "lr": 0.00056, "grad_norm": 0.6332, "tokens_per_sec": 152037, "dt_s": 4.311, "eta_s": 27714, "world_size": 1, "timestamp": "2026-05-05T00:02:33.660147"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29390, "epoch": 0, "train_loss": 3.7181944251060486, "train_ppl": 41.18995537716963, "lr": 0.00056, "grad_norm": 0.6654, "tokens_per_sec": 149616, "dt_s": 4.38, "eta_s": 27693, "world_size": 1, "timestamp": "2026-05-05T00:02:38.040434"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29400, "epoch": 0, "train_loss": 3.7794219106435776, "train_ppl": 43.790719468177905, "lr": 0.00056, "grad_norm": 0.6435, "tokens_per_sec": 149441, "dt_s": 4.385, "eta_s": 27732, "world_size": 1, "timestamp": "2026-05-05T00:02:42.425817"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29410, "epoch": 0, "train_loss": 3.857433095574379, "train_ppl": 47.343668589207915, "lr": 0.00056, "grad_norm": 0.6298, "tokens_per_sec": 151585, "dt_s": 4.323, "eta_s": 27634, "world_size": 1, "timestamp": "2026-05-05T00:02:46.749217"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29420, "epoch": 0, "train_loss": 3.77472123503685, "train_ppl": 43.58535655174614, "lr": 0.00056, "grad_norm": 0.7412, "tokens_per_sec": 147556, "dt_s": 4.441, "eta_s": 27630, "world_size": 1, "timestamp": "2026-05-05T00:02:51.190645"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29430, "epoch": 0, "train_loss": 3.759984076023102, "train_ppl": 42.947742074465175, "lr": 0.00056, "grad_norm": 0.6532, "tokens_per_sec": 150800, "dt_s": 4.346, "eta_s": 27671, "world_size": 1, "timestamp": "2026-05-05T00:02:55.536531"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29440, "epoch": 0, "train_loss": 3.8216810524463654, "train_ppl": 45.680935860251026, "lr": 0.00056, "grad_norm": 0.6438, "tokens_per_sec": 151911, "dt_s": 4.314, "eta_s": 27583, "world_size": 1, "timestamp": "2026-05-05T00:02:59.850651"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29450, "epoch": 0, "train_loss": 3.7334446012973785, "train_ppl": 41.822923625346164, "lr": 0.00056, "grad_norm": 0.7513, "tokens_per_sec": 148984, "dt_s": 4.399, "eta_s": 27595, "world_size": 1, "timestamp": "2026-05-05T00:03:04.249503"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29460, "epoch": 0, "train_loss": 3.9297811537981033, "train_ppl": 50.895838089460305, "lr": 0.00056, "grad_norm": 0.6878, "tokens_per_sec": 151817, "dt_s": 4.317, "eta_s": 27583, "world_size": 1, "timestamp": "2026-05-05T00:03:08.566291"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29470, "epoch": 0, "train_loss": 3.934073567390442, "train_ppl": 51.114773621441856, "lr": 0.00056, "grad_norm": 0.7193, "tokens_per_sec": 135383, "dt_s": 4.841, "eta_s": 28084, "world_size": 1, "timestamp": "2026-05-05T00:03:13.407049"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29480, "epoch": 0, "train_loss": 3.7674062848091125, "train_ppl": 43.26769509325533, "lr": 0.00056, "grad_norm": 0.7404, "tokens_per_sec": 150398, "dt_s": 4.357, "eta_s": 28094, "world_size": 1, "timestamp": "2026-05-05T00:03:17.764547"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29490, "epoch": 0, "train_loss": 3.785906046628952, "train_ppl": 44.075587009965666, "lr": 0.00056, "grad_norm": 0.6862, "tokens_per_sec": 152076, "dt_s": 4.309, "eta_s": 28083, "world_size": 1, "timestamp": "2026-05-05T00:03:22.073965"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29500, "epoch": 0, "train_loss": 3.7121727615594864, "train_ppl": 40.94266861047132, "lr": 0.00056, "grad_norm": 0.7147, "tokens_per_sec": 151314, "dt_s": 4.331, "eta_s": 27993, "world_size": 1, "timestamp": "2026-05-05T00:03:26.405090"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29510, "epoch": 0, "train_loss": 3.733697310090065, "train_ppl": 41.83349398143685, "lr": 0.00056, "grad_norm": 0.6461, "tokens_per_sec": 127821, "dt_s": 5.127, "eta_s": 28025, "world_size": 1, "timestamp": "2026-05-05T00:03:31.532275"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29520, "epoch": 0, "train_loss": 3.7745034098625183, "train_ppl": 43.575863597796655, "lr": 0.00056, "grad_norm": 0.6712, "tokens_per_sec": 152216, "dt_s": 4.305, "eta_s": 27344, "world_size": 1, "timestamp": "2026-05-05T00:03:35.837738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29530, "epoch": 0, "train_loss": 3.797852084040642, "train_ppl": 44.60527314735502, "lr": 0.00056, "grad_norm": 0.6343, "tokens_per_sec": 146740, "dt_s": 4.466, "eta_s": 27477, "world_size": 1, "timestamp": "2026-05-05T00:03:40.303865"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29540, "epoch": 0, "train_loss": 3.8849667459726334, "train_ppl": 48.66532411456241, "lr": 0.00056, "grad_norm": 0.6914, "tokens_per_sec": 151244, "dt_s": 4.333, "eta_s": 27502, "world_size": 1, "timestamp": "2026-05-05T00:03:44.637014"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29550, "epoch": 0, "train_loss": 3.8067896217107773, "train_ppl": 45.00572130099293, "lr": 0.00056, "grad_norm": 0.6664, "tokens_per_sec": 152146, "dt_s": 4.307, "eta_s": 27468, "world_size": 1, "timestamp": "2026-05-05T00:03:48.944435"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29560, "epoch": 0, "train_loss": 3.836162716150284, "train_ppl": 46.34728508833067, "lr": 0.00056, "grad_norm": 0.704, "tokens_per_sec": 150043, "dt_s": 4.368, "eta_s": 27493, "world_size": 1, "timestamp": "2026-05-05T00:03:53.312278"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29570, "epoch": 0, "train_loss": 3.6888841092586517, "train_ppl": 40.000186206222025, "lr": 0.00056, "grad_norm": 0.689, "tokens_per_sec": 152150, "dt_s": 4.307, "eta_s": 27491, "world_size": 1, "timestamp": "2026-05-05T00:03:57.619582"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29580, "epoch": 0, "train_loss": 3.857811987400055, "train_ppl": 47.36161011696835, "lr": 0.00056, "grad_norm": 0.7675, "tokens_per_sec": 150742, "dt_s": 4.348, "eta_s": 27337, "world_size": 1, "timestamp": "2026-05-05T00:04:01.967147"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29590, "epoch": 0, "train_loss": 3.7525389194488525, "train_ppl": 42.6291767660249, "lr": 0.00056, "grad_norm": 0.734, "tokens_per_sec": 147587, "dt_s": 4.44, "eta_s": 27468, "world_size": 1, "timestamp": "2026-05-05T00:04:06.407626"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29600, "epoch": 0, "train_loss": 3.9216800034046173, "train_ppl": 50.48518886212985, "lr": 0.00056, "grad_norm": 0.6858, "tokens_per_sec": 150722, "dt_s": 4.348, "eta_s": 27515, "world_size": 1, "timestamp": "2026-05-05T00:04:10.755775"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29610, "epoch": 0, "train_loss": 3.7093304246664047, "train_ppl": 40.826460981829236, "lr": 0.00056, "grad_norm": 0.7489, "tokens_per_sec": 147920, "dt_s": 4.43, "eta_s": 27589, "world_size": 1, "timestamp": "2026-05-05T00:04:15.186289"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29620, "epoch": 0, "train_loss": 3.695314109325409, "train_ppl": 40.25821608324388, "lr": 0.00056, "grad_norm": 0.6774, "tokens_per_sec": 149385, "dt_s": 4.387, "eta_s": 27685, "world_size": 1, "timestamp": "2026-05-05T00:04:19.573330"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29630, "epoch": 0, "train_loss": 3.7983332127332687, "train_ppl": 44.62673918765854, "lr": 0.00056, "grad_norm": 0.7309, "tokens_per_sec": 150971, "dt_s": 4.341, "eta_s": 27673, "world_size": 1, "timestamp": "2026-05-05T00:04:23.914283"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29640, "epoch": 0, "train_loss": 3.701049581170082, "train_ppl": 40.48977937569332, "lr": 0.00056, "grad_norm": 0.7248, "tokens_per_sec": 147667, "dt_s": 4.438, "eta_s": 27665, "world_size": 1, "timestamp": "2026-05-05T00:04:28.352382"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29650, "epoch": 0, "train_loss": 3.873630702495575, "train_ppl": 48.116766992655194, "lr": 0.00056, "grad_norm": 0.6845, "tokens_per_sec": 149989, "dt_s": 4.369, "eta_s": 27688, "world_size": 1, "timestamp": "2026-05-05T00:04:32.721756"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29660, "epoch": 0, "train_loss": 3.9757790118455887, "train_ppl": 53.29161553944913, "lr": 0.00056, "grad_norm": 0.6398, "tokens_per_sec": 150555, "dt_s": 4.353, "eta_s": 27586, "world_size": 1, "timestamp": "2026-05-05T00:04:37.074710"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29670, "epoch": 0, "train_loss": 3.7531336545944214, "train_ppl": 42.654537376349936, "lr": 0.00056, "grad_norm": 0.7069, "tokens_per_sec": 147626, "dt_s": 4.439, "eta_s": 27647, "world_size": 1, "timestamp": "2026-05-05T00:04:41.514036"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29680, "epoch": 0, "train_loss": 3.7999308109283447, "train_ppl": 44.69809176683651, "lr": 0.00056, "grad_norm": 0.6513, "tokens_per_sec": 150812, "dt_s": 4.346, "eta_s": 27648, "world_size": 1, "timestamp": "2026-05-05T00:04:45.859579"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29690, "epoch": 0, "train_loss": 3.738218814134598, "train_ppl": 42.023072560849066, "lr": 0.00056, "grad_norm": 0.7036, "tokens_per_sec": 149603, "dt_s": 4.381, "eta_s": 27572, "world_size": 1, "timestamp": "2026-05-05T00:04:50.240248"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29700, "epoch": 0, "train_loss": 3.7768396735191345, "train_ppl": 43.67778731820834, "lr": 0.00056, "grad_norm": 0.6966, "tokens_per_sec": 148769, "dt_s": 4.405, "eta_s": 27612, "world_size": 1, "timestamp": "2026-05-05T00:04:54.645455"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29710, "epoch": 0, "train_loss": 3.9056651443243027, "train_ppl": 49.68311536648642, "lr": 0.00056, "grad_norm": 0.6938, "tokens_per_sec": 150084, "dt_s": 4.367, "eta_s": 27625, "world_size": 1, "timestamp": "2026-05-05T00:04:59.012072"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29720, "epoch": 0, "train_loss": 3.813845068216324, "train_ppl": 45.32437957640174, "lr": 0.00056, "grad_norm": 0.647, "tokens_per_sec": 148443, "dt_s": 4.415, "eta_s": 27590, "world_size": 1, "timestamp": "2026-05-05T00:05:03.426975"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29730, "epoch": 0, "train_loss": 3.8625942319631577, "train_ppl": 47.58864736056954, "lr": 0.00056, "grad_norm": 0.6582, "tokens_per_sec": 148554, "dt_s": 4.412, "eta_s": 27669, "world_size": 1, "timestamp": "2026-05-05T00:05:07.838613"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29740, "epoch": 0, "train_loss": 3.7644993662834167, "train_ppl": 43.142102061620015, "lr": 0.00056, "grad_norm": 0.6856, "tokens_per_sec": 150679, "dt_s": 4.349, "eta_s": 27625, "world_size": 1, "timestamp": "2026-05-05T00:05:12.187956"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29750, "epoch": 0, "train_loss": 3.7368115186691284, "train_ppl": 41.96397527481384, "lr": 0.00056, "grad_norm": 0.6512, "tokens_per_sec": 150440, "dt_s": 4.356, "eta_s": 27559, "world_size": 1, "timestamp": "2026-05-05T00:05:16.544278"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29760, "epoch": 0, "train_loss": 3.042106717824936, "train_ppl": 20.949331107698168, "lr": 0.00056, "grad_norm": 6.7757, "tokens_per_sec": 134731, "dt_s": 4.864, "eta_s": 28181, "world_size": 1, "timestamp": "2026-05-05T00:05:21.408460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29770, "epoch": 0, "train_loss": 3.8312281668186188, "train_ppl": 46.11914546970375, "lr": 0.00056, "grad_norm": 0.707, "tokens_per_sec": 153432, "dt_s": 4.271, "eta_s": 27996, "world_size": 1, "timestamp": "2026-05-05T00:05:25.679786"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29780, "epoch": 0, "train_loss": 3.8659927994012833, "train_ppl": 47.75065573016187, "lr": 0.00056, "grad_norm": 0.7116, "tokens_per_sec": 151526, "dt_s": 4.325, "eta_s": 27883, "world_size": 1, "timestamp": "2026-05-05T00:05:30.004860"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29790, "epoch": 0, "train_loss": 3.77781043946743, "train_ppl": 43.72020881417418, "lr": 0.00056, "grad_norm": 0.6569, "tokens_per_sec": 151063, "dt_s": 4.338, "eta_s": 27864, "world_size": 1, "timestamp": "2026-05-05T00:05:34.343173"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29800, "epoch": 0, "train_loss": 3.8012263476848602, "train_ppl": 44.75603731485549, "lr": 0.00056, "grad_norm": 0.6766, "tokens_per_sec": 151236, "dt_s": 4.333, "eta_s": 27831, "world_size": 1, "timestamp": "2026-05-05T00:05:38.676538"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29810, "epoch": 0, "train_loss": 3.776067405939102, "train_ppl": 43.64406940039811, "lr": 0.00056, "grad_norm": 0.6807, "tokens_per_sec": 151718, "dt_s": 4.32, "eta_s": 27142, "world_size": 1, "timestamp": "2026-05-05T00:05:42.996113"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29820, "epoch": 0, "train_loss": 3.786080852150917, "train_ppl": 44.083292339406306, "lr": 0.00056, "grad_norm": 0.665, "tokens_per_sec": 149569, "dt_s": 4.382, "eta_s": 27276, "world_size": 1, "timestamp": "2026-05-05T00:05:47.377800"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29830, "epoch": 0, "train_loss": 3.762036010622978, "train_ppl": 43.03595850863866, "lr": 0.00056, "grad_norm": 0.6428, "tokens_per_sec": 147172, "dt_s": 4.453, "eta_s": 27433, "world_size": 1, "timestamp": "2026-05-05T00:05:51.830802"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29840, "epoch": 0, "train_loss": 3.7566166669130325, "train_ppl": 42.80336268521386, "lr": 0.00056, "grad_norm": 0.6521, "tokens_per_sec": 149681, "dt_s": 4.378, "eta_s": 27479, "world_size": 1, "timestamp": "2026-05-05T00:05:56.209210"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29850, "epoch": 0, "train_loss": 3.7892513126134872, "train_ppl": 44.22327846781143, "lr": 0.00056, "grad_norm": 0.6924, "tokens_per_sec": 150574, "dt_s": 4.352, "eta_s": 27498, "world_size": 1, "timestamp": "2026-05-05T00:06:00.561598"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29860, "epoch": 0, "train_loss": 3.630071833729744, "train_ppl": 37.71552576676151, "lr": 0.00056, "grad_norm": 0.9133, "tokens_per_sec": 151137, "dt_s": 4.336, "eta_s": 27515, "world_size": 1, "timestamp": "2026-05-05T00:06:04.897800"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29870, "epoch": 0, "train_loss": 3.810919865965843, "train_ppl": 45.19199032633177, "lr": 0.00056, "grad_norm": 0.7349, "tokens_per_sec": 150784, "dt_s": 4.346, "eta_s": 27466, "world_size": 1, "timestamp": "2026-05-05T00:06:09.244166"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29880, "epoch": 0, "train_loss": 3.8287240266799927, "train_ppl": 46.003801145823836, "lr": 0.00056, "grad_norm": 0.6573, "tokens_per_sec": 150116, "dt_s": 4.366, "eta_s": 27352, "world_size": 1, "timestamp": "2026-05-05T00:06:13.609832"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29890, "epoch": 0, "train_loss": 3.7858112156391144, "train_ppl": 44.07140747659961, "lr": 0.00056, "grad_norm": 0.6705, "tokens_per_sec": 148442, "dt_s": 4.415, "eta_s": 27393, "world_size": 1, "timestamp": "2026-05-05T00:06:18.024774"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29900, "epoch": 0, "train_loss": 3.8432786017656326, "train_ppl": 46.67826327243915, "lr": 0.00056, "grad_norm": 0.7638, "tokens_per_sec": 151639, "dt_s": 4.322, "eta_s": 27351, "world_size": 1, "timestamp": "2026-05-05T00:06:22.346595"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29910, "epoch": 0, "train_loss": 3.8281500190496445, "train_ppl": 45.97740219026691, "lr": 0.00056, "grad_norm": 0.7116, "tokens_per_sec": 151442, "dt_s": 4.327, "eta_s": 27335, "world_size": 1, "timestamp": "2026-05-05T00:06:26.674069"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29920, "epoch": 0, "train_loss": 3.7930456399917603, "train_ppl": 44.39139480657588, "lr": 0.00056, "grad_norm": 0.695, "tokens_per_sec": 147365, "dt_s": 4.447, "eta_s": 27458, "world_size": 1, "timestamp": "2026-05-05T00:06:31.121284"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29930, "epoch": 0, "train_loss": 3.7175568640232086, "train_ppl": 41.163702634370075, "lr": 0.00056, "grad_norm": 0.7033, "tokens_per_sec": 149620, "dt_s": 4.38, "eta_s": 27471, "world_size": 1, "timestamp": "2026-05-05T00:06:35.501462"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29940, "epoch": 0, "train_loss": 3.9183551967144012, "train_ppl": 50.317614099658904, "lr": 0.00056, "grad_norm": 0.6848, "tokens_per_sec": 149410, "dt_s": 4.386, "eta_s": 27431, "world_size": 1, "timestamp": "2026-05-05T00:06:39.887771"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29950, "epoch": 0, "train_loss": 3.839371904730797, "train_ppl": 46.49626118471019, "lr": 0.00056, "grad_norm": 0.6627, "tokens_per_sec": 150707, "dt_s": 4.349, "eta_s": 27460, "world_size": 1, "timestamp": "2026-05-05T00:06:44.236354"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29960, "epoch": 0, "train_loss": 3.742072567343712, "train_ppl": 42.185331563849445, "lr": 0.00056, "grad_norm": 0.6602, "tokens_per_sec": 148915, "dt_s": 4.401, "eta_s": 27548, "world_size": 1, "timestamp": "2026-05-05T00:06:48.637240"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29970, "epoch": 0, "train_loss": 3.7712824940681458, "train_ppl": 43.43573520250637, "lr": 0.00056, "grad_norm": 0.6696, "tokens_per_sec": 149005, "dt_s": 4.398, "eta_s": 27482, "world_size": 1, "timestamp": "2026-05-05T00:06:53.035463"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29980, "epoch": 0, "train_loss": 3.784733012318611, "train_ppl": 44.02391514651399, "lr": 0.00056, "grad_norm": 0.6344, "tokens_per_sec": 151894, "dt_s": 4.315, "eta_s": 27396, "world_size": 1, "timestamp": "2026-05-05T00:06:57.350060"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 29990, "epoch": 0, "train_loss": 3.7870758175849915, "train_ppl": 44.12717551900743, "lr": 0.00056, "grad_norm": 0.7029, "tokens_per_sec": 150088, "dt_s": 4.366, "eta_s": 27366, "world_size": 1, "timestamp": "2026-05-05T00:07:01.716553"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30000, "epoch": 0, "train_loss": 3.682277128100395, "train_ppl": 39.73677685812974, "lr": 0.00056, "grad_norm": 0.7649, "tokens_per_sec": 149983, "dt_s": 4.37, "eta_s": 27388, "world_size": 1, "timestamp": "2026-05-05T00:07:06.086119"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30010, "epoch": 0, "train_loss": 3.7482815235853195, "train_ppl": 42.44807327337363, "lr": 0.00056, "grad_norm": 0.7027, "tokens_per_sec": 128471, "dt_s": 5.101, "eta_s": 27303, "world_size": 1, "timestamp": "2026-05-05T00:07:11.187333"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30020, "epoch": 0, "train_loss": 3.8777986317873, "train_ppl": 48.31773278991824, "lr": 0.00056, "grad_norm": 0.6755, "tokens_per_sec": 150096, "dt_s": 4.366, "eta_s": 27259, "world_size": 1, "timestamp": "2026-05-05T00:07:15.553586"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30030, "epoch": 0, "train_loss": 3.750658467411995, "train_ppl": 42.54908996703738, "lr": 0.00056, "grad_norm": 0.6534, "tokens_per_sec": 146558, "dt_s": 4.472, "eta_s": 27451, "world_size": 1, "timestamp": "2026-05-05T00:07:20.025287"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30040, "epoch": 0, "train_loss": 3.8661889731884003, "train_ppl": 47.76002407601567, "lr": 0.00056, "grad_norm": 0.7044, "tokens_per_sec": 149582, "dt_s": 4.381, "eta_s": 27465, "world_size": 1, "timestamp": "2026-05-05T00:07:24.406553"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30050, "epoch": 0, "train_loss": 3.7636371105909348, "train_ppl": 43.10491857167346, "lr": 0.00056, "grad_norm": 0.7013, "tokens_per_sec": 145801, "dt_s": 4.495, "eta_s": 27618, "world_size": 1, "timestamp": "2026-05-05T00:07:28.901449"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30060, "epoch": 0, "train_loss": 3.8292580395936966, "train_ppl": 46.02837433033014, "lr": 0.00056, "grad_norm": 0.6707, "tokens_per_sec": 135062, "dt_s": 4.852, "eta_s": 28260, "world_size": 1, "timestamp": "2026-05-05T00:07:33.753724"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30070, "epoch": 0, "train_loss": 3.846825137734413, "train_ppl": 46.844103317109976, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 149881, "dt_s": 4.373, "eta_s": 28263, "world_size": 1, "timestamp": "2026-05-05T00:07:38.126266"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30080, "epoch": 0, "train_loss": 3.7911866158246994, "train_ppl": 44.30894679099249, "lr": 0.00056, "grad_norm": 0.7842, "tokens_per_sec": 147881, "dt_s": 4.432, "eta_s": 28208, "world_size": 1, "timestamp": "2026-05-05T00:07:42.557907"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30090, "epoch": 0, "train_loss": 3.7673006653785706, "train_ppl": 43.26312542526583, "lr": 0.00056, "grad_norm": 0.7149, "tokens_per_sec": 151191, "dt_s": 4.335, "eta_s": 28146, "world_size": 1, "timestamp": "2026-05-05T00:07:46.892548"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30100, "epoch": 0, "train_loss": 3.857630804181099, "train_ppl": 47.353029765323754, "lr": 0.00056, "grad_norm": 0.6692, "tokens_per_sec": 147947, "dt_s": 4.43, "eta_s": 28059, "world_size": 1, "timestamp": "2026-05-05T00:07:51.322260"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30110, "epoch": 0, "train_loss": 3.801088869571686, "train_ppl": 44.749884762222635, "lr": 0.00056, "grad_norm": 0.6888, "tokens_per_sec": 147602, "dt_s": 4.44, "eta_s": 27539, "world_size": 1, "timestamp": "2026-05-05T00:07:55.762312"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30120, "epoch": 0, "train_loss": 3.706592082977295, "train_ppl": 40.714817111000386, "lr": 0.00056, "grad_norm": 0.7021, "tokens_per_sec": 151881, "dt_s": 4.315, "eta_s": 27463, "world_size": 1, "timestamp": "2026-05-05T00:08:00.077299"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30130, "epoch": 0, "train_loss": 3.790701687335968, "train_ppl": 44.287465329295195, "lr": 0.00056, "grad_norm": 0.68, "tokens_per_sec": 148893, "dt_s": 4.402, "eta_s": 27421, "world_size": 1, "timestamp": "2026-05-05T00:08:04.478807"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30140, "epoch": 0, "train_loss": 3.816761925816536, "train_ppl": 45.4567773362795, "lr": 0.00056, "grad_norm": 0.6571, "tokens_per_sec": 151357, "dt_s": 4.33, "eta_s": 27410, "world_size": 1, "timestamp": "2026-05-05T00:08:08.808699"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30150, "epoch": 0, "train_loss": 3.6581498235464096, "train_ppl": 38.78950900295447, "lr": 0.00056, "grad_norm": 0.6946, "tokens_per_sec": 151764, "dt_s": 4.318, "eta_s": 27266, "world_size": 1, "timestamp": "2026-05-05T00:08:13.126962"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30160, "epoch": 0, "train_loss": 3.7299559712409973, "train_ppl": 41.67727312501803, "lr": 0.00056, "grad_norm": 0.6865, "tokens_per_sec": 146854, "dt_s": 4.463, "eta_s": 27290, "world_size": 1, "timestamp": "2026-05-05T00:08:17.589640"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30170, "epoch": 0, "train_loss": 3.8607890009880066, "train_ppl": 47.502816355992, "lr": 0.00056, "grad_norm": 0.6731, "tokens_per_sec": 152840, "dt_s": 4.288, "eta_s": 27252, "world_size": 1, "timestamp": "2026-05-05T00:08:21.877513"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30180, "epoch": 0, "train_loss": 3.763956367969513, "train_ppl": 43.11868233195458, "lr": 0.00056, "grad_norm": 0.6615, "tokens_per_sec": 150536, "dt_s": 4.353, "eta_s": 27188, "world_size": 1, "timestamp": "2026-05-05T00:08:26.231018"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30190, "epoch": 0, "train_loss": 3.670631378889084, "train_ppl": 39.276696510344, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 149796, "dt_s": 4.375, "eta_s": 27240, "world_size": 1, "timestamp": "2026-05-05T00:08:30.606038"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30200, "epoch": 0, "train_loss": 3.836795762181282, "train_ppl": 46.3766343419375, "lr": 0.00056, "grad_norm": 0.7749, "tokens_per_sec": 151780, "dt_s": 4.318, "eta_s": 27235, "world_size": 1, "timestamp": "2026-05-05T00:08:34.923837"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30210, "epoch": 0, "train_loss": 3.7860655933618546, "train_ppl": 44.08261968687926, "lr": 0.00056, "grad_norm": 0.6984, "tokens_per_sec": 151084, "dt_s": 4.338, "eta_s": 27074, "world_size": 1, "timestamp": "2026-05-05T00:08:39.261610"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30220, "epoch": 0, "train_loss": 3.799665540456772, "train_ppl": 44.68623625548318, "lr": 0.00056, "grad_norm": 0.7094, "tokens_per_sec": 149052, "dt_s": 4.397, "eta_s": 27206, "world_size": 1, "timestamp": "2026-05-05T00:08:43.658433"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30230, "epoch": 0, "train_loss": 3.6902341097593307, "train_ppl": 40.05422294423253, "lr": 0.00056, "grad_norm": 0.6698, "tokens_per_sec": 151750, "dt_s": 4.319, "eta_s": 27158, "world_size": 1, "timestamp": "2026-05-05T00:08:47.977101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30240, "epoch": 0, "train_loss": 3.768584430217743, "train_ppl": 43.31870076972346, "lr": 0.00056, "grad_norm": 0.6552, "tokens_per_sec": 149997, "dt_s": 4.369, "eta_s": 27147, "world_size": 1, "timestamp": "2026-05-05T00:08:52.346270"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30250, "epoch": 0, "train_loss": 3.827886402606964, "train_ppl": 45.965283388485716, "lr": 0.00056, "grad_norm": 0.7394, "tokens_per_sec": 150312, "dt_s": 4.36, "eta_s": 27195, "world_size": 1, "timestamp": "2026-05-05T00:08:56.706277"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30260, "epoch": 0, "train_loss": 3.8102461844682693, "train_ppl": 45.16155557142707, "lr": 0.00056, "grad_norm": 0.6864, "tokens_per_sec": 151831, "dt_s": 4.316, "eta_s": 27164, "world_size": 1, "timestamp": "2026-05-05T00:09:01.022688"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30270, "epoch": 0, "train_loss": 3.852615475654602, "train_ppl": 47.11613331764636, "lr": 0.00056, "grad_norm": 0.6667, "tokens_per_sec": 147809, "dt_s": 4.434, "eta_s": 27206, "world_size": 1, "timestamp": "2026-05-05T00:09:05.456489"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30280, "epoch": 0, "train_loss": 3.752997577190399, "train_ppl": 42.64873345253439, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 151558, "dt_s": 4.324, "eta_s": 27208, "world_size": 1, "timestamp": "2026-05-05T00:09:09.780629"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30290, "epoch": 0, "train_loss": 3.837158814072609, "train_ppl": 46.39347452349363, "lr": 0.00056, "grad_norm": 0.6835, "tokens_per_sec": 151744, "dt_s": 4.319, "eta_s": 27141, "world_size": 1, "timestamp": "2026-05-05T00:09:14.099499"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30300, "epoch": 0, "train_loss": 3.8254616260528564, "train_ppl": 45.853962865262545, "lr": 0.00056, "grad_norm": 0.6536, "tokens_per_sec": 148082, "dt_s": 4.426, "eta_s": 27219, "world_size": 1, "timestamp": "2026-05-05T00:09:18.525142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30310, "epoch": 0, "train_loss": 3.8162565529346466, "train_ppl": 45.433810517608144, "lr": 0.00056, "grad_norm": 0.673, "tokens_per_sec": 152448, "dt_s": 4.299, "eta_s": 27192, "world_size": 1, "timestamp": "2026-05-05T00:09:22.824061"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30320, "epoch": 0, "train_loss": 3.8543609380722046, "train_ppl": 47.198444572318124, "lr": 0.00056, "grad_norm": 0.664, "tokens_per_sec": 152389, "dt_s": 4.301, "eta_s": 27022, "world_size": 1, "timestamp": "2026-05-05T00:09:27.124652"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30330, "epoch": 0, "train_loss": 3.70001021027565, "train_ppl": 40.447717340302525, "lr": 0.00056, "grad_norm": 0.6654, "tokens_per_sec": 148211, "dt_s": 4.422, "eta_s": 27139, "world_size": 1, "timestamp": "2026-05-05T00:09:31.546458"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30340, "epoch": 0, "train_loss": 3.853669285774231, "train_ppl": 47.16581094653098, "lr": 0.00056, "grad_norm": 0.6935, "tokens_per_sec": 152932, "dt_s": 4.285, "eta_s": 27093, "world_size": 1, "timestamp": "2026-05-05T00:09:35.831772"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30350, "epoch": 0, "train_loss": 3.829870879650116, "train_ppl": 46.05659100712341, "lr": 0.00056, "grad_norm": 0.626, "tokens_per_sec": 134862, "dt_s": 4.859, "eta_s": 27630, "world_size": 1, "timestamp": "2026-05-05T00:09:40.691278"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30360, "epoch": 0, "train_loss": 3.722348690032959, "train_ppl": 41.36142528324137, "lr": 0.00056, "grad_norm": 0.642, "tokens_per_sec": 150542, "dt_s": 4.353, "eta_s": 27693, "world_size": 1, "timestamp": "2026-05-05T00:09:45.044596"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30370, "epoch": 0, "train_loss": 3.797916531562805, "train_ppl": 44.6081479393204, "lr": 0.00056, "grad_norm": 0.6676, "tokens_per_sec": 151859, "dt_s": 4.316, "eta_s": 27707, "world_size": 1, "timestamp": "2026-05-05T00:09:49.360195"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30380, "epoch": 0, "train_loss": 3.8221947699785233, "train_ppl": 45.70440898664988, "lr": 0.00056, "grad_norm": 0.7044, "tokens_per_sec": 149275, "dt_s": 4.39, "eta_s": 27664, "world_size": 1, "timestamp": "2026-05-05T00:09:53.750483"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30390, "epoch": 0, "train_loss": 3.9247763752937317, "train_ppl": 50.641752045571536, "lr": 0.00056, "grad_norm": 0.8043, "tokens_per_sec": 152035, "dt_s": 4.311, "eta_s": 27691, "world_size": 1, "timestamp": "2026-05-05T00:09:58.061069"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30400, "epoch": 0, "train_loss": 3.850348472595215, "train_ppl": 47.00944187986998, "lr": 0.00056, "grad_norm": 0.689, "tokens_per_sec": 152338, "dt_s": 4.302, "eta_s": 26992, "world_size": 1, "timestamp": "2026-05-05T00:10:02.363091"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30410, "epoch": 0, "train_loss": 3.7755440175533295, "train_ppl": 43.621232578151165, "lr": 0.00056, "grad_norm": 0.6702, "tokens_per_sec": 148623, "dt_s": 4.41, "eta_s": 27057, "world_size": 1, "timestamp": "2026-05-05T00:10:06.772629"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30420, "epoch": 0, "train_loss": 3.8588763773441315, "train_ppl": 47.41204817663199, "lr": 0.00056, "grad_norm": 0.6727, "tokens_per_sec": 150126, "dt_s": 4.365, "eta_s": 27115, "world_size": 1, "timestamp": "2026-05-05T00:10:11.138035"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30430, "epoch": 0, "train_loss": 3.741752505302429, "train_ppl": 42.171831801013006, "lr": 0.00056, "grad_norm": 0.6655, "tokens_per_sec": 153307, "dt_s": 4.275, "eta_s": 26967, "world_size": 1, "timestamp": "2026-05-05T00:10:15.412845"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30440, "epoch": 0, "train_loss": 3.7795169204473495, "train_ppl": 43.794880213494274, "lr": 0.00056, "grad_norm": 0.6769, "tokens_per_sec": 150213, "dt_s": 4.363, "eta_s": 27028, "world_size": 1, "timestamp": "2026-05-05T00:10:19.775727"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30450, "epoch": 0, "train_loss": 3.821145221590996, "train_ppl": 45.65646516197803, "lr": 0.00056, "grad_norm": 0.6995, "tokens_per_sec": 152480, "dt_s": 4.298, "eta_s": 27019, "world_size": 1, "timestamp": "2026-05-05T00:10:24.073743"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30460, "epoch": 0, "train_loss": 3.868354916572571, "train_ppl": 47.86358169367429, "lr": 0.00056, "grad_norm": 0.6438, "tokens_per_sec": 151814, "dt_s": 4.317, "eta_s": 26899, "world_size": 1, "timestamp": "2026-05-05T00:10:28.390587"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30470, "epoch": 0, "train_loss": 3.921255350112915, "train_ppl": 50.46375471186067, "lr": 0.00056, "grad_norm": 0.767, "tokens_per_sec": 151022, "dt_s": 4.34, "eta_s": 26862, "world_size": 1, "timestamp": "2026-05-05T00:10:32.730100"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30480, "epoch": 0, "train_loss": 3.812171518802643, "train_ppl": 45.24859042367124, "lr": 0.00056, "grad_norm": 0.6589, "tokens_per_sec": 152976, "dt_s": 4.284, "eta_s": 26870, "world_size": 1, "timestamp": "2026-05-05T00:10:37.014164"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30490, "epoch": 0, "train_loss": 3.7726502269506454, "train_ppl": 43.495184331815295, "lr": 0.00056, "grad_norm": 0.6691, "tokens_per_sec": 149742, "dt_s": 4.377, "eta_s": 26882, "world_size": 1, "timestamp": "2026-05-05T00:10:41.390788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30500, "epoch": 0, "train_loss": 3.8669584542512894, "train_ppl": 47.79678865311419, "lr": 0.00056, "grad_norm": 0.6786, "tokens_per_sec": 148351, "dt_s": 4.418, "eta_s": 27027, "world_size": 1, "timestamp": "2026-05-05T00:10:45.808411"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30510, "epoch": 0, "train_loss": 3.7638400197029114, "train_ppl": 43.11366583984286, "lr": 0.00056, "grad_norm": 0.6389, "tokens_per_sec": 108853, "dt_s": 6.021, "eta_s": 27202, "world_size": 1, "timestamp": "2026-05-05T00:10:51.829016"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30520, "epoch": 0, "train_loss": 3.738948702812195, "train_ppl": 42.053755922065854, "lr": 0.00056, "grad_norm": 0.6775, "tokens_per_sec": 144936, "dt_s": 4.522, "eta_s": 27424, "world_size": 1, "timestamp": "2026-05-05T00:10:56.350728"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30530, "epoch": 0, "train_loss": 3.8586980253458023, "train_ppl": 47.40359289712522, "lr": 0.00056, "grad_norm": 0.7283, "tokens_per_sec": 150250, "dt_s": 4.362, "eta_s": 27517, "world_size": 1, "timestamp": "2026-05-05T00:11:00.712520"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30540, "epoch": 0, "train_loss": 3.77341228723526, "train_ppl": 43.528342917182364, "lr": 0.00056, "grad_norm": 0.6641, "tokens_per_sec": 149921, "dt_s": 4.371, "eta_s": 27506, "world_size": 1, "timestamp": "2026-05-05T00:11:05.083879"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30550, "epoch": 0, "train_loss": 3.9276088774204254, "train_ppl": 50.785398259002356, "lr": 0.00056, "grad_norm": 0.6768, "tokens_per_sec": 150139, "dt_s": 4.365, "eta_s": 27436, "world_size": 1, "timestamp": "2026-05-05T00:11:09.448913"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30560, "epoch": 0, "train_loss": 3.823349639773369, "train_ppl": 45.75722211835527, "lr": 0.00056, "grad_norm": 0.7117, "tokens_per_sec": 149531, "dt_s": 4.383, "eta_s": 27334, "world_size": 1, "timestamp": "2026-05-05T00:11:13.831708"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30570, "epoch": 0, "train_loss": 3.7860194593667984, "train_ppl": 44.08058602643136, "lr": 0.00056, "grad_norm": 0.6668, "tokens_per_sec": 147284, "dt_s": 4.45, "eta_s": 27240, "world_size": 1, "timestamp": "2026-05-05T00:11:18.281302"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30580, "epoch": 0, "train_loss": 3.8218702375888824, "train_ppl": 45.68957883214722, "lr": 0.00056, "grad_norm": 0.7527, "tokens_per_sec": 149289, "dt_s": 4.39, "eta_s": 27270, "world_size": 1, "timestamp": "2026-05-05T00:11:22.671185"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30590, "epoch": 0, "train_loss": 3.7402203381061554, "train_ppl": 42.10726697840302, "lr": 0.00056, "grad_norm": 0.782, "tokens_per_sec": 150805, "dt_s": 4.346, "eta_s": 27234, "world_size": 1, "timestamp": "2026-05-05T00:11:27.016905"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30600, "epoch": 0, "train_loss": 3.878881797194481, "train_ppl": 48.37009724119107, "lr": 0.00056, "grad_norm": 0.6736, "tokens_per_sec": 148992, "dt_s": 4.399, "eta_s": 27271, "world_size": 1, "timestamp": "2026-05-05T00:11:31.415519"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30610, "epoch": 0, "train_loss": 3.7570700496435165, "train_ppl": 42.822773390568784, "lr": 0.00056, "grad_norm": 0.6283, "tokens_per_sec": 148766, "dt_s": 4.405, "eta_s": 27295, "world_size": 1, "timestamp": "2026-05-05T00:11:35.820843"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30620, "epoch": 0, "train_loss": 3.8175406455993652, "train_ppl": 45.492189214230386, "lr": 0.00056, "grad_norm": 0.656, "tokens_per_sec": 150455, "dt_s": 4.356, "eta_s": 27174, "world_size": 1, "timestamp": "2026-05-05T00:11:40.176707"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30630, "epoch": 0, "train_loss": 3.799601763486862, "train_ppl": 44.68338639361688, "lr": 0.00056, "grad_norm": 0.701, "tokens_per_sec": 148504, "dt_s": 4.413, "eta_s": 27199, "world_size": 1, "timestamp": "2026-05-05T00:11:44.589769"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30640, "epoch": 0, "train_loss": 3.6770908385515213, "train_ppl": 39.53122391687981, "lr": 0.00056, "grad_norm": 0.6736, "tokens_per_sec": 151478, "dt_s": 4.326, "eta_s": 27170, "world_size": 1, "timestamp": "2026-05-05T00:11:48.916202"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30650, "epoch": 0, "train_loss": 3.8173487931489944, "train_ppl": 45.48346226342712, "lr": 0.00056, "grad_norm": 0.746, "tokens_per_sec": 135756, "dt_s": 4.827, "eta_s": 27698, "world_size": 1, "timestamp": "2026-05-05T00:11:53.743661"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30660, "epoch": 0, "train_loss": 3.7334004640579224, "train_ppl": 41.82107771768829, "lr": 0.00056, "grad_norm": 0.6541, "tokens_per_sec": 148252, "dt_s": 4.421, "eta_s": 27712, "world_size": 1, "timestamp": "2026-05-05T00:11:58.164270"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30670, "epoch": 0, "train_loss": 3.807699739933014, "train_ppl": 45.046700473170894, "lr": 0.00056, "grad_norm": 0.7632, "tokens_per_sec": 151197, "dt_s": 4.334, "eta_s": 27681, "world_size": 1, "timestamp": "2026-05-05T00:12:02.498730"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30680, "epoch": 0, "train_loss": 3.735383152961731, "train_ppl": 41.904078159261786, "lr": 0.00056, "grad_norm": 0.6694, "tokens_per_sec": 150307, "dt_s": 4.36, "eta_s": 27611, "world_size": 1, "timestamp": "2026-05-05T00:12:06.858882"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30690, "epoch": 0, "train_loss": 3.801370248198509, "train_ppl": 44.762478195025864, "lr": 0.00056, "grad_norm": 0.7097, "tokens_per_sec": 150682, "dt_s": 4.349, "eta_s": 27635, "world_size": 1, "timestamp": "2026-05-05T00:12:11.208172"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30700, "epoch": 0, "train_loss": 3.8294282853603363, "train_ppl": 46.03621113327752, "lr": 0.00056, "grad_norm": 0.6683, "tokens_per_sec": 152611, "dt_s": 4.294, "eta_s": 26970, "world_size": 1, "timestamp": "2026-05-05T00:12:15.502475"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30710, "epoch": 0, "train_loss": 3.805936872959137, "train_ppl": 44.96735908732643, "lr": 0.00056, "grad_norm": 0.6606, "tokens_per_sec": 147463, "dt_s": 4.444, "eta_s": 26995, "world_size": 1, "timestamp": "2026-05-05T00:12:19.946721"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30720, "epoch": 0, "train_loss": 3.7821643352508545, "train_ppl": 43.91097703810694, "lr": 0.00056, "grad_norm": 0.6654, "tokens_per_sec": 150461, "dt_s": 4.356, "eta_s": 27016, "world_size": 1, "timestamp": "2026-05-05T00:12:24.302386"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30730, "epoch": 0, "train_loss": 3.8362938463687897, "train_ppl": 46.35336301644277, "lr": 0.00056, "grad_norm": 0.6811, "tokens_per_sec": 151011, "dt_s": 4.34, "eta_s": 26987, "world_size": 1, "timestamp": "2026-05-05T00:12:28.642225"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30740, "epoch": 0, "train_loss": 3.8019239753484726, "train_ppl": 44.787271258164694, "lr": 0.00056, "grad_norm": 0.661, "tokens_per_sec": 151298, "dt_s": 4.332, "eta_s": 26961, "world_size": 1, "timestamp": "2026-05-05T00:12:32.973794"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30750, "epoch": 0, "train_loss": 3.8155747056007385, "train_ppl": 45.402842154085285, "lr": 0.00056, "grad_norm": 0.6815, "tokens_per_sec": 152079, "dt_s": 4.309, "eta_s": 26975, "world_size": 1, "timestamp": "2026-05-05T00:12:37.283139"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30760, "epoch": 0, "train_loss": 3.7448261380195618, "train_ppl": 42.3016519304734, "lr": 0.00056, "grad_norm": 0.6621, "tokens_per_sec": 150397, "dt_s": 4.358, "eta_s": 26863, "world_size": 1, "timestamp": "2026-05-05T00:12:41.640695"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30770, "epoch": 0, "train_loss": 3.6852023005485535, "train_ppl": 39.853183955386186, "lr": 0.00056, "grad_norm": 0.6842, "tokens_per_sec": 149153, "dt_s": 4.394, "eta_s": 26906, "world_size": 1, "timestamp": "2026-05-05T00:12:46.034569"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30780, "epoch": 0, "train_loss": 3.7413587868213654, "train_ppl": 42.155231239640976, "lr": 0.00056, "grad_norm": 0.6654, "tokens_per_sec": 146515, "dt_s": 4.473, "eta_s": 27067, "world_size": 1, "timestamp": "2026-05-05T00:12:50.507563"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30790, "epoch": 0, "train_loss": 3.8283361941576004, "train_ppl": 45.98596283494796, "lr": 0.00056, "grad_norm": 0.6906, "tokens_per_sec": 146666, "dt_s": 4.468, "eta_s": 27232, "world_size": 1, "timestamp": "2026-05-05T00:12:54.975946"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30800, "epoch": 0, "train_loss": 3.7021793723106384, "train_ppl": 40.53555022059399, "lr": 0.00056, "grad_norm": 0.6651, "tokens_per_sec": 149186, "dt_s": 4.393, "eta_s": 27331, "world_size": 1, "timestamp": "2026-05-05T00:12:59.368851"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30810, "epoch": 0, "train_loss": 3.7903376519680023, "train_ppl": 44.271346059727904, "lr": 0.00056, "grad_norm": 0.7683, "tokens_per_sec": 150313, "dt_s": 4.36, "eta_s": 27329, "world_size": 1, "timestamp": "2026-05-05T00:13:03.728843"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30820, "epoch": 0, "train_loss": 3.8224552124738693, "train_ppl": 45.71631390718099, "lr": 0.00056, "grad_norm": 0.6277, "tokens_per_sec": 148683, "dt_s": 4.408, "eta_s": 27342, "world_size": 1, "timestamp": "2026-05-05T00:13:08.136608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30830, "epoch": 0, "train_loss": 3.831232324242592, "train_ppl": 46.119337206943314, "lr": 0.00056, "grad_norm": 0.7471, "tokens_per_sec": 151093, "dt_s": 4.337, "eta_s": 27170, "world_size": 1, "timestamp": "2026-05-05T00:13:12.474060"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30840, "epoch": 0, "train_loss": 3.846106678247452, "train_ppl": 46.81045981386783, "lr": 0.00056, "grad_norm": 0.6595, "tokens_per_sec": 150810, "dt_s": 4.346, "eta_s": 27014, "world_size": 1, "timestamp": "2026-05-05T00:13:16.819647"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30850, "epoch": 0, "train_loss": 3.7308071702718735, "train_ppl": 41.71276388221781, "lr": 0.00056, "grad_norm": 0.6897, "tokens_per_sec": 149785, "dt_s": 4.375, "eta_s": 26988, "world_size": 1, "timestamp": "2026-05-05T00:13:21.195008"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30860, "epoch": 0, "train_loss": 3.8758521676063538, "train_ppl": 48.223775525627694, "lr": 0.00056, "grad_norm": 1.3829, "tokens_per_sec": 150285, "dt_s": 4.361, "eta_s": 26985, "world_size": 1, "timestamp": "2026-05-05T00:13:25.555802"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30870, "epoch": 0, "train_loss": 3.6133382320404053, "train_ppl": 37.08966027714386, "lr": 0.00056, "grad_norm": 0.6849, "tokens_per_sec": 150810, "dt_s": 4.346, "eta_s": 26903, "world_size": 1, "timestamp": "2026-05-05T00:13:29.901401"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30880, "epoch": 0, "train_loss": 3.8700482845306396, "train_ppl": 47.944701012316145, "lr": 0.00056, "grad_norm": 0.6826, "tokens_per_sec": 151042, "dt_s": 4.339, "eta_s": 26901, "world_size": 1, "timestamp": "2026-05-05T00:13:34.240338"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30890, "epoch": 0, "train_loss": 3.7837967723608017, "train_ppl": 43.982717486511646, "lr": 0.00056, "grad_norm": 0.6878, "tokens_per_sec": 151306, "dt_s": 4.331, "eta_s": 26879, "world_size": 1, "timestamp": "2026-05-05T00:13:38.571675"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30900, "epoch": 0, "train_loss": 4.005522504448891, "train_ppl": 54.900502663045145, "lr": 0.00056, "grad_norm": 0.7175, "tokens_per_sec": 151003, "dt_s": 4.34, "eta_s": 26831, "world_size": 1, "timestamp": "2026-05-05T00:13:42.911725"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30910, "epoch": 0, "train_loss": 3.7333571761846542, "train_ppl": 41.819267411358545, "lr": 0.00056, "grad_norm": 0.6745, "tokens_per_sec": 152540, "dt_s": 4.296, "eta_s": 26747, "world_size": 1, "timestamp": "2026-05-05T00:13:47.208042"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30920, "epoch": 0, "train_loss": 3.8212743550539017, "train_ppl": 45.662361320115906, "lr": 0.00056, "grad_norm": 0.732, "tokens_per_sec": 149307, "dt_s": 4.389, "eta_s": 26796, "world_size": 1, "timestamp": "2026-05-05T00:13:51.597395"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30930, "epoch": 0, "train_loss": 3.8472313582897186, "train_ppl": 46.863136220288766, "lr": 0.00056, "grad_norm": 0.6609, "tokens_per_sec": 149219, "dt_s": 4.392, "eta_s": 26858, "world_size": 1, "timestamp": "2026-05-05T00:13:55.989324"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30940, "epoch": 0, "train_loss": 3.7805546671152115, "train_ppl": 43.84035179441527, "lr": 0.00056, "grad_norm": 0.6411, "tokens_per_sec": 135163, "dt_s": 4.849, "eta_s": 27492, "world_size": 1, "timestamp": "2026-05-05T00:14:00.837973"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30950, "epoch": 0, "train_loss": 3.689931094646454, "train_ppl": 40.04208774801266, "lr": 0.00056, "grad_norm": 0.9037, "tokens_per_sec": 150375, "dt_s": 4.358, "eta_s": 27510, "world_size": 1, "timestamp": "2026-05-05T00:14:05.196142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30960, "epoch": 0, "train_loss": 3.7518051862716675, "train_ppl": 42.59790979692462, "lr": 0.00056, "grad_norm": 0.636, "tokens_per_sec": 150417, "dt_s": 4.357, "eta_s": 27580, "world_size": 1, "timestamp": "2026-05-05T00:14:09.553100"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30970, "epoch": 0, "train_loss": 3.77759850025177, "train_ppl": 43.71094376925747, "lr": 0.00056, "grad_norm": 0.6242, "tokens_per_sec": 151849, "dt_s": 4.316, "eta_s": 27485, "world_size": 1, "timestamp": "2026-05-05T00:14:13.868979"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30980, "epoch": 0, "train_loss": 3.8439377546310425, "train_ppl": 46.709041526100194, "lr": 0.00056, "grad_norm": 0.6628, "tokens_per_sec": 151032, "dt_s": 4.339, "eta_s": 27416, "world_size": 1, "timestamp": "2026-05-05T00:14:18.208202"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 30990, "epoch": 0, "train_loss": 3.7013313621282578, "train_ppl": 40.50119023212765, "lr": 0.00056, "grad_norm": 0.6444, "tokens_per_sec": 150385, "dt_s": 4.358, "eta_s": 26806, "world_size": 1, "timestamp": "2026-05-05T00:14:22.566058"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31000, "epoch": 0, "train_loss": 3.869736924767494, "train_ppl": 47.929775285320865, "lr": 0.00056, "grad_norm": 0.6627, "tokens_per_sec": 151569, "dt_s": 4.324, "eta_s": 26759, "world_size": 1, "timestamp": "2026-05-05T00:14:26.889889"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31010, "epoch": 0, "train_loss": 3.8321648091077805, "train_ppl": 46.16236284814327, "lr": 0.00056, "grad_norm": 0.6683, "tokens_per_sec": 125675, "dt_s": 5.215, "eta_s": 26867, "world_size": 1, "timestamp": "2026-05-05T00:14:32.104600"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31020, "epoch": 0, "train_loss": 3.8136911392211914, "train_ppl": 45.31740337713199, "lr": 0.00056, "grad_norm": 0.6492, "tokens_per_sec": 151331, "dt_s": 4.331, "eta_s": 26881, "world_size": 1, "timestamp": "2026-05-05T00:14:36.435247"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31030, "epoch": 0, "train_loss": 3.673896089196205, "train_ppl": 39.40513308630103, "lr": 0.00056, "grad_norm": 0.6539, "tokens_per_sec": 149969, "dt_s": 4.37, "eta_s": 26915, "world_size": 1, "timestamp": "2026-05-05T00:14:40.805241"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31040, "epoch": 0, "train_loss": 3.8040103167295456, "train_ppl": 44.88081033883342, "lr": 0.00056, "grad_norm": 0.6459, "tokens_per_sec": 147456, "dt_s": 4.444, "eta_s": 27017, "world_size": 1, "timestamp": "2026-05-05T00:14:45.249649"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31050, "epoch": 0, "train_loss": 3.8456673473119736, "train_ppl": 46.78989904758868, "lr": 0.00056, "grad_norm": 0.6654, "tokens_per_sec": 150188, "dt_s": 4.364, "eta_s": 27062, "world_size": 1, "timestamp": "2026-05-05T00:14:49.613277"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31060, "epoch": 0, "train_loss": 3.771105647087097, "train_ppl": 43.428054403049295, "lr": 0.00056, "grad_norm": 0.6577, "tokens_per_sec": 150235, "dt_s": 4.362, "eta_s": 26951, "world_size": 1, "timestamp": "2026-05-05T00:14:53.975484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31070, "epoch": 0, "train_loss": 3.838458314538002, "train_ppl": 46.45380205456586, "lr": 0.00056, "grad_norm": 0.6737, "tokens_per_sec": 150866, "dt_s": 4.344, "eta_s": 26963, "world_size": 1, "timestamp": "2026-05-05T00:14:58.319466"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31080, "epoch": 0, "train_loss": 3.8533490002155304, "train_ppl": 47.15070683736354, "lr": 0.00056, "grad_norm": 0.6722, "tokens_per_sec": 150388, "dt_s": 4.358, "eta_s": 26944, "world_size": 1, "timestamp": "2026-05-05T00:15:02.677266"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31090, "epoch": 0, "train_loss": 3.8891681432724, "train_ppl": 48.87021659184771, "lr": 0.00056, "grad_norm": 0.6908, "tokens_per_sec": 150136, "dt_s": 4.365, "eta_s": 26842, "world_size": 1, "timestamp": "2026-05-05T00:15:07.042367"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31100, "epoch": 0, "train_loss": 3.8304414451122284, "train_ppl": 46.08287680542594, "lr": 0.00056, "grad_norm": 0.6448, "tokens_per_sec": 150675, "dt_s": 4.349, "eta_s": 26820, "world_size": 1, "timestamp": "2026-05-05T00:15:11.391832"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31110, "epoch": 0, "train_loss": 3.805462285876274, "train_ppl": 44.94602322281644, "lr": 0.00056, "grad_norm": 0.6422, "tokens_per_sec": 150595, "dt_s": 4.352, "eta_s": 26803, "world_size": 1, "timestamp": "2026-05-05T00:15:15.743671"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31120, "epoch": 0, "train_loss": 3.802912786602974, "train_ppl": 44.8315793185874, "lr": 0.00056, "grad_norm": 0.6318, "tokens_per_sec": 148203, "dt_s": 4.422, "eta_s": 26895, "world_size": 1, "timestamp": "2026-05-05T00:15:20.165712"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31130, "epoch": 0, "train_loss": 3.573111206293106, "train_ppl": 35.62726467144461, "lr": 0.00056, "grad_norm": 0.7285, "tokens_per_sec": 150574, "dt_s": 4.352, "eta_s": 26884, "world_size": 1, "timestamp": "2026-05-05T00:15:24.518112"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31140, "epoch": 0, "train_loss": 3.8139811605215073, "train_ppl": 45.33054829544737, "lr": 0.00056, "grad_norm": 0.6975, "tokens_per_sec": 151760, "dt_s": 4.318, "eta_s": 26822, "world_size": 1, "timestamp": "2026-05-05T00:15:28.836506"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31150, "epoch": 0, "train_loss": 3.879887640476227, "train_ppl": 48.41877445525891, "lr": 0.00056, "grad_norm": 0.6557, "tokens_per_sec": 149061, "dt_s": 4.397, "eta_s": 26875, "world_size": 1, "timestamp": "2026-05-05T00:15:33.233115"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31160, "epoch": 0, "train_loss": 3.7870015799999237, "train_ppl": 44.123899745655294, "lr": 0.00056, "grad_norm": 0.7477, "tokens_per_sec": 152579, "dt_s": 4.295, "eta_s": 26801, "world_size": 1, "timestamp": "2026-05-05T00:15:37.528348"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31170, "epoch": 0, "train_loss": 3.6397594809532166, "train_ppl": 38.08267601484644, "lr": 0.00056, "grad_norm": 0.709, "tokens_per_sec": 150645, "dt_s": 4.35, "eta_s": 26709, "world_size": 1, "timestamp": "2026-05-05T00:15:41.878686"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31180, "epoch": 0, "train_loss": 3.764204427599907, "train_ppl": 43.12937966309008, "lr": 0.00056, "grad_norm": 0.6642, "tokens_per_sec": 150055, "dt_s": 4.367, "eta_s": 26723, "world_size": 1, "timestamp": "2026-05-05T00:15:46.246141"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31190, "epoch": 0, "train_loss": 3.8463696390390396, "train_ppl": 46.822770748010626, "lr": 0.00056, "grad_norm": 0.7011, "tokens_per_sec": 151628, "dt_s": 4.322, "eta_s": 26723, "world_size": 1, "timestamp": "2026-05-05T00:15:50.568324"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31200, "epoch": 0, "train_loss": 3.871893137693405, "train_ppl": 48.03323358530528, "lr": 0.00056, "grad_norm": 0.6888, "tokens_per_sec": 149845, "dt_s": 4.374, "eta_s": 26691, "world_size": 1, "timestamp": "2026-05-05T00:15:54.941898"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31210, "epoch": 0, "train_loss": 3.7685550898313522, "train_ppl": 43.317429800950386, "lr": 0.00056, "grad_norm": 0.879, "tokens_per_sec": 151072, "dt_s": 4.338, "eta_s": 26739, "world_size": 1, "timestamp": "2026-05-05T00:15:59.279949"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31220, "epoch": 0, "train_loss": 3.7949718087911606, "train_ppl": 44.47698252795167, "lr": 0.00056, "grad_norm": 0.6339, "tokens_per_sec": 152528, "dt_s": 4.297, "eta_s": 26669, "world_size": 1, "timestamp": "2026-05-05T00:16:03.576588"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31230, "epoch": 0, "train_loss": 3.750473514199257, "train_ppl": 42.54122110385709, "lr": 0.00056, "grad_norm": 0.6577, "tokens_per_sec": 148683, "dt_s": 4.408, "eta_s": 26714, "world_size": 1, "timestamp": "2026-05-05T00:16:07.984400"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31240, "epoch": 0, "train_loss": 3.8411948680877686, "train_ppl": 46.58109947013439, "lr": 0.00056, "grad_norm": 0.6408, "tokens_per_sec": 136511, "dt_s": 4.801, "eta_s": 27298, "world_size": 1, "timestamp": "2026-05-05T00:16:12.785162"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31250, "epoch": 0, "train_loss": 3.858448639512062, "train_ppl": 47.39177258655855, "lr": 0.00056, "grad_norm": 0.7055, "tokens_per_sec": 153050, "dt_s": 4.282, "eta_s": 27181, "world_size": 1, "timestamp": "2026-05-05T00:16:17.067160"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31260, "epoch": 0, "train_loss": 3.719798132777214, "train_ppl": 41.256065020685355, "lr": 0.00056, "grad_norm": 0.6501, "tokens_per_sec": 147776, "dt_s": 4.435, "eta_s": 27295, "world_size": 1, "timestamp": "2026-05-05T00:16:21.501986"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31270, "epoch": 0, "train_loss": 3.8231958746910095, "train_ppl": 45.75018679623501, "lr": 0.00056, "grad_norm": 0.6534, "tokens_per_sec": 152690, "dt_s": 4.292, "eta_s": 27285, "world_size": 1, "timestamp": "2026-05-05T00:16:25.794062"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31280, "epoch": 0, "train_loss": 3.935846984386444, "train_ppl": 51.205501855446606, "lr": 0.00056, "grad_norm": 0.6679, "tokens_per_sec": 151378, "dt_s": 4.329, "eta_s": 27184, "world_size": 1, "timestamp": "2026-05-05T00:16:30.123368"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31290, "epoch": 0, "train_loss": 3.903768479824066, "train_ppl": 49.588972472274314, "lr": 0.00056, "grad_norm": 0.6875, "tokens_per_sec": 151151, "dt_s": 4.336, "eta_s": 26609, "world_size": 1, "timestamp": "2026-05-05T00:16:34.459157"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31300, "epoch": 0, "train_loss": 3.741906225681305, "train_ppl": 42.178314969260114, "lr": 0.00056, "grad_norm": 0.6258, "tokens_per_sec": 152174, "dt_s": 4.307, "eta_s": 26635, "world_size": 1, "timestamp": "2026-05-05T00:16:38.765804"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31310, "epoch": 0, "train_loss": 3.7011496871709824, "train_ppl": 40.49383284846904, "lr": 0.00056, "grad_norm": 0.631, "tokens_per_sec": 149892, "dt_s": 4.372, "eta_s": 26554, "world_size": 1, "timestamp": "2026-05-05T00:16:43.138014"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31320, "epoch": 0, "train_loss": 3.826476573944092, "train_ppl": 45.900525873693915, "lr": 0.00056, "grad_norm": 0.6728, "tokens_per_sec": 151892, "dt_s": 4.315, "eta_s": 26577, "world_size": 1, "timestamp": "2026-05-05T00:16:47.452675"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31330, "epoch": 0, "train_loss": 3.7900767624378204, "train_ppl": 44.25979763555085, "lr": 0.00056, "grad_norm": 0.6767, "tokens_per_sec": 150718, "dt_s": 4.348, "eta_s": 26596, "world_size": 1, "timestamp": "2026-05-05T00:16:51.800896"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31340, "epoch": 0, "train_loss": 3.9461680352687836, "train_ppl": 51.73673316249157, "lr": 0.00056, "grad_norm": 0.7327, "tokens_per_sec": 148568, "dt_s": 4.411, "eta_s": 26684, "world_size": 1, "timestamp": "2026-05-05T00:16:56.212096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31350, "epoch": 0, "train_loss": 3.809385195374489, "train_ppl": 45.12268869899955, "lr": 0.00056, "grad_norm": 0.6483, "tokens_per_sec": 152541, "dt_s": 4.296, "eta_s": 26667, "world_size": 1, "timestamp": "2026-05-05T00:17:00.508378"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31360, "epoch": 0, "train_loss": 3.80763079226017, "train_ppl": 45.0435947150726, "lr": 0.00056, "grad_norm": 0.6637, "tokens_per_sec": 152423, "dt_s": 4.3, "eta_s": 26574, "world_size": 1, "timestamp": "2026-05-05T00:17:04.807989"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31370, "epoch": 0, "train_loss": 3.902411788702011, "train_ppl": 49.52174116993707, "lr": 0.00056, "grad_norm": 0.778, "tokens_per_sec": 148064, "dt_s": 4.426, "eta_s": 26706, "world_size": 1, "timestamp": "2026-05-05T00:17:09.234163"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31380, "epoch": 0, "train_loss": 3.8098410218954086, "train_ppl": 45.143261505663, "lr": 0.00056, "grad_norm": 0.7079, "tokens_per_sec": 149700, "dt_s": 4.378, "eta_s": 26738, "world_size": 1, "timestamp": "2026-05-05T00:17:13.611982"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31390, "epoch": 0, "train_loss": 3.7502188682556152, "train_ppl": 42.53038953363167, "lr": 0.00056, "grad_norm": 0.7238, "tokens_per_sec": 151848, "dt_s": 4.316, "eta_s": 26617, "world_size": 1, "timestamp": "2026-05-05T00:17:17.927865"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31400, "epoch": 0, "train_loss": 3.776727944612503, "train_ppl": 43.672907519399566, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 149353, "dt_s": 4.388, "eta_s": 26725, "world_size": 1, "timestamp": "2026-05-05T00:17:22.315863"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31410, "epoch": 0, "train_loss": 3.8159332126379013, "train_ppl": 45.41912231060569, "lr": 0.00056, "grad_norm": 0.671, "tokens_per_sec": 153376, "dt_s": 4.273, "eta_s": 26688, "world_size": 1, "timestamp": "2026-05-05T00:17:26.588763"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31420, "epoch": 0, "train_loss": 3.7219392210245132, "train_ppl": 41.34449252839849, "lr": 0.00056, "grad_norm": 0.7309, "tokens_per_sec": 152212, "dt_s": 4.306, "eta_s": 26536, "world_size": 1, "timestamp": "2026-05-05T00:17:30.894337"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31430, "epoch": 0, "train_loss": 3.7234607487916946, "train_ppl": 41.40744720329819, "lr": 0.00056, "grad_norm": 0.6757, "tokens_per_sec": 152950, "dt_s": 4.285, "eta_s": 26417, "world_size": 1, "timestamp": "2026-05-05T00:17:35.179129"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31440, "epoch": 0, "train_loss": 3.8565500676631927, "train_ppl": 47.30188126082961, "lr": 0.00056, "grad_norm": 0.6268, "tokens_per_sec": 153387, "dt_s": 4.273, "eta_s": 26360, "world_size": 1, "timestamp": "2026-05-05T00:17:39.451700"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31450, "epoch": 0, "train_loss": 3.7459730058908463, "train_ppl": 42.35019416641946, "lr": 0.00056, "grad_norm": 0.6613, "tokens_per_sec": 151306, "dt_s": 4.331, "eta_s": 26286, "world_size": 1, "timestamp": "2026-05-05T00:17:43.783083"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31460, "epoch": 0, "train_loss": 3.832338958978653, "train_ppl": 46.17040271772325, "lr": 0.00056, "grad_norm": 0.6683, "tokens_per_sec": 150644, "dt_s": 4.35, "eta_s": 26377, "world_size": 1, "timestamp": "2026-05-05T00:17:48.133459"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31470, "epoch": 0, "train_loss": 3.874456077814102, "train_ppl": 48.156497778687815, "lr": 0.00056, "grad_norm": 0.699, "tokens_per_sec": 151436, "dt_s": 4.328, "eta_s": 26400, "world_size": 1, "timestamp": "2026-05-05T00:17:52.461106"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31480, "epoch": 0, "train_loss": 3.8489789813756943, "train_ppl": 46.94510692511381, "lr": 0.00056, "grad_norm": 0.6658, "tokens_per_sec": 150110, "dt_s": 4.366, "eta_s": 26495, "world_size": 1, "timestamp": "2026-05-05T00:17:56.826976"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31490, "epoch": 0, "train_loss": 3.840681880712509, "train_ppl": 46.557210082181605, "lr": 0.00056, "grad_norm": 0.6702, "tokens_per_sec": 151324, "dt_s": 4.331, "eta_s": 26562, "world_size": 1, "timestamp": "2026-05-05T00:18:01.157837"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31500, "epoch": 0, "train_loss": 3.775180533528328, "train_ppl": 43.605379838241525, "lr": 0.00056, "grad_norm": 0.7036, "tokens_per_sec": 152416, "dt_s": 4.3, "eta_s": 26519, "world_size": 1, "timestamp": "2026-05-05T00:18:05.457650"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31510, "epoch": 0, "train_loss": 3.731155440211296, "train_ppl": 41.72729371397316, "lr": 0.00056, "grad_norm": 0.6521, "tokens_per_sec": 111349, "dt_s": 5.886, "eta_s": 26566, "world_size": 1, "timestamp": "2026-05-05T00:18:11.343336"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31520, "epoch": 0, "train_loss": 3.7553744465112686, "train_ppl": 42.750224486332755, "lr": 0.00056, "grad_norm": 0.7447, "tokens_per_sec": 146334, "dt_s": 4.479, "eta_s": 26746, "world_size": 1, "timestamp": "2026-05-05T00:18:15.821785"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31530, "epoch": 0, "train_loss": 3.7611041367053986, "train_ppl": 42.99587310155989, "lr": 0.00056, "grad_norm": 0.6693, "tokens_per_sec": 134003, "dt_s": 4.891, "eta_s": 27384, "world_size": 1, "timestamp": "2026-05-05T00:18:20.712440"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31540, "epoch": 0, "train_loss": 3.695227161049843, "train_ppl": 40.25471585294977, "lr": 0.00056, "grad_norm": 0.6335, "tokens_per_sec": 151148, "dt_s": 4.336, "eta_s": 27385, "world_size": 1, "timestamp": "2026-05-05T00:18:25.048321"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31550, "epoch": 0, "train_loss": 3.8388001322746277, "train_ppl": 46.46968350216742, "lr": 0.00056, "grad_norm": 0.6474, "tokens_per_sec": 151974, "dt_s": 4.312, "eta_s": 27396, "world_size": 1, "timestamp": "2026-05-05T00:18:29.360636"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31560, "epoch": 0, "train_loss": 3.8145616948604584, "train_ppl": 45.35687187546813, "lr": 0.00056, "grad_norm": 0.6586, "tokens_per_sec": 149002, "dt_s": 4.398, "eta_s": 27399, "world_size": 1, "timestamp": "2026-05-05T00:18:33.758978"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31570, "epoch": 0, "train_loss": 3.732570767402649, "train_ppl": 41.786393300148326, "lr": 0.00056, "grad_norm": 0.7432, "tokens_per_sec": 152170, "dt_s": 4.307, "eta_s": 27184, "world_size": 1, "timestamp": "2026-05-05T00:18:38.065742"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31580, "epoch": 0, "train_loss": 3.80857490003109, "train_ppl": 45.086140803759484, "lr": 0.00056, "grad_norm": 0.6589, "tokens_per_sec": 151908, "dt_s": 4.314, "eta_s": 26475, "world_size": 1, "timestamp": "2026-05-05T00:18:42.379910"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31590, "epoch": 0, "train_loss": 3.8716095685958862, "train_ppl": 48.01961477563463, "lr": 0.00056, "grad_norm": 0.6671, "tokens_per_sec": 147418, "dt_s": 4.446, "eta_s": 26605, "world_size": 1, "timestamp": "2026-05-05T00:18:46.825514"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31600, "epoch": 0, "train_loss": 3.839407190680504, "train_ppl": 46.49790187839047, "lr": 0.00056, "grad_norm": 0.6655, "tokens_per_sec": 151071, "dt_s": 4.338, "eta_s": 26632, "world_size": 1, "timestamp": "2026-05-05T00:18:51.163608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31610, "epoch": 0, "train_loss": 3.8202411830425262, "train_ppl": 45.61520860905173, "lr": 0.00056, "grad_norm": 0.6351, "tokens_per_sec": 151081, "dt_s": 4.338, "eta_s": 26554, "world_size": 1, "timestamp": "2026-05-05T00:18:55.501408"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31620, "epoch": 0, "train_loss": 3.7766094356775284, "train_ppl": 43.66773219630927, "lr": 0.00056, "grad_norm": 0.6851, "tokens_per_sec": 149910, "dt_s": 4.372, "eta_s": 26629, "world_size": 1, "timestamp": "2026-05-05T00:18:59.873110"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31630, "epoch": 0, "train_loss": 3.7768707424402237, "train_ppl": 43.6791443610167, "lr": 0.00056, "grad_norm": 0.6593, "tokens_per_sec": 151043, "dt_s": 4.339, "eta_s": 26655, "world_size": 1, "timestamp": "2026-05-05T00:19:04.211983"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31640, "epoch": 0, "train_loss": 3.73405385017395, "train_ppl": 41.84841195816051, "lr": 0.00056, "grad_norm": 0.7509, "tokens_per_sec": 149433, "dt_s": 4.386, "eta_s": 26577, "world_size": 1, "timestamp": "2026-05-05T00:19:08.597627"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31650, "epoch": 0, "train_loss": 3.81898432970047, "train_ppl": 45.55791299529706, "lr": 0.00056, "grad_norm": 0.6811, "tokens_per_sec": 149179, "dt_s": 4.393, "eta_s": 26640, "world_size": 1, "timestamp": "2026-05-05T00:19:12.990757"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31660, "epoch": 0, "train_loss": 3.7379351556301117, "train_ppl": 42.011154049406024, "lr": 0.00056, "grad_norm": 0.6983, "tokens_per_sec": 151066, "dt_s": 4.338, "eta_s": 26636, "world_size": 1, "timestamp": "2026-05-05T00:19:17.328983"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31670, "epoch": 0, "train_loss": 3.8371944576501846, "train_ppl": 46.3951281823728, "lr": 0.00056, "grad_norm": 0.6602, "tokens_per_sec": 147528, "dt_s": 4.442, "eta_s": 26718, "world_size": 1, "timestamp": "2026-05-05T00:19:21.771272"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31680, "epoch": 0, "train_loss": 3.652493417263031, "train_ppl": 38.57071914600868, "lr": 0.00056, "grad_norm": 0.6496, "tokens_per_sec": 149611, "dt_s": 4.38, "eta_s": 26764, "world_size": 1, "timestamp": "2026-05-05T00:19:26.151680"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31690, "epoch": 0, "train_loss": 3.8003529608249664, "train_ppl": 44.716965045051005, "lr": 0.00056, "grad_norm": 0.647, "tokens_per_sec": 152478, "dt_s": 4.298, "eta_s": 26653, "world_size": 1, "timestamp": "2026-05-05T00:19:30.449744"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31700, "epoch": 0, "train_loss": 3.7963961213827133, "train_ppl": 44.54037679010626, "lr": 0.00056, "grad_norm": 0.6395, "tokens_per_sec": 148487, "dt_s": 4.414, "eta_s": 26673, "world_size": 1, "timestamp": "2026-05-05T00:19:34.863350"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31710, "epoch": 0, "train_loss": 3.7719198018312454, "train_ppl": 43.46342595657589, "lr": 0.00056, "grad_norm": 0.6504, "tokens_per_sec": 151538, "dt_s": 4.325, "eta_s": 26653, "world_size": 1, "timestamp": "2026-05-05T00:19:39.188058"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31720, "epoch": 0, "train_loss": 3.8364226818084717, "train_ppl": 46.35933535706404, "lr": 0.00056, "grad_norm": 0.6655, "tokens_per_sec": 151080, "dt_s": 4.338, "eta_s": 26521, "world_size": 1, "timestamp": "2026-05-05T00:19:43.525876"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31730, "epoch": 0, "train_loss": 3.8519285172224045, "train_ppl": 47.08377760735768, "lr": 0.00056, "grad_norm": 0.691, "tokens_per_sec": 150316, "dt_s": 4.36, "eta_s": 26492, "world_size": 1, "timestamp": "2026-05-05T00:19:47.885775"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31740, "epoch": 0, "train_loss": 3.7250518053770065, "train_ppl": 41.47338123332917, "lr": 0.00056, "grad_norm": 0.7043, "tokens_per_sec": 150229, "dt_s": 4.362, "eta_s": 26566, "world_size": 1, "timestamp": "2026-05-05T00:19:52.248196"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31750, "epoch": 0, "train_loss": 3.71113084256649, "train_ppl": 40.90003188228775, "lr": 0.00056, "grad_norm": 0.6444, "tokens_per_sec": 149737, "dt_s": 4.377, "eta_s": 26516, "world_size": 1, "timestamp": "2026-05-05T00:19:56.624938"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31760, "epoch": 0, "train_loss": 3.903722643852234, "train_ppl": 49.586699565619725, "lr": 0.00056, "grad_norm": 0.6433, "tokens_per_sec": 149281, "dt_s": 4.39, "eta_s": 26592, "world_size": 1, "timestamp": "2026-05-05T00:20:01.015050"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31770, "epoch": 0, "train_loss": 3.788538485765457, "train_ppl": 44.19176616035541, "lr": 0.00056, "grad_norm": 0.6425, "tokens_per_sec": 148802, "dt_s": 4.404, "eta_s": 26668, "world_size": 1, "timestamp": "2026-05-05T00:20:05.419313"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31780, "epoch": 0, "train_loss": 3.7901315838098526, "train_ppl": 44.26222408489313, "lr": 0.00056, "grad_norm": 0.6523, "tokens_per_sec": 147331, "dt_s": 4.448, "eta_s": 26772, "world_size": 1, "timestamp": "2026-05-05T00:20:09.867515"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31790, "epoch": 0, "train_loss": 3.779177874326706, "train_ppl": 43.78003424612987, "lr": 0.00056, "grad_norm": 0.7708, "tokens_per_sec": 149941, "dt_s": 4.371, "eta_s": 26777, "world_size": 1, "timestamp": "2026-05-05T00:20:14.238316"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31800, "epoch": 0, "train_loss": 3.906736448407173, "train_ppl": 49.736369611480015, "lr": 0.00056, "grad_norm": 0.7715, "tokens_per_sec": 151588, "dt_s": 4.323, "eta_s": 26708, "world_size": 1, "timestamp": "2026-05-05T00:20:18.561591"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31810, "epoch": 0, "train_loss": 3.7214225232601166, "train_ppl": 41.323135439594125, "lr": 0.00056, "grad_norm": 0.6874, "tokens_per_sec": 148631, "dt_s": 4.409, "eta_s": 26727, "world_size": 1, "timestamp": "2026-05-05T00:20:22.970915"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31820, "epoch": 0, "train_loss": 3.8556423038244247, "train_ppl": 47.258961806826754, "lr": 0.00056, "grad_norm": 0.6132, "tokens_per_sec": 149205, "dt_s": 4.392, "eta_s": 26708, "world_size": 1, "timestamp": "2026-05-05T00:20:27.363261"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31830, "epoch": 0, "train_loss": 3.7954556196928024, "train_ppl": 44.498506183240515, "lr": 0.00056, "grad_norm": 0.677, "tokens_per_sec": 133357, "dt_s": 4.914, "eta_s": 27271, "world_size": 1, "timestamp": "2026-05-05T00:20:32.277571"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31840, "epoch": 0, "train_loss": 3.852954924106598, "train_ppl": 47.13212953095639, "lr": 0.00056, "grad_norm": 0.6671, "tokens_per_sec": 150098, "dt_s": 4.366, "eta_s": 27261, "world_size": 1, "timestamp": "2026-05-05T00:20:36.643813"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31850, "epoch": 0, "train_loss": 3.739288717508316, "train_ppl": 42.06805724829922, "lr": 0.00056, "grad_norm": 0.7151, "tokens_per_sec": 151574, "dt_s": 4.324, "eta_s": 27257, "world_size": 1, "timestamp": "2026-05-05T00:20:40.967491"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31860, "epoch": 0, "train_loss": 3.7424527257680893, "train_ppl": 42.201371721735995, "lr": 0.00056, "grad_norm": 0.7058, "tokens_per_sec": 149715, "dt_s": 4.377, "eta_s": 27213, "world_size": 1, "timestamp": "2026-05-05T00:20:45.344859"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31870, "epoch": 0, "train_loss": 3.79745414853096, "train_ppl": 44.58752665646427, "lr": 0.00056, "grad_norm": 0.6489, "tokens_per_sec": 151656, "dt_s": 4.321, "eta_s": 27123, "world_size": 1, "timestamp": "2026-05-05T00:20:49.666270"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31880, "epoch": 0, "train_loss": 3.7277172803878784, "train_ppl": 41.58407495475948, "lr": 0.00056, "grad_norm": 0.6653, "tokens_per_sec": 151620, "dt_s": 4.322, "eta_s": 26398, "world_size": 1, "timestamp": "2026-05-05T00:20:53.988641"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31890, "epoch": 0, "train_loss": 3.840964138507843, "train_ppl": 46.570353072424766, "lr": 0.00056, "grad_norm": 1.2136, "tokens_per_sec": 146465, "dt_s": 4.475, "eta_s": 26526, "world_size": 1, "timestamp": "2026-05-05T00:20:58.463138"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31900, "epoch": 0, "train_loss": 3.776258736848831, "train_ppl": 43.65242065880222, "lr": 0.00056, "grad_norm": 0.6397, "tokens_per_sec": 149929, "dt_s": 4.371, "eta_s": 26579, "world_size": 1, "timestamp": "2026-05-05T00:21:02.834273"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31910, "epoch": 0, "train_loss": 3.7042814940214157, "train_ppl": 40.620850505149825, "lr": 0.00056, "grad_norm": 0.6341, "tokens_per_sec": 151863, "dt_s": 4.315, "eta_s": 26499, "world_size": 1, "timestamp": "2026-05-05T00:21:07.149731"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31920, "epoch": 0, "train_loss": 3.7443671375513077, "train_ppl": 42.28223990783394, "lr": 0.00056, "grad_norm": 0.6342, "tokens_per_sec": 148616, "dt_s": 4.41, "eta_s": 26602, "world_size": 1, "timestamp": "2026-05-05T00:21:11.559479"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31930, "epoch": 0, "train_loss": 3.80650694668293, "train_ppl": 44.99300110539641, "lr": 0.00056, "grad_norm": 0.6833, "tokens_per_sec": 151424, "dt_s": 4.328, "eta_s": 26605, "world_size": 1, "timestamp": "2026-05-05T00:21:15.887447"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31940, "epoch": 0, "train_loss": 3.8234982192516327, "train_ppl": 45.764021207634485, "lr": 0.00056, "grad_norm": 0.6226, "tokens_per_sec": 151579, "dt_s": 4.324, "eta_s": 26417, "world_size": 1, "timestamp": "2026-05-05T00:21:20.211021"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31950, "epoch": 0, "train_loss": 3.7823530584573746, "train_ppl": 43.919264840520725, "lr": 0.00056, "grad_norm": 0.6803, "tokens_per_sec": 150083, "dt_s": 4.367, "eta_s": 26407, "world_size": 1, "timestamp": "2026-05-05T00:21:24.577654"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31960, "epoch": 0, "train_loss": 3.8390707671642303, "train_ppl": 46.4822615217806, "lr": 0.00056, "grad_norm": 0.6865, "tokens_per_sec": 149731, "dt_s": 4.377, "eta_s": 26477, "world_size": 1, "timestamp": "2026-05-05T00:21:28.954572"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31970, "epoch": 0, "train_loss": 3.776580885052681, "train_ppl": 43.666485473066835, "lr": 0.00056, "grad_norm": 0.771, "tokens_per_sec": 149957, "dt_s": 4.37, "eta_s": 26425, "world_size": 1, "timestamp": "2026-05-05T00:21:33.324885"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31980, "epoch": 0, "train_loss": 3.849148079752922, "train_ppl": 46.95304593773179, "lr": 0.00056, "grad_norm": 0.7057, "tokens_per_sec": 150741, "dt_s": 4.348, "eta_s": 26445, "world_size": 1, "timestamp": "2026-05-05T00:21:37.672483"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 31990, "epoch": 0, "train_loss": 3.9227481335401535, "train_ppl": 50.539142423334404, "lr": 0.00056, "grad_norm": 0.6776, "tokens_per_sec": 151561, "dt_s": 4.324, "eta_s": 26441, "world_size": 1, "timestamp": "2026-05-05T00:21:41.996557"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32000, "epoch": 0, "train_loss": 3.682307630777359, "train_ppl": 39.73798895468383, "lr": 0.00056, "grad_norm": 0.6695, "tokens_per_sec": 150954, "dt_s": 4.341, "eta_s": 26406, "world_size": 1, "timestamp": "2026-05-05T00:21:46.338025"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32010, "epoch": 0, "train_loss": 3.8721599876880646, "train_ppl": 48.04605296378047, "lr": 0.00056, "grad_norm": 0.6669, "tokens_per_sec": 127432, "dt_s": 5.143, "eta_s": 26401, "world_size": 1, "timestamp": "2026-05-05T00:21:51.480832"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32020, "epoch": 0, "train_loss": 3.7255446016788483, "train_ppl": 41.49382419892104, "lr": 0.00056, "grad_norm": 0.691, "tokens_per_sec": 150096, "dt_s": 4.366, "eta_s": 26391, "world_size": 1, "timestamp": "2026-05-05T00:21:55.847089"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32030, "epoch": 0, "train_loss": 3.620593786239624, "train_ppl": 37.35974493962944, "lr": 0.00056, "grad_norm": 0.6645, "tokens_per_sec": 147417, "dt_s": 4.446, "eta_s": 26506, "world_size": 1, "timestamp": "2026-05-05T00:22:00.292733"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32040, "epoch": 0, "train_loss": 3.7450692504644394, "train_ppl": 42.31193723868913, "lr": 0.00056, "grad_norm": 0.7484, "tokens_per_sec": 151439, "dt_s": 4.328, "eta_s": 26506, "world_size": 1, "timestamp": "2026-05-05T00:22:04.620290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32050, "epoch": 0, "train_loss": 3.827635884284973, "train_ppl": 45.953769685078434, "lr": 0.00056, "grad_norm": 0.7026, "tokens_per_sec": 147594, "dt_s": 4.44, "eta_s": 26621, "world_size": 1, "timestamp": "2026-05-05T00:22:09.060584"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32060, "epoch": 0, "train_loss": 3.8342609852552414, "train_ppl": 46.25922878061586, "lr": 0.00056, "grad_norm": 0.7168, "tokens_per_sec": 149711, "dt_s": 4.378, "eta_s": 26619, "world_size": 1, "timestamp": "2026-05-05T00:22:13.438093"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32070, "epoch": 0, "train_loss": 3.8350820541381836, "train_ppl": 46.29722639110964, "lr": 0.00056, "grad_norm": 0.6521, "tokens_per_sec": 151575, "dt_s": 4.324, "eta_s": 26563, "world_size": 1, "timestamp": "2026-05-05T00:22:17.761767"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32080, "epoch": 0, "train_loss": 3.752071589231491, "train_ppl": 42.60925951790933, "lr": 0.00056, "grad_norm": 0.6728, "tokens_per_sec": 149392, "dt_s": 4.387, "eta_s": 26487, "world_size": 1, "timestamp": "2026-05-05T00:22:22.148615"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32090, "epoch": 0, "train_loss": 3.7596181333065033, "train_ppl": 42.93202853636144, "lr": 0.00056, "grad_norm": 0.6595, "tokens_per_sec": 151961, "dt_s": 4.313, "eta_s": 26465, "world_size": 1, "timestamp": "2026-05-05T00:22:26.461297"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32100, "epoch": 0, "train_loss": 3.8726969808340073, "train_ppl": 48.07186029347104, "lr": 0.00056, "grad_norm": 0.6724, "tokens_per_sec": 151006, "dt_s": 4.34, "eta_s": 26339, "world_size": 1, "timestamp": "2026-05-05T00:22:30.801262"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32110, "epoch": 0, "train_loss": 3.842769429087639, "train_ppl": 46.65450202592674, "lr": 0.00056, "grad_norm": 0.6491, "tokens_per_sec": 150019, "dt_s": 4.368, "eta_s": 26323, "world_size": 1, "timestamp": "2026-05-05T00:22:35.169745"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32120, "epoch": 0, "train_loss": 3.799728348851204, "train_ppl": 44.68904301437868, "lr": 0.00056, "grad_norm": 0.7439, "tokens_per_sec": 135947, "dt_s": 4.821, "eta_s": 26921, "world_size": 1, "timestamp": "2026-05-05T00:22:39.990473"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32130, "epoch": 0, "train_loss": 3.7323894649744034, "train_ppl": 41.77881801230517, "lr": 0.00056, "grad_norm": 0.6621, "tokens_per_sec": 149045, "dt_s": 4.397, "eta_s": 26929, "world_size": 1, "timestamp": "2026-05-05T00:22:44.387512"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32140, "epoch": 0, "train_loss": 3.9274726808071136, "train_ppl": 50.77848193075473, "lr": 0.00056, "grad_norm": 0.6811, "tokens_per_sec": 149981, "dt_s": 4.37, "eta_s": 26993, "world_size": 1, "timestamp": "2026-05-05T00:22:48.757147"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32150, "epoch": 0, "train_loss": 3.7259762585163116, "train_ppl": 41.51173915812815, "lr": 0.00056, "grad_norm": 0.6625, "tokens_per_sec": 150947, "dt_s": 4.342, "eta_s": 26991, "world_size": 1, "timestamp": "2026-05-05T00:22:53.098809"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32160, "epoch": 0, "train_loss": 3.7720634788274765, "train_ppl": 43.4696710996942, "lr": 0.00056, "grad_norm": 0.6548, "tokens_per_sec": 149025, "dt_s": 4.398, "eta_s": 27022, "world_size": 1, "timestamp": "2026-05-05T00:22:57.496444"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32170, "epoch": 0, "train_loss": 3.9212290197610855, "train_ppl": 50.462426000937256, "lr": 0.00056, "grad_norm": 0.6989, "tokens_per_sec": 151221, "dt_s": 4.334, "eta_s": 26428, "world_size": 1, "timestamp": "2026-05-05T00:23:01.830254"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32180, "epoch": 0, "train_loss": 3.901659294962883, "train_ppl": 49.4844903870042, "lr": 0.00056, "grad_norm": 0.6589, "tokens_per_sec": 152413, "dt_s": 4.3, "eta_s": 26306, "world_size": 1, "timestamp": "2026-05-05T00:23:06.130133"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32190, "epoch": 0, "train_loss": 3.8431855589151382, "train_ppl": 46.673920395808096, "lr": 0.00056, "grad_norm": 0.6633, "tokens_per_sec": 148874, "dt_s": 4.402, "eta_s": 26341, "world_size": 1, "timestamp": "2026-05-05T00:23:10.532248"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32200, "epoch": 0, "train_loss": 3.880761295557022, "train_ppl": 48.46109424732651, "lr": 0.00056, "grad_norm": 0.6426, "tokens_per_sec": 151537, "dt_s": 4.325, "eta_s": 26316, "world_size": 1, "timestamp": "2026-05-05T00:23:14.856992"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32210, "epoch": 0, "train_loss": 3.7055878937244415, "train_ppl": 40.6739522506864, "lr": 0.00056, "grad_norm": 0.65, "tokens_per_sec": 151047, "dt_s": 4.339, "eta_s": 26241, "world_size": 1, "timestamp": "2026-05-05T00:23:19.195797"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32220, "epoch": 0, "train_loss": 3.8456944674253464, "train_ppl": 46.79116801216271, "lr": 0.00056, "grad_norm": 0.6869, "tokens_per_sec": 147653, "dt_s": 4.439, "eta_s": 26363, "world_size": 1, "timestamp": "2026-05-05T00:23:23.634311"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32230, "epoch": 0, "train_loss": 3.71016888320446, "train_ppl": 40.860706631390414, "lr": 0.00056, "grad_norm": 0.664, "tokens_per_sec": 151807, "dt_s": 4.317, "eta_s": 26380, "world_size": 1, "timestamp": "2026-05-05T00:23:27.951347"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32240, "epoch": 0, "train_loss": 3.7300379276275635, "train_ppl": 41.6806889836991, "lr": 0.00056, "grad_norm": 0.6599, "tokens_per_sec": 151881, "dt_s": 4.315, "eta_s": 26270, "world_size": 1, "timestamp": "2026-05-05T00:23:32.266313"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32250, "epoch": 0, "train_loss": 3.7553516775369644, "train_ppl": 42.74925111865126, "lr": 0.00056, "grad_norm": 0.6199, "tokens_per_sec": 150839, "dt_s": 4.345, "eta_s": 26290, "world_size": 1, "timestamp": "2026-05-05T00:23:36.611062"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32260, "epoch": 0, "train_loss": 3.8198399543762207, "train_ppl": 45.59691015091672, "lr": 0.00056, "grad_norm": 0.714, "tokens_per_sec": 153481, "dt_s": 4.27, "eta_s": 26202, "world_size": 1, "timestamp": "2026-05-05T00:23:40.881010"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32270, "epoch": 0, "train_loss": 3.6913113594055176, "train_ppl": 40.09739459087309, "lr": 0.00056, "grad_norm": 0.654, "tokens_per_sec": 150560, "dt_s": 4.353, "eta_s": 26094, "world_size": 1, "timestamp": "2026-05-05T00:23:45.233821"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32280, "epoch": 0, "train_loss": 3.746363550424576, "train_ppl": 42.36673703340641, "lr": 0.00056, "grad_norm": 0.6591, "tokens_per_sec": 148569, "dt_s": 4.411, "eta_s": 26204, "world_size": 1, "timestamp": "2026-05-05T00:23:49.644983"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32290, "epoch": 0, "train_loss": 3.8022588342428207, "train_ppl": 44.80227118558847, "lr": 0.00056, "grad_norm": 0.6726, "tokens_per_sec": 151110, "dt_s": 4.337, "eta_s": 26226, "world_size": 1, "timestamp": "2026-05-05T00:23:53.981947"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32300, "epoch": 0, "train_loss": 3.804763913154602, "train_ppl": 44.914645104336316, "lr": 0.00056, "grad_norm": 0.6343, "tokens_per_sec": 149790, "dt_s": 4.375, "eta_s": 26258, "world_size": 1, "timestamp": "2026-05-05T00:23:58.357131"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32310, "epoch": 0, "train_loss": 3.865928590297699, "train_ppl": 47.747589801793175, "lr": 0.00056, "grad_norm": 0.6893, "tokens_per_sec": 149494, "dt_s": 4.384, "eta_s": 26391, "world_size": 1, "timestamp": "2026-05-05T00:24:02.740976"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32320, "epoch": 0, "train_loss": 3.6952183693647385, "train_ppl": 40.25436194771974, "lr": 0.00056, "grad_norm": 0.6742, "tokens_per_sec": 151641, "dt_s": 4.322, "eta_s": 26350, "world_size": 1, "timestamp": "2026-05-05T00:24:07.062761"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32330, "epoch": 0, "train_loss": 3.79389251768589, "train_ppl": 44.42900481193837, "lr": 0.00056, "grad_norm": 0.6244, "tokens_per_sec": 148904, "dt_s": 4.401, "eta_s": 26333, "world_size": 1, "timestamp": "2026-05-05T00:24:11.463980"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32340, "epoch": 0, "train_loss": 3.737297296524048, "train_ppl": 41.98436539684063, "lr": 0.00056, "grad_norm": 0.6763, "tokens_per_sec": 151955, "dt_s": 4.313, "eta_s": 26300, "world_size": 1, "timestamp": "2026-05-05T00:24:15.776846"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32350, "epoch": 0, "train_loss": 3.6177144199609756, "train_ppl": 37.25232727149146, "lr": 0.00056, "grad_norm": 0.8267, "tokens_per_sec": 151202, "dt_s": 4.334, "eta_s": 26246, "world_size": 1, "timestamp": "2026-05-05T00:24:20.111205"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32360, "epoch": 0, "train_loss": 3.993149310350418, "train_ppl": 54.22539332991889, "lr": 0.00056, "grad_norm": 1.009, "tokens_per_sec": 149392, "dt_s": 4.387, "eta_s": 26245, "world_size": 1, "timestamp": "2026-05-05T00:24:24.498022"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32370, "epoch": 0, "train_loss": 3.947345107793808, "train_ppl": 51.797666904313026, "lr": 0.00056, "grad_norm": 0.74, "tokens_per_sec": 152463, "dt_s": 4.298, "eta_s": 26213, "world_size": 1, "timestamp": "2026-05-05T00:24:28.796496"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32380, "epoch": 0, "train_loss": 3.8249991834163666, "train_ppl": 45.832762940034094, "lr": 0.00056, "grad_norm": 0.9833, "tokens_per_sec": 151088, "dt_s": 4.338, "eta_s": 26132, "world_size": 1, "timestamp": "2026-05-05T00:24:33.134104"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32390, "epoch": 0, "train_loss": 3.741935595870018, "train_ppl": 42.17955377252221, "lr": 0.00056, "grad_norm": 0.6463, "tokens_per_sec": 150172, "dt_s": 4.364, "eta_s": 26189, "world_size": 1, "timestamp": "2026-05-05T00:24:37.498185"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32400, "epoch": 0, "train_loss": 3.757968232035637, "train_ppl": 42.86125333002507, "lr": 0.00056, "grad_norm": 0.6642, "tokens_per_sec": 152716, "dt_s": 4.291, "eta_s": 26133, "world_size": 1, "timestamp": "2026-05-05T00:24:41.789548"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32410, "epoch": 0, "train_loss": 3.7061357498168945, "train_ppl": 40.696241828414045, "lr": 0.00056, "grad_norm": 0.6683, "tokens_per_sec": 150394, "dt_s": 4.358, "eta_s": 26093, "world_size": 1, "timestamp": "2026-05-05T00:24:46.147152"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32420, "epoch": 0, "train_loss": 3.7596899420022964, "train_ppl": 42.935111540030306, "lr": 0.00056, "grad_norm": 0.6498, "tokens_per_sec": 135096, "dt_s": 4.851, "eta_s": 26755, "world_size": 1, "timestamp": "2026-05-05T00:24:50.998230"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32430, "epoch": 0, "train_loss": 3.871784344315529, "train_ppl": 48.028008171823565, "lr": 0.00056, "grad_norm": 0.6909, "tokens_per_sec": 150859, "dt_s": 4.344, "eta_s": 26759, "world_size": 1, "timestamp": "2026-05-05T00:24:55.342401"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32440, "epoch": 0, "train_loss": 3.8322843611240387, "train_ppl": 46.1678819816022, "lr": 0.00056, "grad_norm": 0.6643, "tokens_per_sec": 149151, "dt_s": 4.394, "eta_s": 26790, "world_size": 1, "timestamp": "2026-05-05T00:24:59.736349"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32450, "epoch": 0, "train_loss": 3.7752680480480194, "train_ppl": 43.609196109101156, "lr": 0.00056, "grad_norm": 0.7289, "tokens_per_sec": 152369, "dt_s": 4.301, "eta_s": 26798, "world_size": 1, "timestamp": "2026-05-05T00:25:04.037460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32460, "epoch": 0, "train_loss": 3.796922579407692, "train_ppl": 44.56383160234832, "lr": 0.00056, "grad_norm": 0.6974, "tokens_per_sec": 153529, "dt_s": 4.269, "eta_s": 26686, "world_size": 1, "timestamp": "2026-05-05T00:25:08.306105"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32470, "epoch": 0, "train_loss": 3.7681280076503754, "train_ppl": 43.298933648526386, "lr": 0.00056, "grad_norm": 0.6385, "tokens_per_sec": 149100, "dt_s": 4.395, "eta_s": 26133, "world_size": 1, "timestamp": "2026-05-05T00:25:12.701546"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32480, "epoch": 0, "train_loss": 3.805997848510742, "train_ppl": 44.970101080447435, "lr": 0.00056, "grad_norm": 0.657, "tokens_per_sec": 151844, "dt_s": 4.316, "eta_s": 26095, "world_size": 1, "timestamp": "2026-05-05T00:25:17.017558"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32490, "epoch": 0, "train_loss": 3.7287789285182953, "train_ppl": 41.628246053123554, "lr": 0.00056, "grad_norm": 0.6505, "tokens_per_sec": 150839, "dt_s": 4.345, "eta_s": 26031, "world_size": 1, "timestamp": "2026-05-05T00:25:21.362341"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32500, "epoch": 0, "train_loss": 3.780556797981262, "train_ppl": 43.840445212432094, "lr": 0.00056, "grad_norm": 0.8795, "tokens_per_sec": 148334, "dt_s": 4.418, "eta_s": 26168, "world_size": 1, "timestamp": "2026-05-05T00:25:25.780478"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32510, "epoch": 0, "train_loss": 3.7600876688957214, "train_ppl": 42.952191384893645, "lr": 0.00056, "grad_norm": 0.6074, "tokens_per_sec": 126473, "dt_s": 5.182, "eta_s": 26320, "world_size": 1, "timestamp": "2026-05-05T00:25:30.962315"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32520, "epoch": 0, "train_loss": 3.7592678368091583, "train_ppl": 42.91699223087757, "lr": 0.00056, "grad_norm": 0.6204, "tokens_per_sec": 148253, "dt_s": 4.421, "eta_s": 26346, "world_size": 1, "timestamp": "2026-05-05T00:25:35.382836"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32530, "epoch": 0, "train_loss": 3.726658284664154, "train_ppl": 41.54006090666509, "lr": 0.00056, "grad_norm": 0.6003, "tokens_per_sec": 152457, "dt_s": 4.299, "eta_s": 26321, "world_size": 1, "timestamp": "2026-05-05T00:25:39.681503"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32540, "epoch": 0, "train_loss": 3.70914663374424, "train_ppl": 40.818958138415034, "lr": 0.00056, "grad_norm": 0.6336, "tokens_per_sec": 151553, "dt_s": 4.324, "eta_s": 26292, "world_size": 1, "timestamp": "2026-05-05T00:25:44.005797"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32550, "epoch": 0, "train_loss": 3.827755928039551, "train_ppl": 45.959286479250146, "lr": 0.00056, "grad_norm": 0.6716, "tokens_per_sec": 148855, "dt_s": 4.403, "eta_s": 26269, "world_size": 1, "timestamp": "2026-05-05T00:25:48.408475"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32560, "epoch": 0, "train_loss": 3.8043168783187866, "train_ppl": 44.8945711805423, "lr": 0.00056, "grad_norm": 0.6698, "tokens_per_sec": 152281, "dt_s": 4.304, "eta_s": 26150, "world_size": 1, "timestamp": "2026-05-05T00:25:52.712101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32570, "epoch": 0, "train_loss": 3.719729632139206, "train_ppl": 41.25323905070121, "lr": 0.00056, "grad_norm": 0.6504, "tokens_per_sec": 152213, "dt_s": 4.306, "eta_s": 26007, "world_size": 1, "timestamp": "2026-05-05T00:25:57.017679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32580, "epoch": 0, "train_loss": 3.695381596684456, "train_ppl": 40.26093309560824, "lr": 0.00056, "grad_norm": 0.6724, "tokens_per_sec": 147462, "dt_s": 4.444, "eta_s": 26178, "world_size": 1, "timestamp": "2026-05-05T00:26:01.461917"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32590, "epoch": 0, "train_loss": 3.8098667711019516, "train_ppl": 45.144423923793134, "lr": 0.00056, "grad_norm": 0.84, "tokens_per_sec": 151033, "dt_s": 4.339, "eta_s": 26191, "world_size": 1, "timestamp": "2026-05-05T00:26:05.801092"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32600, "epoch": 0, "train_loss": 3.7114382833242416, "train_ppl": 40.91260815221162, "lr": 0.00056, "grad_norm": 0.6485, "tokens_per_sec": 152261, "dt_s": 4.304, "eta_s": 26069, "world_size": 1, "timestamp": "2026-05-05T00:26:10.105318"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32610, "epoch": 0, "train_loss": 3.7695316523313522, "train_ppl": 43.3597526405759, "lr": 0.00056, "grad_norm": 0.6352, "tokens_per_sec": 149425, "dt_s": 4.386, "eta_s": 26163, "world_size": 1, "timestamp": "2026-05-05T00:26:14.491137"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32620, "epoch": 0, "train_loss": 3.921083778142929, "train_ppl": 50.45509728875867, "lr": 0.00056, "grad_norm": 0.7719, "tokens_per_sec": 151199, "dt_s": 4.334, "eta_s": 26193, "world_size": 1, "timestamp": "2026-05-05T00:26:18.825575"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32630, "epoch": 0, "train_loss": 3.7211489975452423, "train_ppl": 41.31183404511363, "lr": 0.00056, "grad_norm": 0.6456, "tokens_per_sec": 148001, "dt_s": 4.428, "eta_s": 26170, "world_size": 1, "timestamp": "2026-05-05T00:26:23.253667"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32640, "epoch": 0, "train_loss": 3.7694473415613174, "train_ppl": 43.356097100545114, "lr": 0.00056, "grad_norm": 0.679, "tokens_per_sec": 149567, "dt_s": 4.382, "eta_s": 26216, "world_size": 1, "timestamp": "2026-05-05T00:26:27.635383"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32650, "epoch": 0, "train_loss": 3.683771252632141, "train_ppl": 39.79619292769613, "lr": 0.00056, "grad_norm": 0.6403, "tokens_per_sec": 152608, "dt_s": 4.294, "eta_s": 26200, "world_size": 1, "timestamp": "2026-05-05T00:26:31.929790"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32660, "epoch": 0, "train_loss": 3.7600422352552414, "train_ppl": 42.95023995480305, "lr": 0.00056, "grad_norm": 0.6718, "tokens_per_sec": 148160, "dt_s": 4.423, "eta_s": 26241, "world_size": 1, "timestamp": "2026-05-05T00:26:36.353099"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32670, "epoch": 0, "train_loss": 3.8161719292402267, "train_ppl": 45.42996590338559, "lr": 0.00056, "grad_norm": 0.6456, "tokens_per_sec": 151868, "dt_s": 4.315, "eta_s": 26213, "world_size": 1, "timestamp": "2026-05-05T00:26:40.668417"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32680, "epoch": 0, "train_loss": 3.828912392258644, "train_ppl": 46.01246749464211, "lr": 0.00056, "grad_norm": 0.6435, "tokens_per_sec": 152953, "dt_s": 4.285, "eta_s": 26037, "world_size": 1, "timestamp": "2026-05-05T00:26:44.953133"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32690, "epoch": 0, "train_loss": 3.735734134912491, "train_ppl": 41.91878831570761, "lr": 0.00056, "grad_norm": 0.6383, "tokens_per_sec": 148515, "dt_s": 4.413, "eta_s": 26070, "world_size": 1, "timestamp": "2026-05-05T00:26:49.365880"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32700, "epoch": 0, "train_loss": 3.73688443005085, "train_ppl": 41.96703503777807, "lr": 0.00056, "grad_norm": 0.6672, "tokens_per_sec": 153451, "dt_s": 4.271, "eta_s": 26037, "world_size": 1, "timestamp": "2026-05-05T00:26:53.636696"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32710, "epoch": 0, "train_loss": 3.8413314819335938, "train_ppl": 46.58746352797508, "lr": 0.00056, "grad_norm": 0.6421, "tokens_per_sec": 152623, "dt_s": 4.294, "eta_s": 25878, "world_size": 1, "timestamp": "2026-05-05T00:26:57.930670"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32720, "epoch": 0, "train_loss": 3.792461410164833, "train_ppl": 44.36546760413095, "lr": 0.00056, "grad_norm": 0.7032, "tokens_per_sec": 133820, "dt_s": 4.897, "eta_s": 26572, "world_size": 1, "timestamp": "2026-05-05T00:27:02.827996"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32730, "epoch": 0, "train_loss": 3.692030355334282, "train_ppl": 40.126234821099615, "lr": 0.00056, "grad_norm": 0.6396, "tokens_per_sec": 152123, "dt_s": 4.308, "eta_s": 26595, "world_size": 1, "timestamp": "2026-05-05T00:27:07.136084"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32740, "epoch": 0, "train_loss": 3.9027542024850845, "train_ppl": 49.53870100014956, "lr": 0.00056, "grad_norm": 0.7216, "tokens_per_sec": 150818, "dt_s": 4.345, "eta_s": 26510, "world_size": 1, "timestamp": "2026-05-05T00:27:11.481436"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32750, "epoch": 0, "train_loss": 3.7226036489009857, "train_ppl": 41.37197208985536, "lr": 0.00056, "grad_norm": 0.7052, "tokens_per_sec": 151298, "dt_s": 4.332, "eta_s": 26578, "world_size": 1, "timestamp": "2026-05-05T00:27:15.813015"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32760, "epoch": 0, "train_loss": 3.7557459324598312, "train_ppl": 42.766108544196804, "lr": 0.00056, "grad_norm": 0.6998, "tokens_per_sec": 153419, "dt_s": 4.272, "eta_s": 26547, "world_size": 1, "timestamp": "2026-05-05T00:27:20.084738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32770, "epoch": 0, "train_loss": 3.8056482821702957, "train_ppl": 44.954383794060426, "lr": 0.00056, "grad_norm": 0.6214, "tokens_per_sec": 150170, "dt_s": 4.364, "eta_s": 25904, "world_size": 1, "timestamp": "2026-05-05T00:27:24.448863"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32780, "epoch": 0, "train_loss": 3.861936569213867, "train_ppl": 47.55736036918344, "lr": 0.00056, "grad_norm": 0.6657, "tokens_per_sec": 150641, "dt_s": 4.35, "eta_s": 25950, "world_size": 1, "timestamp": "2026-05-05T00:27:28.799332"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32790, "epoch": 0, "train_loss": 3.780450329184532, "train_ppl": 43.835777821452375, "lr": 0.00056, "grad_norm": 0.6615, "tokens_per_sec": 152313, "dt_s": 4.303, "eta_s": 25895, "world_size": 1, "timestamp": "2026-05-05T00:27:33.102066"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32800, "epoch": 0, "train_loss": 3.789426565170288, "train_ppl": 44.23102938959762, "lr": 0.00056, "grad_norm": 0.6606, "tokens_per_sec": 148457, "dt_s": 4.414, "eta_s": 25990, "world_size": 1, "timestamp": "2026-05-05T00:27:37.516533"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32810, "epoch": 0, "train_loss": 3.9940420538187027, "train_ppl": 54.2738243106336, "lr": 0.00056, "grad_norm": 0.7, "tokens_per_sec": 152430, "dt_s": 4.299, "eta_s": 26019, "world_size": 1, "timestamp": "2026-05-05T00:27:41.815929"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32820, "epoch": 0, "train_loss": 3.853118523955345, "train_ppl": 47.1398409709968, "lr": 0.00056, "grad_norm": 0.6943, "tokens_per_sec": 152911, "dt_s": 4.286, "eta_s": 25921, "world_size": 1, "timestamp": "2026-05-05T00:27:46.101835"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32830, "epoch": 0, "train_loss": 3.824600964784622, "train_ppl": 45.81451511344046, "lr": 0.00056, "grad_norm": 0.6778, "tokens_per_sec": 149355, "dt_s": 4.388, "eta_s": 25961, "world_size": 1, "timestamp": "2026-05-05T00:27:50.489782"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32840, "epoch": 0, "train_loss": 3.789538398385048, "train_ppl": 44.23597616440811, "lr": 0.00056, "grad_norm": 0.6319, "tokens_per_sec": 151325, "dt_s": 4.331, "eta_s": 25990, "world_size": 1, "timestamp": "2026-05-05T00:27:54.820586"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32850, "epoch": 0, "train_loss": 3.8656825721263885, "train_ppl": 47.73584447190719, "lr": 0.00056, "grad_norm": 0.6579, "tokens_per_sec": 151304, "dt_s": 4.331, "eta_s": 25887, "world_size": 1, "timestamp": "2026-05-05T00:27:59.152000"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32860, "epoch": 0, "train_loss": 3.8307476192712784, "train_ppl": 46.096988351663605, "lr": 0.00056, "grad_norm": 0.7551, "tokens_per_sec": 149020, "dt_s": 4.398, "eta_s": 26000, "world_size": 1, "timestamp": "2026-05-05T00:28:03.549778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32870, "epoch": 0, "train_loss": 3.75249183177948, "train_ppl": 42.627169504702714, "lr": 0.00056, "grad_norm": 0.6576, "tokens_per_sec": 152294, "dt_s": 4.303, "eta_s": 26016, "world_size": 1, "timestamp": "2026-05-05T00:28:07.853007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32880, "epoch": 0, "train_loss": 3.728128597140312, "train_ppl": 41.60118269953266, "lr": 0.00056, "grad_norm": 0.6368, "tokens_per_sec": 148363, "dt_s": 4.417, "eta_s": 26047, "world_size": 1, "timestamp": "2026-05-05T00:28:12.270311"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32890, "epoch": 0, "train_loss": 3.780699133872986, "train_ppl": 43.84668572540892, "lr": 0.00056, "grad_norm": 0.6662, "tokens_per_sec": 150980, "dt_s": 4.341, "eta_s": 26055, "world_size": 1, "timestamp": "2026-05-05T00:28:16.611007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32900, "epoch": 0, "train_loss": 3.74391633272171, "train_ppl": 42.26318316563573, "lr": 0.00056, "grad_norm": 0.6814, "tokens_per_sec": 152469, "dt_s": 4.298, "eta_s": 26011, "world_size": 1, "timestamp": "2026-05-05T00:28:20.909350"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32910, "epoch": 0, "train_loss": 3.828687533736229, "train_ppl": 46.00212236232776, "lr": 0.00056, "grad_norm": 0.659, "tokens_per_sec": 148820, "dt_s": 4.404, "eta_s": 26014, "world_size": 1, "timestamp": "2026-05-05T00:28:25.313047"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32920, "epoch": 0, "train_loss": 3.84246326982975, "train_ppl": 46.64022050453086, "lr": 0.00056, "grad_norm": 0.659, "tokens_per_sec": 150536, "dt_s": 4.354, "eta_s": 26069, "world_size": 1, "timestamp": "2026-05-05T00:28:29.666560"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32930, "epoch": 0, "train_loss": 3.740434542298317, "train_ppl": 42.11628749759228, "lr": 0.00056, "grad_norm": 0.6614, "tokens_per_sec": 151907, "dt_s": 4.314, "eta_s": 25942, "world_size": 1, "timestamp": "2026-05-05T00:28:33.980765"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32940, "epoch": 0, "train_loss": 3.7367426455020905, "train_ppl": 41.96108518246121, "lr": 0.00056, "grad_norm": 0.697, "tokens_per_sec": 150467, "dt_s": 4.355, "eta_s": 25955, "world_size": 1, "timestamp": "2026-05-05T00:28:38.336287"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32950, "epoch": 0, "train_loss": 3.9156946539878845, "train_ppl": 50.18391986592369, "lr": 0.00056, "grad_norm": 0.6549, "tokens_per_sec": 151713, "dt_s": 4.32, "eta_s": 25976, "world_size": 1, "timestamp": "2026-05-05T00:28:42.655998"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32960, "epoch": 0, "train_loss": 3.7442281246185303, "train_ppl": 42.276362538184614, "lr": 0.00056, "grad_norm": 0.6838, "tokens_per_sec": 151322, "dt_s": 4.331, "eta_s": 25885, "world_size": 1, "timestamp": "2026-05-05T00:28:46.986903"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32970, "epoch": 0, "train_loss": 3.8125965744256973, "train_ppl": 45.267827679628375, "lr": 0.00056, "grad_norm": 0.6625, "tokens_per_sec": 149449, "dt_s": 4.385, "eta_s": 25919, "world_size": 1, "timestamp": "2026-05-05T00:28:51.372073"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32980, "epoch": 0, "train_loss": 3.78179894387722, "train_ppl": 43.894935276828434, "lr": 0.00056, "grad_norm": 0.7183, "tokens_per_sec": 151013, "dt_s": 4.34, "eta_s": 25945, "world_size": 1, "timestamp": "2026-05-05T00:28:55.711835"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 32990, "epoch": 0, "train_loss": 3.692169114947319, "train_ppl": 40.131803108233775, "lr": 0.00056, "grad_norm": 0.7326, "tokens_per_sec": 150038, "dt_s": 4.368, "eta_s": 25955, "world_size": 1, "timestamp": "2026-05-05T00:29:00.079804"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33000, "epoch": 0, "train_loss": 3.893167808651924, "train_ppl": 49.06607252323656, "lr": 0.00056, "grad_norm": 0.7734, "tokens_per_sec": 150510, "dt_s": 4.354, "eta_s": 25992, "world_size": 1, "timestamp": "2026-05-05T00:29:04.434060"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33010, "epoch": 0, "train_loss": 3.826571464538574, "train_ppl": 45.904881608536996, "lr": 0.00056, "grad_norm": 0.6771, "tokens_per_sec": 116943, "dt_s": 5.604, "eta_s": 26586, "world_size": 1, "timestamp": "2026-05-05T00:29:10.038203"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33020, "epoch": 0, "train_loss": 3.677035555243492, "train_ppl": 39.52903856045866, "lr": 0.00056, "grad_norm": 0.646, "tokens_per_sec": 148499, "dt_s": 4.413, "eta_s": 26615, "world_size": 1, "timestamp": "2026-05-05T00:29:14.451399"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33030, "epoch": 0, "train_loss": 3.6661959886550903, "train_ppl": 39.10287480273226, "lr": 0.00056, "grad_norm": 0.8632, "tokens_per_sec": 147201, "dt_s": 4.452, "eta_s": 26745, "world_size": 1, "timestamp": "2026-05-05T00:29:18.903518"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33040, "epoch": 0, "train_loss": 3.7720360159873962, "train_ppl": 43.46847731546088, "lr": 0.00056, "grad_norm": 0.7425, "tokens_per_sec": 151298, "dt_s": 4.332, "eta_s": 26697, "world_size": 1, "timestamp": "2026-05-05T00:29:23.235096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33050, "epoch": 0, "train_loss": 3.814444974064827, "train_ppl": 45.35157809424866, "lr": 0.00056, "grad_norm": 0.7304, "tokens_per_sec": 148255, "dt_s": 4.42, "eta_s": 26771, "world_size": 1, "timestamp": "2026-05-05T00:29:27.655587"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33060, "epoch": 0, "train_loss": 3.831861227750778, "train_ppl": 46.1483509423717, "lr": 0.00056, "grad_norm": 0.6449, "tokens_per_sec": 151747, "dt_s": 4.319, "eta_s": 26154, "world_size": 1, "timestamp": "2026-05-05T00:29:31.974388"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33070, "epoch": 0, "train_loss": 3.8211917132139206, "train_ppl": 45.65858785448375, "lr": 0.00056, "grad_norm": 0.6974, "tokens_per_sec": 150925, "dt_s": 4.342, "eta_s": 26066, "world_size": 1, "timestamp": "2026-05-05T00:29:36.316677"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33080, "epoch": 0, "train_loss": 3.742641642689705, "train_ppl": 42.20934502809215, "lr": 0.00056, "grad_norm": 0.6522, "tokens_per_sec": 149402, "dt_s": 4.387, "eta_s": 25983, "world_size": 1, "timestamp": "2026-05-05T00:29:40.703275"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33090, "epoch": 0, "train_loss": 3.657657578587532, "train_ppl": 38.77041976136735, "lr": 0.00056, "grad_norm": 0.7091, "tokens_per_sec": 152039, "dt_s": 4.31, "eta_s": 25953, "world_size": 1, "timestamp": "2026-05-05T00:29:45.013718"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33100, "epoch": 0, "train_loss": 3.771818161010742, "train_ppl": 43.45900852279939, "lr": 0.00056, "grad_norm": 0.7182, "tokens_per_sec": 149794, "dt_s": 4.375, "eta_s": 25895, "world_size": 1, "timestamp": "2026-05-05T00:29:49.388794"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33110, "epoch": 0, "train_loss": 3.71122345328331, "train_ppl": 40.903819838958306, "lr": 0.00056, "grad_norm": 0.6648, "tokens_per_sec": 149634, "dt_s": 4.38, "eta_s": 25963, "world_size": 1, "timestamp": "2026-05-05T00:29:53.768529"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33120, "epoch": 0, "train_loss": 3.6986291855573654, "train_ppl": 40.39189659664268, "lr": 0.00056, "grad_norm": 0.6722, "tokens_per_sec": 151913, "dt_s": 4.314, "eta_s": 25925, "world_size": 1, "timestamp": "2026-05-05T00:29:58.082587"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33130, "epoch": 0, "train_loss": 3.7457410246133804, "train_ppl": 42.340370853732175, "lr": 0.00056, "grad_norm": 0.6395, "tokens_per_sec": 148956, "dt_s": 4.4, "eta_s": 25937, "world_size": 1, "timestamp": "2026-05-05T00:30:02.482282"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33140, "epoch": 0, "train_loss": 3.8779663294553757, "train_ppl": 48.325836240479696, "lr": 0.00056, "grad_norm": 0.6719, "tokens_per_sec": 150697, "dt_s": 4.349, "eta_s": 25978, "world_size": 1, "timestamp": "2026-05-05T00:30:06.831125"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33150, "epoch": 0, "train_loss": 3.7648893892765045, "train_ppl": 43.158931755164566, "lr": 0.00056, "grad_norm": 0.7352, "tokens_per_sec": 151809, "dt_s": 4.317, "eta_s": 25904, "world_size": 1, "timestamp": "2026-05-05T00:30:11.148138"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33160, "epoch": 0, "train_loss": 3.7509991377592087, "train_ppl": 42.563587649615144, "lr": 0.00056, "grad_norm": 0.6444, "tokens_per_sec": 150252, "dt_s": 4.362, "eta_s": 25878, "world_size": 1, "timestamp": "2026-05-05T00:30:15.509891"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33170, "epoch": 0, "train_loss": 3.8537431806325912, "train_ppl": 47.169296386226776, "lr": 0.00056, "grad_norm": 0.6626, "tokens_per_sec": 151540, "dt_s": 4.325, "eta_s": 25887, "world_size": 1, "timestamp": "2026-05-05T00:30:19.834544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33180, "epoch": 0, "train_loss": 3.731514409184456, "train_ppl": 41.742275206535034, "lr": 0.00056, "grad_norm": 0.6498, "tokens_per_sec": 149729, "dt_s": 4.377, "eta_s": 25855, "world_size": 1, "timestamp": "2026-05-05T00:30:24.211517"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33190, "epoch": 0, "train_loss": 3.6996962279081345, "train_ppl": 40.4350194638104, "lr": 0.00056, "grad_norm": 0.7163, "tokens_per_sec": 151017, "dt_s": 4.34, "eta_s": 25840, "world_size": 1, "timestamp": "2026-05-05T00:30:28.551157"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33200, "epoch": 0, "train_loss": 3.7572536915540695, "train_ppl": 42.830638168618734, "lr": 0.00056, "grad_norm": 0.6315, "tokens_per_sec": 150572, "dt_s": 4.352, "eta_s": 25878, "world_size": 1, "timestamp": "2026-05-05T00:30:32.903657"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33210, "epoch": 0, "train_loss": 3.757463753223419, "train_ppl": 42.839636189008246, "lr": 0.00056, "grad_norm": 0.6115, "tokens_per_sec": 149968, "dt_s": 4.37, "eta_s": 25884, "world_size": 1, "timestamp": "2026-05-05T00:30:37.273633"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33220, "epoch": 0, "train_loss": 3.7921057641506195, "train_ppl": 44.34969200783907, "lr": 0.00056, "grad_norm": 0.6495, "tokens_per_sec": 151042, "dt_s": 4.339, "eta_s": 25896, "world_size": 1, "timestamp": "2026-05-05T00:30:41.612571"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33230, "epoch": 0, "train_loss": 3.8205303102731705, "train_ppl": 45.628399114767426, "lr": 0.00056, "grad_norm": 0.668, "tokens_per_sec": 149992, "dt_s": 4.369, "eta_s": 25883, "world_size": 1, "timestamp": "2026-05-05T00:30:45.981880"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33240, "epoch": 0, "train_loss": 3.6648781299591064, "train_ppl": 39.051376680213096, "lr": 0.00056, "grad_norm": 0.6713, "tokens_per_sec": 149428, "dt_s": 4.386, "eta_s": 25933, "world_size": 1, "timestamp": "2026-05-05T00:30:50.367683"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33250, "epoch": 0, "train_loss": 3.834124431014061, "train_ppl": 46.25291231801182, "lr": 0.00056, "grad_norm": 0.7398, "tokens_per_sec": 148313, "dt_s": 4.419, "eta_s": 26008, "world_size": 1, "timestamp": "2026-05-05T00:30:54.786434"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33260, "epoch": 0, "train_loss": 3.7149248868227005, "train_ppl": 41.05550315928456, "lr": 0.00056, "grad_norm": 0.6274, "tokens_per_sec": 150435, "dt_s": 4.356, "eta_s": 25987, "world_size": 1, "timestamp": "2026-05-05T00:30:59.142872"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33270, "epoch": 0, "train_loss": 3.744090661406517, "train_ppl": 42.2705514930095, "lr": 0.00056, "grad_norm": 0.6233, "tokens_per_sec": 149221, "dt_s": 4.392, "eta_s": 26046, "world_size": 1, "timestamp": "2026-05-05T00:31:03.534738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33280, "epoch": 0, "train_loss": 3.7649916857481003, "train_ppl": 43.163346987428454, "lr": 0.00056, "grad_norm": 0.7104, "tokens_per_sec": 149513, "dt_s": 4.383, "eta_s": 26058, "world_size": 1, "timestamp": "2026-05-05T00:31:07.918038"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33290, "epoch": 0, "train_loss": 3.7525884956121445, "train_ppl": 42.631290209441026, "lr": 0.00056, "grad_norm": 0.6502, "tokens_per_sec": 150263, "dt_s": 4.361, "eta_s": 26024, "world_size": 1, "timestamp": "2026-05-05T00:31:12.279440"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33300, "epoch": 0, "train_loss": 3.8138909190893173, "train_ppl": 45.32645778641671, "lr": 0.00056, "grad_norm": 0.7086, "tokens_per_sec": 151262, "dt_s": 4.333, "eta_s": 25918, "world_size": 1, "timestamp": "2026-05-05T00:31:16.612068"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33310, "epoch": 0, "train_loss": 3.8725365847349167, "train_ppl": 48.06415037294106, "lr": 0.00056, "grad_norm": 0.6409, "tokens_per_sec": 135450, "dt_s": 4.838, "eta_s": 26486, "world_size": 1, "timestamp": "2026-05-05T00:31:21.450456"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33320, "epoch": 0, "train_loss": 3.7328954190015793, "train_ppl": 41.79996152190002, "lr": 0.00056, "grad_norm": 0.6619, "tokens_per_sec": 149625, "dt_s": 4.38, "eta_s": 26467, "world_size": 1, "timestamp": "2026-05-05T00:31:25.830484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33330, "epoch": 0, "train_loss": 3.700355216860771, "train_ppl": 40.46167447665158, "lr": 0.00056, "grad_norm": 0.6367, "tokens_per_sec": 151076, "dt_s": 4.338, "eta_s": 26409, "world_size": 1, "timestamp": "2026-05-05T00:31:30.168446"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33340, "epoch": 0, "train_loss": 3.820304751396179, "train_ppl": 45.61810838493055, "lr": 0.00056, "grad_norm": 0.8261, "tokens_per_sec": 150261, "dt_s": 4.361, "eta_s": 26405, "world_size": 1, "timestamp": "2026-05-05T00:31:34.529897"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33350, "epoch": 0, "train_loss": 3.729567512869835, "train_ppl": 41.661086383526566, "lr": 0.00056, "grad_norm": 0.647, "tokens_per_sec": 149799, "dt_s": 4.375, "eta_s": 26450, "world_size": 1, "timestamp": "2026-05-05T00:31:38.904835"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33360, "epoch": 0, "train_loss": 3.8309751600027084, "train_ppl": 46.1074784875312, "lr": 0.00056, "grad_norm": 0.7218, "tokens_per_sec": 151219, "dt_s": 4.334, "eta_s": 25847, "world_size": 1, "timestamp": "2026-05-05T00:31:43.238685"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33370, "epoch": 0, "train_loss": 3.8225360959768295, "train_ppl": 45.72001175233755, "lr": 0.00056, "grad_norm": 0.6392, "tokens_per_sec": 151118, "dt_s": 4.337, "eta_s": 25792, "world_size": 1, "timestamp": "2026-05-05T00:31:47.575414"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33380, "epoch": 0, "train_loss": 3.7507716715335846, "train_ppl": 42.5539069720388, "lr": 0.00056, "grad_norm": 0.686, "tokens_per_sec": 148660, "dt_s": 4.408, "eta_s": 25871, "world_size": 1, "timestamp": "2026-05-05T00:31:51.983874"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33390, "epoch": 0, "train_loss": 3.827282652258873, "train_ppl": 45.93754020845984, "lr": 0.00056, "grad_norm": 0.7103, "tokens_per_sec": 151733, "dt_s": 4.319, "eta_s": 25816, "world_size": 1, "timestamp": "2026-05-05T00:31:56.303052"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33400, "epoch": 0, "train_loss": 3.8252715468406677, "train_ppl": 45.845247808427196, "lr": 0.00056, "grad_norm": 0.7023, "tokens_per_sec": 149536, "dt_s": 4.383, "eta_s": 25821, "world_size": 1, "timestamp": "2026-05-05T00:32:00.685676"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33410, "epoch": 0, "train_loss": 3.745868429541588, "train_ppl": 42.34576556929045, "lr": 0.00056, "grad_norm": 0.6276, "tokens_per_sec": 150010, "dt_s": 4.369, "eta_s": 25858, "world_size": 1, "timestamp": "2026-05-05T00:32:05.054456"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33420, "epoch": 0, "train_loss": 3.7900294810533524, "train_ppl": 44.25770502051362, "lr": 0.00056, "grad_norm": 0.664, "tokens_per_sec": 151904, "dt_s": 4.314, "eta_s": 25827, "world_size": 1, "timestamp": "2026-05-05T00:32:09.368764"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33430, "epoch": 0, "train_loss": 3.6961365938186646, "train_ppl": 40.29134146238643, "lr": 0.00056, "grad_norm": 0.6338, "tokens_per_sec": 149920, "dt_s": 4.371, "eta_s": 25779, "world_size": 1, "timestamp": "2026-05-05T00:32:13.740189"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33440, "epoch": 0, "train_loss": 3.745350733399391, "train_ppl": 42.323849003367165, "lr": 0.00056, "grad_norm": 0.6458, "tokens_per_sec": 151964, "dt_s": 4.313, "eta_s": 25767, "world_size": 1, "timestamp": "2026-05-05T00:32:18.052780"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33450, "epoch": 0, "train_loss": 3.654844492673874, "train_ppl": 38.66150849985983, "lr": 0.00056, "grad_norm": 0.9687, "tokens_per_sec": 151097, "dt_s": 4.337, "eta_s": 25709, "world_size": 1, "timestamp": "2026-05-05T00:32:22.390125"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33460, "epoch": 0, "train_loss": 3.761814296245575, "train_ppl": 43.0264178755788, "lr": 0.00056, "grad_norm": 0.6448, "tokens_per_sec": 149722, "dt_s": 4.377, "eta_s": 25714, "world_size": 1, "timestamp": "2026-05-05T00:32:26.767336"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33470, "epoch": 0, "train_loss": 3.803506389260292, "train_ppl": 44.85819936328566, "lr": 0.00056, "grad_norm": 0.6898, "tokens_per_sec": 152989, "dt_s": 4.284, "eta_s": 25674, "world_size": 1, "timestamp": "2026-05-05T00:32:31.051008"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33480, "epoch": 0, "train_loss": 3.829421356320381, "train_ppl": 46.035892147636325, "lr": 0.00056, "grad_norm": 0.6462, "tokens_per_sec": 151693, "dt_s": 4.32, "eta_s": 25609, "world_size": 1, "timestamp": "2026-05-05T00:32:35.371339"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33490, "epoch": 0, "train_loss": 3.8046102821826935, "train_ppl": 44.907745353777266, "lr": 0.00056, "grad_norm": 0.6346, "tokens_per_sec": 149583, "dt_s": 4.381, "eta_s": 25686, "world_size": 1, "timestamp": "2026-05-05T00:32:39.752566"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33500, "epoch": 0, "train_loss": 3.676666244864464, "train_ppl": 39.51444277159933, "lr": 0.00056, "grad_norm": 0.6745, "tokens_per_sec": 152054, "dt_s": 4.31, "eta_s": 25649, "world_size": 1, "timestamp": "2026-05-05T00:32:44.062622"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33510, "epoch": 0, "train_loss": 3.709130138158798, "train_ppl": 40.818284811356904, "lr": 0.00056, "grad_norm": 0.6272, "tokens_per_sec": 127387, "dt_s": 5.145, "eta_s": 25651, "world_size": 1, "timestamp": "2026-05-05T00:32:49.207274"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33520, "epoch": 0, "train_loss": 3.8828991800546646, "train_ppl": 48.564809295350436, "lr": 0.00056, "grad_norm": 0.6926, "tokens_per_sec": 149666, "dt_s": 4.379, "eta_s": 25759, "world_size": 1, "timestamp": "2026-05-05T00:32:53.586100"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33530, "epoch": 0, "train_loss": 3.7427124977111816, "train_ppl": 42.21233587809776, "lr": 0.00056, "grad_norm": 0.7409, "tokens_per_sec": 149348, "dt_s": 4.388, "eta_s": 25835, "world_size": 1, "timestamp": "2026-05-05T00:32:57.974282"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33540, "epoch": 0, "train_loss": 3.79513181746006, "train_ppl": 44.48409980012007, "lr": 0.00056, "grad_norm": 0.6787, "tokens_per_sec": 146511, "dt_s": 4.473, "eta_s": 25940, "world_size": 1, "timestamp": "2026-05-05T00:33:02.447380"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33550, "epoch": 0, "train_loss": 3.7215703576803207, "train_ppl": 41.329244872944, "lr": 0.00056, "grad_norm": 0.7391, "tokens_per_sec": 150888, "dt_s": 4.343, "eta_s": 25974, "world_size": 1, "timestamp": "2026-05-05T00:33:06.790715"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33560, "epoch": 0, "train_loss": 3.7231752425432205, "train_ppl": 41.395626805867245, "lr": 0.00056, "grad_norm": 0.6936, "tokens_per_sec": 150508, "dt_s": 4.354, "eta_s": 25937, "world_size": 1, "timestamp": "2026-05-05T00:33:11.145030"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33570, "epoch": 0, "train_loss": 3.8554630428552628, "train_ppl": 47.25049087880774, "lr": 0.00056, "grad_norm": 0.6834, "tokens_per_sec": 149239, "dt_s": 4.391, "eta_s": 25947, "world_size": 1, "timestamp": "2026-05-05T00:33:15.536392"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33580, "epoch": 0, "train_loss": 3.865322783589363, "train_ppl": 47.71867275154028, "lr": 0.00056, "grad_norm": 0.682, "tokens_per_sec": 152956, "dt_s": 4.285, "eta_s": 25821, "world_size": 1, "timestamp": "2026-05-05T00:33:19.821018"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33590, "epoch": 0, "train_loss": 3.7356227338314056, "train_ppl": 41.91411877747209, "lr": 0.00056, "grad_norm": 0.7694, "tokens_per_sec": 151791, "dt_s": 4.318, "eta_s": 25632, "world_size": 1, "timestamp": "2026-05-05T00:33:24.138539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33600, "epoch": 0, "train_loss": 3.758087679743767, "train_ppl": 42.86637331428202, "lr": 0.00056, "grad_norm": 0.661, "tokens_per_sec": 134302, "dt_s": 4.88, "eta_s": 26262, "world_size": 1, "timestamp": "2026-05-05T00:33:29.018317"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33610, "epoch": 0, "train_loss": 3.7097598761320114, "train_ppl": 40.84399773065514, "lr": 0.00056, "grad_norm": 0.6422, "tokens_per_sec": 154055, "dt_s": 4.254, "eta_s": 26139, "world_size": 1, "timestamp": "2026-05-05T00:33:33.272374"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33620, "epoch": 0, "train_loss": 3.826394036412239, "train_ppl": 45.89673751392064, "lr": 0.00056, "grad_norm": 0.7206, "tokens_per_sec": 148838, "dt_s": 4.403, "eta_s": 26149, "world_size": 1, "timestamp": "2026-05-05T00:33:37.675631"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33630, "epoch": 0, "train_loss": 3.8642715364694595, "train_ppl": 47.66853499244938, "lr": 0.00056, "grad_norm": 0.681, "tokens_per_sec": 149110, "dt_s": 4.395, "eta_s": 26275, "world_size": 1, "timestamp": "2026-05-05T00:33:42.070665"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33640, "epoch": 0, "train_loss": 3.8250811100006104, "train_ppl": 45.83651801556638, "lr": 0.00056, "grad_norm": 0.6601, "tokens_per_sec": 152767, "dt_s": 4.29, "eta_s": 26237, "world_size": 1, "timestamp": "2026-05-05T00:33:46.360601"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33650, "epoch": 0, "train_loss": 3.784178376197815, "train_ppl": 43.99950466308855, "lr": 0.00056, "grad_norm": 0.6613, "tokens_per_sec": 147565, "dt_s": 4.441, "eta_s": 25715, "world_size": 1, "timestamp": "2026-05-05T00:33:50.801769"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33660, "epoch": 0, "train_loss": 3.7951570451259613, "train_ppl": 44.48522204428348, "lr": 0.00056, "grad_norm": 0.6512, "tokens_per_sec": 148925, "dt_s": 4.401, "eta_s": 25884, "world_size": 1, "timestamp": "2026-05-05T00:33:55.202376"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33670, "epoch": 0, "train_loss": 3.78965824842453, "train_ppl": 44.241278165614126, "lr": 0.00056, "grad_norm": 0.6504, "tokens_per_sec": 149474, "dt_s": 4.384, "eta_s": 25857, "world_size": 1, "timestamp": "2026-05-05T00:33:59.586856"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33680, "epoch": 0, "train_loss": 3.660301387310028, "train_ppl": 38.873056952073505, "lr": 0.00056, "grad_norm": 0.6572, "tokens_per_sec": 146788, "dt_s": 4.465, "eta_s": 25935, "world_size": 1, "timestamp": "2026-05-05T00:34:04.051481"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33690, "epoch": 0, "train_loss": 3.8079892694950104, "train_ppl": 45.05974471288366, "lr": 0.00056, "grad_norm": 0.6894, "tokens_per_sec": 150567, "dt_s": 4.353, "eta_s": 26005, "world_size": 1, "timestamp": "2026-05-05T00:34:08.404080"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33700, "epoch": 0, "train_loss": 3.8145972788333893, "train_ppl": 45.358485881885386, "lr": 0.00056, "grad_norm": 0.651, "tokens_per_sec": 148435, "dt_s": 4.415, "eta_s": 25970, "world_size": 1, "timestamp": "2026-05-05T00:34:12.819214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33710, "epoch": 0, "train_loss": 3.8054977655410767, "train_ppl": 44.947617920944104, "lr": 0.00056, "grad_norm": 0.6842, "tokens_per_sec": 147621, "dt_s": 4.439, "eta_s": 26011, "world_size": 1, "timestamp": "2026-05-05T00:34:17.258699"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33720, "epoch": 0, "train_loss": 3.752165362238884, "train_ppl": 42.613255303662605, "lr": 0.00056, "grad_norm": 0.6594, "tokens_per_sec": 149038, "dt_s": 4.397, "eta_s": 26022, "world_size": 1, "timestamp": "2026-05-05T00:34:21.655984"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33730, "epoch": 0, "train_loss": 3.841456860303879, "train_ppl": 46.59330495441455, "lr": 0.00056, "grad_norm": 0.7016, "tokens_per_sec": 146121, "dt_s": 4.485, "eta_s": 26041, "world_size": 1, "timestamp": "2026-05-05T00:34:26.141020"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33740, "epoch": 0, "train_loss": 3.8208827823400497, "train_ppl": 45.644484685602464, "lr": 0.00056, "grad_norm": 0.6204, "tokens_per_sec": 147171, "dt_s": 4.453, "eta_s": 26155, "world_size": 1, "timestamp": "2026-05-05T00:34:30.594092"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33750, "epoch": 0, "train_loss": 4.009813666343689, "train_ppl": 55.136595802734206, "lr": 0.00056, "grad_norm": 0.6656, "tokens_per_sec": 149848, "dt_s": 4.374, "eta_s": 26102, "world_size": 1, "timestamp": "2026-05-05T00:34:34.967592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33760, "epoch": 0, "train_loss": 3.7549126595258713, "train_ppl": 42.730487546523854, "lr": 0.00056, "grad_norm": 0.6496, "tokens_per_sec": 144843, "dt_s": 4.525, "eta_s": 26198, "world_size": 1, "timestamp": "2026-05-05T00:34:39.492226"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33770, "epoch": 0, "train_loss": 3.85970875620842, "train_ppl": 47.45152939283223, "lr": 0.00056, "grad_norm": 0.6686, "tokens_per_sec": 147758, "dt_s": 4.435, "eta_s": 26238, "world_size": 1, "timestamp": "2026-05-05T00:34:43.927585"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33780, "epoch": 0, "train_loss": 3.9036984145641327, "train_ppl": 49.58549812974499, "lr": 0.00056, "grad_norm": 0.6694, "tokens_per_sec": 147633, "dt_s": 4.439, "eta_s": 26180, "world_size": 1, "timestamp": "2026-05-05T00:34:48.366693"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33790, "epoch": 0, "train_loss": 3.919840857386589, "train_ppl": 50.392424557762766, "lr": 0.00056, "grad_norm": 0.6527, "tokens_per_sec": 145775, "dt_s": 4.496, "eta_s": 26225, "world_size": 1, "timestamp": "2026-05-05T00:34:52.862409"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33800, "epoch": 0, "train_loss": 3.8089551776647568, "train_ppl": 45.10328931508651, "lr": 0.00056, "grad_norm": 0.6584, "tokens_per_sec": 148187, "dt_s": 4.423, "eta_s": 26279, "world_size": 1, "timestamp": "2026-05-05T00:34:57.284912"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33810, "epoch": 0, "train_loss": 3.748815968632698, "train_ppl": 42.47076549923908, "lr": 0.00056, "grad_norm": 0.6261, "tokens_per_sec": 146144, "dt_s": 4.484, "eta_s": 26227, "world_size": 1, "timestamp": "2026-05-05T00:35:01.769280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33820, "epoch": 0, "train_loss": 3.7224884778261185, "train_ppl": 41.36720750973657, "lr": 0.00056, "grad_norm": 0.6751, "tokens_per_sec": 149344, "dt_s": 4.388, "eta_s": 26167, "world_size": 1, "timestamp": "2026-05-05T00:35:06.157538"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33830, "epoch": 0, "train_loss": 3.7744238525629044, "train_ppl": 43.572396957660565, "lr": 0.00056, "grad_norm": 0.6683, "tokens_per_sec": 148616, "dt_s": 4.41, "eta_s": 26128, "world_size": 1, "timestamp": "2026-05-05T00:35:10.567319"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33840, "epoch": 0, "train_loss": 3.796805754303932, "train_ppl": 44.55862573219153, "lr": 0.00056, "grad_norm": 0.693, "tokens_per_sec": 146208, "dt_s": 4.482, "eta_s": 26108, "world_size": 1, "timestamp": "2026-05-05T00:35:15.049685"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33850, "epoch": 0, "train_loss": 3.738810211420059, "train_ppl": 42.047932242137755, "lr": 0.00056, "grad_norm": 0.6529, "tokens_per_sec": 147536, "dt_s": 4.442, "eta_s": 26126, "world_size": 1, "timestamp": "2026-05-05T00:35:19.491738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33860, "epoch": 0, "train_loss": 3.9170954823493958, "train_ppl": 50.25426818562057, "lr": 0.00056, "grad_norm": 0.6837, "tokens_per_sec": 147677, "dt_s": 4.438, "eta_s": 26067, "world_size": 1, "timestamp": "2026-05-05T00:35:23.929520"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33870, "epoch": 0, "train_loss": 3.7995695024728775, "train_ppl": 44.68194488551593, "lr": 0.00056, "grad_norm": 0.6586, "tokens_per_sec": 144443, "dt_s": 4.537, "eta_s": 26238, "world_size": 1, "timestamp": "2026-05-05T00:35:28.466742"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33880, "epoch": 0, "train_loss": 3.7075138986110687, "train_ppl": 40.7523659698432, "lr": 0.00056, "grad_norm": 0.6445, "tokens_per_sec": 147940, "dt_s": 4.43, "eta_s": 26257, "world_size": 1, "timestamp": "2026-05-05T00:35:32.896603"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33890, "epoch": 0, "train_loss": 3.659257009625435, "train_ppl": 38.832479991385156, "lr": 0.00056, "grad_norm": 0.6781, "tokens_per_sec": 146623, "dt_s": 4.47, "eta_s": 26238, "world_size": 1, "timestamp": "2026-05-05T00:35:37.366320"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33900, "epoch": 0, "train_loss": 3.815973475575447, "train_ppl": 45.42095105470572, "lr": 0.00056, "grad_norm": 0.688, "tokens_per_sec": 128716, "dt_s": 5.092, "eta_s": 26997, "world_size": 1, "timestamp": "2026-05-05T00:35:42.457833"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33910, "epoch": 0, "train_loss": 3.7923834174871445, "train_ppl": 44.3620075574465, "lr": 0.00056, "grad_norm": 0.6292, "tokens_per_sec": 146664, "dt_s": 4.468, "eta_s": 27028, "world_size": 1, "timestamp": "2026-05-05T00:35:46.926264"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33920, "epoch": 0, "train_loss": 3.800695061683655, "train_ppl": 44.7322653741695, "lr": 0.00056, "grad_norm": 0.6427, "tokens_per_sec": 144259, "dt_s": 4.543, "eta_s": 27030, "world_size": 1, "timestamp": "2026-05-05T00:35:51.469197"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33930, "epoch": 0, "train_loss": 3.743460714817047, "train_ppl": 42.24393168866924, "lr": 0.00056, "grad_norm": 0.6836, "tokens_per_sec": 144718, "dt_s": 4.529, "eta_s": 27142, "world_size": 1, "timestamp": "2026-05-05T00:35:55.997723"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33940, "epoch": 0, "train_loss": 3.6902939528226852, "train_ppl": 40.056619983356164, "lr": 0.00056, "grad_norm": 0.6711, "tokens_per_sec": 146504, "dt_s": 4.473, "eta_s": 27141, "world_size": 1, "timestamp": "2026-05-05T00:36:00.471021"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33950, "epoch": 0, "train_loss": 3.847080275416374, "train_ppl": 46.856056537837524, "lr": 0.00056, "grad_norm": 0.6459, "tokens_per_sec": 144261, "dt_s": 4.543, "eta_s": 26492, "world_size": 1, "timestamp": "2026-05-05T00:36:05.013890"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33960, "epoch": 0, "train_loss": 3.7552461475133896, "train_ppl": 42.74474002720488, "lr": 0.00056, "grad_norm": 0.7057, "tokens_per_sec": 148267, "dt_s": 4.42, "eta_s": 26431, "world_size": 1, "timestamp": "2026-05-05T00:36:09.434046"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33970, "epoch": 0, "train_loss": 3.7061233520507812, "train_ppl": 40.69573728905376, "lr": 0.00056, "grad_norm": 0.67, "tokens_per_sec": 147722, "dt_s": 4.436, "eta_s": 26301, "world_size": 1, "timestamp": "2026-05-05T00:36:13.870466"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33980, "epoch": 0, "train_loss": 3.819038301706314, "train_ppl": 45.56037191359926, "lr": 0.00056, "grad_norm": 0.9098, "tokens_per_sec": 145980, "dt_s": 4.489, "eta_s": 26251, "world_size": 1, "timestamp": "2026-05-05T00:36:18.359866"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 33990, "epoch": 0, "train_loss": 3.7048972845077515, "train_ppl": 40.64587214168995, "lr": 0.00056, "grad_norm": 0.6426, "tokens_per_sec": 148925, "dt_s": 4.401, "eta_s": 26161, "world_size": 1, "timestamp": "2026-05-05T00:36:22.760463"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34000, "epoch": 0, "train_loss": 3.6823081374168396, "train_ppl": 39.73800908752301, "lr": 0.00056, "grad_norm": 0.6599, "tokens_per_sec": 145531, "dt_s": 4.503, "eta_s": 26110, "world_size": 1, "timestamp": "2026-05-05T00:36:27.263688"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34010, "epoch": 0, "train_loss": 3.759045660495758, "train_ppl": 42.90745815092407, "lr": 0.00056, "grad_norm": 0.699, "tokens_per_sec": 124558, "dt_s": 5.261, "eta_s": 26158, "world_size": 1, "timestamp": "2026-05-05T00:36:32.525169"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34020, "epoch": 0, "train_loss": 3.7240557223558426, "train_ppl": 41.43209087018119, "lr": 0.00056, "grad_norm": 0.7439, "tokens_per_sec": 145214, "dt_s": 4.513, "eta_s": 26243, "world_size": 1, "timestamp": "2026-05-05T00:36:37.038220"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34030, "epoch": 0, "train_loss": 3.851087301969528, "train_ppl": 47.04418667005619, "lr": 0.00056, "grad_norm": 0.6693, "tokens_per_sec": 146525, "dt_s": 4.473, "eta_s": 26219, "world_size": 1, "timestamp": "2026-05-05T00:36:41.510882"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34040, "epoch": 0, "train_loss": 3.7363975197076797, "train_ppl": 41.94660582834629, "lr": 0.00056, "grad_norm": 0.6555, "tokens_per_sec": 147108, "dt_s": 4.455, "eta_s": 26278, "world_size": 1, "timestamp": "2026-05-05T00:36:45.965842"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34050, "epoch": 0, "train_loss": 3.83559912443161, "train_ppl": 46.32117150166369, "lr": 0.00056, "grad_norm": 0.6871, "tokens_per_sec": 148657, "dt_s": 4.409, "eta_s": 26163, "world_size": 1, "timestamp": "2026-05-05T00:36:50.374369"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34060, "epoch": 0, "train_loss": 3.748454123735428, "train_ppl": 42.45540044951009, "lr": 0.00056, "grad_norm": 0.6326, "tokens_per_sec": 146577, "dt_s": 4.471, "eta_s": 26166, "world_size": 1, "timestamp": "2026-05-05T00:36:54.845455"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34070, "epoch": 0, "train_loss": 3.805758461356163, "train_ppl": 44.95933710433886, "lr": 0.00056, "grad_norm": 0.683, "tokens_per_sec": 146451, "dt_s": 4.475, "eta_s": 26117, "world_size": 1, "timestamp": "2026-05-05T00:36:59.320420"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34080, "epoch": 0, "train_loss": 3.78965924680233, "train_ppl": 44.24132233514614, "lr": 0.00056, "grad_norm": 0.7325, "tokens_per_sec": 145003, "dt_s": 4.52, "eta_s": 26168, "world_size": 1, "timestamp": "2026-05-05T00:37:03.840039"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34090, "epoch": 0, "train_loss": 3.7672246992588043, "train_ppl": 43.25983901832768, "lr": 0.00056, "grad_norm": 0.629, "tokens_per_sec": 146836, "dt_s": 4.463, "eta_s": 26173, "world_size": 1, "timestamp": "2026-05-05T00:37:08.303258"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34100, "epoch": 0, "train_loss": 3.7162608802318573, "train_ppl": 41.11038969677734, "lr": 0.00056, "grad_norm": 0.6924, "tokens_per_sec": 147025, "dt_s": 4.457, "eta_s": 26226, "world_size": 1, "timestamp": "2026-05-05T00:37:12.760724"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34110, "epoch": 0, "train_loss": 3.8198578357696533, "train_ppl": 45.59772549449616, "lr": 0.00056, "grad_norm": 0.6796, "tokens_per_sec": 145937, "dt_s": 4.491, "eta_s": 26244, "world_size": 1, "timestamp": "2026-05-05T00:37:17.251409"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34120, "epoch": 0, "train_loss": 3.8946177512407303, "train_ppl": 49.13726711301447, "lr": 0.00056, "grad_norm": 0.7041, "tokens_per_sec": 146334, "dt_s": 4.479, "eta_s": 26244, "world_size": 1, "timestamp": "2026-05-05T00:37:21.729967"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34130, "epoch": 0, "train_loss": 3.916589245200157, "train_ppl": 50.228834046553416, "lr": 0.00056, "grad_norm": 0.6741, "tokens_per_sec": 147671, "dt_s": 4.438, "eta_s": 26144, "world_size": 1, "timestamp": "2026-05-05T00:37:26.167898"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34140, "epoch": 0, "train_loss": 3.8149775117635727, "train_ppl": 45.37573595119531, "lr": 0.00056, "grad_norm": 0.6577, "tokens_per_sec": 144441, "dt_s": 4.537, "eta_s": 26226, "world_size": 1, "timestamp": "2026-05-05T00:37:30.705125"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34150, "epoch": 0, "train_loss": 3.7716140002012253, "train_ppl": 43.45013680209822, "lr": 0.00056, "grad_norm": 0.6775, "tokens_per_sec": 146529, "dt_s": 4.473, "eta_s": 26239, "world_size": 1, "timestamp": "2026-05-05T00:37:35.177676"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34160, "epoch": 0, "train_loss": 3.73654642701149, "train_ppl": 41.95285244939639, "lr": 0.00056, "grad_norm": 0.7071, "tokens_per_sec": 148501, "dt_s": 4.413, "eta_s": 26144, "world_size": 1, "timestamp": "2026-05-05T00:37:39.590845"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34170, "epoch": 0, "train_loss": 3.7326690107584, "train_ppl": 41.79049873731352, "lr": 0.00056, "grad_norm": 0.6947, "tokens_per_sec": 144837, "dt_s": 4.525, "eta_s": 26194, "world_size": 1, "timestamp": "2026-05-05T00:37:44.115647"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34180, "epoch": 0, "train_loss": 3.796096056699753, "train_ppl": 44.5270138010466, "lr": 0.00056, "grad_norm": 0.6967, "tokens_per_sec": 145953, "dt_s": 4.49, "eta_s": 26250, "world_size": 1, "timestamp": "2026-05-05T00:37:48.605878"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34190, "epoch": 0, "train_loss": 3.7949669510126114, "train_ppl": 44.47676646914479, "lr": 0.00056, "grad_norm": 0.6562, "tokens_per_sec": 130620, "dt_s": 5.017, "eta_s": 26807, "world_size": 1, "timestamp": "2026-05-05T00:37:53.623154"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34200, "epoch": 0, "train_loss": 3.7593298852443695, "train_ppl": 42.91965524570656, "lr": 0.00056, "grad_norm": 0.7232, "tokens_per_sec": 146242, "dt_s": 4.481, "eta_s": 26813, "world_size": 1, "timestamp": "2026-05-05T00:37:58.104507"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34210, "epoch": 0, "train_loss": 3.88989982008934, "train_ppl": 48.905986880916636, "lr": 0.00056, "grad_norm": 0.6555, "tokens_per_sec": 147437, "dt_s": 4.445, "eta_s": 26846, "world_size": 1, "timestamp": "2026-05-05T00:38:02.549515"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34220, "epoch": 0, "train_loss": 3.7450507134199142, "train_ppl": 42.3111529076942, "lr": 0.00056, "grad_norm": 0.6715, "tokens_per_sec": 145948, "dt_s": 4.49, "eta_s": 26801, "world_size": 1, "timestamp": "2026-05-05T00:38:07.039894"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34230, "epoch": 0, "train_loss": 3.8848880976438522, "train_ppl": 48.66149681865839, "lr": 0.00056, "grad_norm": 0.7335, "tokens_per_sec": 146186, "dt_s": 4.483, "eta_s": 26788, "world_size": 1, "timestamp": "2026-05-05T00:38:11.522965"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34240, "epoch": 0, "train_loss": 3.852466493844986, "train_ppl": 47.10911439370156, "lr": 0.00056, "grad_norm": 0.7082, "tokens_per_sec": 148255, "dt_s": 4.42, "eta_s": 26086, "world_size": 1, "timestamp": "2026-05-05T00:38:15.943447"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34250, "epoch": 0, "train_loss": 3.7502122968435287, "train_ppl": 42.53011004983415, "lr": 0.00056, "grad_norm": 0.6516, "tokens_per_sec": 147115, "dt_s": 4.455, "eta_s": 26050, "world_size": 1, "timestamp": "2026-05-05T00:38:20.398205"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34260, "epoch": 0, "train_loss": 3.800392761826515, "train_ppl": 44.718744860465, "lr": 0.00056, "grad_norm": 0.6782, "tokens_per_sec": 146685, "dt_s": 4.468, "eta_s": 26072, "world_size": 1, "timestamp": "2026-05-05T00:38:24.866023"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34270, "epoch": 0, "train_loss": 3.823690637946129, "train_ppl": 45.77282790810988, "lr": 0.00056, "grad_norm": 0.6571, "tokens_per_sec": 146408, "dt_s": 4.476, "eta_s": 26052, "world_size": 1, "timestamp": "2026-05-05T00:38:29.342263"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34280, "epoch": 0, "train_loss": 3.7567591071128845, "train_ppl": 42.80946003899292, "lr": 0.00056, "grad_norm": 0.7115, "tokens_per_sec": 149115, "dt_s": 4.395, "eta_s": 25944, "world_size": 1, "timestamp": "2026-05-05T00:38:33.737264"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34290, "epoch": 0, "train_loss": 3.8383305370807648, "train_ppl": 46.44786668507162, "lr": 0.00056, "grad_norm": 0.6895, "tokens_per_sec": 148198, "dt_s": 4.422, "eta_s": 25942, "world_size": 1, "timestamp": "2026-05-05T00:38:38.159444"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34300, "epoch": 0, "train_loss": 3.702153742313385, "train_ppl": 40.53451130786689, "lr": 0.00056, "grad_norm": 0.6466, "tokens_per_sec": 147315, "dt_s": 4.449, "eta_s": 25930, "world_size": 1, "timestamp": "2026-05-05T00:38:42.608131"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34310, "epoch": 0, "train_loss": 3.7873667925596237, "train_ppl": 44.1400172910112, "lr": 0.00056, "grad_norm": 0.6332, "tokens_per_sec": 149249, "dt_s": 4.391, "eta_s": 25836, "world_size": 1, "timestamp": "2026-05-05T00:38:46.999197"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34320, "epoch": 0, "train_loss": 3.815622940659523, "train_ppl": 45.40503221566402, "lr": 0.00056, "grad_norm": 0.7432, "tokens_per_sec": 147814, "dt_s": 4.434, "eta_s": 25782, "world_size": 1, "timestamp": "2026-05-05T00:38:51.432888"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34330, "epoch": 0, "train_loss": 3.639524668455124, "train_ppl": 38.07373477635569, "lr": 0.00056, "grad_norm": 0.6662, "tokens_per_sec": 148020, "dt_s": 4.428, "eta_s": 25816, "world_size": 1, "timestamp": "2026-05-05T00:38:55.860429"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34340, "epoch": 0, "train_loss": 3.598481461405754, "train_ppl": 36.54270078786535, "lr": 0.00056, "grad_norm": 0.6623, "tokens_per_sec": 148847, "dt_s": 4.403, "eta_s": 25789, "world_size": 1, "timestamp": "2026-05-05T00:39:00.263388"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34350, "epoch": 0, "train_loss": 3.7755722999572754, "train_ppl": 43.622466308917915, "lr": 0.00056, "grad_norm": 0.7103, "tokens_per_sec": 146863, "dt_s": 4.462, "eta_s": 25800, "world_size": 1, "timestamp": "2026-05-05T00:39:04.725734"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34360, "epoch": 0, "train_loss": 3.766328975558281, "train_ppl": 43.22110750419531, "lr": 0.00056, "grad_norm": 0.6516, "tokens_per_sec": 146819, "dt_s": 4.464, "eta_s": 25881, "world_size": 1, "timestamp": "2026-05-05T00:39:09.189466"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34370, "epoch": 0, "train_loss": 3.820426732301712, "train_ppl": 45.62367326249754, "lr": 0.00056, "grad_norm": 0.6224, "tokens_per_sec": 146225, "dt_s": 4.482, "eta_s": 25932, "world_size": 1, "timestamp": "2026-05-05T00:39:13.671343"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34380, "epoch": 0, "train_loss": 3.8006265461444855, "train_ppl": 44.72920062388183, "lr": 0.00056, "grad_norm": 0.6531, "tokens_per_sec": 147422, "dt_s": 4.445, "eta_s": 25949, "world_size": 1, "timestamp": "2026-05-05T00:39:18.116805"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34390, "epoch": 0, "train_loss": 3.677448719739914, "train_ppl": 39.545373930134545, "lr": 0.00056, "grad_norm": 0.6555, "tokens_per_sec": 146883, "dt_s": 4.462, "eta_s": 26013, "world_size": 1, "timestamp": "2026-05-05T00:39:22.578581"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34400, "epoch": 0, "train_loss": 3.7991932332515717, "train_ppl": 44.665135607512724, "lr": 0.00056, "grad_norm": 0.6538, "tokens_per_sec": 145479, "dt_s": 4.505, "eta_s": 26058, "world_size": 1, "timestamp": "2026-05-05T00:39:27.083409"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34410, "epoch": 0, "train_loss": 3.7470531165599823, "train_ppl": 42.39596177557054, "lr": 0.00056, "grad_norm": 0.6362, "tokens_per_sec": 147703, "dt_s": 4.437, "eta_s": 26022, "world_size": 1, "timestamp": "2026-05-05T00:39:31.520434"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34420, "epoch": 0, "train_loss": 3.7696706652641296, "train_ppl": 43.36578062592922, "lr": 0.00056, "grad_norm": 0.7057, "tokens_per_sec": 148181, "dt_s": 4.423, "eta_s": 25949, "world_size": 1, "timestamp": "2026-05-05T00:39:35.943142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34430, "epoch": 0, "train_loss": 3.832547202706337, "train_ppl": 46.180018415663504, "lr": 0.00056, "grad_norm": 0.6829, "tokens_per_sec": 147949, "dt_s": 4.43, "eta_s": 25926, "world_size": 1, "timestamp": "2026-05-05T00:39:40.372770"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34440, "epoch": 0, "train_loss": 3.791834071278572, "train_ppl": 44.337644149375784, "lr": 0.00056, "grad_norm": 0.6863, "tokens_per_sec": 146973, "dt_s": 4.459, "eta_s": 25918, "world_size": 1, "timestamp": "2026-05-05T00:39:44.831841"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34450, "epoch": 0, "train_loss": 3.8499148339033127, "train_ppl": 46.989061186234174, "lr": 0.00056, "grad_norm": 0.6988, "tokens_per_sec": 149534, "dt_s": 4.383, "eta_s": 25772, "world_size": 1, "timestamp": "2026-05-05T00:39:49.214490"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34460, "epoch": 0, "train_loss": 3.778333529829979, "train_ppl": 43.74308441653545, "lr": 0.00056, "grad_norm": 0.6572, "tokens_per_sec": 144829, "dt_s": 4.525, "eta_s": 25870, "world_size": 1, "timestamp": "2026-05-05T00:39:53.739591"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34470, "epoch": 0, "train_loss": 3.8879490047693253, "train_ppl": 48.81067333226209, "lr": 0.00056, "grad_norm": 0.6395, "tokens_per_sec": 146522, "dt_s": 4.473, "eta_s": 25924, "world_size": 1, "timestamp": "2026-05-05T00:39:58.212406"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34480, "epoch": 0, "train_loss": 3.865412876009941, "train_ppl": 47.722972035938795, "lr": 0.00056, "grad_norm": 0.7304, "tokens_per_sec": 147765, "dt_s": 4.435, "eta_s": 25926, "world_size": 1, "timestamp": "2026-05-05T00:40:02.647545"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34490, "epoch": 0, "train_loss": 3.594516947865486, "train_ppl": 36.398113554194225, "lr": 0.00056, "grad_norm": 0.7338, "tokens_per_sec": 130952, "dt_s": 5.005, "eta_s": 26556, "world_size": 1, "timestamp": "2026-05-05T00:40:07.652096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34500, "epoch": 0, "train_loss": 3.7433688938617706, "train_ppl": 42.240052988582654, "lr": 0.00056, "grad_norm": 0.6708, "tokens_per_sec": 147922, "dt_s": 4.43, "eta_s": 26607, "world_size": 1, "timestamp": "2026-05-05T00:40:12.082544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34510, "epoch": 0, "train_loss": 3.7415657490491867, "train_ppl": 42.16395668310012, "lr": 0.00056, "grad_norm": 0.6414, "tokens_per_sec": 122678, "dt_s": 5.342, "eta_s": 26645, "world_size": 1, "timestamp": "2026-05-05T00:40:17.424664"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34520, "epoch": 0, "train_loss": 3.85896860063076, "train_ppl": 47.41642087316976, "lr": 0.00056, "grad_norm": 0.6377, "tokens_per_sec": 148480, "dt_s": 4.414, "eta_s": 26572, "world_size": 1, "timestamp": "2026-05-05T00:40:21.838448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34530, "epoch": 0, "train_loss": 3.7388560473918915, "train_ppl": 42.049859594146305, "lr": 0.00056, "grad_norm": 0.6513, "tokens_per_sec": 144723, "dt_s": 4.528, "eta_s": 26676, "world_size": 1, "timestamp": "2026-05-05T00:40:26.366815"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34540, "epoch": 0, "train_loss": 3.6738091707229614, "train_ppl": 39.40170820114025, "lr": 0.00056, "grad_norm": 0.7135, "tokens_per_sec": 142822, "dt_s": 4.589, "eta_s": 26188, "world_size": 1, "timestamp": "2026-05-05T00:40:30.955454"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34550, "epoch": 0, "train_loss": 3.8299032151699066, "train_ppl": 46.05808029501173, "lr": 0.00056, "grad_norm": 0.6691, "tokens_per_sec": 146811, "dt_s": 4.464, "eta_s": 26222, "world_size": 1, "timestamp": "2026-05-05T00:40:35.419413"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34560, "epoch": 0, "train_loss": 3.725265070796013, "train_ppl": 41.4822270145716, "lr": 0.00056, "grad_norm": 0.8143, "tokens_per_sec": 147994, "dt_s": 4.428, "eta_s": 26062, "world_size": 1, "timestamp": "2026-05-05T00:40:39.847707"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34570, "epoch": 0, "train_loss": 3.702575773000717, "train_ppl": 40.55162172584159, "lr": 0.00056, "grad_norm": 0.6619, "tokens_per_sec": 146340, "dt_s": 4.478, "eta_s": 26133, "world_size": 1, "timestamp": "2026-05-05T00:40:44.326023"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34580, "epoch": 0, "train_loss": 3.932653024792671, "train_ppl": 51.042214457030816, "lr": 0.00056, "grad_norm": 0.6937, "tokens_per_sec": 145751, "dt_s": 4.496, "eta_s": 26091, "world_size": 1, "timestamp": "2026-05-05T00:40:48.822470"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34590, "epoch": 0, "train_loss": 3.668447256088257, "train_ppl": 39.19100499641448, "lr": 0.00056, "grad_norm": 0.6788, "tokens_per_sec": 144455, "dt_s": 4.537, "eta_s": 26027, "world_size": 1, "timestamp": "2026-05-05T00:40:53.359261"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34600, "epoch": 0, "train_loss": 3.7547691613435745, "train_ppl": 42.724356239158524, "lr": 0.00056, "grad_norm": 0.6798, "tokens_per_sec": 143972, "dt_s": 4.552, "eta_s": 26124, "world_size": 1, "timestamp": "2026-05-05T00:40:57.911271"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34610, "epoch": 0, "train_loss": 3.862094521522522, "train_ppl": 47.56487275733119, "lr": 0.00056, "grad_norm": 0.72, "tokens_per_sec": 145141, "dt_s": 4.515, "eta_s": 26221, "world_size": 1, "timestamp": "2026-05-05T00:41:02.426572"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34620, "epoch": 0, "train_loss": 3.8547724783420563, "train_ppl": 47.21787263037392, "lr": 0.00056, "grad_norm": 0.6711, "tokens_per_sec": 143598, "dt_s": 4.564, "eta_s": 26316, "world_size": 1, "timestamp": "2026-05-05T00:41:06.990427"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34630, "epoch": 0, "train_loss": 3.744681715965271, "train_ppl": 42.29554308013868, "lr": 0.00056, "grad_norm": 0.6624, "tokens_per_sec": 147491, "dt_s": 4.443, "eta_s": 26250, "world_size": 1, "timestamp": "2026-05-05T00:41:11.433822"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34640, "epoch": 0, "train_loss": 3.7929973006248474, "train_ppl": 44.389249006518284, "lr": 0.00056, "grad_norm": 0.6423, "tokens_per_sec": 146544, "dt_s": 4.472, "eta_s": 26170, "world_size": 1, "timestamp": "2026-05-05T00:41:15.905965"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34650, "epoch": 0, "train_loss": 3.696748912334442, "train_ppl": 40.31602015162748, "lr": 0.00056, "grad_norm": 0.6321, "tokens_per_sec": 147530, "dt_s": 4.442, "eta_s": 26038, "world_size": 1, "timestamp": "2026-05-05T00:41:20.348129"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34660, "epoch": 0, "train_loss": 3.759770780801773, "train_ppl": 42.93858250319495, "lr": 0.00056, "grad_norm": 1.0682, "tokens_per_sec": 146507, "dt_s": 4.473, "eta_s": 25985, "world_size": 1, "timestamp": "2026-05-05T00:41:24.821384"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34670, "epoch": 0, "train_loss": 3.780257821083069, "train_ppl": 43.82733989129881, "lr": 0.00056, "grad_norm": 0.6274, "tokens_per_sec": 146191, "dt_s": 4.483, "eta_s": 25886, "world_size": 1, "timestamp": "2026-05-05T00:41:29.304264"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34680, "epoch": 0, "train_loss": 3.8424313217401505, "train_ppl": 46.63873046238936, "lr": 0.00056, "grad_norm": 0.7025, "tokens_per_sec": 146026, "dt_s": 4.488, "eta_s": 25933, "world_size": 1, "timestamp": "2026-05-05T00:41:33.792264"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34690, "epoch": 0, "train_loss": 3.713546186685562, "train_ppl": 40.99893893296171, "lr": 0.00056, "grad_norm": 0.6431, "tokens_per_sec": 148524, "dt_s": 4.412, "eta_s": 25860, "world_size": 1, "timestamp": "2026-05-05T00:41:38.204713"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34700, "epoch": 0, "train_loss": 3.7512244135141373, "train_ppl": 42.57317727406977, "lr": 0.00056, "grad_norm": 0.6442, "tokens_per_sec": 144247, "dt_s": 4.543, "eta_s": 25973, "world_size": 1, "timestamp": "2026-05-05T00:41:42.748014"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34710, "epoch": 0, "train_loss": 3.8537182807922363, "train_ppl": 47.168121892899514, "lr": 0.00056, "grad_norm": 0.6395, "tokens_per_sec": 147898, "dt_s": 4.431, "eta_s": 25919, "world_size": 1, "timestamp": "2026-05-05T00:41:47.179189"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34720, "epoch": 0, "train_loss": 3.7361457645893097, "train_ppl": 41.93604688482056, "lr": 0.00056, "grad_norm": 0.6707, "tokens_per_sec": 148549, "dt_s": 4.412, "eta_s": 25833, "world_size": 1, "timestamp": "2026-05-05T00:41:51.590912"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34730, "epoch": 0, "train_loss": 3.7721347212791443, "train_ppl": 43.47276809595403, "lr": 0.00056, "grad_norm": 0.6418, "tokens_per_sec": 144617, "dt_s": 4.532, "eta_s": 25879, "world_size": 1, "timestamp": "2026-05-05T00:41:56.122595"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34740, "epoch": 0, "train_loss": 3.810121610760689, "train_ppl": 45.15592997941714, "lr": 0.00056, "grad_norm": 0.6741, "tokens_per_sec": 149439, "dt_s": 4.385, "eta_s": 25843, "world_size": 1, "timestamp": "2026-05-05T00:42:00.508071"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34750, "epoch": 0, "train_loss": 3.64319109916687, "train_ppl": 38.213585707059664, "lr": 0.00056, "grad_norm": 0.6628, "tokens_per_sec": 146670, "dt_s": 4.468, "eta_s": 25752, "world_size": 1, "timestamp": "2026-05-05T00:42:04.976327"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34760, "epoch": 0, "train_loss": 3.7747547030448914, "train_ppl": 43.58681529122012, "lr": 0.00056, "grad_norm": 0.7001, "tokens_per_sec": 147785, "dt_s": 4.435, "eta_s": 25751, "world_size": 1, "timestamp": "2026-05-05T00:42:09.410855"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34770, "epoch": 0, "train_loss": 3.8175517320632935, "train_ppl": 45.49269356454085, "lr": 0.00056, "grad_norm": 0.6701, "tokens_per_sec": 147392, "dt_s": 4.446, "eta_s": 25787, "world_size": 1, "timestamp": "2026-05-05T00:42:13.857249"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34780, "epoch": 0, "train_loss": 3.714902624487877, "train_ppl": 41.05458917810059, "lr": 0.00056, "grad_norm": 0.636, "tokens_per_sec": 131283, "dt_s": 4.992, "eta_s": 26315, "world_size": 1, "timestamp": "2026-05-05T00:42:18.849202"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34790, "epoch": 0, "train_loss": 3.768026277422905, "train_ppl": 43.29452906220068, "lr": 0.00056, "grad_norm": 0.6562, "tokens_per_sec": 148903, "dt_s": 4.401, "eta_s": 26329, "world_size": 1, "timestamp": "2026-05-05T00:42:23.250434"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34800, "epoch": 0, "train_loss": 3.8008417040109634, "train_ppl": 44.73882549865396, "lr": 0.00056, "grad_norm": 0.6139, "tokens_per_sec": 148471, "dt_s": 4.414, "eta_s": 26262, "world_size": 1, "timestamp": "2026-05-05T00:42:27.664499"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34810, "epoch": 0, "train_loss": 3.8714405298233032, "train_ppl": 48.01149828491319, "lr": 0.00056, "grad_norm": 0.6828, "tokens_per_sec": 146367, "dt_s": 4.478, "eta_s": 26307, "world_size": 1, "timestamp": "2026-05-05T00:42:32.142000"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34820, "epoch": 0, "train_loss": 3.857574686408043, "train_ppl": 47.35037249330666, "lr": 0.00056, "grad_norm": 0.6851, "tokens_per_sec": 150612, "dt_s": 4.351, "eta_s": 26192, "world_size": 1, "timestamp": "2026-05-05T00:42:36.493312"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34830, "epoch": 0, "train_loss": 3.8611158579587936, "train_ppl": 47.51834552041954, "lr": 0.00056, "grad_norm": 0.7175, "tokens_per_sec": 149876, "dt_s": 4.373, "eta_s": 25471, "world_size": 1, "timestamp": "2026-05-05T00:42:40.865979"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34840, "epoch": 0, "train_loss": 3.7551339715719223, "train_ppl": 42.739945364677475, "lr": 0.00056, "grad_norm": 0.6986, "tokens_per_sec": 147051, "dt_s": 4.457, "eta_s": 25531, "world_size": 1, "timestamp": "2026-05-05T00:42:45.322663"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34850, "epoch": 0, "train_loss": 3.764810472726822, "train_ppl": 43.155525935571994, "lr": 0.00056, "grad_norm": 0.664, "tokens_per_sec": 149685, "dt_s": 4.378, "eta_s": 25485, "world_size": 1, "timestamp": "2026-05-05T00:42:49.700947"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34860, "epoch": 0, "train_loss": 3.8443801552057266, "train_ppl": 46.729710204494786, "lr": 0.00056, "grad_norm": 0.6448, "tokens_per_sec": 148154, "dt_s": 4.424, "eta_s": 25418, "world_size": 1, "timestamp": "2026-05-05T00:42:54.124449"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34870, "epoch": 0, "train_loss": 3.904554158449173, "train_ppl": 49.62794877740673, "lr": 0.00056, "grad_norm": 0.7664, "tokens_per_sec": 148325, "dt_s": 4.418, "eta_s": 25492, "world_size": 1, "timestamp": "2026-05-05T00:42:58.542854"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34880, "epoch": 0, "train_loss": 3.909576565027237, "train_ppl": 49.87782748477736, "lr": 0.00056, "grad_norm": 0.6631, "tokens_per_sec": 149756, "dt_s": 4.376, "eta_s": 25491, "world_size": 1, "timestamp": "2026-05-05T00:43:02.919057"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34890, "epoch": 0, "train_loss": 3.711592987179756, "train_ppl": 40.91893798004368, "lr": 0.00056, "grad_norm": 0.6395, "tokens_per_sec": 146402, "dt_s": 4.476, "eta_s": 25510, "world_size": 1, "timestamp": "2026-05-05T00:43:07.395497"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34900, "epoch": 0, "train_loss": 3.857755273580551, "train_ppl": 47.35892413532763, "lr": 0.00056, "grad_norm": 0.7396, "tokens_per_sec": 148466, "dt_s": 4.414, "eta_s": 25547, "world_size": 1, "timestamp": "2026-05-05T00:43:11.809709"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34910, "epoch": 0, "train_loss": 3.7864783257246017, "train_ppl": 44.10081776586845, "lr": 0.00056, "grad_norm": 0.6444, "tokens_per_sec": 149441, "dt_s": 4.385, "eta_s": 25498, "world_size": 1, "timestamp": "2026-05-05T00:43:16.195124"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34920, "epoch": 0, "train_loss": 3.939802587032318, "train_ppl": 51.408451603723975, "lr": 0.00056, "grad_norm": 0.8062, "tokens_per_sec": 148046, "dt_s": 4.427, "eta_s": 25504, "world_size": 1, "timestamp": "2026-05-05T00:43:20.621851"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34930, "epoch": 0, "train_loss": 3.779987797141075, "train_ppl": 43.81550705786033, "lr": 0.00056, "grad_norm": 0.6563, "tokens_per_sec": 150165, "dt_s": 4.364, "eta_s": 25485, "world_size": 1, "timestamp": "2026-05-05T00:43:24.986119"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34940, "epoch": 0, "train_loss": 3.8385905027389526, "train_ppl": 46.459943104965035, "lr": 0.00056, "grad_norm": 0.6637, "tokens_per_sec": 145796, "dt_s": 4.495, "eta_s": 25503, "world_size": 1, "timestamp": "2026-05-05T00:43:29.481226"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34950, "epoch": 0, "train_loss": 3.8429404348134995, "train_ppl": 46.66248089510706, "lr": 0.00056, "grad_norm": 0.6407, "tokens_per_sec": 145941, "dt_s": 4.491, "eta_s": 25586, "world_size": 1, "timestamp": "2026-05-05T00:43:33.971770"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34960, "epoch": 0, "train_loss": 3.7595832347869873, "train_ppl": 42.930530298269005, "lr": 0.00056, "grad_norm": 0.689, "tokens_per_sec": 143894, "dt_s": 4.554, "eta_s": 25777, "world_size": 1, "timestamp": "2026-05-05T00:43:38.526271"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34970, "epoch": 0, "train_loss": 3.7855714708566666, "train_ppl": 44.06084285305925, "lr": 0.00056, "grad_norm": 0.6797, "tokens_per_sec": 148227, "dt_s": 4.421, "eta_s": 25766, "world_size": 1, "timestamp": "2026-05-05T00:43:42.947544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34980, "epoch": 0, "train_loss": 3.8130223900079727, "train_ppl": 45.28710753056928, "lr": 0.00056, "grad_norm": 0.6878, "tokens_per_sec": 146908, "dt_s": 4.461, "eta_s": 25873, "world_size": 1, "timestamp": "2026-05-05T00:43:47.408595"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 34990, "epoch": 0, "train_loss": 3.791320323944092, "train_ppl": 44.314871653032434, "lr": 0.00056, "grad_norm": 0.6425, "tokens_per_sec": 145689, "dt_s": 4.498, "eta_s": 25873, "world_size": 1, "timestamp": "2026-05-05T00:43:51.906938"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35000, "epoch": 0, "train_loss": 3.7819180488586426, "train_ppl": 43.900163693638184, "lr": 0.00056, "grad_norm": 0.722, "tokens_per_sec": 145112, "dt_s": 4.516, "eta_s": 25898, "world_size": 1, "timestamp": "2026-05-05T00:43:56.423196"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35010, "epoch": 0, "train_loss": 3.8239081501960754, "train_ppl": 45.78278514176548, "lr": 0.00056, "grad_norm": 0.6545, "tokens_per_sec": 105837, "dt_s": 6.192, "eta_s": 25919, "world_size": 1, "timestamp": "2026-05-05T00:44:02.615357"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35020, "epoch": 0, "train_loss": 3.7664594799280167, "train_ppl": 43.22674841566322, "lr": 0.00056, "grad_norm": 0.6893, "tokens_per_sec": 144811, "dt_s": 4.526, "eta_s": 26034, "world_size": 1, "timestamp": "2026-05-05T00:44:07.140954"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35030, "epoch": 0, "train_loss": 3.7869081497192383, "train_ppl": 44.119777429894675, "lr": 0.00056, "grad_norm": 0.7318, "tokens_per_sec": 142844, "dt_s": 4.588, "eta_s": 26176, "world_size": 1, "timestamp": "2026-05-05T00:44:11.728917"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35040, "epoch": 0, "train_loss": 3.7364554554224014, "train_ppl": 41.94903610533434, "lr": 0.00056, "grad_norm": 0.6554, "tokens_per_sec": 144914, "dt_s": 4.522, "eta_s": 26199, "world_size": 1, "timestamp": "2026-05-05T00:44:16.251318"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35050, "epoch": 0, "train_loss": 3.5676802694797516, "train_ppl": 35.43429971272309, "lr": 0.00056, "grad_norm": 0.7375, "tokens_per_sec": 144331, "dt_s": 4.541, "eta_s": 26223, "world_size": 1, "timestamp": "2026-05-05T00:44:20.792000"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35060, "epoch": 0, "train_loss": 3.7059151083230972, "train_ppl": 40.68726353935307, "lr": 0.00056, "grad_norm": 0.6751, "tokens_per_sec": 148064, "dt_s": 4.426, "eta_s": 26045, "world_size": 1, "timestamp": "2026-05-05T00:44:25.218200"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35070, "epoch": 0, "train_loss": 3.77840419113636, "train_ppl": 43.746175469233094, "lr": 0.00056, "grad_norm": 0.6513, "tokens_per_sec": 146285, "dt_s": 4.48, "eta_s": 25988, "world_size": 1, "timestamp": "2026-05-05T00:44:29.698230"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35080, "epoch": 0, "train_loss": 3.7921777814626694, "train_ppl": 44.352886068460165, "lr": 0.00056, "grad_norm": 0.6667, "tokens_per_sec": 131524, "dt_s": 4.983, "eta_s": 26439, "world_size": 1, "timestamp": "2026-05-05T00:44:34.681026"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35090, "epoch": 0, "train_loss": 3.7642950862646103, "train_ppl": 43.133289892305214, "lr": 0.00056, "grad_norm": 0.6302, "tokens_per_sec": 147506, "dt_s": 4.443, "eta_s": 26343, "world_size": 1, "timestamp": "2026-05-05T00:44:39.123969"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35100, "epoch": 0, "train_loss": 3.6608613282442093, "train_ppl": 38.894829663044916, "lr": 0.00056, "grad_norm": 0.6502, "tokens_per_sec": 144159, "dt_s": 4.546, "eta_s": 26344, "world_size": 1, "timestamp": "2026-05-05T00:44:43.670062"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35110, "epoch": 0, "train_loss": 3.740444153547287, "train_ppl": 42.11669228966239, "lr": 0.00056, "grad_norm": 0.6696, "tokens_per_sec": 147356, "dt_s": 4.447, "eta_s": 26364, "world_size": 1, "timestamp": "2026-05-05T00:44:48.117539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35120, "epoch": 0, "train_loss": 3.747864216566086, "train_ppl": 42.43036308999264, "lr": 0.00056, "grad_norm": 0.6519, "tokens_per_sec": 147021, "dt_s": 4.458, "eta_s": 26334, "world_size": 1, "timestamp": "2026-05-05T00:44:52.575119"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35130, "epoch": 0, "train_loss": 3.799522280693054, "train_ppl": 44.67983497436975, "lr": 0.00056, "grad_norm": 0.6597, "tokens_per_sec": 144660, "dt_s": 4.53, "eta_s": 25808, "world_size": 1, "timestamp": "2026-05-05T00:44:57.105490"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35140, "epoch": 0, "train_loss": 3.792674109339714, "train_ppl": 44.37490510612242, "lr": 0.00056, "grad_norm": 0.6705, "tokens_per_sec": 144537, "dt_s": 4.534, "eta_s": 25909, "world_size": 1, "timestamp": "2026-05-05T00:45:01.639708"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35150, "epoch": 0, "train_loss": 3.76836721599102, "train_ppl": 43.30929235349106, "lr": 0.00056, "grad_norm": 0.6647, "tokens_per_sec": 142126, "dt_s": 4.611, "eta_s": 25979, "world_size": 1, "timestamp": "2026-05-05T00:45:06.250800"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35160, "epoch": 0, "train_loss": 3.80863918364048, "train_ppl": 45.089039196782416, "lr": 0.00056, "grad_norm": 0.6509, "tokens_per_sec": 142021, "dt_s": 4.615, "eta_s": 26167, "world_size": 1, "timestamp": "2026-05-05T00:45:10.865317"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35170, "epoch": 0, "train_loss": 3.783662810921669, "train_ppl": 43.97682589301217, "lr": 0.00056, "grad_norm": 0.6278, "tokens_per_sec": 145716, "dt_s": 4.498, "eta_s": 26208, "world_size": 1, "timestamp": "2026-05-05T00:45:15.362844"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35180, "epoch": 0, "train_loss": 3.839445561170578, "train_ppl": 46.49968605990271, "lr": 0.00056, "grad_norm": 0.6608, "tokens_per_sec": 143295, "dt_s": 4.573, "eta_s": 26253, "world_size": 1, "timestamp": "2026-05-05T00:45:19.936363"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35190, "epoch": 0, "train_loss": 3.752726376056671, "train_ppl": 42.63716863593657, "lr": 0.00056, "grad_norm": 0.6382, "tokens_per_sec": 146073, "dt_s": 4.487, "eta_s": 26194, "world_size": 1, "timestamp": "2026-05-05T00:45:24.422875"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35200, "epoch": 0, "train_loss": 3.7932689636945724, "train_ppl": 44.401309564296085, "lr": 0.00056, "grad_norm": 0.6779, "tokens_per_sec": 146382, "dt_s": 4.477, "eta_s": 26035, "world_size": 1, "timestamp": "2026-05-05T00:45:28.899943"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35210, "epoch": 0, "train_loss": 3.8669915944337845, "train_ppl": 47.798372673660054, "lr": 0.00056, "grad_norm": 0.7023, "tokens_per_sec": 143338, "dt_s": 4.572, "eta_s": 25982, "world_size": 1, "timestamp": "2026-05-05T00:45:33.472061"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35220, "epoch": 0, "train_loss": 3.774085059762001, "train_ppl": 43.557637443602935, "lr": 0.00056, "grad_norm": 0.6298, "tokens_per_sec": 146794, "dt_s": 4.464, "eta_s": 25940, "world_size": 1, "timestamp": "2026-05-05T00:45:37.936565"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35230, "epoch": 0, "train_loss": 3.793375253677368, "train_ppl": 44.406029229547904, "lr": 0.00056, "grad_norm": 0.6371, "tokens_per_sec": 147066, "dt_s": 4.456, "eta_s": 25800, "world_size": 1, "timestamp": "2026-05-05T00:45:42.392782"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35240, "epoch": 0, "train_loss": 3.728565350174904, "train_ppl": 41.61935611067663, "lr": 0.00056, "grad_norm": 0.6699, "tokens_per_sec": 143965, "dt_s": 4.552, "eta_s": 25871, "world_size": 1, "timestamp": "2026-05-05T00:45:46.945005"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35250, "epoch": 0, "train_loss": 3.648693233728409, "train_ppl": 38.42442148922652, "lr": 0.00056, "grad_norm": 0.6322, "tokens_per_sec": 144996, "dt_s": 4.52, "eta_s": 25916, "world_size": 1, "timestamp": "2026-05-05T00:45:51.464877"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35260, "epoch": 0, "train_loss": 3.7820496559143066, "train_ppl": 43.905941645126326, "lr": 0.00056, "grad_norm": 0.6896, "tokens_per_sec": 144211, "dt_s": 4.544, "eta_s": 25880, "world_size": 1, "timestamp": "2026-05-05T00:45:56.009290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35270, "epoch": 0, "train_loss": 3.792997419834137, "train_ppl": 44.38925429812944, "lr": 0.00056, "grad_norm": 0.6243, "tokens_per_sec": 147219, "dt_s": 4.452, "eta_s": 25860, "world_size": 1, "timestamp": "2026-05-05T00:46:00.460859"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35280, "epoch": 0, "train_loss": 3.784384563565254, "train_ppl": 44.008577740468496, "lr": 0.00056, "grad_norm": 0.6308, "tokens_per_sec": 148776, "dt_s": 4.405, "eta_s": 25797, "world_size": 1, "timestamp": "2026-05-05T00:46:04.865899"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35290, "epoch": 0, "train_loss": 3.7641436755657196, "train_ppl": 43.12675954513186, "lr": 0.00056, "grad_norm": 0.7138, "tokens_per_sec": 145034, "dt_s": 4.519, "eta_s": 25754, "world_size": 1, "timestamp": "2026-05-05T00:46:09.384544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35300, "epoch": 0, "train_loss": 3.747983455657959, "train_ppl": 42.43542274960395, "lr": 0.00056, "grad_norm": 0.6689, "tokens_per_sec": 147959, "dt_s": 4.429, "eta_s": 25646, "world_size": 1, "timestamp": "2026-05-05T00:46:13.813872"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35310, "epoch": 0, "train_loss": 3.756524533033371, "train_ppl": 42.799419227012955, "lr": 0.00056, "grad_norm": 0.6334, "tokens_per_sec": 147038, "dt_s": 4.457, "eta_s": 25541, "world_size": 1, "timestamp": "2026-05-05T00:46:18.270944"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35320, "epoch": 0, "train_loss": 3.771599844098091, "train_ppl": 43.44952172183404, "lr": 0.00056, "grad_norm": 0.6862, "tokens_per_sec": 145697, "dt_s": 4.498, "eta_s": 25590, "world_size": 1, "timestamp": "2026-05-05T00:46:22.769068"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35330, "epoch": 0, "train_loss": 3.7244293689727783, "train_ppl": 41.44757472333193, "lr": 0.00056, "grad_norm": 0.6455, "tokens_per_sec": 149561, "dt_s": 4.382, "eta_s": 25559, "world_size": 1, "timestamp": "2026-05-05T00:46:27.150948"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35340, "epoch": 0, "train_loss": 3.8324649930000305, "train_ppl": 46.17622212596039, "lr": 0.00056, "grad_norm": 0.6862, "tokens_per_sec": 147069, "dt_s": 4.456, "eta_s": 25483, "world_size": 1, "timestamp": "2026-05-05T00:46:31.607070"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35350, "epoch": 0, "train_loss": 3.939811497926712, "train_ppl": 51.408909701048195, "lr": 0.00056, "grad_norm": 0.6508, "tokens_per_sec": 148456, "dt_s": 4.414, "eta_s": 25461, "world_size": 1, "timestamp": "2026-05-05T00:46:36.021583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35360, "epoch": 0, "train_loss": 3.7552486658096313, "train_ppl": 42.74484767125858, "lr": 0.00056, "grad_norm": 0.6779, "tokens_per_sec": 147917, "dt_s": 4.431, "eta_s": 25426, "world_size": 1, "timestamp": "2026-05-05T00:46:40.452189"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35370, "epoch": 0, "train_loss": 3.9032318592071533, "train_ppl": 49.56236914585928, "lr": 0.00056, "grad_norm": 0.6604, "tokens_per_sec": 129271, "dt_s": 5.07, "eta_s": 26077, "world_size": 1, "timestamp": "2026-05-05T00:46:45.521851"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35380, "epoch": 0, "train_loss": 3.8067748695611954, "train_ppl": 45.00505737475744, "lr": 0.00056, "grad_norm": 0.6362, "tokens_per_sec": 148618, "dt_s": 4.41, "eta_s": 26104, "world_size": 1, "timestamp": "2026-05-05T00:46:49.931539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35390, "epoch": 0, "train_loss": 3.816279500722885, "train_ppl": 45.434853135033606, "lr": 0.00056, "grad_norm": 0.6378, "tokens_per_sec": 148099, "dt_s": 4.425, "eta_s": 26064, "world_size": 1, "timestamp": "2026-05-05T00:46:54.356689"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35400, "epoch": 0, "train_loss": 3.7681538313627243, "train_ppl": 43.30005180217132, "lr": 0.00056, "grad_norm": 0.638, "tokens_per_sec": 144574, "dt_s": 4.533, "eta_s": 26196, "world_size": 1, "timestamp": "2026-05-05T00:46:58.889743"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35410, "epoch": 0, "train_loss": 3.7041958272457123, "train_ppl": 40.617370796910535, "lr": 0.00056, "grad_norm": 0.6681, "tokens_per_sec": 146462, "dt_s": 4.475, "eta_s": 26242, "world_size": 1, "timestamp": "2026-05-05T00:47:03.364352"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35420, "epoch": 0, "train_loss": 3.661570444703102, "train_ppl": 38.92242040829444, "lr": 0.00056, "grad_norm": 0.6505, "tokens_per_sec": 145122, "dt_s": 4.516, "eta_s": 25603, "world_size": 1, "timestamp": "2026-05-05T00:47:07.880297"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35430, "epoch": 0, "train_loss": 3.861993670463562, "train_ppl": 47.56007603142583, "lr": 0.00056, "grad_norm": 0.6586, "tokens_per_sec": 148381, "dt_s": 4.417, "eta_s": 25606, "world_size": 1, "timestamp": "2026-05-05T00:47:12.297042"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35440, "epoch": 0, "train_loss": 3.805846467614174, "train_ppl": 44.96329398147248, "lr": 0.00056, "grad_norm": 0.7039, "tokens_per_sec": 149665, "dt_s": 4.379, "eta_s": 25549, "world_size": 1, "timestamp": "2026-05-05T00:47:16.675896"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35450, "epoch": 0, "train_loss": 3.737001970410347, "train_ppl": 41.971968148077536, "lr": 0.00056, "grad_norm": 0.6319, "tokens_per_sec": 146686, "dt_s": 4.468, "eta_s": 25470, "world_size": 1, "timestamp": "2026-05-05T00:47:21.143640"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35460, "epoch": 0, "train_loss": 3.8245896846055984, "train_ppl": 45.813998320422854, "lr": 0.00056, "grad_norm": 0.6699, "tokens_per_sec": 147431, "dt_s": 4.445, "eta_s": 25432, "world_size": 1, "timestamp": "2026-05-05T00:47:25.588838"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35470, "epoch": 0, "train_loss": 3.738035708665848, "train_ppl": 42.01537861087315, "lr": 0.00056, "grad_norm": 0.7523, "tokens_per_sec": 147333, "dt_s": 4.448, "eta_s": 25350, "world_size": 1, "timestamp": "2026-05-05T00:47:30.037002"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35480, "epoch": 0, "train_loss": 3.7379482686519623, "train_ppl": 42.011704946198996, "lr": 0.00056, "grad_norm": 0.6612, "tokens_per_sec": 145695, "dt_s": 4.498, "eta_s": 25438, "world_size": 1, "timestamp": "2026-05-05T00:47:34.535194"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35490, "epoch": 0, "train_loss": 3.6481775045394897, "train_ppl": 38.40461000261727, "lr": 0.00056, "grad_norm": 0.6733, "tokens_per_sec": 150127, "dt_s": 4.365, "eta_s": 25418, "world_size": 1, "timestamp": "2026-05-05T00:47:38.900546"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35500, "epoch": 0, "train_loss": 3.6863273680210114, "train_ppl": 39.89804670840817, "lr": 0.00056, "grad_norm": 0.6845, "tokens_per_sec": 147748, "dt_s": 4.436, "eta_s": 25377, "world_size": 1, "timestamp": "2026-05-05T00:47:43.336241"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35510, "epoch": 0, "train_loss": 3.7873119711875916, "train_ppl": 44.13759754102939, "lr": 0.00056, "grad_norm": 0.7288, "tokens_per_sec": 122348, "dt_s": 5.357, "eta_s": 25480, "world_size": 1, "timestamp": "2026-05-05T00:47:48.692736"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35520, "epoch": 0, "train_loss": 3.604150667786598, "train_ppl": 36.750457251039485, "lr": 0.00056, "grad_norm": 0.6766, "tokens_per_sec": 147111, "dt_s": 4.455, "eta_s": 25483, "world_size": 1, "timestamp": "2026-05-05T00:47:53.147593"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35530, "epoch": 0, "train_loss": 3.8502695113420486, "train_ppl": 47.00573010197375, "lr": 0.00056, "grad_norm": 0.6764, "tokens_per_sec": 140764, "dt_s": 4.656, "eta_s": 25659, "world_size": 1, "timestamp": "2026-05-05T00:47:57.803337"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35540, "epoch": 0, "train_loss": 3.7443602234125137, "train_ppl": 42.281947563569354, "lr": 0.00056, "grad_norm": 0.6852, "tokens_per_sec": 147446, "dt_s": 4.445, "eta_s": 25745, "world_size": 1, "timestamp": "2026-05-05T00:48:02.248085"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35550, "epoch": 0, "train_loss": 3.7709188759326935, "train_ppl": 43.419944052608265, "lr": 0.00056, "grad_norm": 0.6482, "tokens_per_sec": 146313, "dt_s": 4.479, "eta_s": 25790, "world_size": 1, "timestamp": "2026-05-05T00:48:06.727254"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35560, "epoch": 0, "train_loss": 3.7591108679771423, "train_ppl": 42.910256129426266, "lr": 0.00056, "grad_norm": 0.811, "tokens_per_sec": 144247, "dt_s": 4.543, "eta_s": 25791, "world_size": 1, "timestamp": "2026-05-05T00:48:11.270578"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35570, "epoch": 0, "train_loss": 3.8070124983787537, "train_ppl": 45.01575314408672, "lr": 0.00056, "grad_norm": 0.637, "tokens_per_sec": 148488, "dt_s": 4.414, "eta_s": 25739, "world_size": 1, "timestamp": "2026-05-05T00:48:15.684133"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35580, "epoch": 0, "train_loss": 3.796648398041725, "train_ppl": 44.55161470502621, "lr": 0.00056, "grad_norm": 0.6282, "tokens_per_sec": 147045, "dt_s": 4.457, "eta_s": 25507, "world_size": 1, "timestamp": "2026-05-05T00:48:20.140976"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35590, "epoch": 0, "train_loss": 3.6228960901498795, "train_ppl": 37.445857517114646, "lr": 0.00056, "grad_norm": 0.7622, "tokens_per_sec": 147004, "dt_s": 4.458, "eta_s": 25518, "world_size": 1, "timestamp": "2026-05-05T00:48:24.599123"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35600, "epoch": 0, "train_loss": 3.818433329463005, "train_ppl": 45.53281748886816, "lr": 0.00056, "grad_norm": 0.657, "tokens_per_sec": 148116, "dt_s": 4.425, "eta_s": 25451, "world_size": 1, "timestamp": "2026-05-05T00:48:29.023740"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35610, "epoch": 0, "train_loss": 3.836924761533737, "train_ppl": 46.38261728362617, "lr": 0.00056, "grad_norm": 0.7102, "tokens_per_sec": 145246, "dt_s": 4.512, "eta_s": 25411, "world_size": 1, "timestamp": "2026-05-05T00:48:33.535808"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35620, "epoch": 0, "train_loss": 3.830072969198227, "train_ppl": 46.06589950333079, "lr": 0.00056, "grad_norm": 0.6232, "tokens_per_sec": 149300, "dt_s": 4.39, "eta_s": 25380, "world_size": 1, "timestamp": "2026-05-05T00:48:37.925385"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35630, "epoch": 0, "train_loss": 3.8242153972387314, "train_ppl": 45.796853928290695, "lr": 0.00056, "grad_norm": 0.6471, "tokens_per_sec": 150120, "dt_s": 4.366, "eta_s": 25271, "world_size": 1, "timestamp": "2026-05-05T00:48:42.290955"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35640, "epoch": 0, "train_loss": 3.7731189727783203, "train_ppl": 43.515577297180094, "lr": 0.00056, "grad_norm": 0.6668, "tokens_per_sec": 146195, "dt_s": 4.483, "eta_s": 25295, "world_size": 1, "timestamp": "2026-05-05T00:48:46.773728"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35650, "epoch": 0, "train_loss": 3.866629511117935, "train_ppl": 47.781068813298795, "lr": 0.00056, "grad_norm": 0.7048, "tokens_per_sec": 148783, "dt_s": 4.405, "eta_s": 25268, "world_size": 1, "timestamp": "2026-05-05T00:48:51.178549"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35660, "epoch": 0, "train_loss": 3.767559215426445, "train_ppl": 43.27431255456981, "lr": 0.00056, "grad_norm": 0.6159, "tokens_per_sec": 147288, "dt_s": 4.45, "eta_s": 25192, "world_size": 1, "timestamp": "2026-05-05T00:48:55.628052"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35670, "epoch": 0, "train_loss": 3.643302872776985, "train_ppl": 38.217857216206134, "lr": 0.00056, "grad_norm": 0.7133, "tokens_per_sec": 130182, "dt_s": 5.034, "eta_s": 25923, "world_size": 1, "timestamp": "2026-05-05T00:49:00.662273"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35680, "epoch": 0, "train_loss": 3.639414668083191, "train_ppl": 38.069546881708625, "lr": 0.00056, "grad_norm": 0.651, "tokens_per_sec": 147149, "dt_s": 4.454, "eta_s": 26019, "world_size": 1, "timestamp": "2026-05-05T00:49:05.115957"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35690, "epoch": 0, "train_loss": 3.843547210097313, "train_ppl": 46.690803126941795, "lr": 0.00056, "grad_norm": 0.7749, "tokens_per_sec": 146168, "dt_s": 4.484, "eta_s": 26015, "world_size": 1, "timestamp": "2026-05-05T00:49:09.599576"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35700, "epoch": 0, "train_loss": 3.82138928771019, "train_ppl": 45.66760971819521, "lr": 0.00056, "grad_norm": 0.7134, "tokens_per_sec": 150130, "dt_s": 4.365, "eta_s": 25965, "world_size": 1, "timestamp": "2026-05-05T00:49:13.964883"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35710, "epoch": 0, "train_loss": 3.769924372434616, "train_ppl": 43.37678423121552, "lr": 0.00056, "grad_norm": 0.6863, "tokens_per_sec": 149938, "dt_s": 4.371, "eta_s": 25871, "world_size": 1, "timestamp": "2026-05-05T00:49:18.335741"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35720, "epoch": 0, "train_loss": 3.7369314879179, "train_ppl": 41.9690099634009, "lr": 0.00056, "grad_norm": 0.7162, "tokens_per_sec": 145747, "dt_s": 4.497, "eta_s": 25254, "world_size": 1, "timestamp": "2026-05-05T00:49:22.832326"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35730, "epoch": 0, "train_loss": 3.850749433040619, "train_ppl": 47.02829458596647, "lr": 0.00056, "grad_norm": 0.7085, "tokens_per_sec": 148006, "dt_s": 4.428, "eta_s": 25220, "world_size": 1, "timestamp": "2026-05-05T00:49:27.260226"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35740, "epoch": 0, "train_loss": 3.5979806929826736, "train_ppl": 36.52440593834049, "lr": 0.00056, "grad_norm": 0.8202, "tokens_per_sec": 148371, "dt_s": 4.417, "eta_s": 25140, "world_size": 1, "timestamp": "2026-05-05T00:49:31.677278"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35750, "epoch": 0, "train_loss": 3.8929711282253265, "train_ppl": 49.05642313611512, "lr": 0.00056, "grad_norm": 0.682, "tokens_per_sec": 147902, "dt_s": 4.431, "eta_s": 25210, "world_size": 1, "timestamp": "2026-05-05T00:49:36.108279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35760, "epoch": 0, "train_loss": 3.796611800789833, "train_ppl": 44.54998426819559, "lr": 0.00056, "grad_norm": 0.7397, "tokens_per_sec": 149734, "dt_s": 4.377, "eta_s": 25213, "world_size": 1, "timestamp": "2026-05-05T00:49:40.485111"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35770, "epoch": 0, "train_loss": 3.7994495183229446, "train_ppl": 44.676584081953656, "lr": 0.00056, "grad_norm": 0.6352, "tokens_per_sec": 149305, "dt_s": 4.389, "eta_s": 25086, "world_size": 1, "timestamp": "2026-05-05T00:49:44.874495"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35780, "epoch": 0, "train_loss": 3.7830869406461716, "train_ppl": 43.95150823671218, "lr": 0.00056, "grad_norm": 0.6704, "tokens_per_sec": 149200, "dt_s": 4.392, "eta_s": 25042, "world_size": 1, "timestamp": "2026-05-05T00:49:49.266999"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35790, "epoch": 0, "train_loss": 3.779140591621399, "train_ppl": 43.77840203844149, "lr": 0.00056, "grad_norm": 0.67, "tokens_per_sec": 150599, "dt_s": 4.352, "eta_s": 24963, "world_size": 1, "timestamp": "2026-05-05T00:49:53.618662"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35800, "epoch": 0, "train_loss": 3.8016045838594437, "train_ppl": 44.772968869060264, "lr": 0.00056, "grad_norm": 0.6377, "tokens_per_sec": 148819, "dt_s": 4.404, "eta_s": 24928, "world_size": 1, "timestamp": "2026-05-05T00:49:58.022417"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35810, "epoch": 0, "train_loss": 3.8928725570440292, "train_ppl": 49.05158782485146, "lr": 0.00056, "grad_norm": 0.662, "tokens_per_sec": 148620, "dt_s": 4.41, "eta_s": 24960, "world_size": 1, "timestamp": "2026-05-05T00:50:02.432040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35820, "epoch": 0, "train_loss": 3.6924372762441635, "train_ppl": 40.142566347677615, "lr": 0.00056, "grad_norm": 0.7327, "tokens_per_sec": 150844, "dt_s": 4.345, "eta_s": 24905, "world_size": 1, "timestamp": "2026-05-05T00:50:06.776680"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35830, "epoch": 0, "train_loss": 3.8102108240127563, "train_ppl": 45.1599586664842, "lr": 0.00056, "grad_norm": 0.6869, "tokens_per_sec": 145112, "dt_s": 4.516, "eta_s": 25042, "world_size": 1, "timestamp": "2026-05-05T00:50:11.292923"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35840, "epoch": 0, "train_loss": 3.8272270411252975, "train_ppl": 45.93498564080704, "lr": 0.00056, "grad_norm": 0.6755, "tokens_per_sec": 148170, "dt_s": 4.423, "eta_s": 25118, "world_size": 1, "timestamp": "2026-05-05T00:50:15.715930"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35850, "epoch": 0, "train_loss": 3.698054015636444, "train_ppl": 40.36867107262303, "lr": 0.00056, "grad_norm": 0.6532, "tokens_per_sec": 147491, "dt_s": 4.443, "eta_s": 25159, "world_size": 1, "timestamp": "2026-05-05T00:50:20.159340"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35860, "epoch": 0, "train_loss": 3.8151530027389526, "train_ppl": 45.38369968211698, "lr": 0.00056, "grad_norm": 0.6328, "tokens_per_sec": 148815, "dt_s": 4.404, "eta_s": 25148, "world_size": 1, "timestamp": "2026-05-05T00:50:24.563192"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35870, "epoch": 0, "train_loss": 3.8427685648202896, "train_ppl": 46.65446170398137, "lr": 0.00056, "grad_norm": 0.6703, "tokens_per_sec": 150643, "dt_s": 4.35, "eta_s": 25150, "world_size": 1, "timestamp": "2026-05-05T00:50:28.913634"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35880, "epoch": 0, "train_loss": 3.8560258001089096, "train_ppl": 47.27708891870615, "lr": 0.00056, "grad_norm": 0.7031, "tokens_per_sec": 148907, "dt_s": 4.401, "eta_s": 25015, "world_size": 1, "timestamp": "2026-05-05T00:50:33.314726"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35890, "epoch": 0, "train_loss": 3.804515451192856, "train_ppl": 44.90348690975342, "lr": 0.00056, "grad_norm": 0.6853, "tokens_per_sec": 147543, "dt_s": 4.442, "eta_s": 25032, "world_size": 1, "timestamp": "2026-05-05T00:50:37.756537"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35900, "epoch": 0, "train_loss": 3.748513162136078, "train_ppl": 42.45790702244288, "lr": 0.00056, "grad_norm": 0.6733, "tokens_per_sec": 149758, "dt_s": 4.376, "eta_s": 24951, "world_size": 1, "timestamp": "2026-05-05T00:50:42.132670"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35910, "epoch": 0, "train_loss": 3.665086016058922, "train_ppl": 39.059495762496496, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 148299, "dt_s": 4.419, "eta_s": 24964, "world_size": 1, "timestamp": "2026-05-05T00:50:46.551844"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35920, "epoch": 0, "train_loss": 3.8201201111078262, "train_ppl": 45.609686221803486, "lr": 0.00056, "grad_norm": 0.6589, "tokens_per_sec": 151068, "dt_s": 4.338, "eta_s": 24946, "world_size": 1, "timestamp": "2026-05-05T00:50:50.890027"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35930, "epoch": 0, "train_loss": 3.8407865315675735, "train_ppl": 46.562082588977674, "lr": 0.00056, "grad_norm": 0.6799, "tokens_per_sec": 152641, "dt_s": 4.293, "eta_s": 24819, "world_size": 1, "timestamp": "2026-05-05T00:50:55.183501"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35940, "epoch": 0, "train_loss": 3.798527777194977, "train_ppl": 44.63542280988113, "lr": 0.00056, "grad_norm": 0.6732, "tokens_per_sec": 147506, "dt_s": 4.443, "eta_s": 24816, "world_size": 1, "timestamp": "2026-05-05T00:50:59.626446"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35950, "epoch": 0, "train_loss": 3.900820776820183, "train_ppl": 49.44301413575721, "lr": 0.00056, "grad_norm": 0.6408, "tokens_per_sec": 150439, "dt_s": 4.356, "eta_s": 24789, "world_size": 1, "timestamp": "2026-05-05T00:51:03.982793"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35960, "epoch": 0, "train_loss": 3.825785204768181, "train_ppl": 45.86880263244594, "lr": 0.00056, "grad_norm": 0.658, "tokens_per_sec": 149744, "dt_s": 4.377, "eta_s": 24736, "world_size": 1, "timestamp": "2026-05-05T00:51:08.359322"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35970, "epoch": 0, "train_loss": 3.8417643904685974, "train_ppl": 46.60763600466414, "lr": 0.00056, "grad_norm": 0.682, "tokens_per_sec": 133964, "dt_s": 4.892, "eta_s": 25360, "world_size": 1, "timestamp": "2026-05-05T00:51:13.251377"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35980, "epoch": 0, "train_loss": 3.711789697408676, "train_ppl": 40.926987945430206, "lr": 0.00056, "grad_norm": 0.6381, "tokens_per_sec": 149470, "dt_s": 4.385, "eta_s": 25459, "world_size": 1, "timestamp": "2026-05-05T00:51:17.635909"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 35990, "epoch": 0, "train_loss": 3.7141683101654053, "train_ppl": 41.02445327123209, "lr": 0.00056, "grad_norm": 0.6608, "tokens_per_sec": 147138, "dt_s": 4.454, "eta_s": 25467, "world_size": 1, "timestamp": "2026-05-05T00:51:22.089984"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36000, "epoch": 0, "train_loss": 3.7984340488910675, "train_ppl": 44.631239403461734, "lr": 0.00056, "grad_norm": 0.6604, "tokens_per_sec": 150199, "dt_s": 4.363, "eta_s": 25470, "world_size": 1, "timestamp": "2026-05-05T00:51:26.453238"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36010, "epoch": 0, "train_loss": 3.7341136783361435, "train_ppl": 41.85091574663646, "lr": 0.00056, "grad_norm": 0.653, "tokens_per_sec": 126865, "dt_s": 5.166, "eta_s": 25484, "world_size": 1, "timestamp": "2026-05-05T00:51:31.619025"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36020, "epoch": 0, "train_loss": 3.887414515018463, "train_ppl": 48.784591498490656, "lr": 0.00056, "grad_norm": 0.6698, "tokens_per_sec": 149333, "dt_s": 4.389, "eta_s": 24909, "world_size": 1, "timestamp": "2026-05-05T00:51:36.007615"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36030, "epoch": 0, "train_loss": 3.7849681079387665, "train_ppl": 44.03426619284218, "lr": 0.00056, "grad_norm": 0.6585, "tokens_per_sec": 148664, "dt_s": 4.408, "eta_s": 24931, "world_size": 1, "timestamp": "2026-05-05T00:51:40.415955"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36040, "epoch": 0, "train_loss": 3.858749896287918, "train_ppl": 47.406051829921495, "lr": 0.00056, "grad_norm": 0.6887, "tokens_per_sec": 149041, "dt_s": 4.397, "eta_s": 24863, "world_size": 1, "timestamp": "2026-05-05T00:51:44.813145"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36050, "epoch": 0, "train_loss": 4.026480406522751, "train_ppl": 56.06324376951187, "lr": 0.00056, "grad_norm": 2.4066, "tokens_per_sec": 148941, "dt_s": 4.4, "eta_s": 24900, "world_size": 1, "timestamp": "2026-05-05T00:51:49.213297"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36060, "epoch": 0, "train_loss": 3.76765713095665, "train_ppl": 43.27854998927979, "lr": 0.00056, "grad_norm": 0.705, "tokens_per_sec": 149891, "dt_s": 4.372, "eta_s": 24873, "world_size": 1, "timestamp": "2026-05-05T00:51:53.585516"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36070, "epoch": 0, "train_loss": 3.888423979282379, "train_ppl": 48.83386266479492, "lr": 0.00056, "grad_norm": 0.7031, "tokens_per_sec": 148707, "dt_s": 4.407, "eta_s": 24889, "world_size": 1, "timestamp": "2026-05-05T00:51:57.992558"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36080, "epoch": 0, "train_loss": 3.8066662400960922, "train_ppl": 45.00016876497622, "lr": 0.00056, "grad_norm": 0.6853, "tokens_per_sec": 148759, "dt_s": 4.406, "eta_s": 24882, "world_size": 1, "timestamp": "2026-05-05T00:52:02.398067"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36090, "epoch": 0, "train_loss": 3.768415093421936, "train_ppl": 43.311365940782345, "lr": 0.00056, "grad_norm": 0.6577, "tokens_per_sec": 151603, "dt_s": 4.323, "eta_s": 24793, "world_size": 1, "timestamp": "2026-05-05T00:52:06.720953"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36100, "epoch": 0, "train_loss": 3.923633337020874, "train_ppl": 50.58389965482613, "lr": 0.00056, "grad_norm": 0.7389, "tokens_per_sec": 146908, "dt_s": 4.461, "eta_s": 24858, "world_size": 1, "timestamp": "2026-05-05T00:52:11.182009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36110, "epoch": 0, "train_loss": 3.7936746329069138, "train_ppl": 44.41932546257452, "lr": 0.00056, "grad_norm": 0.7245, "tokens_per_sec": 147216, "dt_s": 4.452, "eta_s": 24943, "world_size": 1, "timestamp": "2026-05-05T00:52:15.633686"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36120, "epoch": 0, "train_loss": 3.694006010890007, "train_ppl": 40.20558880210967, "lr": 0.00056, "grad_norm": 0.7148, "tokens_per_sec": 148474, "dt_s": 4.414, "eta_s": 24947, "world_size": 1, "timestamp": "2026-05-05T00:52:20.047650"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36130, "epoch": 0, "train_loss": 3.913308173418045, "train_ppl": 50.064299708608495, "lr": 0.00056, "grad_norm": 0.6566, "tokens_per_sec": 147162, "dt_s": 4.453, "eta_s": 24996, "world_size": 1, "timestamp": "2026-05-05T00:52:24.500979"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36140, "epoch": 0, "train_loss": 3.656826838850975, "train_ppl": 38.73822500765074, "lr": 0.00056, "grad_norm": 0.6798, "tokens_per_sec": 151772, "dt_s": 4.318, "eta_s": 24986, "world_size": 1, "timestamp": "2026-05-05T00:52:28.819014"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36150, "epoch": 0, "train_loss": 3.6662754863500595, "train_ppl": 39.10598351471183, "lr": 0.00056, "grad_norm": 0.6338, "tokens_per_sec": 149364, "dt_s": 4.388, "eta_s": 24899, "world_size": 1, "timestamp": "2026-05-05T00:52:33.206705"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36160, "epoch": 0, "train_loss": 3.664502829313278, "train_ppl": 39.03672342318493, "lr": 0.00056, "grad_norm": 0.7085, "tokens_per_sec": 151541, "dt_s": 4.325, "eta_s": 24751, "world_size": 1, "timestamp": "2026-05-05T00:52:37.531391"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36170, "epoch": 0, "train_loss": 3.763028010725975, "train_ppl": 43.078671365988555, "lr": 0.00056, "grad_norm": 0.6948, "tokens_per_sec": 151373, "dt_s": 4.329, "eta_s": 24651, "world_size": 1, "timestamp": "2026-05-05T00:52:41.860795"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36180, "epoch": 0, "train_loss": 3.7264197021722794, "train_ppl": 41.53015135759082, "lr": 0.00056, "grad_norm": 0.6207, "tokens_per_sec": 149895, "dt_s": 4.372, "eta_s": 24555, "world_size": 1, "timestamp": "2026-05-05T00:52:46.232904"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36190, "epoch": 0, "train_loss": 3.815940737724304, "train_ppl": 45.419464094711394, "lr": 0.00056, "grad_norm": 0.6516, "tokens_per_sec": 150603, "dt_s": 4.352, "eta_s": 24589, "world_size": 1, "timestamp": "2026-05-05T00:52:50.584464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36200, "epoch": 0, "train_loss": 3.684493139386177, "train_ppl": 39.82493164403299, "lr": 0.00056, "grad_norm": 0.6683, "tokens_per_sec": 149727, "dt_s": 4.377, "eta_s": 24572, "world_size": 1, "timestamp": "2026-05-05T00:52:54.961490"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36210, "epoch": 0, "train_loss": 3.713421270251274, "train_ppl": 40.993817811563375, "lr": 0.00056, "grad_norm": 0.7197, "tokens_per_sec": 147385, "dt_s": 4.447, "eta_s": 24706, "world_size": 1, "timestamp": "2026-05-05T00:52:59.408073"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36220, "epoch": 0, "train_loss": 3.730852320790291, "train_ppl": 41.714647277649526, "lr": 0.00056, "grad_norm": 0.6742, "tokens_per_sec": 148776, "dt_s": 4.405, "eta_s": 24787, "world_size": 1, "timestamp": "2026-05-05T00:53:03.813091"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36230, "epoch": 0, "train_loss": 3.7314347326755524, "train_ppl": 41.738949460266575, "lr": 0.00056, "grad_norm": 0.6407, "tokens_per_sec": 150909, "dt_s": 4.343, "eta_s": 24749, "world_size": 1, "timestamp": "2026-05-05T00:53:08.155874"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36240, "epoch": 0, "train_loss": 3.7200769931077957, "train_ppl": 41.26757130486406, "lr": 0.00056, "grad_norm": 0.664, "tokens_per_sec": 149446, "dt_s": 4.385, "eta_s": 24783, "world_size": 1, "timestamp": "2026-05-05T00:53:12.541104"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36250, "epoch": 0, "train_loss": 3.796171322464943, "train_ppl": 44.53036528693646, "lr": 0.00056, "grad_norm": 0.6618, "tokens_per_sec": 150221, "dt_s": 4.363, "eta_s": 24762, "world_size": 1, "timestamp": "2026-05-05T00:53:16.903767"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36260, "epoch": 0, "train_loss": 3.790928214788437, "train_ppl": 44.297498792377056, "lr": 0.00056, "grad_norm": 0.6744, "tokens_per_sec": 133458, "dt_s": 4.911, "eta_s": 25281, "world_size": 1, "timestamp": "2026-05-05T00:53:21.814365"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36270, "epoch": 0, "train_loss": 3.6548871099948883, "train_ppl": 38.66315618488818, "lr": 0.00056, "grad_norm": 0.6263, "tokens_per_sec": 151049, "dt_s": 4.339, "eta_s": 25202, "world_size": 1, "timestamp": "2026-05-05T00:53:26.153059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36280, "epoch": 0, "train_loss": 3.7795897126197815, "train_ppl": 43.7980682539972, "lr": 0.00056, "grad_norm": 0.6782, "tokens_per_sec": 149074, "dt_s": 4.396, "eta_s": 25258, "world_size": 1, "timestamp": "2026-05-05T00:53:30.549278"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36290, "epoch": 0, "train_loss": 3.6844842731952667, "train_ppl": 39.82457855015135, "lr": 0.00056, "grad_norm": 0.6608, "tokens_per_sec": 147809, "dt_s": 4.434, "eta_s": 25308, "world_size": 1, "timestamp": "2026-05-05T00:53:34.983095"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36300, "epoch": 0, "train_loss": 3.7464448362588882, "train_ppl": 42.37018098894377, "lr": 0.00056, "grad_norm": 0.6636, "tokens_per_sec": 150492, "dt_s": 4.355, "eta_s": 25295, "world_size": 1, "timestamp": "2026-05-05T00:53:39.337896"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36310, "epoch": 0, "train_loss": 3.851348266005516, "train_ppl": 47.0564651129257, "lr": 0.00056, "grad_norm": 0.65, "tokens_per_sec": 150613, "dt_s": 4.351, "eta_s": 24660, "world_size": 1, "timestamp": "2026-05-05T00:53:43.689154"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36320, "epoch": 0, "train_loss": 3.7752650529146194, "train_ppl": 43.609065493936946, "lr": 0.00056, "grad_norm": 0.7117, "tokens_per_sec": 149673, "dt_s": 4.379, "eta_s": 24700, "world_size": 1, "timestamp": "2026-05-05T00:53:48.067774"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36330, "epoch": 0, "train_loss": 3.7399564683437347, "train_ppl": 42.09615760964695, "lr": 0.00056, "grad_norm": 0.6754, "tokens_per_sec": 150138, "dt_s": 4.365, "eta_s": 24661, "world_size": 1, "timestamp": "2026-05-05T00:53:52.432827"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36340, "epoch": 0, "train_loss": 3.7985082864761353, "train_ppl": 44.63455284188294, "lr": 0.00056, "grad_norm": 0.759, "tokens_per_sec": 146743, "dt_s": 4.466, "eta_s": 24693, "world_size": 1, "timestamp": "2026-05-05T00:53:56.898872"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36350, "epoch": 0, "train_loss": 3.81552292406559, "train_ppl": 45.40049118608733, "lr": 0.00056, "grad_norm": 0.6621, "tokens_per_sec": 145015, "dt_s": 4.519, "eta_s": 24874, "world_size": 1, "timestamp": "2026-05-05T00:54:01.418155"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36360, "epoch": 0, "train_loss": 3.778682067990303, "train_ppl": 43.75833320794338, "lr": 0.00056, "grad_norm": 0.7034, "tokens_per_sec": 147458, "dt_s": 4.444, "eta_s": 24974, "world_size": 1, "timestamp": "2026-05-05T00:54:05.862490"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36370, "epoch": 0, "train_loss": 3.782622918486595, "train_ppl": 43.93111849395563, "lr": 0.00056, "grad_norm": 0.6803, "tokens_per_sec": 148085, "dt_s": 4.426, "eta_s": 25022, "world_size": 1, "timestamp": "2026-05-05T00:54:10.288055"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36380, "epoch": 0, "train_loss": 3.8266921937465668, "train_ppl": 45.910424003094306, "lr": 0.00056, "grad_norm": 0.6693, "tokens_per_sec": 148897, "dt_s": 4.401, "eta_s": 25059, "world_size": 1, "timestamp": "2026-05-05T00:54:14.689501"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36390, "epoch": 0, "train_loss": 3.620067372918129, "train_ppl": 37.34008344769523, "lr": 0.00056, "grad_norm": 0.7268, "tokens_per_sec": 148387, "dt_s": 4.417, "eta_s": 24999, "world_size": 1, "timestamp": "2026-05-05T00:54:19.106068"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36400, "epoch": 0, "train_loss": 3.7707021832466125, "train_ppl": 43.410536287635765, "lr": 0.00056, "grad_norm": 0.6512, "tokens_per_sec": 144635, "dt_s": 4.531, "eta_s": 25008, "world_size": 1, "timestamp": "2026-05-05T00:54:23.637220"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36410, "epoch": 0, "train_loss": 3.8379661440849304, "train_ppl": 46.43094449113185, "lr": 0.00056, "grad_norm": 0.7076, "tokens_per_sec": 146506, "dt_s": 4.473, "eta_s": 25036, "world_size": 1, "timestamp": "2026-05-05T00:54:28.110462"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36420, "epoch": 0, "train_loss": 3.727692022919655, "train_ppl": 41.58302465957166, "lr": 0.00056, "grad_norm": 0.6743, "tokens_per_sec": 147512, "dt_s": 4.443, "eta_s": 25051, "world_size": 1, "timestamp": "2026-05-05T00:54:32.553244"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36430, "epoch": 0, "train_loss": 3.763623893260956, "train_ppl": 43.10434884350613, "lr": 0.00056, "grad_norm": 0.6505, "tokens_per_sec": 146933, "dt_s": 4.46, "eta_s": 25112, "world_size": 1, "timestamp": "2026-05-05T00:54:37.013493"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36440, "epoch": 0, "train_loss": 3.73489473760128, "train_ppl": 41.88361656110945, "lr": 0.00056, "grad_norm": 0.6217, "tokens_per_sec": 147533, "dt_s": 4.442, "eta_s": 25137, "world_size": 1, "timestamp": "2026-05-05T00:54:41.455618"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36450, "epoch": 0, "train_loss": 3.8465036749839783, "train_ppl": 46.82904710295167, "lr": 0.00056, "grad_norm": 0.7036, "tokens_per_sec": 149764, "dt_s": 4.376, "eta_s": 24958, "world_size": 1, "timestamp": "2026-05-05T00:54:45.831560"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36460, "epoch": 0, "train_loss": 3.787225902080536, "train_ppl": 44.133798820900026, "lr": 0.00056, "grad_norm": 0.662, "tokens_per_sec": 145518, "dt_s": 4.504, "eta_s": 24987, "world_size": 1, "timestamp": "2026-05-05T00:54:50.335197"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36470, "epoch": 0, "train_loss": 3.768380180001259, "train_ppl": 43.309853819239976, "lr": 0.00056, "grad_norm": 0.672, "tokens_per_sec": 146757, "dt_s": 4.466, "eta_s": 25009, "world_size": 1, "timestamp": "2026-05-05T00:54:54.800827"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36480, "epoch": 0, "train_loss": 3.8419215083122253, "train_ppl": 46.61495947123836, "lr": 0.00056, "grad_norm": 0.7423, "tokens_per_sec": 145823, "dt_s": 4.494, "eta_s": 25042, "world_size": 1, "timestamp": "2026-05-05T00:54:59.294995"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36490, "epoch": 0, "train_loss": 3.7531086206436157, "train_ppl": 42.653469578125275, "lr": 0.00056, "grad_norm": 0.7018, "tokens_per_sec": 147761, "dt_s": 4.435, "eta_s": 25030, "world_size": 1, "timestamp": "2026-05-05T00:55:03.730269"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36500, "epoch": 0, "train_loss": 3.8574350774288177, "train_ppl": 47.34376241756064, "lr": 0.00056, "grad_norm": 0.7153, "tokens_per_sec": 147655, "dt_s": 4.438, "eta_s": 25096, "world_size": 1, "timestamp": "2026-05-05T00:55:08.168727"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36510, "epoch": 0, "train_loss": 3.7668532729148865, "train_ppl": 43.243774158123465, "lr": 0.00056, "grad_norm": 0.6799, "tokens_per_sec": 123992, "dt_s": 5.286, "eta_s": 25091, "world_size": 1, "timestamp": "2026-05-05T00:55:13.454256"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36520, "epoch": 0, "train_loss": 3.8560742139816284, "train_ppl": 47.279377841078926, "lr": 0.00056, "grad_norm": 0.6692, "tokens_per_sec": 146762, "dt_s": 4.465, "eta_s": 25086, "world_size": 1, "timestamp": "2026-05-05T00:55:17.919737"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36530, "epoch": 0, "train_loss": 3.812830314040184, "train_ppl": 45.27840980090116, "lr": 0.00056, "grad_norm": 0.6893, "tokens_per_sec": 148070, "dt_s": 4.426, "eta_s": 25005, "world_size": 1, "timestamp": "2026-05-05T00:55:22.345721"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36540, "epoch": 0, "train_loss": 3.8272363990545273, "train_ppl": 45.93541549916313, "lr": 0.00056, "grad_norm": 0.6546, "tokens_per_sec": 149528, "dt_s": 4.383, "eta_s": 24942, "world_size": 1, "timestamp": "2026-05-05T00:55:26.728611"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36550, "epoch": 0, "train_loss": 3.815486714243889, "train_ppl": 45.398847272159436, "lr": 0.00056, "grad_norm": 0.6966, "tokens_per_sec": 149672, "dt_s": 4.379, "eta_s": 24870, "world_size": 1, "timestamp": "2026-05-05T00:55:31.107247"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36560, "epoch": 0, "train_loss": 3.8611215502023697, "train_ppl": 47.51861600718641, "lr": 0.00056, "grad_norm": 0.6937, "tokens_per_sec": 130560, "dt_s": 5.02, "eta_s": 25446, "world_size": 1, "timestamp": "2026-05-05T00:55:36.126847"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36570, "epoch": 0, "train_loss": 3.770380914211273, "train_ppl": 43.396592066562235, "lr": 0.00056, "grad_norm": 0.7017, "tokens_per_sec": 146136, "dt_s": 4.485, "eta_s": 25463, "world_size": 1, "timestamp": "2026-05-05T00:55:40.611432"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36580, "epoch": 0, "train_loss": 3.6589894592761993, "train_ppl": 38.82209173755422, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 150545, "dt_s": 4.353, "eta_s": 25376, "world_size": 1, "timestamp": "2026-05-05T00:55:44.964707"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36590, "epoch": 0, "train_loss": 3.7303186506032944, "train_ppl": 41.69239135322626, "lr": 0.00056, "grad_norm": 0.6274, "tokens_per_sec": 146586, "dt_s": 4.471, "eta_s": 25471, "world_size": 1, "timestamp": "2026-05-05T00:55:49.435489"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36600, "epoch": 0, "train_loss": 3.8094596415758133, "train_ppl": 45.12604803681016, "lr": 0.00056, "grad_norm": 0.6626, "tokens_per_sec": 149387, "dt_s": 4.387, "eta_s": 25475, "world_size": 1, "timestamp": "2026-05-05T00:55:53.822495"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36610, "epoch": 0, "train_loss": 3.906772091984749, "train_ppl": 49.73814242522312, "lr": 0.00056, "grad_norm": 0.6868, "tokens_per_sec": 147344, "dt_s": 4.448, "eta_s": 24830, "world_size": 1, "timestamp": "2026-05-05T00:55:58.270341"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36620, "epoch": 0, "train_loss": 3.834035202860832, "train_ppl": 46.248785440183816, "lr": 0.00056, "grad_norm": 0.7414, "tokens_per_sec": 146238, "dt_s": 4.481, "eta_s": 24822, "world_size": 1, "timestamp": "2026-05-05T00:56:02.751832"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36630, "epoch": 0, "train_loss": 3.7658818662166595, "train_ppl": 43.20178726272704, "lr": 0.00056, "grad_norm": 0.6983, "tokens_per_sec": 148222, "dt_s": 4.421, "eta_s": 24894, "world_size": 1, "timestamp": "2026-05-05T00:56:07.173291"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36640, "epoch": 0, "train_loss": 3.5594267398118973, "train_ppl": 35.143045260785286, "lr": 0.00056, "grad_norm": 0.7981, "tokens_per_sec": 145580, "dt_s": 4.502, "eta_s": 24924, "world_size": 1, "timestamp": "2026-05-05T00:56:11.674998"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36650, "epoch": 0, "train_loss": 3.8027328550815582, "train_ppl": 44.82351342998871, "lr": 0.00056, "grad_norm": 0.6703, "tokens_per_sec": 148190, "dt_s": 4.422, "eta_s": 24959, "world_size": 1, "timestamp": "2026-05-05T00:56:16.097441"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36660, "epoch": 0, "train_loss": 3.902619808912277, "train_ppl": 49.532043764484776, "lr": 0.00056, "grad_norm": 0.653, "tokens_per_sec": 147826, "dt_s": 4.433, "eta_s": 24939, "world_size": 1, "timestamp": "2026-05-05T00:56:20.530758"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36670, "epoch": 0, "train_loss": 3.720135048031807, "train_ppl": 41.26996716012522, "lr": 0.00056, "grad_norm": 0.6397, "tokens_per_sec": 145065, "dt_s": 4.518, "eta_s": 24975, "world_size": 1, "timestamp": "2026-05-05T00:56:25.048434"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36680, "epoch": 0, "train_loss": 3.822332814335823, "train_ppl": 45.71071865791144, "lr": 0.00056, "grad_norm": 0.689, "tokens_per_sec": 146079, "dt_s": 4.486, "eta_s": 25043, "world_size": 1, "timestamp": "2026-05-05T00:56:29.534778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36690, "epoch": 0, "train_loss": 3.7779040187597275, "train_ppl": 43.72430031181084, "lr": 0.00056, "grad_norm": 0.6415, "tokens_per_sec": 147653, "dt_s": 4.439, "eta_s": 24968, "world_size": 1, "timestamp": "2026-05-05T00:56:33.973289"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36700, "epoch": 0, "train_loss": 3.7252700477838516, "train_ppl": 41.482433471624745, "lr": 0.00056, "grad_norm": 0.6925, "tokens_per_sec": 145063, "dt_s": 4.518, "eta_s": 25070, "world_size": 1, "timestamp": "2026-05-05T00:56:38.491051"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36710, "epoch": 0, "train_loss": 3.8595393002033234, "train_ppl": 47.44348912748047, "lr": 0.00056, "grad_norm": 0.6374, "tokens_per_sec": 147828, "dt_s": 4.433, "eta_s": 25065, "world_size": 1, "timestamp": "2026-05-05T00:56:42.924327"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36720, "epoch": 0, "train_loss": 3.769473224878311, "train_ppl": 43.35721931467323, "lr": 0.00056, "grad_norm": 0.6448, "tokens_per_sec": 145463, "dt_s": 4.505, "eta_s": 25047, "world_size": 1, "timestamp": "2026-05-05T00:56:47.429705"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36730, "epoch": 0, "train_loss": 3.786046102643013, "train_ppl": 44.08176049330632, "lr": 0.00056, "grad_norm": 0.726, "tokens_per_sec": 147252, "dt_s": 4.451, "eta_s": 25003, "world_size": 1, "timestamp": "2026-05-05T00:56:51.880290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36740, "epoch": 0, "train_loss": 3.816779524087906, "train_ppl": 45.45757730402167, "lr": 0.00056, "grad_norm": 0.728, "tokens_per_sec": 150088, "dt_s": 4.367, "eta_s": 24918, "world_size": 1, "timestamp": "2026-05-05T00:56:56.246798"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36750, "epoch": 0, "train_loss": 3.6924794912338257, "train_ppl": 40.14426100147064, "lr": 0.00056, "grad_norm": 0.628, "tokens_per_sec": 146145, "dt_s": 4.484, "eta_s": 24876, "world_size": 1, "timestamp": "2026-05-05T00:57:00.731101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36760, "epoch": 0, "train_loss": 3.6663670241832733, "train_ppl": 39.109563355551394, "lr": 0.00056, "grad_norm": 0.617, "tokens_per_sec": 149499, "dt_s": 4.384, "eta_s": 24816, "world_size": 1, "timestamp": "2026-05-05T00:57:05.114824"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36770, "epoch": 0, "train_loss": 3.814073771238327, "train_ppl": 45.33474658441912, "lr": 0.00056, "grad_norm": 0.6292, "tokens_per_sec": 148218, "dt_s": 4.422, "eta_s": 24718, "world_size": 1, "timestamp": "2026-05-05T00:57:09.536419"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36780, "epoch": 0, "train_loss": 3.843657523393631, "train_ppl": 46.69595402744365, "lr": 0.00056, "grad_norm": 0.6305, "tokens_per_sec": 144055, "dt_s": 4.549, "eta_s": 24824, "world_size": 1, "timestamp": "2026-05-05T00:57:14.085802"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36790, "epoch": 0, "train_loss": 3.853844717144966, "train_ppl": 47.17408603523102, "lr": 0.00056, "grad_norm": 0.7183, "tokens_per_sec": 147179, "dt_s": 4.453, "eta_s": 24916, "world_size": 1, "timestamp": "2026-05-05T00:57:18.538622"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36800, "epoch": 0, "train_loss": 3.7548162192106247, "train_ppl": 42.726366803540294, "lr": 0.00056, "grad_norm": 0.699, "tokens_per_sec": 148172, "dt_s": 4.423, "eta_s": 24843, "world_size": 1, "timestamp": "2026-05-05T00:57:22.961552"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36810, "epoch": 0, "train_loss": 3.8054230511188507, "train_ppl": 44.94425981109189, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 145482, "dt_s": 4.505, "eta_s": 24974, "world_size": 1, "timestamp": "2026-05-05T00:57:27.466342"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36820, "epoch": 0, "train_loss": 3.873756393790245, "train_ppl": 48.12281523149136, "lr": 0.00056, "grad_norm": 0.6845, "tokens_per_sec": 147581, "dt_s": 4.441, "eta_s": 24990, "world_size": 1, "timestamp": "2026-05-05T00:57:31.907009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36830, "epoch": 0, "train_loss": 3.6717488318681717, "train_ppl": 39.32061090343324, "lr": 0.00056, "grad_norm": 0.6604, "tokens_per_sec": 146234, "dt_s": 4.482, "eta_s": 24910, "world_size": 1, "timestamp": "2026-05-05T00:57:36.388605"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36840, "epoch": 0, "train_loss": 3.783622607588768, "train_ppl": 43.975057913580436, "lr": 0.00056, "grad_norm": 0.6429, "tokens_per_sec": 148704, "dt_s": 4.407, "eta_s": 24855, "world_size": 1, "timestamp": "2026-05-05T00:57:40.795732"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36850, "epoch": 0, "train_loss": 3.748751014471054, "train_ppl": 42.46800693586275, "lr": 0.00056, "grad_norm": 0.6595, "tokens_per_sec": 130880, "dt_s": 5.007, "eta_s": 25503, "world_size": 1, "timestamp": "2026-05-05T00:57:45.803079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36860, "epoch": 0, "train_loss": 3.812928020954132, "train_ppl": 45.28283403072668, "lr": 0.00056, "grad_norm": 0.6317, "tokens_per_sec": 145848, "dt_s": 4.493, "eta_s": 25486, "world_size": 1, "timestamp": "2026-05-05T00:57:50.296569"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36870, "epoch": 0, "train_loss": 3.825942650437355, "train_ppl": 45.87602504532452, "lr": 0.00056, "grad_norm": 0.6285, "tokens_per_sec": 150178, "dt_s": 4.364, "eta_s": 25395, "world_size": 1, "timestamp": "2026-05-05T00:57:54.660450"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36880, "epoch": 0, "train_loss": 3.7946891635656357, "train_ppl": 44.4644130976228, "lr": 0.00056, "grad_norm": 0.6359, "tokens_per_sec": 150110, "dt_s": 4.366, "eta_s": 25262, "world_size": 1, "timestamp": "2026-05-05T00:57:59.026314"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36890, "epoch": 0, "train_loss": 3.660765677690506, "train_ppl": 38.89110952897073, "lr": 0.00056, "grad_norm": 0.7266, "tokens_per_sec": 148118, "dt_s": 4.425, "eta_s": 25277, "world_size": 1, "timestamp": "2026-05-05T00:58:03.450897"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36900, "epoch": 0, "train_loss": 3.7671948969364166, "train_ppl": 43.258549793869854, "lr": 0.00056, "grad_norm": 0.65, "tokens_per_sec": 149748, "dt_s": 4.376, "eta_s": 24568, "world_size": 1, "timestamp": "2026-05-05T00:58:07.827344"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36910, "epoch": 0, "train_loss": 3.7782018035650253, "train_ppl": 43.73732268290187, "lr": 0.00056, "grad_norm": 0.6349, "tokens_per_sec": 149173, "dt_s": 4.393, "eta_s": 24452, "world_size": 1, "timestamp": "2026-05-05T00:58:12.220627"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36920, "epoch": 0, "train_loss": 3.6960400342941284, "train_ppl": 40.287451137438886, "lr": 0.00056, "grad_norm": 0.6651, "tokens_per_sec": 147096, "dt_s": 4.455, "eta_s": 24549, "world_size": 1, "timestamp": "2026-05-05T00:58:16.675939"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36930, "epoch": 0, "train_loss": 3.7935521453619003, "train_ppl": 44.41388498164983, "lr": 0.00056, "grad_norm": 0.668, "tokens_per_sec": 149588, "dt_s": 4.381, "eta_s": 24562, "world_size": 1, "timestamp": "2026-05-05T00:58:21.057018"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36940, "epoch": 0, "train_loss": 3.691817954182625, "train_ppl": 40.117712867680346, "lr": 0.00056, "grad_norm": 0.7078, "tokens_per_sec": 146549, "dt_s": 4.472, "eta_s": 24611, "world_size": 1, "timestamp": "2026-05-05T00:58:25.528975"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36950, "epoch": 0, "train_loss": 3.6840890049934387, "train_ppl": 39.80884027122479, "lr": 0.00056, "grad_norm": 0.725, "tokens_per_sec": 150721, "dt_s": 4.348, "eta_s": 24575, "world_size": 1, "timestamp": "2026-05-05T00:58:29.877157"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36960, "epoch": 0, "train_loss": 3.7637766003608704, "train_ppl": 43.110931686222344, "lr": 0.00056, "grad_norm": 0.6484, "tokens_per_sec": 149987, "dt_s": 4.369, "eta_s": 24544, "world_size": 1, "timestamp": "2026-05-05T00:58:34.246609"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36970, "epoch": 0, "train_loss": 3.8665503561496735, "train_ppl": 47.777286853995776, "lr": 0.00056, "grad_norm": 0.6902, "tokens_per_sec": 148424, "dt_s": 4.415, "eta_s": 24495, "world_size": 1, "timestamp": "2026-05-05T00:58:38.662051"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36980, "epoch": 0, "train_loss": 3.751578599214554, "train_ppl": 42.58825875534639, "lr": 0.00056, "grad_norm": 0.6487, "tokens_per_sec": 148385, "dt_s": 4.417, "eta_s": 24530, "world_size": 1, "timestamp": "2026-05-05T00:58:43.078697"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 36990, "epoch": 0, "train_loss": 3.7668319642543793, "train_ppl": 43.242852701038416, "lr": 0.00056, "grad_norm": 0.6676, "tokens_per_sec": 147859, "dt_s": 4.432, "eta_s": 24481, "world_size": 1, "timestamp": "2026-05-05T00:58:47.510981"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37000, "epoch": 0, "train_loss": 3.71120485663414, "train_ppl": 40.903059172044, "lr": 0.00056, "grad_norm": 0.6464, "tokens_per_sec": 146130, "dt_s": 4.485, "eta_s": 24629, "world_size": 1, "timestamp": "2026-05-05T00:58:51.995753"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37010, "epoch": 0, "train_loss": 3.7301995754241943, "train_ppl": 41.68742711982313, "lr": 0.00056, "grad_norm": 0.6587, "tokens_per_sec": 125598, "dt_s": 5.218, "eta_s": 24705, "world_size": 1, "timestamp": "2026-05-05T00:58:57.213663"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37020, "epoch": 0, "train_loss": 3.805311918258667, "train_ppl": 44.939265304482284, "lr": 0.00056, "grad_norm": 0.6336, "tokens_per_sec": 145802, "dt_s": 4.495, "eta_s": 24789, "world_size": 1, "timestamp": "2026-05-05T00:59:01.708526"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37030, "epoch": 0, "train_loss": 3.9023501873016357, "train_ppl": 49.51869065529093, "lr": 0.00056, "grad_norm": 0.7853, "tokens_per_sec": 148074, "dt_s": 4.426, "eta_s": 24795, "world_size": 1, "timestamp": "2026-05-05T00:59:06.134410"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37040, "epoch": 0, "train_loss": 3.675317957997322, "train_ppl": 39.461201867419355, "lr": 0.00056, "grad_norm": 0.7157, "tokens_per_sec": 146774, "dt_s": 4.465, "eta_s": 24827, "world_size": 1, "timestamp": "2026-05-05T00:59:10.599511"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37050, "epoch": 0, "train_loss": 3.7409921437501907, "train_ppl": 42.13977814925057, "lr": 0.00056, "grad_norm": 0.6446, "tokens_per_sec": 147359, "dt_s": 4.447, "eta_s": 24781, "world_size": 1, "timestamp": "2026-05-05T00:59:15.046864"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37060, "epoch": 0, "train_loss": 3.7563168108463287, "train_ppl": 42.79052976134857, "lr": 0.00056, "grad_norm": 0.8381, "tokens_per_sec": 148702, "dt_s": 4.407, "eta_s": 24738, "world_size": 1, "timestamp": "2026-05-05T00:59:19.454071"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37070, "epoch": 0, "train_loss": 3.6830621361732483, "train_ppl": 39.76798279560643, "lr": 0.00056, "grad_norm": 0.6843, "tokens_per_sec": 148303, "dt_s": 4.419, "eta_s": 24650, "world_size": 1, "timestamp": "2026-05-05T00:59:23.873146"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37080, "epoch": 0, "train_loss": 3.824968457221985, "train_ppl": 45.83135469528606, "lr": 0.00056, "grad_norm": 0.62, "tokens_per_sec": 150082, "dt_s": 4.367, "eta_s": 24579, "world_size": 1, "timestamp": "2026-05-05T00:59:28.239839"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37090, "epoch": 0, "train_loss": 3.742586702108383, "train_ppl": 42.207026085841704, "lr": 0.00056, "grad_norm": 0.6455, "tokens_per_sec": 148026, "dt_s": 4.427, "eta_s": 24533, "world_size": 1, "timestamp": "2026-05-05T00:59:32.667166"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37100, "epoch": 0, "train_loss": 3.7725947499275208, "train_ppl": 43.49277141539963, "lr": 0.00056, "grad_norm": 0.644, "tokens_per_sec": 147706, "dt_s": 4.437, "eta_s": 24517, "world_size": 1, "timestamp": "2026-05-05T00:59:37.104077"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37110, "epoch": 0, "train_loss": 3.782650128006935, "train_ppl": 43.93231385488037, "lr": 0.00056, "grad_norm": 0.6389, "tokens_per_sec": 149268, "dt_s": 4.39, "eta_s": 24494, "world_size": 1, "timestamp": "2026-05-05T00:59:41.494570"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37120, "epoch": 0, "train_loss": 3.8523946553468704, "train_ppl": 47.1057302672327, "lr": 0.00056, "grad_norm": 0.6363, "tokens_per_sec": 150010, "dt_s": 4.369, "eta_s": 24434, "world_size": 1, "timestamp": "2026-05-05T00:59:45.863364"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37130, "epoch": 0, "train_loss": 3.808008447289467, "train_ppl": 45.06060886769229, "lr": 0.00056, "grad_norm": 0.6499, "tokens_per_sec": 147456, "dt_s": 4.444, "eta_s": 24516, "world_size": 1, "timestamp": "2026-05-05T00:59:50.307789"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37140, "epoch": 0, "train_loss": 3.8098415583372116, "train_ppl": 45.143285722402084, "lr": 0.00056, "grad_norm": 0.6522, "tokens_per_sec": 150696, "dt_s": 4.349, "eta_s": 24424, "world_size": 1, "timestamp": "2026-05-05T00:59:54.656681"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37150, "epoch": 0, "train_loss": 3.661655515432358, "train_ppl": 38.925731707828305, "lr": 0.00056, "grad_norm": 0.6294, "tokens_per_sec": 134057, "dt_s": 4.889, "eta_s": 24921, "world_size": 1, "timestamp": "2026-05-05T00:59:59.545348"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37160, "epoch": 0, "train_loss": 3.652647152543068, "train_ppl": 38.5766492821417, "lr": 0.00056, "grad_norm": 0.7154, "tokens_per_sec": 147776, "dt_s": 4.435, "eta_s": 24966, "world_size": 1, "timestamp": "2026-05-05T01:00:03.980145"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37170, "epoch": 0, "train_loss": 3.7546098232269287, "train_ppl": 42.717549163028224, "lr": 0.00056, "grad_norm": 0.7182, "tokens_per_sec": 150229, "dt_s": 4.362, "eta_s": 24954, "world_size": 1, "timestamp": "2026-05-05T01:00:08.342584"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37180, "epoch": 0, "train_loss": 3.694786548614502, "train_ppl": 40.23698303150156, "lr": 0.00056, "grad_norm": 0.6607, "tokens_per_sec": 146745, "dt_s": 4.466, "eta_s": 24974, "world_size": 1, "timestamp": "2026-05-05T01:00:12.808578"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37190, "epoch": 0, "train_loss": 3.6575902849435806, "train_ppl": 38.76781084632677, "lr": 0.00056, "grad_norm": 0.6476, "tokens_per_sec": 149297, "dt_s": 4.39, "eta_s": 25014, "world_size": 1, "timestamp": "2026-05-05T01:00:17.198205"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37200, "epoch": 0, "train_loss": 3.7239438593387604, "train_ppl": 41.42745641070957, "lr": 0.00056, "grad_norm": 0.6642, "tokens_per_sec": 153625, "dt_s": 4.266, "eta_s": 24319, "world_size": 1, "timestamp": "2026-05-05T01:00:21.464148"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37210, "epoch": 0, "train_loss": 3.719434306025505, "train_ppl": 41.24105769076017, "lr": 0.00056, "grad_norm": 0.643, "tokens_per_sec": 147981, "dt_s": 4.429, "eta_s": 24308, "world_size": 1, "timestamp": "2026-05-05T01:00:25.892856"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37220, "epoch": 0, "train_loss": 3.841603696346283, "train_ppl": 46.600147033236176, "lr": 0.00056, "grad_norm": 0.6978, "tokens_per_sec": 150050, "dt_s": 4.368, "eta_s": 24309, "world_size": 1, "timestamp": "2026-05-05T01:00:30.260448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37230, "epoch": 0, "train_loss": 3.7464764416217804, "train_ppl": 42.37152013505172, "lr": 0.00056, "grad_norm": 0.6586, "tokens_per_sec": 149776, "dt_s": 4.376, "eta_s": 24205, "world_size": 1, "timestamp": "2026-05-05T01:00:34.636052"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37240, "epoch": 0, "train_loss": 3.75891450047493, "train_ppl": 42.901830776870625, "lr": 0.00056, "grad_norm": 0.6234, "tokens_per_sec": 145020, "dt_s": 4.519, "eta_s": 24344, "world_size": 1, "timestamp": "2026-05-05T01:00:39.155134"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37250, "epoch": 0, "train_loss": 3.810202032327652, "train_ppl": 45.159561636093564, "lr": 0.00056, "grad_norm": 0.6797, "tokens_per_sec": 150222, "dt_s": 4.363, "eta_s": 24447, "world_size": 1, "timestamp": "2026-05-05T01:00:43.517730"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37260, "epoch": 0, "train_loss": 3.8144953548908234, "train_ppl": 45.35386300177057, "lr": 0.00056, "grad_norm": 0.7264, "tokens_per_sec": 149297, "dt_s": 4.39, "eta_s": 24399, "world_size": 1, "timestamp": "2026-05-05T01:00:47.907403"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37270, "epoch": 0, "train_loss": 3.832820951938629, "train_ppl": 46.19266189074387, "lr": 0.00056, "grad_norm": 0.6638, "tokens_per_sec": 148435, "dt_s": 4.415, "eta_s": 24447, "world_size": 1, "timestamp": "2026-05-05T01:00:52.322527"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37280, "epoch": 0, "train_loss": 3.8428878784179688, "train_ppl": 46.660028547748524, "lr": 0.00056, "grad_norm": 0.6639, "tokens_per_sec": 150373, "dt_s": 4.358, "eta_s": 24424, "world_size": 1, "timestamp": "2026-05-05T01:00:56.680767"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37290, "epoch": 0, "train_loss": 3.7742151021957397, "train_ppl": 43.563302153102335, "lr": 0.00056, "grad_norm": 0.6426, "tokens_per_sec": 146766, "dt_s": 4.465, "eta_s": 24360, "world_size": 1, "timestamp": "2026-05-05T01:01:01.146114"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37300, "epoch": 0, "train_loss": 3.785577893257141, "train_ppl": 44.06112583034599, "lr": 0.00056, "grad_norm": 0.6504, "tokens_per_sec": 146195, "dt_s": 4.483, "eta_s": 24488, "world_size": 1, "timestamp": "2026-05-05T01:01:05.628877"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37310, "epoch": 0, "train_loss": 3.809633046388626, "train_ppl": 45.13387378921492, "lr": 0.00056, "grad_norm": 0.7059, "tokens_per_sec": 150827, "dt_s": 4.345, "eta_s": 24435, "world_size": 1, "timestamp": "2026-05-05T01:01:09.973999"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37320, "epoch": 0, "train_loss": 3.9214540123939514, "train_ppl": 50.47378095236632, "lr": 0.00056, "grad_norm": 0.7479, "tokens_per_sec": 147811, "dt_s": 4.434, "eta_s": 24451, "world_size": 1, "timestamp": "2026-05-05T01:01:14.407771"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37330, "epoch": 0, "train_loss": 3.6789578944444656, "train_ppl": 39.60509986525116, "lr": 0.00056, "grad_norm": 0.6675, "tokens_per_sec": 152325, "dt_s": 4.302, "eta_s": 24385, "world_size": 1, "timestamp": "2026-05-05T01:01:18.710144"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37340, "epoch": 0, "train_loss": 3.7732777893543243, "train_ppl": 43.52248884098861, "lr": 0.00056, "grad_norm": 0.6276, "tokens_per_sec": 151112, "dt_s": 4.337, "eta_s": 24238, "world_size": 1, "timestamp": "2026-05-05T01:01:23.047057"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37350, "epoch": 0, "train_loss": 3.7178003937005997, "train_ppl": 41.1737284383337, "lr": 0.00056, "grad_norm": 0.6436, "tokens_per_sec": 146817, "dt_s": 4.464, "eta_s": 24213, "world_size": 1, "timestamp": "2026-05-05T01:01:27.510845"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37360, "epoch": 0, "train_loss": 3.8624745905399323, "train_ppl": 47.582954127649984, "lr": 0.00056, "grad_norm": 0.7039, "tokens_per_sec": 149389, "dt_s": 4.387, "eta_s": 24255, "world_size": 1, "timestamp": "2026-05-05T01:01:31.897770"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37370, "epoch": 0, "train_loss": 3.7638888508081436, "train_ppl": 43.11577117919904, "lr": 0.00056, "grad_norm": 0.6617, "tokens_per_sec": 149345, "dt_s": 4.388, "eta_s": 24200, "world_size": 1, "timestamp": "2026-05-05T01:01:36.286010"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37380, "epoch": 0, "train_loss": 3.822492092847824, "train_ppl": 45.7179999730249, "lr": 0.00056, "grad_norm": 0.6669, "tokens_per_sec": 147146, "dt_s": 4.454, "eta_s": 24363, "world_size": 1, "timestamp": "2026-05-05T01:01:40.739816"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37390, "epoch": 0, "train_loss": 3.6619801968336105, "train_ppl": 38.9383722209027, "lr": 0.00056, "grad_norm": 0.7118, "tokens_per_sec": 150355, "dt_s": 4.359, "eta_s": 24383, "world_size": 1, "timestamp": "2026-05-05T01:01:45.098558"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37400, "epoch": 0, "train_loss": 3.8042512834072113, "train_ppl": 44.891626421697154, "lr": 0.00056, "grad_norm": 0.6331, "tokens_per_sec": 149319, "dt_s": 4.389, "eta_s": 24296, "world_size": 1, "timestamp": "2026-05-05T01:01:49.487561"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37410, "epoch": 0, "train_loss": 3.730089768767357, "train_ppl": 41.68284981413286, "lr": 0.00056, "grad_norm": 0.7114, "tokens_per_sec": 145764, "dt_s": 4.496, "eta_s": 24412, "world_size": 1, "timestamp": "2026-05-05T01:01:53.983588"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37420, "epoch": 0, "train_loss": 3.7953176498413086, "train_ppl": 44.4923671544609, "lr": 0.00056, "grad_norm": 0.6714, "tokens_per_sec": 149639, "dt_s": 4.38, "eta_s": 24398, "world_size": 1, "timestamp": "2026-05-05T01:01:58.363169"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37430, "epoch": 0, "train_loss": 3.775453433394432, "train_ppl": 43.61728136444939, "lr": 0.00056, "grad_norm": 0.702, "tokens_per_sec": 146690, "dt_s": 4.468, "eta_s": 24409, "world_size": 1, "timestamp": "2026-05-05T01:02:02.830875"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37440, "epoch": 0, "train_loss": 3.7916803807020187, "train_ppl": 44.33083039490179, "lr": 0.00056, "grad_norm": 0.7164, "tokens_per_sec": 135308, "dt_s": 4.843, "eta_s": 24940, "world_size": 1, "timestamp": "2026-05-05T01:02:07.674353"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37450, "epoch": 0, "train_loss": 3.7734444886446, "train_ppl": 43.52974461373871, "lr": 0.00056, "grad_norm": 0.6337, "tokens_per_sec": 151919, "dt_s": 4.314, "eta_s": 24852, "world_size": 1, "timestamp": "2026-05-05T01:02:11.988226"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37460, "epoch": 0, "train_loss": 3.808374121785164, "train_ppl": 45.077089396186615, "lr": 0.00056, "grad_norm": 0.645, "tokens_per_sec": 148457, "dt_s": 4.414, "eta_s": 24758, "world_size": 1, "timestamp": "2026-05-05T01:02:16.402738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37470, "epoch": 0, "train_loss": 3.824374556541443, "train_ppl": 45.80414350371478, "lr": 0.00056, "grad_norm": 0.6855, "tokens_per_sec": 150833, "dt_s": 4.345, "eta_s": 24715, "world_size": 1, "timestamp": "2026-05-05T01:02:20.747640"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37480, "epoch": 0, "train_loss": 3.7580432146787643, "train_ppl": 42.86446730058201, "lr": 0.00056, "grad_norm": 0.692, "tokens_per_sec": 148462, "dt_s": 4.414, "eta_s": 24652, "world_size": 1, "timestamp": "2026-05-05T01:02:25.161960"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37490, "epoch": 0, "train_loss": 3.79426346719265, "train_ppl": 44.44548878653128, "lr": 0.00056, "grad_norm": 0.6682, "tokens_per_sec": 148641, "dt_s": 4.409, "eta_s": 24168, "world_size": 1, "timestamp": "2026-05-05T01:02:29.570982"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37500, "epoch": 0, "train_loss": 3.8077407777309418, "train_ppl": 45.048549128494344, "lr": 0.00056, "grad_norm": 0.637, "tokens_per_sec": 150599, "dt_s": 4.352, "eta_s": 24205, "world_size": 1, "timestamp": "2026-05-05T01:02:33.922643"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37510, "epoch": 0, "train_loss": 3.703419715166092, "train_ppl": 40.58585939456438, "lr": 0.00056, "grad_norm": 0.6289, "tokens_per_sec": 125077, "dt_s": 5.24, "eta_s": 24259, "world_size": 1, "timestamp": "2026-05-05T01:02:39.162331"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37520, "epoch": 0, "train_loss": 3.73273067176342, "train_ppl": 41.79307566091299, "lr": 0.00056, "grad_norm": 0.6496, "tokens_per_sec": 145923, "dt_s": 4.491, "eta_s": 24416, "world_size": 1, "timestamp": "2026-05-05T01:02:43.653463"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37530, "epoch": 0, "train_loss": 3.7300266325473785, "train_ppl": 41.680218199633636, "lr": 0.00056, "grad_norm": 0.6842, "tokens_per_sec": 150336, "dt_s": 4.359, "eta_s": 24351, "world_size": 1, "timestamp": "2026-05-05T01:02:48.012765"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37540, "epoch": 0, "train_loss": 3.718748241662979, "train_ppl": 41.212773374345836, "lr": 0.00056, "grad_norm": 0.6445, "tokens_per_sec": 148540, "dt_s": 4.412, "eta_s": 24350, "world_size": 1, "timestamp": "2026-05-05T01:02:52.424754"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37550, "epoch": 0, "train_loss": 3.645924910902977, "train_ppl": 38.31819738530797, "lr": 0.00056, "grad_norm": 0.6523, "tokens_per_sec": 151920, "dt_s": 4.314, "eta_s": 24304, "world_size": 1, "timestamp": "2026-05-05T01:02:56.738616"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37560, "epoch": 0, "train_loss": 3.7095422446727753, "train_ppl": 40.83510975901417, "lr": 0.00056, "grad_norm": 0.6242, "tokens_per_sec": 149761, "dt_s": 4.376, "eta_s": 24198, "world_size": 1, "timestamp": "2026-05-05T01:03:01.114663"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37570, "epoch": 0, "train_loss": 3.8385055512189865, "train_ppl": 46.45599642982113, "lr": 0.00056, "grad_norm": 0.6666, "tokens_per_sec": 145322, "dt_s": 4.51, "eta_s": 24214, "world_size": 1, "timestamp": "2026-05-05T01:03:05.624384"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37580, "epoch": 0, "train_loss": 3.6654726713895798, "train_ppl": 39.07460124486533, "lr": 0.00056, "grad_norm": 0.651, "tokens_per_sec": 149159, "dt_s": 4.394, "eta_s": 24248, "world_size": 1, "timestamp": "2026-05-05T01:03:10.018092"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37590, "epoch": 0, "train_loss": 3.7617456167936325, "train_ppl": 43.023462946252685, "lr": 0.00056, "grad_norm": 0.642, "tokens_per_sec": 144594, "dt_s": 4.532, "eta_s": 24376, "world_size": 1, "timestamp": "2026-05-05T01:03:14.550480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37600, "epoch": 0, "train_loss": 3.8273880630731583, "train_ppl": 45.94238277720455, "lr": 0.00056, "grad_norm": 0.7435, "tokens_per_sec": 149282, "dt_s": 4.39, "eta_s": 24456, "world_size": 1, "timestamp": "2026-05-05T01:03:18.940581"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37610, "epoch": 0, "train_loss": 3.788780450820923, "train_ppl": 44.20246031725949, "lr": 0.00056, "grad_norm": 0.7261, "tokens_per_sec": 148641, "dt_s": 4.409, "eta_s": 24488, "world_size": 1, "timestamp": "2026-05-05T01:03:23.349579"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37620, "epoch": 0, "train_loss": 3.7661991715431213, "train_ppl": 43.21549759500387, "lr": 0.00056, "grad_norm": 0.6892, "tokens_per_sec": 147608, "dt_s": 4.44, "eta_s": 24406, "world_size": 1, "timestamp": "2026-05-05T01:03:27.789470"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37630, "epoch": 0, "train_loss": 3.8668689280748367, "train_ppl": 47.792509780917754, "lr": 0.00056, "grad_norm": 0.635, "tokens_per_sec": 150857, "dt_s": 4.344, "eta_s": 24347, "world_size": 1, "timestamp": "2026-05-05T01:03:32.133720"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37640, "epoch": 0, "train_loss": 3.6339893490076065, "train_ppl": 37.863566702147416, "lr": 0.00056, "grad_norm": 0.6389, "tokens_per_sec": 146930, "dt_s": 4.46, "eta_s": 24264, "world_size": 1, "timestamp": "2026-05-05T01:03:36.594062"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37650, "epoch": 0, "train_loss": 3.752157539129257, "train_ppl": 42.61292193679879, "lr": 0.00056, "grad_norm": 0.6523, "tokens_per_sec": 144622, "dt_s": 4.532, "eta_s": 24415, "world_size": 1, "timestamp": "2026-05-05T01:03:41.125581"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37660, "epoch": 0, "train_loss": 3.7268788516521454, "train_ppl": 41.5492242833123, "lr": 0.00056, "grad_norm": 0.6984, "tokens_per_sec": 148217, "dt_s": 4.422, "eta_s": 24424, "world_size": 1, "timestamp": "2026-05-05T01:03:45.547220"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37670, "epoch": 0, "train_loss": 3.7729570418596268, "train_ppl": 43.508531350264604, "lr": 0.00056, "grad_norm": 0.6856, "tokens_per_sec": 146466, "dt_s": 4.474, "eta_s": 24458, "world_size": 1, "timestamp": "2026-05-05T01:03:50.021693"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37680, "epoch": 0, "train_loss": 3.767937198281288, "train_ppl": 43.29067259448308, "lr": 0.00056, "grad_norm": 0.6327, "tokens_per_sec": 149301, "dt_s": 4.39, "eta_s": 24503, "world_size": 1, "timestamp": "2026-05-05T01:03:54.411234"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37690, "epoch": 0, "train_loss": 3.8113591074943542, "train_ppl": 45.21184488539338, "lr": 0.00056, "grad_norm": 0.6678, "tokens_per_sec": 149362, "dt_s": 4.388, "eta_s": 24419, "world_size": 1, "timestamp": "2026-05-05T01:03:58.798986"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37700, "epoch": 0, "train_loss": 3.8835456371307373, "train_ppl": 48.59621450993543, "lr": 0.00056, "grad_norm": 0.6713, "tokens_per_sec": 147071, "dt_s": 4.456, "eta_s": 24332, "world_size": 1, "timestamp": "2026-05-05T01:04:03.255060"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37710, "epoch": 0, "train_loss": 3.729495048522949, "train_ppl": 41.65806754949147, "lr": 0.00056, "grad_norm": 0.6474, "tokens_per_sec": 150465, "dt_s": 4.356, "eta_s": 24255, "world_size": 1, "timestamp": "2026-05-05T01:04:07.610641"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37720, "epoch": 0, "train_loss": 3.8116258531808853, "train_ppl": 45.223906558624805, "lr": 0.00056, "grad_norm": 0.7046, "tokens_per_sec": 151157, "dt_s": 4.336, "eta_s": 24098, "world_size": 1, "timestamp": "2026-05-05T01:04:11.946283"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37730, "epoch": 0, "train_loss": 3.817954793572426, "train_ppl": 45.511033614099205, "lr": 0.00056, "grad_norm": 0.6442, "tokens_per_sec": 147990, "dt_s": 4.428, "eta_s": 24136, "world_size": 1, "timestamp": "2026-05-05T01:04:16.374702"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37740, "epoch": 0, "train_loss": 3.814451813697815, "train_ppl": 45.35188828345904, "lr": 0.00056, "grad_norm": 0.6586, "tokens_per_sec": 134265, "dt_s": 4.881, "eta_s": 24674, "world_size": 1, "timestamp": "2026-05-05T01:04:21.255755"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37750, "epoch": 0, "train_loss": 3.8076834827661514, "train_ppl": 45.04596814739749, "lr": 0.00056, "grad_norm": 0.7039, "tokens_per_sec": 148494, "dt_s": 4.413, "eta_s": 24622, "world_size": 1, "timestamp": "2026-05-05T01:04:25.669131"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37760, "epoch": 0, "train_loss": 3.7313160449266434, "train_ppl": 41.73399585228541, "lr": 0.00056, "grad_norm": 0.6847, "tokens_per_sec": 146830, "dt_s": 4.463, "eta_s": 24736, "world_size": 1, "timestamp": "2026-05-05T01:04:30.132540"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37770, "epoch": 0, "train_loss": 3.725587159395218, "train_ppl": 41.49559011889888, "lr": 0.00056, "grad_norm": 0.6441, "tokens_per_sec": 147634, "dt_s": 4.439, "eta_s": 24845, "world_size": 1, "timestamp": "2026-05-05T01:04:34.571652"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37780, "epoch": 0, "train_loss": 3.7016358822584152, "train_ppl": 40.51352553792781, "lr": 0.00056, "grad_norm": 0.6681, "tokens_per_sec": 143906, "dt_s": 4.554, "eta_s": 24979, "world_size": 1, "timestamp": "2026-05-05T01:04:39.125733"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37790, "epoch": 0, "train_loss": 3.8518064469099045, "train_ppl": 47.078030426698675, "lr": 0.00056, "grad_norm": 0.6858, "tokens_per_sec": 148253, "dt_s": 4.421, "eta_s": 24468, "world_size": 1, "timestamp": "2026-05-05T01:04:43.546290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37800, "epoch": 0, "train_loss": 3.753839522600174, "train_ppl": 42.68465647838875, "lr": 0.00056, "grad_norm": 0.7164, "tokens_per_sec": 147097, "dt_s": 4.455, "eta_s": 24510, "world_size": 1, "timestamp": "2026-05-05T01:04:48.001592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37810, "epoch": 0, "train_loss": 3.8001300245523453, "train_ppl": 44.706997122687945, "lr": 0.00056, "grad_norm": 0.6568, "tokens_per_sec": 144300, "dt_s": 4.542, "eta_s": 24591, "world_size": 1, "timestamp": "2026-05-05T01:04:52.543241"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37820, "epoch": 0, "train_loss": 3.7274367064237595, "train_ppl": 41.5724091826377, "lr": 0.00056, "grad_norm": 0.63, "tokens_per_sec": 148604, "dt_s": 4.41, "eta_s": 24555, "world_size": 1, "timestamp": "2026-05-05T01:04:56.953367"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37830, "epoch": 0, "train_loss": 3.8086221516132355, "train_ppl": 45.088271245578284, "lr": 0.00056, "grad_norm": 0.6622, "tokens_per_sec": 149662, "dt_s": 4.379, "eta_s": 24359, "world_size": 1, "timestamp": "2026-05-05T01:05:01.332295"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37840, "epoch": 0, "train_loss": 3.799105331301689, "train_ppl": 44.66120962755423, "lr": 0.00056, "grad_norm": 0.6879, "tokens_per_sec": 145051, "dt_s": 4.518, "eta_s": 24461, "world_size": 1, "timestamp": "2026-05-05T01:05:05.850419"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37850, "epoch": 0, "train_loss": 3.763536751270294, "train_ppl": 43.100592808398304, "lr": 0.00056, "grad_norm": 0.6527, "tokens_per_sec": 149566, "dt_s": 4.382, "eta_s": 24376, "world_size": 1, "timestamp": "2026-05-05T01:05:10.232195"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37860, "epoch": 0, "train_loss": 3.707274153828621, "train_ppl": 40.74259697380923, "lr": 0.00056, "grad_norm": 0.6563, "tokens_per_sec": 148411, "dt_s": 4.416, "eta_s": 24234, "world_size": 1, "timestamp": "2026-05-05T01:05:14.648006"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37870, "epoch": 0, "train_loss": 3.6502832770347595, "train_ppl": 38.485566582206324, "lr": 0.00056, "grad_norm": 0.6836, "tokens_per_sec": 147516, "dt_s": 4.443, "eta_s": 24265, "world_size": 1, "timestamp": "2026-05-05T01:05:19.090646"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37880, "epoch": 0, "train_loss": 3.771935060620308, "train_ppl": 43.464089160884335, "lr": 0.00056, "grad_norm": 0.6311, "tokens_per_sec": 149186, "dt_s": 4.393, "eta_s": 24276, "world_size": 1, "timestamp": "2026-05-05T01:05:23.483528"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37890, "epoch": 0, "train_loss": 3.7522095888853073, "train_ppl": 42.61513998671416, "lr": 0.00056, "grad_norm": 0.678, "tokens_per_sec": 144551, "dt_s": 4.534, "eta_s": 24289, "world_size": 1, "timestamp": "2026-05-05T01:05:28.017333"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37900, "epoch": 0, "train_loss": 3.726608097553253, "train_ppl": 41.5379761833351, "lr": 0.00056, "grad_norm": 0.6231, "tokens_per_sec": 149249, "dt_s": 4.391, "eta_s": 24294, "world_size": 1, "timestamp": "2026-05-05T01:05:32.408375"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37910, "epoch": 0, "train_loss": 3.745714545249939, "train_ppl": 42.33924972250758, "lr": 0.00056, "grad_norm": 0.6546, "tokens_per_sec": 148753, "dt_s": 4.406, "eta_s": 24279, "world_size": 1, "timestamp": "2026-05-05T01:05:36.814059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37920, "epoch": 0, "train_loss": 3.778537929058075, "train_ppl": 43.752026383059125, "lr": 0.00056, "grad_norm": 0.7361, "tokens_per_sec": 146667, "dt_s": 4.468, "eta_s": 24303, "world_size": 1, "timestamp": "2026-05-05T01:05:41.282400"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37930, "epoch": 0, "train_loss": 3.706969290971756, "train_ppl": 40.73017796244353, "lr": 0.00056, "grad_norm": 0.674, "tokens_per_sec": 149677, "dt_s": 4.378, "eta_s": 24282, "world_size": 1, "timestamp": "2026-05-05T01:05:45.660913"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37940, "epoch": 0, "train_loss": 3.698437049984932, "train_ppl": 40.38413662197588, "lr": 0.00056, "grad_norm": 0.6592, "tokens_per_sec": 147298, "dt_s": 4.449, "eta_s": 24185, "world_size": 1, "timestamp": "2026-05-05T01:05:50.110121"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37950, "epoch": 0, "train_loss": 3.7984130531549454, "train_ppl": 44.63030234757354, "lr": 0.00056, "grad_norm": 0.6541, "tokens_per_sec": 147021, "dt_s": 4.458, "eta_s": 24254, "world_size": 1, "timestamp": "2026-05-05T01:05:54.567709"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37960, "epoch": 0, "train_loss": 3.6805702447891235, "train_ppl": 39.669008669515236, "lr": 0.00056, "grad_norm": 0.6454, "tokens_per_sec": 149996, "dt_s": 4.369, "eta_s": 24209, "world_size": 1, "timestamp": "2026-05-05T01:05:58.936896"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37970, "epoch": 0, "train_loss": 3.7097502648830414, "train_ppl": 40.84360517071052, "lr": 0.00056, "grad_norm": 0.6455, "tokens_per_sec": 149433, "dt_s": 4.386, "eta_s": 24114, "world_size": 1, "timestamp": "2026-05-05T01:06:03.322533"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37980, "epoch": 0, "train_loss": 3.670637384057045, "train_ppl": 39.276932374211704, "lr": 0.00056, "grad_norm": 0.6555, "tokens_per_sec": 147728, "dt_s": 4.436, "eta_s": 24173, "world_size": 1, "timestamp": "2026-05-05T01:06:07.758777"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 37990, "epoch": 0, "train_loss": 3.7391377687454224, "train_ppl": 42.0617076063476, "lr": 0.00056, "grad_norm": 0.6162, "tokens_per_sec": 148856, "dt_s": 4.403, "eta_s": 24118, "world_size": 1, "timestamp": "2026-05-05T01:06:12.161454"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38000, "epoch": 0, "train_loss": 3.756359189748764, "train_ppl": 42.792343215460306, "lr": 0.00056, "grad_norm": 0.6424, "tokens_per_sec": 145476, "dt_s": 4.505, "eta_s": 24165, "world_size": 1, "timestamp": "2026-05-05T01:06:16.666358"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38010, "epoch": 0, "train_loss": 3.7622647285461426, "train_ppl": 43.04580272942209, "lr": 0.00056, "grad_norm": 0.6746, "tokens_per_sec": 126055, "dt_s": 5.199, "eta_s": 24217, "world_size": 1, "timestamp": "2026-05-05T01:06:21.865387"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38020, "epoch": 0, "train_loss": 3.7437689304351807, "train_ppl": 42.2569539349138, "lr": 0.00056, "grad_norm": 0.6896, "tokens_per_sec": 145654, "dt_s": 4.499, "eta_s": 24337, "world_size": 1, "timestamp": "2026-05-05T01:06:26.364779"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38030, "epoch": 0, "train_loss": 3.750930666923523, "train_ppl": 42.5606733849712, "lr": 0.00056, "grad_norm": 0.664, "tokens_per_sec": 146725, "dt_s": 4.467, "eta_s": 24365, "world_size": 1, "timestamp": "2026-05-05T01:06:30.831404"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38040, "epoch": 0, "train_loss": 3.668514609336853, "train_ppl": 39.19364472681295, "lr": 0.00056, "grad_norm": 0.6508, "tokens_per_sec": 132255, "dt_s": 4.955, "eta_s": 24965, "world_size": 1, "timestamp": "2026-05-05T01:06:35.786658"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38050, "epoch": 0, "train_loss": 3.760345235466957, "train_ppl": 42.963255858413795, "lr": 0.00056, "grad_norm": 0.6287, "tokens_per_sec": 145321, "dt_s": 4.51, "eta_s": 24966, "world_size": 1, "timestamp": "2026-05-05T01:06:40.296398"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38060, "epoch": 0, "train_loss": 3.795681521296501, "train_ppl": 44.50855960264854, "lr": 0.00056, "grad_norm": 0.6828, "tokens_per_sec": 147826, "dt_s": 4.433, "eta_s": 24975, "world_size": 1, "timestamp": "2026-05-05T01:06:44.729739"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38070, "epoch": 0, "train_loss": 3.795060634613037, "train_ppl": 44.480933407946786, "lr": 0.00056, "grad_norm": 0.6111, "tokens_per_sec": 148703, "dt_s": 4.407, "eta_s": 24870, "world_size": 1, "timestamp": "2026-05-05T01:06:49.136903"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38080, "epoch": 0, "train_loss": 3.780796155333519, "train_ppl": 43.85094000127223, "lr": 0.00056, "grad_norm": 0.6514, "tokens_per_sec": 147535, "dt_s": 4.442, "eta_s": 24839, "world_size": 1, "timestamp": "2026-05-05T01:06:53.578945"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38090, "epoch": 0, "train_loss": 3.67256236076355, "train_ppl": 39.35261237188178, "lr": 0.00056, "grad_norm": 0.6771, "tokens_per_sec": 150999, "dt_s": 4.34, "eta_s": 24162, "world_size": 1, "timestamp": "2026-05-05T01:06:57.919118"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38100, "epoch": 0, "train_loss": 3.7257519215345383, "train_ppl": 41.502427584361484, "lr": 0.00056, "grad_norm": 0.6868, "tokens_per_sec": 151538, "dt_s": 4.325, "eta_s": 23956, "world_size": 1, "timestamp": "2026-05-05T01:07:02.243881"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38110, "epoch": 0, "train_loss": 3.8913988023996353, "train_ppl": 48.979351062186495, "lr": 0.00056, "grad_norm": 0.6633, "tokens_per_sec": 146681, "dt_s": 4.468, "eta_s": 23989, "world_size": 1, "timestamp": "2026-05-05T01:07:06.711772"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38120, "epoch": 0, "train_loss": 3.7497213035821915, "train_ppl": 42.50923317801684, "lr": 0.00056, "grad_norm": 0.713, "tokens_per_sec": 147392, "dt_s": 4.446, "eta_s": 24028, "world_size": 1, "timestamp": "2026-05-05T01:07:11.158158"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38130, "epoch": 0, "train_loss": 3.6784134954214096, "train_ppl": 39.58354475539922, "lr": 0.00056, "grad_norm": 0.6691, "tokens_per_sec": 147727, "dt_s": 4.436, "eta_s": 24017, "world_size": 1, "timestamp": "2026-05-05T01:07:15.594438"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38140, "epoch": 0, "train_loss": 3.8209158629179, "train_ppl": 45.645994656506765, "lr": 0.00056, "grad_norm": 0.768, "tokens_per_sec": 145701, "dt_s": 4.498, "eta_s": 24185, "world_size": 1, "timestamp": "2026-05-05T01:07:20.092408"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38150, "epoch": 0, "train_loss": 3.701802983880043, "train_ppl": 40.52029597940318, "lr": 0.00056, "grad_norm": 0.7779, "tokens_per_sec": 147961, "dt_s": 4.429, "eta_s": 24294, "world_size": 1, "timestamp": "2026-05-05T01:07:24.521700"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38160, "epoch": 0, "train_loss": 3.8068681210279465, "train_ppl": 45.009254358053234, "lr": 0.00056, "grad_norm": 0.7446, "tokens_per_sec": 147872, "dt_s": 4.432, "eta_s": 24251, "world_size": 1, "timestamp": "2026-05-05T01:07:28.953638"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38170, "epoch": 0, "train_loss": 3.7571368664503098, "train_ppl": 42.82563476713773, "lr": 0.00056, "grad_norm": 0.6775, "tokens_per_sec": 149714, "dt_s": 4.377, "eta_s": 24171, "world_size": 1, "timestamp": "2026-05-05T01:07:33.331039"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38180, "epoch": 0, "train_loss": 3.762129455804825, "train_ppl": 43.03998019950833, "lr": 0.00056, "grad_norm": 0.9583, "tokens_per_sec": 149359, "dt_s": 4.388, "eta_s": 24114, "world_size": 1, "timestamp": "2026-05-05T01:07:37.718857"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38190, "epoch": 0, "train_loss": 3.8250185400247574, "train_ppl": 45.83365011546412, "lr": 0.00056, "grad_norm": 0.6528, "tokens_per_sec": 146506, "dt_s": 4.473, "eta_s": 24082, "world_size": 1, "timestamp": "2026-05-05T01:07:42.192114"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38200, "epoch": 0, "train_loss": 3.793234810233116, "train_ppl": 44.39979313177713, "lr": 0.00056, "grad_norm": 0.6799, "tokens_per_sec": 152210, "dt_s": 4.306, "eta_s": 23943, "world_size": 1, "timestamp": "2026-05-05T01:07:46.497732"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38210, "epoch": 0, "train_loss": 3.7717949002981186, "train_ppl": 43.45799764704812, "lr": 0.00056, "grad_norm": 0.6546, "tokens_per_sec": 150098, "dt_s": 4.366, "eta_s": 23867, "world_size": 1, "timestamp": "2026-05-05T01:07:50.863956"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38220, "epoch": 0, "train_loss": 3.710624247789383, "train_ppl": 40.87931738712321, "lr": 0.00056, "grad_norm": 0.6232, "tokens_per_sec": 148484, "dt_s": 4.414, "eta_s": 23902, "world_size": 1, "timestamp": "2026-05-05T01:07:55.277623"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38230, "epoch": 0, "train_loss": 3.6566507816314697, "train_ppl": 38.7314054637999, "lr": 0.00056, "grad_norm": 0.6583, "tokens_per_sec": 149042, "dt_s": 4.397, "eta_s": 23908, "world_size": 1, "timestamp": "2026-05-05T01:07:59.674795"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38240, "epoch": 0, "train_loss": 3.9132553189992905, "train_ppl": 50.06165365907536, "lr": 0.00056, "grad_norm": 0.7074, "tokens_per_sec": 147553, "dt_s": 4.442, "eta_s": 23869, "world_size": 1, "timestamp": "2026-05-05T01:08:04.116308"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38250, "epoch": 0, "train_loss": 3.7802681624889374, "train_ppl": 43.827793129952326, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 149127, "dt_s": 4.395, "eta_s": 23962, "world_size": 1, "timestamp": "2026-05-05T01:08:08.510936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38260, "epoch": 0, "train_loss": 3.6530944854021072, "train_ppl": 38.59390974525557, "lr": 0.00056, "grad_norm": 0.6377, "tokens_per_sec": 148684, "dt_s": 4.408, "eta_s": 24003, "world_size": 1, "timestamp": "2026-05-05T01:08:12.918663"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38270, "epoch": 0, "train_loss": 3.7604682445526123, "train_ppl": 42.9685410542906, "lr": 0.00056, "grad_norm": 0.6915, "tokens_per_sec": 146499, "dt_s": 4.473, "eta_s": 24063, "world_size": 1, "timestamp": "2026-05-05T01:08:17.392129"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38280, "epoch": 0, "train_loss": 3.7635557651519775, "train_ppl": 43.10141232576152, "lr": 0.00056, "grad_norm": 0.7055, "tokens_per_sec": 148293, "dt_s": 4.419, "eta_s": 24083, "world_size": 1, "timestamp": "2026-05-05T01:08:21.811498"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38290, "epoch": 0, "train_loss": 3.800283759832382, "train_ppl": 44.71387069375186, "lr": 0.00056, "grad_norm": 0.72, "tokens_per_sec": 151644, "dt_s": 4.322, "eta_s": 23948, "world_size": 1, "timestamp": "2026-05-05T01:08:26.133203"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38300, "epoch": 0, "train_loss": 3.702715888619423, "train_ppl": 40.55730403949038, "lr": 0.00056, "grad_norm": 0.6315, "tokens_per_sec": 148259, "dt_s": 4.42, "eta_s": 23972, "world_size": 1, "timestamp": "2026-05-05T01:08:30.553565"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38310, "epoch": 0, "train_loss": 3.773453176021576, "train_ppl": 43.53012277468245, "lr": 0.00056, "grad_norm": 0.6205, "tokens_per_sec": 152257, "dt_s": 4.304, "eta_s": 23855, "world_size": 1, "timestamp": "2026-05-05T01:08:34.857866"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38320, "epoch": 0, "train_loss": 3.748185768723488, "train_ppl": 42.44400885857916, "lr": 0.00056, "grad_norm": 0.6801, "tokens_per_sec": 150862, "dt_s": 4.344, "eta_s": 23710, "world_size": 1, "timestamp": "2026-05-05T01:08:39.201989"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38330, "epoch": 0, "train_loss": 3.8157410621643066, "train_ppl": 45.41039584316773, "lr": 0.00056, "grad_norm": 0.7057, "tokens_per_sec": 133963, "dt_s": 4.892, "eta_s": 24219, "world_size": 1, "timestamp": "2026-05-05T01:08:44.094086"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38340, "epoch": 0, "train_loss": 3.8586476892232895, "train_ppl": 47.401206844118434, "lr": 0.00056, "grad_norm": 0.6775, "tokens_per_sec": 151465, "dt_s": 4.327, "eta_s": 24221, "world_size": 1, "timestamp": "2026-05-05T01:08:48.420919"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38350, "epoch": 0, "train_loss": 3.6692813634872437, "train_ppl": 39.22370814072698, "lr": 0.00056, "grad_norm": 0.6313, "tokens_per_sec": 147884, "dt_s": 4.432, "eta_s": 24228, "world_size": 1, "timestamp": "2026-05-05T01:08:52.852486"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38360, "epoch": 0, "train_loss": 3.8194106072187424, "train_ppl": 45.57733744919622, "lr": 0.00056, "grad_norm": 0.6858, "tokens_per_sec": 152509, "dt_s": 4.297, "eta_s": 24216, "world_size": 1, "timestamp": "2026-05-05T01:08:57.149685"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38370, "epoch": 0, "train_loss": 3.804092511534691, "train_ppl": 44.8844994599052, "lr": 0.00056, "grad_norm": 0.6584, "tokens_per_sec": 151590, "dt_s": 4.323, "eta_s": 24189, "world_size": 1, "timestamp": "2026-05-05T01:09:01.472938"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38380, "epoch": 0, "train_loss": 3.8058592826128006, "train_ppl": 44.96387018971515, "lr": 0.00056, "grad_norm": 0.6789, "tokens_per_sec": 149781, "dt_s": 4.375, "eta_s": 23623, "world_size": 1, "timestamp": "2026-05-05T01:09:05.848418"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38390, "epoch": 0, "train_loss": 3.836148500442505, "train_ppl": 46.34662623355256, "lr": 0.00056, "grad_norm": 0.7841, "tokens_per_sec": 152628, "dt_s": 4.294, "eta_s": 23583, "world_size": 1, "timestamp": "2026-05-05T01:09:10.142238"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38400, "epoch": 0, "train_loss": 3.7767500579357147, "train_ppl": 43.67387328319722, "lr": 0.00056, "grad_norm": 0.6975, "tokens_per_sec": 153352, "dt_s": 4.274, "eta_s": 23407, "world_size": 1, "timestamp": "2026-05-05T01:09:14.415786"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38410, "epoch": 0, "train_loss": 3.674509897828102, "train_ppl": 39.429327721808875, "lr": 0.00056, "grad_norm": 0.6534, "tokens_per_sec": 149329, "dt_s": 4.389, "eta_s": 23502, "world_size": 1, "timestamp": "2026-05-05T01:09:18.804497"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38420, "epoch": 0, "train_loss": 3.6947665065526962, "train_ppl": 40.23617660748199, "lr": 0.00056, "grad_norm": 0.6983, "tokens_per_sec": 149213, "dt_s": 4.392, "eta_s": 23573, "world_size": 1, "timestamp": "2026-05-05T01:09:23.196589"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38430, "epoch": 0, "train_loss": 3.7612425088882446, "train_ppl": 43.001822946011316, "lr": 0.00056, "grad_norm": 0.6767, "tokens_per_sec": 150901, "dt_s": 4.343, "eta_s": 23533, "world_size": 1, "timestamp": "2026-05-05T01:09:27.539578"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38440, "epoch": 0, "train_loss": 3.7635359466075897, "train_ppl": 43.10055812697268, "lr": 0.00056, "grad_norm": 0.6511, "tokens_per_sec": 147478, "dt_s": 4.444, "eta_s": 23691, "world_size": 1, "timestamp": "2026-05-05T01:09:31.983385"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38450, "epoch": 0, "train_loss": 3.8390802294015884, "train_ppl": 46.482701350052935, "lr": 0.00056, "grad_norm": 0.6336, "tokens_per_sec": 153811, "dt_s": 4.261, "eta_s": 23673, "world_size": 1, "timestamp": "2026-05-05T01:09:36.244189"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38460, "epoch": 0, "train_loss": 3.7649411261081696, "train_ppl": 43.16116471931437, "lr": 0.00056, "grad_norm": 0.6946, "tokens_per_sec": 149585, "dt_s": 4.381, "eta_s": 23661, "world_size": 1, "timestamp": "2026-05-05T01:09:40.625356"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38470, "epoch": 0, "train_loss": 3.748974949121475, "train_ppl": 42.477518059045295, "lr": 0.00056, "grad_norm": 0.6236, "tokens_per_sec": 152563, "dt_s": 4.296, "eta_s": 23552, "world_size": 1, "timestamp": "2026-05-05T01:09:44.921015"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38480, "epoch": 0, "train_loss": 3.7447817027568817, "train_ppl": 42.2997722872196, "lr": 0.00056, "grad_norm": 0.7767, "tokens_per_sec": 151332, "dt_s": 4.331, "eta_s": 23534, "world_size": 1, "timestamp": "2026-05-05T01:09:49.251631"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38490, "epoch": 0, "train_loss": 3.6691294610500336, "train_ppl": 39.217750416371864, "lr": 0.00056, "grad_norm": 0.7151, "tokens_per_sec": 147757, "dt_s": 4.435, "eta_s": 23521, "world_size": 1, "timestamp": "2026-05-05T01:09:53.687018"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38500, "epoch": 0, "train_loss": 3.651390701532364, "train_ppl": 38.52821004929557, "lr": 0.00056, "grad_norm": 0.6524, "tokens_per_sec": 150360, "dt_s": 4.359, "eta_s": 23622, "world_size": 1, "timestamp": "2026-05-05T01:09:58.045638"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38510, "epoch": 0, "train_loss": 3.8050669878721237, "train_ppl": 44.9282596607236, "lr": 0.00056, "grad_norm": 0.6903, "tokens_per_sec": 128452, "dt_s": 5.102, "eta_s": 23562, "world_size": 1, "timestamp": "2026-05-05T01:10:03.147576"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38520, "epoch": 0, "train_loss": 3.787470296025276, "train_ppl": 44.14458617221822, "lr": 0.00056, "grad_norm": 0.6808, "tokens_per_sec": 149722, "dt_s": 4.377, "eta_s": 23646, "world_size": 1, "timestamp": "2026-05-05T01:10:07.524769"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38530, "epoch": 0, "train_loss": 3.796423837542534, "train_ppl": 44.541611295415635, "lr": 0.00056, "grad_norm": 0.6544, "tokens_per_sec": 151687, "dt_s": 4.32, "eta_s": 23631, "world_size": 1, "timestamp": "2026-05-05T01:10:11.845280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38540, "epoch": 0, "train_loss": 3.7545875757932663, "train_ppl": 42.7165988177584, "lr": 0.00056, "grad_norm": 0.7008, "tokens_per_sec": 147080, "dt_s": 4.456, "eta_s": 23648, "world_size": 1, "timestamp": "2026-05-05T01:10:16.301040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38550, "epoch": 0, "train_loss": 3.726871505379677, "train_ppl": 41.54891905251101, "lr": 0.00056, "grad_norm": 0.644, "tokens_per_sec": 147785, "dt_s": 4.435, "eta_s": 23726, "world_size": 1, "timestamp": "2026-05-05T01:10:20.735601"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38560, "epoch": 0, "train_loss": 3.7873464226722717, "train_ppl": 44.139118172988766, "lr": 0.00056, "grad_norm": 0.6866, "tokens_per_sec": 152833, "dt_s": 4.288, "eta_s": 23677, "world_size": 1, "timestamp": "2026-05-05T01:10:25.023663"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38570, "epoch": 0, "train_loss": 3.8403297811746597, "train_ppl": 46.54082019563318, "lr": 0.00056, "grad_norm": 0.707, "tokens_per_sec": 148466, "dt_s": 4.414, "eta_s": 23713, "world_size": 1, "timestamp": "2026-05-05T01:10:29.437860"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38580, "epoch": 0, "train_loss": 3.8000157922506332, "train_ppl": 44.701890431184104, "lr": 0.00056, "grad_norm": 0.7999, "tokens_per_sec": 150198, "dt_s": 4.363, "eta_s": 23755, "world_size": 1, "timestamp": "2026-05-05T01:10:33.801162"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38590, "epoch": 0, "train_loss": 3.8032635003328323, "train_ppl": 44.847305126452845, "lr": 0.00056, "grad_norm": 0.6471, "tokens_per_sec": 152147, "dt_s": 4.307, "eta_s": 23590, "world_size": 1, "timestamp": "2026-05-05T01:10:38.108591"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38600, "epoch": 0, "train_loss": 3.7910753935575485, "train_ppl": 44.30401892352471, "lr": 0.00056, "grad_norm": 0.6801, "tokens_per_sec": 149047, "dt_s": 4.397, "eta_s": 23545, "world_size": 1, "timestamp": "2026-05-05T01:10:42.505608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38610, "epoch": 0, "train_loss": 3.701974168419838, "train_ppl": 40.527233021362996, "lr": 0.00056, "grad_norm": 0.6131, "tokens_per_sec": 151755, "dt_s": 4.319, "eta_s": 23573, "world_size": 1, "timestamp": "2026-05-05T01:10:46.824148"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38620, "epoch": 0, "train_loss": 3.746766671538353, "train_ppl": 42.38381940252692, "lr": 0.00056, "grad_norm": 0.6668, "tokens_per_sec": 151642, "dt_s": 4.322, "eta_s": 23469, "world_size": 1, "timestamp": "2026-05-05T01:10:51.145932"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38630, "epoch": 0, "train_loss": 3.780363231897354, "train_ppl": 43.83196001038569, "lr": 0.00056, "grad_norm": 0.7543, "tokens_per_sec": 133675, "dt_s": 4.903, "eta_s": 24048, "world_size": 1, "timestamp": "2026-05-05T01:10:56.048565"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38640, "epoch": 0, "train_loss": 3.689257174730301, "train_ppl": 40.01511167847117, "lr": 0.00056, "grad_norm": 0.7516, "tokens_per_sec": 148290, "dt_s": 4.419, "eta_s": 24164, "world_size": 1, "timestamp": "2026-05-05T01:11:00.468016"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38650, "epoch": 0, "train_loss": 3.7282073497772217, "train_ppl": 41.60445903137699, "lr": 0.00056, "grad_norm": 0.61, "tokens_per_sec": 148890, "dt_s": 4.402, "eta_s": 24165, "world_size": 1, "timestamp": "2026-05-05T01:11:04.869654"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38660, "epoch": 0, "train_loss": 3.754798710346222, "train_ppl": 42.72561871992658, "lr": 0.00056, "grad_norm": 0.6777, "tokens_per_sec": 150155, "dt_s": 4.365, "eta_s": 24210, "world_size": 1, "timestamp": "2026-05-05T01:11:09.234198"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38670, "epoch": 0, "train_loss": 3.727869361639023, "train_ppl": 41.590399593823605, "lr": 0.00056, "grad_norm": 0.656, "tokens_per_sec": 150364, "dt_s": 4.358, "eta_s": 24245, "world_size": 1, "timestamp": "2026-05-05T01:11:13.592689"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38680, "epoch": 0, "train_loss": 3.782640352845192, "train_ppl": 43.931884411505635, "lr": 0.00056, "grad_norm": 0.6764, "tokens_per_sec": 148359, "dt_s": 4.417, "eta_s": 23717, "world_size": 1, "timestamp": "2026-05-05T01:11:18.010063"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38690, "epoch": 0, "train_loss": 3.8032609671354294, "train_ppl": 44.84719151951986, "lr": 0.00056, "grad_norm": 0.6696, "tokens_per_sec": 149377, "dt_s": 4.387, "eta_s": 23677, "world_size": 1, "timestamp": "2026-05-05T01:11:22.397341"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38700, "epoch": 0, "train_loss": 3.761535570025444, "train_ppl": 43.01442695592805, "lr": 0.00056, "grad_norm": 0.6588, "tokens_per_sec": 149611, "dt_s": 4.38, "eta_s": 23650, "world_size": 1, "timestamp": "2026-05-05T01:11:26.777769"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38710, "epoch": 0, "train_loss": 3.8000592291355133, "train_ppl": 44.70383218422423, "lr": 0.00056, "grad_norm": 0.7109, "tokens_per_sec": 147690, "dt_s": 4.437, "eta_s": 23724, "world_size": 1, "timestamp": "2026-05-05T01:11:31.215160"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38720, "epoch": 0, "train_loss": 3.708217039704323, "train_ppl": 40.78103070950111, "lr": 0.00056, "grad_norm": 0.7037, "tokens_per_sec": 150162, "dt_s": 4.364, "eta_s": 23726, "world_size": 1, "timestamp": "2026-05-05T01:11:35.579516"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38730, "epoch": 0, "train_loss": 3.785204201936722, "train_ppl": 44.044663646134545, "lr": 0.00056, "grad_norm": 0.667, "tokens_per_sec": 146481, "dt_s": 4.474, "eta_s": 23783, "world_size": 1, "timestamp": "2026-05-05T01:11:40.053558"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38740, "epoch": 0, "train_loss": 3.730317145586014, "train_ppl": 41.692328605504024, "lr": 0.00056, "grad_norm": 0.6303, "tokens_per_sec": 148471, "dt_s": 4.414, "eta_s": 23808, "world_size": 1, "timestamp": "2026-05-05T01:11:44.467608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38750, "epoch": 0, "train_loss": 3.8556722551584244, "train_ppl": 47.260377296974106, "lr": 0.00056, "grad_norm": 0.761, "tokens_per_sec": 146802, "dt_s": 4.464, "eta_s": 23894, "world_size": 1, "timestamp": "2026-05-05T01:11:48.931861"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38760, "epoch": 0, "train_loss": 3.686653435230255, "train_ppl": 39.91105827435987, "lr": 0.00056, "grad_norm": 0.6478, "tokens_per_sec": 146717, "dt_s": 4.467, "eta_s": 23921, "world_size": 1, "timestamp": "2026-05-05T01:11:53.398667"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38770, "epoch": 0, "train_loss": 3.8313930928707123, "train_ppl": 46.12675234556076, "lr": 0.00056, "grad_norm": 0.6423, "tokens_per_sec": 147953, "dt_s": 4.43, "eta_s": 23987, "world_size": 1, "timestamp": "2026-05-05T01:11:57.828203"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38780, "epoch": 0, "train_loss": 3.7863367199897766, "train_ppl": 44.09457327930031, "lr": 0.00056, "grad_norm": 0.6202, "tokens_per_sec": 150203, "dt_s": 4.363, "eta_s": 23863, "world_size": 1, "timestamp": "2026-05-05T01:12:02.191392"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38790, "epoch": 0, "train_loss": 3.8180982917547226, "train_ppl": 45.51756483329511, "lr": 0.00056, "grad_norm": 0.6515, "tokens_per_sec": 148951, "dt_s": 4.4, "eta_s": 23843, "world_size": 1, "timestamp": "2026-05-05T01:12:06.591214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38800, "epoch": 0, "train_loss": 3.7914379984140396, "train_ppl": 44.32008668889728, "lr": 0.00056, "grad_norm": 0.621, "tokens_per_sec": 146925, "dt_s": 4.461, "eta_s": 23835, "world_size": 1, "timestamp": "2026-05-05T01:12:11.051752"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38810, "epoch": 0, "train_loss": 3.861391991376877, "train_ppl": 47.53146873538547, "lr": 0.00056, "grad_norm": 0.6835, "tokens_per_sec": 150446, "dt_s": 4.356, "eta_s": 23711, "world_size": 1, "timestamp": "2026-05-05T01:12:15.407838"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38820, "epoch": 0, "train_loss": 3.700563818216324, "train_ppl": 40.47011571719184, "lr": 0.00056, "grad_norm": 0.6439, "tokens_per_sec": 145849, "dt_s": 4.493, "eta_s": 23775, "world_size": 1, "timestamp": "2026-05-05T01:12:19.901261"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38830, "epoch": 0, "train_loss": 3.768559083342552, "train_ppl": 43.31760278993687, "lr": 0.00056, "grad_norm": 0.6497, "tokens_per_sec": 150370, "dt_s": 4.358, "eta_s": 23766, "world_size": 1, "timestamp": "2026-05-05T01:12:24.259570"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38840, "epoch": 0, "train_loss": 3.781701371073723, "train_ppl": 43.89065253387716, "lr": 0.00056, "grad_norm": 0.6433, "tokens_per_sec": 150994, "dt_s": 4.34, "eta_s": 23697, "world_size": 1, "timestamp": "2026-05-05T01:12:28.599903"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38850, "epoch": 0, "train_loss": 3.7539299726486206, "train_ppl": 42.68851748224651, "lr": 0.00056, "grad_norm": 0.6484, "tokens_per_sec": 147734, "dt_s": 4.436, "eta_s": 23666, "world_size": 1, "timestamp": "2026-05-05T01:12:33.035983"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38860, "epoch": 0, "train_loss": 3.820037618279457, "train_ppl": 45.60592390497023, "lr": 0.00056, "grad_norm": 0.6546, "tokens_per_sec": 151878, "dt_s": 4.315, "eta_s": 23618, "world_size": 1, "timestamp": "2026-05-05T01:12:37.351013"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38870, "epoch": 0, "train_loss": 3.7757037729024887, "train_ppl": 43.628201860067655, "lr": 0.00056, "grad_norm": 0.6519, "tokens_per_sec": 145133, "dt_s": 4.516, "eta_s": 23637, "world_size": 1, "timestamp": "2026-05-05T01:12:41.866581"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38880, "epoch": 0, "train_loss": 3.747514382004738, "train_ppl": 42.41552207863392, "lr": 0.00056, "grad_norm": 0.6903, "tokens_per_sec": 148059, "dt_s": 4.426, "eta_s": 23706, "world_size": 1, "timestamp": "2026-05-05T01:12:46.292938"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38890, "epoch": 0, "train_loss": 3.84846094250679, "train_ppl": 46.920793833128805, "lr": 0.00056, "grad_norm": 0.6893, "tokens_per_sec": 150178, "dt_s": 4.364, "eta_s": 23727, "world_size": 1, "timestamp": "2026-05-05T01:12:50.656827"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38900, "epoch": 0, "train_loss": 3.7252114713191986, "train_ppl": 41.48000364849269, "lr": 0.00056, "grad_norm": 0.6147, "tokens_per_sec": 149004, "dt_s": 4.398, "eta_s": 23682, "world_size": 1, "timestamp": "2026-05-05T01:12:55.055101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38910, "epoch": 0, "train_loss": 3.675429090857506, "train_ppl": 39.46558754734124, "lr": 0.00056, "grad_norm": 0.6224, "tokens_per_sec": 150951, "dt_s": 4.342, "eta_s": 23706, "world_size": 1, "timestamp": "2026-05-05T01:12:59.396648"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38920, "epoch": 0, "train_loss": 3.790262147784233, "train_ppl": 44.26800351406898, "lr": 0.00056, "grad_norm": 0.6591, "tokens_per_sec": 133185, "dt_s": 4.921, "eta_s": 24137, "world_size": 1, "timestamp": "2026-05-05T01:13:04.317363"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38930, "epoch": 0, "train_loss": 3.760087698698044, "train_ppl": 42.95219266496872, "lr": 0.00056, "grad_norm": 0.7155, "tokens_per_sec": 147420, "dt_s": 4.446, "eta_s": 24154, "world_size": 1, "timestamp": "2026-05-05T01:13:08.762889"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38940, "epoch": 0, "train_loss": 3.766599252820015, "train_ppl": 43.23279076556948, "lr": 0.00056, "grad_norm": 0.6734, "tokens_per_sec": 149774, "dt_s": 4.376, "eta_s": 24162, "world_size": 1, "timestamp": "2026-05-05T01:13:13.138539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38950, "epoch": 0, "train_loss": 3.8278473019599915, "train_ppl": 45.96348615130374, "lr": 0.00056, "grad_norm": 0.6811, "tokens_per_sec": 145756, "dt_s": 4.496, "eta_s": 24263, "world_size": 1, "timestamp": "2026-05-05T01:13:17.634830"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38960, "epoch": 0, "train_loss": 3.6754970401525497, "train_ppl": 39.46826929730405, "lr": 0.00056, "grad_norm": 0.723, "tokens_per_sec": 147035, "dt_s": 4.457, "eta_s": 24382, "world_size": 1, "timestamp": "2026-05-05T01:13:22.092006"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38970, "epoch": 0, "train_loss": 3.7793684601783752, "train_ppl": 43.788378896403664, "lr": 0.00056, "grad_norm": 0.665, "tokens_per_sec": 148672, "dt_s": 4.408, "eta_s": 23827, "world_size": 1, "timestamp": "2026-05-05T01:13:26.500118"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38980, "epoch": 0, "train_loss": 3.7547855526208878, "train_ppl": 42.72505655166916, "lr": 0.00056, "grad_norm": 0.6087, "tokens_per_sec": 144580, "dt_s": 4.533, "eta_s": 23916, "world_size": 1, "timestamp": "2026-05-05T01:13:31.032974"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 38990, "epoch": 0, "train_loss": 3.8483914732933044, "train_ppl": 46.91753439570168, "lr": 0.00056, "grad_norm": 0.6508, "tokens_per_sec": 147978, "dt_s": 4.429, "eta_s": 23969, "world_size": 1, "timestamp": "2026-05-05T01:13:35.461764"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39000, "epoch": 0, "train_loss": 3.7934670746326447, "train_ppl": 44.410106820773095, "lr": 0.00056, "grad_norm": 0.6739, "tokens_per_sec": 147310, "dt_s": 4.449, "eta_s": 23913, "world_size": 1, "timestamp": "2026-05-05T01:13:39.910579"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39010, "epoch": 0, "train_loss": 3.8187104761600494, "train_ppl": 45.545438507698385, "lr": 0.00056, "grad_norm": 0.6582, "tokens_per_sec": 122283, "dt_s": 5.359, "eta_s": 24038, "world_size": 1, "timestamp": "2026-05-05T01:13:45.269952"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39020, "epoch": 0, "train_loss": 3.8361312448978424, "train_ppl": 46.34582650417354, "lr": 0.00056, "grad_norm": 0.6984, "tokens_per_sec": 146997, "dt_s": 4.458, "eta_s": 24088, "world_size": 1, "timestamp": "2026-05-05T01:13:49.728271"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39030, "epoch": 0, "train_loss": 3.7763536125421524, "train_ppl": 43.65656240894997, "lr": 0.00056, "grad_norm": 0.655, "tokens_per_sec": 143371, "dt_s": 4.571, "eta_s": 24124, "world_size": 1, "timestamp": "2026-05-05T01:13:54.299366"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39040, "epoch": 0, "train_loss": 3.786979764699936, "train_ppl": 44.12293718004508, "lr": 0.00056, "grad_norm": 0.6744, "tokens_per_sec": 145689, "dt_s": 4.498, "eta_s": 24194, "world_size": 1, "timestamp": "2026-05-05T01:13:58.797697"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39050, "epoch": 0, "train_loss": 3.7426562011241913, "train_ppl": 42.209959534549576, "lr": 0.00056, "grad_norm": 0.6857, "tokens_per_sec": 145245, "dt_s": 4.512, "eta_s": 24258, "world_size": 1, "timestamp": "2026-05-05T01:14:03.309792"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39060, "epoch": 0, "train_loss": 3.783392772078514, "train_ppl": 43.96495204509463, "lr": 0.00056, "grad_norm": 0.7159, "tokens_per_sec": 143511, "dt_s": 4.567, "eta_s": 24241, "world_size": 1, "timestamp": "2026-05-05T01:14:07.876416"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39070, "epoch": 0, "train_loss": 3.869944855570793, "train_ppl": 47.93974239819686, "lr": 0.00056, "grad_norm": 0.6314, "tokens_per_sec": 147449, "dt_s": 4.445, "eta_s": 24222, "world_size": 1, "timestamp": "2026-05-05T01:14:12.321052"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39080, "epoch": 0, "train_loss": 3.8016438335180283, "train_ppl": 44.774726227289825, "lr": 0.00056, "grad_norm": 0.6666, "tokens_per_sec": 147267, "dt_s": 4.45, "eta_s": 24088, "world_size": 1, "timestamp": "2026-05-05T01:14:16.771271"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39090, "epoch": 0, "train_loss": 3.7522206753492355, "train_ppl": 42.61561244054534, "lr": 0.00056, "grad_norm": 0.6922, "tokens_per_sec": 144799, "dt_s": 4.526, "eta_s": 24113, "world_size": 1, "timestamp": "2026-05-05T01:14:21.297185"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39100, "epoch": 0, "train_loss": 3.8104759007692337, "train_ppl": 45.17193110858812, "lr": 0.00056, "grad_norm": 0.6629, "tokens_per_sec": 148101, "dt_s": 4.425, "eta_s": 24015, "world_size": 1, "timestamp": "2026-05-05T01:14:25.722310"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39110, "epoch": 0, "train_loss": 3.8858969062566757, "train_ppl": 48.71061172536706, "lr": 0.00056, "grad_norm": 0.6754, "tokens_per_sec": 147355, "dt_s": 4.448, "eta_s": 23883, "world_size": 1, "timestamp": "2026-05-05T01:14:30.169813"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39120, "epoch": 0, "train_loss": 3.91981603205204, "train_ppl": 50.39117356449262, "lr": 0.00056, "grad_norm": 0.6493, "tokens_per_sec": 150434, "dt_s": 4.356, "eta_s": 23784, "world_size": 1, "timestamp": "2026-05-05T01:14:34.526272"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39130, "epoch": 0, "train_loss": 3.7862481474876404, "train_ppl": 44.09066788557214, "lr": 0.00056, "grad_norm": 0.6407, "tokens_per_sec": 150817, "dt_s": 4.345, "eta_s": 23668, "world_size": 1, "timestamp": "2026-05-05T01:14:38.871663"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39140, "epoch": 0, "train_loss": 3.7357759326696396, "train_ppl": 41.92054046365927, "lr": 0.00056, "grad_norm": 0.6912, "tokens_per_sec": 149039, "dt_s": 4.397, "eta_s": 23526, "world_size": 1, "timestamp": "2026-05-05T01:14:43.268904"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39150, "epoch": 0, "train_loss": 3.7545232623815536, "train_ppl": 42.7138516558923, "lr": 0.00056, "grad_norm": 0.6793, "tokens_per_sec": 148615, "dt_s": 4.41, "eta_s": 23505, "world_size": 1, "timestamp": "2026-05-05T01:14:47.678718"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39160, "epoch": 0, "train_loss": 3.724927932024002, "train_ppl": 41.46824410471861, "lr": 0.00056, "grad_norm": 0.6613, "tokens_per_sec": 150024, "dt_s": 4.368, "eta_s": 23416, "world_size": 1, "timestamp": "2026-05-05T01:14:52.047058"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39170, "epoch": 0, "train_loss": 3.7763390839099884, "train_ppl": 43.65592814342071, "lr": 0.00056, "grad_norm": 0.668, "tokens_per_sec": 147194, "dt_s": 4.452, "eta_s": 23514, "world_size": 1, "timestamp": "2026-05-05T01:14:56.499409"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39180, "epoch": 0, "train_loss": 3.795883685350418, "train_ppl": 44.517558543092306, "lr": 0.00056, "grad_norm": 0.6847, "tokens_per_sec": 147893, "dt_s": 4.431, "eta_s": 23601, "world_size": 1, "timestamp": "2026-05-05T01:15:00.930767"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39190, "epoch": 0, "train_loss": 3.8008346259593964, "train_ppl": 44.73850883606071, "lr": 0.00056, "grad_norm": 0.6757, "tokens_per_sec": 145882, "dt_s": 4.492, "eta_s": 23699, "world_size": 1, "timestamp": "2026-05-05T01:15:05.423116"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39200, "epoch": 0, "train_loss": 3.780158758163452, "train_ppl": 43.822998442092015, "lr": 0.00056, "grad_norm": 0.6632, "tokens_per_sec": 144745, "dt_s": 4.528, "eta_s": 23820, "world_size": 1, "timestamp": "2026-05-05T01:15:09.950818"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39210, "epoch": 0, "train_loss": 3.817978113889694, "train_ppl": 45.51209495821769, "lr": 0.00056, "grad_norm": 0.6192, "tokens_per_sec": 146582, "dt_s": 4.471, "eta_s": 23926, "world_size": 1, "timestamp": "2026-05-05T01:15:14.421731"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39220, "epoch": 0, "train_loss": 3.7966932207345963, "train_ppl": 44.553611673123456, "lr": 0.00056, "grad_norm": 0.6594, "tokens_per_sec": 131444, "dt_s": 4.986, "eta_s": 24492, "world_size": 1, "timestamp": "2026-05-05T01:15:19.407585"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39230, "epoch": 0, "train_loss": 3.8090577572584152, "train_ppl": 45.107916229486555, "lr": 0.00056, "grad_norm": 0.6137, "tokens_per_sec": 147847, "dt_s": 4.433, "eta_s": 24489, "world_size": 1, "timestamp": "2026-05-05T01:15:23.840301"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39240, "epoch": 0, "train_loss": 3.846996173262596, "train_ppl": 46.852116008270976, "lr": 0.00056, "grad_norm": 0.6719, "tokens_per_sec": 147192, "dt_s": 4.452, "eta_s": 24441, "world_size": 1, "timestamp": "2026-05-05T01:15:28.292727"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39250, "epoch": 0, "train_loss": 3.676043286919594, "train_ppl": 39.48983460126108, "lr": 0.00056, "grad_norm": 0.9505, "tokens_per_sec": 147403, "dt_s": 4.446, "eta_s": 24350, "world_size": 1, "timestamp": "2026-05-05T01:15:32.738763"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39260, "epoch": 0, "train_loss": 3.773464798927307, "train_ppl": 43.53062872413622, "lr": 0.00056, "grad_norm": 0.6631, "tokens_per_sec": 149894, "dt_s": 4.372, "eta_s": 24239, "world_size": 1, "timestamp": "2026-05-05T01:15:37.110913"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39270, "epoch": 0, "train_loss": 3.770625576376915, "train_ppl": 43.407210869715435, "lr": 0.00056, "grad_norm": 0.6003, "tokens_per_sec": 148179, "dt_s": 4.423, "eta_s": 23633, "world_size": 1, "timestamp": "2026-05-05T01:15:41.533656"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39280, "epoch": 0, "train_loss": 3.781197175383568, "train_ppl": 43.86852863388768, "lr": 0.00056, "grad_norm": 0.7187, "tokens_per_sec": 145974, "dt_s": 4.49, "eta_s": 23690, "world_size": 1, "timestamp": "2026-05-05T01:15:46.023239"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39290, "epoch": 0, "train_loss": 3.7714185416698456, "train_ppl": 43.441644932101894, "lr": 0.00056, "grad_norm": 0.6295, "tokens_per_sec": 149611, "dt_s": 4.38, "eta_s": 23608, "world_size": 1, "timestamp": "2026-05-05T01:15:50.403644"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39300, "epoch": 0, "train_loss": 3.700154021382332, "train_ppl": 40.45353458957849, "lr": 0.00056, "grad_norm": 0.6939, "tokens_per_sec": 148001, "dt_s": 4.428, "eta_s": 23585, "world_size": 1, "timestamp": "2026-05-05T01:15:54.831731"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39310, "epoch": 0, "train_loss": 3.72628690302372, "train_ppl": 41.52463655503962, "lr": 0.00056, "grad_norm": 0.6491, "tokens_per_sec": 145583, "dt_s": 4.502, "eta_s": 23719, "world_size": 1, "timestamp": "2026-05-05T01:15:59.333347"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39320, "epoch": 0, "train_loss": 3.659764662384987, "train_ppl": 38.85219841164479, "lr": 0.00056, "grad_norm": 0.6758, "tokens_per_sec": 148891, "dt_s": 4.402, "eta_s": 23692, "world_size": 1, "timestamp": "2026-05-05T01:16:03.734973"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39330, "epoch": 0, "train_loss": 3.780109629034996, "train_ppl": 43.82084550925851, "lr": 0.00056, "grad_norm": 0.6293, "tokens_per_sec": 148209, "dt_s": 4.422, "eta_s": 23615, "world_size": 1, "timestamp": "2026-05-05T01:16:08.156847"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39340, "epoch": 0, "train_loss": 3.9620301127433777, "train_ppl": 52.56392840588259, "lr": 0.00056, "grad_norm": 0.7194, "tokens_per_sec": 148292, "dt_s": 4.419, "eta_s": 23652, "world_size": 1, "timestamp": "2026-05-05T01:16:12.576248"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39350, "epoch": 0, "train_loss": 3.656920909881592, "train_ppl": 38.74186932381107, "lr": 0.00056, "grad_norm": 0.7419, "tokens_per_sec": 146909, "dt_s": 4.461, "eta_s": 23683, "world_size": 1, "timestamp": "2026-05-05T01:16:17.037237"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39360, "epoch": 0, "train_loss": 3.862767457962036, "train_ppl": 47.596891665587215, "lr": 0.00056, "grad_norm": 0.6794, "tokens_per_sec": 146339, "dt_s": 4.478, "eta_s": 23653, "world_size": 1, "timestamp": "2026-05-05T01:16:21.515617"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39370, "epoch": 0, "train_loss": 3.804978907108307, "train_ppl": 44.92430251957229, "lr": 0.00056, "grad_norm": 0.6627, "tokens_per_sec": 149115, "dt_s": 4.395, "eta_s": 23642, "world_size": 1, "timestamp": "2026-05-05T01:16:25.910592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39380, "epoch": 0, "train_loss": 3.7568598240613937, "train_ppl": 42.813771894310186, "lr": 0.00056, "grad_norm": 0.6844, "tokens_per_sec": 147044, "dt_s": 4.457, "eta_s": 23675, "world_size": 1, "timestamp": "2026-05-05T01:16:30.367510"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39390, "epoch": 0, "train_loss": 3.6570729464292526, "train_ppl": 38.74775995165701, "lr": 0.00056, "grad_norm": 0.6817, "tokens_per_sec": 145941, "dt_s": 4.491, "eta_s": 23746, "world_size": 1, "timestamp": "2026-05-05T01:16:34.858055"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39400, "epoch": 0, "train_loss": 3.7382738292217255, "train_ppl": 42.025384527443315, "lr": 0.00056, "grad_norm": 0.7093, "tokens_per_sec": 148492, "dt_s": 4.413, "eta_s": 23691, "world_size": 1, "timestamp": "2026-05-05T01:16:39.271476"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39410, "epoch": 0, "train_loss": 3.6704825907945633, "train_ppl": 39.270853040241334, "lr": 0.00056, "grad_norm": 0.6317, "tokens_per_sec": 146535, "dt_s": 4.472, "eta_s": 23680, "world_size": 1, "timestamp": "2026-05-05T01:16:43.743924"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39420, "epoch": 0, "train_loss": 3.7436819672584534, "train_ppl": 42.25327929574227, "lr": 0.00056, "grad_norm": 0.6722, "tokens_per_sec": 149480, "dt_s": 4.384, "eta_s": 23664, "world_size": 1, "timestamp": "2026-05-05T01:16:48.128146"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39430, "epoch": 0, "train_loss": 3.8283554911613464, "train_ppl": 45.9868502348071, "lr": 0.00056, "grad_norm": 0.6569, "tokens_per_sec": 147610, "dt_s": 4.44, "eta_s": 23642, "world_size": 1, "timestamp": "2026-05-05T01:16:52.567951"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39440, "epoch": 0, "train_loss": 3.6355243176221848, "train_ppl": 37.92173071721986, "lr": 0.00056, "grad_norm": 0.6367, "tokens_per_sec": 146571, "dt_s": 4.471, "eta_s": 23617, "world_size": 1, "timestamp": "2026-05-05T01:16:57.039269"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39450, "epoch": 0, "train_loss": 3.9220953434705734, "train_ppl": 50.506161738938175, "lr": 0.00056, "grad_norm": 0.6813, "tokens_per_sec": 147265, "dt_s": 4.45, "eta_s": 23651, "world_size": 1, "timestamp": "2026-05-05T01:17:01.489452"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39460, "epoch": 0, "train_loss": 3.7912264615297318, "train_ppl": 44.31071234739131, "lr": 0.00056, "grad_norm": 0.6591, "tokens_per_sec": 148128, "dt_s": 4.424, "eta_s": 23596, "world_size": 1, "timestamp": "2026-05-05T01:17:05.913717"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39470, "epoch": 0, "train_loss": 3.71305550634861, "train_ppl": 40.97882649458395, "lr": 0.00056, "grad_norm": 0.8987, "tokens_per_sec": 144977, "dt_s": 4.52, "eta_s": 23736, "world_size": 1, "timestamp": "2026-05-05T01:17:10.434172"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39480, "epoch": 0, "train_loss": 3.8513935059309006, "train_ppl": 47.05859399205107, "lr": 0.00056, "grad_norm": 0.6731, "tokens_per_sec": 148723, "dt_s": 4.407, "eta_s": 23697, "world_size": 1, "timestamp": "2026-05-05T01:17:14.840790"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39490, "epoch": 0, "train_loss": 3.8163859248161316, "train_ppl": 45.43968875538898, "lr": 0.00056, "grad_norm": 0.6608, "tokens_per_sec": 148116, "dt_s": 4.425, "eta_s": 23642, "world_size": 1, "timestamp": "2026-05-05T01:17:19.265438"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39500, "epoch": 0, "train_loss": 3.871987447142601, "train_ppl": 48.03776378672485, "lr": 0.00056, "grad_norm": 0.7178, "tokens_per_sec": 145682, "dt_s": 4.499, "eta_s": 23689, "world_size": 1, "timestamp": "2026-05-05T01:17:23.764003"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39510, "epoch": 0, "train_loss": 3.7631030082702637, "train_ppl": 43.08190228170608, "lr": 0.00056, "grad_norm": 0.6826, "tokens_per_sec": 113942, "dt_s": 5.752, "eta_s": 24271, "world_size": 1, "timestamp": "2026-05-05T01:17:29.515705"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39520, "epoch": 0, "train_loss": 3.7102872282266617, "train_ppl": 40.86554257877338, "lr": 0.00056, "grad_norm": 0.67, "tokens_per_sec": 142581, "dt_s": 4.596, "eta_s": 24347, "world_size": 1, "timestamp": "2026-05-05T01:17:34.112103"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39530, "epoch": 0, "train_loss": 3.716257780790329, "train_ppl": 41.11026227772573, "lr": 0.00056, "grad_norm": 0.6639, "tokens_per_sec": 148626, "dt_s": 4.409, "eta_s": 24346, "world_size": 1, "timestamp": "2026-05-05T01:17:38.521559"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39540, "epoch": 0, "train_loss": 3.8405115455389023, "train_ppl": 46.54928042708837, "lr": 0.00056, "grad_norm": 0.697, "tokens_per_sec": 146841, "dt_s": 4.463, "eta_s": 24382, "world_size": 1, "timestamp": "2026-05-05T01:17:42.984638"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39550, "epoch": 0, "train_loss": 3.789758250117302, "train_ppl": 44.24570258954233, "lr": 0.00056, "grad_norm": 0.713, "tokens_per_sec": 147163, "dt_s": 4.453, "eta_s": 24329, "world_size": 1, "timestamp": "2026-05-05T01:17:47.437908"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39560, "epoch": 0, "train_loss": 3.7614677101373672, "train_ppl": 43.01150810076685, "lr": 0.00056, "grad_norm": 0.6524, "tokens_per_sec": 148590, "dt_s": 4.411, "eta_s": 23724, "world_size": 1, "timestamp": "2026-05-05T01:17:51.848448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39570, "epoch": 0, "train_loss": 3.8098904192447662, "train_ppl": 45.145491518200636, "lr": 0.00056, "grad_norm": 0.6952, "tokens_per_sec": 145230, "dt_s": 4.513, "eta_s": 23631, "world_size": 1, "timestamp": "2026-05-05T01:17:56.361007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39580, "epoch": 0, "train_loss": 3.774868279695511, "train_ppl": 43.59176601685016, "lr": 0.00056, "grad_norm": 0.6486, "tokens_per_sec": 147690, "dt_s": 4.437, "eta_s": 23656, "world_size": 1, "timestamp": "2026-05-05T01:18:00.798423"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39590, "epoch": 0, "train_loss": 3.7991130650043488, "train_ppl": 44.661555025405505, "lr": 0.00056, "grad_norm": 0.6933, "tokens_per_sec": 148330, "dt_s": 4.418, "eta_s": 23604, "world_size": 1, "timestamp": "2026-05-05T01:18:05.216696"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39600, "epoch": 0, "train_loss": 3.8153388649225235, "train_ppl": 45.3921355795713, "lr": 0.00056, "grad_norm": 0.71, "tokens_per_sec": 145408, "dt_s": 4.507, "eta_s": 23657, "world_size": 1, "timestamp": "2026-05-05T01:18:09.723741"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39610, "epoch": 0, "train_loss": 3.7737382501363754, "train_ppl": 43.54253385485435, "lr": 0.00056, "grad_norm": 0.6943, "tokens_per_sec": 148377, "dt_s": 4.417, "eta_s": 23659, "world_size": 1, "timestamp": "2026-05-05T01:18:14.140609"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39620, "epoch": 0, "train_loss": 3.812919780611992, "train_ppl": 45.282460886218615, "lr": 0.00056, "grad_norm": 0.6428, "tokens_per_sec": 148540, "dt_s": 4.412, "eta_s": 23548, "world_size": 1, "timestamp": "2026-05-05T01:18:18.552616"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39630, "epoch": 0, "train_loss": 3.8867287784814835, "train_ppl": 48.75114958913853, "lr": 0.00056, "grad_norm": 0.7983, "tokens_per_sec": 148472, "dt_s": 4.414, "eta_s": 23519, "world_size": 1, "timestamp": "2026-05-05T01:18:22.966662"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39640, "epoch": 0, "train_loss": 3.6922139823436737, "train_ppl": 40.1336037581452, "lr": 0.00056, "grad_norm": 0.674, "tokens_per_sec": 148023, "dt_s": 4.427, "eta_s": 23524, "world_size": 1, "timestamp": "2026-05-05T01:18:27.394076"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39650, "epoch": 0, "train_loss": 3.740438759326935, "train_ppl": 42.11646510355642, "lr": 0.00056, "grad_norm": 0.6826, "tokens_per_sec": 147298, "dt_s": 4.449, "eta_s": 23458, "world_size": 1, "timestamp": "2026-05-05T01:18:31.843313"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39660, "epoch": 0, "train_loss": 3.707394778728485, "train_ppl": 40.74751184191118, "lr": 0.00056, "grad_norm": 0.7477, "tokens_per_sec": 147876, "dt_s": 4.432, "eta_s": 23470, "world_size": 1, "timestamp": "2026-05-05T01:18:36.275103"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39670, "epoch": 0, "train_loss": 3.7027976363897324, "train_ppl": 40.56061964418517, "lr": 0.00056, "grad_norm": 0.6491, "tokens_per_sec": 147387, "dt_s": 4.447, "eta_s": 23502, "world_size": 1, "timestamp": "2026-05-05T01:18:40.721623"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39680, "epoch": 0, "train_loss": 3.7482188642024994, "train_ppl": 42.445413586628455, "lr": 0.00056, "grad_norm": 0.7351, "tokens_per_sec": 146565, "dt_s": 4.471, "eta_s": 23558, "world_size": 1, "timestamp": "2026-05-05T01:18:45.193085"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39690, "epoch": 0, "train_loss": 3.7873871624469757, "train_ppl": 44.140916427348756, "lr": 0.00056, "grad_norm": 0.6432, "tokens_per_sec": 147595, "dt_s": 4.44, "eta_s": 23567, "world_size": 1, "timestamp": "2026-05-05T01:18:49.633353"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39700, "epoch": 0, "train_loss": 3.725690469145775, "train_ppl": 41.49987723941014, "lr": 0.00056, "grad_norm": 0.6325, "tokens_per_sec": 147047, "dt_s": 4.457, "eta_s": 23571, "world_size": 1, "timestamp": "2026-05-05T01:18:54.090135"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39710, "epoch": 0, "train_loss": 3.7786941528320312, "train_ppl": 43.758862023669806, "lr": 0.00056, "grad_norm": 0.6781, "tokens_per_sec": 145849, "dt_s": 4.493, "eta_s": 23632, "world_size": 1, "timestamp": "2026-05-05T01:18:58.583545"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39720, "epoch": 0, "train_loss": 3.758885532617569, "train_ppl": 42.90058802075624, "lr": 0.00056, "grad_norm": 0.6952, "tokens_per_sec": 146641, "dt_s": 4.469, "eta_s": 23651, "world_size": 1, "timestamp": "2026-05-05T01:19:03.052695"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39730, "epoch": 0, "train_loss": 3.864787846803665, "train_ppl": 47.693153104428355, "lr": 0.00056, "grad_norm": 0.8028, "tokens_per_sec": 147988, "dt_s": 4.428, "eta_s": 23601, "world_size": 1, "timestamp": "2026-05-05T01:19:07.481146"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39740, "epoch": 0, "train_loss": 3.771500453352928, "train_ppl": 43.44520345609451, "lr": 0.00056, "grad_norm": 0.6352, "tokens_per_sec": 146110, "dt_s": 4.485, "eta_s": 23645, "world_size": 1, "timestamp": "2026-05-05T01:19:11.966572"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39750, "epoch": 0, "train_loss": 3.7231656461954117, "train_ppl": 41.3952295609407, "lr": 0.00056, "grad_norm": 0.6369, "tokens_per_sec": 146008, "dt_s": 4.489, "eta_s": 23674, "world_size": 1, "timestamp": "2026-05-05T01:19:16.455054"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39760, "epoch": 0, "train_loss": 3.8119047433137894, "train_ppl": 45.236520818849975, "lr": 0.00056, "grad_norm": 0.6703, "tokens_per_sec": 146097, "dt_s": 4.486, "eta_s": 23661, "world_size": 1, "timestamp": "2026-05-05T01:19:20.940859"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39770, "epoch": 0, "train_loss": 3.7485144585371017, "train_ppl": 42.457962064952696, "lr": 0.00056, "grad_norm": 0.6231, "tokens_per_sec": 148212, "dt_s": 4.422, "eta_s": 23607, "world_size": 1, "timestamp": "2026-05-05T01:19:25.362642"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39780, "epoch": 0, "train_loss": 2.935962997376919, "train_ppl": 18.83963692332732, "lr": 0.00056, "grad_norm": 1.0622, "tokens_per_sec": 148233, "dt_s": 4.421, "eta_s": 23594, "world_size": 1, "timestamp": "2026-05-05T01:19:29.783801"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39790, "epoch": 0, "train_loss": 3.8716961443424225, "train_ppl": 48.02377228959955, "lr": 0.00056, "grad_norm": 0.6803, "tokens_per_sec": 145302, "dt_s": 4.51, "eta_s": 23616, "world_size": 1, "timestamp": "2026-05-05T01:19:34.294096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39800, "epoch": 0, "train_loss": 3.7648510187864304, "train_ppl": 43.15727575757297, "lr": 0.00056, "grad_norm": 0.6389, "tokens_per_sec": 149373, "dt_s": 4.387, "eta_s": 23505, "world_size": 1, "timestamp": "2026-05-05T01:19:38.681511"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39810, "epoch": 0, "train_loss": 3.831922635436058, "train_ppl": 46.15118489279483, "lr": 0.00056, "grad_norm": 0.7233, "tokens_per_sec": 131002, "dt_s": 5.003, "eta_s": 24047, "world_size": 1, "timestamp": "2026-05-05T01:19:43.684171"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39820, "epoch": 0, "train_loss": 3.7953063398599625, "train_ppl": 44.49186394946396, "lr": 0.00056, "grad_norm": 0.6629, "tokens_per_sec": 146754, "dt_s": 4.466, "eta_s": 24089, "world_size": 1, "timestamp": "2026-05-05T01:19:48.149885"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39830, "epoch": 0, "train_loss": 3.7690280228853226, "train_ppl": 43.337920890391125, "lr": 0.00056, "grad_norm": 0.6848, "tokens_per_sec": 145679, "dt_s": 4.499, "eta_s": 24166, "world_size": 1, "timestamp": "2026-05-05T01:19:52.648528"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39840, "epoch": 0, "train_loss": 3.7670394629240036, "train_ppl": 43.251826466434636, "lr": 0.00056, "grad_norm": 0.7041, "tokens_per_sec": 144761, "dt_s": 4.527, "eta_s": 24180, "world_size": 1, "timestamp": "2026-05-05T01:19:57.175729"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39850, "epoch": 0, "train_loss": 3.8566990196704865, "train_ppl": 47.30892749575457, "lr": 0.00056, "grad_norm": 0.6591, "tokens_per_sec": 147526, "dt_s": 4.442, "eta_s": 24233, "world_size": 1, "timestamp": "2026-05-05T01:20:01.618058"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39860, "epoch": 0, "train_loss": 3.8238236755132675, "train_ppl": 45.77891781886031, "lr": 0.00056, "grad_norm": 0.6735, "tokens_per_sec": 146925, "dt_s": 4.461, "eta_s": 23656, "world_size": 1, "timestamp": "2026-05-05T01:20:06.078584"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39870, "epoch": 0, "train_loss": 3.818075269460678, "train_ppl": 45.51651692659598, "lr": 0.00056, "grad_norm": 0.6362, "tokens_per_sec": 144659, "dt_s": 4.53, "eta_s": 23720, "world_size": 1, "timestamp": "2026-05-05T01:20:10.608955"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39880, "epoch": 0, "train_loss": 3.702065348625183, "train_ppl": 40.530928471265334, "lr": 0.00056, "grad_norm": 0.6561, "tokens_per_sec": 147096, "dt_s": 4.455, "eta_s": 23669, "world_size": 1, "timestamp": "2026-05-05T01:20:15.064293"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39890, "epoch": 0, "train_loss": 3.686188191175461, "train_ppl": 39.892494210522194, "lr": 0.00056, "grad_norm": 0.6795, "tokens_per_sec": 146488, "dt_s": 4.474, "eta_s": 23608, "world_size": 1, "timestamp": "2026-05-05T01:20:19.538102"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39900, "epoch": 0, "train_loss": 3.7402833849191666, "train_ppl": 42.10992179107848, "lr": 0.00056, "grad_norm": 0.6664, "tokens_per_sec": 145601, "dt_s": 4.501, "eta_s": 23666, "world_size": 1, "timestamp": "2026-05-05T01:20:24.039164"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39910, "epoch": 0, "train_loss": 3.7859372943639755, "train_ppl": 44.07696429374794, "lr": 0.00056, "grad_norm": 0.7462, "tokens_per_sec": 147819, "dt_s": 4.434, "eta_s": 23633, "world_size": 1, "timestamp": "2026-05-05T01:20:28.472701"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39920, "epoch": 0, "train_loss": 4.02381956577301, "train_ppl": 55.91426669595873, "lr": 0.00056, "grad_norm": 0.6721, "tokens_per_sec": 148501, "dt_s": 4.413, "eta_s": 23505, "world_size": 1, "timestamp": "2026-05-05T01:20:32.885910"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39930, "epoch": 0, "train_loss": 3.7165122777223587, "train_ppl": 41.12072604479218, "lr": 0.00056, "grad_norm": 0.6538, "tokens_per_sec": 147584, "dt_s": 4.441, "eta_s": 23485, "world_size": 1, "timestamp": "2026-05-05T01:20:37.326489"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39940, "epoch": 0, "train_loss": 3.6850044578313828, "train_ppl": 39.84530007309461, "lr": 0.00056, "grad_norm": 0.6819, "tokens_per_sec": 148138, "dt_s": 4.424, "eta_s": 23428, "world_size": 1, "timestamp": "2026-05-05T01:20:41.750454"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39950, "epoch": 0, "train_loss": 3.7747055143117905, "train_ppl": 43.58467136372501, "lr": 0.00056, "grad_norm": 0.7437, "tokens_per_sec": 144559, "dt_s": 4.534, "eta_s": 23458, "world_size": 1, "timestamp": "2026-05-05T01:20:46.283966"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39960, "epoch": 0, "train_loss": 3.8214433789253235, "train_ppl": 45.670080001506825, "lr": 0.00056, "grad_norm": 0.6907, "tokens_per_sec": 148969, "dt_s": 4.399, "eta_s": 23417, "world_size": 1, "timestamp": "2026-05-05T01:20:50.683280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39970, "epoch": 0, "train_loss": 3.759386569261551, "train_ppl": 42.92208817313533, "lr": 0.00056, "grad_norm": 0.6916, "tokens_per_sec": 146665, "dt_s": 4.468, "eta_s": 23471, "world_size": 1, "timestamp": "2026-05-05T01:20:55.151701"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39980, "epoch": 0, "train_loss": 3.831557512283325, "train_ppl": 46.13433710261072, "lr": 0.00056, "grad_norm": 0.713, "tokens_per_sec": 144463, "dt_s": 4.537, "eta_s": 23568, "world_size": 1, "timestamp": "2026-05-05T01:20:59.688227"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 39990, "epoch": 0, "train_loss": 3.962277054786682, "train_ppl": 52.576910252583154, "lr": 0.00056, "grad_norm": 0.8626, "tokens_per_sec": 148723, "dt_s": 4.407, "eta_s": 23545, "world_size": 1, "timestamp": "2026-05-05T01:21:04.094815"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40000, "epoch": 0, "train_loss": 3.7332037687301636, "train_ppl": 41.812852516055266, "lr": 0.00056, "grad_norm": 0.7186, "tokens_per_sec": 144276, "dt_s": 4.542, "eta_s": 23550, "world_size": 1, "timestamp": "2026-05-05T01:21:08.637274"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40010, "epoch": 0, "train_loss": 3.7310932278633118, "train_ppl": 41.72469784180468, "lr": 0.00056, "grad_norm": 0.6692, "tokens_per_sec": 123920, "dt_s": 5.289, "eta_s": 23652, "world_size": 1, "timestamp": "2026-05-05T01:21:13.925812"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40020, "epoch": 0, "train_loss": 3.7522820979356766, "train_ppl": 42.61823008207454, "lr": 0.00056, "grad_norm": 0.635, "tokens_per_sec": 148267, "dt_s": 4.42, "eta_s": 23597, "world_size": 1, "timestamp": "2026-05-05T01:21:18.345951"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40030, "epoch": 0, "train_loss": 3.766325816512108, "train_ppl": 43.22097096693672, "lr": 0.00056, "grad_norm": 0.6682, "tokens_per_sec": 147134, "dt_s": 4.454, "eta_s": 23506, "world_size": 1, "timestamp": "2026-05-05T01:21:22.800106"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40040, "epoch": 0, "train_loss": 3.6767968088388443, "train_ppl": 39.51960227110807, "lr": 0.00056, "grad_norm": 0.7433, "tokens_per_sec": 147833, "dt_s": 4.433, "eta_s": 23529, "world_size": 1, "timestamp": "2026-05-05T01:21:27.233223"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40050, "epoch": 0, "train_loss": 3.713064506649971, "train_ppl": 40.979195318031586, "lr": 0.00056, "grad_norm": 0.6087, "tokens_per_sec": 145576, "dt_s": 4.502, "eta_s": 23482, "world_size": 1, "timestamp": "2026-05-05T01:21:31.735053"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40060, "epoch": 0, "train_loss": 3.71996209025383, "train_ppl": 41.26282981555544, "lr": 0.00056, "grad_norm": 0.6423, "tokens_per_sec": 144732, "dt_s": 4.528, "eta_s": 23506, "world_size": 1, "timestamp": "2026-05-05T01:21:36.263155"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40070, "epoch": 0, "train_loss": 3.755517899990082, "train_ppl": 42.756357594652705, "lr": 0.00056, "grad_norm": 0.6442, "tokens_per_sec": 147240, "dt_s": 4.451, "eta_s": 23534, "world_size": 1, "timestamp": "2026-05-05T01:21:40.714116"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40080, "epoch": 0, "train_loss": 3.6084811836481094, "train_ppl": 36.90995078446774, "lr": 0.00056, "grad_norm": 0.6908, "tokens_per_sec": 145863, "dt_s": 4.493, "eta_s": 23570, "world_size": 1, "timestamp": "2026-05-05T01:21:45.207076"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40090, "epoch": 0, "train_loss": 3.7541851550340652, "train_ppl": 42.699412229983466, "lr": 0.00056, "grad_norm": 0.6429, "tokens_per_sec": 146502, "dt_s": 4.473, "eta_s": 23608, "world_size": 1, "timestamp": "2026-05-05T01:21:49.680485"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40100, "epoch": 0, "train_loss": 3.7272388488054276, "train_ppl": 41.564184578445634, "lr": 0.00056, "grad_norm": 0.6517, "tokens_per_sec": 147167, "dt_s": 4.453, "eta_s": 23553, "world_size": 1, "timestamp": "2026-05-05T01:21:54.133624"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40110, "epoch": 0, "train_loss": 3.809191331267357, "train_ppl": 45.11394187711833, "lr": 0.00056, "grad_norm": 0.6628, "tokens_per_sec": 130218, "dt_s": 5.033, "eta_s": 24079, "world_size": 1, "timestamp": "2026-05-05T01:21:59.166431"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40120, "epoch": 0, "train_loss": 3.7758877873420715, "train_ppl": 43.63623081788237, "lr": 0.00056, "grad_norm": 0.6909, "tokens_per_sec": 147592, "dt_s": 4.44, "eta_s": 24063, "world_size": 1, "timestamp": "2026-05-05T01:22:03.606797"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40130, "epoch": 0, "train_loss": 3.70901757478714, "train_ppl": 40.81369042617782, "lr": 0.00056, "grad_norm": 0.6984, "tokens_per_sec": 147314, "dt_s": 4.449, "eta_s": 24012, "world_size": 1, "timestamp": "2026-05-05T01:22:08.055503"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40140, "epoch": 0, "train_loss": 3.708354026079178, "train_ppl": 40.78661753771179, "lr": 0.00056, "grad_norm": 0.6526, "tokens_per_sec": 146549, "dt_s": 4.472, "eta_s": 24006, "world_size": 1, "timestamp": "2026-05-05T01:22:12.527436"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40150, "epoch": 0, "train_loss": 3.8740560710430145, "train_ppl": 48.137238705641074, "lr": 0.00056, "grad_norm": 1.0342, "tokens_per_sec": 148517, "dt_s": 4.413, "eta_s": 23959, "world_size": 1, "timestamp": "2026-05-05T01:22:16.940145"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40160, "epoch": 0, "train_loss": 3.7302340269088745, "train_ppl": 41.688863338319685, "lr": 0.00056, "grad_norm": 0.6545, "tokens_per_sec": 148072, "dt_s": 4.426, "eta_s": 23317, "world_size": 1, "timestamp": "2026-05-05T01:22:21.366106"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40170, "epoch": 0, "train_loss": 3.6810460090637207, "train_ppl": 39.68788625693364, "lr": 0.00056, "grad_norm": 0.6524, "tokens_per_sec": 148935, "dt_s": 4.4, "eta_s": 23270, "world_size": 1, "timestamp": "2026-05-05T01:22:25.766407"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40180, "epoch": 0, "train_loss": 3.8223010450601578, "train_ppl": 45.70926648455691, "lr": 0.00056, "grad_norm": 0.7106, "tokens_per_sec": 146649, "dt_s": 4.469, "eta_s": 23287, "world_size": 1, "timestamp": "2026-05-05T01:22:30.235338"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40190, "epoch": 0, "train_loss": 3.681183099746704, "train_ppl": 39.693327469328004, "lr": 0.00056, "grad_norm": 0.6856, "tokens_per_sec": 144312, "dt_s": 4.541, "eta_s": 23355, "world_size": 1, "timestamp": "2026-05-05T01:22:34.776575"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40200, "epoch": 0, "train_loss": 3.815067857503891, "train_ppl": 45.37983564084426, "lr": 0.00056, "grad_norm": 0.6584, "tokens_per_sec": 146284, "dt_s": 4.48, "eta_s": 23422, "world_size": 1, "timestamp": "2026-05-05T01:22:39.256614"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40210, "epoch": 0, "train_loss": 3.7650681734085083, "train_ppl": 43.16664857711873, "lr": 0.00056, "grad_norm": 0.6226, "tokens_per_sec": 149291, "dt_s": 4.39, "eta_s": 23379, "world_size": 1, "timestamp": "2026-05-05T01:22:43.646462"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40220, "epoch": 0, "train_loss": 3.758922666311264, "train_ppl": 42.90218110762956, "lr": 0.00056, "grad_norm": 0.6511, "tokens_per_sec": 146592, "dt_s": 4.471, "eta_s": 23449, "world_size": 1, "timestamp": "2026-05-05T01:22:48.117067"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40230, "epoch": 0, "train_loss": 3.782238006591797, "train_ppl": 43.91421213783303, "lr": 0.00056, "grad_norm": 0.6497, "tokens_per_sec": 149250, "dt_s": 4.391, "eta_s": 23362, "world_size": 1, "timestamp": "2026-05-05T01:22:52.508095"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40240, "epoch": 0, "train_loss": 3.792991265654564, "train_ppl": 44.38898111952797, "lr": 0.00056, "grad_norm": 0.67, "tokens_per_sec": 148589, "dt_s": 4.411, "eta_s": 23221, "world_size": 1, "timestamp": "2026-05-05T01:22:56.918645"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40250, "epoch": 0, "train_loss": 3.7269986420869827, "train_ppl": 41.55420178107871, "lr": 0.00056, "grad_norm": 0.6056, "tokens_per_sec": 145705, "dt_s": 4.498, "eta_s": 23235, "world_size": 1, "timestamp": "2026-05-05T01:23:01.416500"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40260, "epoch": 0, "train_loss": 3.7903562635183334, "train_ppl": 44.27217002578094, "lr": 0.00056, "grad_norm": 0.6518, "tokens_per_sec": 147091, "dt_s": 4.455, "eta_s": 23299, "world_size": 1, "timestamp": "2026-05-05T01:23:05.871970"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40270, "epoch": 0, "train_loss": 3.812201604247093, "train_ppl": 45.24995176810311, "lr": 0.00056, "grad_norm": 0.6715, "tokens_per_sec": 142874, "dt_s": 4.587, "eta_s": 23417, "world_size": 1, "timestamp": "2026-05-05T01:23:10.458946"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40280, "epoch": 0, "train_loss": 3.7051668763160706, "train_ppl": 40.656831413059855, "lr": 0.00056, "grad_norm": 0.6555, "tokens_per_sec": 146732, "dt_s": 4.466, "eta_s": 23491, "world_size": 1, "timestamp": "2026-05-05T01:23:14.925331"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40290, "epoch": 0, "train_loss": 3.7740010917186737, "train_ppl": 43.55398014756498, "lr": 0.00056, "grad_norm": 0.7785, "tokens_per_sec": 147523, "dt_s": 4.442, "eta_s": 23520, "world_size": 1, "timestamp": "2026-05-05T01:23:19.367749"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40300, "epoch": 0, "train_loss": 3.762050911784172, "train_ppl": 43.0365997991715, "lr": 0.00056, "grad_norm": 0.7194, "tokens_per_sec": 145983, "dt_s": 4.489, "eta_s": 23507, "world_size": 1, "timestamp": "2026-05-05T01:23:23.857063"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40310, "epoch": 0, "train_loss": 3.8713007867336273, "train_ppl": 48.00478947856846, "lr": 0.00056, "grad_norm": 0.6371, "tokens_per_sec": 148225, "dt_s": 4.421, "eta_s": 23467, "world_size": 1, "timestamp": "2026-05-05T01:23:28.278426"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40320, "epoch": 0, "train_loss": 3.8119376599788666, "train_ppl": 45.238009878762334, "lr": 0.00056, "grad_norm": 0.7352, "tokens_per_sec": 149619, "dt_s": 4.38, "eta_s": 23246, "world_size": 1, "timestamp": "2026-05-05T01:23:32.658636"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40330, "epoch": 0, "train_loss": 3.750073954463005, "train_ppl": 42.52422674013084, "lr": 0.00056, "grad_norm": 0.7191, "tokens_per_sec": 146960, "dt_s": 4.459, "eta_s": 23234, "world_size": 1, "timestamp": "2026-05-05T01:23:37.118070"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40340, "epoch": 0, "train_loss": 3.6785068809986115, "train_ppl": 39.587241460180664, "lr": 0.00056, "grad_norm": 0.6968, "tokens_per_sec": 146918, "dt_s": 4.461, "eta_s": 23249, "world_size": 1, "timestamp": "2026-05-05T01:23:41.578815"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40350, "epoch": 0, "train_loss": 3.805776983499527, "train_ppl": 44.96016985533839, "lr": 0.00056, "grad_norm": 0.6393, "tokens_per_sec": 147678, "dt_s": 4.438, "eta_s": 23190, "world_size": 1, "timestamp": "2026-05-05T01:23:46.016583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40360, "epoch": 0, "train_loss": 3.793830633163452, "train_ppl": 44.42625542926618, "lr": 0.00056, "grad_norm": 0.6613, "tokens_per_sec": 146752, "dt_s": 4.466, "eta_s": 23233, "world_size": 1, "timestamp": "2026-05-05T01:23:50.482372"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40370, "epoch": 0, "train_loss": 3.7660961002111435, "train_ppl": 43.21104354565101, "lr": 0.00056, "grad_norm": 0.6672, "tokens_per_sec": 151712, "dt_s": 4.32, "eta_s": 23165, "world_size": 1, "timestamp": "2026-05-05T01:23:54.802128"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40380, "epoch": 0, "train_loss": 3.713254436850548, "train_ppl": 40.98697924399572, "lr": 0.00056, "grad_norm": 0.694, "tokens_per_sec": 147938, "dt_s": 4.43, "eta_s": 23130, "world_size": 1, "timestamp": "2026-05-05T01:23:59.232092"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40390, "epoch": 0, "train_loss": 3.779126390814781, "train_ppl": 43.77778035423433, "lr": 0.00056, "grad_norm": 0.6707, "tokens_per_sec": 147392, "dt_s": 4.446, "eta_s": 23110, "world_size": 1, "timestamp": "2026-05-05T01:24:03.678465"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40400, "epoch": 0, "train_loss": 3.9187270402908325, "train_ppl": 50.33632786032343, "lr": 0.00056, "grad_norm": 0.6845, "tokens_per_sec": 133044, "dt_s": 4.926, "eta_s": 23616, "world_size": 1, "timestamp": "2026-05-05T01:24:08.604400"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40410, "epoch": 0, "train_loss": 3.715207099914551, "train_ppl": 41.06709119483937, "lr": 0.00056, "grad_norm": 0.6378, "tokens_per_sec": 142826, "dt_s": 4.589, "eta_s": 23740, "world_size": 1, "timestamp": "2026-05-05T01:24:13.192892"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40420, "epoch": 0, "train_loss": 3.9155662953853607, "train_ppl": 50.177478741496245, "lr": 0.00056, "grad_norm": 0.6919, "tokens_per_sec": 149510, "dt_s": 4.383, "eta_s": 23802, "world_size": 1, "timestamp": "2026-05-05T01:24:17.576283"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40430, "epoch": 0, "train_loss": 3.7378473579883575, "train_ppl": 42.007465731068315, "lr": 0.00056, "grad_norm": 0.7084, "tokens_per_sec": 147713, "dt_s": 4.437, "eta_s": 23804, "world_size": 1, "timestamp": "2026-05-05T01:24:22.012974"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40440, "epoch": 0, "train_loss": 3.7726337760686874, "train_ppl": 43.49446880355766, "lr": 0.00056, "grad_norm": 0.6782, "tokens_per_sec": 144505, "dt_s": 4.535, "eta_s": 23893, "world_size": 1, "timestamp": "2026-05-05T01:24:26.548269"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40450, "epoch": 0, "train_loss": 3.7366025149822235, "train_ppl": 41.9552055657469, "lr": 0.00056, "grad_norm": 0.6373, "tokens_per_sec": 146326, "dt_s": 4.479, "eta_s": 23421, "world_size": 1, "timestamp": "2026-05-05T01:24:31.026955"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40460, "epoch": 0, "train_loss": 3.6807249188423157, "train_ppl": 39.67514491041865, "lr": 0.00056, "grad_norm": 0.6532, "tokens_per_sec": 146913, "dt_s": 4.461, "eta_s": 23283, "world_size": 1, "timestamp": "2026-05-05T01:24:35.487834"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40470, "epoch": 0, "train_loss": 3.775884911417961, "train_ppl": 43.63610532357453, "lr": 0.00056, "grad_norm": 0.6362, "tokens_per_sec": 151100, "dt_s": 4.337, "eta_s": 23231, "world_size": 1, "timestamp": "2026-05-05T01:24:39.825104"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40480, "epoch": 0, "train_loss": 3.7763377726078033, "train_ppl": 43.65587089734427, "lr": 0.00056, "grad_norm": 0.6509, "tokens_per_sec": 148374, "dt_s": 4.417, "eta_s": 23205, "world_size": 1, "timestamp": "2026-05-05T01:24:44.242059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40490, "epoch": 0, "train_loss": 3.642991930246353, "train_ppl": 38.2059755063282, "lr": 0.00056, "grad_norm": 0.7134, "tokens_per_sec": 148629, "dt_s": 4.409, "eta_s": 23070, "world_size": 1, "timestamp": "2026-05-05T01:24:48.651421"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40500, "epoch": 0, "train_loss": 3.7603909373283386, "train_ppl": 42.96521940404604, "lr": 0.00056, "grad_norm": 0.6667, "tokens_per_sec": 150351, "dt_s": 4.359, "eta_s": 22940, "world_size": 1, "timestamp": "2026-05-05T01:24:53.010307"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40510, "epoch": 0, "train_loss": 3.6371795684099197, "train_ppl": 37.9845526705621, "lr": 0.00056, "grad_norm": 0.6892, "tokens_per_sec": 125813, "dt_s": 5.209, "eta_s": 22916, "world_size": 1, "timestamp": "2026-05-05T01:24:58.219279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40520, "epoch": 0, "train_loss": 3.7301919013261795, "train_ppl": 41.68710720764894, "lr": 0.00056, "grad_norm": 0.6512, "tokens_per_sec": 148288, "dt_s": 4.42, "eta_s": 22997, "world_size": 1, "timestamp": "2026-05-05T01:25:02.638791"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40530, "epoch": 0, "train_loss": 3.7332910895347595, "train_ppl": 41.81650380739395, "lr": 0.00056, "grad_norm": 0.6443, "tokens_per_sec": 146137, "dt_s": 4.485, "eta_s": 23063, "world_size": 1, "timestamp": "2026-05-05T01:25:07.123375"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40540, "epoch": 0, "train_loss": 3.787226974964142, "train_ppl": 44.133846171354655, "lr": 0.00056, "grad_norm": 0.6672, "tokens_per_sec": 144204, "dt_s": 4.545, "eta_s": 23200, "world_size": 1, "timestamp": "2026-05-05T01:25:11.668055"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40550, "epoch": 0, "train_loss": 3.7372746765613556, "train_ppl": 41.98341572280253, "lr": 0.00056, "grad_norm": 0.6874, "tokens_per_sec": 149257, "dt_s": 4.391, "eta_s": 23229, "world_size": 1, "timestamp": "2026-05-05T01:25:16.058840"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40560, "epoch": 0, "train_loss": 3.680242657661438, "train_ppl": 39.656015741181285, "lr": 0.00056, "grad_norm": 0.639, "tokens_per_sec": 147514, "dt_s": 4.443, "eta_s": 23225, "world_size": 1, "timestamp": "2026-05-05T01:25:20.501558"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40570, "epoch": 0, "train_loss": 3.751363307237625, "train_ppl": 42.5790908318506, "lr": 0.00056, "grad_norm": 0.675, "tokens_per_sec": 147021, "dt_s": 4.458, "eta_s": 23261, "world_size": 1, "timestamp": "2026-05-05T01:25:24.959120"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40580, "epoch": 0, "train_loss": 3.832337662577629, "train_ppl": 46.170342862404695, "lr": 0.00056, "grad_norm": 0.709, "tokens_per_sec": 150643, "dt_s": 4.35, "eta_s": 23116, "world_size": 1, "timestamp": "2026-05-05T01:25:29.309552"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40590, "epoch": 0, "train_loss": 3.865867093205452, "train_ppl": 47.74465355414484, "lr": 0.00056, "grad_norm": 0.7135, "tokens_per_sec": 149327, "dt_s": 4.389, "eta_s": 22949, "world_size": 1, "timestamp": "2026-05-05T01:25:33.698324"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40600, "epoch": 0, "train_loss": 3.7257810533046722, "train_ppl": 41.50363664115277, "lr": 0.00056, "grad_norm": 0.6762, "tokens_per_sec": 147748, "dt_s": 4.436, "eta_s": 22992, "world_size": 1, "timestamp": "2026-05-05T01:25:38.133953"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40610, "epoch": 0, "train_loss": 3.6972047835588455, "train_ppl": 40.334403254942615, "lr": 0.00056, "grad_norm": 0.6574, "tokens_per_sec": 144177, "dt_s": 4.546, "eta_s": 23094, "world_size": 1, "timestamp": "2026-05-05T01:25:42.679477"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40620, "epoch": 0, "train_loss": 3.736030489206314, "train_ppl": 41.93121296957566, "lr": 0.00056, "grad_norm": 0.6672, "tokens_per_sec": 144020, "dt_s": 4.55, "eta_s": 23187, "world_size": 1, "timestamp": "2026-05-05T01:25:47.229981"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40630, "epoch": 0, "train_loss": 3.841983124613762, "train_ppl": 46.617831801127515, "lr": 0.00056, "grad_norm": 0.6681, "tokens_per_sec": 147733, "dt_s": 4.436, "eta_s": 23271, "world_size": 1, "timestamp": "2026-05-05T01:25:51.666062"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40640, "epoch": 0, "train_loss": 3.761779010295868, "train_ppl": 43.02489967434731, "lr": 0.00056, "grad_norm": 0.6486, "tokens_per_sec": 148430, "dt_s": 4.415, "eta_s": 23295, "world_size": 1, "timestamp": "2026-05-05T01:25:56.081342"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40650, "epoch": 0, "train_loss": 3.82051083445549, "train_ppl": 45.62751047303876, "lr": 0.00056, "grad_norm": 0.6587, "tokens_per_sec": 145266, "dt_s": 4.511, "eta_s": 23369, "world_size": 1, "timestamp": "2026-05-05T01:26:00.592779"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40660, "epoch": 0, "train_loss": 3.785385549068451, "train_ppl": 44.052651743842155, "lr": 0.00056, "grad_norm": 0.6993, "tokens_per_sec": 146039, "dt_s": 4.488, "eta_s": 23304, "world_size": 1, "timestamp": "2026-05-05T01:26:05.080365"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40670, "epoch": 0, "train_loss": 3.770587831735611, "train_ppl": 43.40557251103097, "lr": 0.00056, "grad_norm": 0.6533, "tokens_per_sec": 147139, "dt_s": 4.454, "eta_s": 23199, "world_size": 1, "timestamp": "2026-05-05T01:26:09.534384"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40680, "epoch": 0, "train_loss": 3.749125689268112, "train_ppl": 42.48392160897019, "lr": 0.00056, "grad_norm": 0.6518, "tokens_per_sec": 141710, "dt_s": 4.625, "eta_s": 23391, "world_size": 1, "timestamp": "2026-05-05T01:26:14.159024"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40690, "epoch": 0, "train_loss": 3.8685333132743835, "train_ppl": 47.87212116046909, "lr": 0.00056, "grad_norm": 0.6946, "tokens_per_sec": 142346, "dt_s": 4.604, "eta_s": 23583, "world_size": 1, "timestamp": "2026-05-05T01:26:18.763029"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40700, "epoch": 0, "train_loss": 3.7781382501125336, "train_ppl": 43.734543113369185, "lr": 0.00056, "grad_norm": 0.7114, "tokens_per_sec": 125334, "dt_s": 5.229, "eta_s": 24324, "world_size": 1, "timestamp": "2026-05-05T01:26:23.991927"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40710, "epoch": 0, "train_loss": 3.756057098507881, "train_ppl": 42.7794179757972, "lr": 0.00056, "grad_norm": 0.6752, "tokens_per_sec": 142096, "dt_s": 4.612, "eta_s": 24449, "world_size": 1, "timestamp": "2026-05-05T01:26:28.604040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40720, "epoch": 0, "train_loss": 3.852654203772545, "train_ppl": 47.117958072148916, "lr": 0.00056, "grad_norm": 0.6616, "tokens_per_sec": 140285, "dt_s": 4.672, "eta_s": 24670, "world_size": 1, "timestamp": "2026-05-05T01:26:33.275710"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40730, "epoch": 0, "train_loss": 3.7598330974578857, "train_ppl": 42.94125837544961, "lr": 0.00056, "grad_norm": 0.6974, "tokens_per_sec": 141048, "dt_s": 4.646, "eta_s": 24688, "world_size": 1, "timestamp": "2026-05-05T01:26:37.922046"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40740, "epoch": 0, "train_loss": 3.6964201778173447, "train_ppl": 40.302769062376186, "lr": 0.00056, "grad_norm": 0.6577, "tokens_per_sec": 140560, "dt_s": 4.662, "eta_s": 24744, "world_size": 1, "timestamp": "2026-05-05T01:26:42.584540"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40750, "epoch": 0, "train_loss": 3.7861294895410538, "train_ppl": 44.085436487836795, "lr": 0.00056, "grad_norm": 0.6896, "tokens_per_sec": 141881, "dt_s": 4.619, "eta_s": 24106, "world_size": 1, "timestamp": "2026-05-05T01:26:47.203625"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40760, "epoch": 0, "train_loss": 3.7999424040317535, "train_ppl": 44.69860995944026, "lr": 0.00056, "grad_norm": 0.6709, "tokens_per_sec": 142221, "dt_s": 4.608, "eta_s": 24097, "world_size": 1, "timestamp": "2026-05-05T01:26:51.811670"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40770, "epoch": 0, "train_loss": 3.7477816492319107, "train_ppl": 42.42685987265186, "lr": 0.00056, "grad_norm": 0.6356, "tokens_per_sec": 144399, "dt_s": 4.539, "eta_s": 23954, "world_size": 1, "timestamp": "2026-05-05T01:26:56.350172"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40780, "epoch": 0, "train_loss": 3.6747301667928696, "train_ppl": 39.43801373560215, "lr": 0.00056, "grad_norm": 1.0008, "tokens_per_sec": 143620, "dt_s": 4.563, "eta_s": 23863, "world_size": 1, "timestamp": "2026-05-05T01:27:00.913318"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40790, "epoch": 0, "train_loss": 3.929661899805069, "train_ppl": 50.88976891943288, "lr": 0.00056, "grad_norm": 0.7396, "tokens_per_sec": 147521, "dt_s": 4.442, "eta_s": 23630, "world_size": 1, "timestamp": "2026-05-05T01:27:05.355815"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40800, "epoch": 0, "train_loss": 3.7045838832855225, "train_ppl": 40.63313567159908, "lr": 0.00056, "grad_norm": 0.6568, "tokens_per_sec": 147308, "dt_s": 4.449, "eta_s": 23449, "world_size": 1, "timestamp": "2026-05-05T01:27:09.804693"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40810, "epoch": 0, "train_loss": 3.8443064987659454, "train_ppl": 46.72626838716668, "lr": 0.00056, "grad_norm": 0.8935, "tokens_per_sec": 147195, "dt_s": 4.452, "eta_s": 23283, "world_size": 1, "timestamp": "2026-05-05T01:27:14.257012"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40820, "epoch": 0, "train_loss": 3.789970636367798, "train_ppl": 44.25510076640233, "lr": 0.00056, "grad_norm": 0.6755, "tokens_per_sec": 148871, "dt_s": 4.402, "eta_s": 23137, "world_size": 1, "timestamp": "2026-05-05T01:27:18.659214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40830, "epoch": 0, "train_loss": 3.7017349302768707, "train_ppl": 40.517538521088724, "lr": 0.00056, "grad_norm": 0.6251, "tokens_per_sec": 147383, "dt_s": 4.447, "eta_s": 23012, "world_size": 1, "timestamp": "2026-05-05T01:27:23.105847"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40840, "epoch": 0, "train_loss": 3.668944627046585, "train_ppl": 39.21050231242483, "lr": 0.00056, "grad_norm": 0.6905, "tokens_per_sec": 145488, "dt_s": 4.505, "eta_s": 23072, "world_size": 1, "timestamp": "2026-05-05T01:27:27.610468"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40850, "epoch": 0, "train_loss": 3.732883408665657, "train_ppl": 41.79945949333538, "lr": 0.00056, "grad_norm": 0.7072, "tokens_per_sec": 147723, "dt_s": 4.436, "eta_s": 23055, "world_size": 1, "timestamp": "2026-05-05T01:27:32.046873"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40860, "epoch": 0, "train_loss": 3.7871884554624557, "train_ppl": 44.13214619033406, "lr": 0.00056, "grad_norm": 0.6458, "tokens_per_sec": 145924, "dt_s": 4.491, "eta_s": 23090, "world_size": 1, "timestamp": "2026-05-05T01:27:36.537945"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40870, "epoch": 0, "train_loss": 3.729582980275154, "train_ppl": 41.66173077741924, "lr": 0.00056, "grad_norm": 0.6558, "tokens_per_sec": 148110, "dt_s": 4.425, "eta_s": 23109, "world_size": 1, "timestamp": "2026-05-05T01:27:40.962783"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40880, "epoch": 0, "train_loss": 3.7706712931394577, "train_ppl": 43.40919535222912, "lr": 0.00056, "grad_norm": 0.7147, "tokens_per_sec": 147903, "dt_s": 4.431, "eta_s": 23089, "world_size": 1, "timestamp": "2026-05-05T01:27:45.393790"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40890, "epoch": 0, "train_loss": 3.6752731800079346, "train_ppl": 39.459434913701536, "lr": 0.00056, "grad_norm": 0.647, "tokens_per_sec": 144277, "dt_s": 4.542, "eta_s": 23123, "world_size": 1, "timestamp": "2026-05-05T01:27:49.936169"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40900, "epoch": 0, "train_loss": 3.807142049074173, "train_ppl": 45.02158534398598, "lr": 0.00056, "grad_norm": 0.6435, "tokens_per_sec": 149494, "dt_s": 4.384, "eta_s": 23065, "world_size": 1, "timestamp": "2026-05-05T01:27:54.320045"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40910, "epoch": 0, "train_loss": 3.7204697877168655, "train_ppl": 41.28378416835625, "lr": 0.00056, "grad_norm": 0.7048, "tokens_per_sec": 150871, "dt_s": 4.344, "eta_s": 22908, "world_size": 1, "timestamp": "2026-05-05T01:27:58.663891"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40920, "epoch": 0, "train_loss": 3.738298773765564, "train_ppl": 42.02643284456483, "lr": 0.00056, "grad_norm": 0.6648, "tokens_per_sec": 144706, "dt_s": 4.529, "eta_s": 23011, "world_size": 1, "timestamp": "2026-05-05T01:28:03.192800"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40930, "epoch": 0, "train_loss": 3.7378875762224197, "train_ppl": 42.00915523113157, "lr": 0.00056, "grad_norm": 0.653, "tokens_per_sec": 151454, "dt_s": 4.327, "eta_s": 22899, "world_size": 1, "timestamp": "2026-05-05T01:28:07.519925"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40940, "epoch": 0, "train_loss": 3.8341459780931473, "train_ppl": 46.25390894390866, "lr": 0.00056, "grad_norm": 0.6711, "tokens_per_sec": 147089, "dt_s": 4.456, "eta_s": 22805, "world_size": 1, "timestamp": "2026-05-05T01:28:11.975472"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40950, "epoch": 0, "train_loss": 3.6930929869413376, "train_ppl": 40.16889688950936, "lr": 0.00056, "grad_norm": 0.6315, "tokens_per_sec": 147549, "dt_s": 4.442, "eta_s": 22860, "world_size": 1, "timestamp": "2026-05-05T01:28:16.417121"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40960, "epoch": 0, "train_loss": 3.6898749619722366, "train_ppl": 40.039840141628794, "lr": 0.00056, "grad_norm": 0.6896, "tokens_per_sec": 150068, "dt_s": 4.367, "eta_s": 22880, "world_size": 1, "timestamp": "2026-05-05T01:28:20.784215"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40970, "epoch": 0, "train_loss": 3.7392063587903976, "train_ppl": 42.06459271970796, "lr": 0.00056, "grad_norm": 0.6608, "tokens_per_sec": 146281, "dt_s": 4.48, "eta_s": 22825, "world_size": 1, "timestamp": "2026-05-05T01:28:25.264366"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40980, "epoch": 0, "train_loss": 3.595591828227043, "train_ppl": 36.43725820579111, "lr": 0.00056, "grad_norm": 0.7381, "tokens_per_sec": 146559, "dt_s": 4.472, "eta_s": 22970, "world_size": 1, "timestamp": "2026-05-05T01:28:29.736043"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 40990, "epoch": 0, "train_loss": 3.7881693989038467, "train_ppl": 44.17545856971811, "lr": 0.00056, "grad_norm": 0.6464, "tokens_per_sec": 132603, "dt_s": 4.942, "eta_s": 23469, "world_size": 1, "timestamp": "2026-05-05T01:28:34.678351"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41000, "epoch": 0, "train_loss": 3.859033942222595, "train_ppl": 47.419519238813706, "lr": 0.00056, "grad_norm": 0.6614, "tokens_per_sec": 144311, "dt_s": 4.541, "eta_s": 23567, "world_size": 1, "timestamp": "2026-05-05T01:28:39.219622"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41010, "epoch": 0, "train_loss": 3.796784430742264, "train_ppl": 44.55767559371807, "lr": 0.00056, "grad_norm": 0.6412, "tokens_per_sec": 124227, "dt_s": 5.276, "eta_s": 23693, "world_size": 1, "timestamp": "2026-05-05T01:28:44.495158"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41020, "epoch": 0, "train_loss": 3.700316622853279, "train_ppl": 40.46011292861708, "lr": 0.00056, "grad_norm": 0.6433, "tokens_per_sec": 140556, "dt_s": 4.663, "eta_s": 23877, "world_size": 1, "timestamp": "2026-05-05T01:28:49.157795"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41030, "epoch": 0, "train_loss": 3.7342098355293274, "train_ppl": 41.854940206714126, "lr": 0.00056, "grad_norm": 0.6686, "tokens_per_sec": 145499, "dt_s": 4.504, "eta_s": 23906, "world_size": 1, "timestamp": "2026-05-05T01:28:53.662003"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41040, "epoch": 0, "train_loss": 3.8610034584999084, "train_ppl": 47.513004784249475, "lr": 0.00056, "grad_norm": 0.6792, "tokens_per_sec": 146657, "dt_s": 4.469, "eta_s": 23412, "world_size": 1, "timestamp": "2026-05-05T01:28:58.130662"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41050, "epoch": 0, "train_loss": 3.820368006825447, "train_ppl": 45.62099406922547, "lr": 0.00056, "grad_norm": 0.6594, "tokens_per_sec": 144760, "dt_s": 4.527, "eta_s": 23393, "world_size": 1, "timestamp": "2026-05-05T01:29:02.657865"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41060, "epoch": 0, "train_loss": 3.819078966975212, "train_ppl": 45.56222467604551, "lr": 0.00056, "grad_norm": 0.7143, "tokens_per_sec": 145797, "dt_s": 4.495, "eta_s": 23390, "world_size": 1, "timestamp": "2026-05-05T01:29:07.152886"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41070, "epoch": 0, "train_loss": 3.906853035092354, "train_ppl": 49.74216854797876, "lr": 0.00056, "grad_norm": 0.6871, "tokens_per_sec": 144903, "dt_s": 4.523, "eta_s": 23241, "world_size": 1, "timestamp": "2026-05-05T01:29:11.675640"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41080, "epoch": 0, "train_loss": 3.734483629465103, "train_ppl": 41.8664014044566, "lr": 0.00056, "grad_norm": 0.7193, "tokens_per_sec": 146863, "dt_s": 4.462, "eta_s": 23194, "world_size": 1, "timestamp": "2026-05-05T01:29:16.137996"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41090, "epoch": 0, "train_loss": 3.825191542506218, "train_ppl": 45.84158013660537, "lr": 0.00056, "grad_norm": 0.6755, "tokens_per_sec": 147111, "dt_s": 4.455, "eta_s": 23175, "world_size": 1, "timestamp": "2026-05-05T01:29:20.592886"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41100, "epoch": 0, "train_loss": 3.815197631716728, "train_ppl": 45.385725155438536, "lr": 0.00056, "grad_norm": 0.6882, "tokens_per_sec": 143728, "dt_s": 4.56, "eta_s": 23204, "world_size": 1, "timestamp": "2026-05-05T01:29:25.152618"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41110, "epoch": 0, "train_loss": 3.7195673286914825, "train_ppl": 41.24654405109898, "lr": 0.00056, "grad_norm": 0.723, "tokens_per_sec": 138432, "dt_s": 4.734, "eta_s": 23446, "world_size": 1, "timestamp": "2026-05-05T01:29:29.886764"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41120, "epoch": 0, "train_loss": 3.8190096765756607, "train_ppl": 45.55906776066649, "lr": 0.00056, "grad_norm": 0.673, "tokens_per_sec": 138059, "dt_s": 4.747, "eta_s": 23673, "world_size": 1, "timestamp": "2026-05-05T01:29:34.633747"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41130, "epoch": 0, "train_loss": 3.7450354546308517, "train_ppl": 42.31050729566263, "lr": 0.00056, "grad_norm": 0.6693, "tokens_per_sec": 135043, "dt_s": 4.853, "eta_s": 24071, "world_size": 1, "timestamp": "2026-05-05T01:29:39.486709"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41140, "epoch": 0, "train_loss": 3.7591782212257385, "train_ppl": 42.913146371907196, "lr": 0.00056, "grad_norm": 0.6982, "tokens_per_sec": 136831, "dt_s": 4.79, "eta_s": 24411, "world_size": 1, "timestamp": "2026-05-05T01:29:44.276276"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41150, "epoch": 0, "train_loss": 3.834163948893547, "train_ppl": 46.254740171142885, "lr": 0.00056, "grad_norm": 0.6415, "tokens_per_sec": 137612, "dt_s": 4.762, "eta_s": 24615, "world_size": 1, "timestamp": "2026-05-05T01:29:49.038646"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41160, "epoch": 0, "train_loss": 3.7749083787202835, "train_ppl": 43.59351403920231, "lr": 0.00056, "grad_norm": 0.6505, "tokens_per_sec": 133457, "dt_s": 4.911, "eta_s": 24792, "world_size": 1, "timestamp": "2026-05-05T01:29:53.949299"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41170, "epoch": 0, "train_loss": 3.7509825527668, "train_ppl": 42.56288173869087, "lr": 0.00056, "grad_norm": 0.6485, "tokens_per_sec": 141718, "dt_s": 4.624, "eta_s": 24661, "world_size": 1, "timestamp": "2026-05-05T01:29:58.573702"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41180, "epoch": 0, "train_loss": 3.764669716358185, "train_ppl": 43.14945194794093, "lr": 0.00056, "grad_norm": 0.6687, "tokens_per_sec": 138946, "dt_s": 4.717, "eta_s": 24516, "world_size": 1, "timestamp": "2026-05-05T01:30:03.290328"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41190, "epoch": 0, "train_loss": 3.666113182902336, "train_ppl": 39.099636993805824, "lr": 0.00056, "grad_norm": 0.6823, "tokens_per_sec": 139692, "dt_s": 4.691, "eta_s": 24410, "world_size": 1, "timestamp": "2026-05-05T01:30:07.981828"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41200, "epoch": 0, "train_loss": 3.8641645163297653, "train_ppl": 47.663433772146995, "lr": 0.00056, "grad_norm": 0.7361, "tokens_per_sec": 138186, "dt_s": 4.743, "eta_s": 24385, "world_size": 1, "timestamp": "2026-05-05T01:30:12.724378"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41210, "epoch": 0, "train_loss": 3.849275901913643, "train_ppl": 46.9590479611036, "lr": 0.00056, "grad_norm": 0.6629, "tokens_per_sec": 137494, "dt_s": 4.766, "eta_s": 24232, "world_size": 1, "timestamp": "2026-05-05T01:30:17.490842"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41220, "epoch": 0, "train_loss": 3.6911692768335342, "train_ppl": 40.091697854632834, "lr": 0.00056, "grad_norm": 0.6696, "tokens_per_sec": 139668, "dt_s": 4.692, "eta_s": 24297, "world_size": 1, "timestamp": "2026-05-05T01:30:22.183095"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41230, "epoch": 0, "train_loss": 3.6813913881778717, "train_ppl": 39.70159599132326, "lr": 0.00056, "grad_norm": 0.6606, "tokens_per_sec": 138515, "dt_s": 4.731, "eta_s": 24308, "world_size": 1, "timestamp": "2026-05-05T01:30:26.914433"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41240, "epoch": 0, "train_loss": 3.7087092250585556, "train_ppl": 40.80110747588674, "lr": 0.00056, "grad_norm": 0.6612, "tokens_per_sec": 138996, "dt_s": 4.715, "eta_s": 24327, "world_size": 1, "timestamp": "2026-05-05T01:30:31.629383"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41250, "epoch": 0, "train_loss": 3.6533622294664383, "train_ppl": 38.60424441897123, "lr": 0.00056, "grad_norm": 0.6627, "tokens_per_sec": 138832, "dt_s": 4.721, "eta_s": 24300, "world_size": 1, "timestamp": "2026-05-05T01:30:36.349946"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41260, "epoch": 0, "train_loss": 3.6930407136678696, "train_ppl": 40.16679718465705, "lr": 0.00056, "grad_norm": 0.615, "tokens_per_sec": 137491, "dt_s": 4.767, "eta_s": 24295, "world_size": 1, "timestamp": "2026-05-05T01:30:41.116507"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41270, "epoch": 0, "train_loss": 3.7019147127866745, "train_ppl": 40.524823520693246, "lr": 0.00056, "grad_norm": 0.6817, "tokens_per_sec": 137642, "dt_s": 4.761, "eta_s": 24361, "world_size": 1, "timestamp": "2026-05-05T01:30:45.877840"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41280, "epoch": 0, "train_loss": 3.7272392213344574, "train_ppl": 41.56420006231387, "lr": 0.00056, "grad_norm": 0.6487, "tokens_per_sec": 122821, "dt_s": 5.336, "eta_s": 24978, "world_size": 1, "timestamp": "2026-05-05T01:30:51.213740"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41290, "epoch": 0, "train_loss": 3.883245140314102, "train_ppl": 48.58161369603359, "lr": 0.00056, "grad_norm": 1.0605, "tokens_per_sec": 140073, "dt_s": 4.679, "eta_s": 24936, "world_size": 1, "timestamp": "2026-05-05T01:30:55.892423"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41300, "epoch": 0, "train_loss": 3.876063659787178, "train_ppl": 48.23397555565647, "lr": 0.00056, "grad_norm": 0.722, "tokens_per_sec": 137511, "dt_s": 4.766, "eta_s": 24978, "world_size": 1, "timestamp": "2026-05-05T01:31:00.658323"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41310, "epoch": 0, "train_loss": 3.7456753253936768, "train_ppl": 42.33758921578184, "lr": 0.00056, "grad_norm": 0.6439, "tokens_per_sec": 137537, "dt_s": 4.765, "eta_s": 24971, "world_size": 1, "timestamp": "2026-05-05T01:31:05.423281"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41320, "epoch": 0, "train_loss": 3.840185835957527, "train_ppl": 46.5341213493096, "lr": 0.00056, "grad_norm": 0.7678, "tokens_per_sec": 144220, "dt_s": 4.544, "eta_s": 24743, "world_size": 1, "timestamp": "2026-05-05T01:31:09.967453"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41330, "epoch": 0, "train_loss": 3.7108757942914963, "train_ppl": 40.88960172986171, "lr": 0.00056, "grad_norm": 0.6657, "tokens_per_sec": 141392, "dt_s": 4.635, "eta_s": 24019, "world_size": 1, "timestamp": "2026-05-05T01:31:14.602521"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41340, "epoch": 0, "train_loss": 3.8283579498529434, "train_ppl": 45.98696330242835, "lr": 0.00056, "grad_norm": 0.6209, "tokens_per_sec": 139575, "dt_s": 4.695, "eta_s": 24031, "world_size": 1, "timestamp": "2026-05-05T01:31:19.297920"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41350, "epoch": 0, "train_loss": 3.7665741592645645, "train_ppl": 43.23170591474857, "lr": 0.00056, "grad_norm": 0.7106, "tokens_per_sec": 138782, "dt_s": 4.722, "eta_s": 23982, "world_size": 1, "timestamp": "2026-05-05T01:31:24.020139"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41360, "epoch": 0, "train_loss": 3.8155355006456375, "train_ppl": 45.401062172589455, "lr": 0.00056, "grad_norm": 0.7495, "tokens_per_sec": 137165, "dt_s": 4.778, "eta_s": 23990, "world_size": 1, "timestamp": "2026-05-05T01:31:28.798046"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41370, "epoch": 0, "train_loss": 3.6950265616178513, "train_ppl": 40.246641589688075, "lr": 0.00056, "grad_norm": 0.6263, "tokens_per_sec": 140021, "dt_s": 4.68, "eta_s": 24125, "world_size": 1, "timestamp": "2026-05-05T01:31:33.478483"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41380, "epoch": 0, "train_loss": 3.7587892413139343, "train_ppl": 42.896457266090096, "lr": 0.00056, "grad_norm": 0.704, "tokens_per_sec": 138046, "dt_s": 4.747, "eta_s": 24236, "world_size": 1, "timestamp": "2026-05-05T01:31:38.225870"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41390, "epoch": 0, "train_loss": 3.722817987203598, "train_ppl": 41.38084063852971, "lr": 0.00056, "grad_norm": 0.669, "tokens_per_sec": 139709, "dt_s": 4.691, "eta_s": 24227, "world_size": 1, "timestamp": "2026-05-05T01:31:42.916792"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41400, "epoch": 0, "train_loss": 3.766217917203903, "train_ppl": 43.21630770565531, "lr": 0.00056, "grad_norm": 0.7007, "tokens_per_sec": 137342, "dt_s": 4.772, "eta_s": 24273, "world_size": 1, "timestamp": "2026-05-05T01:31:47.688522"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41410, "epoch": 0, "train_loss": 3.6930117309093475, "train_ppl": 40.16563305694354, "lr": 0.00056, "grad_norm": 0.6407, "tokens_per_sec": 136955, "dt_s": 4.785, "eta_s": 24275, "world_size": 1, "timestamp": "2026-05-05T01:31:52.473746"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41420, "epoch": 0, "train_loss": 3.8374778479337692, "train_ppl": 46.40827797407894, "lr": 0.00056, "grad_norm": 0.6677, "tokens_per_sec": 138906, "dt_s": 4.718, "eta_s": 24309, "world_size": 1, "timestamp": "2026-05-05T01:31:57.191740"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41430, "epoch": 0, "train_loss": 3.7691687047481537, "train_ppl": 43.34401817871209, "lr": 0.00056, "grad_norm": 0.7494, "tokens_per_sec": 138931, "dt_s": 4.717, "eta_s": 24273, "world_size": 1, "timestamp": "2026-05-05T01:32:01.908921"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41440, "epoch": 0, "train_loss": 3.725674718618393, "train_ppl": 41.49922359960494, "lr": 0.00056, "grad_norm": 0.6423, "tokens_per_sec": 136646, "dt_s": 4.796, "eta_s": 24376, "world_size": 1, "timestamp": "2026-05-05T01:32:06.704956"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41450, "epoch": 0, "train_loss": 3.692156344652176, "train_ppl": 40.1312906165358, "lr": 0.00056, "grad_norm": 0.6699, "tokens_per_sec": 138238, "dt_s": 4.741, "eta_s": 24340, "world_size": 1, "timestamp": "2026-05-05T01:32:11.445759"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41460, "epoch": 0, "train_loss": 3.6771993190050125, "train_ppl": 39.53551251458768, "lr": 0.00056, "grad_norm": 0.7015, "tokens_per_sec": 136050, "dt_s": 4.817, "eta_s": 24368, "world_size": 1, "timestamp": "2026-05-05T01:32:16.262817"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41470, "epoch": 0, "train_loss": 3.8119995445013046, "train_ppl": 45.24080949802539, "lr": 0.00056, "grad_norm": 0.631, "tokens_per_sec": 143662, "dt_s": 4.562, "eta_s": 24203, "world_size": 1, "timestamp": "2026-05-05T01:32:20.824630"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41480, "epoch": 0, "train_loss": 3.7909122556447983, "train_ppl": 44.29679184787213, "lr": 0.00056, "grad_norm": 0.6697, "tokens_per_sec": 139259, "dt_s": 4.706, "eta_s": 24187, "world_size": 1, "timestamp": "2026-05-05T01:32:25.530676"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41490, "epoch": 0, "train_loss": 3.724735379219055, "train_ppl": 41.46026004670107, "lr": 0.00056, "grad_norm": 0.6585, "tokens_per_sec": 135357, "dt_s": 4.842, "eta_s": 24229, "world_size": 1, "timestamp": "2026-05-05T01:32:30.372379"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41500, "epoch": 0, "train_loss": 3.7502860724925995, "train_ppl": 42.5332478520534, "lr": 0.00056, "grad_norm": 0.7288, "tokens_per_sec": 137892, "dt_s": 4.753, "eta_s": 24237, "world_size": 1, "timestamp": "2026-05-05T01:32:35.125074"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41510, "epoch": 0, "train_loss": 3.7102964520454407, "train_ppl": 40.86591951687083, "lr": 0.00056, "grad_norm": 0.6876, "tokens_per_sec": 115943, "dt_s": 5.652, "eta_s": 24260, "world_size": 1, "timestamp": "2026-05-05T01:32:40.777510"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41520, "epoch": 0, "train_loss": 3.755478784441948, "train_ppl": 42.754685188997925, "lr": 0.00056, "grad_norm": 0.6815, "tokens_per_sec": 137131, "dt_s": 4.779, "eta_s": 24478, "world_size": 1, "timestamp": "2026-05-05T01:32:45.556607"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41530, "epoch": 0, "train_loss": 3.7907207757234573, "train_ppl": 44.28831071366281, "lr": 0.00056, "grad_norm": 0.6641, "tokens_per_sec": 136734, "dt_s": 4.793, "eta_s": 24562, "world_size": 1, "timestamp": "2026-05-05T01:32:50.349549"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41540, "epoch": 0, "train_loss": 3.66682793200016, "train_ppl": 39.12759341379521, "lr": 0.00056, "grad_norm": 0.6491, "tokens_per_sec": 137645, "dt_s": 4.761, "eta_s": 24475, "world_size": 1, "timestamp": "2026-05-05T01:32:55.110785"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41550, "epoch": 0, "train_loss": 3.7594904601573944, "train_ppl": 42.92654761897101, "lr": 0.00056, "grad_norm": 0.6842, "tokens_per_sec": 137240, "dt_s": 4.775, "eta_s": 24493, "world_size": 1, "timestamp": "2026-05-05T01:32:59.886080"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41560, "epoch": 0, "train_loss": 3.7909003645181656, "train_ppl": 44.29626511224259, "lr": 0.00056, "grad_norm": 0.6507, "tokens_per_sec": 137062, "dt_s": 4.781, "eta_s": 24423, "world_size": 1, "timestamp": "2026-05-05T01:33:04.667546"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41570, "epoch": 0, "train_loss": 3.751416802406311, "train_ppl": 42.581368668423224, "lr": 0.00056, "grad_norm": 0.7042, "tokens_per_sec": 137992, "dt_s": 4.749, "eta_s": 24388, "world_size": 1, "timestamp": "2026-05-05T01:33:09.416807"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41580, "epoch": 0, "train_loss": 3.742666259407997, "train_ppl": 42.21038409643719, "lr": 0.00056, "grad_norm": 0.6651, "tokens_per_sec": 123460, "dt_s": 5.308, "eta_s": 24910, "world_size": 1, "timestamp": "2026-05-05T01:33:14.725081"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41590, "epoch": 0, "train_loss": 3.6914257407188416, "train_ppl": 40.10198124583607, "lr": 0.00056, "grad_norm": 0.6782, "tokens_per_sec": 141311, "dt_s": 4.638, "eta_s": 24779, "world_size": 1, "timestamp": "2026-05-05T01:33:19.362784"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41600, "epoch": 0, "train_loss": 3.8142379075288773, "train_ppl": 45.34218827226534, "lr": 0.00056, "grad_norm": 0.709, "tokens_per_sec": 141195, "dt_s": 4.642, "eta_s": 24638, "world_size": 1, "timestamp": "2026-05-05T01:33:24.004307"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41610, "epoch": 0, "train_loss": 3.8086729496717453, "train_ppl": 45.09056170039396, "lr": 0.00056, "grad_norm": 0.6887, "tokens_per_sec": 146009, "dt_s": 4.488, "eta_s": 24334, "world_size": 1, "timestamp": "2026-05-05T01:33:28.492785"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41620, "epoch": 0, "train_loss": 3.716432496905327, "train_ppl": 41.11744553053418, "lr": 0.00056, "grad_norm": 0.672, "tokens_per_sec": 141715, "dt_s": 4.625, "eta_s": 24201, "world_size": 1, "timestamp": "2026-05-05T01:33:33.117295"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41630, "epoch": 0, "train_loss": 3.76730440557003, "train_ppl": 43.263287237940666, "lr": 0.00056, "grad_norm": 0.6997, "tokens_per_sec": 141281, "dt_s": 4.639, "eta_s": 23513, "world_size": 1, "timestamp": "2026-05-05T01:33:37.755991"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41640, "epoch": 0, "train_loss": 3.7144211381673813, "train_ppl": 41.03482671307785, "lr": 0.00056, "grad_norm": 0.6563, "tokens_per_sec": 136956, "dt_s": 4.785, "eta_s": 23659, "world_size": 1, "timestamp": "2026-05-05T01:33:42.541208"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41650, "epoch": 0, "train_loss": 3.6894447803497314, "train_ppl": 40.022619442511534, "lr": 0.00056, "grad_norm": 0.709, "tokens_per_sec": 139651, "dt_s": 4.693, "eta_s": 23707, "world_size": 1, "timestamp": "2026-05-05T01:33:47.234039"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41660, "epoch": 0, "train_loss": 3.901151403784752, "train_ppl": 49.45936403214943, "lr": 0.00056, "grad_norm": 0.7072, "tokens_per_sec": 139623, "dt_s": 4.694, "eta_s": 23912, "world_size": 1, "timestamp": "2026-05-05T01:33:51.927816"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41670, "epoch": 0, "train_loss": 3.686312511563301, "train_ppl": 39.89745396916753, "lr": 0.00056, "grad_norm": 0.677, "tokens_per_sec": 141500, "dt_s": 4.632, "eta_s": 23914, "world_size": 1, "timestamp": "2026-05-05T01:33:56.559349"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41680, "epoch": 0, "train_loss": 3.695994809269905, "train_ppl": 40.2856291776847, "lr": 0.00056, "grad_norm": 0.803, "tokens_per_sec": 141826, "dt_s": 4.621, "eta_s": 23891, "world_size": 1, "timestamp": "2026-05-05T01:34:01.180225"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41690, "epoch": 0, "train_loss": 3.748323455452919, "train_ppl": 42.44985323768034, "lr": 0.00056, "grad_norm": 0.6428, "tokens_per_sec": 138406, "dt_s": 4.735, "eta_s": 23835, "world_size": 1, "timestamp": "2026-05-05T01:34:05.915274"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41700, "epoch": 0, "train_loss": 3.722140744328499, "train_ppl": 41.352825246724684, "lr": 0.00056, "grad_norm": 0.6464, "tokens_per_sec": 140400, "dt_s": 4.668, "eta_s": 23805, "world_size": 1, "timestamp": "2026-05-05T01:34:10.583061"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41710, "epoch": 0, "train_loss": 3.7069854140281677, "train_ppl": 40.73083466269447, "lr": 0.00056, "grad_norm": 0.6645, "tokens_per_sec": 137738, "dt_s": 4.758, "eta_s": 23866, "world_size": 1, "timestamp": "2026-05-05T01:34:15.341066"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41720, "epoch": 0, "train_loss": 3.7562233805656433, "train_ppl": 42.78653201690039, "lr": 0.00056, "grad_norm": 0.6588, "tokens_per_sec": 138308, "dt_s": 4.738, "eta_s": 23970, "world_size": 1, "timestamp": "2026-05-05T01:34:20.079496"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41730, "epoch": 0, "train_loss": 3.7689764946699142, "train_ppl": 43.335687822201635, "lr": 0.00056, "grad_norm": 0.7888, "tokens_per_sec": 142839, "dt_s": 4.588, "eta_s": 23932, "world_size": 1, "timestamp": "2026-05-05T01:34:24.667605"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41740, "epoch": 0, "train_loss": 3.7208553701639175, "train_ppl": 41.29970554018295, "lr": 0.00056, "grad_norm": 0.6769, "tokens_per_sec": 148724, "dt_s": 4.407, "eta_s": 23593, "world_size": 1, "timestamp": "2026-05-05T01:34:29.074146"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41750, "epoch": 0, "train_loss": 3.749577969312668, "train_ppl": 42.5031405847804, "lr": 0.00056, "grad_norm": 0.7313, "tokens_per_sec": 150828, "dt_s": 4.345, "eta_s": 23259, "world_size": 1, "timestamp": "2026-05-05T01:34:33.419324"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41760, "epoch": 0, "train_loss": 3.781807914376259, "train_ppl": 43.89532903806925, "lr": 0.00056, "grad_norm": 0.7234, "tokens_per_sec": 153262, "dt_s": 4.276, "eta_s": 22764, "world_size": 1, "timestamp": "2026-05-05T01:34:37.695341"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41770, "epoch": 0, "train_loss": 3.761157810688019, "train_ppl": 42.99818092323992, "lr": 0.00056, "grad_norm": 0.713, "tokens_per_sec": 148843, "dt_s": 4.403, "eta_s": 22418, "world_size": 1, "timestamp": "2026-05-05T01:34:42.098345"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41780, "epoch": 0, "train_loss": 3.7450381070375443, "train_ppl": 42.31061952048418, "lr": 0.00056, "grad_norm": 0.7019, "tokens_per_sec": 150139, "dt_s": 4.365, "eta_s": 22186, "world_size": 1, "timestamp": "2026-05-05T01:34:46.463359"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41790, "epoch": 0, "train_loss": 3.767055705189705, "train_ppl": 43.25252897979736, "lr": 0.00056, "grad_norm": 0.6886, "tokens_per_sec": 152273, "dt_s": 4.304, "eta_s": 22078, "world_size": 1, "timestamp": "2026-05-05T01:34:50.767211"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41800, "epoch": 0, "train_loss": 3.8344330489635468, "train_ppl": 46.26718899987598, "lr": 0.00056, "grad_norm": 0.7246, "tokens_per_sec": 149610, "dt_s": 4.38, "eta_s": 22109, "world_size": 1, "timestamp": "2026-05-05T01:34:55.147642"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41810, "epoch": 0, "train_loss": 3.72550630569458, "train_ppl": 41.492235182508956, "lr": 0.00056, "grad_norm": 0.6153, "tokens_per_sec": 152187, "dt_s": 4.306, "eta_s": 22136, "world_size": 1, "timestamp": "2026-05-05T01:34:59.453912"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41820, "epoch": 0, "train_loss": 3.8169268518686295, "train_ppl": 45.46427496136632, "lr": 0.00056, "grad_norm": 0.684, "tokens_per_sec": 152016, "dt_s": 4.311, "eta_s": 22038, "world_size": 1, "timestamp": "2026-05-05T01:35:03.765032"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41830, "epoch": 0, "train_loss": 3.664119616150856, "train_ppl": 39.02176690290209, "lr": 0.00056, "grad_norm": 0.6427, "tokens_per_sec": 151889, "dt_s": 4.315, "eta_s": 21982, "world_size": 1, "timestamp": "2026-05-05T01:35:08.079788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41840, "epoch": 0, "train_loss": 3.7835065722465515, "train_ppl": 43.9699555487195, "lr": 0.00056, "grad_norm": 0.6718, "tokens_per_sec": 151624, "dt_s": 4.322, "eta_s": 21997, "world_size": 1, "timestamp": "2026-05-05T01:35:12.402037"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41850, "epoch": 0, "train_loss": 3.755425989627838, "train_ppl": 42.75242802292491, "lr": 0.00056, "grad_norm": 0.6551, "tokens_per_sec": 150169, "dt_s": 4.364, "eta_s": 21976, "world_size": 1, "timestamp": "2026-05-05T01:35:16.766211"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41860, "epoch": 0, "train_loss": 3.7068412601947784, "train_ppl": 40.724963579920406, "lr": 0.00056, "grad_norm": 0.6665, "tokens_per_sec": 153125, "dt_s": 4.28, "eta_s": 21945, "world_size": 1, "timestamp": "2026-05-05T01:35:21.046101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41870, "epoch": 0, "train_loss": 3.6965687572956085, "train_ppl": 40.30875767165726, "lr": 0.00056, "grad_norm": 0.6987, "tokens_per_sec": 138429, "dt_s": 4.734, "eta_s": 22370, "world_size": 1, "timestamp": "2026-05-05T01:35:25.780375"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41880, "epoch": 0, "train_loss": 3.799969792366028, "train_ppl": 44.699834196676264, "lr": 0.00056, "grad_norm": 0.6403, "tokens_per_sec": 152765, "dt_s": 4.29, "eta_s": 22341, "world_size": 1, "timestamp": "2026-05-05T01:35:30.070358"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41890, "epoch": 0, "train_loss": 3.9484345465898514, "train_ppl": 51.85412804207572, "lr": 0.00056, "grad_norm": 0.8493, "tokens_per_sec": 155403, "dt_s": 4.217, "eta_s": 22230, "world_size": 1, "timestamp": "2026-05-05T01:35:34.287512"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41900, "epoch": 0, "train_loss": 3.8177090138196945, "train_ppl": 45.49984929800651, "lr": 0.00056, "grad_norm": 0.6894, "tokens_per_sec": 153120, "dt_s": 4.28, "eta_s": 22140, "world_size": 1, "timestamp": "2026-05-05T01:35:38.567568"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41910, "epoch": 0, "train_loss": 3.7582711279392242, "train_ppl": 42.874237794452625, "lr": 0.00056, "grad_norm": 0.7166, "tokens_per_sec": 150585, "dt_s": 4.352, "eta_s": 22209, "world_size": 1, "timestamp": "2026-05-05T01:35:42.919673"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41920, "epoch": 0, "train_loss": 3.719882160425186, "train_ppl": 41.259531816444934, "lr": 0.00056, "grad_norm": 0.7481, "tokens_per_sec": 154429, "dt_s": 4.244, "eta_s": 21707, "world_size": 1, "timestamp": "2026-05-05T01:35:47.163450"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41930, "epoch": 0, "train_loss": 3.911430850625038, "train_ppl": 49.97040102429732, "lr": 0.00056, "grad_norm": 0.7157, "tokens_per_sec": 152409, "dt_s": 4.3, "eta_s": 21712, "world_size": 1, "timestamp": "2026-05-05T01:35:51.463452"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41940, "epoch": 0, "train_loss": 3.7349455058574677, "train_ppl": 41.88574297326174, "lr": 0.00056, "grad_norm": 0.7086, "tokens_per_sec": 153869, "dt_s": 4.259, "eta_s": 21751, "world_size": 1, "timestamp": "2026-05-05T01:35:55.722652"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41950, "epoch": 0, "train_loss": 3.7399677336215973, "train_ppl": 42.09663183723052, "lr": 0.00056, "grad_norm": 0.6621, "tokens_per_sec": 155351, "dt_s": 4.219, "eta_s": 21684, "world_size": 1, "timestamp": "2026-05-05T01:35:59.941254"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41960, "epoch": 0, "train_loss": 3.8854693323373795, "train_ppl": 48.68978879018957, "lr": 0.00056, "grad_norm": 0.7058, "tokens_per_sec": 154709, "dt_s": 4.236, "eta_s": 21562, "world_size": 1, "timestamp": "2026-05-05T01:36:04.177329"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41970, "epoch": 0, "train_loss": 3.7735189646482468, "train_ppl": 43.53298665588298, "lr": 0.00056, "grad_norm": 0.9382, "tokens_per_sec": 152617, "dt_s": 4.294, "eta_s": 21609, "world_size": 1, "timestamp": "2026-05-05T01:36:08.471464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41980, "epoch": 0, "train_loss": 3.8446880131959915, "train_ppl": 46.74409853383144, "lr": 0.00056, "grad_norm": 0.6528, "tokens_per_sec": 154626, "dt_s": 4.238, "eta_s": 21542, "world_size": 1, "timestamp": "2026-05-05T01:36:12.709810"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 41990, "epoch": 0, "train_loss": 3.8024161607027054, "train_ppl": 44.80932032280225, "lr": 0.00056, "grad_norm": 0.7456, "tokens_per_sec": 152616, "dt_s": 4.294, "eta_s": 21573, "world_size": 1, "timestamp": "2026-05-05T01:36:17.003998"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42000, "epoch": 0, "train_loss": 3.797855034470558, "train_ppl": 44.605404752281494, "lr": 0.00056, "grad_norm": 0.7407, "tokens_per_sec": 153312, "dt_s": 4.275, "eta_s": 21626, "world_size": 1, "timestamp": "2026-05-05T01:36:21.278689"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42010, "epoch": 0, "train_loss": 3.8562292009592056, "train_ppl": 47.28670609682968, "lr": 0.00056, "grad_norm": 0.7041, "tokens_per_sec": 130106, "dt_s": 5.037, "eta_s": 21662, "world_size": 1, "timestamp": "2026-05-05T01:36:26.315786"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42020, "epoch": 0, "train_loss": 3.697695568203926, "train_ppl": 40.35420361918855, "lr": 0.00056, "grad_norm": 0.7015, "tokens_per_sec": 150852, "dt_s": 4.344, "eta_s": 21709, "world_size": 1, "timestamp": "2026-05-05T01:36:30.660170"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42030, "epoch": 0, "train_loss": 3.7000424414873123, "train_ppl": 40.449021040251175, "lr": 0.00056, "grad_norm": 0.7011, "tokens_per_sec": 155360, "dt_s": 4.218, "eta_s": 21684, "world_size": 1, "timestamp": "2026-05-05T01:36:34.878492"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42040, "epoch": 0, "train_loss": 3.6683476865291595, "train_ppl": 39.18710295959167, "lr": 0.00056, "grad_norm": 0.6194, "tokens_per_sec": 153050, "dt_s": 4.282, "eta_s": 21668, "world_size": 1, "timestamp": "2026-05-05T01:36:39.160519"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42050, "epoch": 0, "train_loss": 3.8339344263076782, "train_ppl": 46.24412488184104, "lr": 0.00056, "grad_norm": 0.7762, "tokens_per_sec": 150541, "dt_s": 4.353, "eta_s": 21743, "world_size": 1, "timestamp": "2026-05-05T01:36:43.513872"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42060, "epoch": 0, "train_loss": 3.7501494586467743, "train_ppl": 42.527437618377114, "lr": 0.00056, "grad_norm": 0.7201, "tokens_per_sec": 153203, "dt_s": 4.278, "eta_s": 21741, "world_size": 1, "timestamp": "2026-05-05T01:36:47.791594"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42070, "epoch": 0, "train_loss": 3.7315246909856796, "train_ppl": 41.742704394517745, "lr": 0.00056, "grad_norm": 0.6683, "tokens_per_sec": 153027, "dt_s": 4.283, "eta_s": 21674, "world_size": 1, "timestamp": "2026-05-05T01:36:52.074250"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42080, "epoch": 0, "train_loss": 3.8030566573143005, "train_ppl": 44.83802973379565, "lr": 0.00056, "grad_norm": 0.7289, "tokens_per_sec": 153258, "dt_s": 4.276, "eta_s": 21728, "world_size": 1, "timestamp": "2026-05-05T01:36:56.350436"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42090, "epoch": 0, "train_loss": 3.798351749777794, "train_ppl": 44.62756644317729, "lr": 0.00056, "grad_norm": 0.7332, "tokens_per_sec": 153990, "dt_s": 4.256, "eta_s": 21697, "world_size": 1, "timestamp": "2026-05-05T01:37:00.606301"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42100, "epoch": 0, "train_loss": 3.9110110104084015, "train_ppl": 49.94942584372613, "lr": 0.00056, "grad_norm": 0.7146, "tokens_per_sec": 151472, "dt_s": 4.327, "eta_s": 21666, "world_size": 1, "timestamp": "2026-05-05T01:37:04.932891"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42110, "epoch": 0, "train_loss": 3.7092500925064087, "train_ppl": 40.823181435761846, "lr": 0.00056, "grad_norm": 0.6241, "tokens_per_sec": 153550, "dt_s": 4.268, "eta_s": 21652, "world_size": 1, "timestamp": "2026-05-05T01:37:09.200950"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42120, "epoch": 0, "train_loss": 3.7339095175266266, "train_ppl": 41.84237230194657, "lr": 0.00056, "grad_norm": 0.6698, "tokens_per_sec": 151747, "dt_s": 4.319, "eta_s": 21684, "world_size": 1, "timestamp": "2026-05-05T01:37:13.519728"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42130, "epoch": 0, "train_loss": 3.789750710129738, "train_ppl": 44.24536897875276, "lr": 0.00056, "grad_norm": 0.6429, "tokens_per_sec": 150765, "dt_s": 4.347, "eta_s": 21751, "world_size": 1, "timestamp": "2026-05-05T01:37:17.866623"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42140, "epoch": 0, "train_loss": 3.7576962262392044, "train_ppl": 42.84959640612425, "lr": 0.00056, "grad_norm": 0.6715, "tokens_per_sec": 153351, "dt_s": 4.274, "eta_s": 21765, "world_size": 1, "timestamp": "2026-05-05T01:37:22.140228"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42150, "epoch": 0, "train_loss": 3.683459922671318, "train_ppl": 39.78380510896067, "lr": 0.00056, "grad_norm": 0.6877, "tokens_per_sec": 152231, "dt_s": 4.305, "eta_s": 21739, "world_size": 1, "timestamp": "2026-05-05T01:37:26.445266"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42160, "epoch": 0, "train_loss": 3.6429506093263626, "train_ppl": 38.20439683288748, "lr": 0.00056, "grad_norm": 0.6041, "tokens_per_sec": 150615, "dt_s": 4.351, "eta_s": 21819, "world_size": 1, "timestamp": "2026-05-05T01:37:30.796473"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42170, "epoch": 0, "train_loss": 3.8340113759040833, "train_ppl": 46.247683485501604, "lr": 0.00056, "grad_norm": 0.7097, "tokens_per_sec": 137070, "dt_s": 4.781, "eta_s": 22282, "world_size": 1, "timestamp": "2026-05-05T01:37:35.577694"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42180, "epoch": 0, "train_loss": 3.864342287182808, "train_ppl": 47.6719076946135, "lr": 0.00056, "grad_norm": 0.702, "tokens_per_sec": 153925, "dt_s": 4.258, "eta_s": 22187, "world_size": 1, "timestamp": "2026-05-05T01:37:39.835366"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42190, "epoch": 0, "train_loss": 3.8385760486125946, "train_ppl": 46.45927157193003, "lr": 0.00056, "grad_norm": 0.7179, "tokens_per_sec": 150996, "dt_s": 4.34, "eta_s": 22250, "world_size": 1, "timestamp": "2026-05-05T01:37:44.175595"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42200, "epoch": 0, "train_loss": 3.790954440832138, "train_ppl": 44.298660555750395, "lr": 0.00056, "grad_norm": 0.6933, "tokens_per_sec": 154385, "dt_s": 4.245, "eta_s": 22185, "world_size": 1, "timestamp": "2026-05-05T01:37:48.420603"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42210, "epoch": 0, "train_loss": 3.8961776345968246, "train_ppl": 49.213975330529806, "lr": 0.00056, "grad_norm": 0.7326, "tokens_per_sec": 152422, "dt_s": 4.3, "eta_s": 22128, "world_size": 1, "timestamp": "2026-05-05T01:37:52.720245"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42220, "epoch": 0, "train_loss": 3.8055823743343353, "train_ppl": 44.95142104554287, "lr": 0.00056, "grad_norm": 0.9762, "tokens_per_sec": 153363, "dt_s": 4.273, "eta_s": 21611, "world_size": 1, "timestamp": "2026-05-05T01:37:56.993487"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42230, "epoch": 0, "train_loss": 3.67911559343338, "train_ppl": 39.6113460419507, "lr": 0.00056, "grad_norm": 0.6577, "tokens_per_sec": 153484, "dt_s": 4.27, "eta_s": 21619, "world_size": 1, "timestamp": "2026-05-05T01:38:01.263388"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42240, "epoch": 0, "train_loss": 3.81455697119236, "train_ppl": 45.35665762516543, "lr": 0.00056, "grad_norm": 0.7328, "tokens_per_sec": 151973, "dt_s": 4.312, "eta_s": 21587, "world_size": 1, "timestamp": "2026-05-05T01:38:05.575715"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42250, "epoch": 0, "train_loss": 3.726580485701561, "train_ppl": 41.53682925873156, "lr": 0.00056, "grad_norm": 0.6855, "tokens_per_sec": 153178, "dt_s": 4.278, "eta_s": 21616, "world_size": 1, "timestamp": "2026-05-05T01:38:09.854143"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42260, "epoch": 0, "train_loss": 3.684683158993721, "train_ppl": 39.83249988094839, "lr": 0.00056, "grad_norm": 0.678, "tokens_per_sec": 153479, "dt_s": 4.27, "eta_s": 21582, "world_size": 1, "timestamp": "2026-05-05T01:38:14.124202"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42270, "epoch": 0, "train_loss": 3.7404442876577377, "train_ppl": 42.11669793795136, "lr": 0.00056, "grad_norm": 0.6497, "tokens_per_sec": 149997, "dt_s": 4.369, "eta_s": 21675, "world_size": 1, "timestamp": "2026-05-05T01:38:18.493338"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42280, "epoch": 0, "train_loss": 3.794895976781845, "train_ppl": 44.473609876877354, "lr": 0.00056, "grad_norm": 0.669, "tokens_per_sec": 154055, "dt_s": 4.254, "eta_s": 21654, "world_size": 1, "timestamp": "2026-05-05T01:38:22.747383"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42290, "epoch": 0, "train_loss": 3.716124504804611, "train_ppl": 41.1047836320916, "lr": 0.00056, "grad_norm": 0.6755, "tokens_per_sec": 153170, "dt_s": 4.279, "eta_s": 21616, "world_size": 1, "timestamp": "2026-05-05T01:38:27.026007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42300, "epoch": 0, "train_loss": 3.737644001841545, "train_ppl": 41.99892412322355, "lr": 0.00056, "grad_norm": 0.6428, "tokens_per_sec": 150325, "dt_s": 4.36, "eta_s": 21694, "world_size": 1, "timestamp": "2026-05-05T01:38:31.385625"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42310, "epoch": 0, "train_loss": 3.6787009984254837, "train_ppl": 39.594926779532955, "lr": 0.00056, "grad_norm": 0.6582, "tokens_per_sec": 154458, "dt_s": 4.243, "eta_s": 21662, "world_size": 1, "timestamp": "2026-05-05T01:38:35.628629"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42320, "epoch": 0, "train_loss": 3.6019200831651688, "train_ppl": 36.668573604417325, "lr": 0.00056, "grad_norm": 0.6636, "tokens_per_sec": 152942, "dt_s": 4.285, "eta_s": 21573, "world_size": 1, "timestamp": "2026-05-05T01:38:39.913630"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42330, "epoch": 0, "train_loss": 3.875534400343895, "train_ppl": 48.2084540229584, "lr": 0.00056, "grad_norm": 0.6777, "tokens_per_sec": 149734, "dt_s": 4.377, "eta_s": 21692, "world_size": 1, "timestamp": "2026-05-05T01:38:44.290444"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42340, "epoch": 0, "train_loss": 3.6822711378335953, "train_ppl": 39.73653882494753, "lr": 0.00056, "grad_norm": 0.6453, "tokens_per_sec": 150262, "dt_s": 4.361, "eta_s": 21772, "world_size": 1, "timestamp": "2026-05-05T01:38:48.651898"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42350, "epoch": 0, "train_loss": 3.776416704058647, "train_ppl": 43.659316854567464, "lr": 0.00056, "grad_norm": 0.6809, "tokens_per_sec": 151776, "dt_s": 4.318, "eta_s": 21725, "world_size": 1, "timestamp": "2026-05-05T01:38:52.969825"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42360, "epoch": 0, "train_loss": 3.789071723818779, "train_ppl": 44.21533717563824, "lr": 0.00056, "grad_norm": 0.6428, "tokens_per_sec": 151802, "dt_s": 4.317, "eta_s": 21796, "world_size": 1, "timestamp": "2026-05-05T01:38:57.287020"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42370, "epoch": 0, "train_loss": 3.9161724895238876, "train_ppl": 50.20790525625111, "lr": 0.00056, "grad_norm": 0.6356, "tokens_per_sec": 153237, "dt_s": 4.277, "eta_s": 21783, "world_size": 1, "timestamp": "2026-05-05T01:39:01.563822"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42380, "epoch": 0, "train_loss": 3.6849477738142014, "train_ppl": 39.843041545432484, "lr": 0.00056, "grad_norm": 0.6925, "tokens_per_sec": 150511, "dt_s": 4.354, "eta_s": 21756, "world_size": 1, "timestamp": "2026-05-05T01:39:05.918032"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42390, "epoch": 0, "train_loss": 3.6482185274362564, "train_ppl": 38.40618550328435, "lr": 0.00056, "grad_norm": 0.6687, "tokens_per_sec": 154276, "dt_s": 4.248, "eta_s": 21637, "world_size": 1, "timestamp": "2026-05-05T01:39:10.166015"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42400, "epoch": 0, "train_loss": 3.8545909225940704, "train_ppl": 47.20930073235244, "lr": 0.00056, "grad_norm": 0.6724, "tokens_per_sec": 152660, "dt_s": 4.293, "eta_s": 21608, "world_size": 1, "timestamp": "2026-05-05T01:39:14.458947"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42410, "epoch": 0, "train_loss": 3.6812397837638855, "train_ppl": 39.69557751035435, "lr": 0.00056, "grad_norm": 0.7025, "tokens_per_sec": 151771, "dt_s": 4.318, "eta_s": 21605, "world_size": 1, "timestamp": "2026-05-05T01:39:18.777014"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42420, "epoch": 0, "train_loss": 3.6218714863061905, "train_ppl": 37.407509996435735, "lr": 0.00056, "grad_norm": 0.6423, "tokens_per_sec": 153721, "dt_s": 4.263, "eta_s": 21587, "world_size": 1, "timestamp": "2026-05-05T01:39:23.040343"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42430, "epoch": 0, "train_loss": 3.8049991577863693, "train_ppl": 44.925212276371354, "lr": 0.00056, "grad_norm": 0.7289, "tokens_per_sec": 154471, "dt_s": 4.243, "eta_s": 21470, "world_size": 1, "timestamp": "2026-05-05T01:39:27.282930"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42440, "epoch": 0, "train_loss": 3.8648001551628113, "train_ppl": 47.69374013249824, "lr": 0.00056, "grad_norm": 0.7114, "tokens_per_sec": 151340, "dt_s": 4.33, "eta_s": 21549, "world_size": 1, "timestamp": "2026-05-05T01:39:31.613314"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42450, "epoch": 0, "train_loss": 3.673583835363388, "train_ppl": 39.39283060331086, "lr": 0.00056, "grad_norm": 0.7445, "tokens_per_sec": 153283, "dt_s": 4.275, "eta_s": 21527, "world_size": 1, "timestamp": "2026-05-05T01:39:35.888816"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42460, "epoch": 0, "train_loss": 3.6973592191934586, "train_ppl": 40.340632805125914, "lr": 0.00056, "grad_norm": 0.6442, "tokens_per_sec": 135937, "dt_s": 4.821, "eta_s": 22028, "world_size": 1, "timestamp": "2026-05-05T01:39:40.709845"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42470, "epoch": 0, "train_loss": 3.809441328048706, "train_ppl": 45.125221627273454, "lr": 0.00056, "grad_norm": 0.7506, "tokens_per_sec": 151761, "dt_s": 4.318, "eta_s": 22079, "world_size": 1, "timestamp": "2026-05-05T01:39:45.028237"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42480, "epoch": 0, "train_loss": 3.747459754347801, "train_ppl": 42.41320508133167, "lr": 0.00056, "grad_norm": 0.6815, "tokens_per_sec": 152094, "dt_s": 4.309, "eta_s": 22141, "world_size": 1, "timestamp": "2026-05-05T01:39:49.337125"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42490, "epoch": 0, "train_loss": 3.8871705532073975, "train_ppl": 48.77269137284371, "lr": 0.00056, "grad_norm": 0.6659, "tokens_per_sec": 148399, "dt_s": 4.416, "eta_s": 22223, "world_size": 1, "timestamp": "2026-05-05T01:39:53.753366"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42500, "epoch": 0, "train_loss": 3.7025416493415833, "train_ppl": 40.550237979733865, "lr": 0.00056, "grad_norm": 0.6156, "tokens_per_sec": 152221, "dt_s": 4.305, "eta_s": 22248, "world_size": 1, "timestamp": "2026-05-05T01:39:58.058678"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42510, "epoch": 0, "train_loss": 3.7126626074314117, "train_ppl": 40.96272912055367, "lr": 0.00056, "grad_norm": 0.6567, "tokens_per_sec": 128788, "dt_s": 5.089, "eta_s": 21743, "world_size": 1, "timestamp": "2026-05-05T01:40:03.147379"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42520, "epoch": 0, "train_loss": 3.7487350404262543, "train_ppl": 42.46732855543566, "lr": 0.00056, "grad_norm": 0.6736, "tokens_per_sec": 148388, "dt_s": 4.417, "eta_s": 21837, "world_size": 1, "timestamp": "2026-05-05T01:40:07.563859"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42530, "epoch": 0, "train_loss": 3.8768966048955917, "train_ppl": 48.274168546608685, "lr": 0.00056, "grad_norm": 0.8469, "tokens_per_sec": 150054, "dt_s": 4.367, "eta_s": 21891, "world_size": 1, "timestamp": "2026-05-05T01:40:11.931365"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42540, "epoch": 0, "train_loss": 3.722558304667473, "train_ppl": 41.370096152023834, "lr": 0.00056, "grad_norm": 0.6453, "tokens_per_sec": 153455, "dt_s": 4.271, "eta_s": 21741, "world_size": 1, "timestamp": "2026-05-05T01:40:16.202062"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42550, "epoch": 0, "train_loss": 3.7959418445825577, "train_ppl": 44.5201477254056, "lr": 0.00056, "grad_norm": 0.6395, "tokens_per_sec": 147648, "dt_s": 4.439, "eta_s": 21870, "world_size": 1, "timestamp": "2026-05-05T01:40:20.640714"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42560, "epoch": 0, "train_loss": 3.6923203617334366, "train_ppl": 40.137873373517486, "lr": 0.00056, "grad_norm": 0.6238, "tokens_per_sec": 151718, "dt_s": 4.32, "eta_s": 21864, "world_size": 1, "timestamp": "2026-05-05T01:40:24.960315"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42570, "epoch": 0, "train_loss": 3.787553369998932, "train_ppl": 44.148253590738335, "lr": 0.00056, "grad_norm": 0.6714, "tokens_per_sec": 150355, "dt_s": 4.359, "eta_s": 21802, "world_size": 1, "timestamp": "2026-05-05T01:40:29.319075"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42580, "epoch": 0, "train_loss": 3.760437473654747, "train_ppl": 42.96721889404453, "lr": 0.00056, "grad_norm": 0.7638, "tokens_per_sec": 151184, "dt_s": 4.335, "eta_s": 21765, "world_size": 1, "timestamp": "2026-05-05T01:40:33.653915"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42590, "epoch": 0, "train_loss": 3.961343616247177, "train_ppl": 52.527855836468355, "lr": 0.00056, "grad_norm": 1.116, "tokens_per_sec": 153028, "dt_s": 4.283, "eta_s": 21772, "world_size": 1, "timestamp": "2026-05-05T01:40:37.936538"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42600, "epoch": 0, "train_loss": 3.813721239566803, "train_ppl": 45.31876746716554, "lr": 0.00056, "grad_norm": 0.6758, "tokens_per_sec": 149376, "dt_s": 4.387, "eta_s": 21716, "world_size": 1, "timestamp": "2026-05-05T01:40:42.323863"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42610, "epoch": 0, "train_loss": 3.587726227939129, "train_ppl": 36.15178149183195, "lr": 0.00056, "grad_norm": 0.917, "tokens_per_sec": 152367, "dt_s": 4.301, "eta_s": 21694, "world_size": 1, "timestamp": "2026-05-05T01:40:46.625055"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42620, "epoch": 0, "train_loss": 3.712956964969635, "train_ppl": 40.974788583466335, "lr": 0.00056, "grad_norm": 0.6367, "tokens_per_sec": 153661, "dt_s": 4.265, "eta_s": 21595, "world_size": 1, "timestamp": "2026-05-05T01:40:50.890016"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42630, "epoch": 0, "train_loss": 3.7157344073057175, "train_ppl": 41.08875188597949, "lr": 0.00056, "grad_norm": 0.6134, "tokens_per_sec": 149822, "dt_s": 4.374, "eta_s": 21631, "world_size": 1, "timestamp": "2026-05-05T01:40:55.264293"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42640, "epoch": 0, "train_loss": 3.68285271525383, "train_ppl": 39.75965542007972, "lr": 0.00056, "grad_norm": 0.6518, "tokens_per_sec": 152378, "dt_s": 4.301, "eta_s": 21644, "world_size": 1, "timestamp": "2026-05-05T01:40:59.565172"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42650, "epoch": 0, "train_loss": 3.768304154276848, "train_ppl": 43.306561281389605, "lr": 0.00056, "grad_norm": 0.6977, "tokens_per_sec": 152748, "dt_s": 4.29, "eta_s": 21543, "world_size": 1, "timestamp": "2026-05-05T01:41:03.855635"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42660, "epoch": 0, "train_loss": 3.892968401312828, "train_ppl": 49.05628936372413, "lr": 0.00056, "grad_norm": 0.6737, "tokens_per_sec": 150532, "dt_s": 4.354, "eta_s": 21591, "world_size": 1, "timestamp": "2026-05-05T01:41:08.209280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42670, "epoch": 0, "train_loss": 3.7533354461193085, "train_ppl": 42.66314556899141, "lr": 0.00056, "grad_norm": 0.6667, "tokens_per_sec": 152747, "dt_s": 4.29, "eta_s": 21613, "world_size": 1, "timestamp": "2026-05-05T01:41:12.499759"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42680, "epoch": 0, "train_loss": 3.814903646707535, "train_ppl": 45.37238439369953, "lr": 0.00056, "grad_norm": 0.6946, "tokens_per_sec": 152436, "dt_s": 4.299, "eta_s": 21533, "world_size": 1, "timestamp": "2026-05-05T01:41:16.799001"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42690, "epoch": 0, "train_loss": 3.8788911402225494, "train_ppl": 48.37054916647844, "lr": 0.00056, "grad_norm": 0.6884, "tokens_per_sec": 151364, "dt_s": 4.33, "eta_s": 21558, "world_size": 1, "timestamp": "2026-05-05T01:41:21.128693"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42700, "epoch": 0, "train_loss": 3.759980410337448, "train_ppl": 42.947584641831746, "lr": 0.00056, "grad_norm": 0.6873, "tokens_per_sec": 154101, "dt_s": 4.253, "eta_s": 21516, "world_size": 1, "timestamp": "2026-05-05T01:41:25.381488"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42710, "epoch": 0, "train_loss": 3.8038690984249115, "train_ppl": 44.87447279438594, "lr": 0.00056, "grad_norm": 0.6563, "tokens_per_sec": 152462, "dt_s": 4.299, "eta_s": 21456, "world_size": 1, "timestamp": "2026-05-05T01:41:29.680017"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42720, "epoch": 0, "train_loss": 3.7226999700069427, "train_ppl": 41.37595727588836, "lr": 0.00056, "grad_norm": 0.6679, "tokens_per_sec": 151483, "dt_s": 4.326, "eta_s": 21488, "world_size": 1, "timestamp": "2026-05-05T01:41:34.006316"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42730, "epoch": 0, "train_loss": 3.8128129690885544, "train_ppl": 45.2776244558842, "lr": 0.00056, "grad_norm": 0.667, "tokens_per_sec": 153331, "dt_s": 4.274, "eta_s": 21459, "world_size": 1, "timestamp": "2026-05-05T01:41:38.280476"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42740, "epoch": 0, "train_loss": 3.871513992547989, "train_ppl": 48.01502546995016, "lr": 0.00056, "grad_norm": 0.8127, "tokens_per_sec": 150726, "dt_s": 4.348, "eta_s": 21472, "world_size": 1, "timestamp": "2026-05-05T01:41:42.628491"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42750, "epoch": 0, "train_loss": 3.671245262026787, "train_ppl": 39.300815214311775, "lr": 0.00056, "grad_norm": 0.6647, "tokens_per_sec": 153123, "dt_s": 4.28, "eta_s": 21495, "world_size": 1, "timestamp": "2026-05-05T01:41:46.908443"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42760, "epoch": 0, "train_loss": 3.704469859600067, "train_ppl": 40.628502795852, "lr": 0.00056, "grad_norm": 0.7288, "tokens_per_sec": 135048, "dt_s": 4.853, "eta_s": 22045, "world_size": 1, "timestamp": "2026-05-05T01:41:51.761251"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42770, "epoch": 0, "train_loss": 3.8375028520822525, "train_ppl": 46.409438388059776, "lr": 0.00056, "grad_norm": 0.6702, "tokens_per_sec": 150026, "dt_s": 4.368, "eta_s": 22082, "world_size": 1, "timestamp": "2026-05-05T01:41:56.129562"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42780, "epoch": 0, "train_loss": 3.825046792626381, "train_ppl": 45.83494505361438, "lr": 0.00056, "grad_norm": 0.6452, "tokens_per_sec": 154567, "dt_s": 4.24, "eta_s": 22044, "world_size": 1, "timestamp": "2026-05-05T01:42:00.369536"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42790, "epoch": 0, "train_loss": 3.8283861130476, "train_ppl": 45.988258460465296, "lr": 0.00056, "grad_norm": 0.6858, "tokens_per_sec": 153113, "dt_s": 4.28, "eta_s": 21971, "world_size": 1, "timestamp": "2026-05-05T01:42:04.649766"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42800, "epoch": 0, "train_loss": 3.711313635110855, "train_ppl": 40.907508786520474, "lr": 0.00056, "grad_norm": 0.6739, "tokens_per_sec": 152001, "dt_s": 4.312, "eta_s": 21999, "world_size": 1, "timestamp": "2026-05-05T01:42:08.961324"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42810, "epoch": 0, "train_loss": 3.7415850460529327, "train_ppl": 42.16477032898062, "lr": 0.00056, "grad_norm": 0.8222, "tokens_per_sec": 155076, "dt_s": 4.226, "eta_s": 21369, "world_size": 1, "timestamp": "2026-05-05T01:42:13.187374"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42820, "epoch": 0, "train_loss": 3.7771330922842026, "train_ppl": 43.69060508101826, "lr": 0.00056, "grad_norm": 0.6773, "tokens_per_sec": 151273, "dt_s": 4.332, "eta_s": 21329, "world_size": 1, "timestamp": "2026-05-05T01:42:17.519674"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42830, "epoch": 0, "train_loss": 3.798732504248619, "train_ppl": 44.64456182395086, "lr": 0.00056, "grad_norm": 0.7091, "tokens_per_sec": 152347, "dt_s": 4.302, "eta_s": 21386, "world_size": 1, "timestamp": "2026-05-05T01:42:21.821416"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42840, "epoch": 0, "train_loss": 3.7156286537647247, "train_ppl": 41.08440683472891, "lr": 0.00056, "grad_norm": 0.6381, "tokens_per_sec": 152738, "dt_s": 4.291, "eta_s": 21392, "world_size": 1, "timestamp": "2026-05-05T01:42:26.112171"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42850, "epoch": 0, "train_loss": 3.683315396308899, "train_ppl": 39.77805571580453, "lr": 0.00056, "grad_norm": 0.6703, "tokens_per_sec": 150497, "dt_s": 4.355, "eta_s": 21431, "world_size": 1, "timestamp": "2026-05-05T01:42:30.466797"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42860, "epoch": 0, "train_loss": 3.723510041832924, "train_ppl": 41.4094883526072, "lr": 0.00056, "grad_norm": 0.6323, "tokens_per_sec": 153271, "dt_s": 4.276, "eta_s": 21476, "world_size": 1, "timestamp": "2026-05-05T01:42:34.742633"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42870, "epoch": 0, "train_loss": 3.88763964176178, "train_ppl": 48.795575451043085, "lr": 0.00056, "grad_norm": 0.6886, "tokens_per_sec": 153819, "dt_s": 4.261, "eta_s": 21400, "world_size": 1, "timestamp": "2026-05-05T01:42:39.003240"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42880, "epoch": 0, "train_loss": 3.863167256116867, "train_ppl": 47.61592461946721, "lr": 0.00056, "grad_norm": 0.6882, "tokens_per_sec": 148385, "dt_s": 4.417, "eta_s": 21511, "world_size": 1, "timestamp": "2026-05-05T01:42:43.419828"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42890, "epoch": 0, "train_loss": 3.872423827648163, "train_ppl": 48.058731104903885, "lr": 0.00056, "grad_norm": 0.7491, "tokens_per_sec": 151543, "dt_s": 4.325, "eta_s": 21540, "world_size": 1, "timestamp": "2026-05-05T01:42:47.744398"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42900, "epoch": 0, "train_loss": 3.819275349378586, "train_ppl": 45.57117317386544, "lr": 0.00056, "grad_norm": 0.6799, "tokens_per_sec": 153325, "dt_s": 4.274, "eta_s": 21456, "world_size": 1, "timestamp": "2026-05-05T01:42:52.018731"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42910, "epoch": 0, "train_loss": 3.711825519800186, "train_ppl": 40.92845407427568, "lr": 0.00056, "grad_norm": 0.656, "tokens_per_sec": 150829, "dt_s": 4.345, "eta_s": 21520, "world_size": 1, "timestamp": "2026-05-05T01:42:56.363788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42920, "epoch": 0, "train_loss": 3.7553540468215942, "train_ppl": 42.749352403914855, "lr": 0.00056, "grad_norm": 0.7133, "tokens_per_sec": 152716, "dt_s": 4.291, "eta_s": 21547, "world_size": 1, "timestamp": "2026-05-05T01:43:00.655146"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42930, "epoch": 0, "train_loss": 3.861127942800522, "train_ppl": 47.518919775574226, "lr": 0.00056, "grad_norm": 0.7952, "tokens_per_sec": 153930, "dt_s": 4.258, "eta_s": 21384, "world_size": 1, "timestamp": "2026-05-05T01:43:04.912675"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42940, "epoch": 0, "train_loss": 3.68569752573967, "train_ppl": 39.87292514379053, "lr": 0.00056, "grad_norm": 0.6371, "tokens_per_sec": 149772, "dt_s": 4.376, "eta_s": 21431, "world_size": 1, "timestamp": "2026-05-05T01:43:09.288392"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42950, "epoch": 0, "train_loss": 3.7515182942152023, "train_ppl": 42.585690547868424, "lr": 0.00056, "grad_norm": 0.6738, "tokens_per_sec": 153871, "dt_s": 4.259, "eta_s": 21411, "world_size": 1, "timestamp": "2026-05-05T01:43:13.547546"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42960, "epoch": 0, "train_loss": 3.877753034234047, "train_ppl": 48.3155296697531, "lr": 0.00056, "grad_norm": 0.701, "tokens_per_sec": 151969, "dt_s": 4.312, "eta_s": 21374, "world_size": 1, "timestamp": "2026-05-05T01:43:17.860028"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42970, "epoch": 0, "train_loss": 3.7266168892383575, "train_ppl": 41.538341373746896, "lr": 0.00056, "grad_norm": 0.6543, "tokens_per_sec": 152044, "dt_s": 4.31, "eta_s": 21389, "world_size": 1, "timestamp": "2026-05-05T01:43:22.170370"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42980, "epoch": 0, "train_loss": 3.673068016767502, "train_ppl": 39.37251628844218, "lr": 0.00056, "grad_norm": 0.715, "tokens_per_sec": 155012, "dt_s": 4.228, "eta_s": 21355, "world_size": 1, "timestamp": "2026-05-05T01:43:26.398147"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 42990, "epoch": 0, "train_loss": 3.67501462996006, "train_ppl": 39.4492339936965, "lr": 0.00056, "grad_norm": 0.6807, "tokens_per_sec": 151934, "dt_s": 4.313, "eta_s": 21289, "world_size": 1, "timestamp": "2026-05-05T01:43:30.711596"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43000, "epoch": 0, "train_loss": 3.651233270764351, "train_ppl": 38.52214500102248, "lr": 0.00056, "grad_norm": 0.6504, "tokens_per_sec": 154292, "dt_s": 4.248, "eta_s": 21273, "world_size": 1, "timestamp": "2026-05-05T01:43:34.959142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43010, "epoch": 0, "train_loss": 3.6531519889831543, "train_ppl": 38.596129097082255, "lr": 0.00056, "grad_norm": 0.7049, "tokens_per_sec": 131423, "dt_s": 4.987, "eta_s": 21179, "world_size": 1, "timestamp": "2026-05-05T01:43:39.945818"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43020, "epoch": 0, "train_loss": 3.7683421224355698, "train_ppl": 43.30820558299739, "lr": 0.00056, "grad_norm": 0.7077, "tokens_per_sec": 150214, "dt_s": 4.363, "eta_s": 21227, "world_size": 1, "timestamp": "2026-05-05T01:43:44.308622"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43030, "epoch": 0, "train_loss": 3.745541885495186, "train_ppl": 42.331940069093704, "lr": 0.00056, "grad_norm": 0.6586, "tokens_per_sec": 152485, "dt_s": 4.298, "eta_s": 21293, "world_size": 1, "timestamp": "2026-05-05T01:43:48.606490"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43040, "epoch": 0, "train_loss": 3.6452510058879852, "train_ppl": 38.29238325903609, "lr": 0.00056, "grad_norm": 0.6228, "tokens_per_sec": 149468, "dt_s": 4.385, "eta_s": 21359, "world_size": 1, "timestamp": "2026-05-05T01:43:52.991126"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43050, "epoch": 0, "train_loss": 4.010643020272255, "train_ppl": 55.18234252255971, "lr": 0.00056, "grad_norm": 0.7502, "tokens_per_sec": 151140, "dt_s": 4.336, "eta_s": 21442, "world_size": 1, "timestamp": "2026-05-05T01:43:57.327233"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43060, "epoch": 0, "train_loss": 3.829397425055504, "train_ppl": 46.03479046368987, "lr": 0.00056, "grad_norm": 0.7058, "tokens_per_sec": 136244, "dt_s": 4.81, "eta_s": 22022, "world_size": 1, "timestamp": "2026-05-05T01:44:02.137453"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43070, "epoch": 0, "train_loss": 3.802111804485321, "train_ppl": 44.79568440275902, "lr": 0.00056, "grad_norm": 0.7161, "tokens_per_sec": 152905, "dt_s": 4.286, "eta_s": 21941, "world_size": 1, "timestamp": "2026-05-05T01:44:06.423498"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43080, "epoch": 0, "train_loss": 3.7446350157260895, "train_ppl": 42.29356791428124, "lr": 0.00056, "grad_norm": 0.6399, "tokens_per_sec": 151529, "dt_s": 4.325, "eta_s": 21963, "world_size": 1, "timestamp": "2026-05-05T01:44:10.748481"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43090, "epoch": 0, "train_loss": 3.6860999166965485, "train_ppl": 39.88897287680748, "lr": 0.00056, "grad_norm": 0.659, "tokens_per_sec": 155550, "dt_s": 4.213, "eta_s": 21789, "world_size": 1, "timestamp": "2026-05-05T01:44:14.961642"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43100, "epoch": 0, "train_loss": 3.9205046892166138, "train_ppl": 50.4258877589165, "lr": 0.00056, "grad_norm": 1.1748, "tokens_per_sec": 151832, "dt_s": 4.316, "eta_s": 21765, "world_size": 1, "timestamp": "2026-05-05T01:44:19.277984"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43110, "epoch": 0, "train_loss": 3.7420530319213867, "train_ppl": 42.18450746363102, "lr": 0.00056, "grad_norm": 0.7002, "tokens_per_sec": 153268, "dt_s": 4.276, "eta_s": 21231, "world_size": 1, "timestamp": "2026-05-05T01:44:23.553924"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43120, "epoch": 0, "train_loss": 3.8054024279117584, "train_ppl": 44.9433329258719, "lr": 0.00056, "grad_norm": 0.6604, "tokens_per_sec": 154629, "dt_s": 4.238, "eta_s": 21179, "world_size": 1, "timestamp": "2026-05-05T01:44:27.792208"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43130, "epoch": 0, "train_loss": 3.7246820628643036, "train_ppl": 41.45804959569545, "lr": 0.00056, "grad_norm": 0.6484, "tokens_per_sec": 151904, "dt_s": 4.314, "eta_s": 21164, "world_size": 1, "timestamp": "2026-05-05T01:44:32.106483"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43140, "epoch": 0, "train_loss": 3.871236875653267, "train_ppl": 48.001721538649136, "lr": 0.00056, "grad_norm": 0.7185, "tokens_per_sec": 154485, "dt_s": 4.242, "eta_s": 21189, "world_size": 1, "timestamp": "2026-05-05T01:44:36.348729"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43150, "epoch": 0, "train_loss": 3.7585688084363937, "train_ppl": 42.887002518685506, "lr": 0.00056, "grad_norm": 0.6552, "tokens_per_sec": 153910, "dt_s": 4.258, "eta_s": 21127, "world_size": 1, "timestamp": "2026-05-05T01:44:40.606797"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43160, "epoch": 0, "train_loss": 3.761438697576523, "train_ppl": 43.010260244872896, "lr": 0.00056, "grad_norm": 0.7029, "tokens_per_sec": 151590, "dt_s": 4.323, "eta_s": 21169, "world_size": 1, "timestamp": "2026-05-05T01:44:44.930008"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43170, "epoch": 0, "train_loss": 3.79472915828228, "train_ppl": 44.466191474788005, "lr": 0.00056, "grad_norm": 0.6544, "tokens_per_sec": 154507, "dt_s": 4.242, "eta_s": 21169, "world_size": 1, "timestamp": "2026-05-05T01:44:49.171653"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43180, "epoch": 0, "train_loss": 3.7745867371559143, "train_ppl": 43.579494807855006, "lr": 0.00056, "grad_norm": 0.6949, "tokens_per_sec": 155033, "dt_s": 4.227, "eta_s": 21078, "world_size": 1, "timestamp": "2026-05-05T01:44:53.398906"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43190, "epoch": 0, "train_loss": 3.7086331397295, "train_ppl": 40.798003228293936, "lr": 0.00056, "grad_norm": 0.6415, "tokens_per_sec": 149981, "dt_s": 4.37, "eta_s": 21200, "world_size": 1, "timestamp": "2026-05-05T01:44:57.768517"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43200, "epoch": 0, "train_loss": 3.8265914618968964, "train_ppl": 45.90579959408187, "lr": 0.00056, "grad_norm": 0.6871, "tokens_per_sec": 153338, "dt_s": 4.274, "eta_s": 21211, "world_size": 1, "timestamp": "2026-05-05T01:45:02.042467"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43210, "epoch": 0, "train_loss": 3.785872220993042, "train_ppl": 44.074096150421724, "lr": 0.00056, "grad_norm": 0.6741, "tokens_per_sec": 153692, "dt_s": 4.264, "eta_s": 21149, "world_size": 1, "timestamp": "2026-05-05T01:45:06.306596"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43220, "epoch": 0, "train_loss": 3.751015916466713, "train_ppl": 42.56430181759404, "lr": 0.00056, "grad_norm": 0.7166, "tokens_per_sec": 150833, "dt_s": 4.345, "eta_s": 21246, "world_size": 1, "timestamp": "2026-05-05T01:45:10.651520"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43230, "epoch": 0, "train_loss": 3.7718383967876434, "train_ppl": 43.4598879584982, "lr": 0.00056, "grad_norm": 0.6902, "tokens_per_sec": 153751, "dt_s": 4.262, "eta_s": 21277, "world_size": 1, "timestamp": "2026-05-05T01:45:14.914012"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43240, "epoch": 0, "train_loss": 3.7032139599323273, "train_ppl": 40.57750950062376, "lr": 0.00056, "grad_norm": 0.7298, "tokens_per_sec": 149587, "dt_s": 4.381, "eta_s": 21284, "world_size": 1, "timestamp": "2026-05-05T01:45:19.295129"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43250, "epoch": 0, "train_loss": 3.6939725279808044, "train_ppl": 40.20424262456747, "lr": 0.00056, "grad_norm": 0.6668, "tokens_per_sec": 154246, "dt_s": 4.249, "eta_s": 21255, "world_size": 1, "timestamp": "2026-05-05T01:45:23.543928"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43260, "epoch": 0, "train_loss": 3.735479712486267, "train_ppl": 41.90812459248266, "lr": 0.00056, "grad_norm": 0.6644, "tokens_per_sec": 155127, "dt_s": 4.225, "eta_s": 21212, "world_size": 1, "timestamp": "2026-05-05T01:45:27.768589"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43270, "epoch": 0, "train_loss": 3.7905942499637604, "train_ppl": 44.28270745598976, "lr": 0.00056, "grad_norm": 0.7367, "tokens_per_sec": 151798, "dt_s": 4.317, "eta_s": 21180, "world_size": 1, "timestamp": "2026-05-05T01:45:32.085931"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43280, "epoch": 0, "train_loss": 3.802213117480278, "train_ppl": 44.80022301761346, "lr": 0.00056, "grad_norm": 0.6745, "tokens_per_sec": 153846, "dt_s": 4.26, "eta_s": 21173, "world_size": 1, "timestamp": "2026-05-05T01:45:36.345778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43290, "epoch": 0, "train_loss": 3.6723298877477646, "train_ppl": 39.343465014702716, "lr": 0.00056, "grad_norm": 0.6635, "tokens_per_sec": 154567, "dt_s": 4.24, "eta_s": 21029, "world_size": 1, "timestamp": "2026-05-05T01:45:40.585767"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43300, "epoch": 0, "train_loss": 3.7327604591846466, "train_ppl": 41.79432058740354, "lr": 0.00056, "grad_norm": 0.6201, "tokens_per_sec": 152569, "dt_s": 4.295, "eta_s": 21071, "world_size": 1, "timestamp": "2026-05-05T01:45:44.881250"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43310, "epoch": 0, "train_loss": 3.8325401097536087, "train_ppl": 46.17969086413755, "lr": 0.00056, "grad_norm": 0.6584, "tokens_per_sec": 153953, "dt_s": 4.257, "eta_s": 21099, "world_size": 1, "timestamp": "2026-05-05T01:45:49.138097"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43320, "epoch": 0, "train_loss": 3.773602843284607, "train_ppl": 43.53663829658554, "lr": 0.00056, "grad_norm": 0.6946, "tokens_per_sec": 154234, "dt_s": 4.249, "eta_s": 21027, "world_size": 1, "timestamp": "2026-05-05T01:45:53.387255"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43330, "epoch": 0, "train_loss": 3.7514023780822754, "train_ppl": 42.58075446539341, "lr": 0.00056, "grad_norm": 0.653, "tokens_per_sec": 150655, "dt_s": 4.35, "eta_s": 21112, "world_size": 1, "timestamp": "2026-05-05T01:45:57.737294"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43340, "epoch": 0, "train_loss": 3.7064457833766937, "train_ppl": 40.708860985218536, "lr": 0.00056, "grad_norm": 0.6717, "tokens_per_sec": 150117, "dt_s": 4.366, "eta_s": 21232, "world_size": 1, "timestamp": "2026-05-05T01:46:02.102963"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43350, "epoch": 0, "train_loss": 3.729840263724327, "train_ppl": 41.672451030224835, "lr": 0.00056, "grad_norm": 0.6963, "tokens_per_sec": 136435, "dt_s": 4.803, "eta_s": 21729, "world_size": 1, "timestamp": "2026-05-05T01:46:06.906427"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43360, "epoch": 0, "train_loss": 3.7568579018115997, "train_ppl": 42.81368959562508, "lr": 0.00056, "grad_norm": 0.6315, "tokens_per_sec": 151704, "dt_s": 4.32, "eta_s": 21786, "world_size": 1, "timestamp": "2026-05-05T01:46:11.226416"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43370, "epoch": 0, "train_loss": 3.799238920211792, "train_ppl": 44.6671762684019, "lr": 0.00056, "grad_norm": 0.6709, "tokens_per_sec": 153351, "dt_s": 4.274, "eta_s": 21806, "world_size": 1, "timestamp": "2026-05-05T01:46:15.499994"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43380, "epoch": 0, "train_loss": 3.7040086835622787, "train_ppl": 40.60977022375003, "lr": 0.00056, "grad_norm": 0.7111, "tokens_per_sec": 148412, "dt_s": 4.416, "eta_s": 21867, "world_size": 1, "timestamp": "2026-05-05T01:46:19.915822"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43390, "epoch": 0, "train_loss": 3.795928791165352, "train_ppl": 44.51956658913619, "lr": 0.00056, "grad_norm": 0.7415, "tokens_per_sec": 154329, "dt_s": 4.247, "eta_s": 21745, "world_size": 1, "timestamp": "2026-05-05T01:46:24.162346"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43400, "epoch": 0, "train_loss": 3.748785451054573, "train_ppl": 42.469469414111714, "lr": 0.00056, "grad_norm": 0.6818, "tokens_per_sec": 155037, "dt_s": 4.227, "eta_s": 21172, "world_size": 1, "timestamp": "2026-05-05T01:46:28.389434"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43410, "epoch": 0, "train_loss": 3.7861889451742172, "train_ppl": 44.08805769329841, "lr": 0.00056, "grad_norm": 0.7478, "tokens_per_sec": 153344, "dt_s": 4.274, "eta_s": 21122, "world_size": 1, "timestamp": "2026-05-05T01:46:32.663244"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43420, "epoch": 0, "train_loss": 3.6158913522958755, "train_ppl": 37.18447562605599, "lr": 0.00056, "grad_norm": 0.6947, "tokens_per_sec": 155047, "dt_s": 4.227, "eta_s": 21072, "world_size": 1, "timestamp": "2026-05-05T01:46:36.890046"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43430, "epoch": 0, "train_loss": 3.6139746755361557, "train_ppl": 37.11327326355559, "lr": 0.00056, "grad_norm": 0.7389, "tokens_per_sec": 155058, "dt_s": 4.227, "eta_s": 20881, "world_size": 1, "timestamp": "2026-05-05T01:46:41.116598"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43440, "epoch": 0, "train_loss": 3.7062998563051224, "train_ppl": 40.70292089376857, "lr": 0.00056, "grad_norm": 0.6419, "tokens_per_sec": 152011, "dt_s": 4.311, "eta_s": 20941, "world_size": 1, "timestamp": "2026-05-05T01:46:45.427868"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43450, "epoch": 0, "train_loss": 3.7175716161727905, "train_ppl": 41.164309891947845, "lr": 0.00056, "grad_norm": 0.6603, "tokens_per_sec": 152490, "dt_s": 4.298, "eta_s": 21006, "world_size": 1, "timestamp": "2026-05-05T01:46:49.725596"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43460, "epoch": 0, "train_loss": 3.667912870645523, "train_ppl": 39.17006748870621, "lr": 0.00056, "grad_norm": 0.7055, "tokens_per_sec": 154723, "dt_s": 4.236, "eta_s": 20964, "world_size": 1, "timestamp": "2026-05-05T01:46:53.961283"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43470, "epoch": 0, "train_loss": 3.801785871386528, "train_ppl": 44.78108638564679, "lr": 0.00056, "grad_norm": 0.6789, "tokens_per_sec": 153070, "dt_s": 4.281, "eta_s": 21014, "world_size": 1, "timestamp": "2026-05-05T01:46:58.242767"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43480, "epoch": 0, "train_loss": 3.7728753238916397, "train_ppl": 43.50497606675981, "lr": 0.00056, "grad_norm": 0.6517, "tokens_per_sec": 154073, "dt_s": 4.254, "eta_s": 21036, "world_size": 1, "timestamp": "2026-05-05T01:47:02.496303"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43490, "epoch": 0, "train_loss": 3.8774981796741486, "train_ppl": 48.303217805636855, "lr": 0.00056, "grad_norm": 0.7158, "tokens_per_sec": 153098, "dt_s": 4.281, "eta_s": 21002, "world_size": 1, "timestamp": "2026-05-05T01:47:06.776970"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43500, "epoch": 0, "train_loss": 3.7917180359363556, "train_ppl": 44.33249971413776, "lr": 0.00056, "grad_norm": 0.6439, "tokens_per_sec": 152029, "dt_s": 4.311, "eta_s": 21010, "world_size": 1, "timestamp": "2026-05-05T01:47:11.087716"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43510, "epoch": 0, "train_loss": 3.7149629443883896, "train_ppl": 41.05706566152527, "lr": 0.00056, "grad_norm": 0.6459, "tokens_per_sec": 130613, "dt_s": 5.018, "eta_s": 21006, "world_size": 1, "timestamp": "2026-05-05T01:47:16.105294"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43520, "epoch": 0, "train_loss": 3.775044083595276, "train_ppl": 43.59943029299865, "lr": 0.00056, "grad_norm": 0.6789, "tokens_per_sec": 149122, "dt_s": 4.395, "eta_s": 21113, "world_size": 1, "timestamp": "2026-05-05T01:47:20.500068"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43530, "epoch": 0, "train_loss": 3.7331893295049667, "train_ppl": 41.81224877522045, "lr": 0.00056, "grad_norm": 0.6504, "tokens_per_sec": 152918, "dt_s": 4.286, "eta_s": 21141, "world_size": 1, "timestamp": "2026-05-05T01:47:24.785779"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43540, "epoch": 0, "train_loss": 3.848885253071785, "train_ppl": 46.9407070460647, "lr": 0.00056, "grad_norm": 0.6882, "tokens_per_sec": 154517, "dt_s": 4.241, "eta_s": 21098, "world_size": 1, "timestamp": "2026-05-05T01:47:29.027131"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43550, "epoch": 0, "train_loss": 3.703870415687561, "train_ppl": 40.604155585299424, "lr": 0.00056, "grad_norm": 0.6517, "tokens_per_sec": 149988, "dt_s": 4.369, "eta_s": 21151, "world_size": 1, "timestamp": "2026-05-05T01:47:33.396533"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43560, "epoch": 0, "train_loss": 3.7261595726013184, "train_ppl": 41.51934954213294, "lr": 0.00056, "grad_norm": 0.6778, "tokens_per_sec": 153892, "dt_s": 4.259, "eta_s": 21169, "world_size": 1, "timestamp": "2026-05-05T01:47:37.655127"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43570, "epoch": 0, "train_loss": 3.8914407193660736, "train_ppl": 48.98140417103088, "lr": 0.00056, "grad_norm": 1.0324, "tokens_per_sec": 153543, "dt_s": 4.268, "eta_s": 21041, "world_size": 1, "timestamp": "2026-05-05T01:47:41.923388"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43580, "epoch": 0, "train_loss": 3.7999080568552017, "train_ppl": 44.69707471475818, "lr": 0.00056, "grad_norm": 0.6512, "tokens_per_sec": 150744, "dt_s": 4.348, "eta_s": 21097, "world_size": 1, "timestamp": "2026-05-05T01:47:46.270879"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43590, "epoch": 0, "train_loss": 3.787362441420555, "train_ppl": 44.13982523207532, "lr": 0.00056, "grad_norm": 0.6451, "tokens_per_sec": 151925, "dt_s": 4.314, "eta_s": 21164, "world_size": 1, "timestamp": "2026-05-05T01:47:50.584595"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43600, "epoch": 0, "train_loss": 3.6818105578422546, "train_ppl": 39.71824118432667, "lr": 0.00056, "grad_norm": 0.6999, "tokens_per_sec": 152803, "dt_s": 4.289, "eta_s": 21080, "world_size": 1, "timestamp": "2026-05-05T01:47:54.873520"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43610, "epoch": 0, "train_loss": 3.7567720860242844, "train_ppl": 42.81001566278753, "lr": 0.00056, "grad_norm": 0.6704, "tokens_per_sec": 150177, "dt_s": 4.364, "eta_s": 21179, "world_size": 1, "timestamp": "2026-05-05T01:47:59.237423"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43620, "epoch": 0, "train_loss": 3.7168850153684616, "train_ppl": 41.13605614429927, "lr": 0.00056, "grad_norm": 0.6494, "tokens_per_sec": 153136, "dt_s": 4.28, "eta_s": 21186, "world_size": 1, "timestamp": "2026-05-05T01:48:03.517034"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43630, "epoch": 0, "train_loss": 3.761002629995346, "train_ppl": 42.991508953434426, "lr": 0.00056, "grad_norm": 0.695, "tokens_per_sec": 149729, "dt_s": 4.377, "eta_s": 21211, "world_size": 1, "timestamp": "2026-05-05T01:48:07.894012"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43640, "epoch": 0, "train_loss": 3.688604950904846, "train_ppl": 39.989021378538745, "lr": 0.00056, "grad_norm": 0.6654, "tokens_per_sec": 151679, "dt_s": 4.321, "eta_s": 21213, "world_size": 1, "timestamp": "2026-05-05T01:48:12.214724"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43650, "epoch": 0, "train_loss": 3.77588252723217, "train_ppl": 43.636001287116265, "lr": 0.00056, "grad_norm": 0.7185, "tokens_per_sec": 137641, "dt_s": 4.761, "eta_s": 21672, "world_size": 1, "timestamp": "2026-05-05T01:48:16.976087"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43660, "epoch": 0, "train_loss": 3.76992729306221, "train_ppl": 43.37691091883349, "lr": 0.00056, "grad_norm": 0.646, "tokens_per_sec": 151429, "dt_s": 4.328, "eta_s": 21633, "world_size": 1, "timestamp": "2026-05-05T01:48:21.303930"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43670, "epoch": 0, "train_loss": 3.798128604888916, "train_ppl": 44.61760914082432, "lr": 0.00056, "grad_norm": 0.6569, "tokens_per_sec": 153231, "dt_s": 4.277, "eta_s": 21626, "world_size": 1, "timestamp": "2026-05-05T01:48:25.580880"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43680, "epoch": 0, "train_loss": 3.812559336423874, "train_ppl": 45.26614202756405, "lr": 0.00056, "grad_norm": 0.7037, "tokens_per_sec": 154108, "dt_s": 4.253, "eta_s": 21499, "world_size": 1, "timestamp": "2026-05-05T01:48:29.833502"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43690, "epoch": 0, "train_loss": 3.7721844166517258, "train_ppl": 43.474928545043426, "lr": 0.00056, "grad_norm": 0.7153, "tokens_per_sec": 151836, "dt_s": 4.316, "eta_s": 21491, "world_size": 1, "timestamp": "2026-05-05T01:48:34.149712"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43700, "epoch": 0, "train_loss": 3.899821788072586, "train_ppl": 49.39364578430783, "lr": 0.00056, "grad_norm": 0.7485, "tokens_per_sec": 153777, "dt_s": 4.262, "eta_s": 20997, "world_size": 1, "timestamp": "2026-05-05T01:48:38.411447"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43710, "epoch": 0, "train_loss": 3.8380944579839706, "train_ppl": 46.43690260890211, "lr": 0.00056, "grad_norm": 0.7079, "tokens_per_sec": 153240, "dt_s": 4.277, "eta_s": 20942, "world_size": 1, "timestamp": "2026-05-05T01:48:42.688144"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43720, "epoch": 0, "train_loss": 3.746244892477989, "train_ppl": 42.36171018163031, "lr": 0.00056, "grad_norm": 0.6376, "tokens_per_sec": 151348, "dt_s": 4.33, "eta_s": 20990, "world_size": 1, "timestamp": "2026-05-05T01:48:47.018304"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43730, "epoch": 0, "train_loss": 3.7641504406929016, "train_ppl": 43.127051304132024, "lr": 0.00056, "grad_norm": 0.6619, "tokens_per_sec": 154005, "dt_s": 4.255, "eta_s": 20989, "world_size": 1, "timestamp": "2026-05-05T01:48:51.273744"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43740, "epoch": 0, "train_loss": 3.584498628973961, "train_ppl": 36.03528614056901, "lr": 0.00056, "grad_norm": 0.8089, "tokens_per_sec": 153563, "dt_s": 4.268, "eta_s": 20937, "world_size": 1, "timestamp": "2026-05-05T01:48:55.541443"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43750, "epoch": 0, "train_loss": 3.7729351222515106, "train_ppl": 43.50757767075988, "lr": 0.00056, "grad_norm": 0.7, "tokens_per_sec": 152180, "dt_s": 4.306, "eta_s": 20976, "world_size": 1, "timestamp": "2026-05-05T01:48:59.847979"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43760, "epoch": 0, "train_loss": 3.72318896651268, "train_ppl": 41.39619492208368, "lr": 0.00056, "grad_norm": 0.7325, "tokens_per_sec": 153167, "dt_s": 4.279, "eta_s": 20974, "world_size": 1, "timestamp": "2026-05-05T01:49:04.126627"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43770, "epoch": 0, "train_loss": 3.656134784221649, "train_ppl": 38.71142531419791, "lr": 0.00056, "grad_norm": 0.6974, "tokens_per_sec": 153218, "dt_s": 4.277, "eta_s": 20918, "world_size": 1, "timestamp": "2026-05-05T01:49:08.403928"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43780, "epoch": 0, "train_loss": 3.687948152422905, "train_ppl": 39.96276527343186, "lr": 0.00056, "grad_norm": 0.7659, "tokens_per_sec": 153452, "dt_s": 4.271, "eta_s": 20929, "world_size": 1, "timestamp": "2026-05-05T01:49:12.674712"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43790, "epoch": 0, "train_loss": 3.8604178577661514, "train_ppl": 47.48518927896986, "lr": 0.00056, "grad_norm": 0.6524, "tokens_per_sec": 155948, "dt_s": 4.202, "eta_s": 20861, "world_size": 1, "timestamp": "2026-05-05T01:49:16.877144"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43800, "epoch": 0, "train_loss": 3.852025091648102, "train_ppl": 47.08832491571309, "lr": 0.00056, "grad_norm": 0.6524, "tokens_per_sec": 150908, "dt_s": 4.343, "eta_s": 20892, "world_size": 1, "timestamp": "2026-05-05T01:49:21.219918"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43810, "epoch": 0, "train_loss": 3.7278067618608475, "train_ppl": 41.58779612552393, "lr": 0.00056, "grad_norm": 0.7147, "tokens_per_sec": 154953, "dt_s": 4.229, "eta_s": 20840, "world_size": 1, "timestamp": "2026-05-05T01:49:25.449330"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43820, "epoch": 0, "train_loss": 3.7006453573703766, "train_ppl": 40.47341575073098, "lr": 0.00056, "grad_norm": 0.6731, "tokens_per_sec": 155285, "dt_s": 4.22, "eta_s": 20780, "world_size": 1, "timestamp": "2026-05-05T01:49:29.669710"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43830, "epoch": 0, "train_loss": 3.818183586001396, "train_ppl": 45.5214473852753, "lr": 0.00056, "grad_norm": 0.6575, "tokens_per_sec": 150639, "dt_s": 4.351, "eta_s": 20853, "world_size": 1, "timestamp": "2026-05-05T01:49:34.020235"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43840, "epoch": 0, "train_loss": 3.7478478401899338, "train_ppl": 42.429668240095985, "lr": 0.00056, "grad_norm": 0.6709, "tokens_per_sec": 153383, "dt_s": 4.273, "eta_s": 20918, "world_size": 1, "timestamp": "2026-05-05T01:49:38.292917"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43850, "epoch": 0, "train_loss": 3.622841790318489, "train_ppl": 37.44382426856822, "lr": 0.00056, "grad_norm": 0.6398, "tokens_per_sec": 154266, "dt_s": 4.248, "eta_s": 20821, "world_size": 1, "timestamp": "2026-05-05T01:49:42.541156"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43860, "epoch": 0, "train_loss": 3.6314998269081116, "train_ppl": 37.76942175266653, "lr": 0.00056, "grad_norm": 0.6686, "tokens_per_sec": 149251, "dt_s": 4.391, "eta_s": 20975, "world_size": 1, "timestamp": "2026-05-05T01:49:46.932154"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43870, "epoch": 0, "train_loss": 3.7482508569955826, "train_ppl": 42.44677155568516, "lr": 0.00056, "grad_norm": 0.6593, "tokens_per_sec": 152567, "dt_s": 4.296, "eta_s": 21044, "world_size": 1, "timestamp": "2026-05-05T01:49:51.227715"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43880, "epoch": 0, "train_loss": 3.6878259032964706, "train_ppl": 39.957880158893865, "lr": 0.00056, "grad_norm": 0.6831, "tokens_per_sec": 153558, "dt_s": 4.268, "eta_s": 20959, "world_size": 1, "timestamp": "2026-05-05T01:49:55.495543"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43890, "epoch": 0, "train_loss": 3.7776672542095184, "train_ppl": 43.71394917295404, "lr": 0.00056, "grad_norm": 0.632, "tokens_per_sec": 152544, "dt_s": 4.296, "eta_s": 20977, "world_size": 1, "timestamp": "2026-05-05T01:49:59.791760"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43900, "epoch": 0, "train_loss": 3.7151076942682266, "train_ppl": 41.06300909699164, "lr": 0.00056, "grad_norm": 0.6387, "tokens_per_sec": 152931, "dt_s": 4.285, "eta_s": 21009, "world_size": 1, "timestamp": "2026-05-05T01:50:04.077119"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43910, "epoch": 0, "train_loss": 3.6458531469106674, "train_ppl": 38.31544761715384, "lr": 0.00056, "grad_norm": 0.6696, "tokens_per_sec": 152595, "dt_s": 4.295, "eta_s": 20911, "world_size": 1, "timestamp": "2026-05-05T01:50:08.371842"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43920, "epoch": 0, "train_loss": 3.7691004127264023, "train_ppl": 43.34105822915146, "lr": 0.00056, "grad_norm": 0.6862, "tokens_per_sec": 152986, "dt_s": 4.284, "eta_s": 20895, "world_size": 1, "timestamp": "2026-05-05T01:50:12.655625"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43930, "epoch": 0, "train_loss": 3.773987367749214, "train_ppl": 43.553382418173214, "lr": 0.00056, "grad_norm": 0.6887, "tokens_per_sec": 151690, "dt_s": 4.32, "eta_s": 20942, "world_size": 1, "timestamp": "2026-05-05T01:50:16.976026"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43940, "epoch": 0, "train_loss": 3.690269097685814, "train_ppl": 40.055624382956815, "lr": 0.00056, "grad_norm": 0.687, "tokens_per_sec": 135821, "dt_s": 4.825, "eta_s": 21454, "world_size": 1, "timestamp": "2026-05-05T01:50:21.801213"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43950, "epoch": 0, "train_loss": 3.8886477649211884, "train_ppl": 48.84479220483832, "lr": 0.00056, "grad_norm": 0.7138, "tokens_per_sec": 154495, "dt_s": 4.242, "eta_s": 21407, "world_size": 1, "timestamp": "2026-05-05T01:50:26.043149"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43960, "epoch": 0, "train_loss": 3.8321135342121124, "train_ppl": 46.15999593848648, "lr": 0.00056, "grad_norm": 0.6998, "tokens_per_sec": 153150, "dt_s": 4.279, "eta_s": 21387, "world_size": 1, "timestamp": "2026-05-05T01:50:30.322356"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43970, "epoch": 0, "train_loss": 3.8843801468610764, "train_ppl": 48.636785449869194, "lr": 0.00056, "grad_norm": 9.9878, "tokens_per_sec": 152233, "dt_s": 4.305, "eta_s": 21404, "world_size": 1, "timestamp": "2026-05-05T01:50:34.627353"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43980, "epoch": 0, "train_loss": 3.7127638906240463, "train_ppl": 40.96687816664879, "lr": 0.00056, "grad_norm": 0.6881, "tokens_per_sec": 153674, "dt_s": 4.265, "eta_s": 21345, "world_size": 1, "timestamp": "2026-05-05T01:50:38.891942"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 43990, "epoch": 0, "train_loss": 3.8203248530626297, "train_ppl": 45.619025394146085, "lr": 0.00056, "grad_norm": 0.6525, "tokens_per_sec": 155432, "dt_s": 4.216, "eta_s": 20748, "world_size": 1, "timestamp": "2026-05-05T01:50:43.108340"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44000, "epoch": 0, "train_loss": 3.881622612476349, "train_ppl": 48.50285258873222, "lr": 0.00056, "grad_norm": 0.6997, "tokens_per_sec": 152362, "dt_s": 4.301, "eta_s": 20801, "world_size": 1, "timestamp": "2026-05-05T01:50:47.409673"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44010, "epoch": 0, "train_loss": 3.830412268638611, "train_ppl": 46.08153228920082, "lr": 0.00056, "grad_norm": 0.7248, "tokens_per_sec": 130266, "dt_s": 5.031, "eta_s": 20779, "world_size": 1, "timestamp": "2026-05-05T01:50:52.440597"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44020, "epoch": 0, "train_loss": 3.7145185321569443, "train_ppl": 41.038823453188535, "lr": 0.00056, "grad_norm": 0.626, "tokens_per_sec": 151538, "dt_s": 4.325, "eta_s": 20794, "world_size": 1, "timestamp": "2026-05-05T01:50:56.765342"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44030, "epoch": 0, "train_loss": 3.8453515768051147, "train_ppl": 46.775126509939156, "lr": 0.00056, "grad_norm": 0.6982, "tokens_per_sec": 150531, "dt_s": 4.354, "eta_s": 20876, "world_size": 1, "timestamp": "2026-05-05T01:51:01.118964"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44040, "epoch": 0, "train_loss": 3.660360023379326, "train_ppl": 38.87533638216252, "lr": 0.00056, "grad_norm": 0.7276, "tokens_per_sec": 154655, "dt_s": 4.238, "eta_s": 20892, "world_size": 1, "timestamp": "2026-05-05T01:51:05.356538"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44050, "epoch": 0, "train_loss": 3.732733801007271, "train_ppl": 41.79320644184263, "lr": 0.00056, "grad_norm": 0.6705, "tokens_per_sec": 149998, "dt_s": 4.369, "eta_s": 20954, "world_size": 1, "timestamp": "2026-05-05T01:51:09.725667"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44060, "epoch": 0, "train_loss": 3.729047864675522, "train_ppl": 41.639442899200056, "lr": 0.00056, "grad_norm": 0.7327, "tokens_per_sec": 153156, "dt_s": 4.279, "eta_s": 20968, "world_size": 1, "timestamp": "2026-05-05T01:51:14.004693"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44070, "epoch": 0, "train_loss": 3.8207073658704758, "train_ppl": 45.63647859346413, "lr": 0.00056, "grad_norm": 0.6732, "tokens_per_sec": 154527, "dt_s": 4.241, "eta_s": 20882, "world_size": 1, "timestamp": "2026-05-05T01:51:18.245764"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44080, "epoch": 0, "train_loss": 3.751046434044838, "train_ppl": 42.565600796820846, "lr": 0.00056, "grad_norm": 0.6893, "tokens_per_sec": 150093, "dt_s": 4.366, "eta_s": 20890, "world_size": 1, "timestamp": "2026-05-05T01:51:22.612113"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44090, "epoch": 0, "train_loss": 3.731271281838417, "train_ppl": 41.73212775155833, "lr": 0.00056, "grad_norm": 0.6522, "tokens_per_sec": 153423, "dt_s": 4.272, "eta_s": 20919, "world_size": 1, "timestamp": "2026-05-05T01:51:26.883712"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44100, "epoch": 0, "train_loss": 3.6953500509262085, "train_ppl": 40.25966305397831, "lr": 0.00056, "grad_norm": 0.6883, "tokens_per_sec": 154014, "dt_s": 4.255, "eta_s": 20804, "world_size": 1, "timestamp": "2026-05-05T01:51:31.138911"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44110, "epoch": 0, "train_loss": 3.5792947709560394, "train_ppl": 35.84825070286798, "lr": 0.00056, "grad_norm": 0.657, "tokens_per_sec": 151607, "dt_s": 4.323, "eta_s": 20842, "world_size": 1, "timestamp": "2026-05-05T01:51:35.461711"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44120, "epoch": 0, "train_loss": 3.716188669204712, "train_ppl": 41.10742118049208, "lr": 0.00056, "grad_norm": 0.6579, "tokens_per_sec": 154412, "dt_s": 4.244, "eta_s": 20841, "world_size": 1, "timestamp": "2026-05-05T01:51:39.705912"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44130, "epoch": 0, "train_loss": 3.7495407164096832, "train_ppl": 42.501557248899765, "lr": 0.00056, "grad_norm": 0.6739, "tokens_per_sec": 153585, "dt_s": 4.267, "eta_s": 20740, "world_size": 1, "timestamp": "2026-05-05T01:51:43.973020"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44140, "epoch": 0, "train_loss": 3.7918102741241455, "train_ppl": 44.33658905216526, "lr": 0.00056, "grad_norm": 0.6835, "tokens_per_sec": 152044, "dt_s": 4.31, "eta_s": 20773, "world_size": 1, "timestamp": "2026-05-05T01:51:48.283356"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44150, "epoch": 0, "train_loss": 3.7226839810609818, "train_ppl": 41.37529572323217, "lr": 0.00056, "grad_norm": 0.6455, "tokens_per_sec": 153579, "dt_s": 4.267, "eta_s": 20781, "world_size": 1, "timestamp": "2026-05-05T01:51:52.550577"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44160, "epoch": 0, "train_loss": 3.711038902401924, "train_ppl": 40.8962716994843, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 152324, "dt_s": 4.302, "eta_s": 20757, "world_size": 1, "timestamp": "2026-05-05T01:51:56.853005"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44170, "epoch": 0, "train_loss": 3.6638001799583435, "train_ppl": 39.009303928925966, "lr": 0.00056, "grad_norm": 0.7322, "tokens_per_sec": 151460, "dt_s": 4.327, "eta_s": 20833, "world_size": 1, "timestamp": "2026-05-05T01:52:01.179966"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44180, "epoch": 0, "train_loss": 3.7564456462860107, "train_ppl": 42.79604305321067, "lr": 0.00056, "grad_norm": 0.6574, "tokens_per_sec": 152405, "dt_s": 4.3, "eta_s": 20861, "world_size": 1, "timestamp": "2026-05-05T01:52:05.480085"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44190, "epoch": 0, "train_loss": 3.6609539091587067, "train_ppl": 38.898430748637665, "lr": 0.00056, "grad_norm": 0.6707, "tokens_per_sec": 151431, "dt_s": 4.328, "eta_s": 20873, "world_size": 1, "timestamp": "2026-05-05T01:52:09.807847"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44200, "epoch": 0, "train_loss": 3.842751204967499, "train_ppl": 46.65365179642411, "lr": 0.00056, "grad_norm": 0.6918, "tokens_per_sec": 155177, "dt_s": 4.223, "eta_s": 20826, "world_size": 1, "timestamp": "2026-05-05T01:52:14.031128"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44210, "epoch": 0, "train_loss": 3.7539732605218887, "train_ppl": 42.69036541737759, "lr": 0.00056, "grad_norm": 0.7457, "tokens_per_sec": 155044, "dt_s": 4.227, "eta_s": 20749, "world_size": 1, "timestamp": "2026-05-05T01:52:18.258095"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44220, "epoch": 0, "train_loss": 3.8349476903676987, "train_ppl": 46.29100613910635, "lr": 0.00056, "grad_norm": 0.7397, "tokens_per_sec": 153440, "dt_s": 4.271, "eta_s": 20690, "world_size": 1, "timestamp": "2026-05-05T01:52:22.529202"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44230, "epoch": 0, "train_loss": 3.6822811663150787, "train_ppl": 39.736937324089524, "lr": 0.00056, "grad_norm": 0.6652, "tokens_per_sec": 155318, "dt_s": 4.219, "eta_s": 20608, "world_size": 1, "timestamp": "2026-05-05T01:52:26.748666"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44240, "epoch": 0, "train_loss": 3.7848485112190247, "train_ppl": 44.02900015395605, "lr": 0.00056, "grad_norm": 0.6767, "tokens_per_sec": 138431, "dt_s": 4.734, "eta_s": 20997, "world_size": 1, "timestamp": "2026-05-05T01:52:31.482848"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44250, "epoch": 0, "train_loss": 3.7342465668916702, "train_ppl": 41.856477623924235, "lr": 0.00056, "grad_norm": 0.7192, "tokens_per_sec": 151342, "dt_s": 4.33, "eta_s": 21097, "world_size": 1, "timestamp": "2026-05-05T01:52:35.813186"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44260, "epoch": 0, "train_loss": 3.7524968832731247, "train_ppl": 42.62738483612243, "lr": 0.00056, "grad_norm": 0.7013, "tokens_per_sec": 153980, "dt_s": 4.256, "eta_s": 21121, "world_size": 1, "timestamp": "2026-05-05T01:52:40.069303"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44270, "epoch": 0, "train_loss": 3.674040913581848, "train_ppl": 39.41084032375504, "lr": 0.00056, "grad_norm": 0.8143, "tokens_per_sec": 153546, "dt_s": 4.268, "eta_s": 21113, "world_size": 1, "timestamp": "2026-05-05T01:52:44.337472"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44280, "epoch": 0, "train_loss": 3.782375380396843, "train_ppl": 43.92024521463387, "lr": 0.00056, "grad_norm": 0.6549, "tokens_per_sec": 150810, "dt_s": 4.346, "eta_s": 21231, "world_size": 1, "timestamp": "2026-05-05T01:52:48.683098"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44290, "epoch": 0, "train_loss": 3.6468009054660797, "train_ppl": 38.351778624229325, "lr": 0.00056, "grad_norm": 0.6792, "tokens_per_sec": 153978, "dt_s": 4.256, "eta_s": 20764, "world_size": 1, "timestamp": "2026-05-05T01:52:52.939272"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44300, "epoch": 0, "train_loss": 3.7192779183387756, "train_ppl": 41.23460860144186, "lr": 0.00056, "grad_norm": 0.6923, "tokens_per_sec": 152844, "dt_s": 4.288, "eta_s": 20719, "world_size": 1, "timestamp": "2026-05-05T01:52:57.227024"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44310, "epoch": 0, "train_loss": 3.764164999127388, "train_ppl": 43.127679171053394, "lr": 0.00056, "grad_norm": 0.7429, "tokens_per_sec": 152468, "dt_s": 4.298, "eta_s": 20755, "world_size": 1, "timestamp": "2026-05-05T01:53:01.525377"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44320, "epoch": 0, "train_loss": 3.8224860429763794, "train_ppl": 45.717723385839015, "lr": 0.00056, "grad_norm": 0.6369, "tokens_per_sec": 153683, "dt_s": 4.264, "eta_s": 20747, "world_size": 1, "timestamp": "2026-05-05T01:53:05.789771"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44330, "epoch": 0, "train_loss": 3.6777387261390686, "train_ppl": 39.55684400474838, "lr": 0.00056, "grad_norm": 0.6495, "tokens_per_sec": 150233, "dt_s": 4.362, "eta_s": 20759, "world_size": 1, "timestamp": "2026-05-05T01:53:10.152013"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44340, "epoch": 0, "train_loss": 3.6860368698835373, "train_ppl": 39.886458083468995, "lr": 0.00056, "grad_norm": 0.6645, "tokens_per_sec": 154009, "dt_s": 4.255, "eta_s": 20754, "world_size": 1, "timestamp": "2026-05-05T01:53:14.407353"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44350, "epoch": 0, "train_loss": 3.7702631801366806, "train_ppl": 43.39148310970986, "lr": 0.00056, "grad_norm": 0.7058, "tokens_per_sec": 152471, "dt_s": 4.298, "eta_s": 20760, "world_size": 1, "timestamp": "2026-05-05T01:53:18.705600"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44360, "epoch": 0, "train_loss": 3.816116124391556, "train_ppl": 45.427430761750486, "lr": 0.00056, "grad_norm": 0.8695, "tokens_per_sec": 150985, "dt_s": 4.341, "eta_s": 20796, "world_size": 1, "timestamp": "2026-05-05T01:53:23.046171"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44370, "epoch": 0, "train_loss": 3.7731276601552963, "train_ppl": 43.51595533504648, "lr": 0.00056, "grad_norm": 0.7138, "tokens_per_sec": 153287, "dt_s": 4.275, "eta_s": 20803, "world_size": 1, "timestamp": "2026-05-05T01:53:27.321543"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44380, "epoch": 0, "train_loss": 3.7248555421829224, "train_ppl": 41.46524233376819, "lr": 0.00056, "grad_norm": 0.7032, "tokens_per_sec": 154128, "dt_s": 4.252, "eta_s": 20692, "world_size": 1, "timestamp": "2026-05-05T01:53:31.573583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44390, "epoch": 0, "train_loss": 3.7579466700553894, "train_ppl": 42.86032916649082, "lr": 0.00056, "grad_norm": 0.73, "tokens_per_sec": 150478, "dt_s": 4.355, "eta_s": 20784, "world_size": 1, "timestamp": "2026-05-05T01:53:35.928774"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44400, "epoch": 0, "train_loss": 3.783822685480118, "train_ppl": 43.98385723068482, "lr": 0.00056, "grad_norm": 0.6873, "tokens_per_sec": 152781, "dt_s": 4.29, "eta_s": 20771, "world_size": 1, "timestamp": "2026-05-05T01:53:40.218334"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44410, "epoch": 0, "train_loss": 3.6915837973356247, "train_ppl": 40.1083201302562, "lr": 0.00056, "grad_norm": 0.6495, "tokens_per_sec": 153500, "dt_s": 4.269, "eta_s": 20698, "world_size": 1, "timestamp": "2026-05-05T01:53:44.487748"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44420, "epoch": 0, "train_loss": 3.7431882172822952, "train_ppl": 42.232421889693, "lr": 0.00056, "grad_norm": 0.7118, "tokens_per_sec": 150966, "dt_s": 4.341, "eta_s": 20758, "world_size": 1, "timestamp": "2026-05-05T01:53:48.828876"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44430, "epoch": 0, "train_loss": 3.7598941326141357, "train_ppl": 42.94387938185008, "lr": 0.00056, "grad_norm": 0.6892, "tokens_per_sec": 148440, "dt_s": 4.415, "eta_s": 20910, "world_size": 1, "timestamp": "2026-05-05T01:53:53.243866"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44440, "epoch": 0, "train_loss": 3.6866030544042587, "train_ppl": 39.90904757292844, "lr": 0.00056, "grad_norm": 0.6353, "tokens_per_sec": 146778, "dt_s": 4.465, "eta_s": 21012, "world_size": 1, "timestamp": "2026-05-05T01:53:57.708854"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44450, "epoch": 0, "train_loss": 3.7940659672021866, "train_ppl": 44.436711669688584, "lr": 0.00056, "grad_norm": 0.6522, "tokens_per_sec": 146239, "dt_s": 4.481, "eta_s": 21193, "world_size": 1, "timestamp": "2026-05-05T01:54:02.190279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44460, "epoch": 0, "train_loss": 3.7618023455142975, "train_ppl": 43.02590368149344, "lr": 0.00056, "grad_norm": 0.6359, "tokens_per_sec": 151287, "dt_s": 4.332, "eta_s": 21249, "world_size": 1, "timestamp": "2026-05-05T01:54:06.522162"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44470, "epoch": 0, "train_loss": 3.7603823989629745, "train_ppl": 42.96485255287097, "lr": 0.00056, "grad_norm": 0.6771, "tokens_per_sec": 147931, "dt_s": 4.43, "eta_s": 21330, "world_size": 1, "timestamp": "2026-05-05T01:54:10.952332"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44480, "epoch": 0, "train_loss": 3.795776277780533, "train_ppl": 44.512777277088645, "lr": 0.00056, "grad_norm": 0.714, "tokens_per_sec": 151331, "dt_s": 4.331, "eta_s": 21244, "world_size": 1, "timestamp": "2026-05-05T01:54:15.282967"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44490, "epoch": 0, "train_loss": 3.7237741500139236, "train_ppl": 41.42042638160002, "lr": 0.00056, "grad_norm": 0.8472, "tokens_per_sec": 151045, "dt_s": 4.339, "eta_s": 21118, "world_size": 1, "timestamp": "2026-05-05T01:54:19.621819"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44500, "epoch": 0, "train_loss": 3.7481373995542526, "train_ppl": 42.44195592678136, "lr": 0.00056, "grad_norm": 0.6832, "tokens_per_sec": 148689, "dt_s": 4.408, "eta_s": 21043, "world_size": 1, "timestamp": "2026-05-05T01:54:24.029436"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44510, "epoch": 0, "train_loss": 3.813051626086235, "train_ppl": 45.28843156734405, "lr": 0.00056, "grad_norm": 0.7602, "tokens_per_sec": 126699, "dt_s": 5.173, "eta_s": 21085, "world_size": 1, "timestamp": "2026-05-05T01:54:29.201982"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44520, "epoch": 0, "train_loss": 3.597535863518715, "train_ppl": 36.50816241949127, "lr": 0.00056, "grad_norm": 0.6681, "tokens_per_sec": 148133, "dt_s": 4.424, "eta_s": 21075, "world_size": 1, "timestamp": "2026-05-05T01:54:33.626124"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44530, "epoch": 0, "train_loss": 3.7710504680871964, "train_ppl": 43.425658152551655, "lr": 0.00056, "grad_norm": 0.721, "tokens_per_sec": 133240, "dt_s": 4.919, "eta_s": 21637, "world_size": 1, "timestamp": "2026-05-05T01:54:38.544781"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44540, "epoch": 0, "train_loss": 3.717296749353409, "train_ppl": 41.15299674389154, "lr": 0.00056, "grad_norm": 0.6828, "tokens_per_sec": 152409, "dt_s": 4.3, "eta_s": 21595, "world_size": 1, "timestamp": "2026-05-05T01:54:42.844768"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44550, "epoch": 0, "train_loss": 3.7212758511304855, "train_ppl": 41.31707493178083, "lr": 0.00056, "grad_norm": 0.7076, "tokens_per_sec": 148525, "dt_s": 4.412, "eta_s": 21595, "world_size": 1, "timestamp": "2026-05-05T01:54:47.257223"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44560, "epoch": 0, "train_loss": 3.774040162563324, "train_ppl": 43.55568187160091, "lr": 0.00056, "grad_norm": 0.6841, "tokens_per_sec": 150431, "dt_s": 4.357, "eta_s": 21568, "world_size": 1, "timestamp": "2026-05-05T01:54:51.613766"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44570, "epoch": 0, "train_loss": 3.718955844640732, "train_ppl": 41.22133015699585, "lr": 0.00056, "grad_norm": 0.6685, "tokens_per_sec": 149933, "dt_s": 4.371, "eta_s": 21512, "world_size": 1, "timestamp": "2026-05-05T01:54:55.984797"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44580, "epoch": 0, "train_loss": 3.7409251630306244, "train_ppl": 42.13695569111396, "lr": 0.00056, "grad_norm": 0.6483, "tokens_per_sec": 146100, "dt_s": 4.486, "eta_s": 21091, "world_size": 1, "timestamp": "2026-05-05T01:55:00.470485"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44590, "epoch": 0, "train_loss": 3.7864686995744705, "train_ppl": 44.10039324681897, "lr": 0.00056, "grad_norm": 0.6469, "tokens_per_sec": 150176, "dt_s": 4.364, "eta_s": 21148, "world_size": 1, "timestamp": "2026-05-05T01:55:04.834422"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44600, "epoch": 0, "train_loss": 3.820064902305603, "train_ppl": 45.60716823516555, "lr": 0.00056, "grad_norm": 1.1484, "tokens_per_sec": 150764, "dt_s": 4.347, "eta_s": 21081, "world_size": 1, "timestamp": "2026-05-05T01:55:09.181365"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44610, "epoch": 0, "train_loss": 3.6990253925323486, "train_ppl": 40.40790331858512, "lr": 0.00056, "grad_norm": 0.6658, "tokens_per_sec": 151555, "dt_s": 4.324, "eta_s": 21045, "world_size": 1, "timestamp": "2026-05-05T01:55:13.505631"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44620, "epoch": 0, "train_loss": 3.6781162172555923, "train_ppl": 39.57177918072866, "lr": 0.00056, "grad_norm": 0.6388, "tokens_per_sec": 151035, "dt_s": 4.339, "eta_s": 21010, "world_size": 1, "timestamp": "2026-05-05T01:55:17.844711"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44630, "epoch": 0, "train_loss": 3.6798351258039474, "train_ppl": 39.63985794405777, "lr": 0.00056, "grad_norm": 0.6566, "tokens_per_sec": 150797, "dt_s": 4.346, "eta_s": 20872, "world_size": 1, "timestamp": "2026-05-05T01:55:22.190670"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44640, "epoch": 0, "train_loss": 3.7015235275030136, "train_ppl": 40.50897390637905, "lr": 0.00056, "grad_norm": 0.6782, "tokens_per_sec": 149405, "dt_s": 4.386, "eta_s": 20889, "world_size": 1, "timestamp": "2026-05-05T01:55:26.577131"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44650, "epoch": 0, "train_loss": 3.577936053276062, "train_ppl": 35.79957612583415, "lr": 0.00056, "grad_norm": 0.6763, "tokens_per_sec": 151123, "dt_s": 4.337, "eta_s": 20875, "world_size": 1, "timestamp": "2026-05-05T01:55:30.913748"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44660, "epoch": 0, "train_loss": 3.7012661695480347, "train_ppl": 40.498549941098936, "lr": 0.00056, "grad_norm": 0.7682, "tokens_per_sec": 148618, "dt_s": 4.41, "eta_s": 20953, "world_size": 1, "timestamp": "2026-05-05T01:55:35.323431"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44670, "epoch": 0, "train_loss": 3.7606272995471954, "train_ppl": 42.97537595890368, "lr": 0.00056, "grad_norm": 0.6647, "tokens_per_sec": 150290, "dt_s": 4.361, "eta_s": 20969, "world_size": 1, "timestamp": "2026-05-05T01:55:39.684072"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44680, "epoch": 0, "train_loss": 3.8103594928979874, "train_ppl": 45.16667304629354, "lr": 0.00056, "grad_norm": 0.6834, "tokens_per_sec": 153230, "dt_s": 4.277, "eta_s": 20898, "world_size": 1, "timestamp": "2026-05-05T01:55:43.961029"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44690, "epoch": 0, "train_loss": 3.733257457613945, "train_ppl": 41.81509746169834, "lr": 0.00056, "grad_norm": 0.6394, "tokens_per_sec": 149243, "dt_s": 4.391, "eta_s": 20898, "world_size": 1, "timestamp": "2026-05-05T01:55:48.352268"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44700, "epoch": 0, "train_loss": 3.6729410141706467, "train_ppl": 39.367516194148024, "lr": 0.00056, "grad_norm": 0.6761, "tokens_per_sec": 154154, "dt_s": 4.251, "eta_s": 20812, "world_size": 1, "timestamp": "2026-05-05T01:55:52.603603"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44710, "epoch": 0, "train_loss": 3.708002135157585, "train_ppl": 40.77226762222836, "lr": 0.00056, "grad_norm": 0.6847, "tokens_per_sec": 151958, "dt_s": 4.313, "eta_s": 20715, "world_size": 1, "timestamp": "2026-05-05T01:55:56.916372"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44720, "epoch": 0, "train_loss": 3.7868693619966507, "train_ppl": 44.118066157395496, "lr": 0.00056, "grad_norm": 0.6681, "tokens_per_sec": 149577, "dt_s": 4.381, "eta_s": 20731, "world_size": 1, "timestamp": "2026-05-05T01:56:01.297799"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44730, "epoch": 0, "train_loss": 3.7847184240818024, "train_ppl": 44.02327291989908, "lr": 0.00056, "grad_norm": 0.7033, "tokens_per_sec": 150213, "dt_s": 4.363, "eta_s": 20809, "world_size": 1, "timestamp": "2026-05-05T01:56:05.660658"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44740, "epoch": 0, "train_loss": 3.727437525987625, "train_ppl": 41.57244325389603, "lr": 0.00056, "grad_norm": 0.677, "tokens_per_sec": 146375, "dt_s": 4.477, "eta_s": 20887, "world_size": 1, "timestamp": "2026-05-05T01:56:10.137937"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44750, "epoch": 0, "train_loss": 3.761176496744156, "train_ppl": 42.9989843971693, "lr": 0.00056, "grad_norm": 0.6703, "tokens_per_sec": 149168, "dt_s": 4.393, "eta_s": 21019, "world_size": 1, "timestamp": "2026-05-05T01:56:14.531384"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44760, "epoch": 0, "train_loss": 3.8121864944696426, "train_ppl": 45.24926805656762, "lr": 0.00056, "grad_norm": 0.633, "tokens_per_sec": 148228, "dt_s": 4.421, "eta_s": 21118, "world_size": 1, "timestamp": "2026-05-05T01:56:18.952659"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44770, "epoch": 0, "train_loss": 3.6284799724817276, "train_ppl": 37.65553564348911, "lr": 0.00056, "grad_norm": 0.6631, "tokens_per_sec": 142986, "dt_s": 4.583, "eta_s": 21307, "world_size": 1, "timestamp": "2026-05-05T01:56:23.536065"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44780, "epoch": 0, "train_loss": 3.6270740777254105, "train_ppl": 37.602633119785615, "lr": 0.00056, "grad_norm": 0.6572, "tokens_per_sec": 149117, "dt_s": 4.395, "eta_s": 21334, "world_size": 1, "timestamp": "2026-05-05T01:56:27.931020"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44790, "epoch": 0, "train_loss": 3.827509269118309, "train_ppl": 45.94795160920701, "lr": 0.00056, "grad_norm": 0.6774, "tokens_per_sec": 149436, "dt_s": 4.386, "eta_s": 21241, "world_size": 1, "timestamp": "2026-05-05T01:56:32.316589"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44800, "epoch": 0, "train_loss": 3.7620673924684525, "train_ppl": 43.03730907762997, "lr": 0.00056, "grad_norm": 0.6515, "tokens_per_sec": 147399, "dt_s": 4.446, "eta_s": 21287, "world_size": 1, "timestamp": "2026-05-05T01:56:36.762735"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44810, "epoch": 0, "train_loss": 3.70330548286438, "train_ppl": 40.58122344322152, "lr": 0.00056, "grad_norm": 0.6569, "tokens_per_sec": 150567, "dt_s": 4.353, "eta_s": 21217, "world_size": 1, "timestamp": "2026-05-05T01:56:41.115385"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44820, "epoch": 0, "train_loss": 3.756520166993141, "train_ppl": 42.799232363434726, "lr": 0.00056, "grad_norm": 0.6756, "tokens_per_sec": 148315, "dt_s": 4.419, "eta_s": 21055, "world_size": 1, "timestamp": "2026-05-05T01:56:45.534045"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44830, "epoch": 0, "train_loss": 3.752532660961151, "train_ppl": 42.62890997268125, "lr": 0.00056, "grad_norm": 0.7018, "tokens_per_sec": 131424, "dt_s": 4.987, "eta_s": 21617, "world_size": 1, "timestamp": "2026-05-05T01:56:50.520655"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44840, "epoch": 0, "train_loss": 3.7649487107992172, "train_ppl": 43.16149208465551, "lr": 0.00056, "grad_norm": 0.687, "tokens_per_sec": 150304, "dt_s": 4.36, "eta_s": 21588, "world_size": 1, "timestamp": "2026-05-05T01:56:54.880890"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44850, "epoch": 0, "train_loss": 3.7303767055273056, "train_ppl": 41.69481187209896, "lr": 0.00056, "grad_norm": 0.7259, "tokens_per_sec": 149992, "dt_s": 4.369, "eta_s": 21510, "world_size": 1, "timestamp": "2026-05-05T01:56:59.250204"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44860, "epoch": 0, "train_loss": 3.82195220887661, "train_ppl": 45.69332421926482, "lr": 0.00056, "grad_norm": 0.6346, "tokens_per_sec": 149033, "dt_s": 4.397, "eta_s": 21549, "world_size": 1, "timestamp": "2026-05-05T01:57:03.647632"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44870, "epoch": 0, "train_loss": 3.7700062692165375, "train_ppl": 43.38033679572397, "lr": 0.00056, "grad_norm": 0.6739, "tokens_per_sec": 148929, "dt_s": 4.4, "eta_s": 21527, "world_size": 1, "timestamp": "2026-05-05T01:57:08.048111"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44880, "epoch": 0, "train_loss": 3.6749434918165207, "train_ppl": 39.44642774824287, "lr": 0.00056, "grad_norm": 0.7233, "tokens_per_sec": 146265, "dt_s": 4.481, "eta_s": 21038, "world_size": 1, "timestamp": "2026-05-05T01:57:12.528758"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44890, "epoch": 0, "train_loss": 3.7911670356988907, "train_ppl": 44.30807922473343, "lr": 0.00056, "grad_norm": 0.6956, "tokens_per_sec": 145626, "dt_s": 4.5, "eta_s": 21168, "world_size": 1, "timestamp": "2026-05-05T01:57:17.029048"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44900, "epoch": 0, "train_loss": 3.7769440710544586, "train_ppl": 43.682347409579776, "lr": 0.00056, "grad_norm": 0.7024, "tokens_per_sec": 146636, "dt_s": 4.469, "eta_s": 21259, "world_size": 1, "timestamp": "2026-05-05T01:57:21.498378"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44910, "epoch": 0, "train_loss": 3.783163383603096, "train_ppl": 43.95486814838224, "lr": 0.00056, "grad_norm": 0.8517, "tokens_per_sec": 147221, "dt_s": 4.452, "eta_s": 21306, "world_size": 1, "timestamp": "2026-05-05T01:57:25.949905"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44920, "epoch": 0, "train_loss": 3.8170797675848007, "train_ppl": 45.471227695109825, "lr": 0.00056, "grad_norm": 0.6618, "tokens_per_sec": 151955, "dt_s": 4.313, "eta_s": 21218, "world_size": 1, "timestamp": "2026-05-05T01:57:30.262775"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44930, "epoch": 0, "train_loss": 3.763429746031761, "train_ppl": 43.095981065928626, "lr": 0.00056, "grad_norm": 0.6359, "tokens_per_sec": 150435, "dt_s": 4.356, "eta_s": 21095, "world_size": 1, "timestamp": "2026-05-05T01:57:34.619221"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44940, "epoch": 0, "train_loss": 3.7477438002824783, "train_ppl": 42.42525409096673, "lr": 0.00056, "grad_norm": 0.6777, "tokens_per_sec": 148814, "dt_s": 4.404, "eta_s": 20999, "world_size": 1, "timestamp": "2026-05-05T01:57:39.023123"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44950, "epoch": 0, "train_loss": 3.794198140501976, "train_ppl": 44.44258540466867, "lr": 0.00056, "grad_norm": 0.6875, "tokens_per_sec": 149947, "dt_s": 4.371, "eta_s": 20900, "world_size": 1, "timestamp": "2026-05-05T01:57:43.393718"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44960, "epoch": 0, "train_loss": 3.7429736107587814, "train_ppl": 42.22335950890948, "lr": 0.00056, "grad_norm": 0.6394, "tokens_per_sec": 147121, "dt_s": 4.455, "eta_s": 20899, "world_size": 1, "timestamp": "2026-05-05T01:57:47.848319"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44970, "epoch": 0, "train_loss": 3.5924582481384277, "train_ppl": 36.32325784690491, "lr": 0.00056, "grad_norm": 0.6546, "tokens_per_sec": 150436, "dt_s": 4.356, "eta_s": 20936, "world_size": 1, "timestamp": "2026-05-05T01:57:52.204713"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44980, "epoch": 0, "train_loss": 3.773413047194481, "train_ppl": 43.5283759969605, "lr": 0.00056, "grad_norm": 0.691, "tokens_per_sec": 150079, "dt_s": 4.367, "eta_s": 20941, "world_size": 1, "timestamp": "2026-05-05T01:57:56.571479"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 44990, "epoch": 0, "train_loss": 3.735253319144249, "train_ppl": 41.89863794599577, "lr": 0.00056, "grad_norm": 0.6678, "tokens_per_sec": 147950, "dt_s": 4.43, "eta_s": 20961, "world_size": 1, "timestamp": "2026-05-05T01:58:01.001103"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45000, "epoch": 0, "train_loss": 3.743227928876877, "train_ppl": 42.23409903981021, "lr": 0.00056, "grad_norm": 0.6909, "tokens_per_sec": 150831, "dt_s": 4.345, "eta_s": 20933, "world_size": 1, "timestamp": "2026-05-05T01:58:05.346095"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45010, "epoch": 0, "train_loss": 3.7501424998044968, "train_ppl": 42.527141677675964, "lr": 0.00056, "grad_norm": 0.713, "tokens_per_sec": 127366, "dt_s": 5.145, "eta_s": 20833, "world_size": 1, "timestamp": "2026-05-05T01:58:10.491584"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45020, "epoch": 0, "train_loss": 3.8161672800779343, "train_ppl": 45.42975469259215, "lr": 0.00056, "grad_norm": 0.6763, "tokens_per_sec": 148919, "dt_s": 4.401, "eta_s": 20870, "world_size": 1, "timestamp": "2026-05-05T01:58:14.892371"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45030, "epoch": 0, "train_loss": 3.7673797756433487, "train_ppl": 43.266548117956624, "lr": 0.00056, "grad_norm": 0.6652, "tokens_per_sec": 149158, "dt_s": 4.394, "eta_s": 20892, "world_size": 1, "timestamp": "2026-05-05T01:58:19.286094"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45040, "epoch": 0, "train_loss": 3.6898356080055237, "train_ppl": 40.038264446097806, "lr": 0.00056, "grad_norm": 0.6728, "tokens_per_sec": 149704, "dt_s": 4.378, "eta_s": 20838, "world_size": 1, "timestamp": "2026-05-05T01:58:23.663829"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45050, "epoch": 0, "train_loss": 3.6593127101659775, "train_ppl": 38.834643041752265, "lr": 0.00056, "grad_norm": 0.7058, "tokens_per_sec": 149242, "dt_s": 4.391, "eta_s": 20878, "world_size": 1, "timestamp": "2026-05-05T01:58:28.055067"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45060, "epoch": 0, "train_loss": 3.818168357014656, "train_ppl": 45.52075414503537, "lr": 0.00056, "grad_norm": 0.7148, "tokens_per_sec": 151099, "dt_s": 4.337, "eta_s": 20857, "world_size": 1, "timestamp": "2026-05-05T01:58:32.392370"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45070, "epoch": 0, "train_loss": 3.6693179607391357, "train_ppl": 39.22514364692157, "lr": 0.00056, "grad_norm": 0.7214, "tokens_per_sec": 151729, "dt_s": 4.319, "eta_s": 20775, "world_size": 1, "timestamp": "2026-05-05T01:58:36.711642"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45080, "epoch": 0, "train_loss": 3.7224911600351334, "train_ppl": 41.367318465382276, "lr": 0.00056, "grad_norm": 0.6813, "tokens_per_sec": 152098, "dt_s": 4.309, "eta_s": 20690, "world_size": 1, "timestamp": "2026-05-05T01:58:41.020448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45090, "epoch": 0, "train_loss": 3.753629580140114, "train_ppl": 42.675696097216004, "lr": 0.00056, "grad_norm": 0.6629, "tokens_per_sec": 151710, "dt_s": 4.32, "eta_s": 20630, "world_size": 1, "timestamp": "2026-05-05T01:58:45.340268"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45100, "epoch": 0, "train_loss": 3.7627764344215393, "train_ppl": 43.06783515617026, "lr": 0.00056, "grad_norm": 0.6588, "tokens_per_sec": 148791, "dt_s": 4.405, "eta_s": 20639, "world_size": 1, "timestamp": "2026-05-05T01:58:49.744838"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45110, "epoch": 0, "train_loss": 3.7626908868551254, "train_ppl": 43.06415096527096, "lr": 0.00056, "grad_norm": 0.661, "tokens_per_sec": 150233, "dt_s": 4.362, "eta_s": 20658, "world_size": 1, "timestamp": "2026-05-05T01:58:54.107122"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45120, "epoch": 0, "train_loss": 3.7392297238111496, "train_ppl": 42.06557557127191, "lr": 0.00056, "grad_norm": 0.6533, "tokens_per_sec": 135228, "dt_s": 4.846, "eta_s": 21155, "world_size": 1, "timestamp": "2026-05-05T01:58:58.953462"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45130, "epoch": 0, "train_loss": 3.8769547939300537, "train_ppl": 48.27697765559475, "lr": 0.00056, "grad_norm": 0.7151, "tokens_per_sec": 150607, "dt_s": 4.351, "eta_s": 21191, "world_size": 1, "timestamp": "2026-05-05T01:59:03.304922"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45140, "epoch": 0, "train_loss": 3.752906784415245, "train_ppl": 42.644861431445854, "lr": 0.00056, "grad_norm": 0.6286, "tokens_per_sec": 152274, "dt_s": 4.304, "eta_s": 21172, "world_size": 1, "timestamp": "2026-05-05T01:59:07.608723"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45150, "epoch": 0, "train_loss": 3.658371850848198, "train_ppl": 38.798122289133055, "lr": 0.00056, "grad_norm": 0.6546, "tokens_per_sec": 150822, "dt_s": 4.345, "eta_s": 21111, "world_size": 1, "timestamp": "2026-05-05T01:59:11.953971"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45160, "epoch": 0, "train_loss": 3.7820500135421753, "train_ppl": 43.905957347117464, "lr": 0.00056, "grad_norm": 0.6526, "tokens_per_sec": 152159, "dt_s": 4.307, "eta_s": 21054, "world_size": 1, "timestamp": "2026-05-05T01:59:16.261101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45170, "epoch": 0, "train_loss": 3.745448648929596, "train_ppl": 42.32799336837817, "lr": 0.00056, "grad_norm": 0.7145, "tokens_per_sec": 153246, "dt_s": 4.277, "eta_s": 20508, "world_size": 1, "timestamp": "2026-05-05T01:59:20.537574"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45180, "epoch": 0, "train_loss": 3.675036460161209, "train_ppl": 39.450095187809744, "lr": 0.00056, "grad_norm": 0.7197, "tokens_per_sec": 149116, "dt_s": 4.395, "eta_s": 20545, "world_size": 1, "timestamp": "2026-05-05T01:59:24.932536"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45190, "epoch": 0, "train_loss": 3.7798368632793427, "train_ppl": 43.808894313232415, "lr": 0.00056, "grad_norm": 0.707, "tokens_per_sec": 151712, "dt_s": 4.32, "eta_s": 20556, "world_size": 1, "timestamp": "2026-05-05T01:59:29.252308"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45200, "epoch": 0, "train_loss": 3.7314340323209763, "train_ppl": 41.73892022821255, "lr": 0.00056, "grad_norm": 0.7129, "tokens_per_sec": 151877, "dt_s": 4.315, "eta_s": 20523, "world_size": 1, "timestamp": "2026-05-05T01:59:33.567381"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45210, "epoch": 0, "train_loss": 3.6577940434217453, "train_ppl": 38.775710921292884, "lr": 0.00056, "grad_norm": 0.942, "tokens_per_sec": 149528, "dt_s": 4.383, "eta_s": 20590, "world_size": 1, "timestamp": "2026-05-05T01:59:37.950255"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45220, "epoch": 0, "train_loss": 3.737300395965576, "train_ppl": 41.98449552512794, "lr": 0.00056, "grad_norm": 0.6839, "tokens_per_sec": 151885, "dt_s": 4.315, "eta_s": 20622, "world_size": 1, "timestamp": "2026-05-05T01:59:42.265061"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45230, "epoch": 0, "train_loss": 3.713461756706238, "train_ppl": 40.99547753952002, "lr": 0.00056, "grad_norm": 0.7615, "tokens_per_sec": 152705, "dt_s": 4.292, "eta_s": 20520, "world_size": 1, "timestamp": "2026-05-05T01:59:46.556747"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45240, "epoch": 0, "train_loss": 3.690780371427536, "train_ppl": 40.07610900809135, "lr": 0.00056, "grad_norm": 0.6457, "tokens_per_sec": 149813, "dt_s": 4.375, "eta_s": 20568, "world_size": 1, "timestamp": "2026-05-05T01:59:50.931280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45250, "epoch": 0, "train_loss": 3.747950941324234, "train_ppl": 42.43404301253764, "lr": 0.00056, "grad_norm": 0.6687, "tokens_per_sec": 153832, "dt_s": 4.26, "eta_s": 20511, "world_size": 1, "timestamp": "2026-05-05T01:59:55.191492"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45260, "epoch": 0, "train_loss": 3.7205468863248825, "train_ppl": 41.2869672133519, "lr": 0.00056, "grad_norm": 0.6787, "tokens_per_sec": 149500, "dt_s": 4.384, "eta_s": 20508, "world_size": 1, "timestamp": "2026-05-05T01:59:59.575170"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45270, "epoch": 0, "train_loss": 3.8066785633563995, "train_ppl": 45.000723317186726, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 150449, "dt_s": 4.356, "eta_s": 20543, "world_size": 1, "timestamp": "2026-05-05T02:00:03.931229"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45280, "epoch": 0, "train_loss": 3.7025485783815384, "train_ppl": 40.55051895492646, "lr": 0.00056, "grad_norm": 0.6661, "tokens_per_sec": 152270, "dt_s": 4.304, "eta_s": 20550, "world_size": 1, "timestamp": "2026-05-05T02:00:08.235157"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45290, "epoch": 0, "train_loss": 3.6180613338947296, "train_ppl": 37.26525286479128, "lr": 0.00056, "grad_norm": 0.6784, "tokens_per_sec": 149693, "dt_s": 4.378, "eta_s": 20549, "world_size": 1, "timestamp": "2026-05-05T02:00:12.613209"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45300, "epoch": 0, "train_loss": 3.7408635318279266, "train_ppl": 42.13435881988166, "lr": 0.00056, "grad_norm": 0.7337, "tokens_per_sec": 151397, "dt_s": 4.329, "eta_s": 20609, "world_size": 1, "timestamp": "2026-05-05T02:00:16.941944"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45310, "epoch": 0, "train_loss": 3.772115334868431, "train_ppl": 43.47192532318608, "lr": 0.00056, "grad_norm": 0.6773, "tokens_per_sec": 149558, "dt_s": 4.382, "eta_s": 20604, "world_size": 1, "timestamp": "2026-05-05T02:00:21.323912"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45320, "epoch": 0, "train_loss": 3.798007592558861, "train_ppl": 44.61221018665729, "lr": 0.00056, "grad_norm": 0.7368, "tokens_per_sec": 150410, "dt_s": 4.357, "eta_s": 20600, "world_size": 1, "timestamp": "2026-05-05T02:00:25.681068"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45330, "epoch": 0, "train_loss": 3.6492351591587067, "train_ppl": 38.44525030369838, "lr": 0.00056, "grad_norm": 0.6382, "tokens_per_sec": 148427, "dt_s": 4.415, "eta_s": 20701, "world_size": 1, "timestamp": "2026-05-05T02:00:30.096449"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45340, "epoch": 0, "train_loss": 3.832681715488434, "train_ppl": 46.186230636220046, "lr": 0.00056, "grad_norm": 0.7644, "tokens_per_sec": 150358, "dt_s": 4.359, "eta_s": 20679, "world_size": 1, "timestamp": "2026-05-05T02:00:34.455114"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45350, "epoch": 0, "train_loss": 3.7538034468889236, "train_ppl": 42.6831166268226, "lr": 0.00056, "grad_norm": 0.7135, "tokens_per_sec": 149019, "dt_s": 4.398, "eta_s": 20740, "world_size": 1, "timestamp": "2026-05-05T02:00:38.852921"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45360, "epoch": 0, "train_loss": 3.914111867547035, "train_ppl": 50.10455226556148, "lr": 0.00056, "grad_norm": 0.8674, "tokens_per_sec": 151807, "dt_s": 4.317, "eta_s": 20674, "world_size": 1, "timestamp": "2026-05-05T02:00:43.169985"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45370, "epoch": 0, "train_loss": 3.7656024396419525, "train_ppl": 43.18971722171464, "lr": 0.00056, "grad_norm": 0.7146, "tokens_per_sec": 150796, "dt_s": 4.346, "eta_s": 20659, "world_size": 1, "timestamp": "2026-05-05T02:00:47.516007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45380, "epoch": 0, "train_loss": 3.7600308805704117, "train_ppl": 42.94975227113376, "lr": 0.00056, "grad_norm": 0.6933, "tokens_per_sec": 151072, "dt_s": 4.338, "eta_s": 20581, "world_size": 1, "timestamp": "2026-05-05T02:00:51.854071"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45390, "epoch": 0, "train_loss": 3.6872496753931046, "train_ppl": 39.93486194589217, "lr": 0.00056, "grad_norm": 0.7832, "tokens_per_sec": 150987, "dt_s": 4.341, "eta_s": 20560, "world_size": 1, "timestamp": "2026-05-05T02:00:56.194594"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45400, "epoch": 0, "train_loss": 3.9329250007867813, "train_ppl": 51.056098602040834, "lr": 0.00056, "grad_norm": 0.6699, "tokens_per_sec": 150922, "dt_s": 4.342, "eta_s": 20503, "world_size": 1, "timestamp": "2026-05-05T02:01:00.537001"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45410, "epoch": 0, "train_loss": 3.7428376525640488, "train_ppl": 42.21761928739902, "lr": 0.00056, "grad_norm": 0.6882, "tokens_per_sec": 150793, "dt_s": 4.346, "eta_s": 20526, "world_size": 1, "timestamp": "2026-05-05T02:01:04.883069"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45420, "epoch": 0, "train_loss": 3.750842958688736, "train_ppl": 42.556940627136456, "lr": 0.00056, "grad_norm": 0.7079, "tokens_per_sec": 135753, "dt_s": 4.828, "eta_s": 20977, "world_size": 1, "timestamp": "2026-05-05T02:01:09.710680"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45430, "epoch": 0, "train_loss": 3.8777732104063034, "train_ppl": 48.31650450203654, "lr": 0.00056, "grad_norm": 0.6588, "tokens_per_sec": 150935, "dt_s": 4.342, "eta_s": 20976, "world_size": 1, "timestamp": "2026-05-05T02:01:14.052661"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45440, "epoch": 0, "train_loss": 3.786592960357666, "train_ppl": 44.10587353670855, "lr": 0.00056, "grad_norm": 0.618, "tokens_per_sec": 151744, "dt_s": 4.319, "eta_s": 20952, "world_size": 1, "timestamp": "2026-05-05T02:01:18.371504"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45450, "epoch": 0, "train_loss": 3.7577054500579834, "train_ppl": 42.84999164485905, "lr": 0.00056, "grad_norm": 0.6569, "tokens_per_sec": 152132, "dt_s": 4.308, "eta_s": 20914, "world_size": 1, "timestamp": "2026-05-05T02:01:22.679375"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45460, "epoch": 0, "train_loss": 3.7952832132577896, "train_ppl": 44.49083501572439, "lr": 0.00056, "grad_norm": 0.6944, "tokens_per_sec": 148805, "dt_s": 4.404, "eta_s": 20965, "world_size": 1, "timestamp": "2026-05-05T02:01:27.083516"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45470, "epoch": 0, "train_loss": 3.7811797708272934, "train_ppl": 43.86776512825665, "lr": 0.00056, "grad_norm": 0.644, "tokens_per_sec": 152118, "dt_s": 4.308, "eta_s": 20470, "world_size": 1, "timestamp": "2026-05-05T02:01:31.391747"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45480, "epoch": 0, "train_loss": 3.8962964862585068, "train_ppl": 49.21982484088095, "lr": 0.00056, "grad_norm": 0.6875, "tokens_per_sec": 151841, "dt_s": 4.316, "eta_s": 20441, "world_size": 1, "timestamp": "2026-05-05T02:01:35.707851"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45490, "epoch": 0, "train_loss": 3.708289533853531, "train_ppl": 40.783987202789156, "lr": 0.00056, "grad_norm": 0.6463, "tokens_per_sec": 150430, "dt_s": 4.357, "eta_s": 20473, "world_size": 1, "timestamp": "2026-05-05T02:01:40.064439"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45500, "epoch": 0, "train_loss": 3.754950165748596, "train_ppl": 42.732090235762136, "lr": 0.00056, "grad_norm": 0.6526, "tokens_per_sec": 152815, "dt_s": 4.289, "eta_s": 20450, "world_size": 1, "timestamp": "2026-05-05T02:01:44.353024"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45510, "epoch": 0, "train_loss": 3.8450921028852463, "train_ppl": 46.7629911589829, "lr": 0.00056, "grad_norm": 0.6756, "tokens_per_sec": 126244, "dt_s": 5.191, "eta_s": 20459, "world_size": 1, "timestamp": "2026-05-05T02:01:49.544232"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45520, "epoch": 0, "train_loss": 3.6453813016414642, "train_ppl": 38.29737291902403, "lr": 0.00056, "grad_norm": 0.6535, "tokens_per_sec": 150402, "dt_s": 4.357, "eta_s": 20501, "world_size": 1, "timestamp": "2026-05-05T02:01:53.901612"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45530, "epoch": 0, "train_loss": 3.796159476041794, "train_ppl": 44.529837764510916, "lr": 0.00056, "grad_norm": 0.6405, "tokens_per_sec": 151084, "dt_s": 4.338, "eta_s": 20517, "world_size": 1, "timestamp": "2026-05-05T02:01:58.239357"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45540, "epoch": 0, "train_loss": 3.7563828974962234, "train_ppl": 42.79335773755243, "lr": 0.00056, "grad_norm": 0.6763, "tokens_per_sec": 147571, "dt_s": 4.441, "eta_s": 20593, "world_size": 1, "timestamp": "2026-05-05T02:02:02.680348"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45550, "epoch": 0, "train_loss": 3.6656802892684937, "train_ppl": 39.08271467291241, "lr": 0.00056, "grad_norm": 0.6376, "tokens_per_sec": 150746, "dt_s": 4.347, "eta_s": 20644, "world_size": 1, "timestamp": "2026-05-05T02:02:07.027794"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45560, "epoch": 0, "train_loss": 3.8228238821029663, "train_ppl": 45.73317123087354, "lr": 0.00056, "grad_norm": 0.7053, "tokens_per_sec": 150704, "dt_s": 4.349, "eta_s": 20573, "world_size": 1, "timestamp": "2026-05-05T02:02:11.376439"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45570, "epoch": 0, "train_loss": 3.8669099509716034, "train_ppl": 47.794470408327754, "lr": 0.00056, "grad_norm": 0.7935, "tokens_per_sec": 149271, "dt_s": 4.39, "eta_s": 20600, "world_size": 1, "timestamp": "2026-05-05T02:02:15.766846"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45580, "epoch": 0, "train_loss": 3.684453174471855, "train_ppl": 39.82334007585562, "lr": 0.00056, "grad_norm": 0.686, "tokens_per_sec": 150290, "dt_s": 4.361, "eta_s": 20617, "world_size": 1, "timestamp": "2026-05-05T02:02:20.127459"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45590, "epoch": 0, "train_loss": 3.721801295876503, "train_ppl": 41.338790476384325, "lr": 0.00056, "grad_norm": 0.6973, "tokens_per_sec": 151025, "dt_s": 4.339, "eta_s": 20517, "world_size": 1, "timestamp": "2026-05-05T02:02:24.466894"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45600, "epoch": 0, "train_loss": 3.7443766444921494, "train_ppl": 42.282641884498176, "lr": 0.00056, "grad_norm": 0.7633, "tokens_per_sec": 149051, "dt_s": 4.397, "eta_s": 20560, "world_size": 1, "timestamp": "2026-05-05T02:02:28.863758"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45610, "epoch": 0, "train_loss": 3.664439931511879, "train_ppl": 39.03426817632341, "lr": 0.00056, "grad_norm": 0.6862, "tokens_per_sec": 152601, "dt_s": 4.295, "eta_s": 20504, "world_size": 1, "timestamp": "2026-05-05T02:02:33.158365"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45620, "epoch": 0, "train_loss": 3.681493401527405, "train_ppl": 39.70564629070093, "lr": 0.00056, "grad_norm": 0.6384, "tokens_per_sec": 150745, "dt_s": 4.347, "eta_s": 20460, "world_size": 1, "timestamp": "2026-05-05T02:02:37.505847"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45630, "epoch": 0, "train_loss": 3.6711352318525314, "train_ppl": 39.29649117665704, "lr": 0.00056, "grad_norm": 0.6697, "tokens_per_sec": 149775, "dt_s": 4.376, "eta_s": 20469, "world_size": 1, "timestamp": "2026-05-05T02:02:41.881450"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45640, "epoch": 0, "train_loss": 3.867746740579605, "train_ppl": 47.83448106240048, "lr": 0.00056, "grad_norm": 0.6805, "tokens_per_sec": 151805, "dt_s": 4.317, "eta_s": 20444, "world_size": 1, "timestamp": "2026-05-05T02:02:46.198567"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45650, "epoch": 0, "train_loss": 3.744929760694504, "train_ppl": 42.306035567919096, "lr": 0.00056, "grad_norm": 0.6918, "tokens_per_sec": 148700, "dt_s": 4.407, "eta_s": 20449, "world_size": 1, "timestamp": "2026-05-05T02:02:50.605843"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45660, "epoch": 0, "train_loss": 3.774733603000641, "train_ppl": 43.58589561719135, "lr": 0.00056, "grad_norm": 0.6633, "tokens_per_sec": 152905, "dt_s": 4.286, "eta_s": 20437, "world_size": 1, "timestamp": "2026-05-05T02:02:54.891899"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45670, "epoch": 0, "train_loss": 3.762575089931488, "train_ppl": 43.059164557781024, "lr": 0.00056, "grad_norm": 0.6801, "tokens_per_sec": 151835, "dt_s": 4.316, "eta_s": 20403, "world_size": 1, "timestamp": "2026-05-05T02:02:59.208188"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45680, "epoch": 0, "train_loss": 3.730575382709503, "train_ppl": 41.70309650279025, "lr": 0.00056, "grad_norm": 0.6596, "tokens_per_sec": 148253, "dt_s": 4.421, "eta_s": 20441, "world_size": 1, "timestamp": "2026-05-05T02:03:03.628735"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45690, "epoch": 0, "train_loss": 3.801877051591873, "train_ppl": 44.78516972045583, "lr": 0.00056, "grad_norm": 0.6817, "tokens_per_sec": 151843, "dt_s": 4.316, "eta_s": 20436, "world_size": 1, "timestamp": "2026-05-05T02:03:07.944785"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45700, "epoch": 0, "train_loss": 3.70739908516407, "train_ppl": 40.74768731882401, "lr": 0.00056, "grad_norm": 0.7034, "tokens_per_sec": 151496, "dt_s": 4.326, "eta_s": 20355, "world_size": 1, "timestamp": "2026-05-05T02:03:12.270708"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45710, "epoch": 0, "train_loss": 3.8098291903734207, "train_ppl": 45.14272739533156, "lr": 0.00056, "grad_norm": 0.75, "tokens_per_sec": 149175, "dt_s": 4.393, "eta_s": 20451, "world_size": 1, "timestamp": "2026-05-05T02:03:16.663924"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45720, "epoch": 0, "train_loss": 3.6740736216306686, "train_ppl": 39.41212939652582, "lr": 0.00056, "grad_norm": 0.6374, "tokens_per_sec": 135995, "dt_s": 4.819, "eta_s": 20919, "world_size": 1, "timestamp": "2026-05-05T02:03:21.482937"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45730, "epoch": 0, "train_loss": 3.7759602814912796, "train_ppl": 43.63939430397589, "lr": 0.00056, "grad_norm": 0.6419, "tokens_per_sec": 150063, "dt_s": 4.367, "eta_s": 20865, "world_size": 1, "timestamp": "2026-05-05T02:03:25.850146"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45740, "epoch": 0, "train_loss": 3.6758667677640915, "train_ppl": 39.48286450420226, "lr": 0.00056, "grad_norm": 0.724, "tokens_per_sec": 150398, "dt_s": 4.358, "eta_s": 20899, "world_size": 1, "timestamp": "2026-05-05T02:03:30.207664"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45750, "epoch": 0, "train_loss": 3.6799512654542923, "train_ppl": 39.644461970648955, "lr": 0.00056, "grad_norm": 0.6426, "tokens_per_sec": 151672, "dt_s": 4.321, "eta_s": 20890, "world_size": 1, "timestamp": "2026-05-05T02:03:34.528577"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45760, "epoch": 0, "train_loss": 3.8097096383571625, "train_ppl": 45.13733081384458, "lr": 0.00056, "grad_norm": 0.6633, "tokens_per_sec": 150371, "dt_s": 4.358, "eta_s": 20853, "world_size": 1, "timestamp": "2026-05-05T02:03:38.886840"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45770, "epoch": 0, "train_loss": 3.759879156947136, "train_ppl": 42.943236273428276, "lr": 0.00056, "grad_norm": 0.6576, "tokens_per_sec": 153255, "dt_s": 4.276, "eta_s": 20339, "world_size": 1, "timestamp": "2026-05-05T02:03:43.163117"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45780, "epoch": 0, "train_loss": 3.803507760167122, "train_ppl": 44.858260859739694, "lr": 0.00056, "grad_norm": 0.7816, "tokens_per_sec": 152572, "dt_s": 4.295, "eta_s": 20267, "world_size": 1, "timestamp": "2026-05-05T02:03:47.458543"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45790, "epoch": 0, "train_loss": 3.654384508728981, "train_ppl": 38.64372891613957, "lr": 0.00056, "grad_norm": 0.7667, "tokens_per_sec": 147619, "dt_s": 4.44, "eta_s": 20340, "world_size": 1, "timestamp": "2026-05-05T02:03:51.898094"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45800, "epoch": 0, "train_loss": 3.7170501947402954, "train_ppl": 41.14285153342651, "lr": 0.00056, "grad_norm": 0.6487, "tokens_per_sec": 150991, "dt_s": 4.34, "eta_s": 20354, "world_size": 1, "timestamp": "2026-05-05T02:03:56.238473"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45810, "epoch": 0, "train_loss": 3.65051706135273, "train_ppl": 38.49456495593979, "lr": 0.00056, "grad_norm": 0.7025, "tokens_per_sec": 150768, "dt_s": 4.347, "eta_s": 20339, "world_size": 1, "timestamp": "2026-05-05T02:04:00.585307"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45820, "epoch": 0, "train_loss": 3.758622094988823, "train_ppl": 42.889287880082726, "lr": 0.00056, "grad_norm": 0.6162, "tokens_per_sec": 149729, "dt_s": 4.377, "eta_s": 20429, "world_size": 1, "timestamp": "2026-05-05T02:04:04.962298"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45830, "epoch": 0, "train_loss": 3.835637927055359, "train_ppl": 46.32296891952511, "lr": 0.00056, "grad_norm": 0.6749, "tokens_per_sec": 152144, "dt_s": 4.308, "eta_s": 20436, "world_size": 1, "timestamp": "2026-05-05T02:04:09.269790"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45840, "epoch": 0, "train_loss": 3.6857823729515076, "train_ppl": 39.87630839384444, "lr": 0.00056, "grad_norm": 0.7285, "tokens_per_sec": 151382, "dt_s": 4.329, "eta_s": 20328, "world_size": 1, "timestamp": "2026-05-05T02:04:13.598963"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45850, "epoch": 0, "train_loss": 3.7321992963552475, "train_ppl": 41.77087374757268, "lr": 0.00056, "grad_norm": 0.6551, "tokens_per_sec": 153009, "dt_s": 4.283, "eta_s": 20270, "world_size": 1, "timestamp": "2026-05-05T02:04:17.882113"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45860, "epoch": 0, "train_loss": 3.7341681122779846, "train_ppl": 41.853193918954595, "lr": 0.00056, "grad_norm": 0.6472, "tokens_per_sec": 150645, "dt_s": 4.35, "eta_s": 20269, "world_size": 1, "timestamp": "2026-05-05T02:04:22.232480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45870, "epoch": 0, "train_loss": 3.8398988842964172, "train_ppl": 46.52077022154589, "lr": 0.00056, "grad_norm": 0.7434, "tokens_per_sec": 151986, "dt_s": 4.312, "eta_s": 20204, "world_size": 1, "timestamp": "2026-05-05T02:04:26.544457"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45880, "epoch": 0, "train_loss": 3.7869992852211, "train_ppl": 44.12379849118071, "lr": 0.00056, "grad_norm": 0.6401, "tokens_per_sec": 151280, "dt_s": 4.332, "eta_s": 20223, "world_size": 1, "timestamp": "2026-05-05T02:04:30.876539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45890, "epoch": 0, "train_loss": 3.6700389683246613, "train_ppl": 39.25343547111863, "lr": 0.00056, "grad_norm": 0.6456, "tokens_per_sec": 151935, "dt_s": 4.313, "eta_s": 20204, "world_size": 1, "timestamp": "2026-05-05T02:04:35.189963"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45900, "epoch": 0, "train_loss": 3.711656555533409, "train_ppl": 40.92153921224145, "lr": 0.00056, "grad_norm": 1.0811, "tokens_per_sec": 150705, "dt_s": 4.349, "eta_s": 20261, "world_size": 1, "timestamp": "2026-05-05T02:04:39.538577"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45910, "epoch": 0, "train_loss": 3.8360414803028107, "train_ppl": 46.341666476540375, "lr": 0.00056, "grad_norm": 0.6581, "tokens_per_sec": 151762, "dt_s": 4.318, "eta_s": 20226, "world_size": 1, "timestamp": "2026-05-05T02:04:43.856910"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45920, "epoch": 0, "train_loss": 3.77895624935627, "train_ppl": 43.770332572440665, "lr": 0.00056, "grad_norm": 0.7197, "tokens_per_sec": 152210, "dt_s": 4.306, "eta_s": 20216, "world_size": 1, "timestamp": "2026-05-05T02:04:48.162581"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45930, "epoch": 0, "train_loss": 3.732112407684326, "train_ppl": 41.76724448954251, "lr": 0.00056, "grad_norm": 0.6494, "tokens_per_sec": 149857, "dt_s": 4.373, "eta_s": 20250, "world_size": 1, "timestamp": "2026-05-05T02:04:52.535779"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45940, "epoch": 0, "train_loss": 3.7918590903282166, "train_ppl": 44.33875344897263, "lr": 0.00056, "grad_norm": 0.6895, "tokens_per_sec": 150900, "dt_s": 4.343, "eta_s": 20274, "world_size": 1, "timestamp": "2026-05-05T02:04:56.878823"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45950, "epoch": 0, "train_loss": 3.804427608847618, "train_ppl": 44.89954265539272, "lr": 0.00056, "grad_norm": 0.6344, "tokens_per_sec": 151758, "dt_s": 4.318, "eta_s": 20241, "world_size": 1, "timestamp": "2026-05-05T02:05:01.197242"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45960, "epoch": 0, "train_loss": 3.7516505420207977, "train_ppl": 42.591322784410394, "lr": 0.00056, "grad_norm": 0.6531, "tokens_per_sec": 150181, "dt_s": 4.364, "eta_s": 20279, "world_size": 1, "timestamp": "2026-05-05T02:05:05.561024"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45970, "epoch": 0, "train_loss": 3.7106812447309494, "train_ppl": 40.88164744959018, "lr": 0.00056, "grad_norm": 0.6934, "tokens_per_sec": 152244, "dt_s": 4.305, "eta_s": 20274, "world_size": 1, "timestamp": "2026-05-05T02:05:09.865682"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45980, "epoch": 0, "train_loss": 3.866664230823517, "train_ppl": 47.78272778673974, "lr": 0.00056, "grad_norm": 0.6555, "tokens_per_sec": 151304, "dt_s": 4.331, "eta_s": 20230, "world_size": 1, "timestamp": "2026-05-05T02:05:14.197080"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 45990, "epoch": 0, "train_loss": 3.7846719622612, "train_ppl": 44.02122756600614, "lr": 0.00056, "grad_norm": 0.6527, "tokens_per_sec": 150169, "dt_s": 4.364, "eta_s": 20246, "world_size": 1, "timestamp": "2026-05-05T02:05:18.561241"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46000, "epoch": 0, "train_loss": 3.822627753019333, "train_ppl": 45.724202505451075, "lr": 0.00056, "grad_norm": 0.7072, "tokens_per_sec": 152306, "dt_s": 4.303, "eta_s": 20227, "world_size": 1, "timestamp": "2026-05-05T02:05:22.864212"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46010, "epoch": 0, "train_loss": 3.67828831076622, "train_ppl": 39.57858981314567, "lr": 0.00056, "grad_norm": 0.66, "tokens_per_sec": 115351, "dt_s": 5.681, "eta_s": 20733, "world_size": 1, "timestamp": "2026-05-05T02:05:28.545626"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46020, "epoch": 0, "train_loss": 3.8200370520353317, "train_ppl": 45.605898080891045, "lr": 0.00056, "grad_norm": 0.6471, "tokens_per_sec": 151824, "dt_s": 4.317, "eta_s": 20739, "world_size": 1, "timestamp": "2026-05-05T02:05:32.862208"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46030, "epoch": 0, "train_loss": 3.664416193962097, "train_ppl": 39.03334160943663, "lr": 0.00056, "grad_norm": 0.6704, "tokens_per_sec": 149253, "dt_s": 4.391, "eta_s": 20790, "world_size": 1, "timestamp": "2026-05-05T02:05:37.253114"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46040, "epoch": 0, "train_loss": 3.8223176300525665, "train_ppl": 45.71002457868104, "lr": 0.00056, "grad_norm": 0.651, "tokens_per_sec": 150002, "dt_s": 4.369, "eta_s": 20790, "world_size": 1, "timestamp": "2026-05-05T02:05:41.622150"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46050, "epoch": 0, "train_loss": 3.670670300722122, "train_ppl": 39.27822526111857, "lr": 0.00056, "grad_norm": 0.6509, "tokens_per_sec": 153648, "dt_s": 4.265, "eta_s": 20751, "world_size": 1, "timestamp": "2026-05-05T02:05:45.887465"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46060, "epoch": 0, "train_loss": 3.8490273356437683, "train_ppl": 46.94737697628172, "lr": 0.00056, "grad_norm": 0.76, "tokens_per_sec": 150939, "dt_s": 4.342, "eta_s": 20217, "world_size": 1, "timestamp": "2026-05-05T02:05:50.229369"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46070, "epoch": 0, "train_loss": 3.724009335041046, "train_ppl": 41.430168991315064, "lr": 0.00056, "grad_norm": 0.7239, "tokens_per_sec": 150294, "dt_s": 4.361, "eta_s": 20253, "world_size": 1, "timestamp": "2026-05-05T02:05:54.589856"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46080, "epoch": 0, "train_loss": 3.7083702236413956, "train_ppl": 40.78727818683745, "lr": 0.00056, "grad_norm": 0.6838, "tokens_per_sec": 150405, "dt_s": 4.357, "eta_s": 20218, "world_size": 1, "timestamp": "2026-05-05T02:05:58.947143"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46090, "epoch": 0, "train_loss": 3.790847525000572, "train_ppl": 44.29392458079969, "lr": 0.00056, "grad_norm": 0.6727, "tokens_per_sec": 147913, "dt_s": 4.431, "eta_s": 20271, "world_size": 1, "timestamp": "2026-05-05T02:06:03.377864"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46100, "epoch": 0, "train_loss": 3.687399536371231, "train_ppl": 39.94084707182194, "lr": 0.00056, "grad_norm": 0.621, "tokens_per_sec": 152363, "dt_s": 4.301, "eta_s": 20300, "world_size": 1, "timestamp": "2026-05-05T02:06:07.679209"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46110, "epoch": 0, "train_loss": 3.7721535563468933, "train_ppl": 43.4735869161976, "lr": 0.00056, "grad_norm": 0.6514, "tokens_per_sec": 150917, "dt_s": 4.343, "eta_s": 20296, "world_size": 1, "timestamp": "2026-05-05T02:06:12.021717"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46120, "epoch": 0, "train_loss": 3.858615517616272, "train_ppl": 47.39968189564984, "lr": 0.00056, "grad_norm": 0.709, "tokens_per_sec": 148029, "dt_s": 4.427, "eta_s": 20354, "world_size": 1, "timestamp": "2026-05-05T02:06:16.448962"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46130, "epoch": 0, "train_loss": 3.714703232049942, "train_ppl": 41.04640401953244, "lr": 0.00056, "grad_norm": 0.6375, "tokens_per_sec": 151366, "dt_s": 4.33, "eta_s": 20324, "world_size": 1, "timestamp": "2026-05-05T02:06:20.778588"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46140, "epoch": 0, "train_loss": 3.799631506204605, "train_ppl": 44.6847154187305, "lr": 0.00056, "grad_norm": 0.6361, "tokens_per_sec": 150461, "dt_s": 4.356, "eta_s": 20250, "world_size": 1, "timestamp": "2026-05-05T02:06:25.134307"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46150, "epoch": 0, "train_loss": 3.7271355241537094, "train_ppl": 41.559890195411754, "lr": 0.00056, "grad_norm": 0.6753, "tokens_per_sec": 150294, "dt_s": 4.361, "eta_s": 20300, "world_size": 1, "timestamp": "2026-05-05T02:06:29.494794"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46160, "epoch": 0, "train_loss": 3.8357552140951157, "train_ppl": 46.328402322050046, "lr": 0.00056, "grad_norm": 0.6426, "tokens_per_sec": 151499, "dt_s": 4.326, "eta_s": 20280, "world_size": 1, "timestamp": "2026-05-05T02:06:33.820644"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46170, "epoch": 0, "train_loss": 3.9254213720560074, "train_ppl": 50.674426347953755, "lr": 0.00056, "grad_norm": 0.7081, "tokens_per_sec": 151091, "dt_s": 4.338, "eta_s": 20193, "world_size": 1, "timestamp": "2026-05-05T02:06:38.158147"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46180, "epoch": 0, "train_loss": 3.780924752354622, "train_ppl": 43.85657946413024, "lr": 0.00056, "grad_norm": 0.7653, "tokens_per_sec": 152467, "dt_s": 4.298, "eta_s": 20159, "world_size": 1, "timestamp": "2026-05-05T02:06:42.456523"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46190, "epoch": 0, "train_loss": 3.8390068411827087, "train_ppl": 46.479290192563056, "lr": 0.00056, "grad_norm": 0.6758, "tokens_per_sec": 150431, "dt_s": 4.357, "eta_s": 20156, "world_size": 1, "timestamp": "2026-05-05T02:06:46.813080"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46200, "epoch": 0, "train_loss": 3.715587243437767, "train_ppl": 41.08270555123461, "lr": 0.00056, "grad_norm": 0.6682, "tokens_per_sec": 148685, "dt_s": 4.408, "eta_s": 20195, "world_size": 1, "timestamp": "2026-05-05T02:06:51.220753"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46210, "epoch": 0, "train_loss": 3.7881560772657394, "train_ppl": 44.174870084165626, "lr": 0.00056, "grad_norm": 0.6953, "tokens_per_sec": 151138, "dt_s": 4.336, "eta_s": 20201, "world_size": 1, "timestamp": "2026-05-05T02:06:55.556931"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46220, "epoch": 0, "train_loss": 3.777286648750305, "train_ppl": 43.697314571065995, "lr": 0.00056, "grad_norm": 0.6574, "tokens_per_sec": 150516, "dt_s": 4.354, "eta_s": 20212, "world_size": 1, "timestamp": "2026-05-05T02:06:59.911034"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46230, "epoch": 0, "train_loss": 3.465707764029503, "train_ppl": 31.99909957202137, "lr": 0.00056, "grad_norm": 0.7514, "tokens_per_sec": 148287, "dt_s": 4.42, "eta_s": 20320, "world_size": 1, "timestamp": "2026-05-05T02:07:04.330563"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46240, "epoch": 0, "train_loss": 3.8662926852703094, "train_ppl": 47.76497762441159, "lr": 0.00056, "grad_norm": 0.7128, "tokens_per_sec": 152246, "dt_s": 4.305, "eta_s": 20267, "world_size": 1, "timestamp": "2026-05-05T02:07:08.635191"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46250, "epoch": 0, "train_loss": 3.7931348383426666, "train_ppl": 44.39535462238896, "lr": 0.00056, "grad_norm": 0.6488, "tokens_per_sec": 151120, "dt_s": 4.337, "eta_s": 20197, "world_size": 1, "timestamp": "2026-05-05T02:07:12.971862"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46260, "epoch": 0, "train_loss": 3.6908096075057983, "train_ppl": 40.07728069347842, "lr": 0.00056, "grad_norm": 0.6467, "tokens_per_sec": 146882, "dt_s": 4.462, "eta_s": 20309, "world_size": 1, "timestamp": "2026-05-05T02:07:17.433670"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46270, "epoch": 0, "train_loss": 3.789212852716446, "train_ppl": 44.22157767778105, "lr": 0.00056, "grad_norm": 0.6813, "tokens_per_sec": 149246, "dt_s": 4.391, "eta_s": 20339, "world_size": 1, "timestamp": "2026-05-05T02:07:21.824810"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46280, "epoch": 0, "train_loss": 3.746183529496193, "train_ppl": 42.35911082053266, "lr": 0.00056, "grad_norm": 0.7028, "tokens_per_sec": 150862, "dt_s": 4.344, "eta_s": 20265, "world_size": 1, "timestamp": "2026-05-05T02:07:26.168902"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46290, "epoch": 0, "train_loss": 3.7983403354883194, "train_ppl": 44.62705705412253, "lr": 0.00056, "grad_norm": 0.6365, "tokens_per_sec": 150305, "dt_s": 4.36, "eta_s": 20312, "world_size": 1, "timestamp": "2026-05-05T02:07:30.529096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46300, "epoch": 0, "train_loss": 3.8018387258052826, "train_ppl": 44.78345332648999, "lr": 0.00056, "grad_norm": 0.685, "tokens_per_sec": 151440, "dt_s": 4.328, "eta_s": 20299, "world_size": 1, "timestamp": "2026-05-05T02:07:34.856649"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46310, "epoch": 0, "train_loss": 3.7330887019634247, "train_ppl": 41.80804152310613, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 133238, "dt_s": 4.919, "eta_s": 20719, "world_size": 1, "timestamp": "2026-05-05T02:07:39.775357"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46320, "epoch": 0, "train_loss": 3.6859351843595505, "train_ppl": 39.88240241428372, "lr": 0.00056, "grad_norm": 0.6667, "tokens_per_sec": 151044, "dt_s": 4.339, "eta_s": 20666, "world_size": 1, "timestamp": "2026-05-05T02:07:44.114223"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46330, "epoch": 0, "train_loss": 3.691511243581772, "train_ppl": 40.105410226633516, "lr": 0.00056, "grad_norm": 0.662, "tokens_per_sec": 152463, "dt_s": 4.298, "eta_s": 20619, "world_size": 1, "timestamp": "2026-05-05T02:07:48.412721"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46340, "epoch": 0, "train_loss": 3.5840426236391068, "train_ppl": 36.01885760388078, "lr": 0.00056, "grad_norm": 0.6671, "tokens_per_sec": 148720, "dt_s": 4.407, "eta_s": 20658, "world_size": 1, "timestamp": "2026-05-05T02:07:52.819414"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46350, "epoch": 0, "train_loss": 3.748073533177376, "train_ppl": 42.439245399385456, "lr": 0.00056, "grad_norm": 0.7307, "tokens_per_sec": 151712, "dt_s": 4.32, "eta_s": 20646, "world_size": 1, "timestamp": "2026-05-05T02:07:57.139160"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46360, "epoch": 0, "train_loss": 3.8608891367912292, "train_ppl": 47.507573326830716, "lr": 0.00056, "grad_norm": 0.6213, "tokens_per_sec": 151085, "dt_s": 4.338, "eta_s": 20103, "world_size": 1, "timestamp": "2026-05-05T02:08:01.476838"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46370, "epoch": 0, "train_loss": 3.7518395483493805, "train_ppl": 42.599373574760556, "lr": 0.00056, "grad_norm": 0.6743, "tokens_per_sec": 149381, "dt_s": 4.387, "eta_s": 20143, "world_size": 1, "timestamp": "2026-05-05T02:08:05.864009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46380, "epoch": 0, "train_loss": 3.6994417011737823, "train_ppl": 40.42472898000998, "lr": 0.00056, "grad_norm": 0.6581, "tokens_per_sec": 151064, "dt_s": 4.338, "eta_s": 20176, "world_size": 1, "timestamp": "2026-05-05T02:08:10.202325"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46390, "epoch": 0, "train_loss": 3.747227981686592, "train_ppl": 42.40337599901984, "lr": 0.00056, "grad_norm": 0.6556, "tokens_per_sec": 150038, "dt_s": 4.368, "eta_s": 20136, "world_size": 1, "timestamp": "2026-05-05T02:08:14.570302"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46400, "epoch": 0, "train_loss": 3.754759281873703, "train_ppl": 42.7239341472533, "lr": 0.00056, "grad_norm": 0.6383, "tokens_per_sec": 151359, "dt_s": 4.33, "eta_s": 20141, "world_size": 1, "timestamp": "2026-05-05T02:08:18.900118"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46410, "epoch": 0, "train_loss": 3.691863536834717, "train_ppl": 40.11954158110721, "lr": 0.00056, "grad_norm": 0.6311, "tokens_per_sec": 151342, "dt_s": 4.33, "eta_s": 20130, "world_size": 1, "timestamp": "2026-05-05T02:08:23.230414"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46420, "epoch": 0, "train_loss": 3.687813639640808, "train_ppl": 39.95739013221536, "lr": 0.00056, "grad_norm": 0.6716, "tokens_per_sec": 149579, "dt_s": 4.381, "eta_s": 20120, "world_size": 1, "timestamp": "2026-05-05T02:08:27.611802"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46430, "epoch": 0, "train_loss": 3.8950487822294235, "train_ppl": 49.15845136304601, "lr": 0.00056, "grad_norm": 0.6892, "tokens_per_sec": 150573, "dt_s": 4.352, "eta_s": 20129, "world_size": 1, "timestamp": "2026-05-05T02:08:31.964277"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46440, "epoch": 0, "train_loss": 3.7716167867183685, "train_ppl": 43.450257876817986, "lr": 0.00056, "grad_norm": 0.6678, "tokens_per_sec": 151047, "dt_s": 4.339, "eta_s": 20097, "world_size": 1, "timestamp": "2026-05-05T02:08:36.303021"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46450, "epoch": 0, "train_loss": 3.7437284886837006, "train_ppl": 42.25524502424037, "lr": 0.00056, "grad_norm": 0.6884, "tokens_per_sec": 149110, "dt_s": 4.395, "eta_s": 20153, "world_size": 1, "timestamp": "2026-05-05T02:08:40.698158"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46460, "epoch": 0, "train_loss": 3.904461905360222, "train_ppl": 49.62337065700983, "lr": 0.00056, "grad_norm": 0.763, "tokens_per_sec": 151548, "dt_s": 4.324, "eta_s": 20144, "world_size": 1, "timestamp": "2026-05-05T02:08:45.022597"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46470, "epoch": 0, "train_loss": 3.801318109035492, "train_ppl": 44.760144377720366, "lr": 0.00056, "grad_norm": 0.6391, "tokens_per_sec": 151991, "dt_s": 4.312, "eta_s": 20075, "world_size": 1, "timestamp": "2026-05-05T02:08:49.334427"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46480, "epoch": 0, "train_loss": 3.779974117875099, "train_ppl": 43.81490769798482, "lr": 0.00056, "grad_norm": 0.7162, "tokens_per_sec": 149332, "dt_s": 4.389, "eta_s": 20104, "world_size": 1, "timestamp": "2026-05-05T02:08:53.723037"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46490, "epoch": 0, "train_loss": 3.78523588180542, "train_ppl": 44.04605899739786, "lr": 0.00056, "grad_norm": 0.6316, "tokens_per_sec": 152852, "dt_s": 4.288, "eta_s": 20052, "world_size": 1, "timestamp": "2026-05-05T02:08:58.010588"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46500, "epoch": 0, "train_loss": 3.7527166604995728, "train_ppl": 42.63675439410247, "lr": 0.00056, "grad_norm": 0.6396, "tokens_per_sec": 151142, "dt_s": 4.336, "eta_s": 19993, "world_size": 1, "timestamp": "2026-05-05T02:09:02.346644"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46510, "epoch": 0, "train_loss": 3.8091209530830383, "train_ppl": 45.110766951525626, "lr": 0.00056, "grad_norm": 0.6966, "tokens_per_sec": 128201, "dt_s": 5.112, "eta_s": 20007, "world_size": 1, "timestamp": "2026-05-05T02:09:07.458622"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46520, "epoch": 0, "train_loss": 3.7434013187885284, "train_ppl": 42.24142264141238, "lr": 0.00056, "grad_norm": 0.7196, "tokens_per_sec": 151102, "dt_s": 4.337, "eta_s": 20026, "world_size": 1, "timestamp": "2026-05-05T02:09:11.795807"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46530, "epoch": 0, "train_loss": 3.661542162299156, "train_ppl": 38.921319604244665, "lr": 0.00056, "grad_norm": 0.7009, "tokens_per_sec": 147628, "dt_s": 4.439, "eta_s": 20069, "world_size": 1, "timestamp": "2026-05-05T02:09:16.235081"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46540, "epoch": 0, "train_loss": 3.732441693544388, "train_ppl": 41.78100011720921, "lr": 0.00056, "grad_norm": 0.7056, "tokens_per_sec": 148523, "dt_s": 4.413, "eta_s": 20180, "world_size": 1, "timestamp": "2026-05-05T02:09:20.647638"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46550, "epoch": 0, "train_loss": 3.8054103702306747, "train_ppl": 44.94368988157269, "lr": 0.00056, "grad_norm": 0.7023, "tokens_per_sec": 151036, "dt_s": 4.339, "eta_s": 20178, "world_size": 1, "timestamp": "2026-05-05T02:09:24.986674"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46560, "epoch": 0, "train_loss": 3.861973211169243, "train_ppl": 47.559102995786304, "lr": 0.00056, "grad_norm": 0.6651, "tokens_per_sec": 149854, "dt_s": 4.373, "eta_s": 20201, "world_size": 1, "timestamp": "2026-05-05T02:09:29.359993"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46570, "epoch": 0, "train_loss": 3.7170830368995667, "train_ppl": 41.144202775698176, "lr": 0.00056, "grad_norm": 0.6176, "tokens_per_sec": 152318, "dt_s": 4.303, "eta_s": 20164, "world_size": 1, "timestamp": "2026-05-05T02:09:33.662555"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46580, "epoch": 0, "train_loss": 3.774440288543701, "train_ppl": 43.57311311862562, "lr": 0.00056, "grad_norm": 0.6307, "tokens_per_sec": 152191, "dt_s": 4.306, "eta_s": 20037, "world_size": 1, "timestamp": "2026-05-05T02:09:37.968732"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46590, "epoch": 0, "train_loss": 3.7780349999666214, "train_ppl": 43.73002774852137, "lr": 0.00056, "grad_norm": 0.6874, "tokens_per_sec": 149091, "dt_s": 4.396, "eta_s": 20017, "world_size": 1, "timestamp": "2026-05-05T02:09:42.364436"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46600, "epoch": 0, "train_loss": 3.6714940816164017, "train_ppl": 39.310595243705926, "lr": 0.00056, "grad_norm": 0.6414, "tokens_per_sec": 137149, "dt_s": 4.778, "eta_s": 20418, "world_size": 1, "timestamp": "2026-05-05T02:09:47.142911"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46610, "epoch": 0, "train_loss": 3.8147298395633698, "train_ppl": 45.36449903442984, "lr": 0.00056, "grad_norm": 0.6973, "tokens_per_sec": 151080, "dt_s": 4.338, "eta_s": 20381, "world_size": 1, "timestamp": "2026-05-05T02:09:51.480749"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46620, "epoch": 0, "train_loss": 3.7025954723358154, "train_ppl": 40.55242057369511, "lr": 0.00056, "grad_norm": 0.6881, "tokens_per_sec": 152172, "dt_s": 4.307, "eta_s": 20380, "world_size": 1, "timestamp": "2026-05-05T02:09:55.787441"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46630, "epoch": 0, "train_loss": 3.856152057647705, "train_ppl": 47.283058384431534, "lr": 0.00056, "grad_norm": 0.6946, "tokens_per_sec": 151618, "dt_s": 4.322, "eta_s": 20391, "world_size": 1, "timestamp": "2026-05-05T02:10:00.109883"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46640, "epoch": 0, "train_loss": 3.861351802945137, "train_ppl": 47.529558558582465, "lr": 0.00056, "grad_norm": 0.6477, "tokens_per_sec": 149635, "dt_s": 4.38, "eta_s": 20372, "world_size": 1, "timestamp": "2026-05-05T02:10:04.489597"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46650, "epoch": 0, "train_loss": 4.0282774567604065, "train_ppl": 56.16408281432897, "lr": 0.00056, "grad_norm": 0.9108, "tokens_per_sec": 153462, "dt_s": 4.27, "eta_s": 19900, "world_size": 1, "timestamp": "2026-05-05T02:10:08.760086"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46660, "epoch": 0, "train_loss": 3.7820391803979874, "train_ppl": 43.90548171012714, "lr": 0.00056, "grad_norm": 0.9728, "tokens_per_sec": 151899, "dt_s": 4.314, "eta_s": 19874, "world_size": 1, "timestamp": "2026-05-05T02:10:13.074533"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46670, "epoch": 0, "train_loss": 3.7355221062898636, "train_ppl": 41.909901274945625, "lr": 0.00056, "grad_norm": 0.6682, "tokens_per_sec": 149357, "dt_s": 4.388, "eta_s": 19944, "world_size": 1, "timestamp": "2026-05-05T02:10:17.462404"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46680, "epoch": 0, "train_loss": 3.7329075634479523, "train_ppl": 41.800469162373616, "lr": 0.00056, "grad_norm": 0.6856, "tokens_per_sec": 152165, "dt_s": 4.307, "eta_s": 19926, "world_size": 1, "timestamp": "2026-05-05T02:10:21.769334"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46690, "epoch": 0, "train_loss": 3.8310613334178925, "train_ppl": 46.111451897616696, "lr": 0.00056, "grad_norm": 0.7255, "tokens_per_sec": 150931, "dt_s": 4.342, "eta_s": 19887, "world_size": 1, "timestamp": "2026-05-05T02:10:26.111440"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46700, "epoch": 0, "train_loss": 3.7184978127479553, "train_ppl": 41.20245379643886, "lr": 0.00056, "grad_norm": 0.6351, "tokens_per_sec": 150973, "dt_s": 4.341, "eta_s": 19947, "world_size": 1, "timestamp": "2026-05-05T02:10:30.452363"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46710, "epoch": 0, "train_loss": 3.6408401429653168, "train_ppl": 38.1238527612003, "lr": 0.00056, "grad_norm": 0.7106, "tokens_per_sec": 152152, "dt_s": 4.307, "eta_s": 19936, "world_size": 1, "timestamp": "2026-05-05T02:10:34.759674"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46720, "epoch": 0, "train_loss": 3.770729437470436, "train_ppl": 43.41171942423071, "lr": 0.00056, "grad_norm": 0.6981, "tokens_per_sec": 150771, "dt_s": 4.347, "eta_s": 19894, "world_size": 1, "timestamp": "2026-05-05T02:10:39.106414"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46730, "epoch": 0, "train_loss": 3.6300467401742935, "train_ppl": 37.714579361998716, "lr": 0.00056, "grad_norm": 0.6728, "tokens_per_sec": 152009, "dt_s": 4.311, "eta_s": 19893, "world_size": 1, "timestamp": "2026-05-05T02:10:43.417675"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46740, "epoch": 0, "train_loss": 3.7235169261693954, "train_ppl": 41.409773430439415, "lr": 0.00056, "grad_norm": 0.6792, "tokens_per_sec": 152790, "dt_s": 4.289, "eta_s": 19841, "world_size": 1, "timestamp": "2026-05-05T02:10:47.706944"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46750, "epoch": 0, "train_loss": 3.7969409078359604, "train_ppl": 44.56464839482445, "lr": 0.00056, "grad_norm": 0.7216, "tokens_per_sec": 151341, "dt_s": 4.33, "eta_s": 19827, "world_size": 1, "timestamp": "2026-05-05T02:10:52.037302"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46760, "epoch": 0, "train_loss": 3.7099782824516296, "train_ppl": 40.85291929210522, "lr": 0.00056, "grad_norm": 0.73, "tokens_per_sec": 151096, "dt_s": 4.337, "eta_s": 19850, "world_size": 1, "timestamp": "2026-05-05T02:10:56.374718"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46770, "epoch": 0, "train_loss": 3.7071066796779633, "train_ppl": 40.73577421331941, "lr": 0.00056, "grad_norm": 0.7083, "tokens_per_sec": 152104, "dt_s": 4.309, "eta_s": 19810, "world_size": 1, "timestamp": "2026-05-05T02:11:00.683325"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46780, "epoch": 0, "train_loss": 3.672983929514885, "train_ppl": 39.36920570090992, "lr": 0.00056, "grad_norm": 0.6458, "tokens_per_sec": 149766, "dt_s": 4.376, "eta_s": 19866, "world_size": 1, "timestamp": "2026-05-05T02:11:05.059226"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46790, "epoch": 0, "train_loss": 3.811452329158783, "train_ppl": 45.216059805283365, "lr": 0.00056, "grad_norm": 0.6858, "tokens_per_sec": 151963, "dt_s": 4.313, "eta_s": 19883, "world_size": 1, "timestamp": "2026-05-05T02:11:09.371818"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46800, "epoch": 0, "train_loss": 3.7397419065237045, "train_ppl": 42.08712635037031, "lr": 0.00056, "grad_norm": 0.6556, "tokens_per_sec": 152195, "dt_s": 4.306, "eta_s": 19856, "world_size": 1, "timestamp": "2026-05-05T02:11:13.677920"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46810, "epoch": 0, "train_loss": 3.7585148960351944, "train_ppl": 42.884690439724906, "lr": 0.00056, "grad_norm": 0.682, "tokens_per_sec": 149797, "dt_s": 4.375, "eta_s": 19886, "world_size": 1, "timestamp": "2026-05-05T02:11:18.052891"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46820, "epoch": 0, "train_loss": 3.730613023042679, "train_ppl": 41.70466625077981, "lr": 0.00056, "grad_norm": 0.6294, "tokens_per_sec": 152606, "dt_s": 4.294, "eta_s": 19869, "world_size": 1, "timestamp": "2026-05-05T02:11:22.347366"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46830, "epoch": 0, "train_loss": 3.9887103140354156, "train_ppl": 53.985220466353475, "lr": 0.00056, "grad_norm": 1.1026, "tokens_per_sec": 150937, "dt_s": 4.342, "eta_s": 19834, "world_size": 1, "timestamp": "2026-05-05T02:11:26.689304"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46840, "epoch": 0, "train_loss": 3.735086739063263, "train_ppl": 41.891659048782294, "lr": 0.00056, "grad_norm": 0.8093, "tokens_per_sec": 148386, "dt_s": 4.417, "eta_s": 19925, "world_size": 1, "timestamp": "2026-05-05T02:11:31.105903"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46850, "epoch": 0, "train_loss": 3.7297516465187073, "train_ppl": 41.66875829768545, "lr": 0.00056, "grad_norm": 0.6341, "tokens_per_sec": 152172, "dt_s": 4.307, "eta_s": 19921, "world_size": 1, "timestamp": "2026-05-05T02:11:35.412582"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46860, "epoch": 0, "train_loss": 3.7467636466026306, "train_ppl": 42.383691194391474, "lr": 0.00056, "grad_norm": 0.6949, "tokens_per_sec": 151526, "dt_s": 4.325, "eta_s": 19871, "world_size": 1, "timestamp": "2026-05-05T02:11:39.737644"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46870, "epoch": 0, "train_loss": 3.6993124186992645, "train_ppl": 40.419503108829794, "lr": 0.00056, "grad_norm": 0.6918, "tokens_per_sec": 149534, "dt_s": 4.383, "eta_s": 19947, "world_size": 1, "timestamp": "2026-05-05T02:11:44.120331"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46880, "epoch": 0, "train_loss": 3.8641588240861893, "train_ppl": 47.66316246104448, "lr": 0.00056, "grad_norm": 0.6743, "tokens_per_sec": 153211, "dt_s": 4.277, "eta_s": 19884, "world_size": 1, "timestamp": "2026-05-05T02:11:48.397809"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46890, "epoch": 0, "train_loss": 3.777224689722061, "train_ppl": 43.69460721179183, "lr": 0.00056, "grad_norm": 0.6396, "tokens_per_sec": 149891, "dt_s": 4.372, "eta_s": 19839, "world_size": 1, "timestamp": "2026-05-05T02:11:52.770054"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46900, "epoch": 0, "train_loss": 3.8418138325214386, "train_ppl": 46.60994043883379, "lr": 0.00056, "grad_norm": 0.6648, "tokens_per_sec": 136674, "dt_s": 4.795, "eta_s": 20282, "world_size": 1, "timestamp": "2026-05-05T02:11:57.565102"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46910, "epoch": 0, "train_loss": 3.7941968888044357, "train_ppl": 44.44252977602865, "lr": 0.00056, "grad_norm": 0.6569, "tokens_per_sec": 151243, "dt_s": 4.333, "eta_s": 20285, "world_size": 1, "timestamp": "2026-05-05T02:12:01.898294"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46920, "epoch": 0, "train_loss": 3.732773035764694, "train_ppl": 41.794846220327265, "lr": 0.00056, "grad_norm": 0.6861, "tokens_per_sec": 149260, "dt_s": 4.391, "eta_s": 20288, "world_size": 1, "timestamp": "2026-05-05T02:12:06.288998"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46930, "epoch": 0, "train_loss": 3.701129913330078, "train_ppl": 40.493032137777284, "lr": 0.00056, "grad_norm": 0.6786, "tokens_per_sec": 152003, "dt_s": 4.311, "eta_s": 20314, "world_size": 1, "timestamp": "2026-05-05T02:12:10.600496"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46940, "epoch": 0, "train_loss": 3.6643140017986298, "train_ppl": 39.02935291162051, "lr": 0.00056, "grad_norm": 0.6945, "tokens_per_sec": 152585, "dt_s": 4.295, "eta_s": 20239, "world_size": 1, "timestamp": "2026-05-05T02:12:14.895523"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46950, "epoch": 0, "train_loss": 3.8554701805114746, "train_ppl": 47.250828137771094, "lr": 0.00056, "grad_norm": 0.679, "tokens_per_sec": 146262, "dt_s": 4.481, "eta_s": 19947, "world_size": 1, "timestamp": "2026-05-05T02:12:19.376261"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46960, "epoch": 0, "train_loss": 3.7819136679172516, "train_ppl": 43.899971370015265, "lr": 0.00056, "grad_norm": 0.7089, "tokens_per_sec": 151839, "dt_s": 4.316, "eta_s": 19927, "world_size": 1, "timestamp": "2026-05-05T02:12:23.692406"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46970, "epoch": 0, "train_loss": 3.7537295818328857, "train_ppl": 42.67996395245876, "lr": 0.00056, "grad_norm": 0.6385, "tokens_per_sec": 149527, "dt_s": 4.383, "eta_s": 19916, "world_size": 1, "timestamp": "2026-05-05T02:12:28.075315"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46980, "epoch": 0, "train_loss": 3.7988699078559875, "train_ppl": 44.65069656925327, "lr": 0.00056, "grad_norm": 0.7115, "tokens_per_sec": 151168, "dt_s": 4.335, "eta_s": 19933, "world_size": 1, "timestamp": "2026-05-05T02:12:32.410610"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 46990, "epoch": 0, "train_loss": 3.6852483600378036, "train_ppl": 39.855019614958614, "lr": 0.00056, "grad_norm": 0.6226, "tokens_per_sec": 152055, "dt_s": 4.31, "eta_s": 19943, "world_size": 1, "timestamp": "2026-05-05T02:12:36.720625"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47000, "epoch": 0, "train_loss": 3.6602480858564377, "train_ppl": 38.87098501685153, "lr": 0.00056, "grad_norm": 0.6346, "tokens_per_sec": 149545, "dt_s": 4.382, "eta_s": 19848, "world_size": 1, "timestamp": "2026-05-05T02:12:41.103001"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47010, "epoch": 0, "train_loss": 3.783559486269951, "train_ppl": 43.97228223753296, "lr": 0.00056, "grad_norm": 0.7269, "tokens_per_sec": 129765, "dt_s": 5.05, "eta_s": 19809, "world_size": 1, "timestamp": "2026-05-05T02:12:46.153370"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47020, "epoch": 0, "train_loss": 3.6972392201423645, "train_ppl": 40.33579225790511, "lr": 0.00056, "grad_norm": 0.7283, "tokens_per_sec": 151552, "dt_s": 4.324, "eta_s": 19751, "world_size": 1, "timestamp": "2026-05-05T02:12:50.477706"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47030, "epoch": 0, "train_loss": 3.805091544985771, "train_ppl": 44.92936298264922, "lr": 0.00056, "grad_norm": 0.6637, "tokens_per_sec": 150073, "dt_s": 4.367, "eta_s": 19776, "world_size": 1, "timestamp": "2026-05-05T02:12:54.844625"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47040, "epoch": 0, "train_loss": 3.6666615307331085, "train_ppl": 39.12108307435386, "lr": 0.00056, "grad_norm": 0.6765, "tokens_per_sec": 152422, "dt_s": 4.3, "eta_s": 19762, "world_size": 1, "timestamp": "2026-05-05T02:12:59.144288"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47050, "epoch": 0, "train_loss": 3.7121823728084564, "train_ppl": 40.9430621225439, "lr": 0.00056, "grad_norm": 0.7456, "tokens_per_sec": 152686, "dt_s": 4.292, "eta_s": 19675, "world_size": 1, "timestamp": "2026-05-05T02:13:03.436482"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47060, "epoch": 0, "train_loss": 3.736763581633568, "train_ppl": 41.961963694453814, "lr": 0.00056, "grad_norm": 0.7558, "tokens_per_sec": 149091, "dt_s": 4.396, "eta_s": 19779, "world_size": 1, "timestamp": "2026-05-05T02:13:07.832200"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47070, "epoch": 0, "train_loss": 3.782222554087639, "train_ppl": 43.91353355853026, "lr": 0.00056, "grad_norm": 0.6754, "tokens_per_sec": 151898, "dt_s": 4.314, "eta_s": 19765, "world_size": 1, "timestamp": "2026-05-05T02:13:12.146650"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47080, "epoch": 0, "train_loss": 3.7726221084594727, "train_ppl": 43.49396133005316, "lr": 0.00056, "grad_norm": 0.6429, "tokens_per_sec": 151458, "dt_s": 4.327, "eta_s": 19725, "world_size": 1, "timestamp": "2026-05-05T02:13:16.473690"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47090, "epoch": 0, "train_loss": 3.6752209663391113, "train_ppl": 39.457374645622544, "lr": 0.00056, "grad_norm": 0.6093, "tokens_per_sec": 151608, "dt_s": 4.323, "eta_s": 19741, "world_size": 1, "timestamp": "2026-05-05T02:13:20.796419"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47100, "epoch": 0, "train_loss": 3.740766391158104, "train_ppl": 42.13026605883334, "lr": 0.00056, "grad_norm": 0.6979, "tokens_per_sec": 151508, "dt_s": 4.326, "eta_s": 19767, "world_size": 1, "timestamp": "2026-05-05T02:13:25.122001"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47110, "epoch": 0, "train_loss": 3.797181859612465, "train_ppl": 44.57538761979019, "lr": 0.00056, "grad_norm": 0.6766, "tokens_per_sec": 151038, "dt_s": 4.339, "eta_s": 19711, "world_size": 1, "timestamp": "2026-05-05T02:13:29.461066"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47120, "epoch": 0, "train_loss": 3.6749414652585983, "train_ppl": 39.44634780785321, "lr": 0.00056, "grad_norm": 0.6914, "tokens_per_sec": 152744, "dt_s": 4.291, "eta_s": 19685, "world_size": 1, "timestamp": "2026-05-05T02:13:33.751658"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47130, "epoch": 0, "train_loss": 3.7247060984373093, "train_ppl": 41.459046075648615, "lr": 0.00056, "grad_norm": 0.6316, "tokens_per_sec": 152136, "dt_s": 4.308, "eta_s": 19663, "world_size": 1, "timestamp": "2026-05-05T02:13:38.059396"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47140, "epoch": 0, "train_loss": 3.682515174150467, "train_ppl": 39.74623716685329, "lr": 0.00056, "grad_norm": 0.666, "tokens_per_sec": 149105, "dt_s": 4.395, "eta_s": 19725, "world_size": 1, "timestamp": "2026-05-05T02:13:42.454684"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47150, "epoch": 0, "train_loss": 3.56641948223114, "train_ppl": 35.38965275055629, "lr": 0.00056, "grad_norm": 0.6519, "tokens_per_sec": 152252, "dt_s": 4.304, "eta_s": 19702, "world_size": 1, "timestamp": "2026-05-05T02:13:46.759138"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47160, "epoch": 0, "train_loss": 3.5860466957092285, "train_ppl": 36.09111437010218, "lr": 0.00056, "grad_norm": 0.6553, "tokens_per_sec": 152197, "dt_s": 4.306, "eta_s": 19667, "world_size": 1, "timestamp": "2026-05-05T02:13:51.065126"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47170, "epoch": 0, "train_loss": 3.745280310511589, "train_ppl": 42.32086854064507, "lr": 0.00056, "grad_norm": 0.6384, "tokens_per_sec": 149845, "dt_s": 4.374, "eta_s": 19738, "world_size": 1, "timestamp": "2026-05-05T02:13:55.438703"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47180, "epoch": 0, "train_loss": 3.7597872614860535, "train_ppl": 42.93929016624801, "lr": 0.00056, "grad_norm": 0.6782, "tokens_per_sec": 152132, "dt_s": 4.308, "eta_s": 19734, "world_size": 1, "timestamp": "2026-05-05T02:13:59.746548"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47190, "epoch": 0, "train_loss": 3.8269024044275284, "train_ppl": 45.92007587901539, "lr": 0.00056, "grad_norm": 0.6734, "tokens_per_sec": 135252, "dt_s": 4.845, "eta_s": 20140, "world_size": 1, "timestamp": "2026-05-05T02:14:04.592034"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47200, "epoch": 0, "train_loss": 3.7051394134759903, "train_ppl": 40.65571487633229, "lr": 0.00056, "grad_norm": 0.6467, "tokens_per_sec": 149399, "dt_s": 4.387, "eta_s": 20210, "world_size": 1, "timestamp": "2026-05-05T02:14:08.978694"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47210, "epoch": 0, "train_loss": 3.742874041199684, "train_ppl": 42.21915555691587, "lr": 0.00056, "grad_norm": 0.6947, "tokens_per_sec": 150143, "dt_s": 4.365, "eta_s": 20259, "world_size": 1, "timestamp": "2026-05-05T02:14:13.343592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47220, "epoch": 0, "train_loss": 3.6928815841674805, "train_ppl": 40.16040597081764, "lr": 0.00056, "grad_norm": 0.8401, "tokens_per_sec": 148066, "dt_s": 4.426, "eta_s": 20302, "world_size": 1, "timestamp": "2026-05-05T02:14:17.769720"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47230, "epoch": 0, "train_loss": 3.7537168115377426, "train_ppl": 42.67941892020251, "lr": 0.00056, "grad_norm": 0.7124, "tokens_per_sec": 149586, "dt_s": 4.381, "eta_s": 20364, "world_size": 1, "timestamp": "2026-05-05T02:14:22.150879"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47240, "epoch": 0, "train_loss": 3.81091770529747, "train_ppl": 45.19189268153304, "lr": 0.00056, "grad_norm": 0.6945, "tokens_per_sec": 150145, "dt_s": 4.365, "eta_s": 19923, "world_size": 1, "timestamp": "2026-05-05T02:14:26.515702"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47250, "epoch": 0, "train_loss": 3.714895009994507, "train_ppl": 41.054276569393664, "lr": 0.00056, "grad_norm": 0.6515, "tokens_per_sec": 148079, "dt_s": 4.426, "eta_s": 19954, "world_size": 1, "timestamp": "2026-05-05T02:14:30.941488"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47260, "epoch": 0, "train_loss": 3.7288794964551926, "train_ppl": 41.632432730465, "lr": 0.00056, "grad_norm": 0.6727, "tokens_per_sec": 151890, "dt_s": 4.315, "eta_s": 19904, "world_size": 1, "timestamp": "2026-05-05T02:14:35.256191"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47270, "epoch": 0, "train_loss": 3.787956401705742, "train_ppl": 44.16605032281845, "lr": 0.00056, "grad_norm": 0.6323, "tokens_per_sec": 151077, "dt_s": 4.338, "eta_s": 19820, "world_size": 1, "timestamp": "2026-05-05T02:14:39.594079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47280, "epoch": 0, "train_loss": 3.647973135113716, "train_ppl": 38.39676207648937, "lr": 0.00056, "grad_norm": 0.6447, "tokens_per_sec": 148363, "dt_s": 4.417, "eta_s": 19848, "world_size": 1, "timestamp": "2026-05-05T02:14:44.011378"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47290, "epoch": 0, "train_loss": 3.703250750899315, "train_ppl": 40.579002413898934, "lr": 0.00056, "grad_norm": 0.6763, "tokens_per_sec": 150489, "dt_s": 4.355, "eta_s": 19835, "world_size": 1, "timestamp": "2026-05-05T02:14:48.366256"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47300, "epoch": 0, "train_loss": 3.812707930803299, "train_ppl": 45.272868821617124, "lr": 0.00056, "grad_norm": 0.67, "tokens_per_sec": 150752, "dt_s": 4.347, "eta_s": 19759, "world_size": 1, "timestamp": "2026-05-05T02:14:52.713496"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47310, "epoch": 0, "train_loss": 3.6944844126701355, "train_ppl": 40.224827828989135, "lr": 0.00056, "grad_norm": 0.6933, "tokens_per_sec": 149617, "dt_s": 4.38, "eta_s": 19814, "world_size": 1, "timestamp": "2026-05-05T02:14:57.093744"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47320, "epoch": 0, "train_loss": 3.8796554505825043, "train_ppl": 48.40753341024299, "lr": 0.00056, "grad_norm": 0.6504, "tokens_per_sec": 149774, "dt_s": 4.376, "eta_s": 19844, "world_size": 1, "timestamp": "2026-05-05T02:15:01.469411"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47330, "epoch": 0, "train_loss": 3.802835777401924, "train_ppl": 44.82812700741385, "lr": 0.00056, "grad_norm": 0.6796, "tokens_per_sec": 149188, "dt_s": 4.393, "eta_s": 19818, "world_size": 1, "timestamp": "2026-05-05T02:15:05.862275"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47340, "epoch": 0, "train_loss": 3.742176115512848, "train_ppl": 42.18970000386536, "lr": 0.00056, "grad_norm": 0.6359, "tokens_per_sec": 152526, "dt_s": 4.297, "eta_s": 19761, "world_size": 1, "timestamp": "2026-05-05T02:15:10.158954"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47350, "epoch": 0, "train_loss": 3.6773499697446823, "train_ppl": 39.54146901745574, "lr": 0.00056, "grad_norm": 0.6476, "tokens_per_sec": 153264, "dt_s": 4.276, "eta_s": 19692, "world_size": 1, "timestamp": "2026-05-05T02:15:14.434981"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47360, "epoch": 0, "train_loss": 3.8258403092622757, "train_ppl": 45.871330279251445, "lr": 0.00056, "grad_norm": 0.6596, "tokens_per_sec": 148715, "dt_s": 4.407, "eta_s": 19711, "world_size": 1, "timestamp": "2026-05-05T02:15:18.841798"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47370, "epoch": 0, "train_loss": 3.8332191556692123, "train_ppl": 46.21105964381818, "lr": 0.00056, "grad_norm": 0.7359, "tokens_per_sec": 152904, "dt_s": 4.286, "eta_s": 19626, "world_size": 1, "timestamp": "2026-05-05T02:15:23.127876"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47380, "epoch": 0, "train_loss": 3.787618011236191, "train_ppl": 44.15110748071176, "lr": 0.00056, "grad_norm": 0.6643, "tokens_per_sec": 152098, "dt_s": 4.309, "eta_s": 19545, "world_size": 1, "timestamp": "2026-05-05T02:15:27.436696"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47390, "epoch": 0, "train_loss": 3.7487368881702423, "train_ppl": 42.467407024259174, "lr": 0.00056, "grad_norm": 0.6517, "tokens_per_sec": 150383, "dt_s": 4.358, "eta_s": 19596, "world_size": 1, "timestamp": "2026-05-05T02:15:31.794620"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47400, "epoch": 0, "train_loss": 3.6883577406406403, "train_ppl": 39.97913690382056, "lr": 0.00056, "grad_norm": 0.7503, "tokens_per_sec": 153313, "dt_s": 4.275, "eta_s": 19591, "world_size": 1, "timestamp": "2026-05-05T02:15:36.069296"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47410, "epoch": 0, "train_loss": 3.7982077896595, "train_ppl": 44.62114231585313, "lr": 0.00056, "grad_norm": 0.683, "tokens_per_sec": 151393, "dt_s": 4.329, "eta_s": 19516, "world_size": 1, "timestamp": "2026-05-05T02:15:40.398121"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47420, "epoch": 0, "train_loss": 3.79461969435215, "train_ppl": 44.461324297106316, "lr": 0.00056, "grad_norm": 0.7698, "tokens_per_sec": 151263, "dt_s": 4.333, "eta_s": 19554, "world_size": 1, "timestamp": "2026-05-05T02:15:44.730728"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47430, "epoch": 0, "train_loss": 3.618545040488243, "train_ppl": 37.28328267352807, "lr": 0.00056, "grad_norm": 0.6677, "tokens_per_sec": 151096, "dt_s": 4.337, "eta_s": 19575, "world_size": 1, "timestamp": "2026-05-05T02:15:49.068089"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47440, "epoch": 0, "train_loss": 3.708635300397873, "train_ppl": 40.798091379344434, "lr": 0.00056, "grad_norm": 0.6668, "tokens_per_sec": 149430, "dt_s": 4.386, "eta_s": 19596, "world_size": 1, "timestamp": "2026-05-05T02:15:53.453847"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47450, "epoch": 0, "train_loss": 3.667013317346573, "train_ppl": 39.13484776865929, "lr": 0.00056, "grad_norm": 0.6456, "tokens_per_sec": 151073, "dt_s": 4.338, "eta_s": 19649, "world_size": 1, "timestamp": "2026-05-05T02:15:57.791866"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47460, "epoch": 0, "train_loss": 3.743741735816002, "train_ppl": 42.255804788769254, "lr": 0.00056, "grad_norm": 0.6806, "tokens_per_sec": 152124, "dt_s": 4.308, "eta_s": 19626, "world_size": 1, "timestamp": "2026-05-05T02:16:02.099950"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47470, "epoch": 0, "train_loss": 3.32035244256258, "train_ppl": 27.670100961477033, "lr": 0.00056, "grad_norm": 0.722, "tokens_per_sec": 149016, "dt_s": 4.398, "eta_s": 19681, "world_size": 1, "timestamp": "2026-05-05T02:16:06.497842"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47480, "epoch": 0, "train_loss": 3.7861254513263702, "train_ppl": 44.08525846173929, "lr": 0.00056, "grad_norm": 0.6588, "tokens_per_sec": 151784, "dt_s": 4.318, "eta_s": 19659, "world_size": 1, "timestamp": "2026-05-05T02:16:10.815575"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47490, "epoch": 0, "train_loss": 3.785995841026306, "train_ppl": 44.07954492843604, "lr": 0.00056, "grad_norm": 0.6852, "tokens_per_sec": 135279, "dt_s": 4.845, "eta_s": 20069, "world_size": 1, "timestamp": "2026-05-05T02:16:15.660099"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47500, "epoch": 0, "train_loss": 3.7967788875102997, "train_ppl": 44.55742860087104, "lr": 0.00056, "grad_norm": 0.7613, "tokens_per_sec": 149341, "dt_s": 4.388, "eta_s": 20110, "world_size": 1, "timestamp": "2026-05-05T02:16:20.048434"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47510, "epoch": 0, "train_loss": 3.7125770151615143, "train_ppl": 40.959223177629994, "lr": 0.00056, "grad_norm": 0.6482, "tokens_per_sec": 126883, "dt_s": 5.165, "eta_s": 20186, "world_size": 1, "timestamp": "2026-05-05T02:16:25.213533"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47520, "epoch": 0, "train_loss": 3.7485240697860718, "train_ppl": 42.45837014095792, "lr": 0.00056, "grad_norm": 0.6704, "tokens_per_sec": 149067, "dt_s": 4.396, "eta_s": 20180, "world_size": 1, "timestamp": "2026-05-05T02:16:29.609919"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47530, "epoch": 0, "train_loss": 3.6733269542455673, "train_ppl": 39.38271262856533, "lr": 0.00056, "grad_norm": 0.6438, "tokens_per_sec": 151709, "dt_s": 4.32, "eta_s": 20177, "world_size": 1, "timestamp": "2026-05-05T02:16:33.929764"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47540, "epoch": 0, "train_loss": 3.7729597836732864, "train_ppl": 43.50865064271371, "lr": 0.00056, "grad_norm": 0.6736, "tokens_per_sec": 151380, "dt_s": 4.329, "eta_s": 19708, "world_size": 1, "timestamp": "2026-05-05T02:16:38.259004"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47550, "epoch": 0, "train_loss": 3.718736082315445, "train_ppl": 41.212272256958165, "lr": 0.00056, "grad_norm": 0.7009, "tokens_per_sec": 149551, "dt_s": 4.382, "eta_s": 19698, "world_size": 1, "timestamp": "2026-05-05T02:16:42.641194"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47560, "epoch": 0, "train_loss": 3.735777661204338, "train_ppl": 41.92061292483067, "lr": 0.00056, "grad_norm": 0.6799, "tokens_per_sec": 151888, "dt_s": 4.315, "eta_s": 19619, "world_size": 1, "timestamp": "2026-05-05T02:16:46.955985"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47570, "epoch": 0, "train_loss": 3.7312779277563095, "train_ppl": 41.732405100774464, "lr": 0.00056, "grad_norm": 0.657, "tokens_per_sec": 151511, "dt_s": 4.326, "eta_s": 19551, "world_size": 1, "timestamp": "2026-05-05T02:16:51.281459"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47580, "epoch": 0, "train_loss": 3.7968777418136597, "train_ppl": 44.56183351215353, "lr": 0.00056, "grad_norm": 0.6906, "tokens_per_sec": 151144, "dt_s": 4.336, "eta_s": 19561, "world_size": 1, "timestamp": "2026-05-05T02:16:55.617439"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47590, "epoch": 0, "train_loss": 3.6833377927541733, "train_ppl": 39.77894661282891, "lr": 0.00056, "grad_norm": 0.6337, "tokens_per_sec": 150214, "dt_s": 4.363, "eta_s": 19587, "world_size": 1, "timestamp": "2026-05-05T02:16:59.980303"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47600, "epoch": 0, "train_loss": 3.786402866244316, "train_ppl": 44.09749006663447, "lr": 0.00056, "grad_norm": 0.6587, "tokens_per_sec": 151164, "dt_s": 4.335, "eta_s": 19541, "world_size": 1, "timestamp": "2026-05-05T02:17:04.315700"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47610, "epoch": 0, "train_loss": 3.68213327229023, "train_ppl": 39.73106090304798, "lr": 0.00056, "grad_norm": 0.6344, "tokens_per_sec": 148155, "dt_s": 4.423, "eta_s": 19634, "world_size": 1, "timestamp": "2026-05-05T02:17:08.739197"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47620, "epoch": 0, "train_loss": 3.786969244480133, "train_ppl": 44.12247299948924, "lr": 0.00056, "grad_norm": 0.6815, "tokens_per_sec": 150567, "dt_s": 4.353, "eta_s": 19654, "world_size": 1, "timestamp": "2026-05-05T02:17:13.091814"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47630, "epoch": 0, "train_loss": 3.7454284131526947, "train_ppl": 42.327136837214006, "lr": 0.00056, "grad_norm": 0.624, "tokens_per_sec": 151681, "dt_s": 4.321, "eta_s": 19636, "world_size": 1, "timestamp": "2026-05-05T02:17:17.412467"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47640, "epoch": 0, "train_loss": 3.799596533179283, "train_ppl": 44.683152686373546, "lr": 0.00056, "grad_norm": 0.7538, "tokens_per_sec": 149227, "dt_s": 4.392, "eta_s": 19658, "world_size": 1, "timestamp": "2026-05-05T02:17:21.804169"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47650, "epoch": 0, "train_loss": 3.708535209298134, "train_ppl": 40.7940080578665, "lr": 0.00056, "grad_norm": 0.7098, "tokens_per_sec": 151296, "dt_s": 4.332, "eta_s": 19650, "world_size": 1, "timestamp": "2026-05-05T02:17:26.135808"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47660, "epoch": 0, "train_loss": 3.7806731462478638, "train_ppl": 43.84554626898339, "lr": 0.00056, "grad_norm": 0.7426, "tokens_per_sec": 149254, "dt_s": 4.391, "eta_s": 19616, "world_size": 1, "timestamp": "2026-05-05T02:17:30.526729"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47670, "epoch": 0, "train_loss": 3.799831449985504, "train_ppl": 44.6936507429312, "lr": 0.00056, "grad_norm": 0.6709, "tokens_per_sec": 151470, "dt_s": 4.327, "eta_s": 19589, "world_size": 1, "timestamp": "2026-05-05T02:17:34.853379"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47680, "epoch": 0, "train_loss": 3.8122875839471817, "train_ppl": 45.25384251264528, "lr": 0.00056, "grad_norm": 0.7018, "tokens_per_sec": 152125, "dt_s": 4.308, "eta_s": 19573, "world_size": 1, "timestamp": "2026-05-05T02:17:39.161396"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47690, "epoch": 0, "train_loss": 3.7847093790769577, "train_ppl": 44.02287473098305, "lr": 0.00056, "grad_norm": 0.6566, "tokens_per_sec": 149179, "dt_s": 4.393, "eta_s": 19570, "world_size": 1, "timestamp": "2026-05-05T02:17:43.554531"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47700, "epoch": 0, "train_loss": 3.7385136485099792, "train_ppl": 42.035464233854974, "lr": 0.00056, "grad_norm": 0.6583, "tokens_per_sec": 151624, "dt_s": 4.322, "eta_s": 19557, "world_size": 1, "timestamp": "2026-05-05T02:17:47.876793"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47710, "epoch": 0, "train_loss": 3.7263020128011703, "train_ppl": 41.525263987796855, "lr": 0.00056, "grad_norm": 0.631, "tokens_per_sec": 151374, "dt_s": 4.329, "eta_s": 19498, "world_size": 1, "timestamp": "2026-05-05T02:17:52.206217"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47720, "epoch": 0, "train_loss": 3.689568445086479, "train_ppl": 40.02756913525376, "lr": 0.00056, "grad_norm": 0.6508, "tokens_per_sec": 146684, "dt_s": 4.468, "eta_s": 19620, "world_size": 1, "timestamp": "2026-05-05T02:17:56.674047"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47730, "epoch": 0, "train_loss": 3.7561124116182327, "train_ppl": 42.78178430390835, "lr": 0.00056, "grad_norm": 0.6601, "tokens_per_sec": 150721, "dt_s": 4.348, "eta_s": 19652, "world_size": 1, "timestamp": "2026-05-05T02:18:01.022238"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47740, "epoch": 0, "train_loss": 3.748147740960121, "train_ppl": 42.44239483854294, "lr": 0.00056, "grad_norm": 0.6577, "tokens_per_sec": 149066, "dt_s": 4.396, "eta_s": 19650, "world_size": 1, "timestamp": "2026-05-05T02:18:05.418683"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47750, "epoch": 0, "train_loss": 3.899866208434105, "train_ppl": 49.39583991664202, "lr": 0.00056, "grad_norm": 0.6902, "tokens_per_sec": 150272, "dt_s": 4.361, "eta_s": 19681, "world_size": 1, "timestamp": "2026-05-05T02:18:09.779834"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47760, "epoch": 0, "train_loss": 3.6367121040821075, "train_ppl": 37.96680039678135, "lr": 0.00056, "grad_norm": 0.6654, "tokens_per_sec": 152478, "dt_s": 4.298, "eta_s": 19649, "world_size": 1, "timestamp": "2026-05-05T02:18:14.077898"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47770, "epoch": 0, "train_loss": 3.787458509206772, "train_ppl": 44.14406585105953, "lr": 0.00056, "grad_norm": 0.6359, "tokens_per_sec": 150832, "dt_s": 4.345, "eta_s": 19534, "world_size": 1, "timestamp": "2026-05-05T02:18:18.422874"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47780, "epoch": 0, "train_loss": 3.707700625061989, "train_ppl": 40.7599762250032, "lr": 0.00056, "grad_norm": 0.6859, "tokens_per_sec": 136028, "dt_s": 4.818, "eta_s": 19951, "world_size": 1, "timestamp": "2026-05-05T02:18:23.240700"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47790, "epoch": 0, "train_loss": 3.757690593600273, "train_ppl": 42.84935505049908, "lr": 0.00056, "grad_norm": 0.8137, "tokens_per_sec": 150951, "dt_s": 4.342, "eta_s": 19897, "world_size": 1, "timestamp": "2026-05-05T02:18:27.582259"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47800, "epoch": 0, "train_loss": 3.6799449175596237, "train_ppl": 39.64421031257893, "lr": 0.00056, "grad_norm": 0.6465, "tokens_per_sec": 148571, "dt_s": 4.411, "eta_s": 19938, "world_size": 1, "timestamp": "2026-05-05T02:18:31.993360"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47810, "epoch": 0, "train_loss": 3.559613913297653, "train_ppl": 35.14962372270444, "lr": 0.00056, "grad_norm": 0.6539, "tokens_per_sec": 150181, "dt_s": 4.364, "eta_s": 19992, "world_size": 1, "timestamp": "2026-05-05T02:18:36.357146"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47820, "epoch": 0, "train_loss": 3.6772177815437317, "train_ppl": 39.53624244725645, "lr": 0.00056, "grad_norm": 0.6637, "tokens_per_sec": 150754, "dt_s": 4.347, "eta_s": 19990, "world_size": 1, "timestamp": "2026-05-05T02:18:40.704366"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47830, "epoch": 0, "train_loss": 3.8174111396074295, "train_ppl": 45.486298084617395, "lr": 0.00056, "grad_norm": 0.7166, "tokens_per_sec": 150458, "dt_s": 4.356, "eta_s": 19571, "world_size": 1, "timestamp": "2026-05-05T02:18:45.060168"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47840, "epoch": 0, "train_loss": 3.798108458518982, "train_ppl": 44.61671026701954, "lr": 0.00056, "grad_norm": 0.7064, "tokens_per_sec": 150663, "dt_s": 4.35, "eta_s": 19574, "world_size": 1, "timestamp": "2026-05-05T02:18:49.409965"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47850, "epoch": 0, "train_loss": 3.734772637486458, "train_ppl": 41.87850287891511, "lr": 0.00056, "grad_norm": 0.6615, "tokens_per_sec": 150852, "dt_s": 4.344, "eta_s": 19510, "world_size": 1, "timestamp": "2026-05-05T02:18:53.754356"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47860, "epoch": 0, "train_loss": 3.8282105177640915, "train_ppl": 45.980183848135546, "lr": 0.00056, "grad_norm": 0.6657, "tokens_per_sec": 148699, "dt_s": 4.407, "eta_s": 19545, "world_size": 1, "timestamp": "2026-05-05T02:18:58.161658"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47870, "epoch": 0, "train_loss": 3.749250590801239, "train_ppl": 42.48922824730912, "lr": 0.00056, "grad_norm": 0.6687, "tokens_per_sec": 151143, "dt_s": 4.336, "eta_s": 19530, "world_size": 1, "timestamp": "2026-05-05T02:19:02.497688"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47880, "epoch": 0, "train_loss": 3.5838968604803085, "train_ppl": 36.01360776404605, "lr": 0.00056, "grad_norm": 0.6361, "tokens_per_sec": 149213, "dt_s": 4.392, "eta_s": 19558, "world_size": 1, "timestamp": "2026-05-05T02:19:06.889797"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47890, "epoch": 0, "train_loss": 3.7639065831899643, "train_ppl": 43.116535731294725, "lr": 0.00056, "grad_norm": 0.6663, "tokens_per_sec": 147548, "dt_s": 4.442, "eta_s": 19636, "world_size": 1, "timestamp": "2026-05-05T02:19:11.331476"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47900, "epoch": 0, "train_loss": 3.8444773703813553, "train_ppl": 46.73425326230289, "lr": 0.00056, "grad_norm": 0.7524, "tokens_per_sec": 147462, "dt_s": 4.444, "eta_s": 19721, "world_size": 1, "timestamp": "2026-05-05T02:19:15.775754"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47910, "epoch": 0, "train_loss": 3.746672138571739, "train_ppl": 42.37981292371752, "lr": 0.00056, "grad_norm": 0.6978, "tokens_per_sec": 143346, "dt_s": 4.572, "eta_s": 19864, "world_size": 1, "timestamp": "2026-05-05T02:19:20.347611"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47920, "epoch": 0, "train_loss": 3.8831620067358017, "train_ppl": 48.5775751005212, "lr": 0.00056, "grad_norm": 0.6945, "tokens_per_sec": 148109, "dt_s": 4.425, "eta_s": 19939, "world_size": 1, "timestamp": "2026-05-05T02:19:24.772461"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47930, "epoch": 0, "train_loss": 3.8863863945007324, "train_ppl": 48.73446083362032, "lr": 0.00056, "grad_norm": 0.9297, "tokens_per_sec": 147160, "dt_s": 4.453, "eta_s": 19990, "world_size": 1, "timestamp": "2026-05-05T02:19:29.225836"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47940, "epoch": 0, "train_loss": 3.765247330069542, "train_ppl": 43.17438286254951, "lr": 0.00056, "grad_norm": 0.6701, "tokens_per_sec": 146617, "dt_s": 4.47, "eta_s": 20011, "world_size": 1, "timestamp": "2026-05-05T02:19:33.695729"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47950, "epoch": 0, "train_loss": 3.676792547106743, "train_ppl": 39.51943384950932, "lr": 0.00056, "grad_norm": 0.6096, "tokens_per_sec": 144050, "dt_s": 4.55, "eta_s": 20100, "world_size": 1, "timestamp": "2026-05-05T02:19:38.245264"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47960, "epoch": 0, "train_loss": 3.748494803905487, "train_ppl": 42.45712757755, "lr": 0.00056, "grad_norm": 0.6919, "tokens_per_sec": 146632, "dt_s": 4.469, "eta_s": 20004, "world_size": 1, "timestamp": "2026-05-05T02:19:42.714678"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47970, "epoch": 0, "train_loss": 3.7692286670207977, "train_ppl": 43.34661726247032, "lr": 0.00056, "grad_norm": 0.6655, "tokens_per_sec": 145764, "dt_s": 4.496, "eta_s": 20063, "world_size": 1, "timestamp": "2026-05-05T02:19:47.210728"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47980, "epoch": 0, "train_loss": 3.72199310362339, "train_ppl": 41.346720337124694, "lr": 0.00056, "grad_norm": 0.651, "tokens_per_sec": 145482, "dt_s": 4.505, "eta_s": 20105, "world_size": 1, "timestamp": "2026-05-05T02:19:51.715484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 47990, "epoch": 0, "train_loss": 3.7601013481616974, "train_ppl": 42.952778943362524, "lr": 0.00056, "grad_norm": 0.6618, "tokens_per_sec": 144350, "dt_s": 4.54, "eta_s": 20163, "world_size": 1, "timestamp": "2026-05-05T02:19:56.255535"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48000, "epoch": 0, "train_loss": 3.5965984910726547, "train_ppl": 36.47395670823318, "lr": 0.00056, "grad_norm": 0.6783, "tokens_per_sec": 144618, "dt_s": 4.532, "eta_s": 20142, "world_size": 1, "timestamp": "2026-05-05T02:20:00.787207"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48010, "epoch": 0, "train_loss": 3.777386248111725, "train_ppl": 43.701667012439614, "lr": 0.00056, "grad_norm": 0.6786, "tokens_per_sec": 104454, "dt_s": 6.274, "eta_s": 20293, "world_size": 1, "timestamp": "2026-05-05T02:20:07.061325"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48020, "epoch": 0, "train_loss": 3.708519399166107, "train_ppl": 40.79336310431162, "lr": 0.00056, "grad_norm": 0.6528, "tokens_per_sec": 143166, "dt_s": 4.578, "eta_s": 20362, "world_size": 1, "timestamp": "2026-05-05T02:20:11.638933"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48030, "epoch": 0, "train_loss": 3.7010559737682343, "train_ppl": 40.49003821140945, "lr": 0.00056, "grad_norm": 0.6705, "tokens_per_sec": 145074, "dt_s": 4.517, "eta_s": 20369, "world_size": 1, "timestamp": "2026-05-05T02:20:16.156404"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48040, "epoch": 0, "train_loss": 3.64590086042881, "train_ppl": 38.31727582557365, "lr": 0.00056, "grad_norm": 0.6422, "tokens_per_sec": 144542, "dt_s": 4.534, "eta_s": 20359, "world_size": 1, "timestamp": "2026-05-05T02:20:20.690408"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48050, "epoch": 0, "train_loss": 3.7434316724538803, "train_ppl": 42.24270484287888, "lr": 0.00056, "grad_norm": 0.6727, "tokens_per_sec": 148252, "dt_s": 4.421, "eta_s": 20255, "world_size": 1, "timestamp": "2026-05-05T02:20:25.111085"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48060, "epoch": 0, "train_loss": 3.762208864092827, "train_ppl": 43.04339806635329, "lr": 0.00056, "grad_norm": 0.6474, "tokens_per_sec": 146637, "dt_s": 4.469, "eta_s": 20095, "world_size": 1, "timestamp": "2026-05-05T02:20:29.580308"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48070, "epoch": 0, "train_loss": 3.680686205625534, "train_ppl": 39.67360898766334, "lr": 0.00056, "grad_norm": 0.68, "tokens_per_sec": 145570, "dt_s": 4.502, "eta_s": 20023, "world_size": 1, "timestamp": "2026-05-05T02:20:34.082307"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48080, "epoch": 0, "train_loss": 3.7457034438848495, "train_ppl": 42.33877970164774, "lr": 0.00056, "grad_norm": 0.7929, "tokens_per_sec": 129820, "dt_s": 5.048, "eta_s": 20492, "world_size": 1, "timestamp": "2026-05-05T02:20:39.130546"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48090, "epoch": 0, "train_loss": 3.78460693359375, "train_ppl": 44.01836501731289, "lr": 0.00056, "grad_norm": 0.7158, "tokens_per_sec": 142335, "dt_s": 4.604, "eta_s": 20550, "world_size": 1, "timestamp": "2026-05-05T02:20:43.734859"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48100, "epoch": 0, "train_loss": 3.723640874028206, "train_ppl": 41.41490640129374, "lr": 0.00056, "grad_norm": 0.7002, "tokens_per_sec": 139717, "dt_s": 4.691, "eta_s": 20786, "world_size": 1, "timestamp": "2026-05-05T02:20:48.425505"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48110, "epoch": 0, "train_loss": 3.711247995495796, "train_ppl": 40.904823721514994, "lr": 0.00056, "grad_norm": 0.6427, "tokens_per_sec": 140117, "dt_s": 4.677, "eta_s": 20967, "world_size": 1, "timestamp": "2026-05-05T02:20:53.102734"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48120, "epoch": 0, "train_loss": 3.7134632915258408, "train_ppl": 40.99554046023086, "lr": 0.00056, "grad_norm": 0.6661, "tokens_per_sec": 141536, "dt_s": 4.63, "eta_s": 21077, "world_size": 1, "timestamp": "2026-05-05T02:20:57.733060"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48130, "epoch": 0, "train_loss": 3.7133032083511353, "train_ppl": 40.98897828922582, "lr": 0.00056, "grad_norm": 0.66, "tokens_per_sec": 137937, "dt_s": 4.751, "eta_s": 20807, "world_size": 1, "timestamp": "2026-05-05T02:21:02.484242"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48140, "epoch": 0, "train_loss": 3.878863424062729, "train_ppl": 48.36920853918574, "lr": 0.00056, "grad_norm": 0.736, "tokens_per_sec": 147518, "dt_s": 4.443, "eta_s": 20658, "world_size": 1, "timestamp": "2026-05-05T02:21:06.926773"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48150, "epoch": 0, "train_loss": 3.813205435872078, "train_ppl": 45.29539790703645, "lr": 0.00056, "grad_norm": 0.6662, "tokens_per_sec": 147092, "dt_s": 4.455, "eta_s": 20444, "world_size": 1, "timestamp": "2026-05-05T02:21:11.382261"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48160, "epoch": 0, "train_loss": 3.695789724588394, "train_ppl": 40.27736805939868, "lr": 0.00056, "grad_norm": 0.6992, "tokens_per_sec": 140118, "dt_s": 4.677, "eta_s": 20440, "world_size": 1, "timestamp": "2026-05-05T02:21:16.059427"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48170, "epoch": 0, "train_loss": 3.70084410905838, "train_ppl": 40.48146070987883, "lr": 0.00056, "grad_norm": 0.7073, "tokens_per_sec": 139603, "dt_s": 4.694, "eta_s": 20492, "world_size": 1, "timestamp": "2026-05-05T02:21:20.753917"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48180, "epoch": 0, "train_loss": 3.798849567770958, "train_ppl": 44.64978837952478, "lr": 0.00056, "grad_norm": 1.0854, "tokens_per_sec": 138521, "dt_s": 4.731, "eta_s": 20470, "world_size": 1, "timestamp": "2026-05-05T02:21:25.485025"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48190, "epoch": 0, "train_loss": 3.7006858587265015, "train_ppl": 40.47505501215182, "lr": 0.00056, "grad_norm": 0.6783, "tokens_per_sec": 140071, "dt_s": 4.679, "eta_s": 20675, "world_size": 1, "timestamp": "2026-05-05T02:21:30.163835"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48200, "epoch": 0, "train_loss": 3.7853397578001022, "train_ppl": 44.050634563229664, "lr": 0.00056, "grad_norm": 1.0945, "tokens_per_sec": 140147, "dt_s": 4.676, "eta_s": 20867, "world_size": 1, "timestamp": "2026-05-05T02:21:34.840063"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48210, "epoch": 0, "train_loss": 3.6920403093099594, "train_ppl": 40.12663423865295, "lr": 0.00056, "grad_norm": 0.6643, "tokens_per_sec": 141300, "dt_s": 4.638, "eta_s": 20828, "world_size": 1, "timestamp": "2026-05-05T02:21:39.478136"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48220, "epoch": 0, "train_loss": 3.8467782139778137, "train_ppl": 46.841905267378586, "lr": 0.00056, "grad_norm": 0.6936, "tokens_per_sec": 140735, "dt_s": 4.657, "eta_s": 20789, "world_size": 1, "timestamp": "2026-05-05T02:21:44.134839"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48230, "epoch": 0, "train_loss": 3.7888759821653366, "train_ppl": 44.20668323942749, "lr": 0.00056, "grad_norm": 0.7007, "tokens_per_sec": 140003, "dt_s": 4.681, "eta_s": 20740, "world_size": 1, "timestamp": "2026-05-05T02:21:48.815890"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48240, "epoch": 0, "train_loss": 3.7879183888435364, "train_ppl": 44.16437147674244, "lr": 0.00056, "grad_norm": 0.6841, "tokens_per_sec": 139938, "dt_s": 4.683, "eta_s": 20739, "world_size": 1, "timestamp": "2026-05-05T02:21:53.499083"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48250, "epoch": 0, "train_loss": 3.6565875113010406, "train_ppl": 38.72895499250011, "lr": 0.00056, "grad_norm": 0.6467, "tokens_per_sec": 138608, "dt_s": 4.728, "eta_s": 20781, "world_size": 1, "timestamp": "2026-05-05T02:21:58.227247"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48260, "epoch": 0, "train_loss": 3.8646311461925507, "train_ppl": 47.68568014371496, "lr": 0.00056, "grad_norm": 0.7495, "tokens_per_sec": 141643, "dt_s": 4.627, "eta_s": 20766, "world_size": 1, "timestamp": "2026-05-05T02:22:02.854055"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48270, "epoch": 0, "train_loss": 3.721936345100403, "train_ppl": 41.34437362494658, "lr": 0.00056, "grad_norm": 0.6455, "tokens_per_sec": 137904, "dt_s": 4.752, "eta_s": 20846, "world_size": 1, "timestamp": "2026-05-05T02:22:07.606363"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48280, "epoch": 0, "train_loss": 3.7197875380516052, "train_ppl": 41.25562792631221, "lr": 0.00056, "grad_norm": 0.6531, "tokens_per_sec": 140082, "dt_s": 4.678, "eta_s": 20839, "world_size": 1, "timestamp": "2026-05-05T02:22:12.284720"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48290, "epoch": 0, "train_loss": 3.79447665810585, "train_ppl": 44.454965170976955, "lr": 0.00056, "grad_norm": 0.6653, "tokens_per_sec": 141031, "dt_s": 4.647, "eta_s": 20802, "world_size": 1, "timestamp": "2026-05-05T02:22:16.931685"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48300, "epoch": 0, "train_loss": 3.8725516945123672, "train_ppl": 48.064876617043225, "lr": 0.00056, "grad_norm": 0.6392, "tokens_per_sec": 138231, "dt_s": 4.741, "eta_s": 20809, "world_size": 1, "timestamp": "2026-05-05T02:22:21.672711"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48310, "epoch": 0, "train_loss": 3.695993483066559, "train_ppl": 40.2855757507839, "lr": 0.00056, "grad_norm": 0.6864, "tokens_per_sec": 140277, "dt_s": 4.672, "eta_s": 20845, "world_size": 1, "timestamp": "2026-05-05T02:22:26.344620"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48320, "epoch": 0, "train_loss": 3.7336782813072205, "train_ppl": 41.83269794853805, "lr": 0.00056, "grad_norm": 0.6814, "tokens_per_sec": 140009, "dt_s": 4.681, "eta_s": 20776, "world_size": 1, "timestamp": "2026-05-05T02:22:31.025440"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48330, "epoch": 0, "train_loss": 3.807792440056801, "train_ppl": 45.05087650143713, "lr": 0.00056, "grad_norm": 0.6739, "tokens_per_sec": 138813, "dt_s": 4.721, "eta_s": 20810, "world_size": 1, "timestamp": "2026-05-05T02:22:35.746637"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48340, "epoch": 0, "train_loss": 3.7477279901504517, "train_ppl": 42.42458334740058, "lr": 0.00056, "grad_norm": 0.6994, "tokens_per_sec": 140196, "dt_s": 4.675, "eta_s": 20830, "world_size": 1, "timestamp": "2026-05-05T02:22:40.421255"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48350, "epoch": 0, "train_loss": 3.8204459995031357, "train_ppl": 45.624552311468356, "lr": 0.00056, "grad_norm": 0.6632, "tokens_per_sec": 138654, "dt_s": 4.727, "eta_s": 20812, "world_size": 1, "timestamp": "2026-05-05T02:22:45.147809"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48360, "epoch": 0, "train_loss": 3.7925294637680054, "train_ppl": 44.3684869367949, "lr": 0.00056, "grad_norm": 0.7175, "tokens_per_sec": 138921, "dt_s": 4.717, "eta_s": 20848, "world_size": 1, "timestamp": "2026-05-05T02:22:49.865326"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48370, "epoch": 0, "train_loss": 3.787385955452919, "train_ppl": 44.14086314955712, "lr": 0.00056, "grad_norm": 0.6446, "tokens_per_sec": 140221, "dt_s": 4.674, "eta_s": 20837, "world_size": 1, "timestamp": "2026-05-05T02:22:54.539087"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48380, "epoch": 0, "train_loss": 3.8289978206157684, "train_ppl": 46.01639843205179, "lr": 0.00056, "grad_norm": 0.6684, "tokens_per_sec": 124919, "dt_s": 5.246, "eta_s": 21297, "world_size": 1, "timestamp": "2026-05-05T02:22:59.785397"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48390, "epoch": 0, "train_loss": 3.913698449730873, "train_ppl": 50.08384243218608, "lr": 0.00056, "grad_norm": 0.8653, "tokens_per_sec": 140874, "dt_s": 4.652, "eta_s": 21273, "world_size": 1, "timestamp": "2026-05-05T02:23:04.437466"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48400, "epoch": 0, "train_loss": 3.6219970881938934, "train_ppl": 37.41220874538524, "lr": 0.00056, "grad_norm": 0.9074, "tokens_per_sec": 139341, "dt_s": 4.703, "eta_s": 21247, "world_size": 1, "timestamp": "2026-05-05T02:23:09.140739"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48410, "epoch": 0, "train_loss": 3.800285264849663, "train_ppl": 44.713937988950576, "lr": 0.00056, "grad_norm": 0.6927, "tokens_per_sec": 141176, "dt_s": 4.642, "eta_s": 21176, "world_size": 1, "timestamp": "2026-05-05T02:23:13.782891"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48420, "epoch": 0, "train_loss": 3.694280982017517, "train_ppl": 40.21664569828874, "lr": 0.00056, "grad_norm": 0.6717, "tokens_per_sec": 140592, "dt_s": 4.661, "eta_s": 21160, "world_size": 1, "timestamp": "2026-05-05T02:23:18.444314"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48430, "epoch": 0, "train_loss": 3.7423146963119507, "train_ppl": 42.19554709134347, "lr": 0.00056, "grad_norm": 0.647, "tokens_per_sec": 138894, "dt_s": 4.718, "eta_s": 20688, "world_size": 1, "timestamp": "2026-05-05T02:23:23.162727"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48440, "epoch": 0, "train_loss": 3.7961380183696747, "train_ppl": 44.52888226810405, "lr": 0.00056, "grad_norm": 0.6698, "tokens_per_sec": 137948, "dt_s": 4.751, "eta_s": 20771, "world_size": 1, "timestamp": "2026-05-05T02:23:27.913487"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48450, "epoch": 0, "train_loss": 3.81168170273304, "train_ppl": 45.22643236408468, "lr": 0.00056, "grad_norm": 0.6471, "tokens_per_sec": 139527, "dt_s": 4.697, "eta_s": 20760, "world_size": 1, "timestamp": "2026-05-05T02:23:32.610477"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48460, "epoch": 0, "train_loss": 3.7024193704128265, "train_ppl": 40.54527984321686, "lr": 0.00056, "grad_norm": 0.6488, "tokens_per_sec": 139425, "dt_s": 4.7, "eta_s": 20807, "world_size": 1, "timestamp": "2026-05-05T02:23:37.310956"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48470, "epoch": 0, "train_loss": 3.7657144367694855, "train_ppl": 43.19455461686455, "lr": 0.00056, "grad_norm": 0.6688, "tokens_per_sec": 141030, "dt_s": 4.647, "eta_s": 20790, "world_size": 1, "timestamp": "2026-05-05T02:23:41.957897"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48480, "epoch": 0, "train_loss": 3.743729665875435, "train_ppl": 42.25529476679482, "lr": 0.00056, "grad_norm": 0.6574, "tokens_per_sec": 140190, "dt_s": 4.675, "eta_s": 20746, "world_size": 1, "timestamp": "2026-05-05T02:23:46.632680"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48490, "epoch": 0, "train_loss": 3.7223610281944275, "train_ppl": 41.361935610333326, "lr": 0.00056, "grad_norm": 0.7003, "tokens_per_sec": 139291, "dt_s": 4.705, "eta_s": 20701, "world_size": 1, "timestamp": "2026-05-05T02:23:51.337661"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48500, "epoch": 0, "train_loss": 3.8063813149929047, "train_ppl": 44.98734891368286, "lr": 0.00056, "grad_norm": 0.6609, "tokens_per_sec": 140960, "dt_s": 4.649, "eta_s": 20654, "world_size": 1, "timestamp": "2026-05-05T02:23:55.986921"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48510, "epoch": 0, "train_loss": 3.655056193470955, "train_ppl": 38.66969403843755, "lr": 0.00056, "grad_norm": 0.6541, "tokens_per_sec": 118634, "dt_s": 5.524, "eta_s": 20643, "world_size": 1, "timestamp": "2026-05-05T02:24:01.511137"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48520, "epoch": 0, "train_loss": 3.7463718354701996, "train_ppl": 42.367088045209734, "lr": 0.00056, "grad_norm": 0.708, "tokens_per_sec": 138538, "dt_s": 4.731, "eta_s": 20712, "world_size": 1, "timestamp": "2026-05-05T02:24:06.241689"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48530, "epoch": 0, "train_loss": 3.8464929163455963, "train_ppl": 46.8285432888783, "lr": 0.00056, "grad_norm": 0.715, "tokens_per_sec": 137501, "dt_s": 4.766, "eta_s": 20788, "world_size": 1, "timestamp": "2026-05-05T02:24:11.007934"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48540, "epoch": 0, "train_loss": 3.6904318779706955, "train_ppl": 40.06214517961915, "lr": 0.00056, "grad_norm": 0.647, "tokens_per_sec": 141403, "dt_s": 4.635, "eta_s": 20722, "world_size": 1, "timestamp": "2026-05-05T02:24:15.642590"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48550, "epoch": 0, "train_loss": 3.7589938044548035, "train_ppl": 42.90523319770607, "lr": 0.00056, "grad_norm": 0.7167, "tokens_per_sec": 139601, "dt_s": 4.695, "eta_s": 20757, "world_size": 1, "timestamp": "2026-05-05T02:24:20.337115"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48560, "epoch": 0, "train_loss": 3.668158784508705, "train_ppl": 39.179701135798595, "lr": 0.00056, "grad_norm": 0.6199, "tokens_per_sec": 139802, "dt_s": 4.688, "eta_s": 20748, "world_size": 1, "timestamp": "2026-05-05T02:24:25.024916"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48570, "epoch": 0, "train_loss": 3.7820358276367188, "train_ppl": 43.90533450577535, "lr": 0.00056, "grad_norm": 0.6663, "tokens_per_sec": 141713, "dt_s": 4.625, "eta_s": 20649, "world_size": 1, "timestamp": "2026-05-05T02:24:29.649461"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48580, "epoch": 0, "train_loss": 3.697927564382553, "train_ppl": 40.36356672628028, "lr": 0.00056, "grad_norm": 0.6483, "tokens_per_sec": 138236, "dt_s": 4.741, "eta_s": 20622, "world_size": 1, "timestamp": "2026-05-05T02:24:34.390403"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48590, "epoch": 0, "train_loss": 3.687970355153084, "train_ppl": 39.96365256577655, "lr": 0.00056, "grad_norm": 0.6904, "tokens_per_sec": 139482, "dt_s": 4.699, "eta_s": 20674, "world_size": 1, "timestamp": "2026-05-05T02:24:39.088923"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48600, "epoch": 0, "train_loss": 3.7896072417497635, "train_ppl": 44.23902162267734, "lr": 0.00056, "grad_norm": 0.6619, "tokens_per_sec": 140616, "dt_s": 4.661, "eta_s": 20639, "world_size": 1, "timestamp": "2026-05-05T02:24:43.749517"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48610, "epoch": 0, "train_loss": 3.735134705901146, "train_ppl": 41.89366850739384, "lr": 0.00056, "grad_norm": 0.7165, "tokens_per_sec": 140021, "dt_s": 4.68, "eta_s": 20628, "world_size": 1, "timestamp": "2026-05-05T02:24:48.429977"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48620, "epoch": 0, "train_loss": 3.8010130375623703, "train_ppl": 44.74649141720819, "lr": 0.00056, "grad_norm": 0.6732, "tokens_per_sec": 140866, "dt_s": 4.652, "eta_s": 20648, "world_size": 1, "timestamp": "2026-05-05T02:24:53.082349"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48630, "epoch": 0, "train_loss": 3.640576422214508, "train_ppl": 38.11380003574081, "lr": 0.00056, "grad_norm": 0.6666, "tokens_per_sec": 139968, "dt_s": 4.682, "eta_s": 20592, "world_size": 1, "timestamp": "2026-05-05T02:24:57.764537"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48640, "epoch": 0, "train_loss": 3.799442231655121, "train_ppl": 44.676258539712016, "lr": 0.00056, "grad_norm": 0.6687, "tokens_per_sec": 138605, "dt_s": 4.728, "eta_s": 20613, "world_size": 1, "timestamp": "2026-05-05T02:25:02.492788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48650, "epoch": 0, "train_loss": 3.7040902078151703, "train_ppl": 40.61308103988169, "lr": 0.00056, "grad_norm": 0.6623, "tokens_per_sec": 142001, "dt_s": 4.615, "eta_s": 20569, "world_size": 1, "timestamp": "2026-05-05T02:25:07.107984"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48660, "epoch": 0, "train_loss": 3.7446377128362656, "train_ppl": 42.293681984847474, "lr": 0.00056, "grad_norm": 0.6327, "tokens_per_sec": 139688, "dt_s": 4.692, "eta_s": 20574, "world_size": 1, "timestamp": "2026-05-05T02:25:11.799573"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48670, "epoch": 0, "train_loss": 3.787930116057396, "train_ppl": 44.16488940480864, "lr": 0.00056, "grad_norm": 0.6245, "tokens_per_sec": 124892, "dt_s": 5.247, "eta_s": 21093, "world_size": 1, "timestamp": "2026-05-05T02:25:17.046982"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48680, "epoch": 0, "train_loss": 3.911079555749893, "train_ppl": 49.952849761523375, "lr": 0.00056, "grad_norm": 0.6953, "tokens_per_sec": 139118, "dt_s": 4.711, "eta_s": 21113, "world_size": 1, "timestamp": "2026-05-05T02:25:21.757812"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48690, "epoch": 0, "train_loss": 3.828645199537277, "train_ppl": 46.00017494054904, "lr": 0.00056, "grad_norm": 0.6878, "tokens_per_sec": 139571, "dt_s": 4.696, "eta_s": 21080, "world_size": 1, "timestamp": "2026-05-05T02:25:26.453380"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48700, "epoch": 0, "train_loss": 3.7665890604257584, "train_ppl": 43.23235012216679, "lr": 0.00056, "grad_norm": 0.7301, "tokens_per_sec": 140387, "dt_s": 4.668, "eta_s": 21122, "world_size": 1, "timestamp": "2026-05-05T02:25:31.121601"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48710, "epoch": 0, "train_loss": 3.668211579322815, "train_ppl": 39.18176967544054, "lr": 0.00056, "grad_norm": 0.6585, "tokens_per_sec": 139687, "dt_s": 4.692, "eta_s": 21117, "world_size": 1, "timestamp": "2026-05-05T02:25:35.813270"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48720, "epoch": 0, "train_loss": 3.849966898560524, "train_ppl": 46.99150771928592, "lr": 0.00056, "grad_norm": 0.7148, "tokens_per_sec": 141085, "dt_s": 4.645, "eta_s": 20582, "world_size": 1, "timestamp": "2026-05-05T02:25:40.458405"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48730, "epoch": 0, "train_loss": 3.725785955786705, "train_ppl": 41.503840112484454, "lr": 0.00056, "grad_norm": 0.6489, "tokens_per_sec": 140833, "dt_s": 4.653, "eta_s": 20527, "world_size": 1, "timestamp": "2026-05-05T02:25:45.111835"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48740, "epoch": 0, "train_loss": 3.7635154128074646, "train_ppl": 43.099673117813154, "lr": 0.00056, "grad_norm": 0.7388, "tokens_per_sec": 139883, "dt_s": 4.685, "eta_s": 20513, "world_size": 1, "timestamp": "2026-05-05T02:25:49.796901"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48750, "epoch": 0, "train_loss": 3.8251640498638153, "train_ppl": 45.84031984775992, "lr": 0.00056, "grad_norm": 0.7546, "tokens_per_sec": 142037, "dt_s": 4.614, "eta_s": 20461, "world_size": 1, "timestamp": "2026-05-05T02:25:54.410957"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48760, "epoch": 0, "train_loss": 3.792717531323433, "train_ppl": 44.37683199436381, "lr": 0.00056, "grad_norm": 0.6412, "tokens_per_sec": 139680, "dt_s": 4.692, "eta_s": 20456, "world_size": 1, "timestamp": "2026-05-05T02:25:59.102820"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48770, "epoch": 0, "train_loss": 3.8742029517889023, "train_ppl": 48.14430965844784, "lr": 0.00056, "grad_norm": 0.664, "tokens_per_sec": 139924, "dt_s": 4.684, "eta_s": 20486, "world_size": 1, "timestamp": "2026-05-05T02:26:03.786492"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48780, "epoch": 0, "train_loss": 3.839825451374054, "train_ppl": 46.51735419086402, "lr": 0.00056, "grad_norm": 0.6197, "tokens_per_sec": 140376, "dt_s": 4.669, "eta_s": 20494, "world_size": 1, "timestamp": "2026-05-05T02:26:08.455090"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48790, "epoch": 0, "train_loss": 3.756426125764847, "train_ppl": 42.79520766030019, "lr": 0.00056, "grad_norm": 0.6826, "tokens_per_sec": 139855, "dt_s": 4.686, "eta_s": 20490, "world_size": 1, "timestamp": "2026-05-05T02:26:13.141089"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48800, "epoch": 0, "train_loss": 3.8288818150758743, "train_ppl": 46.01106058452363, "lr": 0.00056, "grad_norm": 0.6741, "tokens_per_sec": 140437, "dt_s": 4.667, "eta_s": 20532, "world_size": 1, "timestamp": "2026-05-05T02:26:17.807700"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48810, "epoch": 0, "train_loss": 3.768329307436943, "train_ppl": 43.307650591958435, "lr": 0.00056, "grad_norm": 0.6236, "tokens_per_sec": 140053, "dt_s": 4.679, "eta_s": 20516, "world_size": 1, "timestamp": "2026-05-05T02:26:22.487082"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48820, "epoch": 0, "train_loss": 3.7630126625299454, "train_ppl": 43.07801019116966, "lr": 0.00056, "grad_norm": 0.6824, "tokens_per_sec": 140760, "dt_s": 4.656, "eta_s": 20487, "world_size": 1, "timestamp": "2026-05-05T02:26:27.142950"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48830, "epoch": 0, "train_loss": 3.7183596342802048, "train_ppl": 41.19676089783277, "lr": 0.00056, "grad_norm": 0.6501, "tokens_per_sec": 139293, "dt_s": 4.705, "eta_s": 20514, "world_size": 1, "timestamp": "2026-05-05T02:26:31.847835"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48840, "epoch": 0, "train_loss": 3.6839827448129654, "train_ppl": 39.80461040141048, "lr": 0.00056, "grad_norm": 0.6595, "tokens_per_sec": 139008, "dt_s": 4.715, "eta_s": 20535, "world_size": 1, "timestamp": "2026-05-05T02:26:36.562385"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48850, "epoch": 0, "train_loss": 3.7214248180389404, "train_ppl": 41.323230267159076, "lr": 0.00056, "grad_norm": 0.6337, "tokens_per_sec": 142464, "dt_s": 4.6, "eta_s": 20472, "world_size": 1, "timestamp": "2026-05-05T02:26:41.162540"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48860, "epoch": 0, "train_loss": 3.7860643416643143, "train_ppl": 44.08256450880716, "lr": 0.00056, "grad_norm": 0.6517, "tokens_per_sec": 139049, "dt_s": 4.713, "eta_s": 20497, "world_size": 1, "timestamp": "2026-05-05T02:26:45.875697"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48870, "epoch": 0, "train_loss": 3.8137714713811874, "train_ppl": 45.321043968257, "lr": 0.00056, "grad_norm": 0.6869, "tokens_per_sec": 139208, "dt_s": 4.708, "eta_s": 20538, "world_size": 1, "timestamp": "2026-05-05T02:26:50.583481"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48880, "epoch": 0, "train_loss": 3.755162850022316, "train_ppl": 42.74117964589149, "lr": 0.00056, "grad_norm": 0.6569, "tokens_per_sec": 141298, "dt_s": 4.638, "eta_s": 20474, "world_size": 1, "timestamp": "2026-05-05T02:26:55.221618"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48890, "epoch": 0, "train_loss": 3.6679724901914597, "train_ppl": 39.172402859960386, "lr": 0.00056, "grad_norm": 0.6723, "tokens_per_sec": 139583, "dt_s": 4.695, "eta_s": 20453, "world_size": 1, "timestamp": "2026-05-05T02:26:59.916743"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48900, "epoch": 0, "train_loss": 3.7749160081148148, "train_ppl": 43.593846632588665, "lr": 0.00056, "grad_norm": 0.7193, "tokens_per_sec": 141410, "dt_s": 4.634, "eta_s": 20478, "world_size": 1, "timestamp": "2026-05-05T02:27:04.551211"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48910, "epoch": 0, "train_loss": 3.799231931567192, "train_ppl": 44.666864106472474, "lr": 0.00056, "grad_norm": 0.6387, "tokens_per_sec": 141438, "dt_s": 4.634, "eta_s": 20404, "world_size": 1, "timestamp": "2026-05-05T02:27:09.184746"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48920, "epoch": 0, "train_loss": 3.8038409799337387, "train_ppl": 44.87321100965861, "lr": 0.00056, "grad_norm": 0.6773, "tokens_per_sec": 137978, "dt_s": 4.75, "eta_s": 20436, "world_size": 1, "timestamp": "2026-05-05T02:27:13.934481"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48930, "epoch": 0, "train_loss": 3.715585008263588, "train_ppl": 41.082613724334585, "lr": 0.00056, "grad_norm": 0.6026, "tokens_per_sec": 141402, "dt_s": 4.635, "eta_s": 20428, "world_size": 1, "timestamp": "2026-05-05T02:27:18.569227"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48940, "epoch": 0, "train_loss": 3.673910215497017, "train_ppl": 39.405689738996244, "lr": 0.00056, "grad_norm": 0.6874, "tokens_per_sec": 140621, "dt_s": 4.66, "eta_s": 20393, "world_size": 1, "timestamp": "2026-05-05T02:27:23.229695"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48950, "epoch": 0, "train_loss": 3.7655156403779984, "train_ppl": 43.185968548742785, "lr": 0.00056, "grad_norm": 0.6825, "tokens_per_sec": 138555, "dt_s": 4.73, "eta_s": 20472, "world_size": 1, "timestamp": "2026-05-05T02:27:27.959656"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48960, "epoch": 0, "train_loss": 3.859356716275215, "train_ppl": 47.43482749963349, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 142086, "dt_s": 4.612, "eta_s": 20449, "world_size": 1, "timestamp": "2026-05-05T02:27:32.572082"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48970, "epoch": 0, "train_loss": 3.712843731045723, "train_ppl": 40.97014911005141, "lr": 0.00056, "grad_norm": 0.678, "tokens_per_sec": 125423, "dt_s": 5.225, "eta_s": 20860, "world_size": 1, "timestamp": "2026-05-05T02:27:37.797313"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48980, "epoch": 0, "train_loss": 3.6810272485017776, "train_ppl": 39.687141696869325, "lr": 0.00056, "grad_norm": 0.6244, "tokens_per_sec": 140170, "dt_s": 4.675, "eta_s": 20891, "world_size": 1, "timestamp": "2026-05-05T02:27:42.472792"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 48990, "epoch": 0, "train_loss": 3.819053053855896, "train_ppl": 45.56104403197833, "lr": 0.00056, "grad_norm": 0.6865, "tokens_per_sec": 140597, "dt_s": 4.661, "eta_s": 20887, "world_size": 1, "timestamp": "2026-05-05T02:27:47.134061"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49000, "epoch": 0, "train_loss": 3.815656006336212, "train_ppl": 45.40653358860114, "lr": 0.00056, "grad_norm": 0.6131, "tokens_per_sec": 140726, "dt_s": 4.657, "eta_s": 20818, "world_size": 1, "timestamp": "2026-05-05T02:27:51.791066"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49010, "epoch": 0, "train_loss": 3.7541440576314926, "train_ppl": 42.69765743110852, "lr": 0.00056, "grad_norm": 0.6388, "tokens_per_sec": 119861, "dt_s": 5.468, "eta_s": 20865, "world_size": 1, "timestamp": "2026-05-05T02:27:57.258714"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49020, "epoch": 0, "train_loss": 3.7904154509305954, "train_ppl": 44.27479045850754, "lr": 0.00056, "grad_norm": 0.6645, "tokens_per_sec": 140106, "dt_s": 4.678, "eta_s": 20382, "world_size": 1, "timestamp": "2026-05-05T02:28:01.936340"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49030, "epoch": 0, "train_loss": 3.7252391576766968, "train_ppl": 41.4811520946008, "lr": 0.00056, "grad_norm": 0.6168, "tokens_per_sec": 140524, "dt_s": 4.664, "eta_s": 20367, "world_size": 1, "timestamp": "2026-05-05T02:28:06.600019"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49040, "epoch": 0, "train_loss": 3.7539075911045074, "train_ppl": 42.68756205800133, "lr": 0.00056, "grad_norm": 0.7531, "tokens_per_sec": 140260, "dt_s": 4.672, "eta_s": 20372, "world_size": 1, "timestamp": "2026-05-05T02:28:11.272456"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49050, "epoch": 0, "train_loss": 3.730897754430771, "train_ppl": 41.71654256899136, "lr": 0.00056, "grad_norm": 0.6779, "tokens_per_sec": 140115, "dt_s": 4.677, "eta_s": 20386, "world_size": 1, "timestamp": "2026-05-05T02:28:15.949780"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49060, "epoch": 0, "train_loss": 3.7596246749162674, "train_ppl": 42.932309381857095, "lr": 0.00056, "grad_norm": 0.6918, "tokens_per_sec": 141716, "dt_s": 4.624, "eta_s": 20340, "world_size": 1, "timestamp": "2026-05-05T02:28:20.574232"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49070, "epoch": 0, "train_loss": 3.5955813080072403, "train_ppl": 36.43687487984211, "lr": 0.00056, "grad_norm": 0.677, "tokens_per_sec": 139184, "dt_s": 4.709, "eta_s": 20362, "world_size": 1, "timestamp": "2026-05-05T02:28:25.282798"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49080, "epoch": 0, "train_loss": 3.7317660599946976, "train_ppl": 41.752781005753086, "lr": 0.00056, "grad_norm": 0.6579, "tokens_per_sec": 140576, "dt_s": 4.662, "eta_s": 20356, "world_size": 1, "timestamp": "2026-05-05T02:28:29.944798"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49090, "epoch": 0, "train_loss": 3.7684076875448227, "train_ppl": 43.311045183316324, "lr": 0.00056, "grad_norm": 0.6918, "tokens_per_sec": 142690, "dt_s": 4.593, "eta_s": 20282, "world_size": 1, "timestamp": "2026-05-05T02:28:34.537656"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49100, "epoch": 0, "train_loss": 3.672346353530884, "train_ppl": 39.34411284099828, "lr": 0.00056, "grad_norm": 0.6087, "tokens_per_sec": 140612, "dt_s": 4.661, "eta_s": 20263, "world_size": 1, "timestamp": "2026-05-05T02:28:39.198410"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49110, "epoch": 0, "train_loss": 3.7040975093841553, "train_ppl": 40.6133775801772, "lr": 0.00056, "grad_norm": 0.7047, "tokens_per_sec": 140747, "dt_s": 4.656, "eta_s": 20286, "world_size": 1, "timestamp": "2026-05-05T02:28:43.854732"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49120, "epoch": 0, "train_loss": 3.7344648241996765, "train_ppl": 41.86561410306846, "lr": 0.00056, "grad_norm": 0.6753, "tokens_per_sec": 141463, "dt_s": 4.633, "eta_s": 20215, "world_size": 1, "timestamp": "2026-05-05T02:28:48.487457"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49130, "epoch": 0, "train_loss": 3.7264881283044815, "train_ppl": 41.532993202445105, "lr": 0.00056, "grad_norm": 0.6493, "tokens_per_sec": 140596, "dt_s": 4.661, "eta_s": 20210, "world_size": 1, "timestamp": "2026-05-05T02:28:53.148739"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49140, "epoch": 0, "train_loss": 3.7508625388145447, "train_ppl": 42.55777390554579, "lr": 0.00056, "grad_norm": 0.6454, "tokens_per_sec": 141224, "dt_s": 4.641, "eta_s": 20247, "world_size": 1, "timestamp": "2026-05-05T02:28:57.789324"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49150, "epoch": 0, "train_loss": 3.7175842970609665, "train_ppl": 41.16483189526815, "lr": 0.00056, "grad_norm": 0.5968, "tokens_per_sec": 139166, "dt_s": 4.709, "eta_s": 20284, "world_size": 1, "timestamp": "2026-05-05T02:29:02.498518"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49160, "epoch": 0, "train_loss": 3.7336267679929733, "train_ppl": 41.83054306312594, "lr": 0.00056, "grad_norm": 0.7099, "tokens_per_sec": 139604, "dt_s": 4.694, "eta_s": 20313, "world_size": 1, "timestamp": "2026-05-05T02:29:07.192941"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49170, "epoch": 0, "train_loss": 3.847052678465843, "train_ppl": 46.8547634714056, "lr": 0.00056, "grad_norm": 0.6844, "tokens_per_sec": 147315, "dt_s": 4.449, "eta_s": 20148, "world_size": 1, "timestamp": "2026-05-05T02:29:11.641649"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49180, "epoch": 0, "train_loss": 3.841211289167404, "train_ppl": 46.58186438835868, "lr": 0.00056, "grad_norm": 0.7202, "tokens_per_sec": 150249, "dt_s": 4.362, "eta_s": 19883, "world_size": 1, "timestamp": "2026-05-05T02:29:16.003463"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49190, "epoch": 0, "train_loss": 3.6588730812072754, "train_ppl": 38.81757396037641, "lr": 0.00056, "grad_norm": 0.676, "tokens_per_sec": 149906, "dt_s": 4.372, "eta_s": 19644, "world_size": 1, "timestamp": "2026-05-05T02:29:20.375290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49200, "epoch": 0, "train_loss": 3.7642408907413483, "train_ppl": 43.13095232443292, "lr": 0.00056, "grad_norm": 0.7121, "tokens_per_sec": 148580, "dt_s": 4.411, "eta_s": 19380, "world_size": 1, "timestamp": "2026-05-05T02:29:24.786071"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49210, "epoch": 0, "train_loss": 3.7237167209386826, "train_ppl": 41.418047713119854, "lr": 0.00056, "grad_norm": 0.6589, "tokens_per_sec": 151294, "dt_s": 4.332, "eta_s": 19061, "world_size": 1, "timestamp": "2026-05-05T02:29:29.117791"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49220, "epoch": 0, "train_loss": 3.610216796398163, "train_ppl": 36.974067790704616, "lr": 0.00056, "grad_norm": 0.7201, "tokens_per_sec": 150693, "dt_s": 4.349, "eta_s": 18969, "world_size": 1, "timestamp": "2026-05-05T02:29:33.466749"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49230, "epoch": 0, "train_loss": 3.725553512573242, "train_ppl": 41.49419394765387, "lr": 0.00056, "grad_norm": 0.6587, "tokens_per_sec": 148227, "dt_s": 4.421, "eta_s": 19017, "world_size": 1, "timestamp": "2026-05-05T02:29:37.888096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49240, "epoch": 0, "train_loss": 3.7027731835842133, "train_ppl": 40.55962783536758, "lr": 0.00056, "grad_norm": 0.6343, "tokens_per_sec": 151177, "dt_s": 4.335, "eta_s": 18981, "world_size": 1, "timestamp": "2026-05-05T02:29:42.223124"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49250, "epoch": 0, "train_loss": 3.6909024864435196, "train_ppl": 40.08100320160461, "lr": 0.00056, "grad_norm": 0.6597, "tokens_per_sec": 150249, "dt_s": 4.362, "eta_s": 18934, "world_size": 1, "timestamp": "2026-05-05T02:29:46.584955"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49260, "epoch": 0, "train_loss": 3.7705032378435135, "train_ppl": 43.40190082001705, "lr": 0.00056, "grad_norm": 0.629, "tokens_per_sec": 133542, "dt_s": 4.908, "eta_s": 19429, "world_size": 1, "timestamp": "2026-05-05T02:29:51.492489"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49270, "epoch": 0, "train_loss": 3.9076097905635834, "train_ppl": 49.779825452915574, "lr": 0.00056, "grad_norm": 0.6655, "tokens_per_sec": 153110, "dt_s": 4.28, "eta_s": 19365, "world_size": 1, "timestamp": "2026-05-05T02:29:55.772802"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49280, "epoch": 0, "train_loss": 3.720499500632286, "train_ppl": 41.28501084816751, "lr": 0.00056, "grad_norm": 0.7255, "tokens_per_sec": 149362, "dt_s": 4.388, "eta_s": 19332, "world_size": 1, "timestamp": "2026-05-05T02:30:00.160548"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49290, "epoch": 0, "train_loss": 3.76648411154747, "train_ppl": 43.2278131735937, "lr": 0.00056, "grad_norm": 0.6871, "tokens_per_sec": 148202, "dt_s": 4.422, "eta_s": 19403, "world_size": 1, "timestamp": "2026-05-05T02:30:04.582624"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49300, "epoch": 0, "train_loss": 3.770293802022934, "train_ppl": 43.39281185911432, "lr": 0.00056, "grad_norm": 0.6394, "tokens_per_sec": 149287, "dt_s": 4.39, "eta_s": 19422, "world_size": 1, "timestamp": "2026-05-05T02:30:08.972546"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49310, "epoch": 0, "train_loss": 3.7082522213459015, "train_ppl": 40.78246547834535, "lr": 0.00056, "grad_norm": 0.654, "tokens_per_sec": 146405, "dt_s": 4.476, "eta_s": 19044, "world_size": 1, "timestamp": "2026-05-05T02:30:13.448893"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49320, "epoch": 0, "train_loss": 3.598877429962158, "train_ppl": 36.55717341350667, "lr": 0.00056, "grad_norm": 0.7122, "tokens_per_sec": 150605, "dt_s": 4.352, "eta_s": 19101, "world_size": 1, "timestamp": "2026-05-05T02:30:17.800384"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49330, "epoch": 0, "train_loss": 3.7097411304712296, "train_ppl": 40.84323209010495, "lr": 0.00056, "grad_norm": 0.6563, "tokens_per_sec": 150840, "dt_s": 4.345, "eta_s": 19060, "world_size": 1, "timestamp": "2026-05-05T02:30:22.145153"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49340, "epoch": 0, "train_loss": 3.723170429468155, "train_ppl": 41.39542756608752, "lr": 0.00056, "grad_norm": 0.6585, "tokens_per_sec": 149698, "dt_s": 4.378, "eta_s": 19017, "world_size": 1, "timestamp": "2026-05-05T02:30:26.523013"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49350, "epoch": 0, "train_loss": 3.654303565621376, "train_ppl": 38.64060109922097, "lr": 0.00056, "grad_norm": 0.6413, "tokens_per_sec": 152314, "dt_s": 4.303, "eta_s": 18937, "world_size": 1, "timestamp": "2026-05-05T02:30:30.825707"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49360, "epoch": 0, "train_loss": 3.7776793837547302, "train_ppl": 43.71447940649266, "lr": 0.00056, "grad_norm": 1.0943, "tokens_per_sec": 151409, "dt_s": 4.328, "eta_s": 18804, "world_size": 1, "timestamp": "2026-05-05T02:30:35.154101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49370, "epoch": 0, "train_loss": 3.7818905413150787, "train_ppl": 43.89895612458162, "lr": 0.00056, "grad_norm": 0.7135, "tokens_per_sec": 148423, "dt_s": 4.415, "eta_s": 18856, "world_size": 1, "timestamp": "2026-05-05T02:30:39.569594"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49380, "epoch": 0, "train_loss": 3.6558200865983963, "train_ppl": 38.69924483734288, "lr": 0.00056, "grad_norm": 0.6903, "tokens_per_sec": 149349, "dt_s": 4.388, "eta_s": 18889, "world_size": 1, "timestamp": "2026-05-05T02:30:43.957721"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49390, "epoch": 0, "train_loss": 3.8051878213882446, "train_ppl": 44.9336888283177, "lr": 0.00056, "grad_norm": 0.6396, "tokens_per_sec": 149380, "dt_s": 4.387, "eta_s": 18893, "world_size": 1, "timestamp": "2026-05-05T02:30:48.344904"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49400, "epoch": 0, "train_loss": 3.689663052558899, "train_ppl": 40.03135622153726, "lr": 0.00056, "grad_norm": 0.6549, "tokens_per_sec": 151384, "dt_s": 4.329, "eta_s": 18911, "world_size": 1, "timestamp": "2026-05-05T02:30:52.674019"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49410, "epoch": 0, "train_loss": 3.725613698363304, "train_ppl": 41.49669138365392, "lr": 0.00056, "grad_norm": 0.643, "tokens_per_sec": 151643, "dt_s": 4.322, "eta_s": 18901, "world_size": 1, "timestamp": "2026-05-05T02:30:56.995778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49420, "epoch": 0, "train_loss": 3.7587513625621796, "train_ppl": 42.89483243260768, "lr": 0.00056, "grad_norm": 0.7488, "tokens_per_sec": 149063, "dt_s": 4.397, "eta_s": 18880, "world_size": 1, "timestamp": "2026-05-05T02:31:01.392283"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49430, "epoch": 0, "train_loss": 3.728505164384842, "train_ppl": 41.61685129222523, "lr": 0.00056, "grad_norm": 0.6224, "tokens_per_sec": 152380, "dt_s": 4.301, "eta_s": 18800, "world_size": 1, "timestamp": "2026-05-05T02:31:05.693109"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49440, "epoch": 0, "train_loss": 3.730594679713249, "train_ppl": 41.70390125536432, "lr": 0.00056, "grad_norm": 0.7475, "tokens_per_sec": 153334, "dt_s": 4.274, "eta_s": 18698, "world_size": 1, "timestamp": "2026-05-05T02:31:09.967189"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49450, "epoch": 0, "train_loss": 3.7593552619218826, "train_ppl": 42.92074441777643, "lr": 0.00056, "grad_norm": 0.6724, "tokens_per_sec": 148851, "dt_s": 4.403, "eta_s": 18757, "world_size": 1, "timestamp": "2026-05-05T02:31:14.369964"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49460, "epoch": 0, "train_loss": 3.6612146347761154, "train_ppl": 38.90857388824155, "lr": 0.00056, "grad_norm": 0.6732, "tokens_per_sec": 151978, "dt_s": 4.312, "eta_s": 18745, "world_size": 1, "timestamp": "2026-05-05T02:31:18.682167"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49470, "epoch": 0, "train_loss": 3.75653637945652, "train_ppl": 42.79992625004685, "lr": 0.00056, "grad_norm": 0.7167, "tokens_per_sec": 153172, "dt_s": 4.279, "eta_s": 18639, "world_size": 1, "timestamp": "2026-05-05T02:31:22.960760"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49480, "epoch": 0, "train_loss": 3.714388385415077, "train_ppl": 41.03348273157234, "lr": 0.00056, "grad_norm": 0.6668, "tokens_per_sec": 150537, "dt_s": 4.353, "eta_s": 18680, "world_size": 1, "timestamp": "2026-05-05T02:31:27.314248"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49490, "epoch": 0, "train_loss": 3.643844723701477, "train_ppl": 38.23857120891208, "lr": 0.00056, "grad_norm": 0.6249, "tokens_per_sec": 152436, "dt_s": 4.299, "eta_s": 18697, "world_size": 1, "timestamp": "2026-05-05T02:31:31.613469"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49500, "epoch": 0, "train_loss": 3.790678694844246, "train_ppl": 44.28644706182152, "lr": 0.00056, "grad_norm": 0.6589, "tokens_per_sec": 151757, "dt_s": 4.318, "eta_s": 18620, "world_size": 1, "timestamp": "2026-05-05T02:31:35.931952"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49510, "epoch": 0, "train_loss": 3.6924886107444763, "train_ppl": 40.14462709915571, "lr": 0.00056, "grad_norm": 0.6811, "tokens_per_sec": 128167, "dt_s": 5.113, "eta_s": 18649, "world_size": 1, "timestamp": "2026-05-05T02:31:41.045309"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49520, "epoch": 0, "train_loss": 3.7276214361190796, "train_ppl": 41.580089550493895, "lr": 0.00056, "grad_norm": 0.6545, "tokens_per_sec": 151940, "dt_s": 4.313, "eta_s": 18674, "world_size": 1, "timestamp": "2026-05-05T02:31:45.358585"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49530, "epoch": 0, "train_loss": 3.7280896604061127, "train_ppl": 41.59956291687423, "lr": 0.00056, "grad_norm": 0.6719, "tokens_per_sec": 149263, "dt_s": 4.391, "eta_s": 18702, "world_size": 1, "timestamp": "2026-05-05T02:31:49.749267"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49540, "epoch": 0, "train_loss": 3.7516140937805176, "train_ppl": 42.58977043393409, "lr": 0.00056, "grad_norm": 0.6563, "tokens_per_sec": 151085, "dt_s": 4.338, "eta_s": 18731, "world_size": 1, "timestamp": "2026-05-05T02:31:54.086892"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49550, "epoch": 0, "train_loss": 3.742068201303482, "train_ppl": 42.1851473813968, "lr": 0.00056, "grad_norm": 0.6722, "tokens_per_sec": 148751, "dt_s": 4.406, "eta_s": 18802, "world_size": 1, "timestamp": "2026-05-05T02:31:58.492649"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49560, "epoch": 0, "train_loss": 3.756997600197792, "train_ppl": 42.81967101675624, "lr": 0.00056, "grad_norm": 0.6945, "tokens_per_sec": 132531, "dt_s": 4.945, "eta_s": 19310, "world_size": 1, "timestamp": "2026-05-05T02:32:03.437598"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49570, "epoch": 0, "train_loss": 3.510507993400097, "train_ppl": 33.465263599739465, "lr": 0.00056, "grad_norm": 0.692, "tokens_per_sec": 146873, "dt_s": 4.462, "eta_s": 19434, "world_size": 1, "timestamp": "2026-05-05T02:32:07.899699"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49580, "epoch": 0, "train_loss": 3.6268581598997116, "train_ppl": 37.59451491746567, "lr": 0.00056, "grad_norm": 0.6885, "tokens_per_sec": 151805, "dt_s": 4.317, "eta_s": 19366, "world_size": 1, "timestamp": "2026-05-05T02:32:12.216794"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49590, "epoch": 0, "train_loss": 3.754111498594284, "train_ppl": 42.696267259122955, "lr": 0.00056, "grad_norm": 0.6834, "tokens_per_sec": 147307, "dt_s": 4.449, "eta_s": 19458, "world_size": 1, "timestamp": "2026-05-05T02:32:16.665760"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49600, "epoch": 0, "train_loss": 3.700142353773117, "train_ppl": 40.45306259629906, "lr": 0.00056, "grad_norm": 0.6265, "tokens_per_sec": 149232, "dt_s": 4.392, "eta_s": 19441, "world_size": 1, "timestamp": "2026-05-05T02:32:21.057324"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49610, "epoch": 0, "train_loss": 3.6327409893274307, "train_ppl": 37.816328843182696, "lr": 0.00056, "grad_norm": 0.6218, "tokens_per_sec": 148619, "dt_s": 4.41, "eta_s": 18975, "world_size": 1, "timestamp": "2026-05-05T02:32:25.466962"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49620, "epoch": 0, "train_loss": 3.722140610218048, "train_ppl": 41.352819700879024, "lr": 0.00056, "grad_norm": 0.6691, "tokens_per_sec": 150261, "dt_s": 4.361, "eta_s": 18884, "world_size": 1, "timestamp": "2026-05-05T02:32:29.828446"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49630, "epoch": 0, "train_loss": 3.8563559502363205, "train_ppl": 47.29270003250008, "lr": 0.00056, "grad_norm": 0.6786, "tokens_per_sec": 150285, "dt_s": 4.361, "eta_s": 18917, "world_size": 1, "timestamp": "2026-05-05T02:32:34.189259"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49640, "epoch": 0, "train_loss": 3.725155547261238, "train_ppl": 41.47768398322761, "lr": 0.00056, "grad_norm": 0.6312, "tokens_per_sec": 144877, "dt_s": 4.524, "eta_s": 18977, "world_size": 1, "timestamp": "2026-05-05T02:32:38.712785"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49650, "epoch": 0, "train_loss": 3.6338199079036713, "train_ppl": 37.857151601112726, "lr": 0.00056, "grad_norm": 0.7453, "tokens_per_sec": 146770, "dt_s": 4.465, "eta_s": 19036, "world_size": 1, "timestamp": "2026-05-05T02:32:43.178027"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49660, "epoch": 0, "train_loss": 3.8149742782115936, "train_ppl": 45.37558922663174, "lr": 0.00056, "grad_norm": 0.7078, "tokens_per_sec": 148292, "dt_s": 4.419, "eta_s": 19040, "world_size": 1, "timestamp": "2026-05-05T02:32:47.597405"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49670, "epoch": 0, "train_loss": 3.770007684826851, "train_ppl": 43.3803982054196, "lr": 0.00056, "grad_norm": 0.6619, "tokens_per_sec": 145772, "dt_s": 4.496, "eta_s": 19151, "world_size": 1, "timestamp": "2026-05-05T02:32:52.093215"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49680, "epoch": 0, "train_loss": 3.822681412100792, "train_ppl": 45.72665608998592, "lr": 0.00056, "grad_norm": 0.6985, "tokens_per_sec": 148825, "dt_s": 4.404, "eta_s": 19184, "world_size": 1, "timestamp": "2026-05-05T02:32:56.496738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49690, "epoch": 0, "train_loss": 3.718794569373131, "train_ppl": 41.21468271199255, "lr": 0.00056, "grad_norm": 0.6301, "tokens_per_sec": 147835, "dt_s": 4.433, "eta_s": 19101, "world_size": 1, "timestamp": "2026-05-05T02:33:00.929831"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49700, "epoch": 0, "train_loss": 3.813907876610756, "train_ppl": 45.327226417313376, "lr": 0.00056, "grad_norm": 0.6405, "tokens_per_sec": 147432, "dt_s": 4.445, "eta_s": 19079, "world_size": 1, "timestamp": "2026-05-05T02:33:05.374951"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49710, "epoch": 0, "train_loss": 3.839796468615532, "train_ppl": 46.51600600915753, "lr": 0.00056, "grad_norm": 0.8675, "tokens_per_sec": 149147, "dt_s": 4.394, "eta_s": 19053, "world_size": 1, "timestamp": "2026-05-05T02:33:09.769010"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49720, "epoch": 0, "train_loss": 3.6978702545166016, "train_ppl": 40.36125356196607, "lr": 0.00056, "grad_norm": 0.8497, "tokens_per_sec": 147495, "dt_s": 4.443, "eta_s": 19004, "world_size": 1, "timestamp": "2026-05-05T02:33:14.212323"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49730, "epoch": 0, "train_loss": 3.679002195596695, "train_ppl": 39.60685445567425, "lr": 0.00056, "grad_norm": 0.639, "tokens_per_sec": 150014, "dt_s": 4.369, "eta_s": 18969, "world_size": 1, "timestamp": "2026-05-05T02:33:18.580942"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49740, "epoch": 0, "train_loss": 3.756377726793289, "train_ppl": 42.79313646638407, "lr": 0.00056, "grad_norm": 0.675, "tokens_per_sec": 149934, "dt_s": 4.371, "eta_s": 18912, "world_size": 1, "timestamp": "2026-05-05T02:33:22.951961"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49750, "epoch": 0, "train_loss": 3.79621259868145, "train_ppl": 44.53220336986947, "lr": 0.00056, "grad_norm": 0.6486, "tokens_per_sec": 144545, "dt_s": 4.534, "eta_s": 18984, "world_size": 1, "timestamp": "2026-05-05T02:33:27.485909"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49760, "epoch": 0, "train_loss": 3.6651338636875153, "train_ppl": 39.06136471145482, "lr": 0.00056, "grad_norm": 0.6859, "tokens_per_sec": 148974, "dt_s": 4.399, "eta_s": 18984, "world_size": 1, "timestamp": "2026-05-05T02:33:31.885065"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49770, "epoch": 0, "train_loss": 3.723557010293007, "train_ppl": 41.411433338184075, "lr": 0.00056, "grad_norm": 0.6916, "tokens_per_sec": 150120, "dt_s": 4.366, "eta_s": 18912, "world_size": 1, "timestamp": "2026-05-05T02:33:36.250629"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49780, "epoch": 0, "train_loss": 3.6879623383283615, "train_ppl": 39.96333218546289, "lr": 0.00056, "grad_norm": 0.672, "tokens_per_sec": 146431, "dt_s": 4.476, "eta_s": 19000, "world_size": 1, "timestamp": "2026-05-05T02:33:40.726205"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49790, "epoch": 0, "train_loss": 3.647555649280548, "train_ppl": 38.380735317988005, "lr": 0.00056, "grad_norm": 0.6898, "tokens_per_sec": 149601, "dt_s": 4.381, "eta_s": 19004, "world_size": 1, "timestamp": "2026-05-05T02:33:45.106921"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49800, "epoch": 0, "train_loss": 3.646345466375351, "train_ppl": 38.33431570199544, "lr": 0.00056, "grad_norm": 0.6971, "tokens_per_sec": 147584, "dt_s": 4.441, "eta_s": 18919, "world_size": 1, "timestamp": "2026-05-05T02:33:49.547521"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49810, "epoch": 0, "train_loss": 3.6832780838012695, "train_ppl": 39.77657152448677, "lr": 0.00056, "grad_norm": 0.7022, "tokens_per_sec": 146473, "dt_s": 4.474, "eta_s": 18979, "world_size": 1, "timestamp": "2026-05-05T02:33:54.021805"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49820, "epoch": 0, "train_loss": 3.7909345477819443, "train_ppl": 44.29777932903762, "lr": 0.00056, "grad_norm": 0.6997, "tokens_per_sec": 150098, "dt_s": 4.366, "eta_s": 18975, "world_size": 1, "timestamp": "2026-05-05T02:33:58.388037"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49830, "epoch": 0, "train_loss": 3.8041984289884567, "train_ppl": 44.88925376357897, "lr": 0.00056, "grad_norm": 0.7878, "tokens_per_sec": 145605, "dt_s": 4.501, "eta_s": 18993, "world_size": 1, "timestamp": "2026-05-05T02:34:02.888977"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49840, "epoch": 0, "train_loss": 3.6144576370716095, "train_ppl": 37.1312018760633, "lr": 0.00056, "grad_norm": 0.6512, "tokens_per_sec": 150361, "dt_s": 4.359, "eta_s": 18969, "world_size": 1, "timestamp": "2026-05-05T02:34:07.247532"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49850, "epoch": 0, "train_loss": 3.825286418199539, "train_ppl": 45.84592959462944, "lr": 0.00056, "grad_norm": 0.6327, "tokens_per_sec": 150595, "dt_s": 4.352, "eta_s": 18889, "world_size": 1, "timestamp": "2026-05-05T02:34:11.599378"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49860, "epoch": 0, "train_loss": 3.848428338766098, "train_ppl": 46.91926406467182, "lr": 0.00056, "grad_norm": 0.6892, "tokens_per_sec": 132430, "dt_s": 4.949, "eta_s": 19291, "world_size": 1, "timestamp": "2026-05-05T02:34:16.548040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49870, "epoch": 0, "train_loss": 3.757592022418976, "train_ppl": 42.8451315471152, "lr": 0.00056, "grad_norm": 0.6677, "tokens_per_sec": 149701, "dt_s": 4.378, "eta_s": 19296, "world_size": 1, "timestamp": "2026-05-05T02:34:20.925849"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49880, "epoch": 0, "train_loss": 3.7973276525735855, "train_ppl": 44.58188687130543, "lr": 0.00056, "grad_norm": 0.659, "tokens_per_sec": 148902, "dt_s": 4.401, "eta_s": 19206, "world_size": 1, "timestamp": "2026-05-05T02:34:25.327153"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49890, "epoch": 0, "train_loss": 3.7768441289663315, "train_ppl": 43.67798192271694, "lr": 0.00056, "grad_norm": 0.6495, "tokens_per_sec": 148003, "dt_s": 4.428, "eta_s": 19261, "world_size": 1, "timestamp": "2026-05-05T02:34:29.755143"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49900, "epoch": 0, "train_loss": 3.822458252310753, "train_ppl": 45.71645287752941, "lr": 0.00056, "grad_norm": 0.7221, "tokens_per_sec": 148422, "dt_s": 4.416, "eta_s": 19311, "world_size": 1, "timestamp": "2026-05-05T02:34:34.170657"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49910, "epoch": 0, "train_loss": 3.7248516231775284, "train_ppl": 41.465079831578244, "lr": 0.00056, "grad_norm": 0.6486, "tokens_per_sec": 149000, "dt_s": 4.398, "eta_s": 18836, "world_size": 1, "timestamp": "2026-05-05T02:34:38.569035"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49920, "epoch": 0, "train_loss": 3.7750519663095474, "train_ppl": 43.599773976204624, "lr": 0.00056, "grad_norm": 0.6364, "tokens_per_sec": 149016, "dt_s": 4.398, "eta_s": 18849, "world_size": 1, "timestamp": "2026-05-05T02:34:42.967008"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49930, "epoch": 0, "train_loss": 3.856929913163185, "train_ppl": 47.31985208041911, "lr": 0.00056, "grad_norm": 0.6756, "tokens_per_sec": 147294, "dt_s": 4.449, "eta_s": 18885, "world_size": 1, "timestamp": "2026-05-05T02:34:47.416310"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49940, "epoch": 0, "train_loss": 3.8531785011291504, "train_ppl": 47.14266837022073, "lr": 0.00056, "grad_norm": 0.6825, "tokens_per_sec": 147681, "dt_s": 4.438, "eta_s": 18889, "world_size": 1, "timestamp": "2026-05-05T02:34:51.853966"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49950, "epoch": 0, "train_loss": 3.8772262930870056, "train_ppl": 48.290086593780735, "lr": 0.00056, "grad_norm": 0.6929, "tokens_per_sec": 150278, "dt_s": 4.361, "eta_s": 18838, "world_size": 1, "timestamp": "2026-05-05T02:34:56.214962"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49960, "epoch": 0, "train_loss": 3.7987236380577087, "train_ppl": 44.64416599849736, "lr": 0.00056, "grad_norm": 0.6645, "tokens_per_sec": 149062, "dt_s": 4.397, "eta_s": 18832, "world_size": 1, "timestamp": "2026-05-05T02:35:00.611524"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49970, "epoch": 0, "train_loss": 3.635099411010742, "train_ppl": 37.905620945937315, "lr": 0.00056, "grad_norm": 0.7277, "tokens_per_sec": 145757, "dt_s": 4.496, "eta_s": 18912, "world_size": 1, "timestamp": "2026-05-05T02:35:05.107769"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49980, "epoch": 0, "train_loss": 3.7010671347379684, "train_ppl": 40.490490122022344, "lr": 0.00056, "grad_norm": 0.6624, "tokens_per_sec": 145766, "dt_s": 4.496, "eta_s": 18947, "world_size": 1, "timestamp": "2026-05-05T02:35:09.603755"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 49990, "epoch": 0, "train_loss": 3.709186241030693, "train_ppl": 40.82057489860027, "lr": 0.00056, "grad_norm": 0.6813, "tokens_per_sec": 147194, "dt_s": 4.452, "eta_s": 18955, "world_size": 1, "timestamp": "2026-05-05T02:35:14.056149"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50000, "epoch": 0, "train_loss": 3.7266367375850677, "train_ppl": 41.53916584933046, "lr": 0.00056, "grad_norm": 0.6712, "tokens_per_sec": 148320, "dt_s": 4.419, "eta_s": 19000, "world_size": 1, "timestamp": "2026-05-05T02:35:18.474673"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50010, "epoch": 0, "train_loss": 3.594993159174919, "train_ppl": 36.41545087529731, "lr": 0.00056, "grad_norm": 0.6731, "tokens_per_sec": 125642, "dt_s": 5.216, "eta_s": 19031, "world_size": 1, "timestamp": "2026-05-05T02:35:23.690741"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50020, "epoch": 0, "train_loss": 3.653373807668686, "train_ppl": 38.60469138930828, "lr": 0.00056, "grad_norm": 0.6617, "tokens_per_sec": 144488, "dt_s": 4.536, "eta_s": 19060, "world_size": 1, "timestamp": "2026-05-05T02:35:28.226463"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50030, "epoch": 0, "train_loss": 3.689098596572876, "train_ppl": 40.0087666588961, "lr": 0.00056, "grad_norm": 0.7009, "tokens_per_sec": 146810, "dt_s": 4.464, "eta_s": 19028, "world_size": 1, "timestamp": "2026-05-05T02:35:32.690462"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50040, "epoch": 0, "train_loss": 3.789459690451622, "train_ppl": 44.23249457915712, "lr": 0.00056, "grad_norm": 0.6917, "tokens_per_sec": 148068, "dt_s": 4.426, "eta_s": 19001, "world_size": 1, "timestamp": "2026-05-05T02:35:37.116523"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50050, "epoch": 0, "train_loss": 3.7065216451883316, "train_ppl": 40.71194935030559, "lr": 0.00056, "grad_norm": 0.6505, "tokens_per_sec": 143914, "dt_s": 4.554, "eta_s": 19112, "world_size": 1, "timestamp": "2026-05-05T02:35:41.670365"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50060, "epoch": 0, "train_loss": 3.705400675535202, "train_ppl": 40.66633805977657, "lr": 0.00056, "grad_norm": 0.6359, "tokens_per_sec": 148242, "dt_s": 4.421, "eta_s": 19093, "world_size": 1, "timestamp": "2026-05-05T02:35:46.091235"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50070, "epoch": 0, "train_loss": 3.7637940049171448, "train_ppl": 43.11168201938851, "lr": 0.00056, "grad_norm": 0.6914, "tokens_per_sec": 148080, "dt_s": 4.426, "eta_s": 18995, "world_size": 1, "timestamp": "2026-05-05T02:35:50.516925"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50080, "epoch": 0, "train_loss": 3.807361528277397, "train_ppl": 45.03146773011462, "lr": 0.00056, "grad_norm": 0.7587, "tokens_per_sec": 148490, "dt_s": 4.414, "eta_s": 18948, "world_size": 1, "timestamp": "2026-05-05T02:35:54.930434"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50090, "epoch": 0, "train_loss": 3.8117456287145615, "train_ppl": 45.22932360057585, "lr": 0.00056, "grad_norm": 0.6386, "tokens_per_sec": 148776, "dt_s": 4.405, "eta_s": 18925, "world_size": 1, "timestamp": "2026-05-05T02:35:59.335441"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50100, "epoch": 0, "train_loss": 3.655739977955818, "train_ppl": 38.69614481754101, "lr": 0.00056, "grad_norm": 0.6286, "tokens_per_sec": 145739, "dt_s": 4.497, "eta_s": 18872, "world_size": 1, "timestamp": "2026-05-05T02:36:03.832275"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50110, "epoch": 0, "train_loss": 3.949848845601082, "train_ppl": 51.927517168945634, "lr": 0.00056, "grad_norm": 0.8054, "tokens_per_sec": 147840, "dt_s": 4.433, "eta_s": 18878, "world_size": 1, "timestamp": "2026-05-05T02:36:08.265166"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50120, "epoch": 0, "train_loss": 3.6638643741607666, "train_ppl": 39.011808180457116, "lr": 0.00056, "grad_norm": 0.6276, "tokens_per_sec": 145942, "dt_s": 4.491, "eta_s": 18929, "world_size": 1, "timestamp": "2026-05-05T02:36:12.755671"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50130, "epoch": 0, "train_loss": 3.8002395778894424, "train_ppl": 44.71189519170933, "lr": 0.00056, "grad_norm": 0.6894, "tokens_per_sec": 145590, "dt_s": 4.501, "eta_s": 18999, "world_size": 1, "timestamp": "2026-05-05T02:36:17.257084"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50140, "epoch": 0, "train_loss": 3.7279347628355026, "train_ppl": 41.593119744668705, "lr": 0.00056, "grad_norm": 0.7357, "tokens_per_sec": 148855, "dt_s": 4.403, "eta_s": 18993, "world_size": 1, "timestamp": "2026-05-05T02:36:21.659793"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50150, "epoch": 0, "train_loss": 3.6950884014368057, "train_ppl": 40.249130511673954, "lr": 0.00056, "grad_norm": 0.6489, "tokens_per_sec": 130536, "dt_s": 5.021, "eta_s": 19434, "world_size": 1, "timestamp": "2026-05-05T02:36:26.680311"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50160, "epoch": 0, "train_loss": 3.743626967072487, "train_ppl": 42.25095542143065, "lr": 0.00056, "grad_norm": 0.6765, "tokens_per_sec": 143222, "dt_s": 4.576, "eta_s": 19551, "world_size": 1, "timestamp": "2026-05-05T02:36:31.256119"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50170, "epoch": 0, "train_loss": 3.6753868460655212, "train_ppl": 39.46392036701984, "lr": 0.00056, "grad_norm": 0.6486, "tokens_per_sec": 148689, "dt_s": 4.408, "eta_s": 19476, "world_size": 1, "timestamp": "2026-05-05T02:36:35.663726"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50180, "epoch": 0, "train_loss": 3.76742522418499, "train_ppl": 43.268514564156156, "lr": 0.00056, "grad_norm": 0.7178, "tokens_per_sec": 145382, "dt_s": 4.508, "eta_s": 19476, "world_size": 1, "timestamp": "2026-05-05T02:36:40.171562"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50190, "epoch": 0, "train_loss": 3.7253405451774597, "train_ppl": 41.48535797814885, "lr": 0.00056, "grad_norm": 0.6373, "tokens_per_sec": 149232, "dt_s": 4.392, "eta_s": 19462, "world_size": 1, "timestamp": "2026-05-05T02:36:44.563114"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50200, "epoch": 0, "train_loss": 3.78935045003891, "train_ppl": 44.227662867107924, "lr": 0.00056, "grad_norm": 0.6103, "tokens_per_sec": 148191, "dt_s": 4.422, "eta_s": 18950, "world_size": 1, "timestamp": "2026-05-05T02:36:48.985503"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50210, "epoch": 0, "train_loss": 3.7125341594219208, "train_ppl": 40.95746787744015, "lr": 0.00056, "grad_norm": 0.6618, "tokens_per_sec": 145664, "dt_s": 4.499, "eta_s": 18880, "world_size": 1, "timestamp": "2026-05-05T02:36:53.484645"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50220, "epoch": 0, "train_loss": 3.7734155505895615, "train_ppl": 43.52848496581923, "lr": 0.00056, "grad_norm": 0.7664, "tokens_per_sec": 149326, "dt_s": 4.389, "eta_s": 18860, "world_size": 1, "timestamp": "2026-05-05T02:36:57.873445"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50230, "epoch": 0, "train_loss": 3.715846061706543, "train_ppl": 41.093339882082205, "lr": 0.00056, "grad_norm": 0.6781, "tokens_per_sec": 146648, "dt_s": 4.469, "eta_s": 18822, "world_size": 1, "timestamp": "2026-05-05T02:37:02.342367"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50240, "epoch": 0, "train_loss": 3.7116643637418747, "train_ppl": 40.92185873739781, "lr": 0.00056, "grad_norm": 0.6973, "tokens_per_sec": 146099, "dt_s": 4.486, "eta_s": 18898, "world_size": 1, "timestamp": "2026-05-05T02:37:06.828091"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50250, "epoch": 0, "train_loss": 3.7842864394187927, "train_ppl": 44.00425964819875, "lr": 0.00056, "grad_norm": 0.6719, "tokens_per_sec": 149458, "dt_s": 4.385, "eta_s": 18861, "world_size": 1, "timestamp": "2026-05-05T02:37:11.213010"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50260, "epoch": 0, "train_loss": 3.637129619717598, "train_ppl": 37.98265543921029, "lr": 0.00056, "grad_norm": 0.7719, "tokens_per_sec": 148623, "dt_s": 4.41, "eta_s": 18781, "world_size": 1, "timestamp": "2026-05-05T02:37:15.622531"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50270, "epoch": 0, "train_loss": 3.740960642695427, "train_ppl": 42.13845072269913, "lr": 0.00056, "grad_norm": 0.6325, "tokens_per_sec": 148298, "dt_s": 4.419, "eta_s": 18802, "world_size": 1, "timestamp": "2026-05-05T02:37:20.041752"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50280, "epoch": 0, "train_loss": 3.6772197037935257, "train_ppl": 39.536318445863394, "lr": 0.00056, "grad_norm": 0.6849, "tokens_per_sec": 149892, "dt_s": 4.372, "eta_s": 18716, "world_size": 1, "timestamp": "2026-05-05T02:37:24.413972"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50290, "epoch": 0, "train_loss": 3.7651383727788925, "train_ppl": 43.169678955034506, "lr": 0.00056, "grad_norm": 0.7253, "tokens_per_sec": 146431, "dt_s": 4.476, "eta_s": 18703, "world_size": 1, "timestamp": "2026-05-05T02:37:28.889544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50300, "epoch": 0, "train_loss": 3.7431376427412033, "train_ppl": 42.23028605834653, "lr": 0.00056, "grad_norm": 0.6561, "tokens_per_sec": 149720, "dt_s": 4.377, "eta_s": 18692, "world_size": 1, "timestamp": "2026-05-05T02:37:33.266780"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50310, "epoch": 0, "train_loss": 3.744644209742546, "train_ppl": 42.2939567638282, "lr": 0.00056, "grad_norm": 0.6449, "tokens_per_sec": 149477, "dt_s": 4.384, "eta_s": 18666, "world_size": 1, "timestamp": "2026-05-05T02:37:37.651130"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50320, "epoch": 0, "train_loss": 3.8223026990890503, "train_ppl": 45.70934208906686, "lr": 0.00056, "grad_norm": 0.7323, "tokens_per_sec": 146302, "dt_s": 4.48, "eta_s": 18713, "world_size": 1, "timestamp": "2026-05-05T02:37:42.130637"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50330, "epoch": 0, "train_loss": 3.7851714193820953, "train_ppl": 44.04321977320961, "lr": 0.00056, "grad_norm": 0.6645, "tokens_per_sec": 149694, "dt_s": 4.378, "eta_s": 18713, "world_size": 1, "timestamp": "2026-05-05T02:37:46.508648"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50340, "epoch": 0, "train_loss": 3.725295752286911, "train_ppl": 41.48349977066711, "lr": 0.00056, "grad_norm": 0.6614, "tokens_per_sec": 147749, "dt_s": 4.436, "eta_s": 18675, "world_size": 1, "timestamp": "2026-05-05T02:37:50.944279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50350, "epoch": 0, "train_loss": 3.6456043124198914, "train_ppl": 38.305914598378294, "lr": 0.00056, "grad_norm": 0.6264, "tokens_per_sec": 148699, "dt_s": 4.407, "eta_s": 18696, "world_size": 1, "timestamp": "2026-05-05T02:37:55.351552"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50360, "epoch": 0, "train_loss": 3.7264114916324615, "train_ppl": 41.52981037402929, "lr": 0.00056, "grad_norm": 0.7337, "tokens_per_sec": 150125, "dt_s": 4.365, "eta_s": 18676, "world_size": 1, "timestamp": "2026-05-05T02:37:59.717017"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50370, "epoch": 0, "train_loss": 3.7077429741621017, "train_ppl": 40.76170240986788, "lr": 0.00056, "grad_norm": 0.6561, "tokens_per_sec": 146366, "dt_s": 4.478, "eta_s": 18670, "world_size": 1, "timestamp": "2026-05-05T02:38:04.194543"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50380, "epoch": 0, "train_loss": 3.7190905064344406, "train_ppl": 41.22688146902028, "lr": 0.00056, "grad_norm": 0.5966, "tokens_per_sec": 148717, "dt_s": 4.407, "eta_s": 18690, "world_size": 1, "timestamp": "2026-05-05T02:38:08.601336"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50390, "epoch": 0, "train_loss": 3.728104993700981, "train_ppl": 41.600200780129086, "lr": 0.00056, "grad_norm": 0.6141, "tokens_per_sec": 148503, "dt_s": 4.413, "eta_s": 18666, "world_size": 1, "timestamp": "2026-05-05T02:38:13.014397"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50400, "epoch": 0, "train_loss": 3.7409865856170654, "train_ppl": 42.13954393140465, "lr": 0.00056, "grad_norm": 0.7146, "tokens_per_sec": 146494, "dt_s": 4.474, "eta_s": 18718, "world_size": 1, "timestamp": "2026-05-05T02:38:17.488027"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50410, "epoch": 0, "train_loss": 3.717602863907814, "train_ppl": 41.16559620349283, "lr": 0.00056, "grad_norm": 0.6891, "tokens_per_sec": 150462, "dt_s": 4.356, "eta_s": 18705, "world_size": 1, "timestamp": "2026-05-05T02:38:21.843689"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50420, "epoch": 0, "train_loss": 3.815869480371475, "train_ppl": 45.41622773924152, "lr": 0.00056, "grad_norm": 0.7899, "tokens_per_sec": 149053, "dt_s": 4.397, "eta_s": 18632, "world_size": 1, "timestamp": "2026-05-05T02:38:26.240538"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50430, "epoch": 0, "train_loss": 3.7264790534973145, "train_ppl": 41.53261630025088, "lr": 0.00056, "grad_norm": 0.6431, "tokens_per_sec": 144889, "dt_s": 4.523, "eta_s": 18726, "world_size": 1, "timestamp": "2026-05-05T02:38:30.763706"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50440, "epoch": 0, "train_loss": 3.650668129324913, "train_ppl": 38.500380691082356, "lr": 0.00056, "grad_norm": 0.6235, "tokens_per_sec": 150782, "dt_s": 4.346, "eta_s": 18666, "world_size": 1, "timestamp": "2026-05-05T02:38:35.110106"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50450, "epoch": 0, "train_loss": 3.7513725012540817, "train_ppl": 42.57948230651202, "lr": 0.00056, "grad_norm": 0.6542, "tokens_per_sec": 132883, "dt_s": 4.932, "eta_s": 19048, "world_size": 1, "timestamp": "2026-05-05T02:38:40.042010"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50460, "epoch": 0, "train_loss": 3.6105340868234634, "train_ppl": 36.985801169744995, "lr": 0.00056, "grad_norm": 0.6404, "tokens_per_sec": 148074, "dt_s": 4.426, "eta_s": 19103, "world_size": 1, "timestamp": "2026-05-05T02:38:44.467867"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50470, "epoch": 0, "train_loss": 3.787171810865402, "train_ppl": 44.13141163465682, "lr": 0.00056, "grad_norm": 0.7191, "tokens_per_sec": 147984, "dt_s": 4.429, "eta_s": 19125, "world_size": 1, "timestamp": "2026-05-05T02:38:48.896474"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50480, "epoch": 0, "train_loss": 3.8212072551250458, "train_ppl": 45.659297481712755, "lr": 0.00056, "grad_norm": 0.7434, "tokens_per_sec": 144032, "dt_s": 4.55, "eta_s": 19144, "world_size": 1, "timestamp": "2026-05-05T02:38:53.446592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50490, "epoch": 0, "train_loss": 3.7437947392463684, "train_ppl": 42.25804455073297, "lr": 0.00056, "grad_norm": 0.723, "tokens_per_sec": 146207, "dt_s": 4.482, "eta_s": 19254, "world_size": 1, "timestamp": "2026-05-05T02:38:57.928988"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50500, "epoch": 0, "train_loss": 3.6958062201738358, "train_ppl": 40.27803246364472, "lr": 0.00056, "grad_norm": 0.6333, "tokens_per_sec": 146381, "dt_s": 4.477, "eta_s": 18866, "world_size": 1, "timestamp": "2026-05-05T02:39:02.406071"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50510, "epoch": 0, "train_loss": 3.804277464747429, "train_ppl": 44.89280176002732, "lr": 0.00056, "grad_norm": 0.6278, "tokens_per_sec": 123635, "dt_s": 5.301, "eta_s": 18926, "world_size": 1, "timestamp": "2026-05-05T02:39:07.706852"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50520, "epoch": 0, "train_loss": 3.569223627448082, "train_ppl": 35.4890297446887, "lr": 0.00056, "grad_norm": 0.6868, "tokens_per_sec": 148509, "dt_s": 4.413, "eta_s": 18909, "world_size": 1, "timestamp": "2026-05-05T02:39:12.119776"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50530, "epoch": 0, "train_loss": 3.712163895368576, "train_ppl": 40.942305606564275, "lr": 0.00056, "grad_norm": 0.7119, "tokens_per_sec": 146840, "dt_s": 4.463, "eta_s": 18831, "world_size": 1, "timestamp": "2026-05-05T02:39:16.582849"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50540, "epoch": 0, "train_loss": 3.6578202843666077, "train_ppl": 38.77672844593552, "lr": 0.00056, "grad_norm": 0.6609, "tokens_per_sec": 145061, "dt_s": 4.518, "eta_s": 18856, "world_size": 1, "timestamp": "2026-05-05T02:39:21.100714"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50550, "epoch": 0, "train_loss": 3.76256987452507, "train_ppl": 43.058939987323456, "lr": 0.00056, "grad_norm": 0.6915, "tokens_per_sec": 147559, "dt_s": 4.441, "eta_s": 18822, "world_size": 1, "timestamp": "2026-05-05T02:39:25.542038"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50560, "epoch": 0, "train_loss": 3.7011119723320007, "train_ppl": 40.49230565888245, "lr": 0.00056, "grad_norm": 0.6404, "tokens_per_sec": 146255, "dt_s": 4.481, "eta_s": 18798, "world_size": 1, "timestamp": "2026-05-05T02:39:30.022990"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50570, "epoch": 0, "train_loss": 3.7409947216510773, "train_ppl": 42.13988678156204, "lr": 0.00056, "grad_norm": 0.63, "tokens_per_sec": 151008, "dt_s": 4.34, "eta_s": 18732, "world_size": 1, "timestamp": "2026-05-05T02:39:34.362886"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50580, "epoch": 0, "train_loss": 3.7973698377609253, "train_ppl": 44.583767606224356, "lr": 0.00056, "grad_norm": 0.6558, "tokens_per_sec": 147892, "dt_s": 4.431, "eta_s": 18701, "world_size": 1, "timestamp": "2026-05-05T02:39:38.794238"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50590, "epoch": 0, "train_loss": 3.7393438071012497, "train_ppl": 42.07037482428511, "lr": 0.00056, "grad_norm": 0.6523, "tokens_per_sec": 145340, "dt_s": 4.509, "eta_s": 18689, "world_size": 1, "timestamp": "2026-05-05T02:39:43.303387"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50600, "epoch": 0, "train_loss": 3.8313889652490616, "train_ppl": 46.12656195217204, "lr": 0.00056, "grad_norm": 0.6484, "tokens_per_sec": 147919, "dt_s": 4.431, "eta_s": 18676, "world_size": 1, "timestamp": "2026-05-05T02:39:47.733892"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50610, "epoch": 0, "train_loss": 3.733140766620636, "train_ppl": 41.81021830112281, "lr": 0.00056, "grad_norm": 0.655, "tokens_per_sec": 148079, "dt_s": 4.426, "eta_s": 18625, "world_size": 1, "timestamp": "2026-05-05T02:39:52.159646"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50620, "epoch": 0, "train_loss": 3.704109087586403, "train_ppl": 40.613847812799, "lr": 0.00056, "grad_norm": 0.6989, "tokens_per_sec": 148377, "dt_s": 4.417, "eta_s": 18685, "world_size": 1, "timestamp": "2026-05-05T02:39:56.576483"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50630, "epoch": 0, "train_loss": 3.7205376774072647, "train_ppl": 41.28658700682279, "lr": 0.00056, "grad_norm": 0.6399, "tokens_per_sec": 149370, "dt_s": 4.387, "eta_s": 18644, "world_size": 1, "timestamp": "2026-05-05T02:40:00.963963"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50640, "epoch": 0, "train_loss": 3.6267673820257187, "train_ppl": 37.59110232222409, "lr": 0.00056, "grad_norm": 0.8485, "tokens_per_sec": 145874, "dt_s": 4.493, "eta_s": 18626, "world_size": 1, "timestamp": "2026-05-05T02:40:05.456646"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50650, "epoch": 0, "train_loss": 3.8319186121225357, "train_ppl": 46.150999212482105, "lr": 0.00056, "grad_norm": 0.6735, "tokens_per_sec": 146404, "dt_s": 4.476, "eta_s": 18660, "world_size": 1, "timestamp": "2026-05-05T02:40:09.933028"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50660, "epoch": 0, "train_loss": 3.766224205493927, "train_ppl": 43.21657946318636, "lr": 0.00056, "grad_norm": 0.6509, "tokens_per_sec": 147105, "dt_s": 4.455, "eta_s": 18680, "world_size": 1, "timestamp": "2026-05-05T02:40:14.388112"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50670, "epoch": 0, "train_loss": 3.8026582300662994, "train_ppl": 44.82016859942061, "lr": 0.00056, "grad_norm": 0.6248, "tokens_per_sec": 145736, "dt_s": 4.497, "eta_s": 18743, "world_size": 1, "timestamp": "2026-05-05T02:40:18.884956"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50680, "epoch": 0, "train_loss": 3.7059400379657745, "train_ppl": 40.68827787093803, "lr": 0.00056, "grad_norm": 0.7131, "tokens_per_sec": 148720, "dt_s": 4.407, "eta_s": 18754, "world_size": 1, "timestamp": "2026-05-05T02:40:23.291639"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50690, "epoch": 0, "train_loss": 3.8016417622566223, "train_ppl": 44.774633487223475, "lr": 0.00056, "grad_norm": 0.659, "tokens_per_sec": 147076, "dt_s": 4.456, "eta_s": 18719, "world_size": 1, "timestamp": "2026-05-05T02:40:27.747542"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50700, "epoch": 0, "train_loss": 3.7427031695842743, "train_ppl": 42.21194211790816, "lr": 0.00056, "grad_norm": 0.7237, "tokens_per_sec": 145405, "dt_s": 4.507, "eta_s": 18740, "world_size": 1, "timestamp": "2026-05-05T02:40:32.254677"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50710, "epoch": 0, "train_loss": 3.6929735392332077, "train_ppl": 40.16409909338638, "lr": 0.00056, "grad_norm": 0.6451, "tokens_per_sec": 148958, "dt_s": 4.4, "eta_s": 18689, "world_size": 1, "timestamp": "2026-05-05T02:40:36.654311"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50720, "epoch": 0, "train_loss": 3.723686471581459, "train_ppl": 41.416794862748134, "lr": 0.00056, "grad_norm": 0.6298, "tokens_per_sec": 145196, "dt_s": 4.514, "eta_s": 18699, "world_size": 1, "timestamp": "2026-05-05T02:40:41.167918"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50730, "epoch": 0, "train_loss": 3.736950173974037, "train_ppl": 41.969794206004266, "lr": 0.00056, "grad_norm": 0.683, "tokens_per_sec": 148038, "dt_s": 4.427, "eta_s": 18712, "world_size": 1, "timestamp": "2026-05-05T02:40:45.594892"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50740, "epoch": 0, "train_loss": 3.8020565509796143, "train_ppl": 44.79320935253347, "lr": 0.00056, "grad_norm": 0.6915, "tokens_per_sec": 132361, "dt_s": 4.951, "eta_s": 19123, "world_size": 1, "timestamp": "2026-05-05T02:40:50.546233"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50750, "epoch": 0, "train_loss": 3.7190701067447662, "train_ppl": 41.22604046201024, "lr": 0.00056, "grad_norm": 0.6292, "tokens_per_sec": 145842, "dt_s": 4.494, "eta_s": 19107, "world_size": 1, "timestamp": "2026-05-05T02:40:55.039850"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50760, "epoch": 0, "train_loss": 3.690104141831398, "train_ppl": 40.049017518148055, "lr": 0.00056, "grad_norm": 0.6693, "tokens_per_sec": 150452, "dt_s": 4.356, "eta_s": 19066, "world_size": 1, "timestamp": "2026-05-05T02:40:59.395791"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50770, "epoch": 0, "train_loss": 3.6220261454582214, "train_ppl": 37.41329585761803, "lr": 0.00056, "grad_norm": 0.6225, "tokens_per_sec": 145076, "dt_s": 4.517, "eta_s": 19064, "world_size": 1, "timestamp": "2026-05-05T02:41:03.913202"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50780, "epoch": 0, "train_loss": 3.5780169516801834, "train_ppl": 35.80247237156021, "lr": 0.00056, "grad_norm": 0.8155, "tokens_per_sec": 147309, "dt_s": 4.449, "eta_s": 19078, "world_size": 1, "timestamp": "2026-05-05T02:41:08.362059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50790, "epoch": 0, "train_loss": 3.714967116713524, "train_ppl": 41.057236965309635, "lr": 0.00056, "grad_norm": 0.6462, "tokens_per_sec": 146729, "dt_s": 4.466, "eta_s": 18667, "world_size": 1, "timestamp": "2026-05-05T02:41:12.828516"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50800, "epoch": 0, "train_loss": 3.819076821208, "train_ppl": 45.56212691022259, "lr": 0.00056, "grad_norm": 0.6377, "tokens_per_sec": 147102, "dt_s": 4.455, "eta_s": 18631, "world_size": 1, "timestamp": "2026-05-05T02:41:17.283669"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50810, "epoch": 0, "train_loss": 3.651199519634247, "train_ppl": 38.52084485703545, "lr": 0.00056, "grad_norm": 0.627, "tokens_per_sec": 148954, "dt_s": 4.4, "eta_s": 18663, "world_size": 1, "timestamp": "2026-05-05T02:41:21.683443"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50820, "epoch": 0, "train_loss": 3.7866704761981964, "train_ppl": 44.109292573081106, "lr": 0.00056, "grad_norm": 0.7164, "tokens_per_sec": 147800, "dt_s": 4.434, "eta_s": 18589, "world_size": 1, "timestamp": "2026-05-05T02:41:26.117553"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50830, "epoch": 0, "train_loss": 3.783338561654091, "train_ppl": 43.962568750984794, "lr": 0.00056, "grad_norm": 0.6901, "tokens_per_sec": 145702, "dt_s": 4.498, "eta_s": 18625, "world_size": 1, "timestamp": "2026-05-05T02:41:30.615480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50840, "epoch": 0, "train_loss": 3.7347826212644577, "train_ppl": 41.87892098667796, "lr": 0.00056, "grad_norm": 0.6451, "tokens_per_sec": 148598, "dt_s": 4.41, "eta_s": 18574, "world_size": 1, "timestamp": "2026-05-05T02:41:35.025778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50850, "epoch": 0, "train_loss": 3.8188203424215317, "train_ppl": 45.55044268964514, "lr": 0.00056, "grad_norm": 0.7186, "tokens_per_sec": 150109, "dt_s": 4.366, "eta_s": 18495, "world_size": 1, "timestamp": "2026-05-05T02:41:39.391678"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50860, "epoch": 0, "train_loss": 3.6637367010116577, "train_ppl": 39.00682773799544, "lr": 0.00056, "grad_norm": 0.8241, "tokens_per_sec": 146387, "dt_s": 4.477, "eta_s": 18555, "world_size": 1, "timestamp": "2026-05-05T02:41:43.868584"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50870, "epoch": 0, "train_loss": 3.6635747253894806, "train_ppl": 39.000510094469355, "lr": 0.00056, "grad_norm": 0.6383, "tokens_per_sec": 148719, "dt_s": 4.407, "eta_s": 18527, "world_size": 1, "timestamp": "2026-05-05T02:41:48.275290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50880, "epoch": 0, "train_loss": 3.7266353219747543, "train_ppl": 41.53910704610049, "lr": 0.00056, "grad_norm": 0.6082, "tokens_per_sec": 144590, "dt_s": 4.533, "eta_s": 18552, "world_size": 1, "timestamp": "2026-05-05T02:41:52.807833"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50890, "epoch": 0, "train_loss": 3.755907282233238, "train_ppl": 42.77300940283093, "lr": 0.00056, "grad_norm": 0.6475, "tokens_per_sec": 144122, "dt_s": 4.547, "eta_s": 18662, "world_size": 1, "timestamp": "2026-05-05T02:41:57.355110"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50900, "epoch": 0, "train_loss": 3.732236847281456, "train_ppl": 41.77244231202078, "lr": 0.00056, "grad_norm": 0.6341, "tokens_per_sec": 147439, "dt_s": 4.445, "eta_s": 18724, "world_size": 1, "timestamp": "2026-05-05T02:42:01.800044"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50910, "epoch": 0, "train_loss": 3.7835272401571274, "train_ppl": 43.97086432522003, "lr": 0.00056, "grad_norm": 0.6637, "tokens_per_sec": 146480, "dt_s": 4.474, "eta_s": 18717, "world_size": 1, "timestamp": "2026-05-05T02:42:06.274109"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50920, "epoch": 0, "train_loss": 3.742470994591713, "train_ppl": 42.202142698195054, "lr": 0.00056, "grad_norm": 0.6889, "tokens_per_sec": 148275, "dt_s": 4.42, "eta_s": 18723, "world_size": 1, "timestamp": "2026-05-05T02:42:10.694006"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50930, "epoch": 0, "train_loss": 3.870760142803192, "train_ppl": 47.978842995041475, "lr": 0.00056, "grad_norm": 0.8047, "tokens_per_sec": 148673, "dt_s": 4.408, "eta_s": 18615, "world_size": 1, "timestamp": "2026-05-05T02:42:15.102073"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50940, "epoch": 0, "train_loss": 3.7333604097366333, "train_ppl": 41.81940263635207, "lr": 0.00056, "grad_norm": 0.7021, "tokens_per_sec": 145473, "dt_s": 4.505, "eta_s": 18575, "world_size": 1, "timestamp": "2026-05-05T02:42:19.607064"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50950, "epoch": 0, "train_loss": 3.7282140403985977, "train_ppl": 41.60473739199112, "lr": 0.00056, "grad_norm": 0.6431, "tokens_per_sec": 147491, "dt_s": 4.443, "eta_s": 18569, "world_size": 1, "timestamp": "2026-05-05T02:42:24.050456"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50960, "epoch": 0, "train_loss": 3.8350639045238495, "train_ppl": 46.29638612193121, "lr": 0.00056, "grad_norm": 0.6959, "tokens_per_sec": 148109, "dt_s": 4.425, "eta_s": 18524, "world_size": 1, "timestamp": "2026-05-05T02:42:28.475321"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50970, "epoch": 0, "train_loss": 3.650778964161873, "train_ppl": 38.50464811098417, "lr": 0.00056, "grad_norm": 0.6804, "tokens_per_sec": 146494, "dt_s": 4.474, "eta_s": 18564, "world_size": 1, "timestamp": "2026-05-05T02:42:32.948936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50980, "epoch": 0, "train_loss": 3.6766968816518784, "train_ppl": 39.51565338572689, "lr": 0.00056, "grad_norm": 0.6767, "tokens_per_sec": 148224, "dt_s": 4.421, "eta_s": 18571, "world_size": 1, "timestamp": "2026-05-05T02:42:37.370389"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 50990, "epoch": 0, "train_loss": 3.652900218963623, "train_ppl": 38.58641297207147, "lr": 0.00056, "grad_norm": 0.6709, "tokens_per_sec": 145083, "dt_s": 4.517, "eta_s": 18577, "world_size": 1, "timestamp": "2026-05-05T02:42:41.887488"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51000, "epoch": 0, "train_loss": 3.6953559517860413, "train_ppl": 40.25990062130783, "lr": 0.00056, "grad_norm": 0.7117, "tokens_per_sec": 146546, "dt_s": 4.472, "eta_s": 18596, "world_size": 1, "timestamp": "2026-05-05T02:42:46.359570"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51010, "epoch": 0, "train_loss": 3.855597972869873, "train_ppl": 47.2568668183755, "lr": 0.00056, "grad_norm": 0.6943, "tokens_per_sec": 124885, "dt_s": 5.248, "eta_s": 18620, "world_size": 1, "timestamp": "2026-05-05T02:42:51.607279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51020, "epoch": 0, "train_loss": 3.7523170709609985, "train_ppl": 42.61972059657813, "lr": 0.00056, "grad_norm": 0.6218, "tokens_per_sec": 147821, "dt_s": 4.433, "eta_s": 18582, "world_size": 1, "timestamp": "2026-05-05T02:42:56.040741"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51030, "epoch": 0, "train_loss": 3.6915665566921234, "train_ppl": 40.107628642968265, "lr": 0.00056, "grad_norm": 0.6141, "tokens_per_sec": 132841, "dt_s": 4.933, "eta_s": 19004, "world_size": 1, "timestamp": "2026-05-05T02:43:00.974173"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51040, "epoch": 0, "train_loss": 3.751936599612236, "train_ppl": 42.603508098389966, "lr": 0.00056, "grad_norm": 0.6274, "tokens_per_sec": 147836, "dt_s": 4.433, "eta_s": 18930, "world_size": 1, "timestamp": "2026-05-05T02:43:05.407207"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51050, "epoch": 0, "train_loss": 3.6551319509744644, "train_ppl": 38.67262366888872, "lr": 0.00056, "grad_norm": 0.6568, "tokens_per_sec": 147544, "dt_s": 4.442, "eta_s": 18900, "world_size": 1, "timestamp": "2026-05-05T02:43:09.849024"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51060, "epoch": 0, "train_loss": 3.656098410487175, "train_ppl": 38.71001726070065, "lr": 0.00056, "grad_norm": 0.7065, "tokens_per_sec": 148821, "dt_s": 4.404, "eta_s": 18849, "world_size": 1, "timestamp": "2026-05-05T02:43:14.252655"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51070, "epoch": 0, "train_loss": 3.734584093093872, "train_ppl": 41.87060766634982, "lr": 0.00056, "grad_norm": 0.6293, "tokens_per_sec": 146079, "dt_s": 4.486, "eta_s": 18889, "world_size": 1, "timestamp": "2026-05-05T02:43:18.738997"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51080, "epoch": 0, "train_loss": 3.7862153500318527, "train_ppl": 44.089221847554825, "lr": 0.00056, "grad_norm": 0.6688, "tokens_per_sec": 146748, "dt_s": 4.466, "eta_s": 18495, "world_size": 1, "timestamp": "2026-05-05T02:43:23.204859"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51090, "epoch": 0, "train_loss": 3.7433470338582993, "train_ppl": 42.23912963097003, "lr": 0.00056, "grad_norm": 0.6419, "tokens_per_sec": 147073, "dt_s": 4.456, "eta_s": 18510, "world_size": 1, "timestamp": "2026-05-05T02:43:27.660890"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51100, "epoch": 0, "train_loss": 3.691376358270645, "train_ppl": 40.10000096072069, "lr": 0.00056, "grad_norm": 0.6811, "tokens_per_sec": 147192, "dt_s": 4.452, "eta_s": 18514, "world_size": 1, "timestamp": "2026-05-05T02:43:32.113306"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51110, "epoch": 0, "train_loss": 3.660788059234619, "train_ppl": 38.89197998179527, "lr": 0.00056, "grad_norm": 0.8138, "tokens_per_sec": 147318, "dt_s": 4.449, "eta_s": 18547, "world_size": 1, "timestamp": "2026-05-05T02:43:36.561914"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51120, "epoch": 0, "train_loss": 3.7472062706947327, "train_ppl": 42.40245538966242, "lr": 0.00056, "grad_norm": 0.6263, "tokens_per_sec": 148434, "dt_s": 4.415, "eta_s": 18484, "world_size": 1, "timestamp": "2026-05-05T02:43:40.977065"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51130, "epoch": 0, "train_loss": 3.7063284814357758, "train_ppl": 40.704086036873235, "lr": 0.00056, "grad_norm": 0.7676, "tokens_per_sec": 147751, "dt_s": 4.436, "eta_s": 18454, "world_size": 1, "timestamp": "2026-05-05T02:43:45.412653"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51140, "epoch": 0, "train_loss": 3.7931676357984543, "train_ppl": 44.39681070094707, "lr": 0.00056, "grad_norm": 0.6723, "tokens_per_sec": 149917, "dt_s": 4.371, "eta_s": 18379, "world_size": 1, "timestamp": "2026-05-05T02:43:49.784137"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51150, "epoch": 0, "train_loss": 3.7109145522117615, "train_ppl": 40.891186556497324, "lr": 0.00056, "grad_norm": 0.6282, "tokens_per_sec": 147099, "dt_s": 4.455, "eta_s": 18377, "world_size": 1, "timestamp": "2026-05-05T02:43:54.239361"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51160, "epoch": 0, "train_loss": 3.749628484249115, "train_ppl": 42.50528768245564, "lr": 0.00056, "grad_norm": 0.6377, "tokens_per_sec": 148369, "dt_s": 4.417, "eta_s": 18347, "world_size": 1, "timestamp": "2026-05-05T02:43:58.656452"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51170, "epoch": 0, "train_loss": 3.7061088234186172, "train_ppl": 40.69514603995107, "lr": 0.00056, "grad_norm": 0.8712, "tokens_per_sec": 145522, "dt_s": 4.504, "eta_s": 18415, "world_size": 1, "timestamp": "2026-05-05T02:44:03.159954"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51180, "epoch": 0, "train_loss": 3.8111398220062256, "train_ppl": 45.20193167087026, "lr": 0.00056, "grad_norm": 0.7748, "tokens_per_sec": 146319, "dt_s": 4.479, "eta_s": 18447, "world_size": 1, "timestamp": "2026-05-05T02:44:07.638935"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51190, "epoch": 0, "train_loss": 3.8403830975294113, "train_ppl": 46.543301648663586, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 147371, "dt_s": 4.447, "eta_s": 18505, "world_size": 1, "timestamp": "2026-05-05T02:44:12.085943"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51200, "epoch": 0, "train_loss": 3.7418822199106216, "train_ppl": 42.17730245845625, "lr": 0.00056, "grad_norm": 0.7054, "tokens_per_sec": 148785, "dt_s": 4.405, "eta_s": 18459, "world_size": 1, "timestamp": "2026-05-05T02:44:16.490674"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51210, "epoch": 0, "train_loss": 3.7003638297319412, "train_ppl": 40.46202296934193, "lr": 0.00056, "grad_norm": 0.6491, "tokens_per_sec": 147260, "dt_s": 4.45, "eta_s": 18482, "world_size": 1, "timestamp": "2026-05-05T02:44:20.941044"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51220, "epoch": 0, "train_loss": 3.798232078552246, "train_ppl": 44.62222612715527, "lr": 0.00056, "grad_norm": 0.6829, "tokens_per_sec": 145887, "dt_s": 4.492, "eta_s": 18468, "world_size": 1, "timestamp": "2026-05-05T02:44:25.433307"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51230, "epoch": 0, "train_loss": 3.674519047141075, "train_ppl": 39.42968847471884, "lr": 0.00056, "grad_norm": 0.684, "tokens_per_sec": 146449, "dt_s": 4.475, "eta_s": 18461, "world_size": 1, "timestamp": "2026-05-05T02:44:29.908309"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51240, "epoch": 0, "train_loss": 3.6799569875001907, "train_ppl": 39.64468881872899, "lr": 0.00056, "grad_norm": 0.6402, "tokens_per_sec": 150141, "dt_s": 4.365, "eta_s": 18388, "world_size": 1, "timestamp": "2026-05-05T02:44:34.273272"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51250, "epoch": 0, "train_loss": 3.73769947886467, "train_ppl": 42.00125416313959, "lr": 0.00056, "grad_norm": 0.6957, "tokens_per_sec": 148444, "dt_s": 4.415, "eta_s": 18392, "world_size": 1, "timestamp": "2026-05-05T02:44:38.688114"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51260, "epoch": 0, "train_loss": 3.7150129228830338, "train_ppl": 41.05911768313959, "lr": 0.00056, "grad_norm": 0.6241, "tokens_per_sec": 142662, "dt_s": 4.594, "eta_s": 18506, "world_size": 1, "timestamp": "2026-05-05T02:44:43.281932"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51270, "epoch": 0, "train_loss": 3.7037312537431717, "train_ppl": 40.59850542521064, "lr": 0.00056, "grad_norm": 0.6577, "tokens_per_sec": 146104, "dt_s": 4.486, "eta_s": 18496, "world_size": 1, "timestamp": "2026-05-05T02:44:47.767497"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51280, "epoch": 0, "train_loss": 3.759825199842453, "train_ppl": 42.94091924324393, "lr": 0.00056, "grad_norm": 0.6944, "tokens_per_sec": 146264, "dt_s": 4.481, "eta_s": 18497, "world_size": 1, "timestamp": "2026-05-05T02:44:52.248165"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51290, "epoch": 0, "train_loss": 3.710384503006935, "train_ppl": 40.86951795879746, "lr": 0.00056, "grad_norm": 0.6682, "tokens_per_sec": 144816, "dt_s": 4.525, "eta_s": 18625, "world_size": 1, "timestamp": "2026-05-05T02:44:56.773627"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51300, "epoch": 0, "train_loss": 3.680901348590851, "train_ppl": 39.68214540378778, "lr": 0.00056, "grad_norm": 0.5984, "tokens_per_sec": 146385, "dt_s": 4.477, "eta_s": 18672, "world_size": 1, "timestamp": "2026-05-05T02:45:01.250638"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51310, "epoch": 0, "train_loss": 3.778174951672554, "train_ppl": 43.7361482687839, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 146778, "dt_s": 4.465, "eta_s": 18561, "world_size": 1, "timestamp": "2026-05-05T02:45:05.715561"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51320, "epoch": 0, "train_loss": 3.714045360684395, "train_ppl": 41.01940964605556, "lr": 0.00056, "grad_norm": 0.6683, "tokens_per_sec": 144263, "dt_s": 4.543, "eta_s": 18604, "world_size": 1, "timestamp": "2026-05-05T02:45:10.258404"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51330, "epoch": 0, "train_loss": 3.744849815964699, "train_ppl": 42.30265355852521, "lr": 0.00056, "grad_norm": 0.7793, "tokens_per_sec": 133208, "dt_s": 4.92, "eta_s": 18962, "world_size": 1, "timestamp": "2026-05-05T02:45:15.178211"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51340, "epoch": 0, "train_loss": 3.6286175698041916, "train_ppl": 37.66071730085252, "lr": 0.00056, "grad_norm": 0.679, "tokens_per_sec": 145327, "dt_s": 4.51, "eta_s": 18945, "world_size": 1, "timestamp": "2026-05-05T02:45:19.687750"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51350, "epoch": 0, "train_loss": 3.734354540705681, "train_ppl": 41.86099727145174, "lr": 0.00056, "grad_norm": 0.6378, "tokens_per_sec": 146782, "dt_s": 4.465, "eta_s": 18930, "world_size": 1, "timestamp": "2026-05-05T02:45:24.152601"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51360, "epoch": 0, "train_loss": 3.725993290543556, "train_ppl": 41.512446193221564, "lr": 0.00056, "grad_norm": 0.6641, "tokens_per_sec": 147210, "dt_s": 4.452, "eta_s": 18915, "world_size": 1, "timestamp": "2026-05-05T02:45:28.604489"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51370, "epoch": 0, "train_loss": 3.7770015746355057, "train_ppl": 43.68485937320713, "lr": 0.00056, "grad_norm": 0.6922, "tokens_per_sec": 139742, "dt_s": 4.69, "eta_s": 19032, "world_size": 1, "timestamp": "2026-05-05T02:45:33.294288"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51380, "epoch": 0, "train_loss": 3.6893255412578583, "train_ppl": 40.017847466223536, "lr": 0.00056, "grad_norm": 0.6581, "tokens_per_sec": 145924, "dt_s": 4.491, "eta_s": 18673, "world_size": 1, "timestamp": "2026-05-05T02:45:37.785382"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51390, "epoch": 0, "train_loss": 3.9147711992263794, "train_ppl": 50.13759867722541, "lr": 0.00056, "grad_norm": 0.7257, "tokens_per_sec": 144564, "dt_s": 4.533, "eta_s": 18688, "world_size": 1, "timestamp": "2026-05-05T02:45:42.318732"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51400, "epoch": 0, "train_loss": 3.8098649233579636, "train_ppl": 45.1443405085323, "lr": 0.00056, "grad_norm": 0.6168, "tokens_per_sec": 144823, "dt_s": 4.525, "eta_s": 18733, "world_size": 1, "timestamp": "2026-05-05T02:45:46.843992"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51410, "epoch": 0, "train_loss": 3.687440812587738, "train_ppl": 39.94249571289775, "lr": 0.00056, "grad_norm": 0.6662, "tokens_per_sec": 145837, "dt_s": 4.494, "eta_s": 18763, "world_size": 1, "timestamp": "2026-05-05T02:45:51.337777"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51420, "epoch": 0, "train_loss": 3.6479484140872955, "train_ppl": 38.395812880852205, "lr": 0.00056, "grad_norm": 0.6267, "tokens_per_sec": 144532, "dt_s": 4.534, "eta_s": 18631, "world_size": 1, "timestamp": "2026-05-05T02:45:55.872116"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51430, "epoch": 0, "train_loss": 3.783039942383766, "train_ppl": 43.94944264073505, "lr": 0.00056, "grad_norm": 0.7239, "tokens_per_sec": 144222, "dt_s": 4.544, "eta_s": 18670, "world_size": 1, "timestamp": "2026-05-05T02:46:00.416251"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51440, "epoch": 0, "train_loss": 3.793130949139595, "train_ppl": 44.39518196017516, "lr": 0.00056, "grad_norm": 0.7981, "tokens_per_sec": 147467, "dt_s": 4.444, "eta_s": 18592, "world_size": 1, "timestamp": "2026-05-05T02:46:04.860373"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51450, "epoch": 0, "train_loss": 3.7941821962594986, "train_ppl": 44.4418768069597, "lr": 0.00056, "grad_norm": 0.6291, "tokens_per_sec": 146327, "dt_s": 4.479, "eta_s": 18549, "world_size": 1, "timestamp": "2026-05-05T02:46:09.339100"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51460, "epoch": 0, "train_loss": 3.7859363853931427, "train_ppl": 44.07692422909121, "lr": 0.00056, "grad_norm": 0.6528, "tokens_per_sec": 148183, "dt_s": 4.423, "eta_s": 18486, "world_size": 1, "timestamp": "2026-05-05T02:46:13.761753"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51470, "epoch": 0, "train_loss": 3.6992472112178802, "train_ppl": 40.41686754076357, "lr": 0.00056, "grad_norm": 0.7132, "tokens_per_sec": 148961, "dt_s": 4.4, "eta_s": 18370, "world_size": 1, "timestamp": "2026-05-05T02:46:18.161323"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51480, "epoch": 0, "train_loss": 3.819339171051979, "train_ppl": 45.57408169520825, "lr": 0.00056, "grad_norm": 0.6036, "tokens_per_sec": 146844, "dt_s": 4.463, "eta_s": 18299, "world_size": 1, "timestamp": "2026-05-05T02:46:22.624269"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51490, "epoch": 0, "train_loss": 3.768548756837845, "train_ppl": 43.31715547281736, "lr": 0.00056, "grad_norm": 0.7152, "tokens_per_sec": 147142, "dt_s": 4.454, "eta_s": 18302, "world_size": 1, "timestamp": "2026-05-05T02:46:27.078202"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51500, "epoch": 0, "train_loss": 3.703788235783577, "train_ppl": 40.60081887679914, "lr": 0.00056, "grad_norm": 0.6555, "tokens_per_sec": 145154, "dt_s": 4.515, "eta_s": 18328, "world_size": 1, "timestamp": "2026-05-05T02:46:31.593086"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51510, "epoch": 0, "train_loss": 3.6844366937875748, "train_ppl": 39.82268376536908, "lr": 0.00056, "grad_norm": 0.682, "tokens_per_sec": 123952, "dt_s": 5.287, "eta_s": 18386, "world_size": 1, "timestamp": "2026-05-05T02:46:36.880325"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51520, "epoch": 0, "train_loss": 3.666007161140442, "train_ppl": 39.09549180114651, "lr": 0.00056, "grad_norm": 0.6978, "tokens_per_sec": 148841, "dt_s": 4.403, "eta_s": 18385, "world_size": 1, "timestamp": "2026-05-05T02:46:41.283407"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51530, "epoch": 0, "train_loss": 3.7414690405130386, "train_ppl": 42.159879265734794, "lr": 0.00056, "grad_norm": 0.6414, "tokens_per_sec": 148117, "dt_s": 4.425, "eta_s": 18349, "world_size": 1, "timestamp": "2026-05-05T02:46:45.708009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51540, "epoch": 0, "train_loss": 3.729938715696335, "train_ppl": 41.67655396717496, "lr": 0.00056, "grad_norm": 0.664, "tokens_per_sec": 147567, "dt_s": 4.441, "eta_s": 18334, "world_size": 1, "timestamp": "2026-05-05T02:46:50.149122"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51550, "epoch": 0, "train_loss": 3.60319447517395, "train_ppl": 36.71533353050181, "lr": 0.00056, "grad_norm": 0.831, "tokens_per_sec": 151613, "dt_s": 4.323, "eta_s": 18171, "world_size": 1, "timestamp": "2026-05-05T02:46:54.471693"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51560, "epoch": 0, "train_loss": 3.815179616212845, "train_ppl": 45.38490751609586, "lr": 0.00056, "grad_norm": 0.7089, "tokens_per_sec": 150141, "dt_s": 4.365, "eta_s": 18056, "world_size": 1, "timestamp": "2026-05-05T02:46:58.836672"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51570, "epoch": 0, "train_loss": 3.7081216126680374, "train_ppl": 40.77713928228047, "lr": 0.00056, "grad_norm": 0.6748, "tokens_per_sec": 151436, "dt_s": 4.328, "eta_s": 17990, "world_size": 1, "timestamp": "2026-05-05T02:47:03.164304"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51580, "epoch": 0, "train_loss": 3.8203818053007126, "train_ppl": 45.62162357372681, "lr": 0.00056, "grad_norm": 0.7692, "tokens_per_sec": 148814, "dt_s": 4.404, "eta_s": 17968, "world_size": 1, "timestamp": "2026-05-05T02:47:07.568207"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51590, "epoch": 0, "train_loss": 3.73677758872509, "train_ppl": 41.96255146363619, "lr": 0.00056, "grad_norm": 0.7229, "tokens_per_sec": 148519, "dt_s": 4.413, "eta_s": 17941, "world_size": 1, "timestamp": "2026-05-05T02:47:11.980815"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51600, "epoch": 0, "train_loss": 3.87017260491848, "train_ppl": 47.95066188666231, "lr": 0.00056, "grad_norm": 0.6789, "tokens_per_sec": 149515, "dt_s": 4.383, "eta_s": 17986, "world_size": 1, "timestamp": "2026-05-05T02:47:16.364048"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51610, "epoch": 0, "train_loss": 3.682301387190819, "train_ppl": 39.73774084788539, "lr": 0.00056, "grad_norm": 0.6345, "tokens_per_sec": 149456, "dt_s": 4.385, "eta_s": 17998, "world_size": 1, "timestamp": "2026-05-05T02:47:20.749016"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51620, "epoch": 0, "train_loss": 3.80068039894104, "train_ppl": 44.73160948128435, "lr": 0.00056, "grad_norm": 0.6947, "tokens_per_sec": 136158, "dt_s": 4.813, "eta_s": 18392, "world_size": 1, "timestamp": "2026-05-05T02:47:25.562261"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51630, "epoch": 0, "train_loss": 3.712259352207184, "train_ppl": 40.94621401616203, "lr": 0.00056, "grad_norm": 0.6275, "tokens_per_sec": 150792, "dt_s": 4.346, "eta_s": 18341, "world_size": 1, "timestamp": "2026-05-05T02:47:29.908377"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51640, "epoch": 0, "train_loss": 3.7416651844978333, "train_ppl": 42.16814948350266, "lr": 0.00056, "grad_norm": 0.6501, "tokens_per_sec": 150299, "dt_s": 4.36, "eta_s": 18293, "world_size": 1, "timestamp": "2026-05-05T02:47:34.268739"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51650, "epoch": 0, "train_loss": 3.728803649544716, "train_ppl": 41.6292751588143, "lr": 0.00056, "grad_norm": 0.6287, "tokens_per_sec": 150060, "dt_s": 4.367, "eta_s": 18276, "world_size": 1, "timestamp": "2026-05-05T02:47:38.636070"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51660, "epoch": 0, "train_loss": 3.735371008515358, "train_ppl": 41.90356926052192, "lr": 0.00056, "grad_norm": 0.6474, "tokens_per_sec": 150554, "dt_s": 4.353, "eta_s": 18245, "world_size": 1, "timestamp": "2026-05-05T02:47:42.989065"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51670, "epoch": 0, "train_loss": 4.011533036828041, "train_ppl": 55.231477583257146, "lr": 0.00056, "grad_norm": 1.6163, "tokens_per_sec": 149570, "dt_s": 4.382, "eta_s": 17887, "world_size": 1, "timestamp": "2026-05-05T02:47:47.370697"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51680, "epoch": 0, "train_loss": 3.8021290004253387, "train_ppl": 44.796454713284156, "lr": 0.00056, "grad_norm": 0.6345, "tokens_per_sec": 150916, "dt_s": 4.343, "eta_s": 17879, "world_size": 1, "timestamp": "2026-05-05T02:47:51.713259"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51690, "epoch": 0, "train_loss": 3.664427489042282, "train_ppl": 39.03378249664992, "lr": 0.00056, "grad_norm": 0.7192, "tokens_per_sec": 150439, "dt_s": 4.356, "eta_s": 17872, "world_size": 1, "timestamp": "2026-05-05T02:47:56.069544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51700, "epoch": 0, "train_loss": 3.795816197991371, "train_ppl": 44.51455427201132, "lr": 0.00056, "grad_norm": 0.6643, "tokens_per_sec": 153078, "dt_s": 4.281, "eta_s": 17797, "world_size": 1, "timestamp": "2026-05-05T02:48:00.350778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51710, "epoch": 0, "train_loss": 3.7115463465452194, "train_ppl": 40.91702953931751, "lr": 0.00056, "grad_norm": 0.6569, "tokens_per_sec": 150465, "dt_s": 4.356, "eta_s": 17794, "world_size": 1, "timestamp": "2026-05-05T02:48:04.706331"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51720, "epoch": 0, "train_loss": 3.732715606689453, "train_ppl": 41.79244604987943, "lr": 0.00056, "grad_norm": 0.6876, "tokens_per_sec": 149566, "dt_s": 4.382, "eta_s": 17790, "world_size": 1, "timestamp": "2026-05-05T02:48:09.088046"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51730, "epoch": 0, "train_loss": 3.723161995410919, "train_ppl": 41.39507843615442, "lr": 0.00056, "grad_norm": 0.6798, "tokens_per_sec": 148792, "dt_s": 4.405, "eta_s": 17837, "world_size": 1, "timestamp": "2026-05-05T02:48:13.492600"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51740, "epoch": 0, "train_loss": 3.7127917259931564, "train_ppl": 40.968018510694726, "lr": 0.00056, "grad_norm": 0.7074, "tokens_per_sec": 148795, "dt_s": 4.404, "eta_s": 17872, "world_size": 1, "timestamp": "2026-05-05T02:48:17.897051"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51750, "epoch": 0, "train_loss": 3.7209569215774536, "train_ppl": 41.30389979662191, "lr": 0.00056, "grad_norm": 0.6884, "tokens_per_sec": 146945, "dt_s": 4.46, "eta_s": 18014, "world_size": 1, "timestamp": "2026-05-05T02:48:22.356964"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51760, "epoch": 0, "train_loss": 3.5988272428512573, "train_ppl": 36.55533876062869, "lr": 0.00056, "grad_norm": 0.7368, "tokens_per_sec": 148053, "dt_s": 4.427, "eta_s": 18067, "world_size": 1, "timestamp": "2026-05-05T02:48:26.783494"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51770, "epoch": 0, "train_loss": 3.6749473065137863, "train_ppl": 39.446578224709945, "lr": 0.00056, "grad_norm": 0.6428, "tokens_per_sec": 147724, "dt_s": 4.436, "eta_s": 18108, "world_size": 1, "timestamp": "2026-05-05T02:48:31.219872"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51780, "epoch": 0, "train_loss": 3.7422617226839066, "train_ppl": 42.193311899330375, "lr": 0.00056, "grad_norm": 0.6519, "tokens_per_sec": 146566, "dt_s": 4.471, "eta_s": 18158, "world_size": 1, "timestamp": "2026-05-05T02:48:35.691349"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51790, "epoch": 0, "train_loss": 3.8849470913410187, "train_ppl": 48.66436762494428, "lr": 0.00056, "grad_norm": 0.6618, "tokens_per_sec": 149609, "dt_s": 4.38, "eta_s": 18134, "world_size": 1, "timestamp": "2026-05-05T02:48:40.071816"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51800, "epoch": 0, "train_loss": 3.8421716392040253, "train_ppl": 46.62662077098697, "lr": 0.00056, "grad_norm": 0.7936, "tokens_per_sec": 146312, "dt_s": 4.479, "eta_s": 18145, "world_size": 1, "timestamp": "2026-05-05T02:48:44.550976"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51810, "epoch": 0, "train_loss": 3.6836378425359726, "train_ppl": 39.790884067906134, "lr": 0.00056, "grad_norm": 0.5996, "tokens_per_sec": 148120, "dt_s": 4.425, "eta_s": 18139, "world_size": 1, "timestamp": "2026-05-05T02:48:48.975509"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51820, "epoch": 0, "train_loss": 3.7954697012901306, "train_ppl": 44.49913279769815, "lr": 0.00056, "grad_norm": 0.6652, "tokens_per_sec": 146990, "dt_s": 4.459, "eta_s": 18153, "world_size": 1, "timestamp": "2026-05-05T02:48:53.434026"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51830, "epoch": 0, "train_loss": 3.7993360608816147, "train_ppl": 44.67151547857718, "lr": 0.00056, "grad_norm": 0.6668, "tokens_per_sec": 146817, "dt_s": 4.464, "eta_s": 18142, "world_size": 1, "timestamp": "2026-05-05T02:48:57.897811"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51840, "epoch": 0, "train_loss": 3.7792764455080032, "train_ppl": 43.784349908519225, "lr": 0.00056, "grad_norm": 0.6922, "tokens_per_sec": 149136, "dt_s": 4.394, "eta_s": 18149, "world_size": 1, "timestamp": "2026-05-05T02:49:02.292191"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51850, "epoch": 0, "train_loss": 3.633486196398735, "train_ppl": 37.84452034179482, "lr": 0.00056, "grad_norm": 0.6796, "tokens_per_sec": 148184, "dt_s": 4.423, "eta_s": 18098, "world_size": 1, "timestamp": "2026-05-05T02:49:06.714788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51860, "epoch": 0, "train_loss": 3.8050127774477005, "train_ppl": 44.92582414671451, "lr": 0.00056, "grad_norm": 0.6634, "tokens_per_sec": 147663, "dt_s": 4.438, "eta_s": 18105, "world_size": 1, "timestamp": "2026-05-05T02:49:11.153016"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51870, "epoch": 0, "train_loss": 3.555757775902748, "train_ppl": 35.0143429425212, "lr": 0.00056, "grad_norm": 0.6559, "tokens_per_sec": 150438, "dt_s": 4.356, "eta_s": 18017, "world_size": 1, "timestamp": "2026-05-05T02:49:15.509392"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51880, "epoch": 0, "train_loss": 3.7626891285181046, "train_ppl": 43.06407524404662, "lr": 0.00056, "grad_norm": 0.6904, "tokens_per_sec": 148336, "dt_s": 4.418, "eta_s": 17975, "world_size": 1, "timestamp": "2026-05-05T02:49:19.927439"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51890, "epoch": 0, "train_loss": 3.7710523307323456, "train_ppl": 43.4257390392185, "lr": 0.00056, "grad_norm": 0.6922, "tokens_per_sec": 147602, "dt_s": 4.44, "eta_s": 18008, "world_size": 1, "timestamp": "2026-05-05T02:49:24.367481"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51900, "epoch": 0, "train_loss": 3.726356729865074, "train_ppl": 41.527536190483666, "lr": 0.00056, "grad_norm": 0.6736, "tokens_per_sec": 149967, "dt_s": 4.37, "eta_s": 17961, "world_size": 1, "timestamp": "2026-05-05T02:49:28.737513"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51910, "epoch": 0, "train_loss": 3.761152908205986, "train_ppl": 42.99797012594722, "lr": 0.00056, "grad_norm": 0.6785, "tokens_per_sec": 147638, "dt_s": 4.439, "eta_s": 17957, "world_size": 1, "timestamp": "2026-05-05T02:49:33.176471"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51920, "epoch": 0, "train_loss": 3.642370715737343, "train_ppl": 38.182248770472086, "lr": 0.00056, "grad_norm": 0.679, "tokens_per_sec": 134645, "dt_s": 4.867, "eta_s": 18369, "world_size": 1, "timestamp": "2026-05-05T02:49:38.043814"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51930, "epoch": 0, "train_loss": 3.6657374799251556, "train_ppl": 39.08494990294521, "lr": 0.00056, "grad_norm": 0.7368, "tokens_per_sec": 148393, "dt_s": 4.416, "eta_s": 18364, "world_size": 1, "timestamp": "2026-05-05T02:49:42.460201"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51940, "epoch": 0, "train_loss": 3.796537607908249, "train_ppl": 44.54667909909973, "lr": 0.00056, "grad_norm": 0.6377, "tokens_per_sec": 147117, "dt_s": 4.455, "eta_s": 18371, "world_size": 1, "timestamp": "2026-05-05T02:49:46.914885"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51950, "epoch": 0, "train_loss": 3.6906700134277344, "train_ppl": 40.07168653289363, "lr": 0.00056, "grad_norm": 0.6563, "tokens_per_sec": 149080, "dt_s": 4.396, "eta_s": 18388, "world_size": 1, "timestamp": "2026-05-05T02:49:51.310911"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51960, "epoch": 0, "train_loss": 3.852829709649086, "train_ppl": 47.12622827639469, "lr": 0.00056, "grad_norm": 0.6136, "tokens_per_sec": 147536, "dt_s": 4.442, "eta_s": 18386, "world_size": 1, "timestamp": "2026-05-05T02:49:55.752964"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51970, "epoch": 0, "train_loss": 3.9973467141389847, "train_ppl": 54.453477547228005, "lr": 0.00056, "grad_norm": 0.7836, "tokens_per_sec": 146646, "dt_s": 4.469, "eta_s": 18057, "world_size": 1, "timestamp": "2026-05-05T02:50:00.221932"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51980, "epoch": 0, "train_loss": 3.711812660098076, "train_ppl": 40.92792774993264, "lr": 0.00056, "grad_norm": 0.672, "tokens_per_sec": 148784, "dt_s": 4.405, "eta_s": 18043, "world_size": 1, "timestamp": "2026-05-05T02:50:04.626736"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 51990, "epoch": 0, "train_loss": 3.656388610601425, "train_ppl": 38.721252542293485, "lr": 0.00056, "grad_norm": 0.7375, "tokens_per_sec": 147036, "dt_s": 4.457, "eta_s": 18040, "world_size": 1, "timestamp": "2026-05-05T02:50:09.083854"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52000, "epoch": 0, "train_loss": 3.623407617211342, "train_ppl": 37.46501698645007, "lr": 0.00056, "grad_norm": 0.6638, "tokens_per_sec": 151740, "dt_s": 4.319, "eta_s": 17973, "world_size": 1, "timestamp": "2026-05-05T02:50:13.402839"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52010, "epoch": 0, "train_loss": 3.5838783532381058, "train_ppl": 36.01294125765219, "lr": 0.00056, "grad_norm": 0.8541, "tokens_per_sec": 127815, "dt_s": 5.127, "eta_s": 17895, "world_size": 1, "timestamp": "2026-05-05T02:50:18.530253"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52020, "epoch": 0, "train_loss": 3.746071770787239, "train_ppl": 42.35437708551791, "lr": 0.00056, "grad_norm": 0.6416, "tokens_per_sec": 148231, "dt_s": 4.421, "eta_s": 17852, "world_size": 1, "timestamp": "2026-05-05T02:50:22.951447"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52030, "epoch": 0, "train_loss": 3.6473828852176666, "train_ppl": 38.3741050789681, "lr": 0.00056, "grad_norm": 0.6685, "tokens_per_sec": 152149, "dt_s": 4.307, "eta_s": 17768, "world_size": 1, "timestamp": "2026-05-05T02:50:27.258843"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52040, "epoch": 0, "train_loss": 3.7290345430374146, "train_ppl": 41.63888819730554, "lr": 0.00056, "grad_norm": 0.6599, "tokens_per_sec": 149277, "dt_s": 4.39, "eta_s": 17710, "world_size": 1, "timestamp": "2026-05-05T02:50:31.649043"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52050, "epoch": 0, "train_loss": 3.806146264076233, "train_ppl": 44.97677583873622, "lr": 0.00056, "grad_norm": 0.6307, "tokens_per_sec": 147597, "dt_s": 4.44, "eta_s": 17804, "world_size": 1, "timestamp": "2026-05-05T02:50:36.089258"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52060, "epoch": 0, "train_loss": 3.8443887382745743, "train_ppl": 46.73011129053597, "lr": 0.00056, "grad_norm": 0.8553, "tokens_per_sec": 151642, "dt_s": 4.322, "eta_s": 17775, "world_size": 1, "timestamp": "2026-05-05T02:50:40.410973"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52070, "epoch": 0, "train_loss": 3.7139338701963425, "train_ppl": 41.0148366269833, "lr": 0.00056, "grad_norm": 0.6993, "tokens_per_sec": 149402, "dt_s": 4.387, "eta_s": 17743, "world_size": 1, "timestamp": "2026-05-05T02:50:44.797552"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52080, "epoch": 0, "train_loss": 3.8052008002996445, "train_ppl": 44.93427202246847, "lr": 0.00056, "grad_norm": 0.7468, "tokens_per_sec": 149632, "dt_s": 4.38, "eta_s": 17797, "world_size": 1, "timestamp": "2026-05-05T02:50:49.177361"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52090, "epoch": 0, "train_loss": 3.6511832773685455, "train_ppl": 38.52021919631933, "lr": 0.00056, "grad_norm": 0.6668, "tokens_per_sec": 152130, "dt_s": 4.308, "eta_s": 17726, "world_size": 1, "timestamp": "2026-05-05T02:50:53.485252"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52100, "epoch": 0, "train_loss": 3.759618043899536, "train_ppl": 42.93202469793915, "lr": 0.00056, "grad_norm": 0.6485, "tokens_per_sec": 147204, "dt_s": 4.452, "eta_s": 17731, "world_size": 1, "timestamp": "2026-05-05T02:50:57.937301"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52110, "epoch": 0, "train_loss": 3.724934220314026, "train_ppl": 41.4685048698842, "lr": 0.00056, "grad_norm": 0.6508, "tokens_per_sec": 151784, "dt_s": 4.318, "eta_s": 17723, "world_size": 1, "timestamp": "2026-05-05T02:51:02.255035"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52120, "epoch": 0, "train_loss": 3.6724900603294373, "train_ppl": 39.34976726377635, "lr": 0.00056, "grad_norm": 0.6609, "tokens_per_sec": 151344, "dt_s": 4.33, "eta_s": 17673, "world_size": 1, "timestamp": "2026-05-05T02:51:06.585312"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52130, "epoch": 0, "train_loss": 3.7599858045578003, "train_ppl": 42.94781631119174, "lr": 0.00056, "grad_norm": 0.6548, "tokens_per_sec": 147282, "dt_s": 4.45, "eta_s": 17726, "world_size": 1, "timestamp": "2026-05-05T02:51:11.034981"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52140, "epoch": 0, "train_loss": 3.812329202890396, "train_ppl": 45.2557259689405, "lr": 0.00056, "grad_norm": 0.6788, "tokens_per_sec": 152458, "dt_s": 4.299, "eta_s": 17714, "world_size": 1, "timestamp": "2026-05-05T02:51:15.333624"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52150, "epoch": 0, "train_loss": 3.6859941631555557, "train_ppl": 39.884754699726706, "lr": 0.00056, "grad_norm": 0.6822, "tokens_per_sec": 151223, "dt_s": 4.334, "eta_s": 17614, "world_size": 1, "timestamp": "2026-05-05T02:51:19.667366"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52160, "epoch": 0, "train_loss": 3.732750028371811, "train_ppl": 41.793884640941535, "lr": 0.00056, "grad_norm": 0.6208, "tokens_per_sec": 149805, "dt_s": 4.375, "eta_s": 17656, "world_size": 1, "timestamp": "2026-05-05T02:51:24.042102"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52170, "epoch": 0, "train_loss": 3.766016498208046, "train_ppl": 43.20760399692814, "lr": 0.00056, "grad_norm": 0.6788, "tokens_per_sec": 150937, "dt_s": 4.342, "eta_s": 17661, "world_size": 1, "timestamp": "2026-05-05T02:51:28.384033"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52180, "epoch": 0, "train_loss": 3.775979697704315, "train_ppl": 43.64024162397829, "lr": 0.00056, "grad_norm": 0.6496, "tokens_per_sec": 149117, "dt_s": 4.395, "eta_s": 17612, "world_size": 1, "timestamp": "2026-05-05T02:51:32.779022"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52190, "epoch": 0, "train_loss": 3.6838489770889282, "train_ppl": 39.79928618538296, "lr": 0.00056, "grad_norm": 0.6458, "tokens_per_sec": 148556, "dt_s": 4.412, "eta_s": 17699, "world_size": 1, "timestamp": "2026-05-05T02:51:37.190502"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52200, "epoch": 0, "train_loss": 3.6569522321224213, "train_ppl": 38.74308282497691, "lr": 0.00056, "grad_norm": 0.6546, "tokens_per_sec": 151193, "dt_s": 4.335, "eta_s": 17695, "world_size": 1, "timestamp": "2026-05-05T02:51:41.525094"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52210, "epoch": 0, "train_loss": 3.621885061264038, "train_ppl": 37.40801780525385, "lr": 0.00056, "grad_norm": 0.6406, "tokens_per_sec": 132957, "dt_s": 4.929, "eta_s": 18140, "world_size": 1, "timestamp": "2026-05-05T02:51:46.454224"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52220, "epoch": 0, "train_loss": 3.7518291771411896, "train_ppl": 42.59893177007944, "lr": 0.00056, "grad_norm": 0.637, "tokens_per_sec": 152575, "dt_s": 4.295, "eta_s": 18097, "world_size": 1, "timestamp": "2026-05-05T02:51:50.749544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52230, "epoch": 0, "train_loss": 3.7351245433092117, "train_ppl": 41.89324276129952, "lr": 0.00056, "grad_norm": 0.682, "tokens_per_sec": 151566, "dt_s": 4.324, "eta_s": 18036, "world_size": 1, "timestamp": "2026-05-05T02:51:55.073481"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52240, "epoch": 0, "train_loss": 3.682996079325676, "train_ppl": 39.76535593479067, "lr": 0.00056, "grad_norm": 0.7231, "tokens_per_sec": 150670, "dt_s": 4.35, "eta_s": 17981, "world_size": 1, "timestamp": "2026-05-05T02:51:59.423114"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52250, "epoch": 0, "train_loss": 3.6671159863471985, "train_ppl": 39.13886591063511, "lr": 0.00056, "grad_norm": 0.6369, "tokens_per_sec": 153017, "dt_s": 4.283, "eta_s": 17935, "world_size": 1, "timestamp": "2026-05-05T02:52:03.706033"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52260, "epoch": 0, "train_loss": 3.7671071141958237, "train_ppl": 43.25475260648106, "lr": 0.00056, "grad_norm": 0.6526, "tokens_per_sec": 151149, "dt_s": 4.336, "eta_s": 17451, "world_size": 1, "timestamp": "2026-05-05T02:52:08.041889"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52270, "epoch": 0, "train_loss": 3.7057767510414124, "train_ppl": 40.68163454958541, "lr": 0.00056, "grad_norm": 0.6299, "tokens_per_sec": 145395, "dt_s": 4.507, "eta_s": 17618, "world_size": 1, "timestamp": "2026-05-05T02:52:12.549363"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52280, "epoch": 0, "train_loss": 3.7024177461862564, "train_ppl": 40.54521398854953, "lr": 0.00056, "grad_norm": 0.6619, "tokens_per_sec": 150531, "dt_s": 4.354, "eta_s": 17638, "world_size": 1, "timestamp": "2026-05-05T02:52:16.902989"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52290, "epoch": 0, "train_loss": 3.763704925775528, "train_ppl": 43.107841838803104, "lr": 0.00056, "grad_norm": 0.6453, "tokens_per_sec": 148758, "dt_s": 4.406, "eta_s": 17678, "world_size": 1, "timestamp": "2026-05-05T02:52:21.308548"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52300, "epoch": 0, "train_loss": 3.738285318017006, "train_ppl": 42.02586735125626, "lr": 0.00056, "grad_norm": 0.6655, "tokens_per_sec": 150052, "dt_s": 4.368, "eta_s": 17742, "world_size": 1, "timestamp": "2026-05-05T02:52:25.676092"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52310, "epoch": 0, "train_loss": 3.7316156774759293, "train_ppl": 41.74650258947372, "lr": 0.00056, "grad_norm": 0.741, "tokens_per_sec": 152266, "dt_s": 4.304, "eta_s": 17712, "world_size": 1, "timestamp": "2026-05-05T02:52:29.980171"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52320, "epoch": 0, "train_loss": 3.8056318908929825, "train_ppl": 44.95364694032821, "lr": 0.00056, "grad_norm": 0.7001, "tokens_per_sec": 147052, "dt_s": 4.457, "eta_s": 17667, "world_size": 1, "timestamp": "2026-05-05T02:52:34.436796"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52330, "epoch": 0, "train_loss": 3.797919660806656, "train_ppl": 44.60828752931144, "lr": 0.00056, "grad_norm": 0.678, "tokens_per_sec": 153003, "dt_s": 4.283, "eta_s": 17606, "world_size": 1, "timestamp": "2026-05-05T02:52:38.720119"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52340, "epoch": 0, "train_loss": 3.796700656414032, "train_ppl": 44.55394296072923, "lr": 0.00056, "grad_norm": 0.7412, "tokens_per_sec": 150641, "dt_s": 4.35, "eta_s": 17557, "world_size": 1, "timestamp": "2026-05-05T02:52:43.070599"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52350, "epoch": 0, "train_loss": 3.689513400197029, "train_ppl": 40.025365882775205, "lr": 0.00056, "grad_norm": 0.7203, "tokens_per_sec": 149285, "dt_s": 4.39, "eta_s": 17571, "world_size": 1, "timestamp": "2026-05-05T02:52:47.460638"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52360, "epoch": 0, "train_loss": 3.8198606371879578, "train_ppl": 45.597853232977926, "lr": 0.00056, "grad_norm": 0.655, "tokens_per_sec": 151598, "dt_s": 4.323, "eta_s": 17582, "world_size": 1, "timestamp": "2026-05-05T02:52:51.783636"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52370, "epoch": 0, "train_loss": 3.802254930138588, "train_ppl": 44.802096273193335, "lr": 0.00056, "grad_norm": 0.6851, "tokens_per_sec": 151597, "dt_s": 4.323, "eta_s": 17470, "world_size": 1, "timestamp": "2026-05-05T02:52:56.106679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52380, "epoch": 0, "train_loss": 3.646474689245224, "train_ppl": 38.339269692362606, "lr": 0.00056, "grad_norm": 0.6457, "tokens_per_sec": 146395, "dt_s": 4.477, "eta_s": 17621, "world_size": 1, "timestamp": "2026-05-05T02:53:00.583372"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52390, "epoch": 0, "train_loss": 3.654597282409668, "train_ppl": 38.65195215939037, "lr": 0.00056, "grad_norm": 0.6268, "tokens_per_sec": 152089, "dt_s": 4.309, "eta_s": 17583, "world_size": 1, "timestamp": "2026-05-05T02:53:04.892408"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52400, "epoch": 0, "train_loss": 3.7558972537517548, "train_ppl": 42.77258045664898, "lr": 0.00056, "grad_norm": 0.667, "tokens_per_sec": 150911, "dt_s": 4.343, "eta_s": 17541, "world_size": 1, "timestamp": "2026-05-05T02:53:09.235087"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52410, "epoch": 0, "train_loss": 3.7340309768915176, "train_ppl": 41.847454758561625, "lr": 0.00056, "grad_norm": 0.6201, "tokens_per_sec": 149407, "dt_s": 4.386, "eta_s": 17588, "world_size": 1, "timestamp": "2026-05-05T02:53:13.621496"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52420, "epoch": 0, "train_loss": 3.7400382459163666, "train_ppl": 42.099600271997765, "lr": 0.00056, "grad_norm": 0.637, "tokens_per_sec": 150776, "dt_s": 4.347, "eta_s": 17602, "world_size": 1, "timestamp": "2026-05-05T02:53:17.968072"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52430, "epoch": 0, "train_loss": 3.6865668296813965, "train_ppl": 39.90760190492504, "lr": 0.00056, "grad_norm": 0.764, "tokens_per_sec": 150202, "dt_s": 4.363, "eta_s": 17506, "world_size": 1, "timestamp": "2026-05-05T02:53:22.331263"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52440, "epoch": 0, "train_loss": 3.7326804846525192, "train_ppl": 41.7909782398221, "lr": 0.00056, "grad_norm": 0.6248, "tokens_per_sec": 150631, "dt_s": 4.351, "eta_s": 17536, "world_size": 1, "timestamp": "2026-05-05T02:53:26.682023"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52450, "epoch": 0, "train_loss": 3.6623268872499466, "train_ppl": 38.95187412173399, "lr": 0.00056, "grad_norm": 0.5934, "tokens_per_sec": 153065, "dt_s": 4.282, "eta_s": 17482, "world_size": 1, "timestamp": "2026-05-05T02:53:30.963595"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52460, "epoch": 0, "train_loss": 3.690640315413475, "train_ppl": 40.07049650104646, "lr": 0.00056, "grad_norm": 0.6524, "tokens_per_sec": 147222, "dt_s": 4.452, "eta_s": 17530, "world_size": 1, "timestamp": "2026-05-05T02:53:35.415150"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52470, "epoch": 0, "train_loss": 3.683051362633705, "train_ppl": 39.767554355979144, "lr": 0.00056, "grad_norm": 0.6715, "tokens_per_sec": 150541, "dt_s": 4.353, "eta_s": 17531, "world_size": 1, "timestamp": "2026-05-05T02:53:39.768505"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52480, "epoch": 0, "train_loss": 3.6934851706027985, "train_ppl": 40.18465356412155, "lr": 0.00056, "grad_norm": 0.7104, "tokens_per_sec": 149579, "dt_s": 4.381, "eta_s": 17541, "world_size": 1, "timestamp": "2026-05-05T02:53:44.149891"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52490, "epoch": 0, "train_loss": 3.7487847954034805, "train_ppl": 42.46944156896682, "lr": 0.00056, "grad_norm": 0.642, "tokens_per_sec": 145427, "dt_s": 4.506, "eta_s": 17662, "world_size": 1, "timestamp": "2026-05-05T02:53:48.656350"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52500, "epoch": 0, "train_loss": 3.739958956837654, "train_ppl": 42.09626236580954, "lr": 0.00056, "grad_norm": 0.697, "tokens_per_sec": 149211, "dt_s": 4.392, "eta_s": 17747, "world_size": 1, "timestamp": "2026-05-05T02:53:53.048487"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52510, "epoch": 0, "train_loss": 3.69361712038517, "train_ppl": 40.18995627025177, "lr": 0.00056, "grad_norm": 0.6595, "tokens_per_sec": 114464, "dt_s": 5.725, "eta_s": 18146, "world_size": 1, "timestamp": "2026-05-05T02:53:58.773960"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52520, "epoch": 0, "train_loss": 3.713501200079918, "train_ppl": 40.99709457135019, "lr": 0.00056, "grad_norm": 0.6575, "tokens_per_sec": 148792, "dt_s": 4.405, "eta_s": 18183, "world_size": 1, "timestamp": "2026-05-05T02:54:03.178480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52530, "epoch": 0, "train_loss": 3.698116675019264, "train_ppl": 40.37120062788706, "lr": 0.00056, "grad_norm": 0.6271, "tokens_per_sec": 147886, "dt_s": 4.432, "eta_s": 18219, "world_size": 1, "timestamp": "2026-05-05T02:54:07.610008"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52540, "epoch": 0, "train_loss": 3.7172555923461914, "train_ppl": 41.15130304456157, "lr": 0.00056, "grad_norm": 0.647, "tokens_per_sec": 147617, "dt_s": 4.44, "eta_s": 18160, "world_size": 1, "timestamp": "2026-05-05T02:54:12.049613"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52550, "epoch": 0, "train_loss": 3.6577820628881454, "train_ppl": 38.77524637036812, "lr": 0.00056, "grad_norm": 0.6957, "tokens_per_sec": 150792, "dt_s": 4.346, "eta_s": 18119, "world_size": 1, "timestamp": "2026-05-05T02:54:16.395722"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52560, "epoch": 0, "train_loss": 3.803106352686882, "train_ppl": 44.84025803175668, "lr": 0.00056, "grad_norm": 0.6836, "tokens_per_sec": 149375, "dt_s": 4.387, "eta_s": 17659, "world_size": 1, "timestamp": "2026-05-05T02:54:20.783074"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52570, "epoch": 0, "train_loss": 3.784236714243889, "train_ppl": 44.002071583092636, "lr": 0.00056, "grad_norm": 0.6662, "tokens_per_sec": 147832, "dt_s": 4.433, "eta_s": 17678, "world_size": 1, "timestamp": "2026-05-05T02:54:25.216236"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52580, "epoch": 0, "train_loss": 3.58796064555645, "train_ppl": 36.16025709968826, "lr": 0.00056, "grad_norm": 0.6658, "tokens_per_sec": 149931, "dt_s": 4.371, "eta_s": 17625, "world_size": 1, "timestamp": "2026-05-05T02:54:29.587319"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52590, "epoch": 0, "train_loss": 3.760894760489464, "train_ppl": 42.98687173071844, "lr": 0.00056, "grad_norm": 0.7139, "tokens_per_sec": 150832, "dt_s": 4.345, "eta_s": 17545, "world_size": 1, "timestamp": "2026-05-05T02:54:33.932282"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52600, "epoch": 0, "train_loss": 3.7293530851602554, "train_ppl": 41.6521540498989, "lr": 0.00056, "grad_norm": 0.6633, "tokens_per_sec": 149962, "dt_s": 4.37, "eta_s": 17560, "world_size": 1, "timestamp": "2026-05-05T02:54:38.302462"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52610, "epoch": 0, "train_loss": 3.7317486703395844, "train_ppl": 41.752054945604364, "lr": 0.00056, "grad_norm": 0.6766, "tokens_per_sec": 152784, "dt_s": 4.289, "eta_s": 17477, "world_size": 1, "timestamp": "2026-05-05T02:54:42.591920"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52620, "epoch": 0, "train_loss": 3.6635993123054504, "train_ppl": 39.00146900852215, "lr": 0.00056, "grad_norm": 0.6817, "tokens_per_sec": 150944, "dt_s": 4.342, "eta_s": 17399, "world_size": 1, "timestamp": "2026-05-05T02:54:46.933655"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52630, "epoch": 0, "train_loss": 3.714688241481781, "train_ppl": 41.04578871522711, "lr": 0.00056, "grad_norm": 0.6674, "tokens_per_sec": 149849, "dt_s": 4.373, "eta_s": 17397, "world_size": 1, "timestamp": "2026-05-05T02:54:51.307116"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52640, "epoch": 0, "train_loss": 3.7204084992408752, "train_ppl": 41.281254025676546, "lr": 0.00056, "grad_norm": 0.623, "tokens_per_sec": 149837, "dt_s": 4.374, "eta_s": 17416, "world_size": 1, "timestamp": "2026-05-05T02:54:55.680931"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52650, "epoch": 0, "train_loss": 3.791088044643402, "train_ppl": 44.304579421017216, "lr": 0.00056, "grad_norm": 0.6175, "tokens_per_sec": 150004, "dt_s": 4.369, "eta_s": 17410, "world_size": 1, "timestamp": "2026-05-05T02:55:00.049872"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52660, "epoch": 0, "train_loss": 3.822075128555298, "train_ppl": 45.69894117320605, "lr": 0.00056, "grad_norm": 0.6427, "tokens_per_sec": 150444, "dt_s": 4.356, "eta_s": 17459, "world_size": 1, "timestamp": "2026-05-05T02:55:04.406046"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52670, "epoch": 0, "train_loss": 3.7727472335100174, "train_ppl": 43.499403854653956, "lr": 0.00056, "grad_norm": 0.7268, "tokens_per_sec": 152206, "dt_s": 4.306, "eta_s": 17426, "world_size": 1, "timestamp": "2026-05-05T02:55:08.711807"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52680, "epoch": 0, "train_loss": 3.799891248345375, "train_ppl": 44.69632342985262, "lr": 0.00056, "grad_norm": 0.6483, "tokens_per_sec": 149849, "dt_s": 4.373, "eta_s": 17422, "world_size": 1, "timestamp": "2026-05-05T02:55:13.085294"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52690, "epoch": 0, "train_loss": 3.6469483226537704, "train_ppl": 38.35743275232464, "lr": 0.00056, "grad_norm": 0.6477, "tokens_per_sec": 151575, "dt_s": 4.324, "eta_s": 17377, "world_size": 1, "timestamp": "2026-05-05T02:55:17.408940"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52700, "epoch": 0, "train_loss": 3.7276186496019363, "train_ppl": 41.57997368702298, "lr": 0.00056, "grad_norm": 0.6566, "tokens_per_sec": 150669, "dt_s": 4.35, "eta_s": 17358, "world_size": 1, "timestamp": "2026-05-05T02:55:21.758594"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52710, "epoch": 0, "train_loss": 3.8178912103176117, "train_ppl": 45.508139966446805, "lr": 0.00056, "grad_norm": 0.679, "tokens_per_sec": 147999, "dt_s": 4.428, "eta_s": 17411, "world_size": 1, "timestamp": "2026-05-05T02:55:26.186750"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52720, "epoch": 0, "train_loss": 3.768644079566002, "train_ppl": 43.321284779058274, "lr": 0.00056, "grad_norm": 0.7002, "tokens_per_sec": 150057, "dt_s": 4.367, "eta_s": 17456, "world_size": 1, "timestamp": "2026-05-05T02:55:30.554140"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52730, "epoch": 0, "train_loss": 3.8321230858564377, "train_ppl": 46.160436844455425, "lr": 0.00056, "grad_norm": 0.7036, "tokens_per_sec": 150291, "dt_s": 4.361, "eta_s": 17441, "world_size": 1, "timestamp": "2026-05-05T02:55:34.914766"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52740, "epoch": 0, "train_loss": 3.7377365827560425, "train_ppl": 42.00281260202347, "lr": 0.00056, "grad_norm": 0.6632, "tokens_per_sec": 148634, "dt_s": 4.409, "eta_s": 17505, "world_size": 1, "timestamp": "2026-05-05T02:55:39.323979"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52750, "epoch": 0, "train_loss": 3.7425853610038757, "train_ppl": 42.20696948184673, "lr": 0.00056, "grad_norm": 0.6903, "tokens_per_sec": 151030, "dt_s": 4.339, "eta_s": 17492, "world_size": 1, "timestamp": "2026-05-05T02:55:43.663299"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52760, "epoch": 0, "train_loss": 3.6899158358573914, "train_ppl": 40.041476758903585, "lr": 0.00056, "grad_norm": 0.6375, "tokens_per_sec": 149474, "dt_s": 4.384, "eta_s": 17453, "world_size": 1, "timestamp": "2026-05-05T02:55:48.047705"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52770, "epoch": 0, "train_loss": 3.7529298216104507, "train_ppl": 42.645843860759335, "lr": 0.00056, "grad_norm": 0.6781, "tokens_per_sec": 151459, "dt_s": 4.327, "eta_s": 17416, "world_size": 1, "timestamp": "2026-05-05T02:55:52.374688"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52780, "epoch": 0, "train_loss": 3.7626897543668747, "train_ppl": 43.06410219565358, "lr": 0.00056, "grad_norm": 0.6594, "tokens_per_sec": 150894, "dt_s": 4.343, "eta_s": 17398, "world_size": 1, "timestamp": "2026-05-05T02:55:56.717872"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52790, "epoch": 0, "train_loss": 3.871565669775009, "train_ppl": 48.01750681743578, "lr": 0.00056, "grad_norm": 0.6658, "tokens_per_sec": 146795, "dt_s": 4.464, "eta_s": 17438, "world_size": 1, "timestamp": "2026-05-05T02:56:01.182344"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52800, "epoch": 0, "train_loss": 3.793622687458992, "train_ppl": 44.417018140744936, "lr": 0.00056, "grad_norm": 0.6653, "tokens_per_sec": 134541, "dt_s": 4.871, "eta_s": 17858, "world_size": 1, "timestamp": "2026-05-05T02:56:06.053419"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52810, "epoch": 0, "train_loss": 3.5658418387174606, "train_ppl": 35.369216050325484, "lr": 0.00056, "grad_norm": 0.7284, "tokens_per_sec": 150473, "dt_s": 4.355, "eta_s": 17830, "world_size": 1, "timestamp": "2026-05-05T02:56:10.408750"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52820, "epoch": 0, "train_loss": 3.8542985916137695, "train_ppl": 47.19550200818558, "lr": 0.00056, "grad_norm": 0.8014, "tokens_per_sec": 150687, "dt_s": 4.349, "eta_s": 17843, "world_size": 1, "timestamp": "2026-05-05T02:56:14.757897"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52830, "epoch": 0, "train_loss": 4.137285456061363, "train_ppl": 62.63257161078493, "lr": 0.00056, "grad_norm": 0.8559, "tokens_per_sec": 151326, "dt_s": 4.331, "eta_s": 17829, "world_size": 1, "timestamp": "2026-05-05T02:56:19.088679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52840, "epoch": 0, "train_loss": 3.6539646089076996, "train_ppl": 38.627505827548404, "lr": 0.00056, "grad_norm": 0.7197, "tokens_per_sec": 149848, "dt_s": 4.373, "eta_s": 17752, "world_size": 1, "timestamp": "2026-05-05T02:56:23.462159"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52850, "epoch": 0, "train_loss": 3.722221925854683, "train_ppl": 41.35618246846059, "lr": 0.00056, "grad_norm": 0.6897, "tokens_per_sec": 151518, "dt_s": 4.325, "eta_s": 17313, "world_size": 1, "timestamp": "2026-05-05T02:56:27.787481"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52860, "epoch": 0, "train_loss": 3.741549625992775, "train_ppl": 42.16327687672827, "lr": 0.00056, "grad_norm": 0.6474, "tokens_per_sec": 149390, "dt_s": 4.387, "eta_s": 17334, "world_size": 1, "timestamp": "2026-05-05T02:56:32.174380"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52870, "epoch": 0, "train_loss": 3.655165985226631, "train_ppl": 38.67393988511271, "lr": 0.00056, "grad_norm": 0.7149, "tokens_per_sec": 149156, "dt_s": 4.394, "eta_s": 17365, "world_size": 1, "timestamp": "2026-05-05T02:56:36.568148"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52880, "epoch": 0, "train_loss": 3.7523821145296097, "train_ppl": 42.62249282545579, "lr": 0.00056, "grad_norm": 0.65, "tokens_per_sec": 151537, "dt_s": 4.325, "eta_s": 17356, "world_size": 1, "timestamp": "2026-05-05T02:56:40.892928"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52890, "epoch": 0, "train_loss": 3.6862873882055283, "train_ppl": 39.89645162374843, "lr": 0.00056, "grad_norm": 0.6569, "tokens_per_sec": 151873, "dt_s": 4.315, "eta_s": 17305, "world_size": 1, "timestamp": "2026-05-05T02:56:45.208100"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52900, "epoch": 0, "train_loss": 3.721059113740921, "train_ppl": 41.308120947182346, "lr": 0.00056, "grad_norm": 0.7081, "tokens_per_sec": 148057, "dt_s": 4.426, "eta_s": 17381, "world_size": 1, "timestamp": "2026-05-05T02:56:49.634501"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52910, "epoch": 0, "train_loss": 3.59748312830925, "train_ppl": 36.506237204662646, "lr": 0.00056, "grad_norm": 0.6899, "tokens_per_sec": 150573, "dt_s": 4.352, "eta_s": 17349, "world_size": 1, "timestamp": "2026-05-05T02:56:53.986940"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52920, "epoch": 0, "train_loss": 3.795648157596588, "train_ppl": 44.507074657194174, "lr": 0.00056, "grad_norm": 0.7254, "tokens_per_sec": 148968, "dt_s": 4.399, "eta_s": 17349, "world_size": 1, "timestamp": "2026-05-05T02:56:58.386287"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52930, "epoch": 0, "train_loss": 3.733822301030159, "train_ppl": 41.838723115967426, "lr": 0.00056, "grad_norm": 0.6617, "tokens_per_sec": 151118, "dt_s": 4.337, "eta_s": 17354, "world_size": 1, "timestamp": "2026-05-05T02:57:02.723027"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52940, "epoch": 0, "train_loss": 3.7771472185850143, "train_ppl": 43.691222272007586, "lr": 0.00056, "grad_norm": 0.6484, "tokens_per_sec": 150262, "dt_s": 4.361, "eta_s": 17387, "world_size": 1, "timestamp": "2026-05-05T02:57:07.084482"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52950, "epoch": 0, "train_loss": 3.8005143254995346, "train_ppl": 44.72418136577714, "lr": 0.00056, "grad_norm": 0.6957, "tokens_per_sec": 149480, "dt_s": 4.384, "eta_s": 17349, "world_size": 1, "timestamp": "2026-05-05T02:57:11.468742"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52960, "epoch": 0, "train_loss": 3.7159189879894257, "train_ppl": 41.096336775885874, "lr": 0.00056, "grad_norm": 0.6513, "tokens_per_sec": 151117, "dt_s": 4.337, "eta_s": 17332, "world_size": 1, "timestamp": "2026-05-05T02:57:15.805501"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52970, "epoch": 0, "train_loss": 3.67600154876709, "train_ppl": 39.48818640291875, "lr": 0.00056, "grad_norm": 0.6959, "tokens_per_sec": 151822, "dt_s": 4.317, "eta_s": 17262, "world_size": 1, "timestamp": "2026-05-05T02:57:20.122149"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52980, "epoch": 0, "train_loss": 3.751617416739464, "train_ppl": 42.589911958227916, "lr": 0.00056, "grad_norm": 0.6522, "tokens_per_sec": 148814, "dt_s": 4.404, "eta_s": 17311, "world_size": 1, "timestamp": "2026-05-05T02:57:24.526035"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 52990, "epoch": 0, "train_loss": 3.7377385199069977, "train_ppl": 42.002893967890834, "lr": 0.00056, "grad_norm": 0.6285, "tokens_per_sec": 152543, "dt_s": 4.296, "eta_s": 17255, "world_size": 1, "timestamp": "2026-05-05T02:57:28.822287"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53000, "epoch": 0, "train_loss": 3.7481748461723328, "train_ppl": 42.443545264253, "lr": 0.00056, "grad_norm": 0.6353, "tokens_per_sec": 150840, "dt_s": 4.345, "eta_s": 17219, "world_size": 1, "timestamp": "2026-05-05T02:57:33.167006"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53010, "epoch": 0, "train_loss": 3.690824344754219, "train_ppl": 40.07787132667217, "lr": 0.00056, "grad_norm": 0.6952, "tokens_per_sec": 128081, "dt_s": 5.117, "eta_s": 17223, "world_size": 1, "timestamp": "2026-05-05T02:57:38.283751"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53020, "epoch": 0, "train_loss": 3.680413231253624, "train_ppl": 39.66278058717368, "lr": 0.00056, "grad_norm": 0.6812, "tokens_per_sec": 152137, "dt_s": 4.308, "eta_s": 17212, "world_size": 1, "timestamp": "2026-05-05T02:57:42.591471"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53030, "epoch": 0, "train_loss": 3.769628345966339, "train_ppl": 43.36394545537683, "lr": 0.00056, "grad_norm": 0.6335, "tokens_per_sec": 149818, "dt_s": 4.374, "eta_s": 17184, "world_size": 1, "timestamp": "2026-05-05T02:57:46.965830"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53040, "epoch": 0, "train_loss": 3.646407663822174, "train_ppl": 38.336700072707934, "lr": 0.00056, "grad_norm": 0.671, "tokens_per_sec": 147192, "dt_s": 4.452, "eta_s": 17303, "world_size": 1, "timestamp": "2026-05-05T02:57:51.418236"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53050, "epoch": 0, "train_loss": 3.79939866065979, "train_ppl": 44.674311993066574, "lr": 0.00056, "grad_norm": 0.8066, "tokens_per_sec": 151755, "dt_s": 4.319, "eta_s": 17278, "world_size": 1, "timestamp": "2026-05-05T02:57:55.736765"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53060, "epoch": 0, "train_loss": 3.6774357110261917, "train_ppl": 39.54485949903211, "lr": 0.00056, "grad_norm": 0.6202, "tokens_per_sec": 149974, "dt_s": 4.37, "eta_s": 17292, "world_size": 1, "timestamp": "2026-05-05T02:58:00.106601"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53070, "epoch": 0, "train_loss": 3.62242229282856, "train_ppl": 37.42811997246091, "lr": 0.00056, "grad_norm": 0.6301, "tokens_per_sec": 152643, "dt_s": 4.293, "eta_s": 17276, "world_size": 1, "timestamp": "2026-05-05T02:58:04.399998"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53080, "epoch": 0, "train_loss": 3.6563225239515305, "train_ppl": 38.718693668987854, "lr": 0.00056, "grad_norm": 0.7781, "tokens_per_sec": 149724, "dt_s": 4.377, "eta_s": 17274, "world_size": 1, "timestamp": "2026-05-05T02:58:08.777136"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53090, "epoch": 0, "train_loss": 3.7070375233888626, "train_ppl": 40.73295717575023, "lr": 0.00056, "grad_norm": 0.7089, "tokens_per_sec": 146589, "dt_s": 4.471, "eta_s": 17284, "world_size": 1, "timestamp": "2026-05-05T02:58:13.247888"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53100, "epoch": 0, "train_loss": 3.708991289138794, "train_ppl": 40.81261762596326, "lr": 0.00056, "grad_norm": 0.6982, "tokens_per_sec": 133876, "dt_s": 4.895, "eta_s": 17736, "world_size": 1, "timestamp": "2026-05-05T02:58:18.143189"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53110, "epoch": 0, "train_loss": 3.866472542285919, "train_ppl": 47.773569263347895, "lr": 0.00056, "grad_norm": 0.6617, "tokens_per_sec": 149635, "dt_s": 4.38, "eta_s": 17740, "world_size": 1, "timestamp": "2026-05-05T02:58:22.522894"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53120, "epoch": 0, "train_loss": 3.6699146777391434, "train_ppl": 39.24855694182425, "lr": 0.00056, "grad_norm": 0.6345, "tokens_per_sec": 148960, "dt_s": 4.4, "eta_s": 17819, "world_size": 1, "timestamp": "2026-05-05T02:58:26.922467"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53130, "epoch": 0, "train_loss": 3.7049571573734283, "train_ppl": 40.648305799387316, "lr": 0.00056, "grad_norm": 0.6566, "tokens_per_sec": 150344, "dt_s": 4.359, "eta_s": 17800, "world_size": 1, "timestamp": "2026-05-05T02:58:31.281508"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53140, "epoch": 0, "train_loss": 3.714900851249695, "train_ppl": 41.054516378600056, "lr": 0.00056, "grad_norm": 0.6525, "tokens_per_sec": 148885, "dt_s": 4.402, "eta_s": 17741, "world_size": 1, "timestamp": "2026-05-05T02:58:35.683309"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53150, "epoch": 0, "train_loss": 3.7385647743940353, "train_ppl": 42.0376133890639, "lr": 0.00056, "grad_norm": 0.6291, "tokens_per_sec": 149732, "dt_s": 4.377, "eta_s": 17327, "world_size": 1, "timestamp": "2026-05-05T02:58:40.060191"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53160, "epoch": 0, "train_loss": 3.8013343811035156, "train_ppl": 44.76087272376026, "lr": 0.00056, "grad_norm": 0.7041, "tokens_per_sec": 150260, "dt_s": 4.362, "eta_s": 17308, "world_size": 1, "timestamp": "2026-05-05T02:58:44.421692"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53170, "epoch": 0, "train_loss": 3.7935113608837128, "train_ppl": 44.412073621464515, "lr": 0.00056, "grad_norm": 0.6132, "tokens_per_sec": 146702, "dt_s": 4.467, "eta_s": 17357, "world_size": 1, "timestamp": "2026-05-05T02:58:48.888986"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53180, "epoch": 0, "train_loss": 3.756562665104866, "train_ppl": 42.80105128864361, "lr": 0.00056, "grad_norm": 0.6394, "tokens_per_sec": 150001, "dt_s": 4.369, "eta_s": 17361, "world_size": 1, "timestamp": "2026-05-05T02:58:53.258041"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53190, "epoch": 0, "train_loss": 3.673557788133621, "train_ppl": 39.391804542564046, "lr": 0.00056, "grad_norm": 0.7097, "tokens_per_sec": 149122, "dt_s": 4.395, "eta_s": 17351, "world_size": 1, "timestamp": "2026-05-05T02:58:57.652819"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53200, "epoch": 0, "train_loss": 3.772709846496582, "train_ppl": 43.49777757225873, "lr": 0.00056, "grad_norm": 0.74, "tokens_per_sec": 148218, "dt_s": 4.422, "eta_s": 17382, "world_size": 1, "timestamp": "2026-05-05T02:59:02.074421"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53210, "epoch": 0, "train_loss": 3.730771616101265, "train_ppl": 41.711280845858404, "lr": 0.00056, "grad_norm": 0.631, "tokens_per_sec": 151729, "dt_s": 4.319, "eta_s": 17344, "world_size": 1, "timestamp": "2026-05-05T02:59:06.393683"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53220, "epoch": 0, "train_loss": 3.7260569483041763, "train_ppl": 41.51508886669654, "lr": 0.00056, "grad_norm": 0.6599, "tokens_per_sec": 148747, "dt_s": 4.406, "eta_s": 17291, "world_size": 1, "timestamp": "2026-05-05T02:59:10.799572"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53230, "epoch": 0, "train_loss": 3.7731651812791824, "train_ppl": 43.51758813322965, "lr": 0.00056, "grad_norm": 0.6575, "tokens_per_sec": 147586, "dt_s": 4.441, "eta_s": 17343, "world_size": 1, "timestamp": "2026-05-05T02:59:15.240116"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53240, "epoch": 0, "train_loss": 3.541372463107109, "train_ppl": 34.51425624261056, "lr": 0.00056, "grad_norm": 0.6981, "tokens_per_sec": 150764, "dt_s": 4.347, "eta_s": 17301, "world_size": 1, "timestamp": "2026-05-05T02:59:19.587020"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53250, "epoch": 0, "train_loss": 3.732100859284401, "train_ppl": 41.76676214748452, "lr": 0.00056, "grad_norm": 0.6937, "tokens_per_sec": 149589, "dt_s": 4.381, "eta_s": 17265, "world_size": 1, "timestamp": "2026-05-05T02:59:23.968091"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53260, "epoch": 0, "train_loss": 3.7040756046772003, "train_ppl": 40.612487965786265, "lr": 0.00056, "grad_norm": 0.663, "tokens_per_sec": 152160, "dt_s": 4.307, "eta_s": 17251, "world_size": 1, "timestamp": "2026-05-05T02:59:28.275157"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53270, "epoch": 0, "train_loss": 3.7605149894952774, "train_ppl": 42.970549663224375, "lr": 0.00056, "grad_norm": 0.7253, "tokens_per_sec": 152337, "dt_s": 4.302, "eta_s": 17165, "world_size": 1, "timestamp": "2026-05-05T02:59:32.577224"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53280, "epoch": 0, "train_loss": 3.855976924300194, "train_ppl": 47.27477826921941, "lr": 0.00056, "grad_norm": 0.813, "tokens_per_sec": 149676, "dt_s": 4.379, "eta_s": 17111, "world_size": 1, "timestamp": "2026-05-05T02:59:36.955737"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53290, "epoch": 0, "train_loss": 3.771787479519844, "train_ppl": 43.457675156079894, "lr": 0.00056, "grad_norm": 0.7073, "tokens_per_sec": 152471, "dt_s": 4.298, "eta_s": 17069, "world_size": 1, "timestamp": "2026-05-05T02:59:41.253984"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53300, "epoch": 0, "train_loss": 3.774909257888794, "train_ppl": 43.59355236526397, "lr": 0.00056, "grad_norm": 0.6651, "tokens_per_sec": 151701, "dt_s": 4.32, "eta_s": 17016, "world_size": 1, "timestamp": "2026-05-05T02:59:45.574066"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53310, "epoch": 0, "train_loss": 3.742021545767784, "train_ppl": 42.18317925665954, "lr": 0.00056, "grad_norm": 0.6702, "tokens_per_sec": 146817, "dt_s": 4.464, "eta_s": 17135, "world_size": 1, "timestamp": "2026-05-05T02:59:50.037853"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53320, "epoch": 0, "train_loss": 3.727462887763977, "train_ppl": 41.573497618274466, "lr": 0.00056, "grad_norm": 0.6795, "tokens_per_sec": 152895, "dt_s": 4.286, "eta_s": 17119, "world_size": 1, "timestamp": "2026-05-05T02:59:54.324217"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53330, "epoch": 0, "train_loss": 3.6521837562322617, "train_ppl": 38.558777146441784, "lr": 0.00056, "grad_norm": 0.6422, "tokens_per_sec": 152062, "dt_s": 4.31, "eta_s": 17060, "world_size": 1, "timestamp": "2026-05-05T02:59:58.634020"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53340, "epoch": 0, "train_loss": 3.7063163965940475, "train_ppl": 40.70359413740806, "lr": 0.00056, "grad_norm": 0.6208, "tokens_per_sec": 148783, "dt_s": 4.405, "eta_s": 17140, "world_size": 1, "timestamp": "2026-05-05T03:00:03.038827"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53350, "epoch": 0, "train_loss": 3.6777157187461853, "train_ppl": 39.55593391536657, "lr": 0.00056, "grad_norm": 0.7035, "tokens_per_sec": 151532, "dt_s": 4.325, "eta_s": 17139, "world_size": 1, "timestamp": "2026-05-05T03:00:07.363715"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53360, "epoch": 0, "train_loss": 3.718517154455185, "train_ppl": 41.20325072994434, "lr": 0.00056, "grad_norm": 0.6366, "tokens_per_sec": 150898, "dt_s": 4.343, "eta_s": 17040, "world_size": 1, "timestamp": "2026-05-05T03:00:11.706778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53370, "epoch": 0, "train_loss": 3.743574306368828, "train_ppl": 42.24873051497098, "lr": 0.00056, "grad_norm": 0.6899, "tokens_per_sec": 150320, "dt_s": 4.36, "eta_s": 17093, "world_size": 1, "timestamp": "2026-05-05T03:00:16.066536"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53380, "epoch": 0, "train_loss": 3.7545200884342194, "train_ppl": 42.713716084591844, "lr": 0.00056, "grad_norm": 0.6596, "tokens_per_sec": 152726, "dt_s": 4.291, "eta_s": 17074, "world_size": 1, "timestamp": "2026-05-05T03:00:20.357640"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53390, "epoch": 0, "train_loss": 3.6736123263835907, "train_ppl": 39.3939529612319, "lr": 0.00056, "grad_norm": 0.6203, "tokens_per_sec": 149947, "dt_s": 4.371, "eta_s": 17043, "world_size": 1, "timestamp": "2026-05-05T03:00:24.728251"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53400, "epoch": 0, "train_loss": 3.7010637819767, "train_ppl": 40.49035436730289, "lr": 0.00056, "grad_norm": 0.6831, "tokens_per_sec": 135246, "dt_s": 4.846, "eta_s": 17448, "world_size": 1, "timestamp": "2026-05-05T03:00:29.573924"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53410, "epoch": 0, "train_loss": 3.679615020751953, "train_ppl": 39.631133971194416, "lr": 0.00056, "grad_norm": 0.663, "tokens_per_sec": 152304, "dt_s": 4.303, "eta_s": 17412, "world_size": 1, "timestamp": "2026-05-05T03:00:33.876903"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53420, "epoch": 0, "train_loss": 3.6877091825008392, "train_ppl": 39.95321651550735, "lr": 0.00056, "grad_norm": 0.6302, "tokens_per_sec": 149549, "dt_s": 4.382, "eta_s": 17425, "world_size": 1, "timestamp": "2026-05-05T03:00:38.259148"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53430, "epoch": 0, "train_loss": 3.7387324273586273, "train_ppl": 42.04466171039242, "lr": 0.00056, "grad_norm": 0.6624, "tokens_per_sec": 152609, "dt_s": 4.294, "eta_s": 17423, "world_size": 1, "timestamp": "2026-05-05T03:00:42.553510"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53440, "epoch": 0, "train_loss": 3.8111698776483536, "train_ppl": 45.20329026436865, "lr": 0.00056, "grad_norm": 0.642, "tokens_per_sec": 153041, "dt_s": 4.282, "eta_s": 17349, "world_size": 1, "timestamp": "2026-05-05T03:00:46.835749"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53450, "epoch": 0, "train_loss": 3.865998297929764, "train_ppl": 47.7509182892242, "lr": 0.00056, "grad_norm": 0.6455, "tokens_per_sec": 149722, "dt_s": 4.377, "eta_s": 16977, "world_size": 1, "timestamp": "2026-05-05T03:00:51.212929"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53460, "epoch": 0, "train_loss": 3.6564220041036606, "train_ppl": 38.72254560211659, "lr": 0.00056, "grad_norm": 0.6297, "tokens_per_sec": 151388, "dt_s": 4.329, "eta_s": 16994, "world_size": 1, "timestamp": "2026-05-05T03:00:55.541960"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53470, "epoch": 0, "train_loss": 3.7860441356897354, "train_ppl": 44.081673786628315, "lr": 0.00056, "grad_norm": 0.6308, "tokens_per_sec": 151677, "dt_s": 4.321, "eta_s": 16941, "world_size": 1, "timestamp": "2026-05-05T03:00:59.862741"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53480, "epoch": 0, "train_loss": 3.7803521007299423, "train_ppl": 43.831472112216275, "lr": 0.00056, "grad_norm": 0.6541, "tokens_per_sec": 150131, "dt_s": 4.365, "eta_s": 16992, "world_size": 1, "timestamp": "2026-05-05T03:01:04.227981"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53490, "epoch": 0, "train_loss": 3.6781399995088577, "train_ppl": 39.5727202979942, "lr": 0.00056, "grad_norm": 0.6359, "tokens_per_sec": 153498, "dt_s": 4.27, "eta_s": 16978, "world_size": 1, "timestamp": "2026-05-05T03:01:08.497469"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53500, "epoch": 0, "train_loss": 3.709110677242279, "train_ppl": 40.81749045785318, "lr": 0.00056, "grad_norm": 0.785, "tokens_per_sec": 150571, "dt_s": 4.353, "eta_s": 16954, "world_size": 1, "timestamp": "2026-05-05T03:01:12.849980"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53510, "epoch": 0, "train_loss": 3.7873318046331406, "train_ppl": 44.13847295034805, "lr": 0.00056, "grad_norm": 0.6518, "tokens_per_sec": 128127, "dt_s": 5.115, "eta_s": 16937, "world_size": 1, "timestamp": "2026-05-05T03:01:17.964877"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53520, "epoch": 0, "train_loss": 3.68671190738678, "train_ppl": 39.9133920282355, "lr": 0.00056, "grad_norm": 0.6526, "tokens_per_sec": 151940, "dt_s": 4.313, "eta_s": 16927, "world_size": 1, "timestamp": "2026-05-05T03:01:22.278205"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53530, "epoch": 0, "train_loss": 3.6961464136838913, "train_ppl": 40.29173711987204, "lr": 0.00056, "grad_norm": 0.6491, "tokens_per_sec": 145990, "dt_s": 4.489, "eta_s": 17020, "world_size": 1, "timestamp": "2026-05-05T03:01:26.767281"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53540, "epoch": 0, "train_loss": 3.6538608372211456, "train_ppl": 38.623497594095475, "lr": 0.00056, "grad_norm": 0.634, "tokens_per_sec": 148898, "dt_s": 4.401, "eta_s": 17118, "world_size": 1, "timestamp": "2026-05-05T03:01:31.168672"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53550, "epoch": 0, "train_loss": 3.7344973385334015, "train_ppl": 41.86697535774703, "lr": 0.00056, "grad_norm": 0.6623, "tokens_per_sec": 150994, "dt_s": 4.34, "eta_s": 17105, "world_size": 1, "timestamp": "2026-05-05T03:01:35.508983"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53560, "epoch": 0, "train_loss": 3.8892606496810913, "train_ppl": 48.874737609184905, "lr": 0.00056, "grad_norm": 0.8227, "tokens_per_sec": 147477, "dt_s": 4.444, "eta_s": 17203, "world_size": 1, "timestamp": "2026-05-05T03:01:39.952810"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53570, "epoch": 0, "train_loss": 3.779030919075012, "train_ppl": 43.773601012888356, "lr": 0.00056, "grad_norm": 0.6747, "tokens_per_sec": 149519, "dt_s": 4.383, "eta_s": 17253, "world_size": 1, "timestamp": "2026-05-05T03:01:44.335922"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53580, "epoch": 0, "train_loss": 3.7151906490325928, "train_ppl": 41.06641561052676, "lr": 0.00056, "grad_norm": 0.6668, "tokens_per_sec": 151000, "dt_s": 4.34, "eta_s": 17132, "world_size": 1, "timestamp": "2026-05-05T03:01:48.676055"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53590, "epoch": 0, "train_loss": 3.7404251396656036, "train_ppl": 42.11589149547143, "lr": 0.00056, "grad_norm": 0.6881, "tokens_per_sec": 148663, "dt_s": 4.408, "eta_s": 17133, "world_size": 1, "timestamp": "2026-05-05T03:01:53.084398"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53600, "epoch": 0, "train_loss": 3.7192048132419586, "train_ppl": 41.23159425157136, "lr": 0.00056, "grad_norm": 0.6594, "tokens_per_sec": 152067, "dt_s": 4.31, "eta_s": 17105, "world_size": 1, "timestamp": "2026-05-05T03:01:57.394061"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53610, "epoch": 0, "train_loss": 3.7395596504211426, "train_ppl": 42.079456413721275, "lr": 0.00056, "grad_norm": 0.6361, "tokens_per_sec": 149763, "dt_s": 4.376, "eta_s": 17047, "world_size": 1, "timestamp": "2026-05-05T03:02:01.770048"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53620, "epoch": 0, "train_loss": 3.7637193351984024, "train_ppl": 43.108463002400654, "lr": 0.00056, "grad_norm": 0.6463, "tokens_per_sec": 151099, "dt_s": 4.337, "eta_s": 17007, "world_size": 1, "timestamp": "2026-05-05T03:02:06.107372"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53630, "epoch": 0, "train_loss": 3.7214408814907074, "train_ppl": 41.32389406620676, "lr": 0.00056, "grad_norm": 0.6432, "tokens_per_sec": 150957, "dt_s": 4.341, "eta_s": 17004, "world_size": 1, "timestamp": "2026-05-05T03:02:10.448739"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53640, "epoch": 0, "train_loss": 3.759195938706398, "train_ppl": 42.91390669148353, "lr": 0.00056, "grad_norm": 0.6478, "tokens_per_sec": 150105, "dt_s": 4.366, "eta_s": 16967, "world_size": 1, "timestamp": "2026-05-05T03:02:14.814749"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53650, "epoch": 0, "train_loss": 3.670278921723366, "train_ppl": 39.262855596521106, "lr": 0.00056, "grad_norm": 0.6816, "tokens_per_sec": 150406, "dt_s": 4.357, "eta_s": 16999, "world_size": 1, "timestamp": "2026-05-05T03:02:19.172006"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53660, "epoch": 0, "train_loss": 3.705136686563492, "train_ppl": 40.65560401190642, "lr": 0.00056, "grad_norm": 0.7007, "tokens_per_sec": 151679, "dt_s": 4.321, "eta_s": 16952, "world_size": 1, "timestamp": "2026-05-05T03:02:23.492723"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53670, "epoch": 0, "train_loss": 3.8171019554138184, "train_ppl": 45.47223661412797, "lr": 0.00056, "grad_norm": 0.6593, "tokens_per_sec": 149711, "dt_s": 4.377, "eta_s": 16979, "world_size": 1, "timestamp": "2026-05-05T03:02:27.870216"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53680, "epoch": 0, "train_loss": 3.6391093730926514, "train_ppl": 38.05792621370965, "lr": 0.00056, "grad_norm": 0.6925, "tokens_per_sec": 151382, "dt_s": 4.329, "eta_s": 16965, "world_size": 1, "timestamp": "2026-05-05T03:02:32.199416"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53690, "epoch": 0, "train_loss": 3.683937296271324, "train_ppl": 39.802801381026114, "lr": 0.00056, "grad_norm": 0.6939, "tokens_per_sec": 134867, "dt_s": 4.859, "eta_s": 17345, "world_size": 1, "timestamp": "2026-05-05T03:02:37.058683"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53700, "epoch": 0, "train_loss": 3.6348513662815094, "train_ppl": 37.896219822451144, "lr": 0.00056, "grad_norm": 0.6994, "tokens_per_sec": 150883, "dt_s": 4.343, "eta_s": 17330, "world_size": 1, "timestamp": "2026-05-05T03:02:41.402202"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53710, "epoch": 0, "train_loss": 3.7226041555404663, "train_ppl": 41.371993050535124, "lr": 0.00056, "grad_norm": 0.7384, "tokens_per_sec": 153255, "dt_s": 4.276, "eta_s": 17291, "world_size": 1, "timestamp": "2026-05-05T03:02:45.678452"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53720, "epoch": 0, "train_loss": 3.7570307552814484, "train_ppl": 42.82109073006616, "lr": 0.00056, "grad_norm": 0.6468, "tokens_per_sec": 149511, "dt_s": 4.383, "eta_s": 17291, "world_size": 1, "timestamp": "2026-05-05T03:02:50.061811"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53730, "epoch": 0, "train_loss": 3.728310540318489, "train_ppl": 41.60875243953932, "lr": 0.00056, "grad_norm": 0.6217, "tokens_per_sec": 152730, "dt_s": 4.291, "eta_s": 17257, "world_size": 1, "timestamp": "2026-05-05T03:02:54.352778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53740, "epoch": 0, "train_loss": 3.757178634405136, "train_ppl": 42.8274235436726, "lr": 0.00056, "grad_norm": 0.6612, "tokens_per_sec": 152610, "dt_s": 4.294, "eta_s": 16813, "world_size": 1, "timestamp": "2026-05-05T03:02:58.647121"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53750, "epoch": 0, "train_loss": 3.60809288918972, "train_ppl": 36.89562163726297, "lr": 0.00056, "grad_norm": 0.6586, "tokens_per_sec": 150902, "dt_s": 4.343, "eta_s": 16808, "world_size": 1, "timestamp": "2026-05-05T03:03:02.990069"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53760, "epoch": 0, "train_loss": 3.6674768775701523, "train_ppl": 39.15299333289664, "lr": 0.00056, "grad_norm": 0.6561, "tokens_per_sec": 151394, "dt_s": 4.329, "eta_s": 16844, "world_size": 1, "timestamp": "2026-05-05T03:03:07.318912"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53770, "epoch": 0, "train_loss": 3.6598540544509888, "train_ppl": 38.85567164516697, "lr": 0.00056, "grad_norm": 0.6558, "tokens_per_sec": 151502, "dt_s": 4.326, "eta_s": 16795, "world_size": 1, "timestamp": "2026-05-05T03:03:11.644676"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53780, "epoch": 0, "train_loss": 3.621783137321472, "train_ppl": 37.40420522689537, "lr": 0.00056, "grad_norm": 0.5903, "tokens_per_sec": 150222, "dt_s": 4.363, "eta_s": 16847, "world_size": 1, "timestamp": "2026-05-05T03:03:16.007309"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53790, "epoch": 0, "train_loss": 3.513846457004547, "train_ppl": 33.57717286284259, "lr": 0.00056, "grad_norm": 0.865, "tokens_per_sec": 151272, "dt_s": 4.332, "eta_s": 16872, "world_size": 1, "timestamp": "2026-05-05T03:03:20.339621"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53800, "epoch": 0, "train_loss": 3.75354240834713, "train_ppl": 42.67197614241029, "lr": 0.00056, "grad_norm": 0.6287, "tokens_per_sec": 150733, "dt_s": 4.348, "eta_s": 16871, "world_size": 1, "timestamp": "2026-05-05T03:03:24.687434"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53810, "epoch": 0, "train_loss": 3.7238284945487976, "train_ppl": 41.42267741657118, "lr": 0.00056, "grad_norm": 0.6118, "tokens_per_sec": 147063, "dt_s": 4.456, "eta_s": 16966, "world_size": 1, "timestamp": "2026-05-05T03:03:29.143762"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53820, "epoch": 0, "train_loss": 3.7615641951560974, "train_ppl": 43.01565826714278, "lr": 0.00056, "grad_norm": 0.6769, "tokens_per_sec": 149141, "dt_s": 4.394, "eta_s": 17015, "world_size": 1, "timestamp": "2026-05-05T03:03:33.537993"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53830, "epoch": 0, "train_loss": 3.8109701722860336, "train_ppl": 45.194263826252396, "lr": 0.00056, "grad_norm": 0.6625, "tokens_per_sec": 147684, "dt_s": 4.438, "eta_s": 17069, "world_size": 1, "timestamp": "2026-05-05T03:03:37.975573"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53840, "epoch": 0, "train_loss": 3.7308862805366516, "train_ppl": 41.71606392054489, "lr": 0.00056, "grad_norm": 0.6723, "tokens_per_sec": 147830, "dt_s": 4.433, "eta_s": 17143, "world_size": 1, "timestamp": "2026-05-05T03:03:42.408787"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53850, "epoch": 0, "train_loss": 3.6662335991859436, "train_ppl": 39.10434551026835, "lr": 0.00056, "grad_norm": 0.671, "tokens_per_sec": 149430, "dt_s": 4.386, "eta_s": 17168, "world_size": 1, "timestamp": "2026-05-05T03:03:46.794494"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53860, "epoch": 0, "train_loss": 3.8010665476322174, "train_ppl": 44.74888586915241, "lr": 0.00056, "grad_norm": 0.6779, "tokens_per_sec": 148314, "dt_s": 4.419, "eta_s": 17134, "world_size": 1, "timestamp": "2026-05-05T03:03:51.213248"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53870, "epoch": 0, "train_loss": 3.7646550238132477, "train_ppl": 43.148817977336506, "lr": 0.00056, "grad_norm": 0.6874, "tokens_per_sec": 149970, "dt_s": 4.37, "eta_s": 17111, "world_size": 1, "timestamp": "2026-05-05T03:03:55.583162"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53880, "epoch": 0, "train_loss": 3.596226692199707, "train_ppl": 36.460398252902515, "lr": 0.00056, "grad_norm": 0.6807, "tokens_per_sec": 149103, "dt_s": 4.395, "eta_s": 17074, "world_size": 1, "timestamp": "2026-05-05T03:03:59.978529"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53890, "epoch": 0, "train_loss": 3.7910523861646652, "train_ppl": 44.30299961528088, "lr": 0.00056, "grad_norm": 0.6664, "tokens_per_sec": 146680, "dt_s": 4.468, "eta_s": 17096, "world_size": 1, "timestamp": "2026-05-05T03:04:04.446513"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53900, "epoch": 0, "train_loss": 3.769479364156723, "train_ppl": 43.35748549753085, "lr": 0.00056, "grad_norm": 0.6746, "tokens_per_sec": 148662, "dt_s": 4.408, "eta_s": 17109, "world_size": 1, "timestamp": "2026-05-05T03:04:08.854879"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53910, "epoch": 0, "train_loss": 3.8242391645908356, "train_ppl": 45.7979424111784, "lr": 0.00056, "grad_norm": 1.1117, "tokens_per_sec": 147878, "dt_s": 4.432, "eta_s": 17115, "world_size": 1, "timestamp": "2026-05-05T03:04:13.286637"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53920, "epoch": 0, "train_loss": 3.678450047969818, "train_ppl": 39.584991661278956, "lr": 0.00056, "grad_norm": 0.678, "tokens_per_sec": 148300, "dt_s": 4.419, "eta_s": 17149, "world_size": 1, "timestamp": "2026-05-05T03:04:17.705790"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53930, "epoch": 0, "train_loss": 3.704627647995949, "train_ppl": 40.63491400792936, "lr": 0.00056, "grad_norm": 0.7165, "tokens_per_sec": 148869, "dt_s": 4.402, "eta_s": 17150, "world_size": 1, "timestamp": "2026-05-05T03:04:22.108059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53940, "epoch": 0, "train_loss": 3.7390442192554474, "train_ppl": 42.05777293909948, "lr": 0.00056, "grad_norm": 0.6265, "tokens_per_sec": 147274, "dt_s": 4.45, "eta_s": 17131, "world_size": 1, "timestamp": "2026-05-05T03:04:26.557992"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53950, "epoch": 0, "train_loss": 3.811480313539505, "train_ppl": 45.21732516642079, "lr": 0.00056, "grad_norm": 0.6871, "tokens_per_sec": 150808, "dt_s": 4.346, "eta_s": 17079, "world_size": 1, "timestamp": "2026-05-05T03:04:30.903696"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53960, "epoch": 0, "train_loss": 3.739409163594246, "train_ppl": 42.07312448629582, "lr": 0.00056, "grad_norm": 0.646, "tokens_per_sec": 149633, "dt_s": 4.38, "eta_s": 17034, "world_size": 1, "timestamp": "2026-05-05T03:04:35.283445"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53970, "epoch": 0, "train_loss": 3.7649744153022766, "train_ppl": 43.16260154361983, "lr": 0.00056, "grad_norm": 0.6908, "tokens_per_sec": 148662, "dt_s": 4.408, "eta_s": 17021, "world_size": 1, "timestamp": "2026-05-05T03:04:39.691828"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53980, "epoch": 0, "train_loss": 3.724452629685402, "train_ppl": 41.448538834669435, "lr": 0.00056, "grad_norm": 0.6672, "tokens_per_sec": 150653, "dt_s": 4.35, "eta_s": 16976, "world_size": 1, "timestamp": "2026-05-05T03:04:44.041942"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 53990, "epoch": 0, "train_loss": 3.771542564034462, "train_ppl": 43.44703300174286, "lr": 0.00056, "grad_norm": 0.6862, "tokens_per_sec": 133300, "dt_s": 4.916, "eta_s": 17333, "world_size": 1, "timestamp": "2026-05-05T03:04:48.958375"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54000, "epoch": 0, "train_loss": 3.6578072756528854, "train_ppl": 38.77622401385709, "lr": 0.00056, "grad_norm": 0.7224, "tokens_per_sec": 149861, "dt_s": 4.373, "eta_s": 17350, "world_size": 1, "timestamp": "2026-05-05T03:04:53.331479"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54010, "epoch": 0, "train_loss": 3.677789628505707, "train_ppl": 39.55885759297273, "lr": 0.00056, "grad_norm": 0.624, "tokens_per_sec": 126717, "dt_s": 5.172, "eta_s": 17359, "world_size": 1, "timestamp": "2026-05-05T03:04:58.503330"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54020, "epoch": 0, "train_loss": 3.6883329153060913, "train_ppl": 39.97814442069136, "lr": 0.00056, "grad_norm": 0.6394, "tokens_per_sec": 149771, "dt_s": 4.376, "eta_s": 17330, "world_size": 1, "timestamp": "2026-05-05T03:05:02.879039"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54030, "epoch": 0, "train_loss": 3.6898015290498734, "train_ppl": 40.03690000710889, "lr": 0.00056, "grad_norm": 0.6943, "tokens_per_sec": 150156, "dt_s": 4.365, "eta_s": 17336, "world_size": 1, "timestamp": "2026-05-05T03:05:07.243613"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54040, "epoch": 0, "train_loss": 3.705895721912384, "train_ppl": 40.68647476699706, "lr": 0.00056, "grad_norm": 0.6718, "tokens_per_sec": 149532, "dt_s": 4.383, "eta_s": 16919, "world_size": 1, "timestamp": "2026-05-05T03:05:11.626339"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54050, "epoch": 0, "train_loss": 3.7270058542490005, "train_ppl": 41.554501477795206, "lr": 0.00056, "grad_norm": 0.6073, "tokens_per_sec": 149781, "dt_s": 4.375, "eta_s": 16917, "world_size": 1, "timestamp": "2026-05-05T03:05:16.001779"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54060, "epoch": 0, "train_loss": 3.7294979095458984, "train_ppl": 41.65818673434925, "lr": 0.00056, "grad_norm": 0.8198, "tokens_per_sec": 151919, "dt_s": 4.314, "eta_s": 16847, "world_size": 1, "timestamp": "2026-05-05T03:05:20.315679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54070, "epoch": 0, "train_loss": 3.700697422027588, "train_ppl": 40.47552304010539, "lr": 0.00056, "grad_norm": 0.6561, "tokens_per_sec": 151928, "dt_s": 4.314, "eta_s": 16795, "world_size": 1, "timestamp": "2026-05-05T03:05:24.629300"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54080, "epoch": 0, "train_loss": 3.752998009324074, "train_ppl": 42.64875188249228, "lr": 0.00056, "grad_norm": 0.6829, "tokens_per_sec": 148972, "dt_s": 4.399, "eta_s": 16817, "world_size": 1, "timestamp": "2026-05-05T03:05:29.028505"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54090, "epoch": 0, "train_loss": 3.7436626255512238, "train_ppl": 42.2524620530881, "lr": 0.00056, "grad_norm": 0.6903, "tokens_per_sec": 152392, "dt_s": 4.3, "eta_s": 16750, "world_size": 1, "timestamp": "2026-05-05T03:05:33.328972"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54100, "epoch": 0, "train_loss": 3.728537753224373, "train_ppl": 41.61820755921324, "lr": 0.00056, "grad_norm": 0.6459, "tokens_per_sec": 150153, "dt_s": 4.365, "eta_s": 16737, "world_size": 1, "timestamp": "2026-05-05T03:05:37.693603"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54110, "epoch": 0, "train_loss": 3.6800003349781036, "train_ppl": 39.64640735324872, "lr": 0.00056, "grad_norm": 0.6041, "tokens_per_sec": 151278, "dt_s": 4.332, "eta_s": 16747, "world_size": 1, "timestamp": "2026-05-05T03:05:42.025763"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54120, "epoch": 0, "train_loss": 3.6197596192359924, "train_ppl": 37.328593667624666, "lr": 0.00056, "grad_norm": 0.6532, "tokens_per_sec": 152043, "dt_s": 4.31, "eta_s": 16740, "world_size": 1, "timestamp": "2026-05-05T03:05:46.336139"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54130, "epoch": 0, "train_loss": 3.811753749847412, "train_ppl": 45.229690915413066, "lr": 0.00056, "grad_norm": 0.646, "tokens_per_sec": 145882, "dt_s": 4.492, "eta_s": 16807, "world_size": 1, "timestamp": "2026-05-05T03:05:50.828539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54140, "epoch": 0, "train_loss": 3.793777346611023, "train_ppl": 44.42388817034911, "lr": 0.00056, "grad_norm": 0.6754, "tokens_per_sec": 151538, "dt_s": 4.325, "eta_s": 16822, "world_size": 1, "timestamp": "2026-05-05T03:05:55.153241"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54150, "epoch": 0, "train_loss": 3.7452410608530045, "train_ppl": 42.31920749360182, "lr": 0.00056, "grad_norm": 0.6477, "tokens_per_sec": 152516, "dt_s": 4.297, "eta_s": 16765, "world_size": 1, "timestamp": "2026-05-05T03:05:59.450240"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54160, "epoch": 0, "train_loss": 3.73360376060009, "train_ppl": 41.82958066245837, "lr": 0.00056, "grad_norm": 0.6566, "tokens_per_sec": 149907, "dt_s": 4.372, "eta_s": 16791, "world_size": 1, "timestamp": "2026-05-05T03:06:03.821997"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54170, "epoch": 0, "train_loss": 3.7364762276411057, "train_ppl": 41.94990748893701, "lr": 0.00056, "grad_norm": 0.6628, "tokens_per_sec": 153138, "dt_s": 4.28, "eta_s": 16763, "world_size": 1, "timestamp": "2026-05-05T03:06:08.101526"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54180, "epoch": 0, "train_loss": 3.7196619659662247, "train_ppl": 41.25044769633276, "lr": 0.00056, "grad_norm": 0.687, "tokens_per_sec": 151662, "dt_s": 4.321, "eta_s": 16627, "world_size": 1, "timestamp": "2026-05-05T03:06:12.422755"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54190, "epoch": 0, "train_loss": 3.7192197740077972, "train_ppl": 41.23221111241245, "lr": 0.00056, "grad_norm": 0.6143, "tokens_per_sec": 146995, "dt_s": 4.458, "eta_s": 16726, "world_size": 1, "timestamp": "2026-05-05T03:06:16.881116"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54200, "epoch": 0, "train_loss": 3.6869747936725616, "train_ppl": 39.9238860909309, "lr": 0.00056, "grad_norm": 0.6603, "tokens_per_sec": 150044, "dt_s": 4.368, "eta_s": 16776, "world_size": 1, "timestamp": "2026-05-05T03:06:21.248915"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54210, "epoch": 0, "train_loss": 3.7619419544935226, "train_ppl": 43.03191090330795, "lr": 0.00056, "grad_norm": 0.6443, "tokens_per_sec": 148954, "dt_s": 4.4, "eta_s": 16793, "world_size": 1, "timestamp": "2026-05-05T03:06:25.648644"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54220, "epoch": 0, "train_loss": 3.6964798867702484, "train_ppl": 40.30517557036034, "lr": 0.00056, "grad_norm": 0.6316, "tokens_per_sec": 146269, "dt_s": 4.481, "eta_s": 16943, "world_size": 1, "timestamp": "2026-05-05T03:06:30.129154"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54230, "epoch": 0, "train_loss": 3.8504025042057037, "train_ppl": 47.0119819443442, "lr": 0.00056, "grad_norm": 0.72, "tokens_per_sec": 151606, "dt_s": 4.323, "eta_s": 16940, "world_size": 1, "timestamp": "2026-05-05T03:06:34.451959"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54240, "epoch": 0, "train_loss": 3.690830022096634, "train_ppl": 40.07809886311685, "lr": 0.00056, "grad_norm": 0.6412, "tokens_per_sec": 149259, "dt_s": 4.391, "eta_s": 16884, "world_size": 1, "timestamp": "2026-05-05T03:06:38.842706"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54250, "epoch": 0, "train_loss": 3.7434459775686264, "train_ppl": 42.24330913394106, "lr": 0.00056, "grad_norm": 0.6273, "tokens_per_sec": 151415, "dt_s": 4.328, "eta_s": 16849, "world_size": 1, "timestamp": "2026-05-05T03:06:43.170959"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54260, "epoch": 0, "train_loss": 3.779173031449318, "train_ppl": 43.779822225305374, "lr": 0.00056, "grad_norm": 0.6497, "tokens_per_sec": 153255, "dt_s": 4.276, "eta_s": 16750, "world_size": 1, "timestamp": "2026-05-05T03:06:47.447236"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54270, "epoch": 0, "train_loss": 3.6954246163368225, "train_ppl": 40.26266514420977, "lr": 0.00056, "grad_norm": 0.6874, "tokens_per_sec": 149590, "dt_s": 4.381, "eta_s": 16669, "world_size": 1, "timestamp": "2026-05-05T03:06:51.828291"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54280, "epoch": 0, "train_loss": 3.716527283191681, "train_ppl": 41.12134308521483, "lr": 0.00056, "grad_norm": 0.6282, "tokens_per_sec": 135262, "dt_s": 4.845, "eta_s": 17066, "world_size": 1, "timestamp": "2026-05-05T03:06:56.673400"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54290, "epoch": 0, "train_loss": 3.6957935094833374, "train_ppl": 40.27752050529387, "lr": 0.00056, "grad_norm": 0.7174, "tokens_per_sec": 151731, "dt_s": 4.319, "eta_s": 17006, "world_size": 1, "timestamp": "2026-05-05T03:07:00.992619"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54300, "epoch": 0, "train_loss": 3.6289810240268707, "train_ppl": 37.67440773535704, "lr": 0.00056, "grad_norm": 0.6195, "tokens_per_sec": 147610, "dt_s": 4.44, "eta_s": 17087, "world_size": 1, "timestamp": "2026-05-05T03:07:05.432446"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54310, "epoch": 0, "train_loss": 3.762441575527191, "train_ppl": 43.053415922846895, "lr": 0.00056, "grad_norm": 0.7349, "tokens_per_sec": 151225, "dt_s": 4.334, "eta_s": 17127, "world_size": 1, "timestamp": "2026-05-05T03:07:09.766112"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54320, "epoch": 0, "train_loss": 3.785340890288353, "train_ppl": 44.05068445008399, "lr": 0.00056, "grad_norm": 0.6462, "tokens_per_sec": 150147, "dt_s": 4.365, "eta_s": 17110, "world_size": 1, "timestamp": "2026-05-05T03:07:14.130906"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54330, "epoch": 0, "train_loss": 3.7458558678627014, "train_ppl": 42.34523363872214, "lr": 0.00056, "grad_norm": 0.621, "tokens_per_sec": 150981, "dt_s": 4.341, "eta_s": 16719, "world_size": 1, "timestamp": "2026-05-05T03:07:18.471588"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54340, "epoch": 0, "train_loss": 3.660326972603798, "train_ppl": 38.874051543378755, "lr": 0.00056, "grad_norm": 0.7037, "tokens_per_sec": 153279, "dt_s": 4.276, "eta_s": 16681, "world_size": 1, "timestamp": "2026-05-05T03:07:22.747211"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54350, "epoch": 0, "train_loss": 3.7759464234113693, "train_ppl": 43.63878954995277, "lr": 0.00056, "grad_norm": 0.74, "tokens_per_sec": 150690, "dt_s": 4.349, "eta_s": 16607, "world_size": 1, "timestamp": "2026-05-05T03:07:27.096279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54360, "epoch": 0, "train_loss": 3.7259192019701004, "train_ppl": 41.50937070923297, "lr": 0.00056, "grad_norm": 0.6433, "tokens_per_sec": 148369, "dt_s": 4.417, "eta_s": 16667, "world_size": 1, "timestamp": "2026-05-05T03:07:31.513373"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54370, "epoch": 0, "train_loss": 3.678910508751869, "train_ppl": 39.60322319462772, "lr": 0.00056, "grad_norm": 0.6488, "tokens_per_sec": 152728, "dt_s": 4.291, "eta_s": 16606, "world_size": 1, "timestamp": "2026-05-05T03:07:35.804393"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54380, "epoch": 0, "train_loss": 3.762966424226761, "train_ppl": 43.07601838312313, "lr": 0.00056, "grad_norm": 0.6396, "tokens_per_sec": 147814, "dt_s": 4.434, "eta_s": 16673, "world_size": 1, "timestamp": "2026-05-05T03:07:40.238064"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54390, "epoch": 0, "train_loss": 3.7590619921684265, "train_ppl": 42.90815890720787, "lr": 0.00056, "grad_norm": 0.7212, "tokens_per_sec": 151372, "dt_s": 4.329, "eta_s": 16710, "world_size": 1, "timestamp": "2026-05-05T03:07:44.567534"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54400, "epoch": 0, "train_loss": 3.673348382115364, "train_ppl": 39.3835565252452, "lr": 0.00056, "grad_norm": 0.6677, "tokens_per_sec": 150897, "dt_s": 4.343, "eta_s": 16701, "world_size": 1, "timestamp": "2026-05-05T03:07:48.910633"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54410, "epoch": 0, "train_loss": 3.7395097464323044, "train_ppl": 42.07735653339472, "lr": 0.00056, "grad_norm": 0.7298, "tokens_per_sec": 149376, "dt_s": 4.387, "eta_s": 16673, "world_size": 1, "timestamp": "2026-05-05T03:07:53.297947"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54420, "epoch": 0, "train_loss": 3.69712495803833, "train_ppl": 40.33118366871239, "lr": 0.00056, "grad_norm": 0.6816, "tokens_per_sec": 150777, "dt_s": 4.347, "eta_s": 16712, "world_size": 1, "timestamp": "2026-05-05T03:07:57.644500"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54430, "epoch": 0, "train_loss": 3.683472067117691, "train_ppl": 39.784288264182145, "lr": 0.00056, "grad_norm": 0.6486, "tokens_per_sec": 152622, "dt_s": 4.294, "eta_s": 16600, "world_size": 1, "timestamp": "2026-05-05T03:08:01.938509"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54440, "epoch": 0, "train_loss": 3.7938790023326874, "train_ppl": 44.42840434230367, "lr": 0.00056, "grad_norm": 0.6633, "tokens_per_sec": 146209, "dt_s": 4.482, "eta_s": 16713, "world_size": 1, "timestamp": "2026-05-05T03:08:06.420866"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54450, "epoch": 0, "train_loss": 3.744735077023506, "train_ppl": 42.29780007529335, "lr": 0.00056, "grad_norm": 0.6523, "tokens_per_sec": 151069, "dt_s": 4.338, "eta_s": 16705, "world_size": 1, "timestamp": "2026-05-05T03:08:10.759014"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54460, "epoch": 0, "train_loss": 3.7372576892375946, "train_ppl": 41.98270254298458, "lr": 0.00056, "grad_norm": 0.6489, "tokens_per_sec": 150890, "dt_s": 4.343, "eta_s": 16667, "world_size": 1, "timestamp": "2026-05-05T03:08:15.102324"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54470, "epoch": 0, "train_loss": 3.6105698198080063, "train_ppl": 36.98712280641937, "lr": 0.00056, "grad_norm": 0.6472, "tokens_per_sec": 148317, "dt_s": 4.419, "eta_s": 16717, "world_size": 1, "timestamp": "2026-05-05T03:08:19.520955"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54480, "epoch": 0, "train_loss": 3.6453021317720413, "train_ppl": 38.294341041029064, "lr": 0.00056, "grad_norm": 0.649, "tokens_per_sec": 150510, "dt_s": 4.354, "eta_s": 16759, "world_size": 1, "timestamp": "2026-05-05T03:08:23.875206"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54490, "epoch": 0, "train_loss": 3.593203589320183, "train_ppl": 36.350341158733734, "lr": 0.00056, "grad_norm": 0.725, "tokens_per_sec": 146955, "dt_s": 4.46, "eta_s": 16737, "world_size": 1, "timestamp": "2026-05-05T03:08:28.334804"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54500, "epoch": 0, "train_loss": 3.7730823159217834, "train_ppl": 43.51398218214212, "lr": 0.00056, "grad_norm": 0.6415, "tokens_per_sec": 149952, "dt_s": 4.37, "eta_s": 16758, "world_size": 1, "timestamp": "2026-05-05T03:08:32.705284"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54510, "epoch": 0, "train_loss": 3.6471952199935913, "train_ppl": 38.36690426963165, "lr": 0.00056, "grad_norm": 0.7375, "tokens_per_sec": 126785, "dt_s": 5.169, "eta_s": 16786, "world_size": 1, "timestamp": "2026-05-05T03:08:37.874354"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54520, "epoch": 0, "train_loss": 3.7459993958473206, "train_ppl": 42.351311800947286, "lr": 0.00056, "grad_norm": 0.6888, "tokens_per_sec": 147780, "dt_s": 4.435, "eta_s": 16794, "world_size": 1, "timestamp": "2026-05-05T03:08:42.309055"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54530, "epoch": 0, "train_loss": 3.948324427008629, "train_ppl": 51.84841820159946, "lr": 0.00056, "grad_norm": 0.7229, "tokens_per_sec": 149351, "dt_s": 4.388, "eta_s": 16815, "world_size": 1, "timestamp": "2026-05-05T03:08:46.697084"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54540, "epoch": 0, "train_loss": 3.719445914030075, "train_ppl": 41.24153641992486, "lr": 0.00056, "grad_norm": 0.6585, "tokens_per_sec": 148572, "dt_s": 4.411, "eta_s": 16774, "world_size": 1, "timestamp": "2026-05-05T03:08:51.108218"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54550, "epoch": 0, "train_loss": 3.7627220451831818, "train_ppl": 43.06549279311866, "lr": 0.00056, "grad_norm": 0.734, "tokens_per_sec": 148642, "dt_s": 4.409, "eta_s": 16799, "world_size": 1, "timestamp": "2026-05-05T03:08:55.517152"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54560, "epoch": 0, "train_loss": 3.785479113459587, "train_ppl": 44.05677369621147, "lr": 0.00056, "grad_norm": 0.6956, "tokens_per_sec": 150707, "dt_s": 4.349, "eta_s": 16766, "world_size": 1, "timestamp": "2026-05-05T03:08:59.865726"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54570, "epoch": 0, "train_loss": 4.5862473249435425, "train_ppl": 98.12550523132086, "lr": 0.00056, "grad_norm": 1.461, "tokens_per_sec": 150069, "dt_s": 4.367, "eta_s": 16710, "world_size": 1, "timestamp": "2026-05-05T03:09:04.232782"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54580, "epoch": 0, "train_loss": 3.739711210131645, "train_ppl": 42.08583444726769, "lr": 0.00056, "grad_norm": 0.6723, "tokens_per_sec": 133377, "dt_s": 4.914, "eta_s": 17106, "world_size": 1, "timestamp": "2026-05-05T03:09:09.146373"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54590, "epoch": 0, "train_loss": 3.68141832947731, "train_ppl": 39.70266561831755, "lr": 0.00056, "grad_norm": 0.6608, "tokens_per_sec": 151087, "dt_s": 4.338, "eta_s": 17046, "world_size": 1, "timestamp": "2026-05-05T03:09:13.484026"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54600, "epoch": 0, "train_loss": 3.7821635007858276, "train_ppl": 43.91094039594759, "lr": 0.00056, "grad_norm": 0.6611, "tokens_per_sec": 150680, "dt_s": 4.349, "eta_s": 16996, "world_size": 1, "timestamp": "2026-05-05T03:09:17.833373"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54610, "epoch": 0, "train_loss": 3.779877185821533, "train_ppl": 43.81066083483676, "lr": 0.00056, "grad_norm": 0.6541, "tokens_per_sec": 150804, "dt_s": 4.346, "eta_s": 16989, "world_size": 1, "timestamp": "2026-05-05T03:09:22.179141"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54620, "epoch": 0, "train_loss": 3.699677497148514, "train_ppl": 40.434262092273656, "lr": 0.00056, "grad_norm": 0.6261, "tokens_per_sec": 149984, "dt_s": 4.37, "eta_s": 16986, "world_size": 1, "timestamp": "2026-05-05T03:09:26.548687"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54630, "epoch": 0, "train_loss": 3.8155943900346756, "train_ppl": 45.40373589212856, "lr": 0.00056, "grad_norm": 0.6568, "tokens_per_sec": 147852, "dt_s": 4.433, "eta_s": 16616, "world_size": 1, "timestamp": "2026-05-05T03:09:30.981247"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54640, "epoch": 0, "train_loss": 3.6802867352962494, "train_ppl": 39.65776372308437, "lr": 0.00056, "grad_norm": 0.7985, "tokens_per_sec": 149182, "dt_s": 4.393, "eta_s": 16654, "world_size": 1, "timestamp": "2026-05-05T03:09:35.374244"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54650, "epoch": 0, "train_loss": 3.6748698949813843, "train_ppl": 39.44352472283121, "lr": 0.00056, "grad_norm": 0.6924, "tokens_per_sec": 149070, "dt_s": 4.396, "eta_s": 16685, "world_size": 1, "timestamp": "2026-05-05T03:09:39.770562"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54660, "epoch": 0, "train_loss": 3.70888751745224, "train_ppl": 40.80838265153855, "lr": 0.00056, "grad_norm": 0.7009, "tokens_per_sec": 148766, "dt_s": 4.405, "eta_s": 16726, "world_size": 1, "timestamp": "2026-05-05T03:09:44.175886"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54670, "epoch": 0, "train_loss": 3.698643073439598, "train_ppl": 40.39245755844105, "lr": 0.00056, "grad_norm": 0.6745, "tokens_per_sec": 149251, "dt_s": 4.391, "eta_s": 16738, "world_size": 1, "timestamp": "2026-05-05T03:09:48.566867"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54680, "epoch": 0, "train_loss": 3.658433422446251, "train_ppl": 38.8005112250684, "lr": 0.00056, "grad_norm": 0.7391, "tokens_per_sec": 148556, "dt_s": 4.412, "eta_s": 16717, "world_size": 1, "timestamp": "2026-05-05T03:09:52.978443"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54690, "epoch": 0, "train_loss": 3.6212927252054214, "train_ppl": 37.38586624865697, "lr": 0.00056, "grad_norm": 0.637, "tokens_per_sec": 148795, "dt_s": 4.404, "eta_s": 16722, "world_size": 1, "timestamp": "2026-05-05T03:09:57.382876"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54700, "epoch": 0, "train_loss": 3.8452266603708267, "train_ppl": 46.7692838928492, "lr": 0.00056, "grad_norm": 0.6436, "tokens_per_sec": 150545, "dt_s": 4.353, "eta_s": 16684, "world_size": 1, "timestamp": "2026-05-05T03:10:01.736135"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54710, "epoch": 0, "train_loss": 3.7313895523548126, "train_ppl": 41.737063723742025, "lr": 0.00056, "grad_norm": 0.7005, "tokens_per_sec": 149224, "dt_s": 4.392, "eta_s": 16670, "world_size": 1, "timestamp": "2026-05-05T03:10:06.127929"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54720, "epoch": 0, "train_loss": 3.7468525916337967, "train_ppl": 42.38746118078398, "lr": 0.00056, "grad_norm": 0.663, "tokens_per_sec": 149786, "dt_s": 4.375, "eta_s": 16653, "world_size": 1, "timestamp": "2026-05-05T03:10:10.503249"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54730, "epoch": 0, "train_loss": 3.7497230172157288, "train_ppl": 42.50930602332688, "lr": 0.00056, "grad_norm": 0.7028, "tokens_per_sec": 150994, "dt_s": 4.34, "eta_s": 16595, "world_size": 1, "timestamp": "2026-05-05T03:10:14.843544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54740, "epoch": 0, "train_loss": 3.7352322787046432, "train_ppl": 41.89775638950871, "lr": 0.00056, "grad_norm": 0.7428, "tokens_per_sec": 147644, "dt_s": 4.439, "eta_s": 16617, "world_size": 1, "timestamp": "2026-05-05T03:10:19.282356"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54750, "epoch": 0, "train_loss": 3.625285252928734, "train_ppl": 37.53542872360635, "lr": 0.00056, "grad_norm": 0.7084, "tokens_per_sec": 149503, "dt_s": 4.384, "eta_s": 16635, "world_size": 1, "timestamp": "2026-05-05T03:10:23.665938"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54760, "epoch": 0, "train_loss": 3.7747389376163483, "train_ppl": 43.58612813181492, "lr": 0.00056, "grad_norm": 0.6769, "tokens_per_sec": 146771, "dt_s": 4.465, "eta_s": 16687, "world_size": 1, "timestamp": "2026-05-05T03:10:28.131104"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54770, "epoch": 0, "train_loss": 3.662825122475624, "train_ppl": 38.97128615300491, "lr": 0.00056, "grad_norm": 0.6542, "tokens_per_sec": 147895, "dt_s": 4.431, "eta_s": 16725, "world_size": 1, "timestamp": "2026-05-05T03:10:32.562377"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54780, "epoch": 0, "train_loss": 3.6870032250881195, "train_ppl": 39.925021199663334, "lr": 0.00056, "grad_norm": 0.6954, "tokens_per_sec": 149884, "dt_s": 4.372, "eta_s": 16745, "world_size": 1, "timestamp": "2026-05-05T03:10:36.934795"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54790, "epoch": 0, "train_loss": 3.810641795396805, "train_ppl": 45.1794255108994, "lr": 0.00056, "grad_norm": 0.6986, "tokens_per_sec": 148111, "dt_s": 4.425, "eta_s": 16730, "world_size": 1, "timestamp": "2026-05-05T03:10:41.359583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54800, "epoch": 0, "train_loss": 3.730767622590065, "train_ppl": 41.71111427172379, "lr": 0.00056, "grad_norm": 0.6642, "tokens_per_sec": 148052, "dt_s": 4.427, "eta_s": 16758, "world_size": 1, "timestamp": "2026-05-05T03:10:45.786142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54810, "epoch": 0, "train_loss": 3.6846023350954056, "train_ppl": 39.82928059312781, "lr": 0.00056, "grad_norm": 0.6519, "tokens_per_sec": 148813, "dt_s": 4.404, "eta_s": 16707, "world_size": 1, "timestamp": "2026-05-05T03:10:50.190050"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54820, "epoch": 0, "train_loss": 3.7505708634853363, "train_ppl": 42.54536266294614, "lr": 0.00056, "grad_norm": 0.7055, "tokens_per_sec": 146757, "dt_s": 4.466, "eta_s": 16729, "world_size": 1, "timestamp": "2026-05-05T03:10:54.655675"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54830, "epoch": 0, "train_loss": 3.733465939760208, "train_ppl": 41.82381607176905, "lr": 0.00056, "grad_norm": 0.6279, "tokens_per_sec": 149517, "dt_s": 4.383, "eta_s": 16732, "world_size": 1, "timestamp": "2026-05-05T03:10:59.038819"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54840, "epoch": 0, "train_loss": 3.726244732737541, "train_ppl": 41.522885486154436, "lr": 0.00056, "grad_norm": 0.6879, "tokens_per_sec": 150114, "dt_s": 4.366, "eta_s": 16683, "world_size": 1, "timestamp": "2026-05-05T03:11:03.404593"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54850, "epoch": 0, "train_loss": 3.709204062819481, "train_ppl": 40.82130240074699, "lr": 0.00056, "grad_norm": 0.6288, "tokens_per_sec": 149855, "dt_s": 4.373, "eta_s": 16638, "world_size": 1, "timestamp": "2026-05-05T03:11:07.777875"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54860, "epoch": 0, "train_loss": 3.7202288210392, "train_ppl": 41.27383735051769, "lr": 0.00056, "grad_norm": 0.7208, "tokens_per_sec": 150233, "dt_s": 4.362, "eta_s": 16603, "world_size": 1, "timestamp": "2026-05-05T03:11:12.140143"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54870, "epoch": 0, "train_loss": 3.8014506697654724, "train_ppl": 44.76607820842088, "lr": 0.00056, "grad_norm": 0.6754, "tokens_per_sec": 134908, "dt_s": 4.858, "eta_s": 16895, "world_size": 1, "timestamp": "2026-05-05T03:11:16.998008"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54880, "epoch": 0, "train_loss": 3.8212232142686844, "train_ppl": 45.66002617081431, "lr": 0.00056, "grad_norm": 0.6422, "tokens_per_sec": 151393, "dt_s": 4.329, "eta_s": 16849, "world_size": 1, "timestamp": "2026-05-05T03:11:21.326867"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54890, "epoch": 0, "train_loss": 3.688207507133484, "train_ppl": 39.97313114901451, "lr": 0.00056, "grad_norm": 0.6749, "tokens_per_sec": 150893, "dt_s": 4.343, "eta_s": 16828, "world_size": 1, "timestamp": "2026-05-05T03:11:25.670065"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54900, "epoch": 0, "train_loss": 3.8054281175136566, "train_ppl": 44.944487517033174, "lr": 0.00056, "grad_norm": 0.6458, "tokens_per_sec": 150072, "dt_s": 4.367, "eta_s": 16819, "world_size": 1, "timestamp": "2026-05-05T03:11:30.037055"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54910, "epoch": 0, "train_loss": 3.68292897939682, "train_ppl": 39.762687771754294, "lr": 0.00056, "grad_norm": 0.666, "tokens_per_sec": 151884, "dt_s": 4.315, "eta_s": 16778, "world_size": 1, "timestamp": "2026-05-05T03:11:34.351891"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54920, "epoch": 0, "train_loss": 3.918596461415291, "train_ppl": 50.32975542835226, "lr": 0.00056, "grad_norm": 0.6973, "tokens_per_sec": 151447, "dt_s": 4.327, "eta_s": 16373, "world_size": 1, "timestamp": "2026-05-05T03:11:38.679242"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54930, "epoch": 0, "train_loss": 3.7683161944150925, "train_ppl": 43.30708270151331, "lr": 0.00056, "grad_norm": 0.7161, "tokens_per_sec": 149717, "dt_s": 4.377, "eta_s": 16405, "world_size": 1, "timestamp": "2026-05-05T03:11:43.056533"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54940, "epoch": 0, "train_loss": 3.7530626356601715, "train_ppl": 42.65150820413011, "lr": 0.00056, "grad_norm": 0.7344, "tokens_per_sec": 151456, "dt_s": 4.327, "eta_s": 16389, "world_size": 1, "timestamp": "2026-05-05T03:11:47.383595"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54950, "epoch": 0, "train_loss": 3.70582315325737, "train_ppl": 40.683522311375114, "lr": 0.00056, "grad_norm": 0.6435, "tokens_per_sec": 150039, "dt_s": 4.368, "eta_s": 16385, "world_size": 1, "timestamp": "2026-05-05T03:11:51.751544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54960, "epoch": 0, "train_loss": 3.771996572613716, "train_ppl": 43.466762805880066, "lr": 0.00056, "grad_norm": 0.6583, "tokens_per_sec": 147851, "dt_s": 4.433, "eta_s": 16470, "world_size": 1, "timestamp": "2026-05-05T03:11:56.184120"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54970, "epoch": 0, "train_loss": 3.7432350516319275, "train_ppl": 42.23439986402381, "lr": 0.00056, "grad_norm": 0.6903, "tokens_per_sec": 151696, "dt_s": 4.32, "eta_s": 16460, "world_size": 1, "timestamp": "2026-05-05T03:12:00.504352"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54980, "epoch": 0, "train_loss": 3.674758806824684, "train_ppl": 39.43914325774494, "lr": 0.00056, "grad_norm": 0.6796, "tokens_per_sec": 149491, "dt_s": 4.384, "eta_s": 16461, "world_size": 1, "timestamp": "2026-05-05T03:12:04.888284"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 54990, "epoch": 0, "train_loss": 3.7321954518556595, "train_ppl": 41.77071315977445, "lr": 0.00056, "grad_norm": 0.6184, "tokens_per_sec": 150069, "dt_s": 4.367, "eta_s": 16486, "world_size": 1, "timestamp": "2026-05-05T03:12:09.255352"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55000, "epoch": 0, "train_loss": 3.738383784890175, "train_ppl": 42.03000571074886, "lr": 0.00056, "grad_norm": 0.6718, "tokens_per_sec": 150928, "dt_s": 4.342, "eta_s": 16463, "world_size": 1, "timestamp": "2026-05-05T03:12:13.597536"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55010, "epoch": 0, "train_loss": 3.6958777755498886, "train_ppl": 40.28091467652201, "lr": 0.00056, "grad_norm": 0.6775, "tokens_per_sec": 125533, "dt_s": 5.221, "eta_s": 16478, "world_size": 1, "timestamp": "2026-05-05T03:12:18.818185"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55020, "epoch": 0, "train_loss": 3.7172820270061493, "train_ppl": 41.15239087964259, "lr": 0.00056, "grad_norm": 0.6567, "tokens_per_sec": 148672, "dt_s": 4.408, "eta_s": 16540, "world_size": 1, "timestamp": "2026-05-05T03:12:23.226279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55030, "epoch": 0, "train_loss": 3.7903326600790024, "train_ppl": 44.27112506263409, "lr": 0.00056, "grad_norm": 0.7221, "tokens_per_sec": 149921, "dt_s": 4.371, "eta_s": 16526, "world_size": 1, "timestamp": "2026-05-05T03:12:27.597639"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55040, "epoch": 0, "train_loss": 3.8076500445604324, "train_ppl": 45.04446191623074, "lr": 0.00056, "grad_norm": 0.6555, "tokens_per_sec": 148171, "dt_s": 4.423, "eta_s": 16564, "world_size": 1, "timestamp": "2026-05-05T03:12:32.020636"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55050, "epoch": 0, "train_loss": 3.6960369646549225, "train_ppl": 40.287327469689174, "lr": 0.00056, "grad_norm": 0.755, "tokens_per_sec": 152424, "dt_s": 4.3, "eta_s": 16528, "world_size": 1, "timestamp": "2026-05-05T03:12:36.320238"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55060, "epoch": 0, "train_loss": 3.7572815865278244, "train_ppl": 42.8318329448104, "lr": 0.00056, "grad_norm": 0.678, "tokens_per_sec": 149642, "dt_s": 4.38, "eta_s": 16463, "world_size": 1, "timestamp": "2026-05-05T03:12:40.699754"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55070, "epoch": 0, "train_loss": 3.6957957446575165, "train_ppl": 40.27761053266832, "lr": 0.00056, "grad_norm": 0.6736, "tokens_per_sec": 151049, "dt_s": 4.339, "eta_s": 16407, "world_size": 1, "timestamp": "2026-05-05T03:12:45.038471"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55080, "epoch": 0, "train_loss": 3.6892452985048294, "train_ppl": 40.01463645280456, "lr": 0.00056, "grad_norm": 0.6444, "tokens_per_sec": 151909, "dt_s": 4.314, "eta_s": 16359, "world_size": 1, "timestamp": "2026-05-05T03:12:49.352629"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55090, "epoch": 0, "train_loss": 3.6594765186309814, "train_ppl": 38.84100500607549, "lr": 0.00056, "grad_norm": 0.6607, "tokens_per_sec": 149318, "dt_s": 4.389, "eta_s": 16329, "world_size": 1, "timestamp": "2026-05-05T03:12:53.741678"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55100, "epoch": 0, "train_loss": 3.7092483192682266, "train_ppl": 40.82310904660199, "lr": 0.00056, "grad_norm": 0.6657, "tokens_per_sec": 149718, "dt_s": 4.377, "eta_s": 16383, "world_size": 1, "timestamp": "2026-05-05T03:12:58.118975"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55110, "epoch": 0, "train_loss": 3.7278466522693634, "train_ppl": 41.589455112789274, "lr": 0.00056, "grad_norm": 0.6673, "tokens_per_sec": 150002, "dt_s": 4.369, "eta_s": 16371, "world_size": 1, "timestamp": "2026-05-05T03:13:02.487974"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55120, "epoch": 0, "train_loss": 3.6870324462652206, "train_ppl": 39.92618787282427, "lr": 0.00056, "grad_norm": 0.7524, "tokens_per_sec": 150195, "dt_s": 4.363, "eta_s": 16385, "world_size": 1, "timestamp": "2026-05-05T03:13:06.851391"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55130, "epoch": 0, "train_loss": 3.7041584104299545, "train_ppl": 40.61585105266304, "lr": 0.00056, "grad_norm": 0.6531, "tokens_per_sec": 151238, "dt_s": 4.333, "eta_s": 16395, "world_size": 1, "timestamp": "2026-05-05T03:13:11.184679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55140, "epoch": 0, "train_loss": 3.6749150305986404, "train_ppl": 39.44530507084458, "lr": 0.00056, "grad_norm": 0.6715, "tokens_per_sec": 151621, "dt_s": 4.322, "eta_s": 16341, "world_size": 1, "timestamp": "2026-05-05T03:13:15.507032"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55150, "epoch": 0, "train_loss": 3.6749744564294815, "train_ppl": 39.44764921052173, "lr": 0.00056, "grad_norm": 0.6978, "tokens_per_sec": 149953, "dt_s": 4.37, "eta_s": 16331, "world_size": 1, "timestamp": "2026-05-05T03:13:19.877443"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55160, "epoch": 0, "train_loss": 3.902797043323517, "train_ppl": 49.540823325096014, "lr": 0.00056, "grad_norm": 0.7047, "tokens_per_sec": 151712, "dt_s": 4.32, "eta_s": 16290, "world_size": 1, "timestamp": "2026-05-05T03:13:24.197216"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55170, "epoch": 0, "train_loss": 3.6885145902633667, "train_ppl": 39.98540810816602, "lr": 0.00056, "grad_norm": 0.7662, "tokens_per_sec": 133938, "dt_s": 4.893, "eta_s": 16683, "world_size": 1, "timestamp": "2026-05-05T03:13:29.090220"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55180, "epoch": 0, "train_loss": 3.7819530963897705, "train_ppl": 43.901702312954, "lr": 0.00056, "grad_norm": 0.7044, "tokens_per_sec": 151601, "dt_s": 4.323, "eta_s": 16671, "world_size": 1, "timestamp": "2026-05-05T03:13:33.413138"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55190, "epoch": 0, "train_loss": 3.762403428554535, "train_ppl": 43.051773596692016, "lr": 0.00056, "grad_norm": 0.6728, "tokens_per_sec": 151248, "dt_s": 4.333, "eta_s": 16675, "world_size": 1, "timestamp": "2026-05-05T03:13:37.746167"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55200, "epoch": 0, "train_loss": 3.802583396434784, "train_ppl": 44.81681466893414, "lr": 0.00056, "grad_norm": 0.6774, "tokens_per_sec": 147969, "dt_s": 4.429, "eta_s": 16714, "world_size": 1, "timestamp": "2026-05-05T03:13:42.175186"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55210, "epoch": 0, "train_loss": 3.726147249341011, "train_ppl": 41.51883789153335, "lr": 0.00056, "grad_norm": 0.7075, "tokens_per_sec": 152833, "dt_s": 4.288, "eta_s": 16686, "world_size": 1, "timestamp": "2026-05-05T03:13:46.463279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55220, "epoch": 0, "train_loss": 3.617176726460457, "train_ppl": 37.232302321363605, "lr": 0.00056, "grad_norm": 0.7307, "tokens_per_sec": 151828, "dt_s": 4.316, "eta_s": 16249, "world_size": 1, "timestamp": "2026-05-05T03:13:50.779745"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55230, "epoch": 0, "train_loss": 3.799832507967949, "train_ppl": 44.69369802805409, "lr": 0.00056, "grad_norm": 0.7749, "tokens_per_sec": 149930, "dt_s": 4.371, "eta_s": 16281, "world_size": 1, "timestamp": "2026-05-05T03:13:55.150863"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55240, "epoch": 0, "train_loss": 3.761189430952072, "train_ppl": 42.99954055857043, "lr": 0.00056, "grad_norm": 0.6809, "tokens_per_sec": 151628, "dt_s": 4.322, "eta_s": 16268, "world_size": 1, "timestamp": "2026-05-05T03:13:59.473013"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55250, "epoch": 0, "train_loss": 3.746066689491272, "train_ppl": 42.354161870939215, "lr": 0.00056, "grad_norm": 0.671, "tokens_per_sec": 150466, "dt_s": 4.356, "eta_s": 16209, "world_size": 1, "timestamp": "2026-05-05T03:14:03.828566"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55260, "epoch": 0, "train_loss": 3.613657623529434, "train_ppl": 37.10150829094402, "lr": 0.00056, "grad_norm": 0.6771, "tokens_per_sec": 147630, "dt_s": 4.439, "eta_s": 16318, "world_size": 1, "timestamp": "2026-05-05T03:14:08.267755"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55270, "epoch": 0, "train_loss": 3.7072016298770905, "train_ppl": 40.73964226682588, "lr": 0.00056, "grad_norm": 0.6618, "tokens_per_sec": 149368, "dt_s": 4.388, "eta_s": 16367, "world_size": 1, "timestamp": "2026-05-05T03:14:12.655332"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55280, "epoch": 0, "train_loss": 3.6173180788755417, "train_ppl": 37.23756556919353, "lr": 0.00056, "grad_norm": 0.6325, "tokens_per_sec": 149218, "dt_s": 4.392, "eta_s": 16378, "world_size": 1, "timestamp": "2026-05-05T03:14:17.047293"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55290, "epoch": 0, "train_loss": 3.7289277017116547, "train_ppl": 41.6344396809343, "lr": 0.00056, "grad_norm": 0.6344, "tokens_per_sec": 149006, "dt_s": 4.398, "eta_s": 16431, "world_size": 1, "timestamp": "2026-05-05T03:14:21.445492"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55300, "epoch": 0, "train_loss": 3.76521173119545, "train_ppl": 43.172845930486666, "lr": 0.00056, "grad_norm": 0.6622, "tokens_per_sec": 150771, "dt_s": 4.347, "eta_s": 16420, "world_size": 1, "timestamp": "2026-05-05T03:14:25.792217"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55310, "epoch": 0, "train_loss": 3.6051988750696182, "train_ppl": 36.78899954461176, "lr": 0.00056, "grad_norm": 0.6431, "tokens_per_sec": 148693, "dt_s": 4.407, "eta_s": 16392, "world_size": 1, "timestamp": "2026-05-05T03:14:30.199693"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55320, "epoch": 0, "train_loss": 3.78227935731411, "train_ppl": 43.91602805976932, "lr": 0.00056, "grad_norm": 0.6548, "tokens_per_sec": 150278, "dt_s": 4.361, "eta_s": 16367, "world_size": 1, "timestamp": "2026-05-05T03:14:34.560669"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55330, "epoch": 0, "train_loss": 3.7078633904457092, "train_ppl": 40.76661107812146, "lr": 0.00056, "grad_norm": 0.7896, "tokens_per_sec": 151413, "dt_s": 4.328, "eta_s": 16315, "world_size": 1, "timestamp": "2026-05-05T03:14:38.888939"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55340, "epoch": 0, "train_loss": 3.813246339559555, "train_ppl": 45.29725069372924, "lr": 0.00056, "grad_norm": 0.7129, "tokens_per_sec": 146796, "dt_s": 4.464, "eta_s": 16360, "world_size": 1, "timestamp": "2026-05-05T03:14:43.353382"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55350, "epoch": 0, "train_loss": 3.705766037106514, "train_ppl": 40.68119869153617, "lr": 0.00056, "grad_norm": 0.8204, "tokens_per_sec": 151635, "dt_s": 4.322, "eta_s": 16338, "world_size": 1, "timestamp": "2026-05-05T03:14:47.675339"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55360, "epoch": 0, "train_loss": 3.8124533891677856, "train_ppl": 45.26134645806573, "lr": 0.00056, "grad_norm": 0.6471, "tokens_per_sec": 151268, "dt_s": 4.332, "eta_s": 16277, "world_size": 1, "timestamp": "2026-05-05T03:14:52.007759"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55370, "epoch": 0, "train_loss": 3.7284498065710068, "train_ppl": 41.61454753808497, "lr": 0.00056, "grad_norm": 0.7095, "tokens_per_sec": 148044, "dt_s": 4.427, "eta_s": 16322, "world_size": 1, "timestamp": "2026-05-05T03:14:56.434539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55380, "epoch": 0, "train_loss": 3.6375339925289154, "train_ppl": 37.99801769820214, "lr": 0.00056, "grad_norm": 0.6872, "tokens_per_sec": 151778, "dt_s": 4.318, "eta_s": 16310, "world_size": 1, "timestamp": "2026-05-05T03:15:00.752409"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55390, "epoch": 0, "train_loss": 3.6778743118047714, "train_ppl": 39.56220770938838, "lr": 0.00056, "grad_norm": 0.6828, "tokens_per_sec": 152195, "dt_s": 4.306, "eta_s": 16187, "world_size": 1, "timestamp": "2026-05-05T03:15:05.058475"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55400, "epoch": 0, "train_loss": 3.747246667742729, "train_ppl": 42.40416835828717, "lr": 0.00056, "grad_norm": 0.6705, "tokens_per_sec": 148243, "dt_s": 4.421, "eta_s": 16257, "world_size": 1, "timestamp": "2026-05-05T03:15:09.479302"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55410, "epoch": 0, "train_loss": 3.717720940709114, "train_ppl": 41.17045719239551, "lr": 0.00056, "grad_norm": 0.6473, "tokens_per_sec": 150276, "dt_s": 4.361, "eta_s": 16274, "world_size": 1, "timestamp": "2026-05-05T03:15:13.840356"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55420, "epoch": 0, "train_loss": 3.6862384527921677, "train_ppl": 39.89449932216534, "lr": 0.00056, "grad_norm": 0.6302, "tokens_per_sec": 149537, "dt_s": 4.383, "eta_s": 16236, "world_size": 1, "timestamp": "2026-05-05T03:15:18.222945"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55430, "epoch": 0, "train_loss": 3.8220224380493164, "train_ppl": 45.69653333630845, "lr": 0.00056, "grad_norm": 0.6587, "tokens_per_sec": 150109, "dt_s": 4.366, "eta_s": 16268, "world_size": 1, "timestamp": "2026-05-05T03:15:22.588837"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55440, "epoch": 0, "train_loss": 3.7611931413412094, "train_ppl": 42.99970010389461, "lr": 0.00056, "grad_norm": 0.654, "tokens_per_sec": 151832, "dt_s": 4.316, "eta_s": 16271, "world_size": 1, "timestamp": "2026-05-05T03:15:26.905218"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55450, "epoch": 0, "train_loss": 3.7260534316301346, "train_ppl": 41.51494287191789, "lr": 0.00056, "grad_norm": 0.649, "tokens_per_sec": 149777, "dt_s": 4.376, "eta_s": 16233, "world_size": 1, "timestamp": "2026-05-05T03:15:31.280788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55460, "epoch": 0, "train_loss": 3.8395031541585922, "train_ppl": 46.50236419288472, "lr": 0.00056, "grad_norm": 0.6855, "tokens_per_sec": 136942, "dt_s": 4.786, "eta_s": 16545, "world_size": 1, "timestamp": "2026-05-05T03:15:36.066464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55470, "epoch": 0, "train_loss": 3.674711987376213, "train_ppl": 39.43729678203527, "lr": 0.00056, "grad_norm": 0.6336, "tokens_per_sec": 152551, "dt_s": 4.296, "eta_s": 16476, "world_size": 1, "timestamp": "2026-05-05T03:15:40.362480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55480, "epoch": 0, "train_loss": 3.772668182849884, "train_ppl": 43.4959653339743, "lr": 0.00056, "grad_norm": 0.7488, "tokens_per_sec": 147944, "dt_s": 4.43, "eta_s": 16519, "world_size": 1, "timestamp": "2026-05-05T03:15:44.792303"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55490, "epoch": 0, "train_loss": 3.9016366600990295, "train_ppl": 49.483370324977706, "lr": 0.00056, "grad_norm": 0.8004, "tokens_per_sec": 150882, "dt_s": 4.344, "eta_s": 16535, "world_size": 1, "timestamp": "2026-05-05T03:15:49.135826"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55500, "epoch": 0, "train_loss": 3.805101901292801, "train_ppl": 44.92982828733633, "lr": 0.00056, "grad_norm": 0.7036, "tokens_per_sec": 150999, "dt_s": 4.34, "eta_s": 16504, "world_size": 1, "timestamp": "2026-05-05T03:15:53.475988"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55510, "epoch": 0, "train_loss": 3.7623658925294876, "train_ppl": 43.05015763456854, "lr": 0.00056, "grad_norm": 0.6597, "tokens_per_sec": 126671, "dt_s": 5.174, "eta_s": 16209, "world_size": 1, "timestamp": "2026-05-05T03:15:58.649723"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55520, "epoch": 0, "train_loss": 3.8073402792215347, "train_ppl": 45.03051086410754, "lr": 0.00056, "grad_norm": 0.637, "tokens_per_sec": 148433, "dt_s": 4.415, "eta_s": 16294, "world_size": 1, "timestamp": "2026-05-05T03:16:03.064909"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55530, "epoch": 0, "train_loss": 3.6639940589666367, "train_ppl": 39.016867747295024, "lr": 0.00056, "grad_norm": 0.6899, "tokens_per_sec": 149558, "dt_s": 4.382, "eta_s": 16254, "world_size": 1, "timestamp": "2026-05-05T03:16:07.446903"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55540, "epoch": 0, "train_loss": 3.8099458813667297, "train_ppl": 45.14799545239343, "lr": 0.00056, "grad_norm": 0.7539, "tokens_per_sec": 151636, "dt_s": 4.322, "eta_s": 16233, "world_size": 1, "timestamp": "2026-05-05T03:16:11.768832"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55550, "epoch": 0, "train_loss": 3.6880583316087723, "train_ppl": 39.96716858094666, "lr": 0.00056, "grad_norm": 0.6328, "tokens_per_sec": 152193, "dt_s": 4.306, "eta_s": 16204, "world_size": 1, "timestamp": "2026-05-05T03:16:16.074909"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55560, "epoch": 0, "train_loss": 3.8008121252059937, "train_ppl": 44.737502197230896, "lr": 0.00056, "grad_norm": 0.6992, "tokens_per_sec": 147671, "dt_s": 4.438, "eta_s": 16231, "world_size": 1, "timestamp": "2026-05-05T03:16:20.512900"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55570, "epoch": 0, "train_loss": 3.6815192252397537, "train_ppl": 39.70667165112862, "lr": 0.00056, "grad_norm": 0.7043, "tokens_per_sec": 152446, "dt_s": 4.299, "eta_s": 16140, "world_size": 1, "timestamp": "2026-05-05T03:16:24.811876"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55580, "epoch": 0, "train_loss": 3.759841486811638, "train_ppl": 42.94161862636782, "lr": 0.00056, "grad_norm": 0.6643, "tokens_per_sec": 152601, "dt_s": 4.295, "eta_s": 16071, "world_size": 1, "timestamp": "2026-05-05T03:16:29.106523"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55590, "epoch": 0, "train_loss": 3.7799712419509888, "train_ppl": 43.81478168981658, "lr": 0.00056, "grad_norm": 0.6483, "tokens_per_sec": 146561, "dt_s": 4.472, "eta_s": 16178, "world_size": 1, "timestamp": "2026-05-05T03:16:33.578050"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55600, "epoch": 0, "train_loss": 3.6837418377399445, "train_ppl": 39.79502234418771, "lr": 0.00056, "grad_norm": 0.6477, "tokens_per_sec": 151327, "dt_s": 4.331, "eta_s": 16191, "world_size": 1, "timestamp": "2026-05-05T03:16:37.908803"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55610, "epoch": 0, "train_loss": 3.784372255206108, "train_ppl": 44.008036070421696, "lr": 0.00056, "grad_norm": 0.7048, "tokens_per_sec": 151134, "dt_s": 4.336, "eta_s": 16112, "world_size": 1, "timestamp": "2026-05-05T03:16:42.245068"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55620, "epoch": 0, "train_loss": 3.795572355389595, "train_ppl": 44.503701050573376, "lr": 0.00056, "grad_norm": 0.7322, "tokens_per_sec": 149769, "dt_s": 4.376, "eta_s": 16164, "world_size": 1, "timestamp": "2026-05-05T03:16:46.620885"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55630, "epoch": 0, "train_loss": 3.756284222006798, "train_ppl": 42.78913529036295, "lr": 0.00056, "grad_norm": 0.7642, "tokens_per_sec": 151745, "dt_s": 4.319, "eta_s": 16178, "world_size": 1, "timestamp": "2026-05-05T03:16:50.939724"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55640, "epoch": 0, "train_loss": 3.695280998945236, "train_ppl": 40.25688314047153, "lr": 0.00056, "grad_norm": 0.7014, "tokens_per_sec": 151088, "dt_s": 4.338, "eta_s": 16074, "world_size": 1, "timestamp": "2026-05-05T03:16:55.277320"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55650, "epoch": 0, "train_loss": 3.748766303062439, "train_ppl": 42.46865621683101, "lr": 0.00056, "grad_norm": 0.6779, "tokens_per_sec": 150947, "dt_s": 4.342, "eta_s": 16078, "world_size": 1, "timestamp": "2026-05-05T03:16:59.618949"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55660, "epoch": 0, "train_loss": 3.698916330933571, "train_ppl": 40.40349660835174, "lr": 0.00056, "grad_norm": 0.6232, "tokens_per_sec": 151331, "dt_s": 4.331, "eta_s": 16070, "world_size": 1, "timestamp": "2026-05-05T03:17:03.949638"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55670, "epoch": 0, "train_loss": 3.7022385746240616, "train_ppl": 40.53795008998114, "lr": 0.00056, "grad_norm": 0.662, "tokens_per_sec": 150615, "dt_s": 4.351, "eta_s": 16047, "world_size": 1, "timestamp": "2026-05-05T03:17:08.300799"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55680, "epoch": 0, "train_loss": 3.813564956188202, "train_ppl": 45.311685450486905, "lr": 0.00056, "grad_norm": 0.6388, "tokens_per_sec": 150664, "dt_s": 4.35, "eta_s": 16066, "world_size": 1, "timestamp": "2026-05-05T03:17:12.650609"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55690, "epoch": 0, "train_loss": 3.80728879570961, "train_ppl": 45.02819259494132, "lr": 0.00056, "grad_norm": 0.6881, "tokens_per_sec": 152693, "dt_s": 4.292, "eta_s": 16028, "world_size": 1, "timestamp": "2026-05-05T03:17:16.942617"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55700, "epoch": 0, "train_loss": 3.7187233567237854, "train_ppl": 41.211747809747024, "lr": 0.00056, "grad_norm": 0.6313, "tokens_per_sec": 147453, "dt_s": 4.445, "eta_s": 16099, "world_size": 1, "timestamp": "2026-05-05T03:17:21.387140"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55710, "epoch": 0, "train_loss": 3.6789604276418686, "train_ppl": 39.60520019291436, "lr": 0.00056, "grad_norm": 0.7248, "tokens_per_sec": 149761, "dt_s": 4.376, "eta_s": 16128, "world_size": 1, "timestamp": "2026-05-05T03:17:25.763195"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55720, "epoch": 0, "train_loss": 3.75953508913517, "train_ppl": 42.928463429660695, "lr": 0.00056, "grad_norm": 0.642, "tokens_per_sec": 150059, "dt_s": 4.367, "eta_s": 16136, "world_size": 1, "timestamp": "2026-05-05T03:17:30.130522"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55730, "epoch": 0, "train_loss": 3.770478531718254, "train_ppl": 43.400828540464886, "lr": 0.00056, "grad_norm": 0.6713, "tokens_per_sec": 148737, "dt_s": 4.406, "eta_s": 16173, "world_size": 1, "timestamp": "2026-05-05T03:17:34.536719"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55740, "epoch": 0, "train_loss": 3.7170199751853943, "train_ppl": 41.141608233551885, "lr": 0.00056, "grad_norm": 0.7004, "tokens_per_sec": 151173, "dt_s": 4.335, "eta_s": 16201, "world_size": 1, "timestamp": "2026-05-05T03:17:38.871857"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55750, "epoch": 0, "train_loss": 3.7108342945575714, "train_ppl": 40.88790485747975, "lr": 0.00056, "grad_norm": 0.6409, "tokens_per_sec": 148781, "dt_s": 4.405, "eta_s": 16167, "world_size": 1, "timestamp": "2026-05-05T03:17:43.276747"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55760, "epoch": 0, "train_loss": 3.7102861553430557, "train_ppl": 40.86549873482622, "lr": 0.00056, "grad_norm": 0.6433, "tokens_per_sec": 134108, "dt_s": 4.887, "eta_s": 16540, "world_size": 1, "timestamp": "2026-05-05T03:17:48.163547"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55770, "epoch": 0, "train_loss": 3.597736954689026, "train_ppl": 36.515504626799896, "lr": 0.00056, "grad_norm": 0.6523, "tokens_per_sec": 151419, "dt_s": 4.328, "eta_s": 16507, "world_size": 1, "timestamp": "2026-05-05T03:17:52.491670"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55780, "epoch": 0, "train_loss": 3.7041567116975784, "train_ppl": 40.615782057260475, "lr": 0.00056, "grad_norm": 0.654, "tokens_per_sec": 147590, "dt_s": 4.44, "eta_s": 16527, "world_size": 1, "timestamp": "2026-05-05T03:17:56.932083"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55790, "epoch": 0, "train_loss": 3.7701049894094467, "train_ppl": 43.38461952233323, "lr": 0.00056, "grad_norm": 0.695, "tokens_per_sec": 149207, "dt_s": 4.392, "eta_s": 16565, "world_size": 1, "timestamp": "2026-05-05T03:18:01.324392"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55800, "epoch": 0, "train_loss": 3.853444203734398, "train_ppl": 47.1551959642585, "lr": 0.00056, "grad_norm": 0.6973, "tokens_per_sec": 150904, "dt_s": 4.343, "eta_s": 16515, "world_size": 1, "timestamp": "2026-05-05T03:18:05.667272"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55810, "epoch": 0, "train_loss": 3.758425399661064, "train_ppl": 42.880852587164355, "lr": 0.00056, "grad_norm": 0.6388, "tokens_per_sec": 148356, "dt_s": 4.417, "eta_s": 16164, "world_size": 1, "timestamp": "2026-05-05T03:18:10.084766"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55820, "epoch": 0, "train_loss": 3.666981890797615, "train_ppl": 39.133617914775066, "lr": 0.00056, "grad_norm": 0.6833, "tokens_per_sec": 150134, "dt_s": 4.365, "eta_s": 16187, "world_size": 1, "timestamp": "2026-05-05T03:18:14.449921"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55830, "epoch": 0, "train_loss": 3.685837060213089, "train_ppl": 39.87848917958253, "lr": 0.00056, "grad_norm": 0.6936, "tokens_per_sec": 151229, "dt_s": 4.334, "eta_s": 16104, "world_size": 1, "timestamp": "2026-05-05T03:18:18.783491"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55840, "epoch": 0, "train_loss": 3.76493076980114, "train_ppl": 43.160717731355355, "lr": 0.00056, "grad_norm": 0.647, "tokens_per_sec": 150536, "dt_s": 4.354, "eta_s": 16071, "world_size": 1, "timestamp": "2026-05-05T03:18:23.136996"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55850, "epoch": 0, "train_loss": 3.7923286706209183, "train_ppl": 44.35957894303335, "lr": 0.00056, "grad_norm": 0.6695, "tokens_per_sec": 150788, "dt_s": 4.346, "eta_s": 16069, "world_size": 1, "timestamp": "2026-05-05T03:18:27.483256"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55860, "epoch": 0, "train_loss": 3.8317057341337204, "train_ppl": 46.141175726227026, "lr": 0.00056, "grad_norm": 0.7207, "tokens_per_sec": 148439, "dt_s": 4.415, "eta_s": 16063, "world_size": 1, "timestamp": "2026-05-05T03:18:31.898286"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55870, "epoch": 0, "train_loss": 3.694188207387924, "train_ppl": 40.21291478695024, "lr": 0.00056, "grad_norm": 0.7351, "tokens_per_sec": 151150, "dt_s": 4.336, "eta_s": 16037, "world_size": 1, "timestamp": "2026-05-05T03:18:36.234108"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55880, "epoch": 0, "train_loss": 3.679862454533577, "train_ppl": 39.64094126582091, "lr": 0.00056, "grad_norm": 0.6131, "tokens_per_sec": 151665, "dt_s": 4.321, "eta_s": 16024, "world_size": 1, "timestamp": "2026-05-05T03:18:40.555226"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55890, "epoch": 0, "train_loss": 3.7826589047908783, "train_ppl": 43.9326994409993, "lr": 0.00056, "grad_norm": 0.6582, "tokens_per_sec": 149029, "dt_s": 4.398, "eta_s": 16052, "world_size": 1, "timestamp": "2026-05-05T03:18:44.952738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55900, "epoch": 0, "train_loss": 3.6044126600027084, "train_ppl": 36.76008684616215, "lr": 0.00056, "grad_norm": 0.6716, "tokens_per_sec": 150195, "dt_s": 4.363, "eta_s": 16060, "world_size": 1, "timestamp": "2026-05-05T03:18:49.316158"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55910, "epoch": 0, "train_loss": 3.751722052693367, "train_ppl": 42.59436862745217, "lr": 0.00056, "grad_norm": 0.674, "tokens_per_sec": 151350, "dt_s": 4.33, "eta_s": 15993, "world_size": 1, "timestamp": "2026-05-05T03:18:53.646229"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55920, "epoch": 0, "train_loss": 3.623834788799286, "train_ppl": 37.481024395960375, "lr": 0.00056, "grad_norm": 0.6572, "tokens_per_sec": 149566, "dt_s": 4.382, "eta_s": 16022, "world_size": 1, "timestamp": "2026-05-05T03:18:58.027961"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55930, "epoch": 0, "train_loss": 3.795994058251381, "train_ppl": 44.52247234634286, "lr": 0.00056, "grad_norm": 0.6804, "tokens_per_sec": 150703, "dt_s": 4.349, "eta_s": 16038, "world_size": 1, "timestamp": "2026-05-05T03:19:02.376659"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55940, "epoch": 0, "train_loss": 3.8538738191127777, "train_ppl": 47.175458913941, "lr": 0.00056, "grad_norm": 0.7178, "tokens_per_sec": 150282, "dt_s": 4.361, "eta_s": 16007, "world_size": 1, "timestamp": "2026-05-05T03:19:06.737505"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55950, "epoch": 0, "train_loss": 3.806074470281601, "train_ppl": 44.97354690123868, "lr": 0.00056, "grad_norm": 0.6821, "tokens_per_sec": 150997, "dt_s": 4.34, "eta_s": 15986, "world_size": 1, "timestamp": "2026-05-05T03:19:11.077755"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55960, "epoch": 0, "train_loss": 3.789970263838768, "train_ppl": 44.255084280095645, "lr": 0.00056, "grad_norm": 0.6516, "tokens_per_sec": 150366, "dt_s": 4.358, "eta_s": 16002, "world_size": 1, "timestamp": "2026-05-05T03:19:15.436170"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55970, "epoch": 0, "train_loss": 3.7107312232255936, "train_ppl": 40.88369070384725, "lr": 0.00056, "grad_norm": 0.6642, "tokens_per_sec": 148915, "dt_s": 4.401, "eta_s": 16012, "world_size": 1, "timestamp": "2026-05-05T03:19:19.837076"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55980, "epoch": 0, "train_loss": 3.757756784558296, "train_ppl": 42.85219138422933, "lr": 0.00056, "grad_norm": 0.7345, "tokens_per_sec": 149562, "dt_s": 4.382, "eta_s": 16032, "world_size": 1, "timestamp": "2026-05-05T03:19:24.218936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 55990, "epoch": 0, "train_loss": 3.7440859228372574, "train_ppl": 42.27035119154817, "lr": 0.00056, "grad_norm": 0.6414, "tokens_per_sec": 148582, "dt_s": 4.411, "eta_s": 16064, "world_size": 1, "timestamp": "2026-05-05T03:19:28.629694"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56000, "epoch": 0, "train_loss": 3.8083560466766357, "train_ppl": 45.07627463026716, "lr": 0.00056, "grad_norm": 0.6373, "tokens_per_sec": 148958, "dt_s": 4.4, "eta_s": 16103, "world_size": 1, "timestamp": "2026-05-05T03:19:33.029330"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56010, "epoch": 0, "train_loss": 3.7106912583112717, "train_ppl": 40.882056823300275, "lr": 0.00056, "grad_norm": 0.6861, "tokens_per_sec": 127391, "dt_s": 5.144, "eta_s": 16115, "world_size": 1, "timestamp": "2026-05-05T03:19:38.173823"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56020, "epoch": 0, "train_loss": 3.637865051627159, "train_ppl": 38.010599370199934, "lr": 0.00056, "grad_norm": 0.6106, "tokens_per_sec": 148407, "dt_s": 4.416, "eta_s": 16122, "world_size": 1, "timestamp": "2026-05-05T03:19:42.589752"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56030, "epoch": 0, "train_loss": 3.7696522772312164, "train_ppl": 43.364983221859134, "lr": 0.00056, "grad_norm": 0.668, "tokens_per_sec": 148504, "dt_s": 4.413, "eta_s": 16140, "world_size": 1, "timestamp": "2026-05-05T03:19:47.002844"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56040, "epoch": 0, "train_loss": 3.7236740589141846, "train_ppl": 41.41628077304455, "lr": 0.00056, "grad_norm": 0.6409, "tokens_per_sec": 150599, "dt_s": 4.352, "eta_s": 16092, "world_size": 1, "timestamp": "2026-05-05T03:19:51.354538"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56050, "epoch": 0, "train_loss": 3.8006340712308884, "train_ppl": 44.7295372162477, "lr": 0.00056, "grad_norm": 0.6698, "tokens_per_sec": 149526, "dt_s": 4.383, "eta_s": 16076, "world_size": 1, "timestamp": "2026-05-05T03:19:55.737454"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56060, "epoch": 0, "train_loss": 3.7543157637119293, "train_ppl": 42.70498950797292, "lr": 0.00056, "grad_norm": 0.6689, "tokens_per_sec": 135574, "dt_s": 4.834, "eta_s": 16404, "world_size": 1, "timestamp": "2026-05-05T03:20:00.571432"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56070, "epoch": 0, "train_loss": 3.8497407138347626, "train_ppl": 46.980880159940426, "lr": 0.00056, "grad_norm": 0.7109, "tokens_per_sec": 149032, "dt_s": 4.397, "eta_s": 16386, "world_size": 1, "timestamp": "2026-05-05T03:20:04.968883"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56080, "epoch": 0, "train_loss": 3.828649178147316, "train_ppl": 46.000357957670914, "lr": 0.00056, "grad_norm": 0.7539, "tokens_per_sec": 148988, "dt_s": 4.399, "eta_s": 16371, "world_size": 1, "timestamp": "2026-05-05T03:20:09.367618"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56090, "epoch": 0, "train_loss": 3.758015289902687, "train_ppl": 42.86327033664352, "lr": 0.00056, "grad_norm": 0.6714, "tokens_per_sec": 150670, "dt_s": 4.35, "eta_s": 16365, "world_size": 1, "timestamp": "2026-05-05T03:20:13.717289"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56100, "epoch": 0, "train_loss": 3.698720633983612, "train_ppl": 40.395590540919684, "lr": 0.00056, "grad_norm": 0.7379, "tokens_per_sec": 150918, "dt_s": 4.342, "eta_s": 16331, "world_size": 1, "timestamp": "2026-05-05T03:20:18.059772"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56110, "epoch": 0, "train_loss": 3.729202076792717, "train_ppl": 41.64586470099547, "lr": 0.00056, "grad_norm": 0.6532, "tokens_per_sec": 148920, "dt_s": 4.401, "eta_s": 16009, "world_size": 1, "timestamp": "2026-05-05T03:20:22.460538"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56120, "epoch": 0, "train_loss": 3.670476198196411, "train_ppl": 39.270601998261164, "lr": 0.00056, "grad_norm": 0.6506, "tokens_per_sec": 151255, "dt_s": 4.333, "eta_s": 15958, "world_size": 1, "timestamp": "2026-05-05T03:20:26.793352"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56130, "epoch": 0, "train_loss": 3.693237602710724, "train_ppl": 40.17470636549923, "lr": 0.00056, "grad_norm": 0.6698, "tokens_per_sec": 150846, "dt_s": 4.345, "eta_s": 15914, "world_size": 1, "timestamp": "2026-05-05T03:20:31.137881"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56140, "epoch": 0, "train_loss": 3.817736253142357, "train_ppl": 45.501088699962786, "lr": 0.00056, "grad_norm": 0.8522, "tokens_per_sec": 150555, "dt_s": 4.353, "eta_s": 15912, "world_size": 1, "timestamp": "2026-05-05T03:20:35.490840"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56150, "epoch": 0, "train_loss": 3.7915512174367905, "train_ppl": 44.32510484987071, "lr": 0.00056, "grad_norm": 0.7261, "tokens_per_sec": 152089, "dt_s": 4.309, "eta_s": 15883, "world_size": 1, "timestamp": "2026-05-05T03:20:39.799933"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56160, "epoch": 0, "train_loss": 3.7083365321159363, "train_ppl": 40.785904024364946, "lr": 0.00056, "grad_norm": 0.7419, "tokens_per_sec": 149065, "dt_s": 4.396, "eta_s": 15875, "world_size": 1, "timestamp": "2026-05-05T03:20:44.196388"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56170, "epoch": 0, "train_loss": 3.7004698514938354, "train_ppl": 40.46631305172397, "lr": 0.00056, "grad_norm": 0.659, "tokens_per_sec": 151466, "dt_s": 4.327, "eta_s": 15867, "world_size": 1, "timestamp": "2026-05-05T03:20:48.523150"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56180, "epoch": 0, "train_loss": 3.808455526828766, "train_ppl": 45.080759047976414, "lr": 0.00056, "grad_norm": 0.796, "tokens_per_sec": 152557, "dt_s": 4.296, "eta_s": 15827, "world_size": 1, "timestamp": "2026-05-05T03:20:52.818993"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56190, "epoch": 0, "train_loss": 3.6893480867147446, "train_ppl": 40.01874969704883, "lr": 0.00056, "grad_norm": 0.7187, "tokens_per_sec": 149632, "dt_s": 4.38, "eta_s": 15842, "world_size": 1, "timestamp": "2026-05-05T03:20:57.198813"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56200, "epoch": 0, "train_loss": 3.7062842547893524, "train_ppl": 40.702285871460035, "lr": 0.00056, "grad_norm": 0.6806, "tokens_per_sec": 152570, "dt_s": 4.295, "eta_s": 15828, "world_size": 1, "timestamp": "2026-05-05T03:21:01.494289"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56210, "epoch": 0, "train_loss": 3.7569224387407303, "train_ppl": 42.81645274883808, "lr": 0.00056, "grad_norm": 0.6624, "tokens_per_sec": 150852, "dt_s": 4.344, "eta_s": 15786, "world_size": 1, "timestamp": "2026-05-05T03:21:05.838679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56220, "epoch": 0, "train_loss": 3.7017086148262024, "train_ppl": 40.516472297831704, "lr": 0.00056, "grad_norm": 0.6735, "tokens_per_sec": 149018, "dt_s": 4.398, "eta_s": 15833, "world_size": 1, "timestamp": "2026-05-05T03:21:10.236527"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56230, "epoch": 0, "train_loss": 3.6646656841039658, "train_ppl": 39.04308125829505, "lr": 0.00056, "grad_norm": 0.6462, "tokens_per_sec": 149084, "dt_s": 4.396, "eta_s": 15902, "world_size": 1, "timestamp": "2026-05-05T03:21:14.632447"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56240, "epoch": 0, "train_loss": 3.824215441942215, "train_ppl": 45.79685597556965, "lr": 0.00056, "grad_norm": 0.7347, "tokens_per_sec": 148572, "dt_s": 4.411, "eta_s": 15920, "world_size": 1, "timestamp": "2026-05-05T03:21:19.043493"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56250, "epoch": 0, "train_loss": 3.673107370734215, "train_ppl": 39.37406578362679, "lr": 0.00056, "grad_norm": 0.7825, "tokens_per_sec": 149739, "dt_s": 4.377, "eta_s": 15975, "world_size": 1, "timestamp": "2026-05-05T03:21:23.420168"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56260, "epoch": 0, "train_loss": 3.6378515362739563, "train_ppl": 38.01008564699558, "lr": 0.00056, "grad_norm": 0.6367, "tokens_per_sec": 151584, "dt_s": 4.323, "eta_s": 15955, "world_size": 1, "timestamp": "2026-05-05T03:21:27.743597"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56270, "epoch": 0, "train_loss": 3.7739449739456177, "train_ppl": 43.551536063770286, "lr": 0.00056, "grad_norm": 0.9426, "tokens_per_sec": 150902, "dt_s": 4.343, "eta_s": 15911, "world_size": 1, "timestamp": "2026-05-05T03:21:32.086548"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56280, "epoch": 0, "train_loss": 3.7145515084266663, "train_ppl": 41.04017678281355, "lr": 0.00056, "grad_norm": 0.6597, "tokens_per_sec": 150329, "dt_s": 4.36, "eta_s": 15880, "world_size": 1, "timestamp": "2026-05-05T03:21:36.446048"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56290, "epoch": 0, "train_loss": 3.711538851261139, "train_ppl": 40.91672285570673, "lr": 0.00056, "grad_norm": 0.6944, "tokens_per_sec": 150347, "dt_s": 4.359, "eta_s": 15838, "world_size": 1, "timestamp": "2026-05-05T03:21:40.805021"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56300, "epoch": 0, "train_loss": 3.7382004410028458, "train_ppl": 42.02230047249314, "lr": 0.00056, "grad_norm": 0.6775, "tokens_per_sec": 146493, "dt_s": 4.474, "eta_s": 15904, "world_size": 1, "timestamp": "2026-05-05T03:21:45.278691"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56310, "epoch": 0, "train_loss": 3.658028692007065, "train_ppl": 38.784810654583346, "lr": 0.00056, "grad_norm": 0.6744, "tokens_per_sec": 149680, "dt_s": 4.378, "eta_s": 15940, "world_size": 1, "timestamp": "2026-05-05T03:21:49.657091"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56320, "epoch": 0, "train_loss": 3.69363634288311, "train_ppl": 40.19072882902862, "lr": 0.00056, "grad_norm": 0.6694, "tokens_per_sec": 150473, "dt_s": 4.355, "eta_s": 15944, "world_size": 1, "timestamp": "2026-05-05T03:21:54.012448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56330, "epoch": 0, "train_loss": 3.7154413759708405, "train_ppl": 41.07671335808501, "lr": 0.00056, "grad_norm": 0.6401, "tokens_per_sec": 149478, "dt_s": 4.384, "eta_s": 15958, "world_size": 1, "timestamp": "2026-05-05T03:21:58.396756"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56340, "epoch": 0, "train_loss": 3.7702375799417496, "train_ppl": 43.39037229350252, "lr": 0.00056, "grad_norm": 0.6354, "tokens_per_sec": 151458, "dt_s": 4.327, "eta_s": 15930, "world_size": 1, "timestamp": "2026-05-05T03:22:02.723765"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56350, "epoch": 0, "train_loss": 3.8161340951919556, "train_ppl": 45.42824713637681, "lr": 0.00056, "grad_norm": 0.6348, "tokens_per_sec": 134561, "dt_s": 4.87, "eta_s": 16214, "world_size": 1, "timestamp": "2026-05-05T03:22:07.594107"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56360, "epoch": 0, "train_loss": 3.763495996594429, "train_ppl": 43.09883629350214, "lr": 0.00056, "grad_norm": 0.666, "tokens_per_sec": 150114, "dt_s": 4.366, "eta_s": 16200, "world_size": 1, "timestamp": "2026-05-05T03:22:11.959874"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56370, "epoch": 0, "train_loss": 3.7253932505846024, "train_ppl": 41.487544538452816, "lr": 0.00056, "grad_norm": 0.6452, "tokens_per_sec": 151590, "dt_s": 4.323, "eta_s": 16173, "world_size": 1, "timestamp": "2026-05-05T03:22:16.283126"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56380, "epoch": 0, "train_loss": 3.787190467119217, "train_ppl": 44.13223496915363, "lr": 0.00056, "grad_norm": 0.6602, "tokens_per_sec": 148869, "dt_s": 4.402, "eta_s": 16181, "world_size": 1, "timestamp": "2026-05-05T03:22:20.685358"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56390, "epoch": 0, "train_loss": 3.774079382419586, "train_ppl": 43.557390152682366, "lr": 0.00056, "grad_norm": 0.6538, "tokens_per_sec": 150947, "dt_s": 4.342, "eta_s": 16187, "world_size": 1, "timestamp": "2026-05-05T03:22:25.027009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56400, "epoch": 0, "train_loss": 3.7280503064393997, "train_ppl": 41.59792584127284, "lr": 0.00056, "grad_norm": 0.6804, "tokens_per_sec": 152090, "dt_s": 4.309, "eta_s": 15776, "world_size": 1, "timestamp": "2026-05-05T03:22:29.336033"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56410, "epoch": 0, "train_loss": 3.7575425058603287, "train_ppl": 42.8430100561711, "lr": 0.00056, "grad_norm": 0.6576, "tokens_per_sec": 148546, "dt_s": 4.412, "eta_s": 15805, "world_size": 1, "timestamp": "2026-05-05T03:22:33.747890"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56420, "epoch": 0, "train_loss": 3.752431631088257, "train_ppl": 42.62460339687515, "lr": 0.00056, "grad_norm": 0.6689, "tokens_per_sec": 151329, "dt_s": 4.331, "eta_s": 15806, "world_size": 1, "timestamp": "2026-05-05T03:22:38.078590"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56430, "epoch": 0, "train_loss": 3.705478236079216, "train_ppl": 40.66949228539969, "lr": 0.00056, "grad_norm": 0.6604, "tokens_per_sec": 152413, "dt_s": 4.3, "eta_s": 15727, "world_size": 1, "timestamp": "2026-05-05T03:22:42.378462"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56440, "epoch": 0, "train_loss": 3.7548112869262695, "train_ppl": 42.72615606546947, "lr": 0.00056, "grad_norm": 0.6574, "tokens_per_sec": 148415, "dt_s": 4.416, "eta_s": 15776, "world_size": 1, "timestamp": "2026-05-05T03:22:46.794208"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56450, "epoch": 0, "train_loss": 3.7265231758356094, "train_ppl": 41.5344488568256, "lr": 0.00056, "grad_norm": 0.6369, "tokens_per_sec": 151366, "dt_s": 4.33, "eta_s": 15787, "world_size": 1, "timestamp": "2026-05-05T03:22:51.123862"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56460, "epoch": 0, "train_loss": 3.6837316155433655, "train_ppl": 39.79461555372559, "lr": 0.00056, "grad_norm": 0.6761, "tokens_per_sec": 149078, "dt_s": 4.396, "eta_s": 15771, "world_size": 1, "timestamp": "2026-05-05T03:22:55.519920"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56470, "epoch": 0, "train_loss": 3.6778090447187424, "train_ppl": 39.55962568363588, "lr": 0.00056, "grad_norm": 0.6412, "tokens_per_sec": 148439, "dt_s": 4.415, "eta_s": 15828, "world_size": 1, "timestamp": "2026-05-05T03:22:59.934934"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56480, "epoch": 0, "train_loss": 3.7794100642204285, "train_ppl": 43.790200707857814, "lr": 0.00056, "grad_norm": 2.6938, "tokens_per_sec": 151474, "dt_s": 4.327, "eta_s": 15843, "world_size": 1, "timestamp": "2026-05-05T03:23:04.261506"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56490, "epoch": 0, "train_loss": 3.777219146490097, "train_ppl": 43.69436500311979, "lr": 0.00056, "grad_norm": 0.6733, "tokens_per_sec": 150213, "dt_s": 4.363, "eta_s": 15800, "world_size": 1, "timestamp": "2026-05-05T03:23:08.624384"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56500, "epoch": 0, "train_loss": 3.7503974437713623, "train_ppl": 42.537985098048296, "lr": 0.00056, "grad_norm": 0.7322, "tokens_per_sec": 150740, "dt_s": 4.348, "eta_s": 15809, "world_size": 1, "timestamp": "2026-05-05T03:23:12.972003"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56510, "epoch": 0, "train_loss": 3.6971020996570587, "train_ppl": 40.330261773675524, "lr": 0.00056, "grad_norm": 0.6939, "tokens_per_sec": 128552, "dt_s": 5.098, "eta_s": 15734, "world_size": 1, "timestamp": "2026-05-05T03:23:18.070042"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56520, "epoch": 0, "train_loss": 3.7192670553922653, "train_ppl": 41.23416067452716, "lr": 0.00056, "grad_norm": 0.6357, "tokens_per_sec": 147637, "dt_s": 4.439, "eta_s": 15747, "world_size": 1, "timestamp": "2026-05-05T03:23:22.509041"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56530, "epoch": 0, "train_loss": 3.8287118822336197, "train_ppl": 46.00324245852035, "lr": 0.00056, "grad_norm": 1.0168, "tokens_per_sec": 148126, "dt_s": 4.424, "eta_s": 15813, "world_size": 1, "timestamp": "2026-05-05T03:23:26.933392"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56540, "epoch": 0, "train_loss": 3.911089912056923, "train_ppl": 49.95336709125132, "lr": 0.00056, "grad_norm": 0.7009, "tokens_per_sec": 151656, "dt_s": 4.321, "eta_s": 15779, "world_size": 1, "timestamp": "2026-05-05T03:23:31.254727"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56550, "epoch": 0, "train_loss": 3.7869353890419006, "train_ppl": 44.12097923911604, "lr": 0.00056, "grad_norm": 0.9062, "tokens_per_sec": 149328, "dt_s": 4.389, "eta_s": 15804, "world_size": 1, "timestamp": "2026-05-05T03:23:35.643453"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56560, "epoch": 0, "train_loss": 3.6857909858226776, "train_ppl": 39.87665184483042, "lr": 0.00056, "grad_norm": 0.6514, "tokens_per_sec": 152763, "dt_s": 4.29, "eta_s": 15794, "world_size": 1, "timestamp": "2026-05-05T03:23:39.933501"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56570, "epoch": 0, "train_loss": 3.7651832550764084, "train_ppl": 43.171616552890626, "lr": 0.00056, "grad_norm": 0.6645, "tokens_per_sec": 151341, "dt_s": 4.33, "eta_s": 15711, "world_size": 1, "timestamp": "2026-05-05T03:23:44.263864"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56580, "epoch": 0, "train_loss": 3.7256796658039093, "train_ppl": 41.499428904470705, "lr": 0.00056, "grad_norm": 0.6524, "tokens_per_sec": 149864, "dt_s": 4.373, "eta_s": 15670, "world_size": 1, "timestamp": "2026-05-05T03:23:48.636892"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56590, "epoch": 0, "train_loss": 3.7975227534770966, "train_ppl": 44.590585686259985, "lr": 0.00056, "grad_norm": 1.2106, "tokens_per_sec": 152146, "dt_s": 4.307, "eta_s": 15655, "world_size": 1, "timestamp": "2026-05-05T03:23:52.944338"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56600, "epoch": 0, "train_loss": 3.7610510140657425, "train_ppl": 42.99358910795284, "lr": 0.00056, "grad_norm": 0.6796, "tokens_per_sec": 150496, "dt_s": 4.355, "eta_s": 15626, "world_size": 1, "timestamp": "2026-05-05T03:23:57.298994"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56610, "epoch": 0, "train_loss": 3.7443762868642807, "train_ppl": 42.282626763049784, "lr": 0.00056, "grad_norm": 0.6814, "tokens_per_sec": 149012, "dt_s": 4.398, "eta_s": 15700, "world_size": 1, "timestamp": "2026-05-05T03:24:01.697043"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56620, "epoch": 0, "train_loss": 3.6854293793439865, "train_ppl": 39.862234795981045, "lr": 0.00056, "grad_norm": 0.649, "tokens_per_sec": 151596, "dt_s": 4.323, "eta_s": 15690, "world_size": 1, "timestamp": "2026-05-05T03:24:06.020096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56630, "epoch": 0, "train_loss": 3.800534248352051, "train_ppl": 44.72507240792242, "lr": 0.00056, "grad_norm": 0.652, "tokens_per_sec": 148832, "dt_s": 4.403, "eta_s": 15708, "world_size": 1, "timestamp": "2026-05-05T03:24:10.423445"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56640, "epoch": 0, "train_loss": 3.670217990875244, "train_ppl": 39.26046335031147, "lr": 0.00056, "grad_norm": 0.6417, "tokens_per_sec": 151387, "dt_s": 4.329, "eta_s": 15719, "world_size": 1, "timestamp": "2026-05-05T03:24:14.752485"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56650, "epoch": 0, "train_loss": 3.7378252297639847, "train_ppl": 42.00653619072585, "lr": 0.00056, "grad_norm": 0.6922, "tokens_per_sec": 135594, "dt_s": 4.833, "eta_s": 16060, "world_size": 1, "timestamp": "2026-05-05T03:24:19.585771"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56660, "epoch": 0, "train_loss": 3.789637267589569, "train_ppl": 44.24034995639581, "lr": 0.00056, "grad_norm": 0.7515, "tokens_per_sec": 150070, "dt_s": 4.367, "eta_s": 16033, "world_size": 1, "timestamp": "2026-05-05T03:24:23.952783"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56670, "epoch": 0, "train_loss": 3.6290226131677628, "train_ppl": 37.67597461419072, "lr": 0.00056, "grad_norm": 0.663, "tokens_per_sec": 152028, "dt_s": 4.311, "eta_s": 16019, "world_size": 1, "timestamp": "2026-05-05T03:24:28.263575"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56680, "epoch": 0, "train_loss": 3.744950160384178, "train_ppl": 42.30689860671886, "lr": 0.00056, "grad_norm": 0.643, "tokens_per_sec": 150512, "dt_s": 4.354, "eta_s": 15980, "world_size": 1, "timestamp": "2026-05-05T03:24:32.617758"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56690, "epoch": 0, "train_loss": 3.793907105922699, "train_ppl": 44.42965295750938, "lr": 0.00056, "grad_norm": 0.6513, "tokens_per_sec": 148577, "dt_s": 4.411, "eta_s": 16034, "world_size": 1, "timestamp": "2026-05-05T03:24:37.028683"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56700, "epoch": 0, "train_loss": 3.6831490844488144, "train_ppl": 39.771440703460634, "lr": 0.00056, "grad_norm": 0.6278, "tokens_per_sec": 151229, "dt_s": 4.334, "eta_s": 15670, "world_size": 1, "timestamp": "2026-05-05T03:24:41.362244"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56710, "epoch": 0, "train_loss": 3.7216295450925827, "train_ppl": 41.33169111639147, "lr": 0.00056, "grad_norm": 0.6712, "tokens_per_sec": 148434, "dt_s": 4.415, "eta_s": 15700, "world_size": 1, "timestamp": "2026-05-05T03:24:45.777391"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56720, "epoch": 0, "train_loss": 3.776207372546196, "train_ppl": 43.650178540239665, "lr": 0.00056, "grad_norm": 0.7039, "tokens_per_sec": 149459, "dt_s": 4.385, "eta_s": 15749, "world_size": 1, "timestamp": "2026-05-05T03:24:50.162290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56730, "epoch": 0, "train_loss": 3.8086415231227875, "train_ppl": 45.089144681915265, "lr": 0.00056, "grad_norm": 0.7264, "tokens_per_sec": 150684, "dt_s": 4.349, "eta_s": 15741, "world_size": 1, "timestamp": "2026-05-05T03:24:54.511504"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56740, "epoch": 0, "train_loss": 3.764891818165779, "train_ppl": 43.15903658355832, "lr": 0.00056, "grad_norm": 0.6839, "tokens_per_sec": 148558, "dt_s": 4.411, "eta_s": 15737, "world_size": 1, "timestamp": "2026-05-05T03:24:58.922984"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56750, "epoch": 0, "train_loss": 3.6554486602544785, "train_ppl": 38.68487358741619, "lr": 0.00056, "grad_norm": 0.6468, "tokens_per_sec": 149859, "dt_s": 4.373, "eta_s": 15761, "world_size": 1, "timestamp": "2026-05-05T03:25:03.296157"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56760, "epoch": 0, "train_loss": 3.729121431708336, "train_ppl": 41.64250630214353, "lr": 0.00056, "grad_norm": 0.6557, "tokens_per_sec": 149337, "dt_s": 4.388, "eta_s": 15738, "world_size": 1, "timestamp": "2026-05-05T03:25:07.684633"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56770, "epoch": 0, "train_loss": 3.7914074808359146, "train_ppl": 44.31873416782718, "lr": 0.00056, "grad_norm": 0.6457, "tokens_per_sec": 148052, "dt_s": 4.427, "eta_s": 15763, "world_size": 1, "timestamp": "2026-05-05T03:25:12.111187"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56780, "epoch": 0, "train_loss": 3.8051107823848724, "train_ppl": 44.930227315050004, "lr": 0.00056, "grad_norm": 0.7087, "tokens_per_sec": 151595, "dt_s": 4.323, "eta_s": 15740, "world_size": 1, "timestamp": "2026-05-05T03:25:16.434310"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56790, "epoch": 0, "train_loss": 3.668557748198509, "train_ppl": 39.195335532500074, "lr": 0.00056, "grad_norm": 0.7101, "tokens_per_sec": 152749, "dt_s": 4.29, "eta_s": 15649, "world_size": 1, "timestamp": "2026-05-05T03:25:20.724710"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56800, "epoch": 0, "train_loss": 3.7958894968032837, "train_ppl": 44.51781725553722, "lr": 0.00056, "grad_norm": 0.649, "tokens_per_sec": 147587, "dt_s": 4.44, "eta_s": 15693, "world_size": 1, "timestamp": "2026-05-05T03:25:25.165205"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56810, "epoch": 0, "train_loss": 3.8339809626340866, "train_ppl": 46.24627696360562, "lr": 0.00056, "grad_norm": 0.719, "tokens_per_sec": 152293, "dt_s": 4.303, "eta_s": 15627, "world_size": 1, "timestamp": "2026-05-05T03:25:29.468468"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56820, "epoch": 0, "train_loss": 3.7088821679353714, "train_ppl": 40.80816434699109, "lr": 0.00056, "grad_norm": 0.628, "tokens_per_sec": 150413, "dt_s": 4.357, "eta_s": 15573, "world_size": 1, "timestamp": "2026-05-05T03:25:33.825519"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56830, "epoch": 0, "train_loss": 3.7470041066408157, "train_ppl": 42.39388400382706, "lr": 0.00056, "grad_norm": 0.6876, "tokens_per_sec": 149887, "dt_s": 4.372, "eta_s": 15604, "world_size": 1, "timestamp": "2026-05-05T03:25:38.197903"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56840, "epoch": 0, "train_loss": 3.743821859359741, "train_ppl": 42.25919060923265, "lr": 0.00056, "grad_norm": 0.7062, "tokens_per_sec": 152305, "dt_s": 4.303, "eta_s": 15609, "world_size": 1, "timestamp": "2026-05-05T03:25:42.500851"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56850, "epoch": 0, "train_loss": 3.758918449282646, "train_ppl": 42.90200018828553, "lr": 0.00056, "grad_norm": 0.6961, "tokens_per_sec": 148908, "dt_s": 4.401, "eta_s": 15576, "world_size": 1, "timestamp": "2026-05-05T03:25:46.901964"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56860, "epoch": 0, "train_loss": 3.7418341487646103, "train_ppl": 42.17527499592302, "lr": 0.00056, "grad_norm": 0.6595, "tokens_per_sec": 150408, "dt_s": 4.357, "eta_s": 15611, "world_size": 1, "timestamp": "2026-05-05T03:25:51.259161"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56870, "epoch": 0, "train_loss": 3.688994660973549, "train_ppl": 40.004608539847105, "lr": 0.00056, "grad_norm": 0.7347, "tokens_per_sec": 151419, "dt_s": 4.328, "eta_s": 15586, "world_size": 1, "timestamp": "2026-05-05T03:25:55.587312"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56880, "epoch": 0, "train_loss": 3.6287316232919693, "train_ppl": 37.665012881971236, "lr": 0.00056, "grad_norm": 0.674, "tokens_per_sec": 148492, "dt_s": 4.413, "eta_s": 15611, "world_size": 1, "timestamp": "2026-05-05T03:26:00.000749"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56890, "epoch": 0, "train_loss": 3.728816896677017, "train_ppl": 41.62982663098264, "lr": 0.00056, "grad_norm": 0.6489, "tokens_per_sec": 151314, "dt_s": 4.331, "eta_s": 15626, "world_size": 1, "timestamp": "2026-05-05T03:26:04.331859"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56900, "epoch": 0, "train_loss": 3.77474507689476, "train_ppl": 43.586395720011815, "lr": 0.00056, "grad_norm": 0.7306, "tokens_per_sec": 150575, "dt_s": 4.352, "eta_s": 15587, "world_size": 1, "timestamp": "2026-05-05T03:26:08.684255"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56910, "epoch": 0, "train_loss": 3.7199280112981796, "train_ppl": 41.261423645368694, "lr": 0.00056, "grad_norm": 0.6488, "tokens_per_sec": 148594, "dt_s": 4.41, "eta_s": 15621, "world_size": 1, "timestamp": "2026-05-05T03:26:13.094662"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56920, "epoch": 0, "train_loss": 3.7399012744426727, "train_ppl": 42.09383422260775, "lr": 0.00056, "grad_norm": 0.6704, "tokens_per_sec": 149302, "dt_s": 4.389, "eta_s": 15660, "world_size": 1, "timestamp": "2026-05-05T03:26:17.484162"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56930, "epoch": 0, "train_loss": 3.662540540099144, "train_ppl": 38.960197189713604, "lr": 0.00056, "grad_norm": 0.6567, "tokens_per_sec": 150515, "dt_s": 4.354, "eta_s": 15614, "world_size": 1, "timestamp": "2026-05-05T03:26:21.838285"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56940, "epoch": 0, "train_loss": 3.6700250804424286, "train_ppl": 39.25289032781503, "lr": 0.00056, "grad_norm": 0.6514, "tokens_per_sec": 134363, "dt_s": 4.878, "eta_s": 16000, "world_size": 1, "timestamp": "2026-05-05T03:26:26.715809"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56950, "epoch": 0, "train_loss": 3.8178609907627106, "train_ppl": 45.50676475149194, "lr": 0.00056, "grad_norm": 0.6946, "tokens_per_sec": 152696, "dt_s": 4.292, "eta_s": 15952, "world_size": 1, "timestamp": "2026-05-05T03:26:31.007735"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56960, "epoch": 0, "train_loss": 3.761509045958519, "train_ppl": 43.01328605351948, "lr": 0.00056, "grad_norm": 0.6352, "tokens_per_sec": 152297, "dt_s": 4.303, "eta_s": 15871, "world_size": 1, "timestamp": "2026-05-05T03:26:35.310918"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56970, "epoch": 0, "train_loss": 3.799923896789551, "train_ppl": 44.69778271909461, "lr": 0.00056, "grad_norm": 0.6485, "tokens_per_sec": 152496, "dt_s": 4.298, "eta_s": 15801, "world_size": 1, "timestamp": "2026-05-05T03:26:39.608449"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56980, "epoch": 0, "train_loss": 3.6839872896671295, "train_ppl": 39.80479130797091, "lr": 0.00056, "grad_norm": 0.6406, "tokens_per_sec": 152952, "dt_s": 4.285, "eta_s": 15747, "world_size": 1, "timestamp": "2026-05-05T03:26:43.893206"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 56990, "epoch": 0, "train_loss": 3.756515622138977, "train_ppl": 42.799037847607316, "lr": 0.00056, "grad_norm": 0.7574, "tokens_per_sec": 147121, "dt_s": 4.455, "eta_s": 15441, "world_size": 1, "timestamp": "2026-05-05T03:26:48.347769"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57000, "epoch": 0, "train_loss": 3.705738827586174, "train_ppl": 40.6800917906921, "lr": 0.00056, "grad_norm": 0.6641, "tokens_per_sec": 150633, "dt_s": 4.351, "eta_s": 15478, "world_size": 1, "timestamp": "2026-05-05T03:26:52.698478"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57010, "epoch": 0, "train_loss": 3.685787260532379, "train_ppl": 39.87650329300286, "lr": 0.00056, "grad_norm": 0.6615, "tokens_per_sec": 106901, "dt_s": 6.131, "eta_s": 15605, "world_size": 1, "timestamp": "2026-05-05T03:26:58.829003"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57020, "epoch": 0, "train_loss": 3.8196264058351517, "train_ppl": 45.587174036880214, "lr": 0.00056, "grad_norm": 0.6536, "tokens_per_sec": 146721, "dt_s": 4.467, "eta_s": 15722, "world_size": 1, "timestamp": "2026-05-05T03:27:03.295686"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57030, "epoch": 0, "train_loss": 3.731325536966324, "train_ppl": 41.73439199491016, "lr": 0.00056, "grad_norm": 0.6577, "tokens_per_sec": 152508, "dt_s": 4.297, "eta_s": 15726, "world_size": 1, "timestamp": "2026-05-05T03:27:07.592891"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57040, "epoch": 0, "train_loss": 3.7348219454288483, "train_ppl": 41.880567872632334, "lr": 0.00056, "grad_norm": 0.7708, "tokens_per_sec": 149479, "dt_s": 4.384, "eta_s": 15672, "world_size": 1, "timestamp": "2026-05-05T03:27:11.977196"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57050, "epoch": 0, "train_loss": 3.6415913701057434, "train_ppl": 38.152503194236196, "lr": 0.00056, "grad_norm": 0.6282, "tokens_per_sec": 152068, "dt_s": 4.31, "eta_s": 15638, "world_size": 1, "timestamp": "2026-05-05T03:27:16.286829"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57060, "epoch": 0, "train_loss": 3.7711271047592163, "train_ppl": 43.42898627799934, "lr": 0.00056, "grad_norm": 0.7202, "tokens_per_sec": 151408, "dt_s": 4.328, "eta_s": 15520, "world_size": 1, "timestamp": "2026-05-05T03:27:20.615271"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57070, "epoch": 0, "train_loss": 3.81074421107769, "train_ppl": 45.18405282947605, "lr": 0.00056, "grad_norm": 0.6456, "tokens_per_sec": 147713, "dt_s": 4.437, "eta_s": 15495, "world_size": 1, "timestamp": "2026-05-05T03:27:25.051977"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57080, "epoch": 0, "train_loss": 3.6026601791381836, "train_ppl": 36.6957219130159, "lr": 0.00056, "grad_norm": 0.68, "tokens_per_sec": 149830, "dt_s": 4.374, "eta_s": 15545, "world_size": 1, "timestamp": "2026-05-05T03:27:29.426002"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57090, "epoch": 0, "train_loss": 3.7955918312072754, "train_ppl": 44.50456780498149, "lr": 0.00056, "grad_norm": 0.6522, "tokens_per_sec": 150452, "dt_s": 4.356, "eta_s": 15520, "world_size": 1, "timestamp": "2026-05-05T03:27:33.781954"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57100, "epoch": 0, "train_loss": 3.7820961475372314, "train_ppl": 43.907982951060895, "lr": 0.00056, "grad_norm": 0.6795, "tokens_per_sec": 149943, "dt_s": 4.371, "eta_s": 15560, "world_size": 1, "timestamp": "2026-05-05T03:27:38.152698"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57110, "epoch": 0, "train_loss": 3.7105358839035034, "train_ppl": 40.87570529137854, "lr": 0.00056, "grad_norm": 0.6981, "tokens_per_sec": 152431, "dt_s": 4.299, "eta_s": 15534, "world_size": 1, "timestamp": "2026-05-05T03:27:42.452085"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57120, "epoch": 0, "train_loss": 3.6216208189725876, "train_ppl": 37.39813433078385, "lr": 0.00056, "grad_norm": 0.6532, "tokens_per_sec": 150766, "dt_s": 4.347, "eta_s": 15466, "world_size": 1, "timestamp": "2026-05-05T03:27:46.798961"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57130, "epoch": 0, "train_loss": 3.807517722249031, "train_ppl": 45.03850192324287, "lr": 0.00056, "grad_norm": 0.7798, "tokens_per_sec": 149997, "dt_s": 4.369, "eta_s": 15458, "world_size": 1, "timestamp": "2026-05-05T03:27:51.168108"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57140, "epoch": 0, "train_loss": 3.730737552046776, "train_ppl": 41.70986001471463, "lr": 0.00056, "grad_norm": 0.6392, "tokens_per_sec": 150949, "dt_s": 4.342, "eta_s": 15444, "world_size": 1, "timestamp": "2026-05-05T03:27:55.509706"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57150, "epoch": 0, "train_loss": 3.726594179868698, "train_ppl": 41.537398074908495, "lr": 0.00056, "grad_norm": 0.6411, "tokens_per_sec": 148842, "dt_s": 4.403, "eta_s": 15462, "world_size": 1, "timestamp": "2026-05-05T03:27:59.912791"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57160, "epoch": 0, "train_loss": 3.7668511271476746, "train_ppl": 43.24368136715031, "lr": 0.00056, "grad_norm": 0.6994, "tokens_per_sec": 151483, "dt_s": 4.326, "eta_s": 15477, "world_size": 1, "timestamp": "2026-05-05T03:28:04.239055"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57170, "epoch": 0, "train_loss": 3.8107686936855316, "train_ppl": 45.185159066463896, "lr": 0.00056, "grad_norm": 0.6674, "tokens_per_sec": 152444, "dt_s": 4.299, "eta_s": 15439, "world_size": 1, "timestamp": "2026-05-05T03:28:08.538076"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57180, "epoch": 0, "train_loss": 3.753688260912895, "train_ppl": 42.67820041351886, "lr": 0.00056, "grad_norm": 0.6545, "tokens_per_sec": 149106, "dt_s": 4.395, "eta_s": 15453, "world_size": 1, "timestamp": "2026-05-05T03:28:12.933348"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57190, "epoch": 0, "train_loss": 3.6464253067970276, "train_ppl": 38.337376452109936, "lr": 0.00056, "grad_norm": 0.9355, "tokens_per_sec": 151393, "dt_s": 4.329, "eta_s": 15440, "world_size": 1, "timestamp": "2026-05-05T03:28:17.262220"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57200, "epoch": 0, "train_loss": 3.7059149146080017, "train_ppl": 40.68725565761669, "lr": 0.00056, "grad_norm": 0.7095, "tokens_per_sec": 152106, "dt_s": 4.309, "eta_s": 15368, "world_size": 1, "timestamp": "2026-05-05T03:28:21.570787"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57210, "epoch": 0, "train_loss": 3.6772971898317337, "train_ppl": 39.53938207723789, "lr": 0.00056, "grad_norm": 0.6914, "tokens_per_sec": 149209, "dt_s": 4.392, "eta_s": 15411, "world_size": 1, "timestamp": "2026-05-05T03:28:25.963023"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57220, "epoch": 0, "train_loss": 3.676794797182083, "train_ppl": 39.51952277131292, "lr": 0.00056, "grad_norm": 0.6859, "tokens_per_sec": 151248, "dt_s": 4.333, "eta_s": 15431, "world_size": 1, "timestamp": "2026-05-05T03:28:30.296048"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57230, "epoch": 0, "train_loss": 3.731595888733864, "train_ppl": 41.74567648687564, "lr": 0.00056, "grad_norm": 0.6159, "tokens_per_sec": 151329, "dt_s": 4.331, "eta_s": 15380, "world_size": 1, "timestamp": "2026-05-05T03:28:34.626743"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57240, "epoch": 0, "train_loss": 3.7097930312156677, "train_ppl": 40.84535193926608, "lr": 0.00056, "grad_norm": 0.6719, "tokens_per_sec": 135488, "dt_s": 4.837, "eta_s": 15736, "world_size": 1, "timestamp": "2026-05-05T03:28:39.463791"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57250, "epoch": 0, "train_loss": 3.6374788880348206, "train_ppl": 37.99592389434981, "lr": 0.00056, "grad_norm": 0.6321, "tokens_per_sec": 151978, "dt_s": 4.312, "eta_s": 15734, "world_size": 1, "timestamp": "2026-05-05T03:28:43.776004"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57260, "epoch": 0, "train_loss": 3.6659799367189407, "train_ppl": 39.09442746348697, "lr": 0.00056, "grad_norm": 0.7207, "tokens_per_sec": 149738, "dt_s": 4.377, "eta_s": 15719, "world_size": 1, "timestamp": "2026-05-05T03:28:48.152726"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57270, "epoch": 0, "train_loss": 3.627903461456299, "train_ppl": 37.63383306851019, "lr": 0.00056, "grad_norm": 0.6333, "tokens_per_sec": 150730, "dt_s": 4.348, "eta_s": 15725, "world_size": 1, "timestamp": "2026-05-05T03:28:52.500658"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57280, "epoch": 0, "train_loss": 3.7553492188453674, "train_ppl": 42.749146011555965, "lr": 0.00056, "grad_norm": 0.6388, "tokens_per_sec": 149645, "dt_s": 4.379, "eta_s": 15755, "world_size": 1, "timestamp": "2026-05-05T03:28:56.880072"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57290, "epoch": 0, "train_loss": 3.65377676486969, "train_ppl": 38.62025056232601, "lr": 0.00056, "grad_norm": 0.7715, "tokens_per_sec": 148350, "dt_s": 4.418, "eta_s": 15454, "world_size": 1, "timestamp": "2026-05-05T03:29:01.297762"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57300, "epoch": 0, "train_loss": 3.6769101172685623, "train_ppl": 39.52408042888621, "lr": 0.00056, "grad_norm": 0.6327, "tokens_per_sec": 152987, "dt_s": 4.284, "eta_s": 15429, "world_size": 1, "timestamp": "2026-05-05T03:29:05.581504"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57310, "epoch": 0, "train_loss": 3.6406088918447495, "train_ppl": 38.11503759682653, "lr": 0.00056, "grad_norm": 0.675, "tokens_per_sec": 151068, "dt_s": 4.338, "eta_s": 15398, "world_size": 1, "timestamp": "2026-05-05T03:29:09.919690"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57320, "epoch": 0, "train_loss": 3.887565389275551, "train_ppl": 48.79195239276108, "lr": 0.00056, "grad_norm": 0.7257, "tokens_per_sec": 150318, "dt_s": 4.36, "eta_s": 15402, "world_size": 1, "timestamp": "2026-05-05T03:29:14.279498"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57330, "epoch": 0, "train_loss": 3.743343785405159, "train_ppl": 42.2389924193596, "lr": 0.00056, "grad_norm": 0.6958, "tokens_per_sec": 151441, "dt_s": 4.328, "eta_s": 15361, "world_size": 1, "timestamp": "2026-05-05T03:29:18.607014"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57340, "epoch": 0, "train_loss": 3.7027385532855988, "train_ppl": 40.55822326766439, "lr": 0.00056, "grad_norm": 0.6678, "tokens_per_sec": 152006, "dt_s": 4.311, "eta_s": 15281, "world_size": 1, "timestamp": "2026-05-05T03:29:22.918455"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57350, "epoch": 0, "train_loss": 3.750225216150284, "train_ppl": 42.530659512921545, "lr": 0.00056, "grad_norm": 0.6973, "tokens_per_sec": 150364, "dt_s": 4.358, "eta_s": 15330, "world_size": 1, "timestamp": "2026-05-05T03:29:27.276913"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57360, "epoch": 0, "train_loss": 3.764659032225609, "train_ppl": 43.1489909359385, "lr": 0.00056, "grad_norm": 0.628, "tokens_per_sec": 150307, "dt_s": 4.36, "eta_s": 15341, "world_size": 1, "timestamp": "2026-05-05T03:29:31.637069"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57370, "epoch": 0, "train_loss": 3.770848497748375, "train_ppl": 43.41688834331153, "lr": 0.00056, "grad_norm": Infinity, "tokens_per_sec": 149161, "dt_s": 4.394, "eta_s": 15360, "world_size": 1, "timestamp": "2026-05-05T03:29:36.030646"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57380, "epoch": 0, "train_loss": 3.8604859858751297, "train_ppl": 47.488424465322225, "lr": 0.00056, "grad_norm": 0.8255, "tokens_per_sec": 152173, "dt_s": 4.307, "eta_s": 15341, "world_size": 1, "timestamp": "2026-05-05T03:29:40.337397"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57390, "epoch": 0, "train_loss": 3.748325228691101, "train_ppl": 42.44992851144766, "lr": 0.00056, "grad_norm": 0.686, "tokens_per_sec": 150855, "dt_s": 4.344, "eta_s": 15360, "world_size": 1, "timestamp": "2026-05-05T03:29:44.681695"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57400, "epoch": 0, "train_loss": 3.6940776258707047, "train_ppl": 40.20846822767945, "lr": 0.00056, "grad_norm": 0.6809, "tokens_per_sec": 147613, "dt_s": 4.44, "eta_s": 15413, "world_size": 1, "timestamp": "2026-05-05T03:29:49.121445"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57410, "epoch": 0, "train_loss": 3.746769368648529, "train_ppl": 42.38393371651169, "lr": 0.00056, "grad_norm": 0.6188, "tokens_per_sec": 151576, "dt_s": 4.324, "eta_s": 15383, "world_size": 1, "timestamp": "2026-05-05T03:29:53.445079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57420, "epoch": 0, "train_loss": 3.7478249967098236, "train_ppl": 42.4286990098838, "lr": 0.00056, "grad_norm": 0.7578, "tokens_per_sec": 150238, "dt_s": 4.362, "eta_s": 15357, "world_size": 1, "timestamp": "2026-05-05T03:29:57.807222"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57430, "epoch": 0, "train_loss": 3.7238392382860184, "train_ppl": 41.423122453323, "lr": 0.00056, "grad_norm": 0.7092, "tokens_per_sec": 149747, "dt_s": 4.376, "eta_s": 15401, "world_size": 1, "timestamp": "2026-05-05T03:30:02.183658"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57440, "epoch": 0, "train_loss": 3.7814050912857056, "train_ppl": 43.87765054685657, "lr": 0.00056, "grad_norm": 0.659, "tokens_per_sec": 151628, "dt_s": 4.322, "eta_s": 15381, "world_size": 1, "timestamp": "2026-05-05T03:30:06.505812"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57450, "epoch": 0, "train_loss": 3.569427564740181, "train_ppl": 35.496268019366035, "lr": 0.00056, "grad_norm": 0.693, "tokens_per_sec": 150203, "dt_s": 4.363, "eta_s": 15323, "world_size": 1, "timestamp": "2026-05-05T03:30:10.868965"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57460, "epoch": 0, "train_loss": 3.734055206179619, "train_ppl": 41.848468704882826, "lr": 0.00056, "grad_norm": 0.6433, "tokens_per_sec": 150519, "dt_s": 4.354, "eta_s": 15340, "world_size": 1, "timestamp": "2026-05-05T03:30:15.222994"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57470, "epoch": 0, "train_loss": 3.7506613731384277, "train_ppl": 42.54921360323242, "lr": 0.00056, "grad_norm": 0.672, "tokens_per_sec": 151953, "dt_s": 4.313, "eta_s": 15301, "world_size": 1, "timestamp": "2026-05-05T03:30:19.535910"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57480, "epoch": 0, "train_loss": 3.694504827260971, "train_ppl": 40.22564901077271, "lr": 0.00056, "grad_norm": 0.6343, "tokens_per_sec": 149429, "dt_s": 4.386, "eta_s": 15303, "world_size": 1, "timestamp": "2026-05-05T03:30:23.921672"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57490, "epoch": 0, "train_loss": 3.7455101758241653, "train_ppl": 42.330597758482675, "lr": 0.00056, "grad_norm": 0.6521, "tokens_per_sec": 149746, "dt_s": 4.376, "eta_s": 15337, "world_size": 1, "timestamp": "2026-05-05T03:30:28.298141"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57500, "epoch": 0, "train_loss": 3.702034741640091, "train_ppl": 40.52968796072608, "lr": 0.00056, "grad_norm": 0.6572, "tokens_per_sec": 149524, "dt_s": 4.383, "eta_s": 15347, "world_size": 1, "timestamp": "2026-05-05T03:30:32.681118"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57510, "epoch": 0, "train_loss": 3.706037625670433, "train_ppl": 40.69224874033283, "lr": 0.00056, "grad_norm": 0.6747, "tokens_per_sec": 126120, "dt_s": 5.196, "eta_s": 15397, "world_size": 1, "timestamp": "2026-05-05T03:30:37.877443"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57520, "epoch": 0, "train_loss": 3.7839587181806564, "train_ppl": 43.98984088054082, "lr": 0.00056, "grad_norm": 0.7484, "tokens_per_sec": 152253, "dt_s": 4.304, "eta_s": 15387, "world_size": 1, "timestamp": "2026-05-05T03:30:42.181864"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57530, "epoch": 0, "train_loss": 3.6237767338752747, "train_ppl": 37.47884850109852, "lr": 0.00056, "grad_norm": 0.6814, "tokens_per_sec": 134726, "dt_s": 4.864, "eta_s": 15719, "world_size": 1, "timestamp": "2026-05-05T03:30:47.046259"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57540, "epoch": 0, "train_loss": 3.678426444530487, "train_ppl": 39.584057330356615, "lr": 0.00056, "grad_norm": 0.667, "tokens_per_sec": 148743, "dt_s": 4.406, "eta_s": 15735, "world_size": 1, "timestamp": "2026-05-05T03:30:51.452257"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57550, "epoch": 0, "train_loss": 3.694487065076828, "train_ppl": 40.224934521733175, "lr": 0.00056, "grad_norm": 0.6564, "tokens_per_sec": 150511, "dt_s": 4.354, "eta_s": 15710, "world_size": 1, "timestamp": "2026-05-05T03:30:55.806484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57560, "epoch": 0, "train_loss": 3.7463579177856445, "train_ppl": 42.36649839754608, "lr": 0.00056, "grad_norm": 0.6495, "tokens_per_sec": 148360, "dt_s": 4.417, "eta_s": 15696, "world_size": 1, "timestamp": "2026-05-05T03:31:00.223859"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57570, "epoch": 0, "train_loss": 3.658878192305565, "train_ppl": 38.8177723613193, "lr": 0.00056, "grad_norm": 0.6259, "tokens_per_sec": 150308, "dt_s": 4.36, "eta_s": 15731, "world_size": 1, "timestamp": "2026-05-05T03:31:04.583973"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57580, "epoch": 0, "train_loss": 3.6048485189676285, "train_ppl": 36.77611255178532, "lr": 0.00056, "grad_norm": 0.6462, "tokens_per_sec": 150311, "dt_s": 4.36, "eta_s": 15372, "world_size": 1, "timestamp": "2026-05-05T03:31:08.943989"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57590, "epoch": 0, "train_loss": 3.6974922865629196, "train_ppl": 40.34600118418581, "lr": 0.00056, "grad_norm": 0.6661, "tokens_per_sec": 147346, "dt_s": 4.448, "eta_s": 15397, "world_size": 1, "timestamp": "2026-05-05T03:31:13.391758"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57600, "epoch": 0, "train_loss": 3.663043275475502, "train_ppl": 38.97978878339173, "lr": 0.00056, "grad_norm": 0.6493, "tokens_per_sec": 151111, "dt_s": 4.337, "eta_s": 15380, "world_size": 1, "timestamp": "2026-05-05T03:31:17.728725"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57610, "epoch": 0, "train_loss": 3.651404485106468, "train_ppl": 38.52874110939384, "lr": 0.00056, "grad_norm": 0.6834, "tokens_per_sec": 150978, "dt_s": 4.341, "eta_s": 15322, "world_size": 1, "timestamp": "2026-05-05T03:31:22.069482"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57620, "epoch": 0, "train_loss": 3.694435194134712, "train_ppl": 40.22284807059655, "lr": 0.00056, "grad_norm": 0.7225, "tokens_per_sec": 146202, "dt_s": 4.483, "eta_s": 15404, "world_size": 1, "timestamp": "2026-05-05T03:31:26.552023"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57630, "epoch": 0, "train_loss": 3.7540035247802734, "train_ppl": 42.69165742917791, "lr": 0.00056, "grad_norm": 0.6443, "tokens_per_sec": 151250, "dt_s": 4.333, "eta_s": 15380, "world_size": 1, "timestamp": "2026-05-05T03:31:30.884980"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57640, "epoch": 0, "train_loss": 3.754250183701515, "train_ppl": 42.70218900614572, "lr": 0.00056, "grad_norm": 0.6431, "tokens_per_sec": 150003, "dt_s": 4.369, "eta_s": 15321, "world_size": 1, "timestamp": "2026-05-05T03:31:35.253969"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57650, "epoch": 0, "train_loss": 3.768999755382538, "train_ppl": 43.336695852906125, "lr": 0.00056, "grad_norm": 0.6701, "tokens_per_sec": 152091, "dt_s": 4.309, "eta_s": 15297, "world_size": 1, "timestamp": "2026-05-05T03:31:39.562972"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57660, "epoch": 0, "train_loss": 3.7722567170858383, "train_ppl": 43.47807191488238, "lr": 0.00056, "grad_norm": 0.7078, "tokens_per_sec": 151796, "dt_s": 4.317, "eta_s": 15276, "world_size": 1, "timestamp": "2026-05-05T03:31:43.880368"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57670, "epoch": 0, "train_loss": 3.727756217122078, "train_ppl": 41.585694134355506, "lr": 0.00056, "grad_norm": 0.6473, "tokens_per_sec": 148138, "dt_s": 4.424, "eta_s": 15231, "world_size": 1, "timestamp": "2026-05-05T03:31:48.304336"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57680, "epoch": 0, "train_loss": 3.759430542588234, "train_ppl": 42.923975641639316, "lr": 0.00056, "grad_norm": 0.6396, "tokens_per_sec": 150786, "dt_s": 4.346, "eta_s": 15236, "world_size": 1, "timestamp": "2026-05-05T03:31:52.650620"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57690, "epoch": 0, "train_loss": 3.6932730078697205, "train_ppl": 40.176128782546044, "lr": 0.00056, "grad_norm": 0.8067, "tokens_per_sec": 151665, "dt_s": 4.321, "eta_s": 15198, "world_size": 1, "timestamp": "2026-05-05T03:31:56.971725"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57700, "epoch": 0, "train_loss": 3.630845293402672, "train_ppl": 37.74470848936005, "lr": 0.00056, "grad_norm": 0.7173, "tokens_per_sec": 147963, "dt_s": 4.429, "eta_s": 15278, "world_size": 1, "timestamp": "2026-05-05T03:32:01.400944"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57710, "epoch": 0, "train_loss": 3.65639890730381, "train_ppl": 38.72165124555955, "lr": 0.00056, "grad_norm": 0.6995, "tokens_per_sec": 150773, "dt_s": 4.347, "eta_s": 15294, "world_size": 1, "timestamp": "2026-05-05T03:32:05.747608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57720, "epoch": 0, "train_loss": 3.767014905810356, "train_ppl": 43.25076433945808, "lr": 0.00056, "grad_norm": 0.7087, "tokens_per_sec": 151164, "dt_s": 4.335, "eta_s": 15227, "world_size": 1, "timestamp": "2026-05-05T03:32:10.083020"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57730, "epoch": 0, "train_loss": 3.7967474162578583, "train_ppl": 44.556026344852896, "lr": 0.00056, "grad_norm": 0.6608, "tokens_per_sec": 147152, "dt_s": 4.454, "eta_s": 15298, "world_size": 1, "timestamp": "2026-05-05T03:32:14.536673"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57740, "epoch": 0, "train_loss": 3.793286085128784, "train_ppl": 44.40206978490472, "lr": 0.00056, "grad_norm": 0.6772, "tokens_per_sec": 149063, "dt_s": 4.397, "eta_s": 15346, "world_size": 1, "timestamp": "2026-05-05T03:32:18.933204"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57750, "epoch": 0, "train_loss": 3.681800037622452, "train_ppl": 39.71782334189713, "lr": 0.00056, "grad_norm": 0.7328, "tokens_per_sec": 149949, "dt_s": 4.371, "eta_s": 15301, "world_size": 1, "timestamp": "2026-05-05T03:32:23.303750"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57760, "epoch": 0, "train_loss": 3.620554596185684, "train_ppl": 37.35828083789938, "lr": 0.00056, "grad_norm": 0.6772, "tokens_per_sec": 149441, "dt_s": 4.385, "eta_s": 15324, "world_size": 1, "timestamp": "2026-05-05T03:32:27.689157"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57770, "epoch": 0, "train_loss": 3.873388797044754, "train_ppl": 48.10512869218471, "lr": 0.00056, "grad_norm": 0.7106, "tokens_per_sec": 151823, "dt_s": 4.317, "eta_s": 15306, "world_size": 1, "timestamp": "2026-05-05T03:32:32.005769"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57780, "epoch": 0, "train_loss": 3.7230437099933624, "train_ppl": 41.39018229159377, "lr": 0.00056, "grad_norm": 0.6613, "tokens_per_sec": 151168, "dt_s": 4.335, "eta_s": 15219, "world_size": 1, "timestamp": "2026-05-05T03:32:36.341076"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57790, "epoch": 0, "train_loss": 3.8091790229082108, "train_ppl": 45.11338660193648, "lr": 0.00056, "grad_norm": 0.6801, "tokens_per_sec": 151609, "dt_s": 4.323, "eta_s": 15163, "world_size": 1, "timestamp": "2026-05-05T03:32:40.663771"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57800, "epoch": 0, "train_loss": 3.6363434195518494, "train_ppl": 37.95280520487557, "lr": 0.00056, "grad_norm": 0.6273, "tokens_per_sec": 152459, "dt_s": 4.299, "eta_s": 15109, "world_size": 1, "timestamp": "2026-05-05T03:32:44.962393"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57810, "epoch": 0, "train_loss": 3.778265357017517, "train_ppl": 43.740102429091415, "lr": 0.00056, "grad_norm": 0.6363, "tokens_per_sec": 147627, "dt_s": 4.439, "eta_s": 15142, "world_size": 1, "timestamp": "2026-05-05T03:32:49.401666"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57820, "epoch": 0, "train_loss": 3.709599032998085, "train_ppl": 40.837428782357335, "lr": 0.00056, "grad_norm": 0.6482, "tokens_per_sec": 152401, "dt_s": 4.3, "eta_s": 15126, "world_size": 1, "timestamp": "2026-05-05T03:32:53.701896"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57830, "epoch": 0, "train_loss": 3.716806322336197, "train_ppl": 41.132819150671956, "lr": 0.00056, "grad_norm": 0.6503, "tokens_per_sec": 136430, "dt_s": 4.804, "eta_s": 15448, "world_size": 1, "timestamp": "2026-05-05T03:32:58.505543"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57840, "epoch": 0, "train_loss": 3.7480527460575104, "train_ppl": 42.43836321887337, "lr": 0.00056, "grad_norm": 0.6606, "tokens_per_sec": 149987, "dt_s": 4.369, "eta_s": 15477, "world_size": 1, "timestamp": "2026-05-05T03:33:02.874987"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57850, "epoch": 0, "train_loss": 3.7231214940547943, "train_ppl": 41.39340191329181, "lr": 0.00056, "grad_norm": 0.6352, "tokens_per_sec": 152414, "dt_s": 4.3, "eta_s": 15473, "world_size": 1, "timestamp": "2026-05-05T03:33:07.174856"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57860, "epoch": 0, "train_loss": 3.7215039283037186, "train_ppl": 41.32649948815976, "lr": 0.00056, "grad_norm": 0.6459, "tokens_per_sec": 150872, "dt_s": 4.344, "eta_s": 15402, "world_size": 1, "timestamp": "2026-05-05T03:33:11.518679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57870, "epoch": 0, "train_loss": 3.769129365682602, "train_ppl": 43.34231309907803, "lr": 0.00056, "grad_norm": 0.8523, "tokens_per_sec": 151440, "dt_s": 4.328, "eta_s": 15417, "world_size": 1, "timestamp": "2026-05-05T03:33:15.846216"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57880, "epoch": 0, "train_loss": 3.8644561618566513, "train_ppl": 47.67733662665679, "lr": 0.00056, "grad_norm": 0.6987, "tokens_per_sec": 152653, "dt_s": 4.293, "eta_s": 15057, "world_size": 1, "timestamp": "2026-05-05T03:33:20.139367"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57890, "epoch": 0, "train_loss": 3.7984917014837265, "train_ppl": 44.63381258430149, "lr": 0.00056, "grad_norm": 0.6376, "tokens_per_sec": 150018, "dt_s": 4.369, "eta_s": 15052, "world_size": 1, "timestamp": "2026-05-05T03:33:24.507939"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57900, "epoch": 0, "train_loss": 3.6414414644241333, "train_ppl": 38.14678434589437, "lr": 0.00056, "grad_norm": 0.7922, "tokens_per_sec": 148157, "dt_s": 4.423, "eta_s": 15134, "world_size": 1, "timestamp": "2026-05-05T03:33:28.931338"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57910, "epoch": 0, "train_loss": 3.6891210973262787, "train_ppl": 40.00966689641661, "lr": 0.00056, "grad_norm": 0.6921, "tokens_per_sec": 152087, "dt_s": 4.309, "eta_s": 15105, "world_size": 1, "timestamp": "2026-05-05T03:33:33.240460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57920, "epoch": 0, "train_loss": 3.794309511780739, "train_ppl": 44.447535307870126, "lr": 0.00056, "grad_norm": 0.6472, "tokens_per_sec": 147878, "dt_s": 4.432, "eta_s": 15173, "world_size": 1, "timestamp": "2026-05-05T03:33:37.672236"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57930, "epoch": 0, "train_loss": 3.6836858689785004, "train_ppl": 39.7927951284033, "lr": 0.00056, "grad_norm": 0.6739, "tokens_per_sec": 150848, "dt_s": 4.345, "eta_s": 15204, "world_size": 1, "timestamp": "2026-05-05T03:33:42.016725"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57940, "epoch": 0, "train_loss": 3.6974939703941345, "train_ppl": 40.346069120099195, "lr": 0.00056, "grad_norm": 0.6949, "tokens_per_sec": 152503, "dt_s": 4.297, "eta_s": 15151, "world_size": 1, "timestamp": "2026-05-05T03:33:46.314086"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57950, "epoch": 0, "train_loss": 3.7262114882469177, "train_ppl": 41.52150510192244, "lr": 0.00056, "grad_norm": 0.8874, "tokens_per_sec": 146913, "dt_s": 4.461, "eta_s": 15172, "world_size": 1, "timestamp": "2026-05-05T03:33:50.774923"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57960, "epoch": 0, "train_loss": 3.8194917291402817, "train_ppl": 45.581034920359706, "lr": 0.00056, "grad_norm": 0.6743, "tokens_per_sec": 151947, "dt_s": 4.313, "eta_s": 15171, "world_size": 1, "timestamp": "2026-05-05T03:33:55.088008"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57970, "epoch": 0, "train_loss": 3.679556518793106, "train_ppl": 39.628815540042815, "lr": 0.00056, "grad_norm": 0.648, "tokens_per_sec": 153025, "dt_s": 4.283, "eta_s": 15063, "world_size": 1, "timestamp": "2026-05-05T03:33:59.370728"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57980, "epoch": 0, "train_loss": 3.7699929773807526, "train_ppl": 43.37976019524304, "lr": 0.00056, "grad_norm": 0.6344, "tokens_per_sec": 149287, "dt_s": 4.39, "eta_s": 15090, "world_size": 1, "timestamp": "2026-05-05T03:34:03.760652"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 57990, "epoch": 0, "train_loss": 3.7713312208652496, "train_ppl": 43.43785173832823, "lr": 0.00056, "grad_norm": 0.7134, "tokens_per_sec": 153091, "dt_s": 4.281, "eta_s": 15074, "world_size": 1, "timestamp": "2026-05-05T03:34:08.041495"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58000, "epoch": 0, "train_loss": 3.75635102391243, "train_ppl": 42.791993781615965, "lr": 0.00056, "grad_norm": 0.6634, "tokens_per_sec": 150836, "dt_s": 4.345, "eta_s": 14989, "world_size": 1, "timestamp": "2026-05-05T03:34:12.386356"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58010, "epoch": 0, "train_loss": 3.7152982354164124, "train_ppl": 41.070834035355645, "lr": 0.00056, "grad_norm": 0.624, "tokens_per_sec": 127130, "dt_s": 5.155, "eta_s": 15037, "world_size": 1, "timestamp": "2026-05-05T03:34:17.541406"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58020, "epoch": 0, "train_loss": 3.768840029835701, "train_ppl": 43.329774428241805, "lr": 0.00056, "grad_norm": 0.633, "tokens_per_sec": 150612, "dt_s": 4.351, "eta_s": 15081, "world_size": 1, "timestamp": "2026-05-05T03:34:21.892707"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58030, "epoch": 0, "train_loss": 3.6565129309892654, "train_ppl": 38.7260666826689, "lr": 0.00056, "grad_norm": 0.7096, "tokens_per_sec": 147025, "dt_s": 4.457, "eta_s": 15123, "world_size": 1, "timestamp": "2026-05-05T03:34:26.350205"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58040, "epoch": 0, "train_loss": 3.7966063767671585, "train_ppl": 44.549742628726094, "lr": 0.00056, "grad_norm": 0.6646, "tokens_per_sec": 149713, "dt_s": 4.377, "eta_s": 15186, "world_size": 1, "timestamp": "2026-05-05T03:34:30.727647"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58050, "epoch": 0, "train_loss": 3.6579489558935165, "train_ppl": 38.78171822780772, "lr": 0.00056, "grad_norm": 0.727, "tokens_per_sec": 152985, "dt_s": 4.284, "eta_s": 15139, "world_size": 1, "timestamp": "2026-05-05T03:34:35.011449"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58060, "epoch": 0, "train_loss": 3.6953928768634796, "train_ppl": 40.26138724870268, "lr": 0.00056, "grad_norm": 0.6466, "tokens_per_sec": 147475, "dt_s": 4.444, "eta_s": 15173, "world_size": 1, "timestamp": "2026-05-05T03:34:39.455335"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58070, "epoch": 0, "train_loss": 3.6483934819698334, "train_ppl": 38.412905427379, "lr": 0.00056, "grad_norm": 0.6498, "tokens_per_sec": 151771, "dt_s": 4.318, "eta_s": 15146, "world_size": 1, "timestamp": "2026-05-05T03:34:43.773394"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58080, "epoch": 0, "train_loss": 3.72040893137455, "train_ppl": 41.28127186470039, "lr": 0.00056, "grad_norm": 0.6727, "tokens_per_sec": 152064, "dt_s": 4.31, "eta_s": 15039, "world_size": 1, "timestamp": "2026-05-05T03:34:48.083170"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58090, "epoch": 0, "train_loss": 3.744242936372757, "train_ppl": 42.27698872991362, "lr": 0.00056, "grad_norm": 0.6352, "tokens_per_sec": 149123, "dt_s": 4.395, "eta_s": 15047, "world_size": 1, "timestamp": "2026-05-05T03:34:52.477927"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58100, "epoch": 0, "train_loss": 3.671697050333023, "train_ppl": 39.31857487455248, "lr": 0.00056, "grad_norm": 0.6442, "tokens_per_sec": 151479, "dt_s": 4.326, "eta_s": 15072, "world_size": 1, "timestamp": "2026-05-05T03:34:56.804351"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58110, "epoch": 0, "train_loss": 3.6593222618103027, "train_ppl": 38.835013978221625, "lr": 0.00056, "grad_norm": 0.6659, "tokens_per_sec": 148560, "dt_s": 4.411, "eta_s": 15045, "world_size": 1, "timestamp": "2026-05-05T03:35:01.215772"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58120, "epoch": 0, "train_loss": 3.8012713193893433, "train_ppl": 44.758050115398646, "lr": 0.00056, "grad_norm": 0.6935, "tokens_per_sec": 133812, "dt_s": 4.898, "eta_s": 15441, "world_size": 1, "timestamp": "2026-05-05T03:35:06.113400"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58130, "epoch": 0, "train_loss": 3.684519901871681, "train_ppl": 39.825997472450865, "lr": 0.00056, "grad_norm": 0.6232, "tokens_per_sec": 152047, "dt_s": 4.31, "eta_s": 15437, "world_size": 1, "timestamp": "2026-05-05T03:35:10.423656"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58140, "epoch": 0, "train_loss": 3.8156266063451767, "train_ppl": 45.40519865654428, "lr": 0.00056, "grad_norm": 0.7273, "tokens_per_sec": 147765, "dt_s": 4.435, "eta_s": 15461, "world_size": 1, "timestamp": "2026-05-05T03:35:14.858815"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58150, "epoch": 0, "train_loss": 3.6774751096963882, "train_ppl": 39.54641754460173, "lr": 0.00056, "grad_norm": 0.7151, "tokens_per_sec": 150134, "dt_s": 4.365, "eta_s": 15483, "world_size": 1, "timestamp": "2026-05-05T03:35:19.223976"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58160, "epoch": 0, "train_loss": 3.7265842854976654, "train_ppl": 41.53698709051343, "lr": 0.00056, "grad_norm": 0.7031, "tokens_per_sec": 151870, "dt_s": 4.315, "eta_s": 15412, "world_size": 1, "timestamp": "2026-05-05T03:35:23.539263"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58170, "epoch": 0, "train_loss": 3.6764790266752243, "train_ppl": 39.50704564163507, "lr": 0.00056, "grad_norm": 0.6607, "tokens_per_sec": 148580, "dt_s": 4.411, "eta_s": 15071, "world_size": 1, "timestamp": "2026-05-05T03:35:27.950096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58180, "epoch": 0, "train_loss": 3.6646037995815277, "train_ppl": 39.040665170616855, "lr": 0.00056, "grad_norm": 0.6604, "tokens_per_sec": 151428, "dt_s": 4.328, "eta_s": 15079, "world_size": 1, "timestamp": "2026-05-05T03:35:32.277920"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58190, "epoch": 0, "train_loss": 3.7468689680099487, "train_ppl": 42.3881553394763, "lr": 0.00056, "grad_norm": 0.6603, "tokens_per_sec": 150459, "dt_s": 4.356, "eta_s": 15020, "world_size": 1, "timestamp": "2026-05-05T03:35:36.633656"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58200, "epoch": 0, "train_loss": 3.730554923415184, "train_ppl": 41.70224329559292, "lr": 0.00056, "grad_norm": 0.6615, "tokens_per_sec": 149165, "dt_s": 4.394, "eta_s": 15035, "world_size": 1, "timestamp": "2026-05-05T03:35:41.027186"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58210, "epoch": 0, "train_loss": 3.8010988533496857, "train_ppl": 44.75033153736787, "lr": 0.00056, "grad_norm": 0.6613, "tokens_per_sec": 151311, "dt_s": 4.331, "eta_s": 15042, "world_size": 1, "timestamp": "2026-05-05T03:35:45.358382"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58220, "epoch": 0, "train_loss": 3.6848602145910263, "train_ppl": 39.839553072392064, "lr": 0.00056, "grad_norm": 0.6886, "tokens_per_sec": 150479, "dt_s": 4.355, "eta_s": 14999, "world_size": 1, "timestamp": "2026-05-05T03:35:49.713524"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58230, "epoch": 0, "train_loss": 3.797622099518776, "train_ppl": 44.59501580449779, "lr": 0.00056, "grad_norm": 0.7125, "tokens_per_sec": 149583, "dt_s": 4.381, "eta_s": 15032, "world_size": 1, "timestamp": "2026-05-05T03:35:54.094773"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58240, "epoch": 0, "train_loss": 3.751037985086441, "train_ppl": 42.56524116334984, "lr": 0.00056, "grad_norm": 0.7045, "tokens_per_sec": 152214, "dt_s": 4.306, "eta_s": 14993, "world_size": 1, "timestamp": "2026-05-05T03:35:58.400301"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58250, "epoch": 0, "train_loss": 3.5806864351034164, "train_ppl": 35.89817415840158, "lr": 0.00056, "grad_norm": 0.648, "tokens_per_sec": 149384, "dt_s": 4.387, "eta_s": 14984, "world_size": 1, "timestamp": "2026-05-05T03:36:02.787410"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58260, "epoch": 0, "train_loss": 3.72483392059803, "train_ppl": 41.46434579920327, "lr": 0.00056, "grad_norm": 0.6956, "tokens_per_sec": 149959, "dt_s": 4.37, "eta_s": 15006, "world_size": 1, "timestamp": "2026-05-05T03:36:07.157641"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58270, "epoch": 0, "train_loss": 3.6875094771385193, "train_ppl": 39.94523844058596, "lr": 0.00056, "grad_norm": 0.6642, "tokens_per_sec": 151614, "dt_s": 4.323, "eta_s": 14980, "world_size": 1, "timestamp": "2026-05-05T03:36:11.480198"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58280, "epoch": 0, "train_loss": 3.765184387564659, "train_ppl": 43.17166544426682, "lr": 0.00056, "grad_norm": 0.718, "tokens_per_sec": 147804, "dt_s": 4.434, "eta_s": 15012, "world_size": 1, "timestamp": "2026-05-05T03:36:15.914160"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58290, "epoch": 0, "train_loss": 3.827243685722351, "train_ppl": 45.9357502164967, "lr": 0.00056, "grad_norm": 0.6474, "tokens_per_sec": 150720, "dt_s": 4.348, "eta_s": 15037, "world_size": 1, "timestamp": "2026-05-05T03:36:20.262357"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58300, "epoch": 0, "train_loss": 3.6536515653133392, "train_ppl": 38.61541562676166, "lr": 0.00056, "grad_norm": 0.69, "tokens_per_sec": 151560, "dt_s": 4.324, "eta_s": 14989, "world_size": 1, "timestamp": "2026-05-05T03:36:24.586428"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58310, "epoch": 0, "train_loss": 3.6843185424804688, "train_ppl": 39.81797894117577, "lr": 0.00056, "grad_norm": 0.6659, "tokens_per_sec": 148279, "dt_s": 4.42, "eta_s": 15019, "world_size": 1, "timestamp": "2026-05-05T03:36:29.006226"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58320, "epoch": 0, "train_loss": 3.6188571006059647, "train_ppl": 37.29491911464838, "lr": 0.00056, "grad_norm": 0.6585, "tokens_per_sec": 150888, "dt_s": 4.343, "eta_s": 15028, "world_size": 1, "timestamp": "2026-05-05T03:36:33.349563"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58330, "epoch": 0, "train_loss": 3.7516554594039917, "train_ppl": 42.59153222278021, "lr": 0.00056, "grad_norm": 0.6532, "tokens_per_sec": 150862, "dt_s": 4.344, "eta_s": 14962, "world_size": 1, "timestamp": "2026-05-05T03:36:37.693723"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58340, "epoch": 0, "train_loss": 3.7885394990444183, "train_ppl": 44.191810938965, "lr": 0.00056, "grad_norm": 0.6835, "tokens_per_sec": 149496, "dt_s": 4.384, "eta_s": 14982, "world_size": 1, "timestamp": "2026-05-05T03:36:42.077484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58350, "epoch": 0, "train_loss": 3.5530484914779663, "train_ppl": 34.91960751909489, "lr": 0.00056, "grad_norm": 0.7034, "tokens_per_sec": 151196, "dt_s": 4.335, "eta_s": 14985, "world_size": 1, "timestamp": "2026-05-05T03:36:46.412005"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58360, "epoch": 0, "train_loss": 3.7395671606063843, "train_ppl": 42.079772439420516, "lr": 0.00056, "grad_norm": 0.6394, "tokens_per_sec": 149010, "dt_s": 4.398, "eta_s": 14966, "world_size": 1, "timestamp": "2026-05-05T03:36:50.810077"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58370, "epoch": 0, "train_loss": 3.741617873311043, "train_ppl": 42.1661545054986, "lr": 0.00056, "grad_norm": 0.6846, "tokens_per_sec": 150311, "dt_s": 4.36, "eta_s": 14973, "world_size": 1, "timestamp": "2026-05-05T03:36:55.170109"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58380, "epoch": 0, "train_loss": 3.659091353416443, "train_ppl": 38.82604768275467, "lr": 0.00056, "grad_norm": 0.6623, "tokens_per_sec": 152116, "dt_s": 4.308, "eta_s": 14944, "world_size": 1, "timestamp": "2026-05-05T03:36:59.478391"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58390, "epoch": 0, "train_loss": 3.671486124396324, "train_ppl": 39.31028244189271, "lr": 0.00056, "grad_norm": 0.6401, "tokens_per_sec": 148784, "dt_s": 4.405, "eta_s": 14954, "world_size": 1, "timestamp": "2026-05-05T03:37:03.883189"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58400, "epoch": 0, "train_loss": 3.686777502298355, "train_ppl": 39.91601022952566, "lr": 0.00056, "grad_norm": 0.6675, "tokens_per_sec": 150216, "dt_s": 4.363, "eta_s": 14969, "world_size": 1, "timestamp": "2026-05-05T03:37:08.245984"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58410, "epoch": 0, "train_loss": 3.7552205324172974, "train_ppl": 42.74364513060465, "lr": 0.00056, "grad_norm": 0.6399, "tokens_per_sec": 150192, "dt_s": 4.363, "eta_s": 14941, "world_size": 1, "timestamp": "2026-05-05T03:37:12.609437"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58420, "epoch": 0, "train_loss": 3.7582776993513107, "train_ppl": 42.8745195396628, "lr": 0.00056, "grad_norm": 0.7127, "tokens_per_sec": 132281, "dt_s": 4.954, "eta_s": 15344, "world_size": 1, "timestamp": "2026-05-05T03:37:17.563725"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58430, "epoch": 0, "train_loss": 3.8757055699825287, "train_ppl": 48.21670655288373, "lr": 0.00056, "grad_norm": 0.6776, "tokens_per_sec": 149426, "dt_s": 4.386, "eta_s": 15393, "world_size": 1, "timestamp": "2026-05-05T03:37:21.949588"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58440, "epoch": 0, "train_loss": 3.7369959205389023, "train_ppl": 41.97171422383406, "lr": 0.00056, "grad_norm": 0.7266, "tokens_per_sec": 149406, "dt_s": 4.386, "eta_s": 15376, "world_size": 1, "timestamp": "2026-05-05T03:37:26.335984"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58450, "epoch": 0, "train_loss": 3.7370657473802567, "train_ppl": 41.97464507838951, "lr": 0.00056, "grad_norm": 0.7571, "tokens_per_sec": 149907, "dt_s": 4.372, "eta_s": 15377, "world_size": 1, "timestamp": "2026-05-05T03:37:30.707779"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58460, "epoch": 0, "train_loss": 3.713326334953308, "train_ppl": 40.98992623598154, "lr": 0.00056, "grad_norm": 0.6415, "tokens_per_sec": 150705, "dt_s": 4.349, "eta_s": 15363, "world_size": 1, "timestamp": "2026-05-05T03:37:35.056396"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58470, "epoch": 0, "train_loss": 3.6904939264059067, "train_ppl": 40.064631050160145, "lr": 0.00056, "grad_norm": 0.6811, "tokens_per_sec": 149633, "dt_s": 4.38, "eta_s": 14965, "world_size": 1, "timestamp": "2026-05-05T03:37:39.436170"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58480, "epoch": 0, "train_loss": 3.7204470336437225, "train_ppl": 41.28284480479888, "lr": 0.00056, "grad_norm": 0.6603, "tokens_per_sec": 151093, "dt_s": 4.337, "eta_s": 14927, "world_size": 1, "timestamp": "2026-05-05T03:37:43.773633"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58490, "epoch": 0, "train_loss": 3.7250602394342422, "train_ppl": 41.47373102367533, "lr": 0.00056, "grad_norm": 0.6308, "tokens_per_sec": 148546, "dt_s": 4.412, "eta_s": 14940, "world_size": 1, "timestamp": "2026-05-05T03:37:48.185443"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58500, "epoch": 0, "train_loss": 3.751193642616272, "train_ppl": 42.57186727933503, "lr": 0.00056, "grad_norm": 0.6349, "tokens_per_sec": 150041, "dt_s": 4.368, "eta_s": 14933, "world_size": 1, "timestamp": "2026-05-05T03:37:52.553324"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58510, "epoch": 0, "train_loss": 3.681700214743614, "train_ppl": 39.71385879230965, "lr": 0.00056, "grad_norm": 0.6531, "tokens_per_sec": 127454, "dt_s": 5.142, "eta_s": 14942, "world_size": 1, "timestamp": "2026-05-05T03:37:57.695288"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58520, "epoch": 0, "train_loss": 3.7579055726528168, "train_ppl": 42.858567754483644, "lr": 0.00056, "grad_norm": 0.6217, "tokens_per_sec": 149333, "dt_s": 4.389, "eta_s": 14944, "world_size": 1, "timestamp": "2026-05-05T03:38:02.083832"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58530, "epoch": 0, "train_loss": 3.725576013326645, "train_ppl": 41.49512760878353, "lr": 0.00056, "grad_norm": 0.7419, "tokens_per_sec": 149426, "dt_s": 4.386, "eta_s": 14972, "world_size": 1, "timestamp": "2026-05-05T03:38:06.469694"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58540, "epoch": 0, "train_loss": 3.648109585046768, "train_ppl": 38.402001669567056, "lr": 0.00056, "grad_norm": 0.671, "tokens_per_sec": 148179, "dt_s": 4.423, "eta_s": 14975, "world_size": 1, "timestamp": "2026-05-05T03:38:10.892455"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58550, "epoch": 0, "train_loss": 3.6460710763931274, "train_ppl": 38.32379859275212, "lr": 0.00056, "grad_norm": 0.7053, "tokens_per_sec": 148641, "dt_s": 4.409, "eta_s": 14999, "world_size": 1, "timestamp": "2026-05-05T03:38:15.301473"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58560, "epoch": 0, "train_loss": 3.6703273206949234, "train_ppl": 39.26475592433898, "lr": 0.00056, "grad_norm": 0.68, "tokens_per_sec": 149711, "dt_s": 4.378, "eta_s": 15002, "world_size": 1, "timestamp": "2026-05-05T03:38:19.679005"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58570, "epoch": 0, "train_loss": 3.813375100493431, "train_ppl": 45.30308358554682, "lr": 0.00056, "grad_norm": 0.6357, "tokens_per_sec": 150943, "dt_s": 4.342, "eta_s": 14965, "world_size": 1, "timestamp": "2026-05-05T03:38:24.020778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58580, "epoch": 0, "train_loss": 3.7557075768709183, "train_ppl": 42.76446825637538, "lr": 0.00056, "grad_norm": 0.6342, "tokens_per_sec": 147892, "dt_s": 4.431, "eta_s": 14992, "world_size": 1, "timestamp": "2026-05-05T03:38:28.452109"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58590, "epoch": 0, "train_loss": 3.6794365644454956, "train_ppl": 39.62406217642715, "lr": 0.00056, "grad_norm": 0.658, "tokens_per_sec": 150537, "dt_s": 4.353, "eta_s": 14940, "world_size": 1, "timestamp": "2026-05-05T03:38:32.805603"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58600, "epoch": 0, "train_loss": 3.8319106996059418, "train_ppl": 46.15063404337971, "lr": 0.00056, "grad_norm": 0.6727, "tokens_per_sec": 151667, "dt_s": 4.321, "eta_s": 14876, "world_size": 1, "timestamp": "2026-05-05T03:38:37.126673"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58610, "epoch": 0, "train_loss": 3.740445077419281, "train_ppl": 42.11673120011285, "lr": 0.00056, "grad_norm": 0.7326, "tokens_per_sec": 148077, "dt_s": 4.426, "eta_s": 14904, "world_size": 1, "timestamp": "2026-05-05T03:38:41.552466"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58620, "epoch": 0, "train_loss": 3.731479227542877, "train_ppl": 41.74080667060294, "lr": 0.00056, "grad_norm": 0.6779, "tokens_per_sec": 150522, "dt_s": 4.354, "eta_s": 14908, "world_size": 1, "timestamp": "2026-05-05T03:38:45.906388"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58630, "epoch": 0, "train_loss": 3.7635220140218735, "train_ppl": 43.099957628935414, "lr": 0.00056, "grad_norm": 0.6462, "tokens_per_sec": 149502, "dt_s": 4.384, "eta_s": 14871, "world_size": 1, "timestamp": "2026-05-05T03:38:50.289998"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58640, "epoch": 0, "train_loss": 3.698063999414444, "train_ppl": 40.369074106485066, "lr": 0.00056, "grad_norm": 0.6379, "tokens_per_sec": 148978, "dt_s": 4.399, "eta_s": 14898, "world_size": 1, "timestamp": "2026-05-05T03:38:54.689047"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58650, "epoch": 0, "train_loss": 3.7036663740873337, "train_ppl": 40.59587149359633, "lr": 0.00056, "grad_norm": 0.6723, "tokens_per_sec": 150607, "dt_s": 4.351, "eta_s": 14914, "world_size": 1, "timestamp": "2026-05-05T03:38:59.040518"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58660, "epoch": 0, "train_loss": 3.673692286014557, "train_ppl": 39.3971030131098, "lr": 0.00056, "grad_norm": 0.6649, "tokens_per_sec": 146239, "dt_s": 4.481, "eta_s": 14948, "world_size": 1, "timestamp": "2026-05-05T03:39:03.521942"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58670, "epoch": 0, "train_loss": 3.708548352122307, "train_ppl": 40.79454420986498, "lr": 0.00056, "grad_norm": 0.6131, "tokens_per_sec": 152265, "dt_s": 4.304, "eta_s": 14910, "world_size": 1, "timestamp": "2026-05-05T03:39:07.826011"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58680, "epoch": 0, "train_loss": 3.6437105238437653, "train_ppl": 38.23343994241206, "lr": 0.00056, "grad_norm": 0.6277, "tokens_per_sec": 151579, "dt_s": 4.324, "eta_s": 14864, "world_size": 1, "timestamp": "2026-05-05T03:39:12.149568"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58690, "epoch": 0, "train_loss": 3.7251914143562317, "train_ppl": 41.479171693938916, "lr": 0.00056, "grad_norm": 0.691, "tokens_per_sec": 149801, "dt_s": 4.375, "eta_s": 14844, "world_size": 1, "timestamp": "2026-05-05T03:39:16.524450"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58700, "epoch": 0, "train_loss": 3.725380852818489, "train_ppl": 41.48703018876741, "lr": 0.00056, "grad_norm": 0.6507, "tokens_per_sec": 152650, "dt_s": 4.293, "eta_s": 14800, "world_size": 1, "timestamp": "2026-05-05T03:39:20.817676"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58710, "epoch": 0, "train_loss": 3.676055461168289, "train_ppl": 39.49031536325491, "lr": 0.00056, "grad_norm": 0.6943, "tokens_per_sec": 135227, "dt_s": 4.846, "eta_s": 15043, "world_size": 1, "timestamp": "2026-05-05T03:39:25.664014"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58720, "epoch": 0, "train_loss": 3.7489141523838043, "train_ppl": 42.47493564302498, "lr": 0.00056, "grad_norm": 0.6791, "tokens_per_sec": 150370, "dt_s": 4.358, "eta_s": 15076, "world_size": 1, "timestamp": "2026-05-05T03:39:30.022328"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58730, "epoch": 0, "train_loss": 3.7000077068805695, "train_ppl": 40.447616083812655, "lr": 0.00056, "grad_norm": 0.6421, "tokens_per_sec": 153859, "dt_s": 4.259, "eta_s": 15028, "world_size": 1, "timestamp": "2026-05-05T03:39:34.281805"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58740, "epoch": 0, "train_loss": 3.6066670566797256, "train_ppl": 36.84305214700636, "lr": 0.00056, "grad_norm": 0.6995, "tokens_per_sec": 150093, "dt_s": 4.366, "eta_s": 15017, "world_size": 1, "timestamp": "2026-05-05T03:39:38.648167"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58750, "epoch": 0, "train_loss": 3.758523404598236, "train_ppl": 42.885055328369376, "lr": 0.00056, "grad_norm": 0.634, "tokens_per_sec": 150781, "dt_s": 4.346, "eta_s": 15049, "world_size": 1, "timestamp": "2026-05-05T03:39:42.994586"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58760, "epoch": 0, "train_loss": 3.8405473679304123, "train_ppl": 46.55094796350373, "lr": 0.00056, "grad_norm": 0.6818, "tokens_per_sec": 150830, "dt_s": 4.345, "eta_s": 14705, "world_size": 1, "timestamp": "2026-05-05T03:39:47.339665"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58770, "epoch": 0, "train_loss": 3.640927702188492, "train_ppl": 38.12719100227727, "lr": 0.00056, "grad_norm": 0.6182, "tokens_per_sec": 147434, "dt_s": 4.445, "eta_s": 14759, "world_size": 1, "timestamp": "2026-05-05T03:39:51.784738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58780, "epoch": 0, "train_loss": 3.7203783243894577, "train_ppl": 41.28000838876354, "lr": 0.00056, "grad_norm": 0.6405, "tokens_per_sec": 151714, "dt_s": 4.32, "eta_s": 14796, "world_size": 1, "timestamp": "2026-05-05T03:39:56.104444"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58790, "epoch": 0, "train_loss": 3.708120048046112, "train_ppl": 40.77707548152421, "lr": 0.00056, "grad_norm": 0.6344, "tokens_per_sec": 150463, "dt_s": 4.356, "eta_s": 14784, "world_size": 1, "timestamp": "2026-05-05T03:40:00.460063"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58800, "epoch": 0, "train_loss": 3.653499409556389, "train_ppl": 38.60954051594432, "lr": 0.00056, "grad_norm": 0.6836, "tokens_per_sec": 148278, "dt_s": 4.42, "eta_s": 14829, "world_size": 1, "timestamp": "2026-05-05T03:40:04.879896"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58810, "epoch": 0, "train_loss": 3.7835796028375626, "train_ppl": 43.973166817818964, "lr": 0.00056, "grad_norm": 0.6758, "tokens_per_sec": 151843, "dt_s": 4.316, "eta_s": 14805, "world_size": 1, "timestamp": "2026-05-05T03:40:09.195927"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58820, "epoch": 0, "train_loss": 3.657609835267067, "train_ppl": 38.76856877697854, "lr": 0.00056, "grad_norm": 0.7033, "tokens_per_sec": 150561, "dt_s": 4.353, "eta_s": 14738, "world_size": 1, "timestamp": "2026-05-05T03:40:13.548736"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58830, "epoch": 0, "train_loss": 3.7009339928627014, "train_ppl": 40.48509950110367, "lr": 0.00056, "grad_norm": 0.8124, "tokens_per_sec": 149040, "dt_s": 4.397, "eta_s": 14787, "world_size": 1, "timestamp": "2026-05-05T03:40:17.945942"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58840, "epoch": 0, "train_loss": 3.6736235320568085, "train_ppl": 39.394394399468844, "lr": 0.00056, "grad_norm": 0.665, "tokens_per_sec": 151287, "dt_s": 4.332, "eta_s": 14766, "world_size": 1, "timestamp": "2026-05-05T03:40:22.277847"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58850, "epoch": 0, "train_loss": 3.6743295937776566, "train_ppl": 39.422219095190634, "lr": 0.00056, "grad_norm": 0.6458, "tokens_per_sec": 149976, "dt_s": 4.37, "eta_s": 14728, "world_size": 1, "timestamp": "2026-05-05T03:40:26.647613"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58860, "epoch": 0, "train_loss": 3.6723450422286987, "train_ppl": 39.34406124901096, "lr": 0.00056, "grad_norm": 0.696, "tokens_per_sec": 149491, "dt_s": 4.384, "eta_s": 14770, "world_size": 1, "timestamp": "2026-05-05T03:40:31.031579"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58870, "epoch": 0, "train_loss": 3.64973221719265, "train_ppl": 38.46436457428613, "lr": 0.00056, "grad_norm": 0.6294, "tokens_per_sec": 152434, "dt_s": 4.299, "eta_s": 14729, "world_size": 1, "timestamp": "2026-05-05T03:40:35.330873"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58880, "epoch": 0, "train_loss": 3.688920959830284, "train_ppl": 40.00166026310888, "lr": 0.00056, "grad_norm": 0.6471, "tokens_per_sec": 149329, "dt_s": 4.389, "eta_s": 14719, "world_size": 1, "timestamp": "2026-05-05T03:40:39.719558"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58890, "epoch": 0, "train_loss": 3.7922406792640686, "train_ppl": 44.355675855214386, "lr": 0.00056, "grad_norm": 0.7305, "tokens_per_sec": 151977, "dt_s": 4.312, "eta_s": 14701, "world_size": 1, "timestamp": "2026-05-05T03:40:44.031794"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58900, "epoch": 0, "train_loss": 3.763119250535965, "train_ppl": 43.08260203509263, "lr": 0.00056, "grad_norm": 0.6837, "tokens_per_sec": 151063, "dt_s": 4.338, "eta_s": 14676, "world_size": 1, "timestamp": "2026-05-05T03:40:48.370094"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58910, "epoch": 0, "train_loss": 3.61262583732605, "train_ppl": 37.06324720858746, "lr": 0.00056, "grad_norm": 0.5982, "tokens_per_sec": 148148, "dt_s": 4.424, "eta_s": 14698, "world_size": 1, "timestamp": "2026-05-05T03:40:52.793779"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58920, "epoch": 0, "train_loss": 3.656851142644882, "train_ppl": 38.73916650492859, "lr": 0.00056, "grad_norm": 0.6423, "tokens_per_sec": 151929, "dt_s": 4.314, "eta_s": 14703, "world_size": 1, "timestamp": "2026-05-05T03:40:57.107396"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58930, "epoch": 0, "train_loss": 3.6729753613471985, "train_ppl": 39.36886838039891, "lr": 0.00056, "grad_norm": 0.6556, "tokens_per_sec": 151751, "dt_s": 4.319, "eta_s": 14652, "world_size": 1, "timestamp": "2026-05-05T03:41:01.426038"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58940, "epoch": 0, "train_loss": 3.6457832157611847, "train_ppl": 38.31276826754512, "lr": 0.00056, "grad_norm": 0.7466, "tokens_per_sec": 149724, "dt_s": 4.377, "eta_s": 14691, "world_size": 1, "timestamp": "2026-05-05T03:41:05.803145"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58950, "epoch": 0, "train_loss": 3.669468343257904, "train_ppl": 39.23104286638087, "lr": 0.00056, "grad_norm": 0.6703, "tokens_per_sec": 153301, "dt_s": 4.275, "eta_s": 14644, "world_size": 1, "timestamp": "2026-05-05T03:41:10.078123"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58960, "epoch": 0, "train_loss": 3.652323305606842, "train_ppl": 38.56415837514197, "lr": 0.00056, "grad_norm": 0.7024, "tokens_per_sec": 152066, "dt_s": 4.31, "eta_s": 14563, "world_size": 1, "timestamp": "2026-05-05T03:41:14.387823"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58970, "epoch": 0, "train_loss": 4.050345033407211, "train_ppl": 57.417264502536554, "lr": 0.00056, "grad_norm": 0.7842, "tokens_per_sec": 150698, "dt_s": 4.349, "eta_s": 14582, "world_size": 1, "timestamp": "2026-05-05T03:41:18.736672"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58980, "epoch": 0, "train_loss": 3.7451024651527405, "train_ppl": 42.31334263983578, "lr": 0.00056, "grad_norm": 0.6332, "tokens_per_sec": 153668, "dt_s": 4.265, "eta_s": 14542, "world_size": 1, "timestamp": "2026-05-05T03:41:23.001417"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 58990, "epoch": 0, "train_loss": 3.686299115419388, "train_ppl": 39.8969195007123, "lr": 0.00056, "grad_norm": 0.6566, "tokens_per_sec": 148805, "dt_s": 4.404, "eta_s": 14556, "world_size": 1, "timestamp": "2026-05-05T03:41:27.405582"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59000, "epoch": 0, "train_loss": 3.762274995446205, "train_ppl": 43.04624467864554, "lr": 0.00056, "grad_norm": 0.71, "tokens_per_sec": 150261, "dt_s": 4.361, "eta_s": 14610, "world_size": 1, "timestamp": "2026-05-05T03:41:31.767063"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59010, "epoch": 0, "train_loss": 3.751026287674904, "train_ppl": 42.56474326311885, "lr": 0.00056, "grad_norm": 0.711, "tokens_per_sec": 116187, "dt_s": 5.641, "eta_s": 14980, "world_size": 1, "timestamp": "2026-05-05T03:41:37.407698"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59020, "epoch": 0, "train_loss": 3.7442979216575623, "train_ppl": 42.279313406090544, "lr": 0.00056, "grad_norm": 0.6666, "tokens_per_sec": 147265, "dt_s": 4.45, "eta_s": 15044, "world_size": 1, "timestamp": "2026-05-05T03:41:41.857857"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59030, "epoch": 0, "train_loss": 3.68521411716938, "train_ppl": 39.85365488813214, "lr": 0.00056, "grad_norm": 0.6785, "tokens_per_sec": 150879, "dt_s": 4.344, "eta_s": 15093, "world_size": 1, "timestamp": "2026-05-05T03:41:46.201470"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59040, "epoch": 0, "train_loss": 3.6991382092237473, "train_ppl": 40.412462261701485, "lr": 0.00056, "grad_norm": 0.6697, "tokens_per_sec": 150761, "dt_s": 4.347, "eta_s": 15050, "world_size": 1, "timestamp": "2026-05-05T03:41:50.548469"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59050, "epoch": 0, "train_loss": 3.669033646583557, "train_ppl": 39.21399296855112, "lr": 0.00056, "grad_norm": 0.6939, "tokens_per_sec": 147365, "dt_s": 4.447, "eta_s": 15103, "world_size": 1, "timestamp": "2026-05-05T03:41:54.995670"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59060, "epoch": 0, "train_loss": 3.677029103040695, "train_ppl": 39.528783511908316, "lr": 0.00056, "grad_norm": 0.6558, "tokens_per_sec": 151008, "dt_s": 4.34, "eta_s": 14744, "world_size": 1, "timestamp": "2026-05-05T03:41:59.335558"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59070, "epoch": 0, "train_loss": 3.747178018093109, "train_ppl": 42.401257426905296, "lr": 0.00056, "grad_norm": 2.0968, "tokens_per_sec": 150020, "dt_s": 4.368, "eta_s": 14685, "world_size": 1, "timestamp": "2026-05-05T03:42:03.704032"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59080, "epoch": 0, "train_loss": 3.6488503366708755, "train_ppl": 38.43045855311289, "lr": 0.00056, "grad_norm": 0.74, "tokens_per_sec": 150748, "dt_s": 4.347, "eta_s": 14683, "world_size": 1, "timestamp": "2026-05-05T03:42:08.051423"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59090, "epoch": 0, "train_loss": 3.782273754477501, "train_ppl": 43.915782006128886, "lr": 0.00056, "grad_norm": 0.6635, "tokens_per_sec": 151752, "dt_s": 4.319, "eta_s": 14660, "world_size": 1, "timestamp": "2026-05-05T03:42:12.370029"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59100, "epoch": 0, "train_loss": 3.7126890271902084, "train_ppl": 40.96381136027288, "lr": 0.00056, "grad_norm": 0.6539, "tokens_per_sec": 149065, "dt_s": 4.396, "eta_s": 14621, "world_size": 1, "timestamp": "2026-05-05T03:42:16.766513"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59110, "epoch": 0, "train_loss": 3.697092816233635, "train_ppl": 40.32988737251655, "lr": 0.00056, "grad_norm": 0.7249, "tokens_per_sec": 149940, "dt_s": 4.371, "eta_s": 14638, "world_size": 1, "timestamp": "2026-05-05T03:42:21.137363"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59120, "epoch": 0, "train_loss": 3.7328048050403595, "train_ppl": 41.79617403340995, "lr": 0.00056, "grad_norm": 0.6626, "tokens_per_sec": 150450, "dt_s": 4.356, "eta_s": 14625, "world_size": 1, "timestamp": "2026-05-05T03:42:25.493360"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59130, "epoch": 0, "train_loss": 3.7187173664569855, "train_ppl": 41.211500941121756, "lr": 0.00056, "grad_norm": 0.6621, "tokens_per_sec": 148702, "dt_s": 4.407, "eta_s": 14661, "world_size": 1, "timestamp": "2026-05-05T03:42:29.900561"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59140, "epoch": 0, "train_loss": 3.773905426263809, "train_ppl": 43.54981373553703, "lr": 0.00056, "grad_norm": 0.6544, "tokens_per_sec": 151149, "dt_s": 4.336, "eta_s": 14668, "world_size": 1, "timestamp": "2026-05-05T03:42:34.236409"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59150, "epoch": 0, "train_loss": 3.739764988422394, "train_ppl": 42.0880978123684, "lr": 0.00056, "grad_norm": 0.6876, "tokens_per_sec": 152251, "dt_s": 4.304, "eta_s": 14602, "world_size": 1, "timestamp": "2026-05-05T03:42:38.540884"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59160, "epoch": 0, "train_loss": 3.6804792284965515, "train_ppl": 39.66539830771949, "lr": 0.00056, "grad_norm": 0.7652, "tokens_per_sec": 148441, "dt_s": 4.415, "eta_s": 14627, "world_size": 1, "timestamp": "2026-05-05T03:42:42.955846"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59170, "epoch": 0, "train_loss": 3.652495101094246, "train_ppl": 38.57078409264424, "lr": 0.00056, "grad_norm": 0.7067, "tokens_per_sec": 150819, "dt_s": 4.345, "eta_s": 14615, "world_size": 1, "timestamp": "2026-05-05T03:42:47.301195"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59180, "epoch": 0, "train_loss": 3.7291924208402634, "train_ppl": 41.6454625724475, "lr": 0.00056, "grad_norm": 0.697, "tokens_per_sec": 150720, "dt_s": 4.348, "eta_s": 14572, "world_size": 1, "timestamp": "2026-05-05T03:42:51.649387"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59190, "epoch": 0, "train_loss": 3.600532352924347, "train_ppl": 36.61772280769355, "lr": 0.00056, "grad_norm": 0.6476, "tokens_per_sec": 150656, "dt_s": 4.35, "eta_s": 14577, "world_size": 1, "timestamp": "2026-05-05T03:42:55.999414"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59200, "epoch": 0, "train_loss": 3.6676983386278152, "train_ppl": 39.161665156410976, "lr": 0.00056, "grad_norm": 0.6477, "tokens_per_sec": 152862, "dt_s": 4.287, "eta_s": 14561, "world_size": 1, "timestamp": "2026-05-05T03:43:00.286699"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59210, "epoch": 0, "train_loss": 3.662292867898941, "train_ppl": 38.95054902679548, "lr": 0.00056, "grad_norm": 0.6456, "tokens_per_sec": 151889, "dt_s": 4.315, "eta_s": 14489, "world_size": 1, "timestamp": "2026-05-05T03:43:04.601384"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59220, "epoch": 0, "train_loss": 3.7213860005140305, "train_ppl": 41.321626232771344, "lr": 0.00056, "grad_norm": 0.7115, "tokens_per_sec": 150812, "dt_s": 4.346, "eta_s": 14485, "world_size": 1, "timestamp": "2026-05-05T03:43:08.946945"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59230, "epoch": 0, "train_loss": 3.6917659789323807, "train_ppl": 40.11562779370142, "lr": 0.00056, "grad_norm": 0.9546, "tokens_per_sec": 151852, "dt_s": 4.316, "eta_s": 14459, "world_size": 1, "timestamp": "2026-05-05T03:43:13.262734"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59240, "epoch": 0, "train_loss": 3.6915539354085922, "train_ppl": 40.10712243640989, "lr": 0.00056, "grad_norm": 0.655, "tokens_per_sec": 144631, "dt_s": 4.531, "eta_s": 14576, "world_size": 1, "timestamp": "2026-05-05T03:43:17.794017"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59250, "epoch": 0, "train_loss": 3.7019225507974625, "train_ppl": 40.525141155941995, "lr": 0.00056, "grad_norm": 0.6812, "tokens_per_sec": 149861, "dt_s": 4.373, "eta_s": 14629, "world_size": 1, "timestamp": "2026-05-05T03:43:22.167112"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59260, "epoch": 0, "train_loss": 3.699979677796364, "train_ppl": 40.4464823900638, "lr": 0.00056, "grad_norm": 0.6484, "tokens_per_sec": 151547, "dt_s": 4.324, "eta_s": 14631, "world_size": 1, "timestamp": "2026-05-05T03:43:26.491571"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59270, "epoch": 0, "train_loss": 3.760921850800514, "train_ppl": 42.988036274218565, "lr": 0.00056, "grad_norm": 0.6796, "tokens_per_sec": 145508, "dt_s": 4.504, "eta_s": 14733, "world_size": 1, "timestamp": "2026-05-05T03:43:30.995525"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59280, "epoch": 0, "train_loss": 3.735836312174797, "train_ppl": 41.92307168156447, "lr": 0.00056, "grad_norm": 0.6968, "tokens_per_sec": 148755, "dt_s": 4.406, "eta_s": 14788, "world_size": 1, "timestamp": "2026-05-05T03:43:35.401147"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59290, "epoch": 0, "train_loss": 3.7573645412921906, "train_ppl": 42.83538619679726, "lr": 0.00056, "grad_norm": 0.6864, "tokens_per_sec": 149526, "dt_s": 4.383, "eta_s": 14685, "world_size": 1, "timestamp": "2026-05-05T03:43:39.784053"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59300, "epoch": 0, "train_loss": 3.75200916826725, "train_ppl": 42.60659988985376, "lr": 0.00056, "grad_norm": 0.6652, "tokens_per_sec": 148541, "dt_s": 4.412, "eta_s": 14706, "world_size": 1, "timestamp": "2026-05-05T03:43:44.196041"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59310, "epoch": 0, "train_loss": 3.6577597558498383, "train_ppl": 38.774381419109254, "lr": 0.00056, "grad_norm": 0.6462, "tokens_per_sec": 132906, "dt_s": 4.931, "eta_s": 15107, "world_size": 1, "timestamp": "2026-05-05T03:43:49.127038"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59320, "epoch": 0, "train_loss": 3.9326026141643524, "train_ppl": 51.03964145178321, "lr": 0.00056, "grad_norm": 0.6778, "tokens_per_sec": 146934, "dt_s": 4.46, "eta_s": 15073, "world_size": 1, "timestamp": "2026-05-05T03:43:53.587306"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59330, "epoch": 0, "train_loss": 3.6783919036388397, "train_ppl": 39.58269008533447, "lr": 0.00056, "grad_norm": 0.753, "tokens_per_sec": 149264, "dt_s": 4.391, "eta_s": 15059, "world_size": 1, "timestamp": "2026-05-05T03:43:57.977877"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59340, "epoch": 0, "train_loss": 3.781302109360695, "train_ppl": 43.873132174597636, "lr": 0.00056, "grad_norm": 0.7331, "tokens_per_sec": 150540, "dt_s": 4.353, "eta_s": 15034, "world_size": 1, "timestamp": "2026-05-05T03:44:02.331267"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59350, "epoch": 0, "train_loss": 3.7765422463417053, "train_ppl": 43.66479828895084, "lr": 0.00056, "grad_norm": 0.6603, "tokens_per_sec": 151462, "dt_s": 4.327, "eta_s": 14973, "world_size": 1, "timestamp": "2026-05-05T03:44:06.658146"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59360, "epoch": 0, "train_loss": 3.691291704773903, "train_ppl": 40.09660649909855, "lr": 0.00056, "grad_norm": 0.711, "tokens_per_sec": 152029, "dt_s": 4.311, "eta_s": 14555, "world_size": 1, "timestamp": "2026-05-05T03:44:10.968899"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59370, "epoch": 0, "train_loss": 3.7432641237974167, "train_ppl": 42.235627727334226, "lr": 0.00056, "grad_norm": 0.6832, "tokens_per_sec": 153377, "dt_s": 4.273, "eta_s": 14426, "world_size": 1, "timestamp": "2026-05-05T03:44:15.241777"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59380, "epoch": 0, "train_loss": 3.646835282444954, "train_ppl": 38.35309706517476, "lr": 0.00056, "grad_norm": 0.7083, "tokens_per_sec": 148998, "dt_s": 4.398, "eta_s": 14427, "world_size": 1, "timestamp": "2026-05-05T03:44:19.640246"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59390, "epoch": 0, "train_loss": 3.755363017320633, "train_ppl": 42.74973588865952, "lr": 0.00056, "grad_norm": 0.6643, "tokens_per_sec": 151432, "dt_s": 4.328, "eta_s": 14406, "world_size": 1, "timestamp": "2026-05-05T03:44:23.967980"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59400, "epoch": 0, "train_loss": 3.739791691303253, "train_ppl": 42.089221700835346, "lr": 0.00056, "grad_norm": 0.6774, "tokens_per_sec": 151440, "dt_s": 4.328, "eta_s": 14402, "world_size": 1, "timestamp": "2026-05-05T03:44:28.295480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59410, "epoch": 0, "train_loss": 3.758548468351364, "train_ppl": 42.88613020227913, "lr": 0.00056, "grad_norm": 0.6712, "tokens_per_sec": 150782, "dt_s": 4.346, "eta_s": 14421, "world_size": 1, "timestamp": "2026-05-05T03:44:32.641896"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59420, "epoch": 0, "train_loss": 3.727933257818222, "train_ppl": 41.59305714635184, "lr": 0.00056, "grad_norm": 0.6816, "tokens_per_sec": 151176, "dt_s": 4.335, "eta_s": 14458, "world_size": 1, "timestamp": "2026-05-05T03:44:36.976966"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59430, "epoch": 0, "train_loss": 3.7605627924203873, "train_ppl": 42.97260383028906, "lr": 0.00056, "grad_norm": 0.7879, "tokens_per_sec": 150539, "dt_s": 4.353, "eta_s": 14424, "world_size": 1, "timestamp": "2026-05-05T03:44:41.330375"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59440, "epoch": 0, "train_loss": 3.7012012153863907, "train_ppl": 40.49591947717043, "lr": 0.00056, "grad_norm": 0.6599, "tokens_per_sec": 151360, "dt_s": 4.33, "eta_s": 14421, "world_size": 1, "timestamp": "2026-05-05T03:44:45.660191"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59450, "epoch": 0, "train_loss": 3.774271160364151, "train_ppl": 43.5657443004815, "lr": 0.00056, "grad_norm": 0.6786, "tokens_per_sec": 150820, "dt_s": 4.345, "eta_s": 14428, "world_size": 1, "timestamp": "2026-05-05T03:44:50.005482"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59460, "epoch": 0, "train_loss": 3.634686455130577, "train_ppl": 37.88997082850279, "lr": 0.00056, "grad_norm": 0.6558, "tokens_per_sec": 149877, "dt_s": 4.373, "eta_s": 14442, "world_size": 1, "timestamp": "2026-05-05T03:44:54.378126"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59470, "epoch": 0, "train_loss": 3.7010811120271683, "train_ppl": 40.49105607326784, "lr": 0.00056, "grad_norm": 0.6774, "tokens_per_sec": 150328, "dt_s": 4.36, "eta_s": 14453, "world_size": 1, "timestamp": "2026-05-05T03:44:58.737684"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59480, "epoch": 0, "train_loss": 3.6607119143009186, "train_ppl": 38.88901866730404, "lr": 0.00056, "grad_norm": 0.6447, "tokens_per_sec": 151354, "dt_s": 4.33, "eta_s": 14434, "world_size": 1, "timestamp": "2026-05-05T03:45:03.067657"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59490, "epoch": 0, "train_loss": 3.606517046689987, "train_ppl": 36.83752573565068, "lr": 0.00056, "grad_norm": 0.6577, "tokens_per_sec": 150390, "dt_s": 4.358, "eta_s": 14448, "world_size": 1, "timestamp": "2026-05-05T03:45:07.425397"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59500, "epoch": 0, "train_loss": 3.645734563469887, "train_ppl": 38.31090430892623, "lr": 0.00056, "grad_norm": 0.6338, "tokens_per_sec": 150916, "dt_s": 4.343, "eta_s": 14441, "world_size": 1, "timestamp": "2026-05-05T03:45:11.767948"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59510, "epoch": 0, "train_loss": 3.723035007715225, "train_ppl": 41.389822104282544, "lr": 0.00056, "grad_norm": 0.6614, "tokens_per_sec": 128086, "dt_s": 5.117, "eta_s": 14423, "world_size": 1, "timestamp": "2026-05-05T03:45:16.884512"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59520, "epoch": 0, "train_loss": 3.8147958368062973, "train_ppl": 45.3674930650907, "lr": 0.00056, "grad_norm": 0.6977, "tokens_per_sec": 148901, "dt_s": 4.401, "eta_s": 14447, "world_size": 1, "timestamp": "2026-05-05T03:45:21.285835"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59530, "epoch": 0, "train_loss": 3.6868497133255005, "train_ppl": 39.91889270969606, "lr": 0.00056, "grad_norm": 0.6281, "tokens_per_sec": 147936, "dt_s": 4.43, "eta_s": 14509, "world_size": 1, "timestamp": "2026-05-05T03:45:25.715861"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59540, "epoch": 0, "train_loss": 3.7055565118789673, "train_ppl": 40.6726758470301, "lr": 0.00056, "grad_norm": 0.65, "tokens_per_sec": 147077, "dt_s": 4.456, "eta_s": 14569, "world_size": 1, "timestamp": "2026-05-05T03:45:30.171757"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59550, "epoch": 0, "train_loss": 3.843492016196251, "train_ppl": 46.68822615049085, "lr": 0.00056, "grad_norm": 0.6569, "tokens_per_sec": 151744, "dt_s": 4.319, "eta_s": 14549, "world_size": 1, "timestamp": "2026-05-05T03:45:34.490628"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59560, "epoch": 0, "train_loss": 3.7550322711467743, "train_ppl": 42.735598915084715, "lr": 0.00056, "grad_norm": 0.6476, "tokens_per_sec": 151046, "dt_s": 4.339, "eta_s": 14536, "world_size": 1, "timestamp": "2026-05-05T03:45:38.829416"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59570, "epoch": 0, "train_loss": 3.7163959443569183, "train_ppl": 41.11594261058393, "lr": 0.00056, "grad_norm": 0.6428, "tokens_per_sec": 148734, "dt_s": 4.406, "eta_s": 14535, "world_size": 1, "timestamp": "2026-05-05T03:45:43.235726"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59580, "epoch": 0, "train_loss": 3.6665519177913666, "train_ppl": 39.11679513236521, "lr": 0.00056, "grad_norm": 0.6367, "tokens_per_sec": 150270, "dt_s": 4.361, "eta_s": 14485, "world_size": 1, "timestamp": "2026-05-05T03:45:47.596924"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59590, "epoch": 0, "train_loss": 3.7024715691804886, "train_ppl": 40.54739631209725, "lr": 0.00056, "grad_norm": 0.6721, "tokens_per_sec": 150553, "dt_s": 4.353, "eta_s": 14413, "world_size": 1, "timestamp": "2026-05-05T03:45:51.949933"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59600, "epoch": 0, "train_loss": 3.8335444033145905, "train_ppl": 46.226092126665066, "lr": 0.00056, "grad_norm": 0.6472, "tokens_per_sec": 134167, "dt_s": 4.885, "eta_s": 14783, "world_size": 1, "timestamp": "2026-05-05T03:45:56.834592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59610, "epoch": 0, "train_loss": 3.6386419534683228, "train_ppl": 38.04014134895732, "lr": 0.00056, "grad_norm": 0.6643, "tokens_per_sec": 148877, "dt_s": 4.402, "eta_s": 14820, "world_size": 1, "timestamp": "2026-05-05T03:46:01.236641"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59620, "epoch": 0, "train_loss": 3.7532259076833725, "train_ppl": 42.658472570694805, "lr": 0.00056, "grad_norm": 0.6826, "tokens_per_sec": 149252, "dt_s": 4.391, "eta_s": 14805, "world_size": 1, "timestamp": "2026-05-05T03:46:05.627591"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59630, "epoch": 0, "train_loss": 3.744809329509735, "train_ppl": 42.30094090871686, "lr": 0.00056, "grad_norm": 0.7131, "tokens_per_sec": 151267, "dt_s": 4.332, "eta_s": 14782, "world_size": 1, "timestamp": "2026-05-05T03:46:09.960069"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59640, "epoch": 0, "train_loss": 3.71727155148983, "train_ppl": 41.15195978935831, "lr": 0.00056, "grad_norm": 0.6653, "tokens_per_sec": 151581, "dt_s": 4.323, "eta_s": 14758, "world_size": 1, "timestamp": "2026-05-05T03:46:14.283553"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59650, "epoch": 0, "train_loss": 3.7967700958251953, "train_ppl": 44.55703686771172, "lr": 0.00056, "grad_norm": 0.6759, "tokens_per_sec": 148665, "dt_s": 4.408, "eta_s": 14439, "world_size": 1, "timestamp": "2026-05-05T03:46:18.691865"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59660, "epoch": 0, "train_loss": 3.712477758526802, "train_ppl": 40.95515790473294, "lr": 0.00056, "grad_norm": 0.6784, "tokens_per_sec": 151566, "dt_s": 4.324, "eta_s": 14383, "world_size": 1, "timestamp": "2026-05-05T03:46:23.015742"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59670, "epoch": 0, "train_loss": 3.715448573231697, "train_ppl": 41.07700899897008, "lr": 0.00056, "grad_norm": 0.694, "tokens_per_sec": 151130, "dt_s": 4.336, "eta_s": 14342, "world_size": 1, "timestamp": "2026-05-05T03:46:27.352145"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59680, "epoch": 0, "train_loss": 3.7868243902921677, "train_ppl": 44.11608213737462, "lr": 0.00056, "grad_norm": 0.7553, "tokens_per_sec": 147552, "dt_s": 4.442, "eta_s": 14410, "world_size": 1, "timestamp": "2026-05-05T03:46:31.793680"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59690, "epoch": 0, "train_loss": 3.6807721853256226, "train_ppl": 39.67702025931349, "lr": 0.00056, "grad_norm": 0.6685, "tokens_per_sec": 151673, "dt_s": 4.321, "eta_s": 14404, "world_size": 1, "timestamp": "2026-05-05T03:46:36.114544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59700, "epoch": 0, "train_loss": 3.5828781872987747, "train_ppl": 35.97694034687459, "lr": 0.00056, "grad_norm": 0.6711, "tokens_per_sec": 150805, "dt_s": 4.346, "eta_s": 14358, "world_size": 1, "timestamp": "2026-05-05T03:46:40.460306"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59710, "epoch": 0, "train_loss": 3.6826749444007874, "train_ppl": 39.752587940433564, "lr": 0.00056, "grad_norm": 0.7215, "tokens_per_sec": 149729, "dt_s": 4.377, "eta_s": 14389, "world_size": 1, "timestamp": "2026-05-05T03:46:44.837314"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59720, "epoch": 0, "train_loss": 3.7198026925325394, "train_ppl": 41.25625313867642, "lr": 0.00056, "grad_norm": 0.6637, "tokens_per_sec": 150923, "dt_s": 4.342, "eta_s": 14389, "world_size": 1, "timestamp": "2026-05-05T03:46:49.179607"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59730, "epoch": 0, "train_loss": 3.6609829366207123, "train_ppl": 38.89955988774624, "lr": 0.00056, "grad_norm": 0.6216, "tokens_per_sec": 150294, "dt_s": 4.361, "eta_s": 14331, "world_size": 1, "timestamp": "2026-05-05T03:46:53.540145"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59740, "epoch": 0, "train_loss": 3.668437883257866, "train_ppl": 39.190637667493256, "lr": 0.00056, "grad_norm": 0.6308, "tokens_per_sec": 151594, "dt_s": 4.323, "eta_s": 14328, "world_size": 1, "timestamp": "2026-05-05T03:46:57.863284"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59750, "epoch": 0, "train_loss": 3.767265185713768, "train_ppl": 43.26159049130706, "lr": 0.00056, "grad_norm": 0.6452, "tokens_per_sec": 151408, "dt_s": 4.328, "eta_s": 14312, "world_size": 1, "timestamp": "2026-05-05T03:47:02.191727"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59760, "epoch": 0, "train_loss": 3.641664043068886, "train_ppl": 38.155275950445606, "lr": 0.00056, "grad_norm": 0.6634, "tokens_per_sec": 147505, "dt_s": 4.443, "eta_s": 14351, "world_size": 1, "timestamp": "2026-05-05T03:47:06.634695"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59770, "epoch": 0, "train_loss": 3.661226809024811, "train_ppl": 38.90904757377982, "lr": 0.00056, "grad_norm": 0.6506, "tokens_per_sec": 150539, "dt_s": 4.353, "eta_s": 14354, "world_size": 1, "timestamp": "2026-05-05T03:47:10.988106"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59780, "epoch": 0, "train_loss": 3.7628265023231506, "train_ppl": 43.069991526285484, "lr": 0.00056, "grad_norm": 0.7113, "tokens_per_sec": 150452, "dt_s": 4.356, "eta_s": 14347, "world_size": 1, "timestamp": "2026-05-05T03:47:15.344057"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59790, "epoch": 0, "train_loss": 3.6983327865600586, "train_ppl": 40.37992625307867, "lr": 0.00056, "grad_norm": 0.6661, "tokens_per_sec": 148397, "dt_s": 4.416, "eta_s": 14404, "world_size": 1, "timestamp": "2026-05-05T03:47:19.760304"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59800, "epoch": 0, "train_loss": 3.746795356273651, "train_ppl": 42.385035188604576, "lr": 0.00056, "grad_norm": 0.6411, "tokens_per_sec": 152024, "dt_s": 4.311, "eta_s": 14388, "world_size": 1, "timestamp": "2026-05-05T03:47:24.071209"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59810, "epoch": 0, "train_loss": 3.617619276046753, "train_ppl": 37.248783107866416, "lr": 0.00056, "grad_norm": 0.6907, "tokens_per_sec": 150955, "dt_s": 4.341, "eta_s": 14317, "world_size": 1, "timestamp": "2026-05-05T03:47:28.412642"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59820, "epoch": 0, "train_loss": 3.6288722306489944, "train_ppl": 37.670309232229066, "lr": 0.00056, "grad_norm": 0.7104, "tokens_per_sec": 150744, "dt_s": 4.348, "eta_s": 14309, "world_size": 1, "timestamp": "2026-05-05T03:47:32.760156"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59830, "epoch": 0, "train_loss": 3.762550875544548, "train_ppl": 43.05812191913259, "lr": 0.00056, "grad_norm": 0.6716, "tokens_per_sec": 151058, "dt_s": 4.338, "eta_s": 14293, "world_size": 1, "timestamp": "2026-05-05T03:47:37.098620"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59840, "epoch": 0, "train_loss": 3.8355766981840134, "train_ppl": 46.32013270325084, "lr": 0.00056, "grad_norm": 1.4548, "tokens_per_sec": 149298, "dt_s": 4.39, "eta_s": 14271, "world_size": 1, "timestamp": "2026-05-05T03:47:41.488237"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59850, "epoch": 0, "train_loss": 3.6268658190965652, "train_ppl": 37.594802862358755, "lr": 0.00056, "grad_norm": 0.7111, "tokens_per_sec": 149836, "dt_s": 4.374, "eta_s": 14308, "world_size": 1, "timestamp": "2026-05-05T03:47:45.862050"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59860, "epoch": 0, "train_loss": 3.6342596113681793, "train_ppl": 37.87380118199909, "lr": 0.00056, "grad_norm": 0.6435, "tokens_per_sec": 151091, "dt_s": 4.338, "eta_s": 14301, "world_size": 1, "timestamp": "2026-05-05T03:47:50.199590"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59870, "epoch": 0, "train_loss": 3.7512888312339783, "train_ppl": 42.575919829409784, "lr": 0.00056, "grad_norm": 0.7156, "tokens_per_sec": 150181, "dt_s": 4.364, "eta_s": 14307, "world_size": 1, "timestamp": "2026-05-05T03:47:54.563383"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59880, "epoch": 0, "train_loss": 3.7422763258218765, "train_ppl": 42.19392805858437, "lr": 0.00056, "grad_norm": 0.8256, "tokens_per_sec": 152366, "dt_s": 4.301, "eta_s": 14278, "world_size": 1, "timestamp": "2026-05-05T03:47:58.864584"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59890, "epoch": 0, "train_loss": 3.6842986047267914, "train_ppl": 39.81718506803376, "lr": 0.00056, "grad_norm": 0.6442, "tokens_per_sec": 151889, "dt_s": 4.315, "eta_s": 14225, "world_size": 1, "timestamp": "2026-05-05T03:48:03.179320"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59900, "epoch": 0, "train_loss": 3.5959270149469376, "train_ppl": 36.449473537945245, "lr": 0.00056, "grad_norm": 0.65, "tokens_per_sec": 133342, "dt_s": 4.915, "eta_s": 14575, "world_size": 1, "timestamp": "2026-05-05T03:48:08.094196"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59910, "epoch": 0, "train_loss": 3.7625345587730408, "train_ppl": 43.057419355327504, "lr": 0.00056, "grad_norm": 0.7007, "tokens_per_sec": 151853, "dt_s": 4.316, "eta_s": 14557, "world_size": 1, "timestamp": "2026-05-05T03:48:12.409909"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59920, "epoch": 0, "train_loss": 3.7825033962726593, "train_ppl": 43.92586806319036, "lr": 0.00056, "grad_norm": 0.6825, "tokens_per_sec": 150111, "dt_s": 4.366, "eta_s": 14554, "world_size": 1, "timestamp": "2026-05-05T03:48:16.775744"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59930, "epoch": 0, "train_loss": 3.74902480840683, "train_ppl": 42.47963601053869, "lr": 0.00056, "grad_norm": 0.6696, "tokens_per_sec": 150697, "dt_s": 4.349, "eta_s": 14580, "world_size": 1, "timestamp": "2026-05-05T03:48:21.124623"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59940, "epoch": 0, "train_loss": 3.6987053602933884, "train_ppl": 40.39497355589519, "lr": 0.00056, "grad_norm": 0.694, "tokens_per_sec": 153402, "dt_s": 4.272, "eta_s": 14548, "world_size": 1, "timestamp": "2026-05-05T03:48:25.396791"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59950, "epoch": 0, "train_loss": 3.748341739177704, "train_ppl": 42.45062938620951, "lr": 0.00056, "grad_norm": 0.6929, "tokens_per_sec": 150128, "dt_s": 4.365, "eta_s": 14184, "world_size": 1, "timestamp": "2026-05-05T03:48:29.762140"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59960, "epoch": 0, "train_loss": 3.7737616300582886, "train_ppl": 43.5435518877965, "lr": 0.00056, "grad_norm": 0.6334, "tokens_per_sec": 150774, "dt_s": 4.347, "eta_s": 14200, "world_size": 1, "timestamp": "2026-05-05T03:48:34.108784"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59970, "epoch": 0, "train_loss": 3.7233335822820663, "train_ppl": 41.4021818975571, "lr": 0.00056, "grad_norm": 0.6409, "tokens_per_sec": 152071, "dt_s": 4.31, "eta_s": 14159, "world_size": 1, "timestamp": "2026-05-05T03:48:38.418337"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59980, "epoch": 0, "train_loss": 3.7462407797574997, "train_ppl": 42.361535960115134, "lr": 0.00056, "grad_norm": 0.7088, "tokens_per_sec": 149288, "dt_s": 4.39, "eta_s": 14181, "world_size": 1, "timestamp": "2026-05-05T03:48:42.808269"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 59990, "epoch": 0, "train_loss": 3.6734201908111572, "train_ppl": 39.38638470861783, "lr": 0.00056, "grad_norm": 0.6826, "tokens_per_sec": 150919, "dt_s": 4.342, "eta_s": 14223, "world_size": 1, "timestamp": "2026-05-05T03:48:47.150710"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60000, "epoch": 0, "train_loss": 3.6674351543188095, "train_ppl": 39.15135977679387, "lr": 0.00056, "grad_norm": 0.6969, "tokens_per_sec": 152161, "dt_s": 4.307, "eta_s": 14180, "world_size": 1, "timestamp": "2026-05-05T03:48:51.457724"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60010, "epoch": 0, "train_loss": 3.7861393690109253, "train_ppl": 44.08587203072981, "lr": 0.00056, "grad_norm": 0.7312, "tokens_per_sec": 126022, "dt_s": 5.2, "eta_s": 14231, "world_size": 1, "timestamp": "2026-05-05T03:48:56.658102"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60020, "epoch": 0, "train_loss": 3.799178585410118, "train_ppl": 44.66448136447948, "lr": 0.00056, "grad_norm": 0.6376, "tokens_per_sec": 149395, "dt_s": 4.387, "eta_s": 14277, "world_size": 1, "timestamp": "2026-05-05T03:49:01.044857"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60030, "epoch": 0, "train_loss": 3.6682815849781036, "train_ppl": 39.18451271691512, "lr": 0.00056, "grad_norm": 0.6972, "tokens_per_sec": 151201, "dt_s": 4.334, "eta_s": 14237, "world_size": 1, "timestamp": "2026-05-05T03:49:05.379264"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60040, "epoch": 0, "train_loss": 3.721419036388397, "train_ppl": 41.32299135137301, "lr": 0.00056, "grad_norm": 0.6338, "tokens_per_sec": 150272, "dt_s": 4.361, "eta_s": 14244, "world_size": 1, "timestamp": "2026-05-05T03:49:09.740419"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60050, "epoch": 0, "train_loss": 3.6057793647050858, "train_ppl": 36.81036137710795, "lr": 0.00056, "grad_norm": 0.6077, "tokens_per_sec": 151070, "dt_s": 4.338, "eta_s": 14260, "world_size": 1, "timestamp": "2026-05-05T03:49:14.078533"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60060, "epoch": 0, "train_loss": 3.780519664287567, "train_ppl": 43.83881728499378, "lr": 0.00056, "grad_norm": 0.7466, "tokens_per_sec": 148097, "dt_s": 4.425, "eta_s": 14252, "world_size": 1, "timestamp": "2026-05-05T03:49:18.503782"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60070, "epoch": 0, "train_loss": 3.7898402959108353, "train_ppl": 44.24933291224605, "lr": 0.00056, "grad_norm": 0.6967, "tokens_per_sec": 149232, "dt_s": 4.392, "eta_s": 14251, "world_size": 1, "timestamp": "2026-05-05T03:49:22.895336"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60080, "epoch": 0, "train_loss": 3.7129895091056824, "train_ppl": 40.97612209425937, "lr": 0.00056, "grad_norm": 0.6859, "tokens_per_sec": 150952, "dt_s": 4.342, "eta_s": 14251, "world_size": 1, "timestamp": "2026-05-05T03:49:27.236836"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60090, "epoch": 0, "train_loss": 3.7746020555496216, "train_ppl": 43.5801623808271, "lr": 0.00056, "grad_norm": 0.67, "tokens_per_sec": 149652, "dt_s": 4.379, "eta_s": 14258, "world_size": 1, "timestamp": "2026-05-05T03:49:31.616082"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60100, "epoch": 0, "train_loss": 3.739607557654381, "train_ppl": 42.081472372343335, "lr": 0.00056, "grad_norm": 0.6478, "tokens_per_sec": 151957, "dt_s": 4.313, "eta_s": 14238, "world_size": 1, "timestamp": "2026-05-05T03:49:35.928872"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60110, "epoch": 0, "train_loss": 3.6966229379177094, "train_ppl": 40.31094168438908, "lr": 0.00056, "grad_norm": 0.7037, "tokens_per_sec": 152321, "dt_s": 4.302, "eta_s": 14153, "world_size": 1, "timestamp": "2026-05-05T03:49:40.231376"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60120, "epoch": 0, "train_loss": 3.787945434451103, "train_ppl": 44.16556594515432, "lr": 0.00056, "grad_norm": 0.7083, "tokens_per_sec": 148362, "dt_s": 4.417, "eta_s": 14166, "world_size": 1, "timestamp": "2026-05-05T03:49:44.648648"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60130, "epoch": 0, "train_loss": 3.842426508665085, "train_ppl": 46.63850598721889, "lr": 0.00056, "grad_norm": 0.73, "tokens_per_sec": 151235, "dt_s": 4.333, "eta_s": 14156, "world_size": 1, "timestamp": "2026-05-05T03:49:48.982032"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60140, "epoch": 0, "train_loss": 3.6957550942897797, "train_ppl": 40.27597326626657, "lr": 0.00056, "grad_norm": 0.6998, "tokens_per_sec": 151232, "dt_s": 4.333, "eta_s": 14122, "world_size": 1, "timestamp": "2026-05-05T03:49:53.315505"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60150, "epoch": 0, "train_loss": 3.7950026839971542, "train_ppl": 44.478355785148885, "lr": 0.00056, "grad_norm": 0.6759, "tokens_per_sec": 147582, "dt_s": 4.441, "eta_s": 14201, "world_size": 1, "timestamp": "2026-05-05T03:49:57.756155"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60160, "epoch": 0, "train_loss": 3.5915861427783966, "train_ppl": 36.29159394817761, "lr": 0.00056, "grad_norm": 0.674, "tokens_per_sec": 150471, "dt_s": 4.355, "eta_s": 14231, "world_size": 1, "timestamp": "2026-05-05T03:50:02.111646"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60170, "epoch": 0, "train_loss": 3.7088114470243454, "train_ppl": 40.80527845847871, "lr": 0.00056, "grad_norm": 0.5963, "tokens_per_sec": 149164, "dt_s": 4.394, "eta_s": 14211, "world_size": 1, "timestamp": "2026-05-05T03:50:06.505092"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60180, "epoch": 0, "train_loss": 3.7867017537355423, "train_ppl": 44.1106722247028, "lr": 0.00056, "grad_norm": 0.6967, "tokens_per_sec": 150386, "dt_s": 4.358, "eta_s": 14222, "world_size": 1, "timestamp": "2026-05-05T03:50:10.862952"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60190, "epoch": 0, "train_loss": 3.76184418797493, "train_ppl": 43.02770402883962, "lr": 0.00056, "grad_norm": 0.8566, "tokens_per_sec": 136292, "dt_s": 4.808, "eta_s": 14527, "world_size": 1, "timestamp": "2026-05-05T03:50:15.671446"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60200, "epoch": 0, "train_loss": 3.673251524567604, "train_ppl": 39.37974211526827, "lr": 0.00056, "grad_norm": 0.6823, "tokens_per_sec": 149226, "dt_s": 4.392, "eta_s": 14491, "world_size": 1, "timestamp": "2026-05-05T03:50:20.063201"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60210, "epoch": 0, "train_loss": 3.7798383831977844, "train_ppl": 43.8089608992294, "lr": 0.00056, "grad_norm": 0.6853, "tokens_per_sec": 150729, "dt_s": 4.348, "eta_s": 14481, "world_size": 1, "timestamp": "2026-05-05T03:50:24.411084"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60220, "epoch": 0, "train_loss": 3.8115231096744537, "train_ppl": 45.219260334579204, "lr": 0.00056, "grad_norm": 0.8009, "tokens_per_sec": 150841, "dt_s": 4.345, "eta_s": 14445, "world_size": 1, "timestamp": "2026-05-05T03:50:28.755813"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60230, "epoch": 0, "train_loss": 3.7495121359825134, "train_ppl": 42.500342553596546, "lr": 0.00056, "grad_norm": 0.6917, "tokens_per_sec": 148165, "dt_s": 4.423, "eta_s": 14483, "world_size": 1, "timestamp": "2026-05-05T03:50:33.178996"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60240, "epoch": 0, "train_loss": 3.804558649659157, "train_ppl": 44.905426713417455, "lr": 0.00056, "grad_norm": 0.7485, "tokens_per_sec": 151051, "dt_s": 4.339, "eta_s": 14174, "world_size": 1, "timestamp": "2026-05-05T03:50:37.517670"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60250, "epoch": 0, "train_loss": 3.7238825410604477, "train_ppl": 41.42491622828819, "lr": 0.00056, "grad_norm": 0.7028, "tokens_per_sec": 151635, "dt_s": 4.322, "eta_s": 14124, "world_size": 1, "timestamp": "2026-05-05T03:50:41.839642"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60260, "epoch": 0, "train_loss": 3.7410835325717926, "train_ppl": 42.14362942989751, "lr": 0.00056, "grad_norm": 0.6519, "tokens_per_sec": 149521, "dt_s": 4.383, "eta_s": 14143, "world_size": 1, "timestamp": "2026-05-05T03:50:46.222711"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60270, "epoch": 0, "train_loss": 3.7015319168567657, "train_ppl": 40.50931375191683, "lr": 0.00056, "grad_norm": 0.6991, "tokens_per_sec": 150273, "dt_s": 4.361, "eta_s": 14149, "world_size": 1, "timestamp": "2026-05-05T03:50:50.583842"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60280, "epoch": 0, "train_loss": 3.898744523525238, "train_ppl": 49.34046441118713, "lr": 0.00056, "grad_norm": 0.7891, "tokens_per_sec": 151372, "dt_s": 4.329, "eta_s": 14084, "world_size": 1, "timestamp": "2026-05-05T03:50:54.913325"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60290, "epoch": 0, "train_loss": 3.750442624092102, "train_ppl": 42.53990702127488, "lr": 0.00056, "grad_norm": 0.6836, "tokens_per_sec": 149979, "dt_s": 4.37, "eta_s": 14100, "world_size": 1, "timestamp": "2026-05-05T03:50:59.282977"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60300, "epoch": 0, "train_loss": 3.787719652056694, "train_ppl": 44.155595263569325, "lr": 0.00056, "grad_norm": 0.6635, "tokens_per_sec": 153178, "dt_s": 4.278, "eta_s": 14067, "world_size": 1, "timestamp": "2026-05-05T03:51:03.561397"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60310, "epoch": 0, "train_loss": 3.6536757051944733, "train_ppl": 38.616347809556174, "lr": 0.00056, "grad_norm": 0.6571, "tokens_per_sec": 151159, "dt_s": 4.336, "eta_s": 14032, "world_size": 1, "timestamp": "2026-05-05T03:51:07.896982"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60320, "epoch": 0, "train_loss": 3.6593420803546906, "train_ppl": 38.83578363929671, "lr": 0.00056, "grad_norm": 0.6185, "tokens_per_sec": 152208, "dt_s": 4.306, "eta_s": 13992, "world_size": 1, "timestamp": "2026-05-05T03:51:12.202680"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60330, "epoch": 0, "train_loss": 3.6491461992263794, "train_ppl": 38.441830368953894, "lr": 0.00056, "grad_norm": 0.6255, "tokens_per_sec": 153137, "dt_s": 4.28, "eta_s": 13955, "world_size": 1, "timestamp": "2026-05-05T03:51:16.482251"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60340, "epoch": 0, "train_loss": 3.6745710968971252, "train_ppl": 39.43174083379701, "lr": 0.00056, "grad_norm": 0.6172, "tokens_per_sec": 150102, "dt_s": 4.366, "eta_s": 13949, "world_size": 1, "timestamp": "2026-05-05T03:51:20.848364"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60350, "epoch": 0, "train_loss": 3.499237447977066, "train_ppl": 33.09020932944303, "lr": 0.00056, "grad_norm": 1.0226, "tokens_per_sec": 152224, "dt_s": 4.305, "eta_s": 13962, "world_size": 1, "timestamp": "2026-05-05T03:51:25.153604"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60360, "epoch": 0, "train_loss": 3.8119968324899673, "train_ppl": 45.2406868046035, "lr": 0.00056, "grad_norm": 0.6967, "tokens_per_sec": 151605, "dt_s": 4.323, "eta_s": 13949, "world_size": 1, "timestamp": "2026-05-05T03:51:29.476414"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60370, "epoch": 0, "train_loss": 3.6756310909986496, "train_ppl": 39.47356040682838, "lr": 0.00056, "grad_norm": 0.744, "tokens_per_sec": 149859, "dt_s": 4.373, "eta_s": 13988, "world_size": 1, "timestamp": "2026-05-05T03:51:33.849612"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60380, "epoch": 0, "train_loss": 3.7616731971502304, "train_ppl": 43.020347315226, "lr": 0.00056, "grad_norm": 0.9204, "tokens_per_sec": 152636, "dt_s": 4.294, "eta_s": 13993, "world_size": 1, "timestamp": "2026-05-05T03:51:38.143248"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60390, "epoch": 0, "train_loss": 3.7072193026542664, "train_ppl": 40.74036225580797, "lr": 0.00056, "grad_norm": 0.7152, "tokens_per_sec": 152240, "dt_s": 4.305, "eta_s": 13949, "world_size": 1, "timestamp": "2026-05-05T03:51:42.447985"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60400, "epoch": 0, "train_loss": 3.849774330854416, "train_ppl": 46.982459543659026, "lr": 0.00056, "grad_norm": 0.6934, "tokens_per_sec": 150905, "dt_s": 4.343, "eta_s": 13969, "world_size": 1, "timestamp": "2026-05-05T03:51:46.790847"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60410, "epoch": 0, "train_loss": 3.751796767115593, "train_ppl": 42.597551159983304, "lr": 0.00056, "grad_norm": 0.6909, "tokens_per_sec": 152148, "dt_s": 4.307, "eta_s": 13955, "world_size": 1, "timestamp": "2026-05-05T03:51:51.098258"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60420, "epoch": 0, "train_loss": 3.7240759879350662, "train_ppl": 41.43293052400912, "lr": 0.00056, "grad_norm": 0.6429, "tokens_per_sec": 151661, "dt_s": 4.321, "eta_s": 13917, "world_size": 1, "timestamp": "2026-05-05T03:51:55.419443"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60430, "epoch": 0, "train_loss": 3.735830143094063, "train_ppl": 41.92281305554839, "lr": 0.00056, "grad_norm": 0.663, "tokens_per_sec": 148638, "dt_s": 4.409, "eta_s": 13987, "world_size": 1, "timestamp": "2026-05-05T03:51:59.828565"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60440, "epoch": 0, "train_loss": 3.756223127245903, "train_ppl": 42.78652117822858, "lr": 0.00056, "grad_norm": 0.6494, "tokens_per_sec": 150796, "dt_s": 4.346, "eta_s": 14009, "world_size": 1, "timestamp": "2026-05-05T03:52:04.174548"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60450, "epoch": 0, "train_loss": 3.6169587671756744, "train_ppl": 37.224188079698116, "lr": 0.00056, "grad_norm": 0.6736, "tokens_per_sec": 148639, "dt_s": 4.409, "eta_s": 14048, "world_size": 1, "timestamp": "2026-05-05T03:52:08.583610"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60460, "epoch": 0, "train_loss": 3.650619223713875, "train_ppl": 38.49849785248052, "lr": 0.00056, "grad_norm": 0.6384, "tokens_per_sec": 151390, "dt_s": 4.329, "eta_s": 14057, "world_size": 1, "timestamp": "2026-05-05T03:52:12.912600"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60470, "epoch": 0, "train_loss": 3.7364667654037476, "train_ppl": 41.94951055083317, "lr": 0.00056, "grad_norm": 0.6592, "tokens_per_sec": 153399, "dt_s": 4.272, "eta_s": 14021, "world_size": 1, "timestamp": "2026-05-05T03:52:17.184843"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60480, "epoch": 0, "train_loss": 3.727239966392517, "train_ppl": 41.56423103006766, "lr": 0.00056, "grad_norm": 0.7208, "tokens_per_sec": 149283, "dt_s": 4.39, "eta_s": 14005, "world_size": 1, "timestamp": "2026-05-05T03:52:21.574907"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60490, "epoch": 0, "train_loss": 3.845510557293892, "train_ppl": 46.78256343356153, "lr": 0.00056, "grad_norm": 0.6727, "tokens_per_sec": 134561, "dt_s": 4.87, "eta_s": 14338, "world_size": 1, "timestamp": "2026-05-05T03:52:26.445269"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60500, "epoch": 0, "train_loss": 3.654005154967308, "train_ppl": 38.62907205245412, "lr": 0.00056, "grad_norm": 0.6693, "tokens_per_sec": 149957, "dt_s": 4.37, "eta_s": 14309, "world_size": 1, "timestamp": "2026-05-05T03:52:30.815582"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60510, "epoch": 0, "train_loss": 3.758203312754631, "train_ppl": 42.871330368687225, "lr": 0.00056, "grad_norm": 0.6859, "tokens_per_sec": 126391, "dt_s": 5.185, "eta_s": 14359, "world_size": 1, "timestamp": "2026-05-05T03:52:36.000783"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60520, "epoch": 0, "train_loss": 3.7147507667541504, "train_ppl": 41.04835519458022, "lr": 0.00056, "grad_norm": 0.7004, "tokens_per_sec": 151037, "dt_s": 4.339, "eta_s": 14397, "world_size": 1, "timestamp": "2026-05-05T03:52:40.339854"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60530, "epoch": 0, "train_loss": 3.8148923069238663, "train_ppl": 45.3718698835932, "lr": 0.00056, "grad_norm": 0.6949, "tokens_per_sec": 146862, "dt_s": 4.462, "eta_s": 14439, "world_size": 1, "timestamp": "2026-05-05T03:52:44.802281"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60540, "epoch": 0, "train_loss": 3.7112669497728348, "train_ppl": 40.9055990502239, "lr": 0.00056, "grad_norm": 0.6353, "tokens_per_sec": 148886, "dt_s": 4.402, "eta_s": 14133, "world_size": 1, "timestamp": "2026-05-05T03:52:49.204006"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60550, "epoch": 0, "train_loss": 3.800807908177376, "train_ppl": 44.73731353830163, "lr": 0.00056, "grad_norm": 0.6884, "tokens_per_sec": 150998, "dt_s": 4.34, "eta_s": 14110, "world_size": 1, "timestamp": "2026-05-05T03:52:53.544206"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60560, "epoch": 0, "train_loss": 3.7121186554431915, "train_ppl": 40.94045342161023, "lr": 0.00056, "grad_norm": 0.6771, "tokens_per_sec": 149243, "dt_s": 4.391, "eta_s": 14091, "world_size": 1, "timestamp": "2026-05-05T03:52:57.935430"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60570, "epoch": 0, "train_loss": 3.7752432972192764, "train_ppl": 43.608116758714104, "lr": 0.00056, "grad_norm": 0.7035, "tokens_per_sec": 150194, "dt_s": 4.363, "eta_s": 14102, "world_size": 1, "timestamp": "2026-05-05T03:53:02.298841"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60580, "epoch": 0, "train_loss": 3.705718234181404, "train_ppl": 40.6792540577217, "lr": 0.00056, "grad_norm": 0.72, "tokens_per_sec": 150833, "dt_s": 4.345, "eta_s": 14022, "world_size": 1, "timestamp": "2026-05-05T03:53:06.643787"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60590, "epoch": 0, "train_loss": 3.7369539588689804, "train_ppl": 41.96995305756675, "lr": 0.00056, "grad_norm": 0.6604, "tokens_per_sec": 147201, "dt_s": 4.452, "eta_s": 14050, "world_size": 1, "timestamp": "2026-05-05T03:53:11.095932"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60600, "epoch": 0, "train_loss": 3.800321638584137, "train_ppl": 44.715564431438, "lr": 0.00056, "grad_norm": 0.6827, "tokens_per_sec": 150100, "dt_s": 4.366, "eta_s": 14063, "world_size": 1, "timestamp": "2026-05-05T03:53:15.462094"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60610, "epoch": 0, "train_loss": 3.5829598009586334, "train_ppl": 35.9798766764675, "lr": 0.00056, "grad_norm": 0.6671, "tokens_per_sec": 149507, "dt_s": 4.383, "eta_s": 14053, "world_size": 1, "timestamp": "2026-05-05T03:53:19.845573"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60620, "epoch": 0, "train_loss": 3.754526287317276, "train_ppl": 42.71398086274343, "lr": 0.00056, "grad_norm": 0.7026, "tokens_per_sec": 150149, "dt_s": 4.365, "eta_s": 14050, "world_size": 1, "timestamp": "2026-05-05T03:53:24.210302"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60630, "epoch": 0, "train_loss": 3.709856867790222, "train_ppl": 40.84795944984674, "lr": 0.00056, "grad_norm": 0.6255, "tokens_per_sec": 150162, "dt_s": 4.364, "eta_s": 14058, "world_size": 1, "timestamp": "2026-05-05T03:53:28.574663"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60640, "epoch": 0, "train_loss": 3.7791337072849274, "train_ppl": 43.77810065422909, "lr": 0.00056, "grad_norm": 0.6926, "tokens_per_sec": 148426, "dt_s": 4.415, "eta_s": 14030, "world_size": 1, "timestamp": "2026-05-05T03:53:32.990092"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60650, "epoch": 0, "train_loss": 3.7015232741832733, "train_ppl": 40.5089636446576, "lr": 0.00056, "grad_norm": 0.7513, "tokens_per_sec": 151346, "dt_s": 4.33, "eta_s": 14002, "world_size": 1, "timestamp": "2026-05-05T03:53:37.320305"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60660, "epoch": 0, "train_loss": 3.7123672366142273, "train_ppl": 40.95063171247783, "lr": 0.00056, "grad_norm": 0.6367, "tokens_per_sec": 149252, "dt_s": 4.391, "eta_s": 14003, "world_size": 1, "timestamp": "2026-05-05T03:53:41.711294"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60670, "epoch": 0, "train_loss": 3.617850825190544, "train_ppl": 37.25740903032625, "lr": 0.00056, "grad_norm": 0.6411, "tokens_per_sec": 148711, "dt_s": 4.407, "eta_s": 14025, "world_size": 1, "timestamp": "2026-05-05T03:53:46.118217"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60680, "epoch": 0, "train_loss": 3.6562157422304153, "train_ppl": 38.714559440972465, "lr": 0.00056, "grad_norm": 0.661, "tokens_per_sec": 149855, "dt_s": 4.373, "eta_s": 14027, "world_size": 1, "timestamp": "2026-05-05T03:53:50.491511"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60690, "epoch": 0, "train_loss": 3.7087571918964386, "train_ppl": 40.80306462293318, "lr": 0.00056, "grad_norm": 0.6523, "tokens_per_sec": 152549, "dt_s": 4.296, "eta_s": 13946, "world_size": 1, "timestamp": "2026-05-05T03:53:54.787599"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60700, "epoch": 0, "train_loss": 3.8002771735191345, "train_ppl": 44.71357619516279, "lr": 0.00056, "grad_norm": 0.7242, "tokens_per_sec": 149371, "dt_s": 4.387, "eta_s": 13978, "world_size": 1, "timestamp": "2026-05-05T03:53:59.175029"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60710, "epoch": 0, "train_loss": 3.7376390397548676, "train_ppl": 41.99871572143874, "lr": 0.00056, "grad_norm": 0.7303, "tokens_per_sec": 150223, "dt_s": 4.363, "eta_s": 13956, "world_size": 1, "timestamp": "2026-05-05T03:54:03.537613"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60720, "epoch": 0, "train_loss": 3.755121961236, "train_ppl": 42.73943204665891, "lr": 0.00056, "grad_norm": 0.6828, "tokens_per_sec": 150540, "dt_s": 4.353, "eta_s": 13917, "world_size": 1, "timestamp": "2026-05-05T03:54:07.891035"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60730, "epoch": 0, "train_loss": 3.7306827306747437, "train_ppl": 41.707573485637255, "lr": 0.00056, "grad_norm": 0.6384, "tokens_per_sec": 148827, "dt_s": 4.404, "eta_s": 13932, "world_size": 1, "timestamp": "2026-05-05T03:54:12.294559"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60740, "epoch": 0, "train_loss": 3.820381000638008, "train_ppl": 45.62158686372257, "lr": 0.00056, "grad_norm": 0.6401, "tokens_per_sec": 150989, "dt_s": 4.34, "eta_s": 13956, "world_size": 1, "timestamp": "2026-05-05T03:54:16.634992"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60750, "epoch": 0, "train_loss": 3.7286132723093033, "train_ppl": 41.62135064684461, "lr": 0.00056, "grad_norm": 0.6742, "tokens_per_sec": 148422, "dt_s": 4.416, "eta_s": 13970, "world_size": 1, "timestamp": "2026-05-05T03:54:21.050497"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60760, "epoch": 0, "train_loss": 3.7217664271593094, "train_ppl": 41.337349070920204, "lr": 0.00056, "grad_norm": 0.6279, "tokens_per_sec": 151478, "dt_s": 4.326, "eta_s": 13942, "world_size": 1, "timestamp": "2026-05-05T03:54:25.376938"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60770, "epoch": 0, "train_loss": 3.5592396557331085, "train_ppl": 35.13647117150973, "lr": 0.00056, "grad_norm": 0.7276, "tokens_per_sec": 150911, "dt_s": 4.343, "eta_s": 13931, "world_size": 1, "timestamp": "2026-05-05T03:54:29.719631"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60780, "epoch": 0, "train_loss": 3.6238166242837906, "train_ppl": 37.48034357749534, "lr": 0.00056, "grad_norm": 0.6485, "tokens_per_sec": 134029, "dt_s": 4.89, "eta_s": 14237, "world_size": 1, "timestamp": "2026-05-05T03:54:34.609327"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60790, "epoch": 0, "train_loss": 3.7214276492595673, "train_ppl": 41.323347262506594, "lr": 0.00056, "grad_norm": 0.632, "tokens_per_sec": 151854, "dt_s": 4.316, "eta_s": 14217, "world_size": 1, "timestamp": "2026-05-05T03:54:38.925051"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60800, "epoch": 0, "train_loss": 3.7374822050333023, "train_ppl": 41.9921293810494, "lr": 0.00056, "grad_norm": 0.7339, "tokens_per_sec": 151066, "dt_s": 4.338, "eta_s": 14163, "world_size": 1, "timestamp": "2026-05-05T03:54:43.263292"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60810, "epoch": 0, "train_loss": 3.8468665182590485, "train_ppl": 46.84604179078856, "lr": 0.00056, "grad_norm": 0.7091, "tokens_per_sec": 147620, "dt_s": 4.44, "eta_s": 14231, "world_size": 1, "timestamp": "2026-05-05T03:54:47.702797"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60820, "epoch": 0, "train_loss": 3.6392363160848618, "train_ppl": 38.06275770739619, "lr": 0.00056, "grad_norm": 0.6506, "tokens_per_sec": 150938, "dt_s": 4.342, "eta_s": 14226, "world_size": 1, "timestamp": "2026-05-05T03:54:52.044698"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60830, "epoch": 0, "train_loss": 3.709183841943741, "train_ppl": 40.82047696660913, "lr": 0.00056, "grad_norm": 0.7423, "tokens_per_sec": 150501, "dt_s": 4.355, "eta_s": 13880, "world_size": 1, "timestamp": "2026-05-05T03:54:56.399229"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60840, "epoch": 0, "train_loss": 3.7083926796913147, "train_ppl": 40.788194118276536, "lr": 0.00056, "grad_norm": 0.6268, "tokens_per_sec": 149887, "dt_s": 4.372, "eta_s": 13912, "world_size": 1, "timestamp": "2026-05-05T03:55:00.771582"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60850, "epoch": 0, "train_loss": 3.692820906639099, "train_ppl": 40.15796921057358, "lr": 0.00056, "grad_norm": 0.6434, "tokens_per_sec": 150623, "dt_s": 4.351, "eta_s": 13916, "world_size": 1, "timestamp": "2026-05-05T03:55:05.122566"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60860, "epoch": 0, "train_loss": 3.5942857414484024, "train_ppl": 36.38969904955197, "lr": 0.00056, "grad_norm": 0.601, "tokens_per_sec": 149596, "dt_s": 4.381, "eta_s": 13874, "world_size": 1, "timestamp": "2026-05-05T03:55:09.503433"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60870, "epoch": 0, "train_loss": 3.6352633982896805, "train_ppl": 37.91183749527628, "lr": 0.00056, "grad_norm": 0.6786, "tokens_per_sec": 150168, "dt_s": 4.364, "eta_s": 13884, "world_size": 1, "timestamp": "2026-05-05T03:55:13.867624"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60880, "epoch": 0, "train_loss": 3.816781982779503, "train_ppl": 45.4576890703224, "lr": 0.00056, "grad_norm": 0.732, "tokens_per_sec": 149391, "dt_s": 4.387, "eta_s": 13900, "world_size": 1, "timestamp": "2026-05-05T03:55:18.254516"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60890, "epoch": 0, "train_loss": 3.853712022304535, "train_ppl": 47.1678266927125, "lr": 0.00056, "grad_norm": 0.8756, "tokens_per_sec": 148323, "dt_s": 4.418, "eta_s": 13925, "world_size": 1, "timestamp": "2026-05-05T03:55:22.672972"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60900, "epoch": 0, "train_loss": 3.6191179305315018, "train_ppl": 37.304648014362904, "lr": 0.00056, "grad_norm": 0.6553, "tokens_per_sec": 151221, "dt_s": 4.334, "eta_s": 13910, "world_size": 1, "timestamp": "2026-05-05T03:55:27.006751"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60910, "epoch": 0, "train_loss": 3.6481916457414627, "train_ppl": 38.40515309380398, "lr": 0.00056, "grad_norm": 0.6479, "tokens_per_sec": 151622, "dt_s": 4.322, "eta_s": 13868, "world_size": 1, "timestamp": "2026-05-05T03:55:31.329085"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60920, "epoch": 0, "train_loss": 3.7728371918201447, "train_ppl": 43.50331716353095, "lr": 0.00056, "grad_norm": 0.6506, "tokens_per_sec": 149299, "dt_s": 4.39, "eta_s": 13880, "world_size": 1, "timestamp": "2026-05-05T03:55:35.718656"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60930, "epoch": 0, "train_loss": 3.825603038072586, "train_ppl": 45.86044762526463, "lr": 0.00056, "grad_norm": 0.6523, "tokens_per_sec": 151309, "dt_s": 4.331, "eta_s": 13840, "world_size": 1, "timestamp": "2026-05-05T03:55:40.049939"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60940, "epoch": 0, "train_loss": 3.769802913069725, "train_ppl": 43.37151603449417, "lr": 0.00056, "grad_norm": 0.6417, "tokens_per_sec": 151619, "dt_s": 4.322, "eta_s": 13775, "world_size": 1, "timestamp": "2026-05-05T03:55:44.372346"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60950, "epoch": 0, "train_loss": 3.769990786910057, "train_ppl": 43.37966517325362, "lr": 0.00056, "grad_norm": 0.6716, "tokens_per_sec": 150174, "dt_s": 4.364, "eta_s": 13790, "world_size": 1, "timestamp": "2026-05-05T03:55:48.736397"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60960, "epoch": 0, "train_loss": 3.7124367356300354, "train_ppl": 40.95347783997895, "lr": 0.00056, "grad_norm": 0.6685, "tokens_per_sec": 150579, "dt_s": 4.352, "eta_s": 13804, "world_size": 1, "timestamp": "2026-05-05T03:55:53.088639"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60970, "epoch": 0, "train_loss": 3.7548478692770004, "train_ppl": 42.72771911728592, "lr": 0.00056, "grad_norm": 0.656, "tokens_per_sec": 149979, "dt_s": 4.37, "eta_s": 13787, "world_size": 1, "timestamp": "2026-05-05T03:55:57.458310"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60980, "epoch": 0, "train_loss": 3.74996979534626, "train_ppl": 42.51979768490057, "lr": 0.00056, "grad_norm": 0.6492, "tokens_per_sec": 151203, "dt_s": 4.334, "eta_s": 13785, "world_size": 1, "timestamp": "2026-05-05T03:56:01.792578"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 60990, "epoch": 0, "train_loss": 3.7452756464481354, "train_ppl": 42.320671153889094, "lr": 0.00056, "grad_norm": 0.6476, "tokens_per_sec": 151320, "dt_s": 4.331, "eta_s": 13786, "world_size": 1, "timestamp": "2026-05-05T03:56:06.123514"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61000, "epoch": 0, "train_loss": 3.5585397332906723, "train_ppl": 35.11188697130913, "lr": 0.00056, "grad_norm": 0.6118, "tokens_per_sec": 147758, "dt_s": 4.435, "eta_s": 13827, "world_size": 1, "timestamp": "2026-05-05T03:56:10.558881"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61010, "epoch": 0, "train_loss": 3.721181735396385, "train_ppl": 41.31318652792561, "lr": 0.00056, "grad_norm": 0.6572, "tokens_per_sec": 127612, "dt_s": 5.136, "eta_s": 13832, "world_size": 1, "timestamp": "2026-05-05T03:56:15.694432"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61020, "epoch": 0, "train_loss": 3.8107567131519318, "train_ppl": 45.184617727390254, "lr": 0.00056, "grad_norm": 0.7052, "tokens_per_sec": 149231, "dt_s": 4.392, "eta_s": 13842, "world_size": 1, "timestamp": "2026-05-05T03:56:20.086012"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61030, "epoch": 0, "train_loss": 3.7606711238622665, "train_ppl": 42.97725936658923, "lr": 0.00056, "grad_norm": 0.6599, "tokens_per_sec": 146213, "dt_s": 4.482, "eta_s": 13931, "world_size": 1, "timestamp": "2026-05-05T03:56:24.568248"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61040, "epoch": 0, "train_loss": 3.757897228002548, "train_ppl": 42.85821011621691, "lr": 0.00056, "grad_norm": 0.6801, "tokens_per_sec": 150049, "dt_s": 4.368, "eta_s": 13950, "world_size": 1, "timestamp": "2026-05-05T03:56:28.935894"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61050, "epoch": 0, "train_loss": 3.990913286805153, "train_ppl": 54.10427953079724, "lr": 0.00056, "grad_norm": 0.7428, "tokens_per_sec": 148924, "dt_s": 4.401, "eta_s": 13923, "world_size": 1, "timestamp": "2026-05-05T03:56:33.336504"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61060, "epoch": 0, "train_loss": 3.7462091594934464, "train_ppl": 42.36019649833955, "lr": 0.00056, "grad_norm": 0.7169, "tokens_per_sec": 150042, "dt_s": 4.368, "eta_s": 13919, "world_size": 1, "timestamp": "2026-05-05T03:56:37.704357"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61070, "epoch": 0, "train_loss": 3.7230153530836105, "train_ppl": 41.38900861057097, "lr": 0.00056, "grad_norm": 0.6667, "tokens_per_sec": 151153, "dt_s": 4.336, "eta_s": 13879, "world_size": 1, "timestamp": "2026-05-05T03:56:42.040107"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61080, "epoch": 0, "train_loss": 3.7535935044288635, "train_ppl": 42.67415656889615, "lr": 0.00056, "grad_norm": 0.6663, "tokens_per_sec": 132753, "dt_s": 4.937, "eta_s": 14162, "world_size": 1, "timestamp": "2026-05-05T03:56:46.976791"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61090, "epoch": 0, "train_loss": 3.706502377986908, "train_ppl": 40.71116495253371, "lr": 0.00056, "grad_norm": 0.66, "tokens_per_sec": 151286, "dt_s": 4.332, "eta_s": 14135, "world_size": 1, "timestamp": "2026-05-05T03:56:51.308719"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61100, "epoch": 0, "train_loss": 3.6390142142772675, "train_ppl": 38.05430483884079, "lr": 0.00056, "grad_norm": 0.638, "tokens_per_sec": 150458, "dt_s": 4.356, "eta_s": 14102, "world_size": 1, "timestamp": "2026-05-05T03:56:55.664484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61110, "epoch": 0, "train_loss": 3.641015738248825, "train_ppl": 38.13054771771852, "lr": 0.00056, "grad_norm": 0.6447, "tokens_per_sec": 148665, "dt_s": 4.408, "eta_s": 14124, "world_size": 1, "timestamp": "2026-05-05T03:57:00.072781"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61120, "epoch": 0, "train_loss": 3.652455061674118, "train_ppl": 38.569239771732356, "lr": 0.00056, "grad_norm": 0.6405, "tokens_per_sec": 153430, "dt_s": 4.271, "eta_s": 14078, "world_size": 1, "timestamp": "2026-05-05T03:57:04.344167"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61130, "epoch": 0, "train_loss": 3.773085594177246, "train_ppl": 43.514124832325734, "lr": 0.00056, "grad_norm": 0.6818, "tokens_per_sec": 150115, "dt_s": 4.366, "eta_s": 13714, "world_size": 1, "timestamp": "2026-05-05T03:57:08.709891"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61140, "epoch": 0, "train_loss": 3.7456583976745605, "train_ppl": 42.33687254302937, "lr": 0.00056, "grad_norm": 0.6876, "tokens_per_sec": 150923, "dt_s": 4.342, "eta_s": 13716, "world_size": 1, "timestamp": "2026-05-05T03:57:13.052241"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61150, "epoch": 0, "train_loss": 3.8727010637521744, "train_ppl": 48.072056567343445, "lr": 0.00056, "grad_norm": 0.6917, "tokens_per_sec": 152940, "dt_s": 4.285, "eta_s": 13667, "world_size": 1, "timestamp": "2026-05-05T03:57:17.337310"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61160, "epoch": 0, "train_loss": 3.651577278971672, "train_ppl": 38.535399214714886, "lr": 0.00056, "grad_norm": 0.6708, "tokens_per_sec": 148107, "dt_s": 4.425, "eta_s": 13673, "world_size": 1, "timestamp": "2026-05-05T03:57:21.762235"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61170, "epoch": 0, "train_loss": 3.7675417065620422, "train_ppl": 43.27355487713223, "lr": 0.00056, "grad_norm": 0.6699, "tokens_per_sec": 150155, "dt_s": 4.365, "eta_s": 13727, "world_size": 1, "timestamp": "2026-05-05T03:57:26.126778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61180, "epoch": 0, "train_loss": 3.857599303126335, "train_ppl": 47.351538118434235, "lr": 0.00056, "grad_norm": 0.7152, "tokens_per_sec": 149953, "dt_s": 4.37, "eta_s": 13726, "world_size": 1, "timestamp": "2026-05-05T03:57:30.497224"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61190, "epoch": 0, "train_loss": 3.653947427868843, "train_ppl": 38.626842172571, "lr": 0.00056, "grad_norm": 0.6995, "tokens_per_sec": 149438, "dt_s": 4.385, "eta_s": 13749, "world_size": 1, "timestamp": "2026-05-05T03:57:34.882701"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61200, "epoch": 0, "train_loss": 3.7767823189496994, "train_ppl": 43.67528226936151, "lr": 0.00056, "grad_norm": 0.6739, "tokens_per_sec": 152233, "dt_s": 4.305, "eta_s": 13757, "world_size": 1, "timestamp": "2026-05-05T03:57:39.187669"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61210, "epoch": 0, "train_loss": 3.746719017624855, "train_ppl": 42.38179969578725, "lr": 0.00056, "grad_norm": 0.6571, "tokens_per_sec": 150914, "dt_s": 4.343, "eta_s": 13701, "world_size": 1, "timestamp": "2026-05-05T03:57:43.530318"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61220, "epoch": 0, "train_loss": 3.5767945498228073, "train_ppl": 35.75873410115243, "lr": 0.00056, "grad_norm": 0.6709, "tokens_per_sec": 148989, "dt_s": 4.399, "eta_s": 13718, "world_size": 1, "timestamp": "2026-05-05T03:57:47.929010"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61230, "epoch": 0, "train_loss": 3.7578930258750916, "train_ppl": 42.858030020933825, "lr": 0.00056, "grad_norm": 0.6509, "tokens_per_sec": 150458, "dt_s": 4.356, "eta_s": 13704, "world_size": 1, "timestamp": "2026-05-05T03:57:52.284779"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61240, "epoch": 0, "train_loss": 3.8808305114507675, "train_ppl": 48.46444864136407, "lr": 0.00056, "grad_norm": 0.6784, "tokens_per_sec": 149901, "dt_s": 4.372, "eta_s": 13692, "world_size": 1, "timestamp": "2026-05-05T03:57:56.656730"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61250, "epoch": 0, "train_loss": 3.6842410415410995, "train_ppl": 39.814893129982224, "lr": 0.00056, "grad_norm": 0.6505, "tokens_per_sec": 148973, "dt_s": 4.399, "eta_s": 13746, "world_size": 1, "timestamp": "2026-05-05T03:58:01.055923"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61260, "epoch": 0, "train_loss": 3.7115963846445084, "train_ppl": 40.91907700092933, "lr": 0.00056, "grad_norm": 0.6487, "tokens_per_sec": 151043, "dt_s": 4.339, "eta_s": 13740, "world_size": 1, "timestamp": "2026-05-05T03:58:05.394820"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61270, "epoch": 0, "train_loss": 3.7677437514066696, "train_ppl": 43.28229895912245, "lr": 0.00056, "grad_norm": 0.7634, "tokens_per_sec": 148944, "dt_s": 4.4, "eta_s": 13736, "world_size": 1, "timestamp": "2026-05-05T03:58:09.794843"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61280, "epoch": 0, "train_loss": 3.6373074054718018, "train_ppl": 37.98940881456339, "lr": 0.00056, "grad_norm": 0.7006, "tokens_per_sec": 150460, "dt_s": 4.356, "eta_s": 13732, "world_size": 1, "timestamp": "2026-05-05T03:58:14.150554"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61290, "epoch": 0, "train_loss": 3.7072700411081314, "train_ppl": 40.74242941124043, "lr": 0.00056, "grad_norm": 0.6754, "tokens_per_sec": 152247, "dt_s": 4.305, "eta_s": 13685, "world_size": 1, "timestamp": "2026-05-05T03:58:18.455150"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61300, "epoch": 0, "train_loss": 3.8334863781929016, "train_ppl": 46.22340992986237, "lr": 0.00056, "grad_norm": 0.6277, "tokens_per_sec": 148838, "dt_s": 4.403, "eta_s": 13683, "world_size": 1, "timestamp": "2026-05-05T03:58:22.858345"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61310, "epoch": 0, "train_loss": 3.5260753631591797, "train_ppl": 33.99030589272119, "lr": 0.00056, "grad_norm": 0.7474, "tokens_per_sec": 151406, "dt_s": 4.328, "eta_s": 13672, "world_size": 1, "timestamp": "2026-05-05T03:58:27.186839"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61320, "epoch": 0, "train_loss": 3.746906563639641, "train_ppl": 42.38974897882452, "lr": 0.00056, "grad_norm": 0.6981, "tokens_per_sec": 150341, "dt_s": 4.359, "eta_s": 13642, "world_size": 1, "timestamp": "2026-05-05T03:58:31.545976"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61330, "epoch": 0, "train_loss": 3.662669762969017, "train_ppl": 38.965232063508694, "lr": 0.00056, "grad_norm": 0.7409, "tokens_per_sec": 147739, "dt_s": 4.436, "eta_s": 13688, "world_size": 1, "timestamp": "2026-05-05T03:58:35.981930"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61340, "epoch": 0, "train_loss": 3.7494959980249405, "train_ppl": 42.49965669040581, "lr": 0.00056, "grad_norm": 0.6561, "tokens_per_sec": 150642, "dt_s": 4.35, "eta_s": 13713, "world_size": 1, "timestamp": "2026-05-05T03:58:40.332377"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61350, "epoch": 0, "train_loss": 3.694847345352173, "train_ppl": 40.23942938316794, "lr": 0.00056, "grad_norm": 0.6609, "tokens_per_sec": 150099, "dt_s": 4.366, "eta_s": 13685, "world_size": 1, "timestamp": "2026-05-05T03:58:44.698539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61360, "epoch": 0, "train_loss": 3.787037417292595, "train_ppl": 44.12548105509903, "lr": 0.00056, "grad_norm": 0.6709, "tokens_per_sec": 149065, "dt_s": 4.396, "eta_s": 13723, "world_size": 1, "timestamp": "2026-05-05T03:58:49.095000"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61370, "epoch": 0, "train_loss": 3.8335433900356293, "train_ppl": 46.22604528676219, "lr": 0.00056, "grad_norm": 0.6755, "tokens_per_sec": 135830, "dt_s": 4.825, "eta_s": 14011, "world_size": 1, "timestamp": "2026-05-05T03:58:53.919848"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61380, "epoch": 0, "train_loss": 3.651797115802765, "train_ppl": 38.54387164600524, "lr": 0.00056, "grad_norm": 0.6905, "tokens_per_sec": 149642, "dt_s": 4.38, "eta_s": 13971, "world_size": 1, "timestamp": "2026-05-05T03:58:58.299377"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61390, "epoch": 0, "train_loss": 3.8375227600336075, "train_ppl": 46.41036231409833, "lr": 0.00056, "grad_norm": 0.805, "tokens_per_sec": 150618, "dt_s": 4.351, "eta_s": 13967, "world_size": 1, "timestamp": "2026-05-05T03:59:02.650494"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61400, "epoch": 0, "train_loss": 3.703550308942795, "train_ppl": 40.591160001329044, "lr": 0.00056, "grad_norm": 0.6654, "tokens_per_sec": 152575, "dt_s": 4.295, "eta_s": 13918, "world_size": 1, "timestamp": "2026-05-05T03:59:06.945855"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61410, "epoch": 0, "train_loss": 3.737816795706749, "train_ppl": 42.00618190668938, "lr": 0.00056, "grad_norm": 0.6071, "tokens_per_sec": 146411, "dt_s": 4.476, "eta_s": 13963, "world_size": 1, "timestamp": "2026-05-05T03:59:11.421997"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61420, "epoch": 0, "train_loss": 3.747835099697113, "train_ppl": 42.429127668655966, "lr": 0.00056, "grad_norm": 0.6388, "tokens_per_sec": 152210, "dt_s": 4.306, "eta_s": 13634, "world_size": 1, "timestamp": "2026-05-05T03:59:15.727642"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61430, "epoch": 0, "train_loss": 3.778918296098709, "train_ppl": 43.76867137725911, "lr": 0.00056, "grad_norm": 0.6369, "tokens_per_sec": 152235, "dt_s": 4.305, "eta_s": 13583, "world_size": 1, "timestamp": "2026-05-05T03:59:20.032562"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61440, "epoch": 0, "train_loss": 3.7489964067935944, "train_ppl": 42.47842953747931, "lr": 0.00056, "grad_norm": 0.7093, "tokens_per_sec": 147716, "dt_s": 4.437, "eta_s": 13632, "world_size": 1, "timestamp": "2026-05-05T03:59:24.469201"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61450, "epoch": 0, "train_loss": 3.6138892769813538, "train_ppl": 37.110103978983005, "lr": 0.00056, "grad_norm": 0.6785, "tokens_per_sec": 152541, "dt_s": 4.296, "eta_s": 13629, "world_size": 1, "timestamp": "2026-05-05T03:59:28.765478"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61460, "epoch": 0, "train_loss": 3.697152078151703, "train_ppl": 40.33227746981788, "lr": 0.00056, "grad_norm": 0.6386, "tokens_per_sec": 152163, "dt_s": 4.307, "eta_s": 13519, "world_size": 1, "timestamp": "2026-05-05T03:59:33.072445"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61470, "epoch": 0, "train_loss": 3.646605223417282, "train_ppl": 38.344274603838095, "lr": 0.00056, "grad_norm": 0.6501, "tokens_per_sec": 150029, "dt_s": 4.368, "eta_s": 13553, "world_size": 1, "timestamp": "2026-05-05T03:59:37.440697"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61480, "epoch": 0, "train_loss": 3.706492528319359, "train_ppl": 40.710763963068196, "lr": 0.00056, "grad_norm": 0.6919, "tokens_per_sec": 151657, "dt_s": 4.321, "eta_s": 13559, "world_size": 1, "timestamp": "2026-05-05T03:59:41.762009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61490, "epoch": 0, "train_loss": 3.7283632159233093, "train_ppl": 41.61094426346721, "lr": 0.00056, "grad_norm": 0.6745, "tokens_per_sec": 151324, "dt_s": 4.331, "eta_s": 13489, "world_size": 1, "timestamp": "2026-05-05T03:59:46.092852"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61500, "epoch": 0, "train_loss": 3.6381770223379135, "train_ppl": 38.022459413798586, "lr": 0.00056, "grad_norm": 0.7304, "tokens_per_sec": 150912, "dt_s": 4.343, "eta_s": 13513, "world_size": 1, "timestamp": "2026-05-05T03:59:50.435514"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61510, "epoch": 0, "train_loss": 3.7365029603242874, "train_ppl": 41.951028937513016, "lr": 0.00056, "grad_norm": 0.7489, "tokens_per_sec": 129052, "dt_s": 5.078, "eta_s": 13511, "world_size": 1, "timestamp": "2026-05-05T03:59:55.513775"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61520, "epoch": 0, "train_loss": 3.6860744804143906, "train_ppl": 39.88795826254246, "lr": 0.00056, "grad_norm": 0.7353, "tokens_per_sec": 148383, "dt_s": 4.417, "eta_s": 13537, "world_size": 1, "timestamp": "2026-05-05T03:59:59.930463"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61530, "epoch": 0, "train_loss": 3.728642091155052, "train_ppl": 41.62255014341273, "lr": 0.00056, "grad_norm": 0.7602, "tokens_per_sec": 150565, "dt_s": 4.353, "eta_s": 13552, "world_size": 1, "timestamp": "2026-05-05T04:00:04.283146"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61540, "epoch": 0, "train_loss": 3.7236140966415405, "train_ppl": 41.41379743317903, "lr": 0.00056, "grad_norm": 0.6493, "tokens_per_sec": 152039, "dt_s": 4.31, "eta_s": 13535, "world_size": 1, "timestamp": "2026-05-05T04:00:08.593584"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61550, "epoch": 0, "train_loss": 3.729772388935089, "train_ppl": 41.66962261738418, "lr": 0.00056, "grad_norm": 0.6349, "tokens_per_sec": 148124, "dt_s": 4.424, "eta_s": 13581, "world_size": 1, "timestamp": "2026-05-05T04:00:13.017999"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61560, "epoch": 0, "train_loss": 3.7597881108522415, "train_ppl": 42.9393266374447, "lr": 0.00056, "grad_norm": 0.6565, "tokens_per_sec": 150836, "dt_s": 4.345, "eta_s": 13599, "world_size": 1, "timestamp": "2026-05-05T04:00:17.362844"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61570, "epoch": 0, "train_loss": 3.8100992888212204, "train_ppl": 45.15492202273131, "lr": 0.00056, "grad_norm": 0.688, "tokens_per_sec": 151795, "dt_s": 4.317, "eta_s": 13533, "world_size": 1, "timestamp": "2026-05-05T04:00:21.680259"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61580, "epoch": 0, "train_loss": 3.6451636850833893, "train_ppl": 38.289039683304004, "lr": 0.00056, "grad_norm": 0.6163, "tokens_per_sec": 149182, "dt_s": 4.393, "eta_s": 13554, "world_size": 1, "timestamp": "2026-05-05T04:00:26.073297"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61590, "epoch": 0, "train_loss": 3.6298415660858154, "train_ppl": 37.70684210132563, "lr": 0.00056, "grad_norm": 0.6742, "tokens_per_sec": 151674, "dt_s": 4.321, "eta_s": 13556, "world_size": 1, "timestamp": "2026-05-05T04:00:30.394125"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61600, "epoch": 0, "train_loss": 3.698538064956665, "train_ppl": 40.38821623044251, "lr": 0.00056, "grad_norm": 0.6932, "tokens_per_sec": 149924, "dt_s": 4.371, "eta_s": 13518, "world_size": 1, "timestamp": "2026-05-05T04:00:34.765414"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61610, "epoch": 0, "train_loss": 3.7009681314229965, "train_ppl": 40.48648162770581, "lr": 0.00056, "grad_norm": 0.6256, "tokens_per_sec": 151695, "dt_s": 4.32, "eta_s": 13499, "world_size": 1, "timestamp": "2026-05-05T04:00:39.085672"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61620, "epoch": 0, "train_loss": 3.7860884368419647, "train_ppl": 44.08362669882707, "lr": 0.00056, "grad_norm": 0.8161, "tokens_per_sec": 151939, "dt_s": 4.313, "eta_s": 13492, "world_size": 1, "timestamp": "2026-05-05T04:00:43.398957"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61630, "epoch": 0, "train_loss": 3.7200116366147995, "train_ppl": 41.2648742892638, "lr": 0.00056, "grad_norm": 0.66, "tokens_per_sec": 150455, "dt_s": 4.356, "eta_s": 13464, "world_size": 1, "timestamp": "2026-05-05T04:00:47.754828"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61640, "epoch": 0, "train_loss": 3.7945019751787186, "train_ppl": 44.456090654816464, "lr": 0.00056, "grad_norm": 0.7802, "tokens_per_sec": 150219, "dt_s": 4.363, "eta_s": 13486, "world_size": 1, "timestamp": "2026-05-05T04:00:52.117537"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61650, "epoch": 0, "train_loss": 3.7268630862236023, "train_ppl": 41.54856924714931, "lr": 0.00056, "grad_norm": 0.704, "tokens_per_sec": 151261, "dt_s": 4.333, "eta_s": 13458, "world_size": 1, "timestamp": "2026-05-05T04:00:56.450195"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61660, "epoch": 0, "train_loss": 3.7354879081249237, "train_ppl": 41.90846805773605, "lr": 0.00056, "grad_norm": 0.6737, "tokens_per_sec": 148484, "dt_s": 4.414, "eta_s": 13511, "world_size": 1, "timestamp": "2026-05-05T04:01:00.863866"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61670, "epoch": 0, "train_loss": 3.7066038250923157, "train_ppl": 40.71529519187279, "lr": 0.00056, "grad_norm": 0.7105, "tokens_per_sec": 135793, "dt_s": 4.826, "eta_s": 13825, "world_size": 1, "timestamp": "2026-05-05T04:01:05.690049"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61680, "epoch": 0, "train_loss": 3.6877561211586, "train_ppl": 39.955091909877716, "lr": 0.00056, "grad_norm": 0.7163, "tokens_per_sec": 152171, "dt_s": 4.307, "eta_s": 13790, "world_size": 1, "timestamp": "2026-05-05T04:01:09.996745"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61690, "epoch": 0, "train_loss": 3.6760756820440292, "train_ppl": 39.49111390008833, "lr": 0.00056, "grad_norm": 0.699, "tokens_per_sec": 148775, "dt_s": 4.405, "eta_s": 13812, "world_size": 1, "timestamp": "2026-05-05T04:01:14.401797"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61700, "epoch": 0, "train_loss": 3.74919493496418, "train_ppl": 42.486863539550534, "lr": 0.00056, "grad_norm": 0.7039, "tokens_per_sec": 153203, "dt_s": 4.278, "eta_s": 13773, "world_size": 1, "timestamp": "2026-05-05T04:01:18.679536"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61710, "epoch": 0, "train_loss": 3.6622686982154846, "train_ppl": 38.94960761573189, "lr": 0.00056, "grad_norm": 0.6398, "tokens_per_sec": 150357, "dt_s": 4.359, "eta_s": 13735, "world_size": 1, "timestamp": "2026-05-05T04:01:23.038265"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61720, "epoch": 0, "train_loss": 3.681166857481003, "train_ppl": 39.692682764992426, "lr": 0.00056, "grad_norm": 0.6656, "tokens_per_sec": 149282, "dt_s": 4.39, "eta_s": 13460, "world_size": 1, "timestamp": "2026-05-05T04:01:27.428312"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61730, "epoch": 0, "train_loss": 3.7327206134796143, "train_ppl": 41.79265529641095, "lr": 0.00056, "grad_norm": 0.6564, "tokens_per_sec": 151459, "dt_s": 4.327, "eta_s": 13469, "world_size": 1, "timestamp": "2026-05-05T04:01:31.755306"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61740, "epoch": 0, "train_loss": 3.6869972497224808, "train_ppl": 39.924782633776296, "lr": 0.00056, "grad_norm": 0.6934, "tokens_per_sec": 150425, "dt_s": 4.357, "eta_s": 13434, "world_size": 1, "timestamp": "2026-05-05T04:01:36.112022"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61750, "epoch": 0, "train_loss": 3.7282507717609406, "train_ppl": 41.6062656187422, "lr": 0.00056, "grad_norm": 0.6401, "tokens_per_sec": 153504, "dt_s": 4.269, "eta_s": 13425, "world_size": 1, "timestamp": "2026-05-05T04:01:40.381374"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61760, "epoch": 0, "train_loss": 3.7398141622543335, "train_ppl": 42.090167496303614, "lr": 0.00056, "grad_norm": 0.6387, "tokens_per_sec": 153618, "dt_s": 4.266, "eta_s": 13363, "world_size": 1, "timestamp": "2026-05-05T04:01:44.647500"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61770, "epoch": 0, "train_loss": 3.756886899471283, "train_ppl": 42.81493111042618, "lr": 0.00056, "grad_norm": 0.684, "tokens_per_sec": 150009, "dt_s": 4.369, "eta_s": 13346, "world_size": 1, "timestamp": "2026-05-05T04:01:49.016316"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61780, "epoch": 0, "train_loss": 3.6856884211301804, "train_ppl": 39.8725621180305, "lr": 0.00056, "grad_norm": 0.7243, "tokens_per_sec": 151366, "dt_s": 4.33, "eta_s": 13343, "world_size": 1, "timestamp": "2026-05-05T04:01:53.345944"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61790, "epoch": 0, "train_loss": 3.644164353609085, "train_ppl": 38.25079535339166, "lr": 0.00056, "grad_norm": 0.6638, "tokens_per_sec": 150494, "dt_s": 4.355, "eta_s": 13338, "world_size": 1, "timestamp": "2026-05-05T04:01:57.700683"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61800, "epoch": 0, "train_loss": 3.688412055373192, "train_ppl": 39.98130841892123, "lr": 0.00056, "grad_norm": 0.6589, "tokens_per_sec": 147212, "dt_s": 4.452, "eta_s": 13446, "world_size": 1, "timestamp": "2026-05-05T04:02:02.152499"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61810, "epoch": 0, "train_loss": 3.6990961879491806, "train_ppl": 40.41076411420828, "lr": 0.00056, "grad_norm": 0.6451, "tokens_per_sec": 150891, "dt_s": 4.343, "eta_s": 13489, "world_size": 1, "timestamp": "2026-05-05T04:02:06.495769"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61820, "epoch": 0, "train_loss": 3.709242209792137, "train_ppl": 40.82285963955525, "lr": 0.00056, "grad_norm": 0.6567, "tokens_per_sec": 150106, "dt_s": 4.366, "eta_s": 13483, "world_size": 1, "timestamp": "2026-05-05T04:02:10.861766"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61830, "epoch": 0, "train_loss": 3.811589613556862, "train_ppl": 45.2222676909504, "lr": 0.00056, "grad_norm": 0.7833, "tokens_per_sec": 149921, "dt_s": 4.371, "eta_s": 13504, "world_size": 1, "timestamp": "2026-05-05T04:02:15.233115"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61840, "epoch": 0, "train_loss": 3.8270546942949295, "train_ppl": 45.92706957380297, "lr": 0.00056, "grad_norm": 0.6992, "tokens_per_sec": 151283, "dt_s": 4.332, "eta_s": 13486, "world_size": 1, "timestamp": "2026-05-05T04:02:19.565124"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61850, "epoch": 0, "train_loss": 3.690717041492462, "train_ppl": 40.0735710710744, "lr": 0.00056, "grad_norm": 0.6387, "tokens_per_sec": 150035, "dt_s": 4.368, "eta_s": 13430, "world_size": 1, "timestamp": "2026-05-05T04:02:23.933199"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61860, "epoch": 0, "train_loss": 3.6929493695497513, "train_ppl": 40.16312835155629, "lr": 0.00056, "grad_norm": 0.7617, "tokens_per_sec": 148494, "dt_s": 4.413, "eta_s": 13469, "world_size": 1, "timestamp": "2026-05-05T04:02:28.346538"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61870, "epoch": 0, "train_loss": 3.7122209072113037, "train_ppl": 40.944639869392105, "lr": 0.00056, "grad_norm": 0.6696, "tokens_per_sec": 151758, "dt_s": 4.318, "eta_s": 13435, "world_size": 1, "timestamp": "2026-05-05T04:02:32.664978"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61880, "epoch": 0, "train_loss": 3.7836253196001053, "train_ppl": 43.975177174597775, "lr": 0.00056, "grad_norm": 0.699, "tokens_per_sec": 150165, "dt_s": 4.364, "eta_s": 13427, "world_size": 1, "timestamp": "2026-05-05T04:02:37.029268"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61890, "epoch": 0, "train_loss": 3.7454239279031754, "train_ppl": 42.32694698986961, "lr": 0.00056, "grad_norm": 0.6508, "tokens_per_sec": 150181, "dt_s": 4.364, "eta_s": 13442, "world_size": 1, "timestamp": "2026-05-05T04:02:41.393036"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61900, "epoch": 0, "train_loss": 3.725313976407051, "train_ppl": 41.48425577783952, "lr": 0.00056, "grad_norm": 0.6675, "tokens_per_sec": 150472, "dt_s": 4.355, "eta_s": 13430, "world_size": 1, "timestamp": "2026-05-05T04:02:45.748396"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61910, "epoch": 0, "train_loss": 3.6528953909873962, "train_ppl": 38.58622667823668, "lr": 0.00056, "grad_norm": 0.6278, "tokens_per_sec": 148732, "dt_s": 4.406, "eta_s": 13421, "world_size": 1, "timestamp": "2026-05-05T04:02:50.154717"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61920, "epoch": 0, "train_loss": 3.659731686115265, "train_ppl": 38.85091723219505, "lr": 0.00056, "grad_norm": 0.7153, "tokens_per_sec": 150815, "dt_s": 4.345, "eta_s": 13433, "world_size": 1, "timestamp": "2026-05-05T04:02:54.500170"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61930, "epoch": 0, "train_loss": 3.7290572226047516, "train_ppl": 41.639832559983084, "lr": 0.00056, "grad_norm": 0.7172, "tokens_per_sec": 150045, "dt_s": 4.368, "eta_s": 13431, "world_size": 1, "timestamp": "2026-05-05T04:02:58.867917"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61940, "epoch": 0, "train_loss": 3.7381743639707565, "train_ppl": 42.02120466990296, "lr": 0.00056, "grad_norm": 0.6794, "tokens_per_sec": 148590, "dt_s": 4.411, "eta_s": 13455, "world_size": 1, "timestamp": "2026-05-05T04:03:03.278456"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61950, "epoch": 0, "train_loss": 3.737449213862419, "train_ppl": 41.99074403438547, "lr": 0.00056, "grad_norm": 0.6829, "tokens_per_sec": 150472, "dt_s": 4.355, "eta_s": 13451, "world_size": 1, "timestamp": "2026-05-05T04:03:07.633836"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61960, "epoch": 0, "train_loss": 3.710891142487526, "train_ppl": 40.89022931630078, "lr": 0.00056, "grad_norm": 0.6973, "tokens_per_sec": 134455, "dt_s": 4.874, "eta_s": 13734, "world_size": 1, "timestamp": "2026-05-05T04:03:12.508042"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61970, "epoch": 0, "train_loss": 3.7685202807188034, "train_ppl": 43.315921985904126, "lr": 0.00056, "grad_norm": 0.6602, "tokens_per_sec": 151151, "dt_s": 4.336, "eta_s": 13724, "world_size": 1, "timestamp": "2026-05-05T04:03:16.843865"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61980, "epoch": 0, "train_loss": 3.753946900367737, "train_ppl": 42.68924010759612, "lr": 0.00056, "grad_norm": 0.618, "tokens_per_sec": 153548, "dt_s": 4.268, "eta_s": 13658, "world_size": 1, "timestamp": "2026-05-05T04:03:21.111966"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 61990, "epoch": 0, "train_loss": 3.6728957146406174, "train_ppl": 39.365732904557404, "lr": 0.00056, "grad_norm": 0.6822, "tokens_per_sec": 151696, "dt_s": 4.32, "eta_s": 13598, "world_size": 1, "timestamp": "2026-05-05T04:03:25.432195"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62000, "epoch": 0, "train_loss": 3.6184296160936356, "train_ppl": 37.27897952154563, "lr": 0.00056, "grad_norm": 4.6438, "tokens_per_sec": 150921, "dt_s": 4.342, "eta_s": 13586, "world_size": 1, "timestamp": "2026-05-05T04:03:29.774570"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62010, "epoch": 0, "train_loss": 3.673275336623192, "train_ppl": 39.38067983904108, "lr": 0.00056, "grad_norm": 0.6411, "tokens_per_sec": 128371, "dt_s": 5.105, "eta_s": 13250, "world_size": 1, "timestamp": "2026-05-05T04:03:34.879813"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62020, "epoch": 0, "train_loss": 3.815831735730171, "train_ppl": 45.41451355236701, "lr": 0.00056, "grad_norm": 0.713, "tokens_per_sec": 149162, "dt_s": 4.394, "eta_s": 13281, "world_size": 1, "timestamp": "2026-05-05T04:03:39.273405"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62030, "epoch": 0, "train_loss": 3.8100603967905045, "train_ppl": 45.15316589026703, "lr": 0.00056, "grad_norm": 0.6896, "tokens_per_sec": 149608, "dt_s": 4.381, "eta_s": 13345, "world_size": 1, "timestamp": "2026-05-05T04:03:43.653923"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62040, "epoch": 0, "train_loss": 3.61716391146183, "train_ppl": 37.2318251925177, "lr": 0.00056, "grad_norm": 0.6622, "tokens_per_sec": 148739, "dt_s": 4.406, "eta_s": 13394, "world_size": 1, "timestamp": "2026-05-05T04:03:48.060027"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62050, "epoch": 0, "train_loss": 3.7658687978982925, "train_ppl": 43.20122269170607, "lr": 0.00056, "grad_norm": 0.6563, "tokens_per_sec": 149780, "dt_s": 4.375, "eta_s": 13410, "world_size": 1, "timestamp": "2026-05-05T04:03:52.435506"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62060, "epoch": 0, "train_loss": 3.8129605650901794, "train_ppl": 45.284307745418246, "lr": 0.00056, "grad_norm": 0.6856, "tokens_per_sec": 153087, "dt_s": 4.281, "eta_s": 13373, "world_size": 1, "timestamp": "2026-05-05T04:03:56.716509"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62070, "epoch": 0, "train_loss": 3.66967049241066, "train_ppl": 39.2389741900862, "lr": 0.00056, "grad_norm": 0.653, "tokens_per_sec": 148269, "dt_s": 4.42, "eta_s": 13385, "world_size": 1, "timestamp": "2026-05-05T04:04:01.136564"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62080, "epoch": 0, "train_loss": 3.710628941655159, "train_ppl": 40.87950926960237, "lr": 0.00056, "grad_norm": 0.6856, "tokens_per_sec": 150892, "dt_s": 4.343, "eta_s": 13358, "world_size": 1, "timestamp": "2026-05-05T04:04:05.479837"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62090, "epoch": 0, "train_loss": 3.7518177181482315, "train_ppl": 42.59844363201705, "lr": 0.00056, "grad_norm": 0.6905, "tokens_per_sec": 150014, "dt_s": 4.369, "eta_s": 13330, "world_size": 1, "timestamp": "2026-05-05T04:04:09.848481"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62100, "epoch": 0, "train_loss": 3.8273904770612717, "train_ppl": 45.942493681704335, "lr": 0.00056, "grad_norm": 0.669, "tokens_per_sec": 149923, "dt_s": 4.371, "eta_s": 13323, "world_size": 1, "timestamp": "2026-05-05T04:04:14.219813"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62110, "epoch": 0, "train_loss": 3.65579591691494, "train_ppl": 38.69830950014861, "lr": 0.00056, "grad_norm": 0.6361, "tokens_per_sec": 151110, "dt_s": 4.337, "eta_s": 13353, "world_size": 1, "timestamp": "2026-05-05T04:04:18.556783"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62120, "epoch": 0, "train_loss": 3.722410276532173, "train_ppl": 41.363972667068495, "lr": 0.00056, "grad_norm": 0.6737, "tokens_per_sec": 150957, "dt_s": 4.341, "eta_s": 13301, "world_size": 1, "timestamp": "2026-05-05T04:04:22.898153"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62130, "epoch": 0, "train_loss": 3.7473232746124268, "train_ppl": 42.40741693331722, "lr": 0.00056, "grad_norm": 0.7378, "tokens_per_sec": 150978, "dt_s": 4.341, "eta_s": 13295, "world_size": 1, "timestamp": "2026-05-05T04:04:27.238927"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62140, "epoch": 0, "train_loss": 3.64359088242054, "train_ppl": 38.22886591286971, "lr": 0.00056, "grad_norm": 0.6652, "tokens_per_sec": 150935, "dt_s": 4.342, "eta_s": 13274, "world_size": 1, "timestamp": "2026-05-05T04:04:31.580914"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62150, "epoch": 0, "train_loss": 3.7183423936367035, "train_ppl": 41.19605064528734, "lr": 0.00056, "grad_norm": 0.6638, "tokens_per_sec": 149885, "dt_s": 4.372, "eta_s": 13271, "world_size": 1, "timestamp": "2026-05-05T04:04:35.953335"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62160, "epoch": 0, "train_loss": 3.5806143432855606, "train_ppl": 35.89558628705209, "lr": 0.00056, "grad_norm": 0.7218, "tokens_per_sec": 150030, "dt_s": 4.368, "eta_s": 13285, "world_size": 1, "timestamp": "2026-05-05T04:04:40.321514"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62170, "epoch": 0, "train_loss": 3.6839393377304077, "train_ppl": 39.80288263689949, "lr": 0.00056, "grad_norm": 0.6906, "tokens_per_sec": 151658, "dt_s": 4.321, "eta_s": 13269, "world_size": 1, "timestamp": "2026-05-05T04:04:44.642816"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62180, "epoch": 0, "train_loss": 3.7525734901428223, "train_ppl": 42.63065051172312, "lr": 0.00056, "grad_norm": 0.6666, "tokens_per_sec": 149705, "dt_s": 4.378, "eta_s": 13287, "world_size": 1, "timestamp": "2026-05-05T04:04:49.020508"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62190, "epoch": 0, "train_loss": 3.6753239035606384, "train_ppl": 39.46143648719107, "lr": 0.00056, "grad_norm": 0.7142, "tokens_per_sec": 152145, "dt_s": 4.307, "eta_s": 13261, "world_size": 1, "timestamp": "2026-05-05T04:04:53.327955"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62200, "epoch": 0, "train_loss": 3.6251674592494965, "train_ppl": 37.531007547753646, "lr": 0.00056, "grad_norm": 0.6852, "tokens_per_sec": 151110, "dt_s": 4.337, "eta_s": 13236, "world_size": 1, "timestamp": "2026-05-05T04:04:57.664917"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62210, "epoch": 0, "train_loss": 3.755041003227234, "train_ppl": 42.73597208740221, "lr": 0.00056, "grad_norm": 0.6879, "tokens_per_sec": 150024, "dt_s": 4.368, "eta_s": 13231, "world_size": 1, "timestamp": "2026-05-05T04:05:02.033291"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62220, "epoch": 0, "train_loss": 3.7177065908908844, "train_ppl": 41.1698664080572, "lr": 0.00056, "grad_norm": 0.6809, "tokens_per_sec": 152170, "dt_s": 4.307, "eta_s": 13218, "world_size": 1, "timestamp": "2026-05-05T04:05:06.340071"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62230, "epoch": 0, "train_loss": 3.710892900824547, "train_ppl": 40.890301215167995, "lr": 0.00056, "grad_norm": 0.6753, "tokens_per_sec": 150866, "dt_s": 4.344, "eta_s": 13193, "world_size": 1, "timestamp": "2026-05-05T04:05:10.684041"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62240, "epoch": 0, "train_loss": 3.7357073575258255, "train_ppl": 41.91766585513268, "lr": 0.00056, "grad_norm": 0.7061, "tokens_per_sec": 148905, "dt_s": 4.401, "eta_s": 13246, "world_size": 1, "timestamp": "2026-05-05T04:05:15.085270"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62250, "epoch": 0, "train_loss": 3.725776419043541, "train_ppl": 41.50344430290835, "lr": 0.00056, "grad_norm": 0.6826, "tokens_per_sec": 151826, "dt_s": 4.317, "eta_s": 13229, "world_size": 1, "timestamp": "2026-05-05T04:05:19.401775"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62260, "epoch": 0, "train_loss": 3.7292925268411636, "train_ppl": 41.64963174183722, "lr": 0.00056, "grad_norm": 0.6391, "tokens_per_sec": 133888, "dt_s": 4.895, "eta_s": 13545, "world_size": 1, "timestamp": "2026-05-05T04:05:24.296609"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62270, "epoch": 0, "train_loss": 3.6664123833179474, "train_ppl": 39.1113373717363, "lr": 0.00056, "grad_norm": 0.7522, "tokens_per_sec": 149480, "dt_s": 4.384, "eta_s": 13588, "world_size": 1, "timestamp": "2026-05-05T04:05:28.680857"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62280, "epoch": 0, "train_loss": 3.7122227251529694, "train_ppl": 40.94471430442657, "lr": 0.00056, "grad_norm": 0.6932, "tokens_per_sec": 151162, "dt_s": 4.335, "eta_s": 13578, "world_size": 1, "timestamp": "2026-05-05T04:05:33.016348"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62290, "epoch": 0, "train_loss": 3.6758263260126114, "train_ppl": 39.48126778029563, "lr": 0.00056, "grad_norm": 0.6471, "tokens_per_sec": 149828, "dt_s": 4.374, "eta_s": 13557, "world_size": 1, "timestamp": "2026-05-05T04:05:37.390420"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62300, "epoch": 0, "train_loss": 3.708345741033554, "train_ppl": 40.78627962012448, "lr": 0.00056, "grad_norm": 0.6565, "tokens_per_sec": 152093, "dt_s": 4.309, "eta_s": 13548, "world_size": 1, "timestamp": "2026-05-05T04:05:41.699395"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62310, "epoch": 0, "train_loss": 3.8100756108760834, "train_ppl": 45.15385285962284, "lr": 0.00056, "grad_norm": 1.0082, "tokens_per_sec": 149856, "dt_s": 4.373, "eta_s": 13227, "world_size": 1, "timestamp": "2026-05-05T04:05:46.072633"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62320, "epoch": 0, "train_loss": 3.6343212574720383, "train_ppl": 37.87613602624658, "lr": 0.00056, "grad_norm": 0.6417, "tokens_per_sec": 149353, "dt_s": 4.388, "eta_s": 13225, "world_size": 1, "timestamp": "2026-05-05T04:05:50.460641"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62330, "epoch": 0, "train_loss": 3.708368733525276, "train_ppl": 40.787217409102034, "lr": 0.00056, "grad_norm": 0.6755, "tokens_per_sec": 151816, "dt_s": 4.317, "eta_s": 13209, "world_size": 1, "timestamp": "2026-05-05T04:05:54.777453"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62340, "epoch": 0, "train_loss": 3.7284135073423386, "train_ppl": 41.613036989524005, "lr": 0.00056, "grad_norm": 0.6829, "tokens_per_sec": 151665, "dt_s": 4.321, "eta_s": 13173, "world_size": 1, "timestamp": "2026-05-05T04:05:59.098562"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62350, "epoch": 0, "train_loss": 3.7072207033634186, "train_ppl": 40.740419321246215, "lr": 0.00056, "grad_norm": 0.6678, "tokens_per_sec": 148141, "dt_s": 4.424, "eta_s": 13238, "world_size": 1, "timestamp": "2026-05-05T04:06:03.522464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62360, "epoch": 0, "train_loss": 3.600228801369667, "train_ppl": 36.60660912787917, "lr": 0.00056, "grad_norm": 0.6013, "tokens_per_sec": 151324, "dt_s": 4.331, "eta_s": 13208, "world_size": 1, "timestamp": "2026-05-05T04:06:07.853307"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62370, "epoch": 0, "train_loss": 3.736012488603592, "train_ppl": 41.93045818926261, "lr": 0.00056, "grad_norm": 0.6562, "tokens_per_sec": 149287, "dt_s": 4.39, "eta_s": 13205, "world_size": 1, "timestamp": "2026-05-05T04:06:12.243247"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62380, "epoch": 0, "train_loss": 3.7841087132692337, "train_ppl": 43.99643963549824, "lr": 0.00056, "grad_norm": 0.6947, "tokens_per_sec": 152237, "dt_s": 4.305, "eta_s": 13193, "world_size": 1, "timestamp": "2026-05-05T04:06:16.548095"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62390, "epoch": 0, "train_loss": 3.720213606953621, "train_ppl": 41.27320941160084, "lr": 0.00056, "grad_norm": 0.711, "tokens_per_sec": 150202, "dt_s": 4.363, "eta_s": 13214, "world_size": 1, "timestamp": "2026-05-05T04:06:20.911290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62400, "epoch": 0, "train_loss": 3.609179273247719, "train_ppl": 36.93572623297197, "lr": 0.00056, "grad_norm": 0.6791, "tokens_per_sec": 148404, "dt_s": 4.416, "eta_s": 13205, "world_size": 1, "timestamp": "2026-05-05T04:06:25.327337"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62410, "epoch": 0, "train_loss": 3.7671814262866974, "train_ppl": 43.25796707702301, "lr": 0.00056, "grad_norm": 0.6472, "tokens_per_sec": 150003, "dt_s": 4.369, "eta_s": 13224, "world_size": 1, "timestamp": "2026-05-05T04:06:29.696352"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62420, "epoch": 0, "train_loss": 3.671529307961464, "train_ppl": 39.311980036689036, "lr": 0.00056, "grad_norm": 0.6978, "tokens_per_sec": 150453, "dt_s": 4.356, "eta_s": 13199, "world_size": 1, "timestamp": "2026-05-05T04:06:34.052226"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62430, "epoch": 0, "train_loss": 3.648677036166191, "train_ppl": 38.423799112309275, "lr": 0.00056, "grad_norm": 0.7173, "tokens_per_sec": 148412, "dt_s": 4.416, "eta_s": 13262, "world_size": 1, "timestamp": "2026-05-05T04:06:38.468039"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62440, "epoch": 0, "train_loss": 3.6392863988876343, "train_ppl": 38.06466404472038, "lr": 0.00056, "grad_norm": 0.6789, "tokens_per_sec": 151435, "dt_s": 4.328, "eta_s": 13236, "world_size": 1, "timestamp": "2026-05-05T04:06:42.795682"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62450, "epoch": 0, "train_loss": 3.6868524104356766, "train_ppl": 39.919000375493, "lr": 0.00056, "grad_norm": 0.7334, "tokens_per_sec": 150133, "dt_s": 4.365, "eta_s": 13201, "world_size": 1, "timestamp": "2026-05-05T04:06:47.160877"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62460, "epoch": 0, "train_loss": 3.6871079951524734, "train_ppl": 39.929204365835226, "lr": 0.00056, "grad_norm": 0.6604, "tokens_per_sec": 148700, "dt_s": 4.407, "eta_s": 13219, "world_size": 1, "timestamp": "2026-05-05T04:06:51.568127"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62470, "epoch": 0, "train_loss": 3.7060692757368088, "train_ppl": 40.69353667308791, "lr": 0.00056, "grad_norm": 0.6389, "tokens_per_sec": 152997, "dt_s": 4.283, "eta_s": 13171, "world_size": 1, "timestamp": "2026-05-05T04:06:55.851602"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62480, "epoch": 0, "train_loss": 3.6410010904073715, "train_ppl": 38.129989191591626, "lr": 0.00056, "grad_norm": 0.7876, "tokens_per_sec": 150347, "dt_s": 4.359, "eta_s": 13133, "world_size": 1, "timestamp": "2026-05-05T04:07:00.210584"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62490, "epoch": 0, "train_loss": 3.673460081219673, "train_ppl": 39.38795587893093, "lr": 0.00056, "grad_norm": 0.6635, "tokens_per_sec": 150170, "dt_s": 4.364, "eta_s": 13150, "world_size": 1, "timestamp": "2026-05-05T04:07:04.574677"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62500, "epoch": 0, "train_loss": 3.6518708616495132, "train_ppl": 38.54671420126877, "lr": 0.00056, "grad_norm": 0.6237, "tokens_per_sec": 150550, "dt_s": 4.353, "eta_s": 13139, "world_size": 1, "timestamp": "2026-05-05T04:07:08.927788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62510, "epoch": 0, "train_loss": 3.7335557639598846, "train_ppl": 41.82757303130553, "lr": 0.00056, "grad_norm": 0.6715, "tokens_per_sec": 126264, "dt_s": 5.19, "eta_s": 13146, "world_size": 1, "timestamp": "2026-05-05T04:07:14.118166"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62520, "epoch": 0, "train_loss": 3.750659555196762, "train_ppl": 42.54913625131448, "lr": 0.00056, "grad_norm": 0.8958, "tokens_per_sec": 151616, "dt_s": 4.322, "eta_s": 13165, "world_size": 1, "timestamp": "2026-05-05T04:07:18.440697"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62530, "epoch": 0, "train_loss": 3.6419817954301834, "train_ppl": 38.16740180588239, "lr": 0.00056, "grad_norm": 0.647, "tokens_per_sec": 148737, "dt_s": 4.406, "eta_s": 13189, "world_size": 1, "timestamp": "2026-05-05T04:07:22.846853"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62540, "epoch": 0, "train_loss": 3.6951325684785843, "train_ppl": 40.250908235960935, "lr": 0.00056, "grad_norm": 0.7395, "tokens_per_sec": 146846, "dt_s": 4.463, "eta_s": 13244, "world_size": 1, "timestamp": "2026-05-05T04:07:27.309759"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62550, "epoch": 0, "train_loss": 3.6077274829149246, "train_ppl": 36.88214220848814, "lr": 0.00056, "grad_norm": 0.641, "tokens_per_sec": 135176, "dt_s": 4.848, "eta_s": 13538, "world_size": 1, "timestamp": "2026-05-05T04:07:32.157977"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62560, "epoch": 0, "train_loss": 3.7071049213409424, "train_ppl": 40.73570258616251, "lr": 0.00056, "grad_norm": 0.666, "tokens_per_sec": 149178, "dt_s": 4.393, "eta_s": 13514, "world_size": 1, "timestamp": "2026-05-05T04:07:36.551101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62570, "epoch": 0, "train_loss": 3.6912454664707184, "train_ppl": 40.094752542912794, "lr": 0.00056, "grad_norm": 0.6975, "tokens_per_sec": 149917, "dt_s": 4.371, "eta_s": 13539, "world_size": 1, "timestamp": "2026-05-05T04:07:40.922575"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62580, "epoch": 0, "train_loss": 3.684369534254074, "train_ppl": 39.820009382310815, "lr": 0.00056, "grad_norm": 0.725, "tokens_per_sec": 153009, "dt_s": 4.283, "eta_s": 13460, "world_size": 1, "timestamp": "2026-05-05T04:07:45.205743"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62590, "epoch": 0, "train_loss": 3.7111772894859314, "train_ppl": 40.901931606891566, "lr": 0.00056, "grad_norm": 0.6164, "tokens_per_sec": 149457, "dt_s": 4.385, "eta_s": 13409, "world_size": 1, "timestamp": "2026-05-05T04:07:49.590683"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62600, "epoch": 0, "train_loss": 3.7407840192317963, "train_ppl": 42.13100874081411, "lr": 0.00056, "grad_norm": 0.7405, "tokens_per_sec": 150888, "dt_s": 4.343, "eta_s": 13101, "world_size": 1, "timestamp": "2026-05-05T04:07:53.934046"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62610, "epoch": 0, "train_loss": 3.7688048183918, "train_ppl": 43.32824875118108, "lr": 0.00056, "grad_norm": 0.6451, "tokens_per_sec": 150632, "dt_s": 4.351, "eta_s": 13071, "world_size": 1, "timestamp": "2026-05-05T04:07:58.284804"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62620, "epoch": 0, "train_loss": 3.669185161590576, "train_ppl": 39.219934927107566, "lr": 0.00056, "grad_norm": 0.6766, "tokens_per_sec": 147989, "dt_s": 4.428, "eta_s": 13101, "world_size": 1, "timestamp": "2026-05-05T04:08:02.713239"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62630, "epoch": 0, "train_loss": 3.726811647415161, "train_ppl": 41.546432093221604, "lr": 0.00056, "grad_norm": 0.6684, "tokens_per_sec": 153490, "dt_s": 4.27, "eta_s": 13088, "world_size": 1, "timestamp": "2026-05-05T04:08:06.982972"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62640, "epoch": 0, "train_loss": 3.7207279056310654, "train_ppl": 41.29444162799752, "lr": 0.00056, "grad_norm": 0.7177, "tokens_per_sec": 152172, "dt_s": 4.307, "eta_s": 13037, "world_size": 1, "timestamp": "2026-05-05T04:08:11.289689"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62650, "epoch": 0, "train_loss": 3.6899655163288116, "train_ppl": 40.04346608776032, "lr": 0.00056, "grad_norm": 0.6443, "tokens_per_sec": 149430, "dt_s": 4.386, "eta_s": 13058, "world_size": 1, "timestamp": "2026-05-05T04:08:15.675451"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62660, "epoch": 0, "train_loss": 3.7378020733594894, "train_ppl": 42.00556348164464, "lr": 0.00056, "grad_norm": 0.6702, "tokens_per_sec": 151768, "dt_s": 4.318, "eta_s": 13034, "world_size": 1, "timestamp": "2026-05-05T04:08:19.993579"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62670, "epoch": 0, "train_loss": 3.7324943840503693, "train_ppl": 41.783201637244886, "lr": 0.00056, "grad_norm": 0.6597, "tokens_per_sec": 149034, "dt_s": 4.397, "eta_s": 13011, "world_size": 1, "timestamp": "2026-05-05T04:08:24.390966"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62680, "epoch": 0, "train_loss": 3.7982589453458786, "train_ppl": 44.62342499940091, "lr": 0.00056, "grad_norm": 0.6543, "tokens_per_sec": 148900, "dt_s": 4.401, "eta_s": 13086, "world_size": 1, "timestamp": "2026-05-05T04:08:28.792337"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62690, "epoch": 0, "train_loss": 3.6040737330913544, "train_ppl": 36.74762997457003, "lr": 0.00056, "grad_norm": 0.6799, "tokens_per_sec": 151786, "dt_s": 4.318, "eta_s": 13088, "world_size": 1, "timestamp": "2026-05-05T04:08:33.109964"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62700, "epoch": 0, "train_loss": 3.679721772670746, "train_ppl": 39.63536489661546, "lr": 0.00056, "grad_norm": 0.6205, "tokens_per_sec": 150991, "dt_s": 4.34, "eta_s": 13056, "world_size": 1, "timestamp": "2026-05-05T04:08:37.450365"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62710, "epoch": 0, "train_loss": 3.6329514235258102, "train_ppl": 37.824287529388975, "lr": 0.00056, "grad_norm": 0.6383, "tokens_per_sec": 150990, "dt_s": 4.34, "eta_s": 13065, "world_size": 1, "timestamp": "2026-05-05T04:08:41.790768"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62720, "epoch": 0, "train_loss": 3.764610007405281, "train_ppl": 43.14687561626255, "lr": 0.00056, "grad_norm": 1.0388, "tokens_per_sec": 152081, "dt_s": 4.309, "eta_s": 13008, "world_size": 1, "timestamp": "2026-05-05T04:08:46.100062"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62730, "epoch": 0, "train_loss": 3.683449700474739, "train_ppl": 39.783398433162745, "lr": 0.00056, "grad_norm": 0.6464, "tokens_per_sec": 147176, "dt_s": 4.453, "eta_s": 13035, "world_size": 1, "timestamp": "2026-05-05T04:08:50.552960"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62740, "epoch": 0, "train_loss": 3.7201797515153885, "train_ppl": 41.27181211266217, "lr": 0.00056, "grad_norm": 0.7493, "tokens_per_sec": 152254, "dt_s": 4.304, "eta_s": 13023, "world_size": 1, "timestamp": "2026-05-05T04:08:54.857362"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62750, "epoch": 0, "train_loss": 3.797928422689438, "train_ppl": 44.60867838361018, "lr": 0.00056, "grad_norm": 0.7225, "tokens_per_sec": 151315, "dt_s": 4.331, "eta_s": 13013, "world_size": 1, "timestamp": "2026-05-05T04:08:59.188458"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62760, "epoch": 0, "train_loss": 3.765618458390236, "train_ppl": 43.19040907246452, "lr": 0.00056, "grad_norm": 0.6594, "tokens_per_sec": 147543, "dt_s": 4.442, "eta_s": 13069, "world_size": 1, "timestamp": "2026-05-05T04:09:03.630327"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62770, "epoch": 0, "train_loss": 3.7271326184272766, "train_ppl": 41.55976943391572, "lr": 0.00056, "grad_norm": 0.6301, "tokens_per_sec": 152080, "dt_s": 4.309, "eta_s": 13065, "world_size": 1, "timestamp": "2026-05-05T04:09:07.939619"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62780, "epoch": 0, "train_loss": 3.7879503667354584, "train_ppl": 44.16578378282149, "lr": 0.00056, "grad_norm": 0.6601, "tokens_per_sec": 151121, "dt_s": 4.337, "eta_s": 12991, "world_size": 1, "timestamp": "2026-05-05T04:09:12.276319"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62790, "epoch": 0, "train_loss": 3.7123611867427826, "train_ppl": 40.950383967169806, "lr": 0.00056, "grad_norm": 0.6493, "tokens_per_sec": 150266, "dt_s": 4.361, "eta_s": 13020, "world_size": 1, "timestamp": "2026-05-05T04:09:16.637612"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62800, "epoch": 0, "train_loss": 3.725976660847664, "train_ppl": 41.511755859605664, "lr": 0.00056, "grad_norm": 0.6411, "tokens_per_sec": 151128, "dt_s": 4.336, "eta_s": 13019, "world_size": 1, "timestamp": "2026-05-05T04:09:20.974075"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62810, "epoch": 0, "train_loss": 3.7131504118442535, "train_ppl": 40.98271579497838, "lr": 0.00056, "grad_norm": 0.6747, "tokens_per_sec": 150798, "dt_s": 4.346, "eta_s": 12958, "world_size": 1, "timestamp": "2026-05-05T04:09:25.320038"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62820, "epoch": 0, "train_loss": 3.633265510201454, "train_ppl": 37.836169500004324, "lr": 0.00056, "grad_norm": 0.6797, "tokens_per_sec": 150203, "dt_s": 4.363, "eta_s": 12985, "world_size": 1, "timestamp": "2026-05-05T04:09:29.683170"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62830, "epoch": 0, "train_loss": 3.6619645059108734, "train_ppl": 38.93776124670606, "lr": 0.00056, "grad_norm": 0.7464, "tokens_per_sec": 152595, "dt_s": 4.295, "eta_s": 12956, "world_size": 1, "timestamp": "2026-05-05T04:09:33.977942"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62840, "epoch": 0, "train_loss": 3.7176757603883743, "train_ppl": 41.168597139953754, "lr": 0.00056, "grad_norm": 0.6776, "tokens_per_sec": 147920, "dt_s": 4.431, "eta_s": 12993, "world_size": 1, "timestamp": "2026-05-05T04:09:38.408457"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62850, "epoch": 0, "train_loss": 3.7042398154735565, "train_ppl": 40.619157522368745, "lr": 0.00056, "grad_norm": 0.6366, "tokens_per_sec": 136609, "dt_s": 4.797, "eta_s": 13264, "world_size": 1, "timestamp": "2026-05-05T04:09:43.205776"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62860, "epoch": 0, "train_loss": 3.723968505859375, "train_ppl": 41.428477465950685, "lr": 0.00056, "grad_norm": 0.666, "tokens_per_sec": 153305, "dt_s": 4.275, "eta_s": 13217, "world_size": 1, "timestamp": "2026-05-05T04:09:47.480648"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62870, "epoch": 0, "train_loss": 3.788649767637253, "train_ppl": 44.19668417644963, "lr": 0.00056, "grad_norm": 0.6475, "tokens_per_sec": 149726, "dt_s": 4.377, "eta_s": 13221, "world_size": 1, "timestamp": "2026-05-05T04:09:51.857723"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62880, "epoch": 0, "train_loss": 3.65680792927742, "train_ppl": 38.73749249126138, "lr": 0.00056, "grad_norm": 0.6305, "tokens_per_sec": 152280, "dt_s": 4.304, "eta_s": 13222, "world_size": 1, "timestamp": "2026-05-05T04:09:56.161404"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62890, "epoch": 0, "train_loss": 3.6976929903030396, "train_ppl": 40.354099590185356, "lr": 0.00056, "grad_norm": 0.6897, "tokens_per_sec": 152242, "dt_s": 4.305, "eta_s": 13142, "world_size": 1, "timestamp": "2026-05-05T04:10:00.466110"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62900, "epoch": 0, "train_loss": 3.784265100955963, "train_ppl": 44.00332067495806, "lr": 0.00056, "grad_norm": 0.672, "tokens_per_sec": 149778, "dt_s": 4.376, "eta_s": 12887, "world_size": 1, "timestamp": "2026-05-05T04:10:04.841680"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62910, "epoch": 0, "train_loss": 3.837545156478882, "train_ppl": 46.41140175287788, "lr": 0.00056, "grad_norm": 0.6462, "tokens_per_sec": 153271, "dt_s": 4.276, "eta_s": 12883, "world_size": 1, "timestamp": "2026-05-05T04:10:09.117497"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62920, "epoch": 0, "train_loss": 3.767222136259079, "train_ppl": 43.259728143514245, "lr": 0.00056, "grad_norm": 0.6615, "tokens_per_sec": 149795, "dt_s": 4.375, "eta_s": 12877, "world_size": 1, "timestamp": "2026-05-05T04:10:13.492558"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62930, "epoch": 0, "train_loss": 3.6952272057533264, "train_ppl": 40.25471765247584, "lr": 0.00056, "grad_norm": 0.6635, "tokens_per_sec": 150069, "dt_s": 4.367, "eta_s": 12911, "world_size": 1, "timestamp": "2026-05-05T04:10:17.859700"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62940, "epoch": 0, "train_loss": 3.683609887957573, "train_ppl": 39.78977174606518, "lr": 0.00056, "grad_norm": 0.6615, "tokens_per_sec": 152999, "dt_s": 4.283, "eta_s": 12894, "world_size": 1, "timestamp": "2026-05-05T04:10:22.143026"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62950, "epoch": 0, "train_loss": 3.6991938799619675, "train_ppl": 40.41471211593383, "lr": 0.00056, "grad_norm": 0.638, "tokens_per_sec": 150134, "dt_s": 4.365, "eta_s": 12883, "world_size": 1, "timestamp": "2026-05-05T04:10:26.508201"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62960, "epoch": 0, "train_loss": 3.744628205895424, "train_ppl": 42.29327990322616, "lr": 0.00056, "grad_norm": 0.6501, "tokens_per_sec": 150138, "dt_s": 4.365, "eta_s": 12932, "world_size": 1, "timestamp": "2026-05-05T04:10:30.873251"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62970, "epoch": 0, "train_loss": 3.7656124979257584, "train_ppl": 43.190151638332686, "lr": 0.00056, "grad_norm": 0.6305, "tokens_per_sec": 150923, "dt_s": 4.342, "eta_s": 12908, "world_size": 1, "timestamp": "2026-05-05T04:10:35.215571"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62980, "epoch": 0, "train_loss": 3.702709600329399, "train_ppl": 40.557049004201865, "lr": 0.00056, "grad_norm": 0.6899, "tokens_per_sec": 147912, "dt_s": 4.431, "eta_s": 12941, "world_size": 1, "timestamp": "2026-05-05T04:10:39.646353"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 62990, "epoch": 0, "train_loss": 3.64647676050663, "train_ppl": 38.33934910309449, "lr": 0.00056, "grad_norm": 0.6358, "tokens_per_sec": 150499, "dt_s": 4.355, "eta_s": 12979, "world_size": 1, "timestamp": "2026-05-05T04:10:44.000898"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63000, "epoch": 0, "train_loss": 3.7008010298013687, "train_ppl": 40.47971683619137, "lr": 0.00056, "grad_norm": 0.7444, "tokens_per_sec": 151514, "dt_s": 4.325, "eta_s": 12951, "world_size": 1, "timestamp": "2026-05-05T04:10:48.326330"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63010, "epoch": 0, "train_loss": 3.75911183655262, "train_ppl": 42.91029769126822, "lr": 0.00056, "grad_norm": 0.6874, "tokens_per_sec": 125692, "dt_s": 5.214, "eta_s": 12988, "world_size": 1, "timestamp": "2026-05-05T04:10:53.540340"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63020, "epoch": 0, "train_loss": 3.7197526395320892, "train_ppl": 41.254188191098336, "lr": 0.00056, "grad_norm": 0.6268, "tokens_per_sec": 150615, "dt_s": 4.351, "eta_s": 12989, "world_size": 1, "timestamp": "2026-05-05T04:10:57.891536"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63030, "epoch": 0, "train_loss": 3.7447023391723633, "train_ppl": 42.29641535887726, "lr": 0.00056, "grad_norm": 0.6613, "tokens_per_sec": 148709, "dt_s": 4.407, "eta_s": 12970, "world_size": 1, "timestamp": "2026-05-05T04:11:02.298516"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63040, "epoch": 0, "train_loss": 3.7275180518627167, "train_ppl": 41.57579104605889, "lr": 0.00056, "grad_norm": 0.7607, "tokens_per_sec": 150565, "dt_s": 4.353, "eta_s": 12965, "world_size": 1, "timestamp": "2026-05-05T04:11:06.651195"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63050, "epoch": 0, "train_loss": 3.6769421249628067, "train_ppl": 39.525345523814146, "lr": 0.00056, "grad_norm": 0.7749, "tokens_per_sec": 151922, "dt_s": 4.314, "eta_s": 12953, "world_size": 1, "timestamp": "2026-05-05T04:11:10.965013"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63060, "epoch": 0, "train_loss": 3.6679121553897858, "train_ppl": 39.170039472100726, "lr": 0.00056, "grad_norm": 0.6253, "tokens_per_sec": 149766, "dt_s": 4.376, "eta_s": 12915, "world_size": 1, "timestamp": "2026-05-05T04:11:15.340883"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63070, "epoch": 0, "train_loss": 3.7264636904001236, "train_ppl": 41.531978235531405, "lr": 0.00056, "grad_norm": 0.734, "tokens_per_sec": 149708, "dt_s": 4.378, "eta_s": 12926, "world_size": 1, "timestamp": "2026-05-05T04:11:19.718484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63080, "epoch": 0, "train_loss": 3.7525818794965744, "train_ppl": 42.63100815683115, "lr": 0.00056, "grad_norm": 0.6494, "tokens_per_sec": 149944, "dt_s": 4.371, "eta_s": 12900, "world_size": 1, "timestamp": "2026-05-05T04:11:24.089199"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63090, "epoch": 0, "train_loss": 3.78858844935894, "train_ppl": 44.19397419495535, "lr": 0.00056, "grad_norm": 0.6636, "tokens_per_sec": 144565, "dt_s": 4.533, "eta_s": 13003, "world_size": 1, "timestamp": "2026-05-05T04:11:28.622509"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63100, "epoch": 0, "train_loss": 3.735867217183113, "train_ppl": 41.92436733446441, "lr": 0.00056, "grad_norm": 0.6798, "tokens_per_sec": 151259, "dt_s": 4.333, "eta_s": 13010, "world_size": 1, "timestamp": "2026-05-05T04:11:32.955227"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63110, "epoch": 0, "train_loss": 3.6880205273628235, "train_ppl": 39.96565768083515, "lr": 0.00056, "grad_norm": 0.6687, "tokens_per_sec": 151503, "dt_s": 4.326, "eta_s": 12976, "world_size": 1, "timestamp": "2026-05-05T04:11:37.280936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63120, "epoch": 0, "train_loss": 3.727263256907463, "train_ppl": 41.56519909368499, "lr": 0.00056, "grad_norm": 0.759, "tokens_per_sec": 150160, "dt_s": 4.364, "eta_s": 12963, "world_size": 1, "timestamp": "2026-05-05T04:11:41.645359"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63130, "epoch": 0, "train_loss": 3.6806994527578354, "train_ppl": 39.674134552691584, "lr": 0.00056, "grad_norm": 0.6646, "tokens_per_sec": 152782, "dt_s": 4.289, "eta_s": 12911, "world_size": 1, "timestamp": "2026-05-05T04:11:45.934842"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63140, "epoch": 0, "train_loss": 3.6999965012073517, "train_ppl": 40.44716284358382, "lr": 0.00056, "grad_norm": 0.6325, "tokens_per_sec": 135287, "dt_s": 4.844, "eta_s": 13090, "world_size": 1, "timestamp": "2026-05-05T04:11:50.779064"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63150, "epoch": 0, "train_loss": 3.676965057849884, "train_ppl": 39.52625196449335, "lr": 0.00056, "grad_norm": 0.8039, "tokens_per_sec": 149758, "dt_s": 4.376, "eta_s": 13112, "world_size": 1, "timestamp": "2026-05-05T04:11:55.155207"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63160, "epoch": 0, "train_loss": 3.7072087079286575, "train_ppl": 40.73993062513518, "lr": 0.00056, "grad_norm": 0.7291, "tokens_per_sec": 152052, "dt_s": 4.31, "eta_s": 13098, "world_size": 1, "timestamp": "2026-05-05T04:11:59.465313"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63170, "epoch": 0, "train_loss": 3.7321653962135315, "train_ppl": 41.76945773303471, "lr": 0.00056, "grad_norm": 0.6293, "tokens_per_sec": 148700, "dt_s": 4.407, "eta_s": 13119, "world_size": 1, "timestamp": "2026-05-05T04:12:03.872538"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63180, "epoch": 0, "train_loss": 3.6525739133358, "train_ppl": 38.57382406238892, "lr": 0.00056, "grad_norm": 0.6296, "tokens_per_sec": 151454, "dt_s": 4.327, "eta_s": 13137, "world_size": 1, "timestamp": "2026-05-05T04:12:08.199696"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63190, "epoch": 0, "train_loss": 3.725657567381859, "train_ppl": 41.49851184270877, "lr": 0.00056, "grad_norm": 0.6938, "tokens_per_sec": 152649, "dt_s": 4.293, "eta_s": 12807, "world_size": 1, "timestamp": "2026-05-05T04:12:12.492927"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63200, "epoch": 0, "train_loss": 3.6716673374176025, "train_ppl": 39.31740662241896, "lr": 0.00056, "grad_norm": 0.6997, "tokens_per_sec": 145728, "dt_s": 4.497, "eta_s": 12874, "world_size": 1, "timestamp": "2026-05-05T04:12:16.990058"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63210, "epoch": 0, "train_loss": 3.712476760149002, "train_ppl": 40.95511701603291, "lr": 0.00056, "grad_norm": 0.7171, "tokens_per_sec": 150453, "dt_s": 4.356, "eta_s": 12897, "world_size": 1, "timestamp": "2026-05-05T04:12:21.345975"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63220, "epoch": 0, "train_loss": 3.710553005337715, "train_ppl": 40.8764051480688, "lr": 0.00056, "grad_norm": 0.6922, "tokens_per_sec": 149821, "dt_s": 4.374, "eta_s": 12873, "world_size": 1, "timestamp": "2026-05-05T04:12:25.720273"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63230, "epoch": 0, "train_loss": 3.7660765796899796, "train_ppl": 43.210200051793706, "lr": 0.00056, "grad_norm": 0.663, "tokens_per_sec": 147122, "dt_s": 4.455, "eta_s": 12944, "world_size": 1, "timestamp": "2026-05-05T04:12:30.174822"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63240, "epoch": 0, "train_loss": 3.766595095396042, "train_ppl": 43.23261102890235, "lr": 0.00056, "grad_norm": 0.6331, "tokens_per_sec": 151952, "dt_s": 4.313, "eta_s": 12951, "world_size": 1, "timestamp": "2026-05-05T04:12:34.487722"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63250, "epoch": 0, "train_loss": 3.6744775027036667, "train_ppl": 39.42805042452014, "lr": 0.00056, "grad_norm": 0.6675, "tokens_per_sec": 151326, "dt_s": 4.331, "eta_s": 12848, "world_size": 1, "timestamp": "2026-05-05T04:12:38.818491"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63260, "epoch": 0, "train_loss": 3.822798475623131, "train_ppl": 45.73200932674086, "lr": 0.00056, "grad_norm": 0.6951, "tokens_per_sec": 149755, "dt_s": 4.376, "eta_s": 12856, "world_size": 1, "timestamp": "2026-05-05T04:12:43.194707"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63270, "epoch": 0, "train_loss": 3.6972554326057434, "train_ppl": 40.33644620576099, "lr": 0.00056, "grad_norm": 0.6618, "tokens_per_sec": 151358, "dt_s": 4.33, "eta_s": 12826, "world_size": 1, "timestamp": "2026-05-05T04:12:47.524583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63280, "epoch": 0, "train_loss": 3.7608794420957565, "train_ppl": 42.986213245936504, "lr": 0.00056, "grad_norm": 0.6768, "tokens_per_sec": 149507, "dt_s": 4.383, "eta_s": 12779, "world_size": 1, "timestamp": "2026-05-05T04:12:51.908046"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63290, "epoch": 0, "train_loss": 3.8254723846912384, "train_ppl": 45.85445619412116, "lr": 0.00056, "grad_norm": 0.655, "tokens_per_sec": 150303, "dt_s": 4.36, "eta_s": 12803, "world_size": 1, "timestamp": "2026-05-05T04:12:56.268329"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63300, "epoch": 0, "train_loss": 3.604015037417412, "train_ppl": 36.74547311096278, "lr": 0.00056, "grad_norm": 0.6459, "tokens_per_sec": 151216, "dt_s": 4.334, "eta_s": 12800, "world_size": 1, "timestamp": "2026-05-05T04:13:00.602267"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63310, "epoch": 0, "train_loss": 3.705089747905731, "train_ppl": 40.65369573720993, "lr": 0.00056, "grad_norm": 0.6496, "tokens_per_sec": 148611, "dt_s": 4.41, "eta_s": 12816, "world_size": 1, "timestamp": "2026-05-05T04:13:05.012189"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63320, "epoch": 0, "train_loss": 3.8824256509542465, "train_ppl": 48.54181788887246, "lr": 0.00056, "grad_norm": 0.6932, "tokens_per_sec": 150753, "dt_s": 4.347, "eta_s": 12822, "world_size": 1, "timestamp": "2026-05-05T04:13:09.359407"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63330, "epoch": 0, "train_loss": 3.7609267234802246, "train_ppl": 42.98824574166104, "lr": 0.00056, "grad_norm": 0.6471, "tokens_per_sec": 152066, "dt_s": 4.31, "eta_s": 12774, "world_size": 1, "timestamp": "2026-05-05T04:13:13.669103"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63340, "epoch": 0, "train_loss": 3.7484819442033768, "train_ppl": 42.45658159504748, "lr": 0.00056, "grad_norm": 0.6872, "tokens_per_sec": 149316, "dt_s": 4.389, "eta_s": 12787, "world_size": 1, "timestamp": "2026-05-05T04:13:18.058198"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63350, "epoch": 0, "train_loss": 3.716360956430435, "train_ppl": 41.11450407417244, "lr": 0.00056, "grad_norm": 0.6972, "tokens_per_sec": 151177, "dt_s": 4.335, "eta_s": 12783, "world_size": 1, "timestamp": "2026-05-05T04:13:22.393256"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63360, "epoch": 0, "train_loss": 3.7961948066949844, "train_ppl": 44.531411060558234, "lr": 0.00056, "grad_norm": 0.6915, "tokens_per_sec": 150073, "dt_s": 4.367, "eta_s": 12753, "world_size": 1, "timestamp": "2026-05-05T04:13:26.760211"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63370, "epoch": 0, "train_loss": 3.6749551743268967, "train_ppl": 39.446888584236184, "lr": 0.00056, "grad_norm": 0.6962, "tokens_per_sec": 149963, "dt_s": 4.37, "eta_s": 12762, "world_size": 1, "timestamp": "2026-05-05T04:13:31.130377"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63380, "epoch": 0, "train_loss": 3.7143484950065613, "train_ppl": 41.03184592183007, "lr": 0.00056, "grad_norm": 0.7295, "tokens_per_sec": 151894, "dt_s": 4.315, "eta_s": 12761, "world_size": 1, "timestamp": "2026-05-05T04:13:35.444965"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63390, "epoch": 0, "train_loss": 3.6148997098207474, "train_ppl": 37.14762019733427, "lr": 0.00056, "grad_norm": 0.6666, "tokens_per_sec": 149784, "dt_s": 4.375, "eta_s": 12748, "world_size": 1, "timestamp": "2026-05-05T04:13:39.820317"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63400, "epoch": 0, "train_loss": 3.634650334715843, "train_ppl": 37.88860225175914, "lr": 0.00056, "grad_norm": 0.6199, "tokens_per_sec": 150688, "dt_s": 4.349, "eta_s": 12752, "world_size": 1, "timestamp": "2026-05-05T04:13:44.169442"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63410, "epoch": 0, "train_loss": 3.72964146733284, "train_ppl": 41.66416752072878, "lr": 0.00056, "grad_norm": 0.6433, "tokens_per_sec": 152124, "dt_s": 4.308, "eta_s": 12713, "world_size": 1, "timestamp": "2026-05-05T04:13:48.477483"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63420, "epoch": 0, "train_loss": 3.593375563621521, "train_ppl": 36.35659302082231, "lr": 0.00056, "grad_norm": 0.6886, "tokens_per_sec": 148047, "dt_s": 4.427, "eta_s": 12742, "world_size": 1, "timestamp": "2026-05-05T04:13:52.904200"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63430, "epoch": 0, "train_loss": 3.7558848559856415, "train_ppl": 42.77205017548758, "lr": 0.00056, "grad_norm": 0.6317, "tokens_per_sec": 150957, "dt_s": 4.341, "eta_s": 12754, "world_size": 1, "timestamp": "2026-05-05T04:13:57.245566"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63440, "epoch": 0, "train_loss": 3.653357893228531, "train_ppl": 38.60407702214612, "lr": 0.00056, "grad_norm": 0.6861, "tokens_per_sec": 135419, "dt_s": 4.84, "eta_s": 13021, "world_size": 1, "timestamp": "2026-05-05T04:14:02.085066"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63450, "epoch": 0, "train_loss": 3.8912622034549713, "train_ppl": 48.972660991459776, "lr": 0.00056, "grad_norm": 0.689, "tokens_per_sec": 150290, "dt_s": 4.361, "eta_s": 13023, "world_size": 1, "timestamp": "2026-05-05T04:14:06.445732"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63460, "epoch": 0, "train_loss": 3.7484287321567535, "train_ppl": 42.45432245355549, "lr": 0.00056, "grad_norm": 0.6872, "tokens_per_sec": 152030, "dt_s": 4.311, "eta_s": 13020, "world_size": 1, "timestamp": "2026-05-05T04:14:10.756427"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63470, "epoch": 0, "train_loss": 3.6967084407806396, "train_ppl": 40.31438853266611, "lr": 0.00056, "grad_norm": 0.6679, "tokens_per_sec": 152338, "dt_s": 4.302, "eta_s": 12943, "world_size": 1, "timestamp": "2026-05-05T04:14:15.058431"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63480, "epoch": 0, "train_loss": 3.6684187203645706, "train_ppl": 39.18988666868113, "lr": 0.00056, "grad_norm": 0.6403, "tokens_per_sec": 149381, "dt_s": 4.387, "eta_s": 12965, "world_size": 1, "timestamp": "2026-05-05T04:14:19.445621"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63490, "epoch": 0, "train_loss": 3.689936190843582, "train_ppl": 40.042291810905226, "lr": 0.00056, "grad_norm": 0.6985, "tokens_per_sec": 150985, "dt_s": 4.341, "eta_s": 12669, "world_size": 1, "timestamp": "2026-05-05T04:14:23.786244"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63500, "epoch": 0, "train_loss": 3.7433953434228897, "train_ppl": 42.24117023422111, "lr": 0.00056, "grad_norm": 0.7321, "tokens_per_sec": 147662, "dt_s": 4.438, "eta_s": 12710, "world_size": 1, "timestamp": "2026-05-05T04:14:28.224443"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63510, "epoch": 0, "train_loss": 3.678344637155533, "train_ppl": 39.580819194989864, "lr": 0.00056, "grad_norm": 0.6208, "tokens_per_sec": 127935, "dt_s": 5.123, "eta_s": 12717, "world_size": 1, "timestamp": "2026-05-05T04:14:33.347023"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63520, "epoch": 0, "train_loss": 3.659019351005554, "train_ppl": 38.82325221435767, "lr": 0.00056, "grad_norm": 0.6808, "tokens_per_sec": 149792, "dt_s": 4.375, "eta_s": 12755, "world_size": 1, "timestamp": "2026-05-05T04:14:37.722143"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63530, "epoch": 0, "train_loss": 3.7193198055028915, "train_ppl": 41.23633583843389, "lr": 0.00056, "grad_norm": 0.7105, "tokens_per_sec": 151019, "dt_s": 4.34, "eta_s": 12723, "world_size": 1, "timestamp": "2026-05-05T04:14:42.061730"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63540, "epoch": 0, "train_loss": 3.6901918798685074, "train_ppl": 40.0525314944857, "lr": 0.00056, "grad_norm": 0.6666, "tokens_per_sec": 152915, "dt_s": 4.286, "eta_s": 12687, "world_size": 1, "timestamp": "2026-05-05T04:14:46.347493"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63550, "epoch": 0, "train_loss": 3.6901631355285645, "train_ppl": 40.05138022745113, "lr": 0.00056, "grad_norm": 0.609, "tokens_per_sec": 153426, "dt_s": 4.271, "eta_s": 12586, "world_size": 1, "timestamp": "2026-05-05T04:14:50.618995"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63560, "epoch": 0, "train_loss": 3.7001936584711075, "train_ppl": 40.455138081698976, "lr": 0.00056, "grad_norm": 0.6945, "tokens_per_sec": 149708, "dt_s": 4.378, "eta_s": 12609, "world_size": 1, "timestamp": "2026-05-05T04:14:54.996601"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63570, "epoch": 0, "train_loss": 3.819800302386284, "train_ppl": 45.595102178539356, "lr": 0.00056, "grad_norm": 0.6331, "tokens_per_sec": 152005, "dt_s": 4.311, "eta_s": 12568, "world_size": 1, "timestamp": "2026-05-05T04:14:59.308049"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63580, "epoch": 0, "train_loss": 3.737158924341202, "train_ppl": 41.97855633047113, "lr": 0.00056, "grad_norm": 0.6764, "tokens_per_sec": 151274, "dt_s": 4.332, "eta_s": 12559, "world_size": 1, "timestamp": "2026-05-05T04:15:03.640339"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63590, "epoch": 0, "train_loss": 3.6758013516664505, "train_ppl": 39.4802817737597, "lr": 0.00056, "grad_norm": 0.6606, "tokens_per_sec": 149397, "dt_s": 4.387, "eta_s": 12613, "world_size": 1, "timestamp": "2026-05-05T04:15:08.027045"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63600, "epoch": 0, "train_loss": 3.73513026535511, "train_ppl": 41.893482477043264, "lr": 0.00056, "grad_norm": 0.671, "tokens_per_sec": 150398, "dt_s": 4.357, "eta_s": 12659, "world_size": 1, "timestamp": "2026-05-05T04:15:12.384536"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63610, "epoch": 0, "train_loss": 3.7235589176416397, "train_ppl": 41.41151232430016, "lr": 0.00056, "grad_norm": 0.7551, "tokens_per_sec": 150705, "dt_s": 4.349, "eta_s": 12638, "world_size": 1, "timestamp": "2026-05-05T04:15:16.733161"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63620, "epoch": 0, "train_loss": 3.627818614244461, "train_ppl": 37.63064007816364, "lr": 0.00056, "grad_norm": 0.6415, "tokens_per_sec": 152476, "dt_s": 4.298, "eta_s": 12626, "world_size": 1, "timestamp": "2026-05-05T04:15:21.031293"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63630, "epoch": 0, "train_loss": 3.687225863337517, "train_ppl": 39.93391102606135, "lr": 0.00056, "grad_norm": 0.6106, "tokens_per_sec": 151058, "dt_s": 4.338, "eta_s": 12625, "world_size": 1, "timestamp": "2026-05-05T04:15:25.369759"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63640, "epoch": 0, "train_loss": 3.7064600735902786, "train_ppl": 40.709442727693414, "lr": 0.00056, "grad_norm": 0.7129, "tokens_per_sec": 149656, "dt_s": 4.379, "eta_s": 12616, "world_size": 1, "timestamp": "2026-05-05T04:15:29.748883"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63650, "epoch": 0, "train_loss": 3.6578709334135056, "train_ppl": 38.77869250001143, "lr": 0.00056, "grad_norm": 0.6106, "tokens_per_sec": 151946, "dt_s": 4.313, "eta_s": 12586, "world_size": 1, "timestamp": "2026-05-05T04:15:34.062011"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63660, "epoch": 0, "train_loss": 3.675331473350525, "train_ppl": 39.4617352031045, "lr": 0.00056, "grad_norm": 0.6282, "tokens_per_sec": 152201, "dt_s": 4.306, "eta_s": 12557, "world_size": 1, "timestamp": "2026-05-05T04:15:38.367877"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63670, "epoch": 0, "train_loss": 3.679817348718643, "train_ppl": 39.6391532691852, "lr": 0.00056, "grad_norm": 0.6433, "tokens_per_sec": 149183, "dt_s": 4.393, "eta_s": 12608, "world_size": 1, "timestamp": "2026-05-05T04:15:42.760887"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63680, "epoch": 0, "train_loss": 3.7301904559135437, "train_ppl": 41.68704695262098, "lr": 0.00056, "grad_norm": 0.6933, "tokens_per_sec": 152551, "dt_s": 4.296, "eta_s": 12579, "world_size": 1, "timestamp": "2026-05-05T04:15:47.056881"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63690, "epoch": 0, "train_loss": 3.6362839341163635, "train_ppl": 37.95054763287703, "lr": 0.00056, "grad_norm": 0.7417, "tokens_per_sec": 150929, "dt_s": 4.342, "eta_s": 12553, "world_size": 1, "timestamp": "2026-05-05T04:15:51.399051"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63700, "epoch": 0, "train_loss": 3.8557016998529434, "train_ppl": 47.26176888483381, "lr": 0.00056, "grad_norm": 0.6431, "tokens_per_sec": 149621, "dt_s": 4.38, "eta_s": 12588, "world_size": 1, "timestamp": "2026-05-05T04:15:55.779208"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63710, "epoch": 0, "train_loss": 3.680620089173317, "train_ppl": 39.670985996102836, "lr": 0.00056, "grad_norm": 0.7584, "tokens_per_sec": 151931, "dt_s": 4.314, "eta_s": 12588, "world_size": 1, "timestamp": "2026-05-05T04:16:00.092738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63720, "epoch": 0, "train_loss": 3.769633114337921, "train_ppl": 43.36415223127502, "lr": 0.00056, "grad_norm": 0.6499, "tokens_per_sec": 149626, "dt_s": 4.38, "eta_s": 12576, "world_size": 1, "timestamp": "2026-05-05T04:16:04.472704"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63730, "epoch": 0, "train_loss": 3.8490386605262756, "train_ppl": 46.94790865282059, "lr": 0.00056, "grad_norm": 0.6804, "tokens_per_sec": 134341, "dt_s": 4.878, "eta_s": 12909, "world_size": 1, "timestamp": "2026-05-05T04:16:09.351041"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63740, "epoch": 0, "train_loss": 3.6988799571990967, "train_ppl": 40.40202700902185, "lr": 0.00056, "grad_norm": 0.9685, "tokens_per_sec": 150037, "dt_s": 4.368, "eta_s": 12919, "world_size": 1, "timestamp": "2026-05-05T04:16:13.719029"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63750, "epoch": 0, "train_loss": 3.666855290532112, "train_ppl": 39.128663901953246, "lr": 0.00056, "grad_norm": 0.645, "tokens_per_sec": 149421, "dt_s": 4.386, "eta_s": 12918, "world_size": 1, "timestamp": "2026-05-05T04:16:18.105028"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63760, "epoch": 0, "train_loss": 3.7345193326473236, "train_ppl": 41.86789619489909, "lr": 0.00056, "grad_norm": 0.7301, "tokens_per_sec": 152869, "dt_s": 4.287, "eta_s": 12898, "world_size": 1, "timestamp": "2026-05-05T04:16:22.392076"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63770, "epoch": 0, "train_loss": 3.790723368525505, "train_ppl": 44.288425544634386, "lr": 0.00056, "grad_norm": 0.6801, "tokens_per_sec": 150622, "dt_s": 4.351, "eta_s": 12877, "world_size": 1, "timestamp": "2026-05-05T04:16:26.743112"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63780, "epoch": 0, "train_loss": 3.6549090296030045, "train_ppl": 38.66400367540858, "lr": 0.00056, "grad_norm": 0.6135, "tokens_per_sec": 147657, "dt_s": 4.438, "eta_s": 12618, "world_size": 1, "timestamp": "2026-05-05T04:16:31.181498"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63790, "epoch": 0, "train_loss": 3.697717472910881, "train_ppl": 40.355087575874606, "lr": 0.00056, "grad_norm": 0.6575, "tokens_per_sec": 151519, "dt_s": 4.325, "eta_s": 12589, "world_size": 1, "timestamp": "2026-05-05T04:16:35.506779"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63800, "epoch": 0, "train_loss": 3.9254670441150665, "train_ppl": 50.67674080619934, "lr": 0.00056, "grad_norm": 0.7261, "tokens_per_sec": 149375, "dt_s": 4.387, "eta_s": 12586, "world_size": 1, "timestamp": "2026-05-05T04:16:39.894103"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63810, "epoch": 0, "train_loss": 3.707674592733383, "train_ppl": 40.75891516171919, "lr": 0.00056, "grad_norm": 0.6525, "tokens_per_sec": 148440, "dt_s": 4.415, "eta_s": 12655, "world_size": 1, "timestamp": "2026-05-05T04:16:44.309088"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63820, "epoch": 0, "train_loss": 3.707522928714752, "train_ppl": 40.752733969594786, "lr": 0.00056, "grad_norm": 0.664, "tokens_per_sec": 150464, "dt_s": 4.356, "eta_s": 12653, "world_size": 1, "timestamp": "2026-05-05T04:16:48.664689"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63830, "epoch": 0, "train_loss": 3.7564280331134796, "train_ppl": 42.795289285758855, "lr": 0.00056, "grad_norm": 0.6931, "tokens_per_sec": 151580, "dt_s": 4.324, "eta_s": 12583, "world_size": 1, "timestamp": "2026-05-05T04:16:52.988237"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63840, "epoch": 0, "train_loss": 3.678540289402008, "train_ppl": 39.58856402880505, "lr": 0.00056, "grad_norm": 0.679, "tokens_per_sec": 151285, "dt_s": 4.332, "eta_s": 12582, "world_size": 1, "timestamp": "2026-05-05T04:16:57.320198"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63850, "epoch": 0, "train_loss": 3.701189950108528, "train_ppl": 40.49546328195481, "lr": 0.00056, "grad_norm": 0.6318, "tokens_per_sec": 150794, "dt_s": 4.346, "eta_s": 12554, "world_size": 1, "timestamp": "2026-05-05T04:17:01.666264"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63860, "epoch": 0, "train_loss": 3.654558464884758, "train_ppl": 38.65045181539461, "lr": 0.00056, "grad_norm": 0.653, "tokens_per_sec": 149410, "dt_s": 4.386, "eta_s": 12533, "world_size": 1, "timestamp": "2026-05-05T04:17:06.052581"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63870, "epoch": 0, "train_loss": 3.778200700879097, "train_ppl": 43.7372744543982, "lr": 0.00056, "grad_norm": 0.7133, "tokens_per_sec": 153896, "dt_s": 4.258, "eta_s": 12473, "world_size": 1, "timestamp": "2026-05-05T04:17:10.311039"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63880, "epoch": 0, "train_loss": 3.7136965841054916, "train_ppl": 41.00510553130569, "lr": 0.00056, "grad_norm": 0.6366, "tokens_per_sec": 152061, "dt_s": 4.31, "eta_s": 12461, "world_size": 1, "timestamp": "2026-05-05T04:17:14.620893"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63890, "epoch": 0, "train_loss": 3.713103950023651, "train_ppl": 40.980811707623346, "lr": 0.00056, "grad_norm": 0.7116, "tokens_per_sec": 150345, "dt_s": 4.359, "eta_s": 12472, "world_size": 1, "timestamp": "2026-05-05T04:17:18.979933"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63900, "epoch": 0, "train_loss": 3.5759672671556473, "train_ppl": 35.729163753435515, "lr": 0.00056, "grad_norm": 0.7155, "tokens_per_sec": 152586, "dt_s": 4.295, "eta_s": 12438, "world_size": 1, "timestamp": "2026-05-05T04:17:23.274961"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63910, "epoch": 0, "train_loss": 3.639025554060936, "train_ppl": 38.05473636887204, "lr": 0.00056, "grad_norm": 0.6411, "tokens_per_sec": 147532, "dt_s": 4.442, "eta_s": 12466, "world_size": 1, "timestamp": "2026-05-05T04:17:27.717101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63920, "epoch": 0, "train_loss": 3.7849950790405273, "train_ppl": 44.03545386153293, "lr": 0.00056, "grad_norm": 0.7011, "tokens_per_sec": 148400, "dt_s": 4.416, "eta_s": 12552, "world_size": 1, "timestamp": "2026-05-05T04:17:32.133288"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63930, "epoch": 0, "train_loss": 3.6607149094343185, "train_ppl": 38.88913514527718, "lr": 0.00056, "grad_norm": 0.5917, "tokens_per_sec": 149905, "dt_s": 4.372, "eta_s": 12584, "world_size": 1, "timestamp": "2026-05-05T04:17:36.505098"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63940, "epoch": 0, "train_loss": 3.700931116938591, "train_ppl": 40.48498306919732, "lr": 0.00056, "grad_norm": 0.6621, "tokens_per_sec": 150520, "dt_s": 4.354, "eta_s": 12576, "world_size": 1, "timestamp": "2026-05-05T04:17:40.859069"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63950, "epoch": 0, "train_loss": 3.6628592163324356, "train_ppl": 38.972614857104965, "lr": 0.00056, "grad_norm": 0.6797, "tokens_per_sec": 150190, "dt_s": 4.364, "eta_s": 12611, "world_size": 1, "timestamp": "2026-05-05T04:17:45.222604"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63960, "epoch": 0, "train_loss": 3.6772664189338684, "train_ppl": 39.538165433669036, "lr": 0.00056, "grad_norm": 0.7215, "tokens_per_sec": 151278, "dt_s": 4.332, "eta_s": 12544, "world_size": 1, "timestamp": "2026-05-05T04:17:49.554773"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63970, "epoch": 0, "train_loss": 3.6399743407964706, "train_ppl": 38.090859331747296, "lr": 0.00056, "grad_norm": 0.7086, "tokens_per_sec": 148198, "dt_s": 4.422, "eta_s": 12543, "world_size": 1, "timestamp": "2026-05-05T04:17:53.976957"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63980, "epoch": 0, "train_loss": 3.6915545761585236, "train_ppl": 40.10714813505407, "lr": 0.00056, "grad_norm": 0.7039, "tokens_per_sec": 151344, "dt_s": 4.33, "eta_s": 12515, "world_size": 1, "timestamp": "2026-05-05T04:17:58.307260"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 63990, "epoch": 0, "train_loss": 3.760951906442642, "train_ppl": 42.98932832666924, "lr": 0.00056, "grad_norm": 0.7249, "tokens_per_sec": 150443, "dt_s": 4.356, "eta_s": 12512, "world_size": 1, "timestamp": "2026-05-05T04:18:02.663460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64000, "epoch": 0, "train_loss": 3.6743584126234055, "train_ppl": 39.42335521441256, "lr": 0.00056, "grad_norm": 0.6126, "tokens_per_sec": 146415, "dt_s": 4.476, "eta_s": 12572, "world_size": 1, "timestamp": "2026-05-05T04:18:07.139488"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64010, "epoch": 0, "train_loss": 3.7292392253875732, "train_ppl": 41.64741181508706, "lr": 0.00056, "grad_norm": 0.7971, "tokens_per_sec": 127263, "dt_s": 5.15, "eta_s": 12593, "world_size": 1, "timestamp": "2026-05-05T04:18:12.289281"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64020, "epoch": 0, "train_loss": 3.7060364186763763, "train_ppl": 40.692199625060084, "lr": 0.00056, "grad_norm": 0.6665, "tokens_per_sec": 146863, "dt_s": 4.462, "eta_s": 12612, "world_size": 1, "timestamp": "2026-05-05T04:18:16.751540"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64030, "epoch": 0, "train_loss": 3.6288776248693466, "train_ppl": 37.670512434725865, "lr": 0.00056, "grad_norm": 0.6499, "tokens_per_sec": 132570, "dt_s": 4.943, "eta_s": 12959, "world_size": 1, "timestamp": "2026-05-05T04:18:21.695034"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64040, "epoch": 0, "train_loss": 3.634259358048439, "train_ppl": 37.87379158781883, "lr": 0.00056, "grad_norm": 0.6615, "tokens_per_sec": 148532, "dt_s": 4.412, "eta_s": 12987, "world_size": 1, "timestamp": "2026-05-05T04:18:26.107289"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64050, "epoch": 0, "train_loss": 3.603803411126137, "train_ppl": 36.73769762554475, "lr": 0.00056, "grad_norm": 0.9322, "tokens_per_sec": 147250, "dt_s": 4.451, "eta_s": 12967, "world_size": 1, "timestamp": "2026-05-05T04:18:30.557938"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64060, "epoch": 0, "train_loss": 3.6757523864507675, "train_ppl": 39.47834866057546, "lr": 0.00056, "grad_norm": 0.7, "tokens_per_sec": 151701, "dt_s": 4.32, "eta_s": 12930, "world_size": 1, "timestamp": "2026-05-05T04:18:34.877980"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64070, "epoch": 0, "train_loss": 3.714185982942581, "train_ppl": 41.025178293660076, "lr": 0.00056, "grad_norm": 0.6632, "tokens_per_sec": 150771, "dt_s": 4.347, "eta_s": 12860, "world_size": 1, "timestamp": "2026-05-05T04:18:39.224723"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64080, "epoch": 0, "train_loss": 3.720254600048065, "train_ppl": 41.27490136285119, "lr": 0.00056, "grad_norm": 0.6798, "tokens_per_sec": 147235, "dt_s": 4.451, "eta_s": 12573, "world_size": 1, "timestamp": "2026-05-05T04:18:43.675844"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64090, "epoch": 0, "train_loss": 3.629913344979286, "train_ppl": 37.70954875386704, "lr": 0.00056, "grad_norm": 0.6879, "tokens_per_sec": 150177, "dt_s": 4.364, "eta_s": 12541, "world_size": 1, "timestamp": "2026-05-05T04:18:48.039756"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64100, "epoch": 0, "train_loss": 3.7596687227487564, "train_ppl": 42.93420049867861, "lr": 0.00056, "grad_norm": 0.6327, "tokens_per_sec": 149751, "dt_s": 4.376, "eta_s": 12494, "world_size": 1, "timestamp": "2026-05-05T04:18:52.416068"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64110, "epoch": 0, "train_loss": 3.700386792421341, "train_ppl": 40.462952096875455, "lr": 0.00056, "grad_norm": 0.6772, "tokens_per_sec": 147999, "dt_s": 4.428, "eta_s": 12552, "world_size": 1, "timestamp": "2026-05-05T04:18:56.844227"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64120, "epoch": 0, "train_loss": 3.754586696624756, "train_ppl": 42.71656126268636, "lr": 0.00056, "grad_norm": 0.6334, "tokens_per_sec": 151030, "dt_s": 4.339, "eta_s": 12543, "world_size": 1, "timestamp": "2026-05-05T04:19:01.183480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64130, "epoch": 0, "train_loss": 3.7041234523057938, "train_ppl": 40.61443122351647, "lr": 0.00056, "grad_norm": 0.7133, "tokens_per_sec": 150569, "dt_s": 4.353, "eta_s": 12482, "world_size": 1, "timestamp": "2026-05-05T04:19:05.536055"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64140, "epoch": 0, "train_loss": 3.614535331726074, "train_ppl": 37.134086884036314, "lr": 0.00056, "grad_norm": 0.7355, "tokens_per_sec": 150774, "dt_s": 4.347, "eta_s": 12468, "world_size": 1, "timestamp": "2026-05-05T04:19:09.882710"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64150, "epoch": 0, "train_loss": 3.866979956626892, "train_ppl": 47.79781640866597, "lr": 0.00056, "grad_norm": 0.7011, "tokens_per_sec": 152844, "dt_s": 4.288, "eta_s": 12413, "world_size": 1, "timestamp": "2026-05-05T04:19:14.170460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64160, "epoch": 0, "train_loss": 3.7047316431999207, "train_ppl": 40.63914006384096, "lr": 0.00056, "grad_norm": 0.6409, "tokens_per_sec": 148883, "dt_s": 4.402, "eta_s": 12394, "world_size": 1, "timestamp": "2026-05-05T04:19:18.572325"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64170, "epoch": 0, "train_loss": 3.6741974651813507, "train_ppl": 39.41701063681905, "lr": 0.00056, "grad_norm": 0.658, "tokens_per_sec": 152987, "dt_s": 4.284, "eta_s": 12358, "world_size": 1, "timestamp": "2026-05-05T04:19:22.856080"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64180, "epoch": 0, "train_loss": 3.702851116657257, "train_ppl": 40.56278889498025, "lr": 0.00056, "grad_norm": 0.6043, "tokens_per_sec": 151480, "dt_s": 4.326, "eta_s": 12339, "world_size": 1, "timestamp": "2026-05-05T04:19:27.182460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64190, "epoch": 0, "train_loss": 3.8009396344423294, "train_ppl": 44.743207005671785, "lr": 0.00056, "grad_norm": 0.6553, "tokens_per_sec": 148889, "dt_s": 4.402, "eta_s": 12366, "world_size": 1, "timestamp": "2026-05-05T04:19:31.584135"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64200, "epoch": 0, "train_loss": 3.6518546640872955, "train_ppl": 38.54608984352376, "lr": 0.00056, "grad_norm": 0.6793, "tokens_per_sec": 151012, "dt_s": 4.34, "eta_s": 12391, "world_size": 1, "timestamp": "2026-05-05T04:19:35.923909"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64210, "epoch": 0, "train_loss": 3.742908462882042, "train_ppl": 42.220608836289884, "lr": 0.00056, "grad_norm": 0.6889, "tokens_per_sec": 150976, "dt_s": 4.341, "eta_s": 12352, "world_size": 1, "timestamp": "2026-05-05T04:19:40.264727"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64220, "epoch": 0, "train_loss": 3.657852217555046, "train_ppl": 38.77796673028308, "lr": 0.00056, "grad_norm": 0.6781, "tokens_per_sec": 147679, "dt_s": 4.438, "eta_s": 12435, "world_size": 1, "timestamp": "2026-05-05T04:19:44.702471"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64230, "epoch": 0, "train_loss": 3.7148542404174805, "train_ppl": 41.0526028380217, "lr": 0.00056, "grad_norm": 0.6713, "tokens_per_sec": 151981, "dt_s": 4.312, "eta_s": 12423, "world_size": 1, "timestamp": "2026-05-05T04:19:49.014590"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64240, "epoch": 0, "train_loss": 3.7978803664445877, "train_ppl": 44.606534709548214, "lr": 0.00056, "grad_norm": 0.6555, "tokens_per_sec": 149967, "dt_s": 4.37, "eta_s": 12400, "world_size": 1, "timestamp": "2026-05-05T04:19:53.384625"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64250, "epoch": 0, "train_loss": 3.654780238866806, "train_ppl": 38.659024430558034, "lr": 0.00056, "grad_norm": 0.6555, "tokens_per_sec": 147667, "dt_s": 4.438, "eta_s": 12452, "world_size": 1, "timestamp": "2026-05-05T04:19:57.822696"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64260, "epoch": 0, "train_loss": 3.678333505988121, "train_ppl": 39.580378616717184, "lr": 0.00056, "grad_norm": 0.6633, "tokens_per_sec": 150010, "dt_s": 4.369, "eta_s": 12463, "world_size": 1, "timestamp": "2026-05-05T04:20:02.191469"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64270, "epoch": 0, "train_loss": 3.679165616631508, "train_ppl": 39.61332757772283, "lr": 0.00056, "grad_norm": 0.6622, "tokens_per_sec": 146922, "dt_s": 4.461, "eta_s": 12472, "world_size": 1, "timestamp": "2026-05-05T04:20:06.652047"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64280, "epoch": 0, "train_loss": 3.7497852593660355, "train_ppl": 42.511951976285864, "lr": 0.00056, "grad_norm": 0.7145, "tokens_per_sec": 149585, "dt_s": 4.381, "eta_s": 12507, "world_size": 1, "timestamp": "2026-05-05T04:20:11.033239"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64290, "epoch": 0, "train_loss": 3.6730027943849564, "train_ppl": 39.36994840286575, "lr": 0.00056, "grad_norm": 0.7366, "tokens_per_sec": 150335, "dt_s": 4.359, "eta_s": 12496, "world_size": 1, "timestamp": "2026-05-05T04:20:15.392551"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64300, "epoch": 0, "train_loss": 3.7585837692022324, "train_ppl": 42.887644145887315, "lr": 0.00056, "grad_norm": 0.6795, "tokens_per_sec": 146903, "dt_s": 4.461, "eta_s": 12505, "world_size": 1, "timestamp": "2026-05-05T04:20:19.853730"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64310, "epoch": 0, "train_loss": 3.7318001687526703, "train_ppl": 41.75420516554313, "lr": 0.00056, "grad_norm": 0.6401, "tokens_per_sec": 150676, "dt_s": 4.349, "eta_s": 12490, "world_size": 1, "timestamp": "2026-05-05T04:20:24.203208"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64320, "epoch": 0, "train_loss": 3.837471529841423, "train_ppl": 46.40798476321934, "lr": 0.00056, "grad_norm": 0.6441, "tokens_per_sec": 134494, "dt_s": 4.873, "eta_s": 12719, "world_size": 1, "timestamp": "2026-05-05T04:20:29.075965"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64330, "epoch": 0, "train_loss": 3.657251864671707, "train_ppl": 38.75469325299516, "lr": 0.00056, "grad_norm": 0.6175, "tokens_per_sec": 146636, "dt_s": 4.469, "eta_s": 12765, "world_size": 1, "timestamp": "2026-05-05T04:20:33.545334"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64340, "epoch": 0, "train_loss": 3.639119014143944, "train_ppl": 38.05829313389711, "lr": 0.00056, "grad_norm": 0.7041, "tokens_per_sec": 149442, "dt_s": 4.385, "eta_s": 12775, "world_size": 1, "timestamp": "2026-05-05T04:20:37.930692"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64350, "epoch": 0, "train_loss": 3.728035181760788, "train_ppl": 41.59729669077144, "lr": 0.00056, "grad_norm": 0.7176, "tokens_per_sec": 147015, "dt_s": 4.458, "eta_s": 12769, "world_size": 1, "timestamp": "2026-05-05T04:20:42.388445"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64360, "epoch": 0, "train_loss": 3.6881618052721024, "train_ppl": 39.97130434426027, "lr": 0.00056, "grad_norm": 0.6534, "tokens_per_sec": 150800, "dt_s": 4.346, "eta_s": 12762, "world_size": 1, "timestamp": "2026-05-05T04:20:46.734348"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64370, "epoch": 0, "train_loss": 3.745007261633873, "train_ppl": 42.30931445247327, "lr": 0.00056, "grad_norm": 0.6946, "tokens_per_sec": 151430, "dt_s": 4.328, "eta_s": 12449, "world_size": 1, "timestamp": "2026-05-05T04:20:51.062155"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64380, "epoch": 0, "train_loss": 3.72588050365448, "train_ppl": 41.507764397585035, "lr": 0.00056, "grad_norm": 0.6576, "tokens_per_sec": 150214, "dt_s": 4.363, "eta_s": 12384, "world_size": 1, "timestamp": "2026-05-05T04:20:55.424997"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64390, "epoch": 0, "train_loss": 3.70830000936985, "train_ppl": 40.78441443835042, "lr": 0.00056, "grad_norm": 0.6845, "tokens_per_sec": 151713, "dt_s": 4.32, "eta_s": 12343, "world_size": 1, "timestamp": "2026-05-05T04:20:59.744725"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64400, "epoch": 0, "train_loss": 3.8715242594480515, "train_ppl": 48.015518437948785, "lr": 0.00056, "grad_norm": 0.7006, "tokens_per_sec": 150172, "dt_s": 4.364, "eta_s": 12285, "world_size": 1, "timestamp": "2026-05-05T04:21:04.108793"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64410, "epoch": 0, "train_loss": 3.74302077293396, "train_ppl": 42.2253509013461, "lr": 0.00056, "grad_norm": 0.6864, "tokens_per_sec": 147372, "dt_s": 4.447, "eta_s": 12338, "world_size": 1, "timestamp": "2026-05-05T04:21:08.555775"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64420, "epoch": 0, "train_loss": 3.6871660202741623, "train_ppl": 39.931521329997906, "lr": 0.00056, "grad_norm": 0.6924, "tokens_per_sec": 150717, "dt_s": 4.348, "eta_s": 12345, "world_size": 1, "timestamp": "2026-05-05T04:21:12.904060"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64430, "epoch": 0, "train_loss": 3.73246631026268, "train_ppl": 41.78202864097845, "lr": 0.00056, "grad_norm": 0.6546, "tokens_per_sec": 151947, "dt_s": 4.313, "eta_s": 12313, "world_size": 1, "timestamp": "2026-05-05T04:21:17.217209"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64440, "epoch": 0, "train_loss": 3.6708116084337234, "train_ppl": 39.283775969415686, "lr": 0.00056, "grad_norm": 0.6891, "tokens_per_sec": 148594, "dt_s": 4.41, "eta_s": 12360, "world_size": 1, "timestamp": "2026-05-05T04:21:21.627543"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64450, "epoch": 0, "train_loss": 3.71317982673645, "train_ppl": 40.98392131487545, "lr": 0.00056, "grad_norm": 0.6527, "tokens_per_sec": 150997, "dt_s": 4.34, "eta_s": 12342, "world_size": 1, "timestamp": "2026-05-05T04:21:25.967760"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64460, "epoch": 0, "train_loss": 3.8642458021640778, "train_ppl": 47.667308291596996, "lr": 0.00056, "grad_norm": 0.7865, "tokens_per_sec": 149167, "dt_s": 4.393, "eta_s": 12307, "world_size": 1, "timestamp": "2026-05-05T04:21:30.361235"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64470, "epoch": 0, "train_loss": 3.635901987552643, "train_ppl": 37.93605531943818, "lr": 0.00056, "grad_norm": 0.7242, "tokens_per_sec": 151251, "dt_s": 4.333, "eta_s": 12294, "world_size": 1, "timestamp": "2026-05-05T04:21:34.694136"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64480, "epoch": 0, "train_loss": 3.8564728051424026, "train_ppl": 47.29822673942587, "lr": 0.00056, "grad_norm": 0.6572, "tokens_per_sec": 151801, "dt_s": 4.317, "eta_s": 12292, "world_size": 1, "timestamp": "2026-05-05T04:21:39.011390"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64490, "epoch": 0, "train_loss": 3.817925900220871, "train_ppl": 45.50971866680208, "lr": 0.00056, "grad_norm": 0.6619, "tokens_per_sec": 149630, "dt_s": 4.38, "eta_s": 12271, "world_size": 1, "timestamp": "2026-05-05T04:21:43.391249"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64500, "epoch": 0, "train_loss": 3.765955001115799, "train_ppl": 43.20494693662095, "lr": 0.00056, "grad_norm": 0.6327, "tokens_per_sec": 151721, "dt_s": 4.32, "eta_s": 12255, "world_size": 1, "timestamp": "2026-05-05T04:21:47.710758"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64510, "epoch": 0, "train_loss": 3.712558403611183, "train_ppl": 40.95846087008018, "lr": 0.00056, "grad_norm": 0.6885, "tokens_per_sec": 128788, "dt_s": 5.089, "eta_s": 12211, "world_size": 1, "timestamp": "2026-05-05T04:21:52.799428"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64520, "epoch": 0, "train_loss": 3.7090568095445633, "train_ppl": 40.81529177283526, "lr": 0.00056, "grad_norm": 0.6365, "tokens_per_sec": 148217, "dt_s": 4.422, "eta_s": 12257, "world_size": 1, "timestamp": "2026-05-05T04:21:57.221066"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64530, "epoch": 0, "train_loss": 3.679525062441826, "train_ppl": 39.62756898170656, "lr": 0.00056, "grad_norm": 0.6788, "tokens_per_sec": 148534, "dt_s": 4.412, "eta_s": 12306, "world_size": 1, "timestamp": "2026-05-05T04:22:01.633268"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64540, "epoch": 0, "train_loss": 3.784180611371994, "train_ppl": 43.99960300975518, "lr": 0.00056, "grad_norm": 0.6918, "tokens_per_sec": 150353, "dt_s": 4.359, "eta_s": 12290, "world_size": 1, "timestamp": "2026-05-05T04:22:05.992087"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64550, "epoch": 0, "train_loss": 3.6577636152505875, "train_ppl": 38.77453106527473, "lr": 0.00056, "grad_norm": 0.6295, "tokens_per_sec": 148492, "dt_s": 4.413, "eta_s": 12338, "world_size": 1, "timestamp": "2026-05-05T04:22:10.405499"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64560, "epoch": 0, "train_loss": 3.653269961476326, "train_ppl": 38.60068264725017, "lr": 0.00056, "grad_norm": 0.6815, "tokens_per_sec": 151719, "dt_s": 4.32, "eta_s": 12331, "world_size": 1, "timestamp": "2026-05-05T04:22:14.725093"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64570, "epoch": 0, "train_loss": 3.693749949336052, "train_ppl": 40.195295014541195, "lr": 0.00056, "grad_norm": 0.7351, "tokens_per_sec": 151124, "dt_s": 4.337, "eta_s": 12279, "world_size": 1, "timestamp": "2026-05-05T04:22:19.061674"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64580, "epoch": 0, "train_loss": 3.6900241374969482, "train_ppl": 40.04581355132348, "lr": 0.00056, "grad_norm": 0.6714, "tokens_per_sec": 150824, "dt_s": 4.345, "eta_s": 12237, "world_size": 1, "timestamp": "2026-05-05T04:22:23.406897"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64590, "epoch": 0, "train_loss": 3.6865442395210266, "train_ppl": 39.90670039598068, "lr": 0.00056, "grad_norm": 0.8547, "tokens_per_sec": 150389, "dt_s": 4.358, "eta_s": 12232, "world_size": 1, "timestamp": "2026-05-05T04:22:27.764632"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64600, "epoch": 0, "train_loss": 3.708345979452133, "train_ppl": 40.786289344332474, "lr": 0.00056, "grad_norm": 0.6621, "tokens_per_sec": 147570, "dt_s": 4.441, "eta_s": 12243, "world_size": 1, "timestamp": "2026-05-05T04:22:32.205673"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64610, "epoch": 0, "train_loss": 3.7820124328136444, "train_ppl": 43.90430736025756, "lr": 0.00056, "grad_norm": 0.7517, "tokens_per_sec": 149204, "dt_s": 4.392, "eta_s": 12280, "world_size": 1, "timestamp": "2026-05-05T04:22:36.598012"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64620, "epoch": 0, "train_loss": 3.7386909872293472, "train_ppl": 42.042919410276426, "lr": 0.00056, "grad_norm": 0.7164, "tokens_per_sec": 134785, "dt_s": 4.862, "eta_s": 12571, "world_size": 1, "timestamp": "2026-05-05T04:22:41.460275"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64630, "epoch": 0, "train_loss": 3.7128919512033463, "train_ppl": 40.972124744731666, "lr": 0.00056, "grad_norm": 0.7, "tokens_per_sec": 147415, "dt_s": 4.446, "eta_s": 12622, "world_size": 1, "timestamp": "2026-05-05T04:22:45.905957"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64640, "epoch": 0, "train_loss": 3.793393075466156, "train_ppl": 44.40682063147382, "lr": 0.00056, "grad_norm": 0.6418, "tokens_per_sec": 148749, "dt_s": 4.406, "eta_s": 12645, "world_size": 1, "timestamp": "2026-05-05T04:22:50.311769"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64650, "epoch": 0, "train_loss": 3.7149427980184555, "train_ppl": 41.056238519024035, "lr": 0.00056, "grad_norm": 0.6931, "tokens_per_sec": 148938, "dt_s": 4.4, "eta_s": 12617, "world_size": 1, "timestamp": "2026-05-05T04:22:54.711979"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64660, "epoch": 0, "train_loss": 3.568721130490303, "train_ppl": 35.47120109500373, "lr": 0.00056, "grad_norm": 0.7373, "tokens_per_sec": 148910, "dt_s": 4.401, "eta_s": 12618, "world_size": 1, "timestamp": "2026-05-05T04:22:59.113018"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64670, "epoch": 0, "train_loss": 3.7195087522268295, "train_ppl": 41.244128045130545, "lr": 0.00056, "grad_norm": 0.6885, "tokens_per_sec": 151401, "dt_s": 4.329, "eta_s": 12314, "world_size": 1, "timestamp": "2026-05-05T04:23:03.441665"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64680, "epoch": 0, "train_loss": 3.746889665722847, "train_ppl": 42.3890326864253, "lr": 0.00056, "grad_norm": 0.729, "tokens_per_sec": 148856, "dt_s": 4.403, "eta_s": 12286, "world_size": 1, "timestamp": "2026-05-05T04:23:07.844348"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64690, "epoch": 0, "train_loss": 3.6828580051660538, "train_ppl": 39.75986574572324, "lr": 0.00056, "grad_norm": 0.6567, "tokens_per_sec": 149114, "dt_s": 4.395, "eta_s": 12275, "world_size": 1, "timestamp": "2026-05-05T04:23:12.239359"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64700, "epoch": 0, "train_loss": 3.693620353937149, "train_ppl": 40.19008622677452, "lr": 0.00056, "grad_norm": 0.688, "tokens_per_sec": 150840, "dt_s": 4.345, "eta_s": 12240, "world_size": 1, "timestamp": "2026-05-05T04:23:16.584067"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64710, "epoch": 0, "train_loss": 3.7886032164096832, "train_ppl": 44.19462681443345, "lr": 0.00056, "grad_norm": 0.6961, "tokens_per_sec": 147873, "dt_s": 4.432, "eta_s": 12253, "world_size": 1, "timestamp": "2026-05-05T04:23:21.015988"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64720, "epoch": 0, "train_loss": 3.7340435087680817, "train_ppl": 41.84797918898522, "lr": 0.00056, "grad_norm": 0.7549, "tokens_per_sec": 149873, "dt_s": 4.373, "eta_s": 12273, "world_size": 1, "timestamp": "2026-05-05T04:23:25.388746"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64730, "epoch": 0, "train_loss": 3.7738939821720123, "train_ppl": 43.54931535032269, "lr": 0.00056, "grad_norm": 0.7675, "tokens_per_sec": 150962, "dt_s": 4.341, "eta_s": 12234, "world_size": 1, "timestamp": "2026-05-05T04:23:29.729975"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64740, "epoch": 0, "train_loss": 3.6480392515659332, "train_ppl": 38.39930081809986, "lr": 0.00056, "grad_norm": 0.6409, "tokens_per_sec": 149583, "dt_s": 4.381, "eta_s": 12222, "world_size": 1, "timestamp": "2026-05-05T04:23:34.111270"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64750, "epoch": 0, "train_loss": 3.7520163357257843, "train_ppl": 42.606905271986165, "lr": 0.00056, "grad_norm": 0.6675, "tokens_per_sec": 151369, "dt_s": 4.33, "eta_s": 12210, "world_size": 1, "timestamp": "2026-05-05T04:23:38.440783"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64760, "epoch": 0, "train_loss": 3.7082925736904144, "train_ppl": 40.78411117964615, "lr": 0.00056, "grad_norm": 0.7224, "tokens_per_sec": 150681, "dt_s": 4.349, "eta_s": 12159, "world_size": 1, "timestamp": "2026-05-05T04:23:42.790115"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64770, "epoch": 0, "train_loss": 3.803639680147171, "train_ppl": 44.864178950966, "lr": 0.00056, "grad_norm": 0.6794, "tokens_per_sec": 149981, "dt_s": 4.37, "eta_s": 12153, "world_size": 1, "timestamp": "2026-05-05T04:23:47.159765"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64780, "epoch": 0, "train_loss": 3.7465084195137024, "train_ppl": 42.37287510860765, "lr": 0.00056, "grad_norm": 0.6444, "tokens_per_sec": 150026, "dt_s": 4.368, "eta_s": 12164, "world_size": 1, "timestamp": "2026-05-05T04:23:51.528049"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64790, "epoch": 0, "train_loss": 3.782513201236725, "train_ppl": 43.92629875685973, "lr": 0.00056, "grad_norm": 0.667, "tokens_per_sec": 149967, "dt_s": 4.37, "eta_s": 12153, "world_size": 1, "timestamp": "2026-05-05T04:23:55.898077"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64800, "epoch": 0, "train_loss": 3.566786050796509, "train_ppl": 35.40262786278279, "lr": 0.00056, "grad_norm": 0.6349, "tokens_per_sec": 150837, "dt_s": 4.345, "eta_s": 12157, "world_size": 1, "timestamp": "2026-05-05T04:24:00.242902"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64810, "epoch": 0, "train_loss": 3.6266869604587555, "train_ppl": 37.58807930843084, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 152514, "dt_s": 4.297, "eta_s": 12124, "world_size": 1, "timestamp": "2026-05-05T04:24:04.539954"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64820, "epoch": 0, "train_loss": 3.7044695764780045, "train_ppl": 40.62849129302811, "lr": 0.00056, "grad_norm": 0.715, "tokens_per_sec": 148692, "dt_s": 4.407, "eta_s": 12140, "world_size": 1, "timestamp": "2026-05-05T04:24:08.947447"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64830, "epoch": 0, "train_loss": 3.7542345076799393, "train_ppl": 42.70151961095627, "lr": 0.00056, "grad_norm": 0.6529, "tokens_per_sec": 150427, "dt_s": 4.357, "eta_s": 12130, "world_size": 1, "timestamp": "2026-05-05T04:24:13.304123"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64840, "epoch": 0, "train_loss": 3.7953084260225296, "train_ppl": 44.49195676682189, "lr": 0.00056, "grad_norm": 0.654, "tokens_per_sec": 152430, "dt_s": 4.299, "eta_s": 12086, "world_size": 1, "timestamp": "2026-05-05T04:24:17.603547"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64850, "epoch": 0, "train_loss": 3.6372789442539215, "train_ppl": 37.98832760510832, "lr": 0.00056, "grad_norm": 0.6503, "tokens_per_sec": 150342, "dt_s": 4.359, "eta_s": 12090, "world_size": 1, "timestamp": "2026-05-05T04:24:21.962684"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64860, "epoch": 0, "train_loss": 3.5566301941871643, "train_ppl": 35.044903424344305, "lr": 0.00056, "grad_norm": 0.6356, "tokens_per_sec": 149989, "dt_s": 4.369, "eta_s": 12125, "world_size": 1, "timestamp": "2026-05-05T04:24:26.332071"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64870, "epoch": 0, "train_loss": 3.713292747735977, "train_ppl": 40.988549521540804, "lr": 0.00056, "grad_norm": 0.6456, "tokens_per_sec": 149189, "dt_s": 4.393, "eta_s": 12113, "world_size": 1, "timestamp": "2026-05-05T04:24:30.724880"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64880, "epoch": 0, "train_loss": 3.642008736729622, "train_ppl": 38.16843009913495, "lr": 0.00056, "grad_norm": 0.649, "tokens_per_sec": 150740, "dt_s": 4.348, "eta_s": 12104, "world_size": 1, "timestamp": "2026-05-05T04:24:35.072507"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64890, "epoch": 0, "train_loss": 3.6376660466194153, "train_ppl": 38.00303582319475, "lr": 0.00056, "grad_norm": 0.7025, "tokens_per_sec": 152023, "dt_s": 4.311, "eta_s": 12106, "world_size": 1, "timestamp": "2026-05-05T04:24:39.383448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64900, "epoch": 0, "train_loss": 3.652585729956627, "train_ppl": 38.574279877334796, "lr": 0.00056, "grad_norm": 0.7295, "tokens_per_sec": 150788, "dt_s": 4.346, "eta_s": 12094, "world_size": 1, "timestamp": "2026-05-05T04:24:43.729707"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64910, "epoch": 0, "train_loss": 3.6363139748573303, "train_ppl": 37.95168771257236, "lr": 0.00056, "grad_norm": 0.6824, "tokens_per_sec": 134952, "dt_s": 4.856, "eta_s": 12360, "world_size": 1, "timestamp": "2026-05-05T04:24:48.585960"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64920, "epoch": 0, "train_loss": 3.7548172622919083, "train_ppl": 42.72641137063707, "lr": 0.00056, "grad_norm": 0.635, "tokens_per_sec": 151044, "dt_s": 4.339, "eta_s": 12326, "world_size": 1, "timestamp": "2026-05-05T04:24:52.924816"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64930, "epoch": 0, "train_loss": 3.7917799055576324, "train_ppl": 44.33524263395616, "lr": 0.00056, "grad_norm": 0.8441, "tokens_per_sec": 149553, "dt_s": 4.382, "eta_s": 12340, "world_size": 1, "timestamp": "2026-05-05T04:24:57.306945"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64940, "epoch": 0, "train_loss": 3.7611399739980698, "train_ppl": 42.99741398485826, "lr": 0.00056, "grad_norm": 0.66, "tokens_per_sec": 150191, "dt_s": 4.364, "eta_s": 12365, "world_size": 1, "timestamp": "2026-05-05T04:25:01.670455"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64950, "epoch": 0, "train_loss": 3.7861037850379944, "train_ppl": 44.08430330816369, "lr": 0.00056, "grad_norm": 0.92, "tokens_per_sec": 151724, "dt_s": 4.319, "eta_s": 12346, "world_size": 1, "timestamp": "2026-05-05T04:25:05.989887"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64960, "epoch": 0, "train_loss": 3.7220340222120285, "train_ppl": 41.34841222118023, "lr": 0.00056, "grad_norm": 0.657, "tokens_per_sec": 148767, "dt_s": 4.405, "eta_s": 12091, "world_size": 1, "timestamp": "2026-05-05T04:25:10.395159"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64970, "epoch": 0, "train_loss": 3.7513996809720993, "train_ppl": 42.58063962056211, "lr": 0.00056, "grad_norm": 0.6506, "tokens_per_sec": 151012, "dt_s": 4.34, "eta_s": 12088, "world_size": 1, "timestamp": "2026-05-05T04:25:14.734941"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64980, "epoch": 0, "train_loss": 3.6956861913204193, "train_ppl": 40.273198227719945, "lr": 0.00056, "grad_norm": 0.6552, "tokens_per_sec": 151456, "dt_s": 4.327, "eta_s": 12053, "world_size": 1, "timestamp": "2026-05-05T04:25:19.062001"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 64990, "epoch": 0, "train_loss": 3.639954537153244, "train_ppl": 38.090105001428164, "lr": 0.00056, "grad_norm": 0.6052, "tokens_per_sec": 150015, "dt_s": 4.369, "eta_s": 12051, "world_size": 1, "timestamp": "2026-05-05T04:25:23.430633"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65000, "epoch": 0, "train_loss": 3.7292414158582687, "train_ppl": 41.647503042622105, "lr": 0.00056, "grad_norm": 0.6518, "tokens_per_sec": 151740, "dt_s": 4.319, "eta_s": 12047, "world_size": 1, "timestamp": "2026-05-05T04:25:27.749602"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65010, "epoch": 0, "train_loss": 3.757501944899559, "train_ppl": 42.84127233776296, "lr": 0.00056, "grad_norm": 0.7121, "tokens_per_sec": 127416, "dt_s": 5.143, "eta_s": 12028, "world_size": 1, "timestamp": "2026-05-05T04:25:32.893056"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65020, "epoch": 0, "train_loss": 3.686601296067238, "train_ppl": 39.90897739943432, "lr": 0.00056, "grad_norm": 0.7089, "tokens_per_sec": 151625, "dt_s": 4.322, "eta_s": 12014, "world_size": 1, "timestamp": "2026-05-05T04:25:37.215319"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65030, "epoch": 0, "train_loss": 3.770167574286461, "train_ppl": 43.387334828378, "lr": 0.00056, "grad_norm": 0.698, "tokens_per_sec": 147641, "dt_s": 4.439, "eta_s": 12071, "world_size": 1, "timestamp": "2026-05-05T04:25:41.654212"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65040, "epoch": 0, "train_loss": 3.7845454961061478, "train_ppl": 44.01566072263125, "lr": 0.00056, "grad_norm": 0.7537, "tokens_per_sec": 147782, "dt_s": 4.435, "eta_s": 12103, "world_size": 1, "timestamp": "2026-05-05T04:25:46.088832"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65050, "epoch": 0, "train_loss": 3.6859252005815506, "train_ppl": 39.88200423921956, "lr": 0.00056, "grad_norm": 0.6418, "tokens_per_sec": 151015, "dt_s": 4.34, "eta_s": 12110, "world_size": 1, "timestamp": "2026-05-05T04:25:50.428528"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65060, "epoch": 0, "train_loss": 3.7239671796560287, "train_ppl": 41.42842252340167, "lr": 0.00056, "grad_norm": 0.6736, "tokens_per_sec": 151509, "dt_s": 4.326, "eta_s": 12076, "world_size": 1, "timestamp": "2026-05-05T04:25:54.754068"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65070, "epoch": 0, "train_loss": 3.7955834716558456, "train_ppl": 44.504195768313096, "lr": 0.00056, "grad_norm": 0.6273, "tokens_per_sec": 148566, "dt_s": 4.411, "eta_s": 12121, "world_size": 1, "timestamp": "2026-05-05T04:25:59.165325"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65080, "epoch": 0, "train_loss": 3.7471481412649155, "train_ppl": 42.399990630745975, "lr": 0.00056, "grad_norm": 0.6741, "tokens_per_sec": 151915, "dt_s": 4.314, "eta_s": 12048, "world_size": 1, "timestamp": "2026-05-05T04:26:03.479334"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65090, "epoch": 0, "train_loss": 3.7069326788187027, "train_ppl": 40.72868677023212, "lr": 0.00056, "grad_norm": 0.6131, "tokens_per_sec": 148176, "dt_s": 4.423, "eta_s": 12037, "world_size": 1, "timestamp": "2026-05-05T04:26:07.902169"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65100, "epoch": 0, "train_loss": 3.678173154592514, "train_ppl": 39.574032356596774, "lr": 0.00056, "grad_norm": 0.6834, "tokens_per_sec": 151013, "dt_s": 4.34, "eta_s": 12033, "world_size": 1, "timestamp": "2026-05-05T04:26:12.241923"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65110, "epoch": 0, "train_loss": 3.6802920550107956, "train_ppl": 39.65797469162806, "lr": 0.00056, "grad_norm": 0.7376, "tokens_per_sec": 151614, "dt_s": 4.323, "eta_s": 12027, "world_size": 1, "timestamp": "2026-05-05T04:26:16.564493"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65120, "epoch": 0, "train_loss": 3.673302173614502, "train_ppl": 39.38173671218529, "lr": 0.00056, "grad_norm": 0.6305, "tokens_per_sec": 149278, "dt_s": 4.39, "eta_s": 12011, "world_size": 1, "timestamp": "2026-05-05T04:26:20.954665"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65130, "epoch": 0, "train_loss": 3.7292447984218597, "train_ppl": 41.64764391818781, "lr": 0.00056, "grad_norm": 0.6554, "tokens_per_sec": 151801, "dt_s": 4.317, "eta_s": 12008, "world_size": 1, "timestamp": "2026-05-05T04:26:25.271904"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65140, "epoch": 0, "train_loss": 3.739466056227684, "train_ppl": 42.075518205236655, "lr": 0.00056, "grad_norm": 0.7102, "tokens_per_sec": 150014, "dt_s": 4.369, "eta_s": 11974, "world_size": 1, "timestamp": "2026-05-05T04:26:29.640559"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65150, "epoch": 0, "train_loss": 3.6744061559438705, "train_ppl": 39.425237461226374, "lr": 0.00056, "grad_norm": 0.6522, "tokens_per_sec": 150477, "dt_s": 4.355, "eta_s": 11978, "world_size": 1, "timestamp": "2026-05-05T04:26:33.995762"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65160, "epoch": 0, "train_loss": 3.7694711089134216, "train_ppl": 43.357127572416516, "lr": 0.00056, "grad_norm": 0.7101, "tokens_per_sec": 150922, "dt_s": 4.342, "eta_s": 11985, "world_size": 1, "timestamp": "2026-05-05T04:26:38.338135"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65170, "epoch": 0, "train_loss": 3.668886825442314, "train_ppl": 39.20823594798728, "lr": 0.00056, "grad_norm": 0.663, "tokens_per_sec": 150267, "dt_s": 4.361, "eta_s": 11964, "world_size": 1, "timestamp": "2026-05-05T04:26:42.699493"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65180, "epoch": 0, "train_loss": 3.6630125492811203, "train_ppl": 38.978591101224815, "lr": 0.00056, "grad_norm": 0.6497, "tokens_per_sec": 150387, "dt_s": 4.358, "eta_s": 11982, "world_size": 1, "timestamp": "2026-05-05T04:26:47.057308"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65190, "epoch": 0, "train_loss": 3.7533855885267258, "train_ppl": 42.66528485545227, "lr": 0.00056, "grad_norm": 0.675, "tokens_per_sec": 151947, "dt_s": 4.313, "eta_s": 11947, "world_size": 1, "timestamp": "2026-05-05T04:26:51.370386"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65200, "epoch": 0, "train_loss": 3.633302092552185, "train_ppl": 37.83755366134507, "lr": 0.00056, "grad_norm": 0.7395, "tokens_per_sec": 151184, "dt_s": 4.335, "eta_s": 11932, "world_size": 1, "timestamp": "2026-05-05T04:26:55.705234"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65210, "epoch": 0, "train_loss": 3.64633372426033, "train_ppl": 38.33386557869393, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 134743, "dt_s": 4.864, "eta_s": 12214, "world_size": 1, "timestamp": "2026-05-05T04:27:00.568982"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65220, "epoch": 0, "train_loss": 3.868505284190178, "train_ppl": 47.87077936755868, "lr": 0.00056, "grad_norm": 0.7302, "tokens_per_sec": 151348, "dt_s": 4.33, "eta_s": 12192, "world_size": 1, "timestamp": "2026-05-05T04:27:04.899154"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65230, "epoch": 0, "train_loss": 3.6587690114974976, "train_ppl": 38.81353443691973, "lr": 0.00056, "grad_norm": 0.6979, "tokens_per_sec": 148542, "dt_s": 4.412, "eta_s": 12218, "world_size": 1, "timestamp": "2026-05-05T04:27:09.311089"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65240, "epoch": 0, "train_loss": 3.83159601688385, "train_ppl": 46.13611352103126, "lr": 0.00056, "grad_norm": 0.7736, "tokens_per_sec": 150683, "dt_s": 4.349, "eta_s": 12233, "world_size": 1, "timestamp": "2026-05-05T04:27:13.660415"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65250, "epoch": 0, "train_loss": 3.779458701610565, "train_ppl": 43.7923306007296, "lr": 0.00056, "grad_norm": 0.654, "tokens_per_sec": 151483, "dt_s": 4.326, "eta_s": 12224, "world_size": 1, "timestamp": "2026-05-05T04:27:17.986717"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65260, "epoch": 0, "train_loss": 3.7984949350357056, "train_ppl": 44.63395691028785, "lr": 0.00056, "grad_norm": 0.6735, "tokens_per_sec": 150501, "dt_s": 4.355, "eta_s": 11940, "world_size": 1, "timestamp": "2026-05-05T04:27:22.341210"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65270, "epoch": 0, "train_loss": 3.674904778599739, "train_ppl": 39.44490067969325, "lr": 0.00056, "grad_norm": 0.6846, "tokens_per_sec": 152081, "dt_s": 4.309, "eta_s": 11924, "world_size": 1, "timestamp": "2026-05-05T04:27:26.650484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65280, "epoch": 0, "train_loss": 3.5820581912994385, "train_ppl": 35.94745149174576, "lr": 0.00056, "grad_norm": 0.6436, "tokens_per_sec": 151069, "dt_s": 4.338, "eta_s": 11880, "world_size": 1, "timestamp": "2026-05-05T04:27:30.988642"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65290, "epoch": 0, "train_loss": 3.6454616338014603, "train_ppl": 38.30044955328747, "lr": 0.00056, "grad_norm": 0.7188, "tokens_per_sec": 148636, "dt_s": 4.409, "eta_s": 11908, "world_size": 1, "timestamp": "2026-05-05T04:27:35.397782"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65300, "epoch": 0, "train_loss": 3.7430407851934433, "train_ppl": 42.22619593448059, "lr": 0.00056, "grad_norm": 0.7754, "tokens_per_sec": 151238, "dt_s": 4.333, "eta_s": 11908, "world_size": 1, "timestamp": "2026-05-05T04:27:39.731085"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65310, "epoch": 0, "train_loss": 3.708971530199051, "train_ppl": 40.811811219877725, "lr": 0.00056, "grad_norm": 0.7037, "tokens_per_sec": 151644, "dt_s": 4.322, "eta_s": 11885, "world_size": 1, "timestamp": "2026-05-05T04:27:44.052820"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65320, "epoch": 0, "train_loss": 3.7132257521152496, "train_ppl": 40.98580356020761, "lr": 0.00056, "grad_norm": 0.6892, "tokens_per_sec": 147707, "dt_s": 4.437, "eta_s": 11951, "world_size": 1, "timestamp": "2026-05-05T04:27:48.489709"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65330, "epoch": 0, "train_loss": 3.712296813726425, "train_ppl": 40.94774795227787, "lr": 0.00056, "grad_norm": 0.6446, "tokens_per_sec": 153230, "dt_s": 4.277, "eta_s": 11913, "world_size": 1, "timestamp": "2026-05-05T04:27:52.766671"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65340, "epoch": 0, "train_loss": 3.7880276888608932, "train_ppl": 44.16919890712571, "lr": 0.00056, "grad_norm": 0.7018, "tokens_per_sec": 152008, "dt_s": 4.311, "eta_s": 11855, "world_size": 1, "timestamp": "2026-05-05T04:27:57.078002"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65350, "epoch": 0, "train_loss": 3.744278073310852, "train_ppl": 42.27847423994745, "lr": 0.00056, "grad_norm": 0.656, "tokens_per_sec": 150470, "dt_s": 4.355, "eta_s": 11863, "world_size": 1, "timestamp": "2026-05-05T04:28:01.433443"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65360, "epoch": 0, "train_loss": 3.8085808157920837, "train_ppl": 45.086407523381546, "lr": 0.00056, "grad_norm": 0.6373, "tokens_per_sec": 154602, "dt_s": 4.239, "eta_s": 11813, "world_size": 1, "timestamp": "2026-05-05T04:28:05.672441"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65370, "epoch": 0, "train_loss": 3.66580069065094, "train_ppl": 39.087420569081296, "lr": 0.00056, "grad_norm": 0.7263, "tokens_per_sec": 150742, "dt_s": 4.348, "eta_s": 11760, "world_size": 1, "timestamp": "2026-05-05T04:28:10.020003"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65380, "epoch": 0, "train_loss": 3.6348276883363724, "train_ppl": 37.89532252846037, "lr": 0.00056, "grad_norm": 0.6909, "tokens_per_sec": 152800, "dt_s": 4.289, "eta_s": 11763, "world_size": 1, "timestamp": "2026-05-05T04:28:14.309003"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65390, "epoch": 0, "train_loss": 3.721183940768242, "train_ppl": 41.313277638964955, "lr": 0.00056, "grad_norm": 0.6767, "tokens_per_sec": 151741, "dt_s": 4.319, "eta_s": 11762, "world_size": 1, "timestamp": "2026-05-05T04:28:18.627940"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65400, "epoch": 0, "train_loss": 3.744463875889778, "train_ppl": 42.28633041932088, "lr": 0.00056, "grad_norm": 0.7315, "tokens_per_sec": 149660, "dt_s": 4.379, "eta_s": 11771, "world_size": 1, "timestamp": "2026-05-05T04:28:23.006931"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65410, "epoch": 0, "train_loss": 3.7274206280708313, "train_ppl": 41.571740772144274, "lr": 0.00056, "grad_norm": 0.6752, "tokens_per_sec": 153127, "dt_s": 4.28, "eta_s": 11789, "world_size": 1, "timestamp": "2026-05-05T04:28:27.286774"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65420, "epoch": 0, "train_loss": 3.8033931255340576, "train_ppl": 44.8531188441983, "lr": 0.00056, "grad_norm": 0.7191, "tokens_per_sec": 152018, "dt_s": 4.311, "eta_s": 11765, "world_size": 1, "timestamp": "2026-05-05T04:28:31.597829"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65430, "epoch": 0, "train_loss": 3.708547458052635, "train_ppl": 40.79450773671654, "lr": 0.00056, "grad_norm": 0.6702, "tokens_per_sec": 148841, "dt_s": 4.403, "eta_s": 11822, "world_size": 1, "timestamp": "2026-05-05T04:28:36.000935"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65440, "epoch": 0, "train_loss": 3.7109930515289307, "train_ppl": 40.894396612712214, "lr": 0.00056, "grad_norm": 0.6908, "tokens_per_sec": 150560, "dt_s": 4.353, "eta_s": 11837, "world_size": 1, "timestamp": "2026-05-05T04:28:40.353741"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65450, "epoch": 0, "train_loss": 3.6161555647850037, "train_ppl": 37.19430152692378, "lr": 0.00056, "grad_norm": 0.6555, "tokens_per_sec": 150884, "dt_s": 4.343, "eta_s": 11813, "world_size": 1, "timestamp": "2026-05-05T04:28:44.697223"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65460, "epoch": 0, "train_loss": 3.6888202726840973, "train_ppl": 39.997632812853894, "lr": 0.00056, "grad_norm": 0.6904, "tokens_per_sec": 149168, "dt_s": 4.393, "eta_s": 11870, "world_size": 1, "timestamp": "2026-05-05T04:28:49.090668"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65470, "epoch": 0, "train_loss": 3.798012539744377, "train_ppl": 44.61243089208332, "lr": 0.00056, "grad_norm": 0.6501, "tokens_per_sec": 150034, "dt_s": 4.368, "eta_s": 11897, "world_size": 1, "timestamp": "2026-05-05T04:28:53.458731"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65480, "epoch": 0, "train_loss": 3.8509788662195206, "train_ppl": 47.03908567496124, "lr": 0.00056, "grad_norm": 0.7276, "tokens_per_sec": 151209, "dt_s": 4.334, "eta_s": 11855, "world_size": 1, "timestamp": "2026-05-05T04:28:57.792874"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65490, "epoch": 0, "train_loss": 3.7106943279504776, "train_ppl": 40.88218231665733, "lr": 0.00056, "grad_norm": 0.6562, "tokens_per_sec": 151406, "dt_s": 4.328, "eta_s": 11838, "world_size": 1, "timestamp": "2026-05-05T04:29:02.121371"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65500, "epoch": 0, "train_loss": 3.7568662762641907, "train_ppl": 42.814048138340134, "lr": 0.00056, "grad_norm": 0.6738, "tokens_per_sec": 149801, "dt_s": 4.375, "eta_s": 11850, "world_size": 1, "timestamp": "2026-05-05T04:29:06.496252"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65510, "epoch": 0, "train_loss": 3.763581544160843, "train_ppl": 43.10252345177378, "lr": 0.00056, "grad_norm": 0.7037, "tokens_per_sec": 115171, "dt_s": 5.69, "eta_s": 12133, "world_size": 1, "timestamp": "2026-05-05T04:29:12.186585"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65520, "epoch": 0, "train_loss": 3.6134150326251984, "train_ppl": 37.092508894129246, "lr": 0.00056, "grad_norm": 0.8584, "tokens_per_sec": 150975, "dt_s": 4.341, "eta_s": 12113, "world_size": 1, "timestamp": "2026-05-05T04:29:16.527421"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65530, "epoch": 0, "train_loss": 3.7423404306173325, "train_ppl": 42.196632978410285, "lr": 0.00056, "grad_norm": 0.718, "tokens_per_sec": 149056, "dt_s": 4.397, "eta_s": 12143, "world_size": 1, "timestamp": "2026-05-05T04:29:20.924149"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65540, "epoch": 0, "train_loss": 3.7635058015584946, "train_ppl": 43.09925887811497, "lr": 0.00056, "grad_norm": 0.6689, "tokens_per_sec": 149341, "dt_s": 4.388, "eta_s": 12171, "world_size": 1, "timestamp": "2026-05-05T04:29:25.312488"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65550, "epoch": 0, "train_loss": 3.6703672856092453, "train_ppl": 39.26632516830251, "lr": 0.00056, "grad_norm": 0.6873, "tokens_per_sec": 152568, "dt_s": 4.296, "eta_s": 12123, "world_size": 1, "timestamp": "2026-05-05T04:29:29.608002"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65560, "epoch": 0, "train_loss": 3.716965824365616, "train_ppl": 41.13938044205795, "lr": 0.00056, "grad_norm": 0.6931, "tokens_per_sec": 149840, "dt_s": 4.374, "eta_s": 11822, "world_size": 1, "timestamp": "2026-05-05T04:29:33.981727"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65570, "epoch": 0, "train_loss": 3.6794827729463577, "train_ppl": 39.6258931872422, "lr": 0.00056, "grad_norm": 0.6605, "tokens_per_sec": 150729, "dt_s": 4.348, "eta_s": 11822, "world_size": 1, "timestamp": "2026-05-05T04:29:38.329664"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65580, "epoch": 0, "train_loss": 3.6715540885925293, "train_ppl": 39.31295422443322, "lr": 0.00056, "grad_norm": 0.6255, "tokens_per_sec": 150899, "dt_s": 4.343, "eta_s": 11788, "world_size": 1, "timestamp": "2026-05-05T04:29:42.672699"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65590, "epoch": 0, "train_loss": 3.8153882026672363, "train_ppl": 45.39437518041647, "lr": 0.00056, "grad_norm": 0.6839, "tokens_per_sec": 147778, "dt_s": 4.435, "eta_s": 11809, "world_size": 1, "timestamp": "2026-05-05T04:29:47.107464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65600, "epoch": 0, "train_loss": 3.675366371870041, "train_ppl": 39.46311238327126, "lr": 0.00056, "grad_norm": 0.6356, "tokens_per_sec": 149270, "dt_s": 4.39, "eta_s": 11856, "world_size": 1, "timestamp": "2026-05-05T04:29:51.497911"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65610, "epoch": 0, "train_loss": 3.6911792755126953, "train_ppl": 40.09209872066077, "lr": 0.00056, "grad_norm": 0.643, "tokens_per_sec": 149561, "dt_s": 4.382, "eta_s": 11856, "world_size": 1, "timestamp": "2026-05-05T04:29:55.879811"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65620, "epoch": 0, "train_loss": 3.6180287152528763, "train_ppl": 37.264037342678954, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 146521, "dt_s": 4.473, "eta_s": 11919, "world_size": 1, "timestamp": "2026-05-05T04:30:00.352635"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65630, "epoch": 0, "train_loss": 3.608576849102974, "train_ppl": 36.91348196060165, "lr": 0.00056, "grad_norm": 0.6994, "tokens_per_sec": 150623, "dt_s": 4.351, "eta_s": 11919, "world_size": 1, "timestamp": "2026-05-05T04:30:04.703605"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65640, "epoch": 0, "train_loss": 3.645551010966301, "train_ppl": 38.30387289186254, "lr": 0.00056, "grad_norm": 0.6276, "tokens_per_sec": 150444, "dt_s": 4.356, "eta_s": 11872, "world_size": 1, "timestamp": "2026-05-05T04:30:09.059790"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65650, "epoch": 0, "train_loss": 3.657556965947151, "train_ppl": 38.76651916329451, "lr": 0.00056, "grad_norm": 0.6517, "tokens_per_sec": 151597, "dt_s": 4.323, "eta_s": 11831, "world_size": 1, "timestamp": "2026-05-05T04:30:13.382832"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65660, "epoch": 0, "train_loss": 3.7665088176727295, "train_ppl": 43.22888117855374, "lr": 0.00056, "grad_norm": 0.7325, "tokens_per_sec": 152400, "dt_s": 4.3, "eta_s": 11783, "world_size": 1, "timestamp": "2026-05-05T04:30:17.683097"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65670, "epoch": 0, "train_loss": 3.7037722021341324, "train_ppl": 40.60016790272087, "lr": 0.00056, "grad_norm": 0.7686, "tokens_per_sec": 151469, "dt_s": 4.327, "eta_s": 11700, "world_size": 1, "timestamp": "2026-05-05T04:30:22.009798"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65680, "epoch": 0, "train_loss": 3.726530835032463, "train_ppl": 41.53476697856388, "lr": 0.00056, "grad_norm": 0.6458, "tokens_per_sec": 150998, "dt_s": 4.34, "eta_s": 11689, "world_size": 1, "timestamp": "2026-05-05T04:30:26.349986"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65690, "epoch": 0, "train_loss": 3.716224789619446, "train_ppl": 41.10890602441019, "lr": 0.00056, "grad_norm": 0.7185, "tokens_per_sec": 151808, "dt_s": 4.317, "eta_s": 11664, "world_size": 1, "timestamp": "2026-05-05T04:30:30.667010"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65700, "epoch": 0, "train_loss": 3.641024887561798, "train_ppl": 38.13089658762938, "lr": 0.00056, "grad_norm": 0.7157, "tokens_per_sec": 151429, "dt_s": 4.328, "eta_s": 11662, "world_size": 1, "timestamp": "2026-05-05T04:30:34.994874"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65710, "epoch": 0, "train_loss": 3.7728925198316574, "train_ppl": 43.505724182150985, "lr": 0.00056, "grad_norm": 0.6744, "tokens_per_sec": 150512, "dt_s": 4.354, "eta_s": 11687, "world_size": 1, "timestamp": "2026-05-05T04:30:39.349055"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65720, "epoch": 0, "train_loss": 3.8339030146598816, "train_ppl": 46.2426723004917, "lr": 0.00056, "grad_norm": 0.6958, "tokens_per_sec": 153071, "dt_s": 4.281, "eta_s": 11658, "world_size": 1, "timestamp": "2026-05-05T04:30:43.630456"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65730, "epoch": 0, "train_loss": 3.6907281577587128, "train_ppl": 40.07401654203602, "lr": 0.00056, "grad_norm": 0.6844, "tokens_per_sec": 150060, "dt_s": 4.367, "eta_s": 11669, "world_size": 1, "timestamp": "2026-05-05T04:30:47.997812"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65740, "epoch": 0, "train_loss": 3.788716733455658, "train_ppl": 44.19964394267678, "lr": 0.00056, "grad_norm": 0.6752, "tokens_per_sec": 150970, "dt_s": 4.341, "eta_s": 11677, "world_size": 1, "timestamp": "2026-05-05T04:30:52.338787"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65750, "epoch": 0, "train_loss": 3.6282825767993927, "train_ppl": 37.64810333691383, "lr": 0.00056, "grad_norm": 0.6518, "tokens_per_sec": 150509, "dt_s": 4.354, "eta_s": 11687, "world_size": 1, "timestamp": "2026-05-05T04:30:56.693109"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65760, "epoch": 0, "train_loss": 3.697641372680664, "train_ppl": 40.35201666126979, "lr": 0.00056, "grad_norm": 0.6948, "tokens_per_sec": 146513, "dt_s": 4.473, "eta_s": 11747, "world_size": 1, "timestamp": "2026-05-05T04:31:01.166144"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65770, "epoch": 0, "train_loss": 3.698491796851158, "train_ppl": 40.38634758742234, "lr": 0.00056, "grad_norm": 0.6739, "tokens_per_sec": 148787, "dt_s": 4.405, "eta_s": 11809, "world_size": 1, "timestamp": "2026-05-05T04:31:05.570835"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65780, "epoch": 0, "train_loss": 3.7145781964063644, "train_ppl": 41.04127207683387, "lr": 0.00056, "grad_norm": 0.669, "tokens_per_sec": 151796, "dt_s": 4.317, "eta_s": 11777, "world_size": 1, "timestamp": "2026-05-05T04:31:09.888204"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65790, "epoch": 0, "train_loss": 3.739228904247284, "train_ppl": 42.065541095860304, "lr": 0.00056, "grad_norm": 0.6809, "tokens_per_sec": 147626, "dt_s": 4.439, "eta_s": 11826, "world_size": 1, "timestamp": "2026-05-05T04:31:14.327559"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65800, "epoch": 0, "train_loss": 3.7943329364061356, "train_ppl": 44.448576486929085, "lr": 0.00056, "grad_norm": 0.6812, "tokens_per_sec": 135001, "dt_s": 4.854, "eta_s": 12091, "world_size": 1, "timestamp": "2026-05-05T04:31:19.181999"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65810, "epoch": 0, "train_loss": 3.7807489186525345, "train_ppl": 43.84886867733014, "lr": 0.00056, "grad_norm": 0.7005, "tokens_per_sec": 149714, "dt_s": 4.377, "eta_s": 12035, "world_size": 1, "timestamp": "2026-05-05T04:31:23.559427"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65820, "epoch": 0, "train_loss": 3.81530824303627, "train_ppl": 45.39074560804067, "lr": 0.00056, "grad_norm": 0.6858, "tokens_per_sec": 152584, "dt_s": 4.295, "eta_s": 11971, "world_size": 1, "timestamp": "2026-05-05T04:31:27.854487"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65830, "epoch": 0, "train_loss": 3.6132114976644516, "train_ppl": 37.0849600400414, "lr": 0.00056, "grad_norm": 0.6817, "tokens_per_sec": 149095, "dt_s": 4.396, "eta_s": 12009, "world_size": 1, "timestamp": "2026-05-05T04:31:32.250080"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65840, "epoch": 0, "train_loss": 3.781447157263756, "train_ppl": 43.87949634196369, "lr": 0.00056, "grad_norm": 0.7227, "tokens_per_sec": 148380, "dt_s": 4.417, "eta_s": 11992, "world_size": 1, "timestamp": "2026-05-05T04:31:36.666873"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65850, "epoch": 0, "train_loss": 3.7009845972061157, "train_ppl": 40.48714827481997, "lr": 0.00056, "grad_norm": 0.637, "tokens_per_sec": 151338, "dt_s": 4.33, "eta_s": 11707, "world_size": 1, "timestamp": "2026-05-05T04:31:40.997303"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65860, "epoch": 0, "train_loss": 3.6932216584682465, "train_ppl": 40.17406581534604, "lr": 0.00056, "grad_norm": 0.6575, "tokens_per_sec": 152007, "dt_s": 4.311, "eta_s": 11667, "world_size": 1, "timestamp": "2026-05-05T04:31:45.308657"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65870, "epoch": 0, "train_loss": 3.6596427261829376, "train_ppl": 38.84746121095321, "lr": 0.00056, "grad_norm": 0.7515, "tokens_per_sec": 150005, "dt_s": 4.369, "eta_s": 11702, "world_size": 1, "timestamp": "2026-05-05T04:31:49.677576"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65880, "epoch": 0, "train_loss": 3.7436840683221817, "train_ppl": 42.25336807266807, "lr": 0.00056, "grad_norm": 0.6981, "tokens_per_sec": 152314, "dt_s": 4.303, "eta_s": 11648, "world_size": 1, "timestamp": "2026-05-05T04:31:53.980264"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65890, "epoch": 0, "train_loss": 3.721322014927864, "train_ppl": 41.318982328882264, "lr": 0.00056, "grad_norm": 0.6803, "tokens_per_sec": 150636, "dt_s": 4.351, "eta_s": 11608, "world_size": 1, "timestamp": "2026-05-05T04:31:58.330886"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65900, "epoch": 0, "train_loss": 3.701427638530731, "train_ppl": 40.50508972873083, "lr": 0.00056, "grad_norm": 0.6592, "tokens_per_sec": 150865, "dt_s": 4.344, "eta_s": 11611, "world_size": 1, "timestamp": "2026-05-05T04:32:02.674888"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65910, "epoch": 0, "train_loss": 3.678079664707184, "train_ppl": 39.57033275778993, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 152337, "dt_s": 4.302, "eta_s": 11602, "world_size": 1, "timestamp": "2026-05-05T04:32:06.976916"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65920, "epoch": 0, "train_loss": 3.7397918552160263, "train_ppl": 42.08922859979696, "lr": 0.00056, "grad_norm": 0.6774, "tokens_per_sec": 148059, "dt_s": 4.426, "eta_s": 11628, "world_size": 1, "timestamp": "2026-05-05T04:32:11.403266"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65930, "epoch": 0, "train_loss": 3.768304705619812, "train_ppl": 43.306585158164054, "lr": 0.00056, "grad_norm": 0.6694, "tokens_per_sec": 151742, "dt_s": 4.319, "eta_s": 11632, "world_size": 1, "timestamp": "2026-05-05T04:32:15.722169"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65940, "epoch": 0, "train_loss": 3.7565504014492035, "train_ppl": 42.80052639450718, "lr": 0.00056, "grad_norm": 0.6502, "tokens_per_sec": 152217, "dt_s": 4.305, "eta_s": 11604, "world_size": 1, "timestamp": "2026-05-05T04:32:20.027620"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65950, "epoch": 0, "train_loss": 3.696807324886322, "train_ppl": 40.31837518202717, "lr": 0.00056, "grad_norm": 0.685, "tokens_per_sec": 149188, "dt_s": 4.393, "eta_s": 11626, "world_size": 1, "timestamp": "2026-05-05T04:32:24.420468"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65960, "epoch": 0, "train_loss": 3.7499055713415146, "train_ppl": 42.51706698090159, "lr": 0.00056, "grad_norm": 0.7702, "tokens_per_sec": 150876, "dt_s": 4.344, "eta_s": 11644, "world_size": 1, "timestamp": "2026-05-05T04:32:28.764154"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65970, "epoch": 0, "train_loss": 3.7805924117565155, "train_ppl": 43.842006563997536, "lr": 0.00056, "grad_norm": 0.6627, "tokens_per_sec": 152535, "dt_s": 4.296, "eta_s": 11570, "world_size": 1, "timestamp": "2026-05-05T04:32:33.060635"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65980, "epoch": 0, "train_loss": 3.660671591758728, "train_ppl": 38.88745059482263, "lr": 0.00056, "grad_norm": 0.7043, "tokens_per_sec": 149882, "dt_s": 4.372, "eta_s": 11594, "world_size": 1, "timestamp": "2026-05-05T04:32:37.433129"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 65990, "epoch": 0, "train_loss": 3.7865359634160995, "train_ppl": 44.103359708452786, "lr": 0.00056, "grad_norm": 0.6984, "tokens_per_sec": 151848, "dt_s": 4.316, "eta_s": 11595, "world_size": 1, "timestamp": "2026-05-05T04:32:41.749011"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66000, "epoch": 0, "train_loss": 3.625954493880272, "train_ppl": 37.56055737726588, "lr": 0.00056, "grad_norm": 0.622, "tokens_per_sec": 151309, "dt_s": 4.331, "eta_s": 11558, "world_size": 1, "timestamp": "2026-05-05T04:32:46.080297"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66010, "epoch": 0, "train_loss": 3.711889773607254, "train_ppl": 40.931083967756784, "lr": 0.00056, "grad_norm": 0.6348, "tokens_per_sec": 127227, "dt_s": 5.151, "eta_s": 11575, "world_size": 1, "timestamp": "2026-05-05T04:32:51.231412"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66020, "epoch": 0, "train_loss": 3.7464506775140762, "train_ppl": 42.37042848470613, "lr": 0.00056, "grad_norm": 0.7188, "tokens_per_sec": 152046, "dt_s": 4.31, "eta_s": 11578, "world_size": 1, "timestamp": "2026-05-05T04:32:55.541711"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66030, "epoch": 0, "train_loss": 3.6731574535369873, "train_ppl": 39.37603779657934, "lr": 0.00056, "grad_norm": 0.6416, "tokens_per_sec": 146943, "dt_s": 4.46, "eta_s": 11621, "world_size": 1, "timestamp": "2026-05-05T04:33:00.001656"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66040, "epoch": 0, "train_loss": 3.716624528169632, "train_ppl": 41.125342123756525, "lr": 0.00056, "grad_norm": 0.6243, "tokens_per_sec": 151544, "dt_s": 4.325, "eta_s": 11621, "world_size": 1, "timestamp": "2026-05-05T04:33:04.326203"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66050, "epoch": 0, "train_loss": 3.802226573228836, "train_ppl": 44.800825842205455, "lr": 0.00056, "grad_norm": 0.7041, "tokens_per_sec": 152393, "dt_s": 4.3, "eta_s": 11600, "world_size": 1, "timestamp": "2026-05-05T04:33:08.626739"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66060, "epoch": 0, "train_loss": 3.642508491873741, "train_ppl": 38.18750973559608, "lr": 0.00056, "grad_norm": 0.7015, "tokens_per_sec": 149103, "dt_s": 4.395, "eta_s": 11602, "world_size": 1, "timestamp": "2026-05-05T04:33:13.022004"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66070, "epoch": 0, "train_loss": 3.7251989543437958, "train_ppl": 41.479484447556736, "lr": 0.00056, "grad_norm": 0.6423, "tokens_per_sec": 152165, "dt_s": 4.307, "eta_s": 11596, "world_size": 1, "timestamp": "2026-05-05T04:33:17.328902"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66080, "epoch": 0, "train_loss": 3.7559599727392197, "train_ppl": 42.77526319371488, "lr": 0.00056, "grad_norm": 0.7076, "tokens_per_sec": 152096, "dt_s": 4.309, "eta_s": 11511, "world_size": 1, "timestamp": "2026-05-05T04:33:21.637778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66090, "epoch": 0, "train_loss": 3.8202054053545, "train_ppl": 45.61357663154322, "lr": 0.00056, "grad_norm": 0.72, "tokens_per_sec": 151370, "dt_s": 4.33, "eta_s": 11509, "world_size": 1, "timestamp": "2026-05-05T04:33:25.967312"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66100, "epoch": 0, "train_loss": 3.6065485030412674, "train_ppl": 36.83868452802611, "lr": 0.00056, "grad_norm": 0.709, "tokens_per_sec": 135033, "dt_s": 4.853, "eta_s": 11799, "world_size": 1, "timestamp": "2026-05-05T04:33:30.820645"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66110, "epoch": 0, "train_loss": 3.6642015129327774, "train_ppl": 39.02496279090101, "lr": 0.00056, "grad_norm": 0.6619, "tokens_per_sec": 152440, "dt_s": 4.299, "eta_s": 11743, "world_size": 1, "timestamp": "2026-05-05T04:33:35.119776"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66120, "epoch": 0, "train_loss": 3.802457094192505, "train_ppl": 44.81115456219926, "lr": 0.00056, "grad_norm": 0.7354, "tokens_per_sec": 152564, "dt_s": 4.296, "eta_s": 11733, "world_size": 1, "timestamp": "2026-05-05T04:33:39.415411"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66130, "epoch": 0, "train_loss": 3.7956923991441727, "train_ppl": 44.50904376261328, "lr": 0.00056, "grad_norm": 1.7286, "tokens_per_sec": 153101, "dt_s": 4.281, "eta_s": 11713, "world_size": 1, "timestamp": "2026-05-05T04:33:43.695982"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66140, "epoch": 0, "train_loss": 3.642325982451439, "train_ppl": 38.18054079122343, "lr": 0.00056, "grad_norm": 0.786, "tokens_per_sec": 150265, "dt_s": 4.361, "eta_s": 11726, "world_size": 1, "timestamp": "2026-05-05T04:33:48.057363"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66150, "epoch": 0, "train_loss": 3.6736123859882355, "train_ppl": 39.39395530929455, "lr": 0.00056, "grad_norm": 0.7334, "tokens_per_sec": 152783, "dt_s": 4.289, "eta_s": 11422, "world_size": 1, "timestamp": "2026-05-05T04:33:52.346834"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66160, "epoch": 0, "train_loss": 3.7173221856355667, "train_ppl": 41.15404353644156, "lr": 0.00056, "grad_norm": 0.6981, "tokens_per_sec": 152712, "dt_s": 4.291, "eta_s": 11414, "world_size": 1, "timestamp": "2026-05-05T04:33:56.638311"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66170, "epoch": 0, "train_loss": 3.6568614691495895, "train_ppl": 38.73956654717938, "lr": 0.00056, "grad_norm": 0.6997, "tokens_per_sec": 151713, "dt_s": 4.32, "eta_s": 11422, "world_size": 1, "timestamp": "2026-05-05T04:34:00.958035"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66180, "epoch": 0, "train_loss": 3.806247800588608, "train_ppl": 44.98134285554831, "lr": 0.00056, "grad_norm": 0.7405, "tokens_per_sec": 153540, "dt_s": 4.268, "eta_s": 11412, "world_size": 1, "timestamp": "2026-05-05T04:34:05.226381"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66190, "epoch": 0, "train_loss": 3.6753054559230804, "train_ppl": 39.460708523627865, "lr": 0.00056, "grad_norm": 0.6769, "tokens_per_sec": 150212, "dt_s": 4.363, "eta_s": 11408, "world_size": 1, "timestamp": "2026-05-05T04:34:09.589298"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66200, "epoch": 0, "train_loss": 3.678682044148445, "train_ppl": 39.59417629343394, "lr": 0.00056, "grad_norm": 0.6901, "tokens_per_sec": 148416, "dt_s": 4.416, "eta_s": 11471, "world_size": 1, "timestamp": "2026-05-05T04:34:14.004973"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66210, "epoch": 0, "train_loss": 3.6793118864297867, "train_ppl": 39.61912223493812, "lr": 0.00056, "grad_norm": 0.6854, "tokens_per_sec": 151572, "dt_s": 4.324, "eta_s": 11483, "world_size": 1, "timestamp": "2026-05-05T04:34:18.328748"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66220, "epoch": 0, "train_loss": 3.7069424241781235, "train_ppl": 40.72908368785748, "lr": 0.00056, "grad_norm": 0.7028, "tokens_per_sec": 149817, "dt_s": 4.374, "eta_s": 11508, "world_size": 1, "timestamp": "2026-05-05T04:34:22.703124"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66230, "epoch": 0, "train_loss": 3.63191457092762, "train_ppl": 37.78508964331612, "lr": 0.00056, "grad_norm": 0.6668, "tokens_per_sec": 150800, "dt_s": 4.346, "eta_s": 11545, "world_size": 1, "timestamp": "2026-05-05T04:34:27.049020"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66240, "epoch": 0, "train_loss": 3.6926631182432175, "train_ppl": 40.15163324891374, "lr": 0.00056, "grad_norm": 0.6816, "tokens_per_sec": 152484, "dt_s": 4.298, "eta_s": 11506, "world_size": 1, "timestamp": "2026-05-05T04:34:31.346912"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66250, "epoch": 0, "train_loss": 3.7791095227003098, "train_ppl": 43.777041911852095, "lr": 0.00056, "grad_norm": 0.6671, "tokens_per_sec": 151439, "dt_s": 4.328, "eta_s": 11455, "world_size": 1, "timestamp": "2026-05-05T04:34:35.674474"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66260, "epoch": 0, "train_loss": 3.6236987113952637, "train_ppl": 37.475924422463926, "lr": 0.00056, "grad_norm": 0.7954, "tokens_per_sec": 153075, "dt_s": 4.281, "eta_s": 11428, "world_size": 1, "timestamp": "2026-05-05T04:34:39.955787"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66270, "epoch": 0, "train_loss": 3.642899125814438, "train_ppl": 38.20242998699806, "lr": 0.00056, "grad_norm": 0.7107, "tokens_per_sec": 152779, "dt_s": 4.29, "eta_s": 11379, "world_size": 1, "timestamp": "2026-05-05T04:34:44.245383"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66280, "epoch": 0, "train_loss": 3.664242208003998, "train_ppl": 39.02655094685599, "lr": 0.00056, "grad_norm": 0.7144, "tokens_per_sec": 150902, "dt_s": 4.343, "eta_s": 11373, "world_size": 1, "timestamp": "2026-05-05T04:34:48.588337"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66290, "epoch": 0, "train_loss": 3.759850263595581, "train_ppl": 42.94199551733061, "lr": 0.00056, "grad_norm": 0.6657, "tokens_per_sec": 151598, "dt_s": 4.323, "eta_s": 11382, "world_size": 1, "timestamp": "2026-05-05T04:34:52.911375"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66300, "epoch": 0, "train_loss": 3.7212896794080734, "train_ppl": 41.31764627971248, "lr": 0.00056, "grad_norm": 0.652, "tokens_per_sec": 153407, "dt_s": 4.272, "eta_s": 11349, "world_size": 1, "timestamp": "2026-05-05T04:34:57.183398"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66310, "epoch": 0, "train_loss": 3.7878655195236206, "train_ppl": 44.16203659818022, "lr": 0.00056, "grad_norm": 0.6806, "tokens_per_sec": 150462, "dt_s": 4.356, "eta_s": 11383, "world_size": 1, "timestamp": "2026-05-05T04:35:01.539032"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66320, "epoch": 0, "train_loss": 3.75335855782032, "train_ppl": 42.66413159825038, "lr": 0.00056, "grad_norm": 0.6498, "tokens_per_sec": 152633, "dt_s": 4.294, "eta_s": 11381, "world_size": 1, "timestamp": "2026-05-05T04:35:05.832747"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66330, "epoch": 0, "train_loss": 3.835128530859947, "train_ppl": 46.29937818442281, "lr": 0.00056, "grad_norm": 0.6165, "tokens_per_sec": 151469, "dt_s": 4.327, "eta_s": 11368, "world_size": 1, "timestamp": "2026-05-05T04:35:10.159449"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66340, "epoch": 0, "train_loss": 3.6787746846675873, "train_ppl": 39.59784448838989, "lr": 0.00056, "grad_norm": 0.6612, "tokens_per_sec": 149537, "dt_s": 4.383, "eta_s": 11396, "world_size": 1, "timestamp": "2026-05-05T04:35:14.542045"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66350, "epoch": 0, "train_loss": 3.7709593027830124, "train_ppl": 43.421699419669075, "lr": 0.00056, "grad_norm": 0.6719, "tokens_per_sec": 152081, "dt_s": 4.309, "eta_s": 11411, "world_size": 1, "timestamp": "2026-05-05T04:35:18.851349"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66360, "epoch": 0, "train_loss": 3.689437672495842, "train_ppl": 40.02233496859126, "lr": 0.00056, "grad_norm": 0.6864, "tokens_per_sec": 151179, "dt_s": 4.335, "eta_s": 11396, "world_size": 1, "timestamp": "2026-05-05T04:35:23.186337"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66370, "epoch": 0, "train_loss": 3.6336596608161926, "train_ppl": 37.85108558887179, "lr": 0.00056, "grad_norm": 0.6479, "tokens_per_sec": 149036, "dt_s": 4.397, "eta_s": 11446, "world_size": 1, "timestamp": "2026-05-05T04:35:27.583666"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66380, "epoch": 0, "train_loss": 3.7416077852249146, "train_ppl": 42.16572913184586, "lr": 0.00056, "grad_norm": 0.6373, "tokens_per_sec": 152522, "dt_s": 4.297, "eta_s": 11426, "world_size": 1, "timestamp": "2026-05-05T04:35:31.880498"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66390, "epoch": 0, "train_loss": 3.6225667595863342, "train_ppl": 37.43352748219621, "lr": 0.00056, "grad_norm": 0.6471, "tokens_per_sec": 134291, "dt_s": 4.88, "eta_s": 11683, "world_size": 1, "timestamp": "2026-05-05T04:35:36.760603"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66400, "epoch": 0, "train_loss": 3.658440440893173, "train_ppl": 38.800783545352616, "lr": 0.00056, "grad_norm": 0.6545, "tokens_per_sec": 154200, "dt_s": 4.25, "eta_s": 11647, "world_size": 1, "timestamp": "2026-05-05T04:35:41.010689"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66410, "epoch": 0, "train_loss": 3.626433029770851, "train_ppl": 37.57853575334708, "lr": 0.00056, "grad_norm": 0.6738, "tokens_per_sec": 153838, "dt_s": 4.26, "eta_s": 11604, "world_size": 1, "timestamp": "2026-05-05T04:35:45.270728"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66420, "epoch": 0, "train_loss": 3.642933636903763, "train_ppl": 38.203748417221874, "lr": 0.00056, "grad_norm": 0.6695, "tokens_per_sec": 149826, "dt_s": 4.374, "eta_s": 11587, "world_size": 1, "timestamp": "2026-05-05T04:35:49.644873"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66430, "epoch": 0, "train_loss": 3.661343678832054, "train_ppl": 38.913595132400765, "lr": 0.00056, "grad_norm": 0.6633, "tokens_per_sec": 151467, "dt_s": 4.327, "eta_s": 11598, "world_size": 1, "timestamp": "2026-05-05T04:35:53.971659"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66440, "epoch": 0, "train_loss": 3.673958867788315, "train_ppl": 39.40760696273051, "lr": 0.00056, "grad_norm": 0.638, "tokens_per_sec": 152744, "dt_s": 4.291, "eta_s": 11284, "world_size": 1, "timestamp": "2026-05-05T04:35:58.262219"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66450, "epoch": 0, "train_loss": 3.675745978951454, "train_ppl": 39.478095703893935, "lr": 0.00056, "grad_norm": 0.6383, "tokens_per_sec": 150895, "dt_s": 4.343, "eta_s": 11329, "world_size": 1, "timestamp": "2026-05-05T04:36:02.605383"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66460, "epoch": 0, "train_loss": 3.7514330595731735, "train_ppl": 42.582060926465964, "lr": 0.00056, "grad_norm": 0.7074, "tokens_per_sec": 152951, "dt_s": 4.285, "eta_s": 11338, "world_size": 1, "timestamp": "2026-05-05T04:36:06.890140"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66470, "epoch": 0, "train_loss": 3.7523653507232666, "train_ppl": 42.62177831622917, "lr": 0.00056, "grad_norm": 0.7098, "tokens_per_sec": 152203, "dt_s": 4.306, "eta_s": 11298, "world_size": 1, "timestamp": "2026-05-05T04:36:11.195970"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66480, "epoch": 0, "train_loss": 3.7051019072532654, "train_ppl": 40.65419006263028, "lr": 0.00056, "grad_norm": 0.7482, "tokens_per_sec": 151817, "dt_s": 4.317, "eta_s": 11288, "world_size": 1, "timestamp": "2026-05-05T04:36:15.512778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66490, "epoch": 0, "train_loss": 3.7633440643548965, "train_ppl": 43.09228868819159, "lr": 0.00056, "grad_norm": 0.7076, "tokens_per_sec": 152683, "dt_s": 4.292, "eta_s": 11285, "world_size": 1, "timestamp": "2026-05-05T04:36:19.805039"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66500, "epoch": 0, "train_loss": 3.772155836224556, "train_ppl": 43.47368603077031, "lr": 0.00056, "grad_norm": 0.7446, "tokens_per_sec": 150768, "dt_s": 4.347, "eta_s": 11282, "world_size": 1, "timestamp": "2026-05-05T04:36:24.151860"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66510, "epoch": 0, "train_loss": 3.699457973241806, "train_ppl": 40.42538677930165, "lr": 0.00056, "grad_norm": 0.6533, "tokens_per_sec": 128204, "dt_s": 5.112, "eta_s": 11307, "world_size": 1, "timestamp": "2026-05-05T04:36:29.263740"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66520, "epoch": 0, "train_loss": 3.7208173871040344, "train_ppl": 41.29813688078569, "lr": 0.00056, "grad_norm": 0.6779, "tokens_per_sec": 149687, "dt_s": 4.378, "eta_s": 11340, "world_size": 1, "timestamp": "2026-05-05T04:36:33.641944"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66530, "epoch": 0, "train_loss": 3.7100861221551895, "train_ppl": 40.85732509636728, "lr": 0.00056, "grad_norm": 0.6511, "tokens_per_sec": 148628, "dt_s": 4.409, "eta_s": 11384, "world_size": 1, "timestamp": "2026-05-05T04:36:38.051367"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66540, "epoch": 0, "train_loss": 3.6056482940912247, "train_ppl": 36.8055369366239, "lr": 0.00056, "grad_norm": 0.6834, "tokens_per_sec": 150049, "dt_s": 4.368, "eta_s": 11419, "world_size": 1, "timestamp": "2026-05-05T04:36:42.418992"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66550, "epoch": 0, "train_loss": 3.747700408101082, "train_ppl": 42.423413206585714, "lr": 0.00056, "grad_norm": 0.7163, "tokens_per_sec": 152685, "dt_s": 4.292, "eta_s": 11387, "world_size": 1, "timestamp": "2026-05-05T04:36:46.711235"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66560, "epoch": 0, "train_loss": 3.575830712914467, "train_ppl": 35.72428511769793, "lr": 0.00056, "grad_norm": 0.6272, "tokens_per_sec": 150409, "dt_s": 4.357, "eta_s": 11391, "world_size": 1, "timestamp": "2026-05-05T04:36:51.068419"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66570, "epoch": 0, "train_loss": 3.692166730761528, "train_ppl": 40.1317074266731, "lr": 0.00056, "grad_norm": 0.6171, "tokens_per_sec": 152468, "dt_s": 4.298, "eta_s": 11345, "world_size": 1, "timestamp": "2026-05-05T04:36:55.366785"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66580, "epoch": 0, "train_loss": 3.6913313418626785, "train_ppl": 40.098195843348236, "lr": 0.00056, "grad_norm": 0.6752, "tokens_per_sec": 151578, "dt_s": 4.324, "eta_s": 11296, "world_size": 1, "timestamp": "2026-05-05T04:36:59.690374"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66590, "epoch": 0, "train_loss": 3.733856812119484, "train_ppl": 41.84016704079369, "lr": 0.00056, "grad_norm": 0.6823, "tokens_per_sec": 150657, "dt_s": 4.35, "eta_s": 11283, "world_size": 1, "timestamp": "2026-05-05T04:37:04.040388"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66600, "epoch": 0, "train_loss": 3.7393427342176437, "train_ppl": 42.07032968769388, "lr": 0.00056, "grad_norm": 0.6815, "tokens_per_sec": 152649, "dt_s": 4.293, "eta_s": 11279, "world_size": 1, "timestamp": "2026-05-05T04:37:08.333616"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66610, "epoch": 0, "train_loss": 3.6928548216819763, "train_ppl": 40.15933119291694, "lr": 0.00056, "grad_norm": 0.8644, "tokens_per_sec": 150877, "dt_s": 4.344, "eta_s": 11267, "world_size": 1, "timestamp": "2026-05-05T04:37:12.677382"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66620, "epoch": 0, "train_loss": 3.7028573900461197, "train_ppl": 40.563043361926525, "lr": 0.00056, "grad_norm": 0.7193, "tokens_per_sec": 152652, "dt_s": 4.293, "eta_s": 11260, "world_size": 1, "timestamp": "2026-05-05T04:37:16.970456"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66630, "epoch": 0, "train_loss": 3.736829951405525, "train_ppl": 41.96474879283725, "lr": 0.00056, "grad_norm": 0.6815, "tokens_per_sec": 152464, "dt_s": 4.298, "eta_s": 11243, "world_size": 1, "timestamp": "2026-05-05T04:37:21.268916"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66640, "epoch": 0, "train_loss": 3.8481744676828384, "train_ppl": 46.90735413213569, "lr": 0.00056, "grad_norm": 0.7069, "tokens_per_sec": 150727, "dt_s": 4.348, "eta_s": 11237, "world_size": 1, "timestamp": "2026-05-05T04:37:25.616912"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66650, "epoch": 0, "train_loss": 3.703079715371132, "train_ppl": 40.57206255628596, "lr": 0.00056, "grad_norm": 0.6849, "tokens_per_sec": 153256, "dt_s": 4.276, "eta_s": 11224, "world_size": 1, "timestamp": "2026-05-05T04:37:29.893151"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66660, "epoch": 0, "train_loss": 3.633528858423233, "train_ppl": 37.84613490008868, "lr": 0.00056, "grad_norm": 0.6027, "tokens_per_sec": 152570, "dt_s": 4.295, "eta_s": 11195, "world_size": 1, "timestamp": "2026-05-05T04:37:34.188628"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66670, "epoch": 0, "train_loss": 3.8098460286855698, "train_ppl": 45.14348752906637, "lr": 0.00056, "grad_norm": 0.6509, "tokens_per_sec": 149686, "dt_s": 4.378, "eta_s": 11235, "world_size": 1, "timestamp": "2026-05-05T04:37:38.566860"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66680, "epoch": 0, "train_loss": 3.757702514529228, "train_ppl": 42.849865857661044, "lr": 0.00056, "grad_norm": 0.6393, "tokens_per_sec": 152756, "dt_s": 4.29, "eta_s": 11226, "world_size": 1, "timestamp": "2026-05-05T04:37:42.857113"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66690, "epoch": 0, "train_loss": 3.634052276611328, "train_ppl": 37.865949440637564, "lr": 0.00056, "grad_norm": 0.6289, "tokens_per_sec": 134630, "dt_s": 4.868, "eta_s": 11492, "world_size": 1, "timestamp": "2026-05-05T04:37:47.724982"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66700, "epoch": 0, "train_loss": 3.808289036154747, "train_ppl": 45.0732541467826, "lr": 0.00056, "grad_norm": 0.6769, "tokens_per_sec": 151822, "dt_s": 4.317, "eta_s": 11509, "world_size": 1, "timestamp": "2026-05-05T04:37:52.041614"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66710, "epoch": 0, "train_loss": 3.707397386431694, "train_ppl": 40.747618099467104, "lr": 0.00056, "grad_norm": 0.6373, "tokens_per_sec": 154012, "dt_s": 4.255, "eta_s": 11484, "world_size": 1, "timestamp": "2026-05-05T04:37:56.296848"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66720, "epoch": 0, "train_loss": 3.821344792842865, "train_ppl": 45.6655777891653, "lr": 0.00056, "grad_norm": 0.6183, "tokens_per_sec": 149288, "dt_s": 4.39, "eta_s": 11485, "world_size": 1, "timestamp": "2026-05-05T04:38:00.686765"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66730, "epoch": 0, "train_loss": 3.666913628578186, "train_ppl": 39.130946658335894, "lr": 0.00056, "grad_norm": 0.6649, "tokens_per_sec": 151383, "dt_s": 4.329, "eta_s": 11501, "world_size": 1, "timestamp": "2026-05-05T04:38:05.015906"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66740, "epoch": 0, "train_loss": 3.8131044656038284, "train_ppl": 45.29082464944472, "lr": 0.00056, "grad_norm": 0.8756, "tokens_per_sec": 152136, "dt_s": 4.308, "eta_s": 11206, "world_size": 1, "timestamp": "2026-05-05T04:38:09.323646"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66750, "epoch": 0, "train_loss": 3.7258072048425674, "train_ppl": 41.504722039271535, "lr": 0.00056, "grad_norm": 0.7782, "tokens_per_sec": 149099, "dt_s": 4.395, "eta_s": 11242, "world_size": 1, "timestamp": "2026-05-05T04:38:13.719102"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66760, "epoch": 0, "train_loss": 3.8527698069810867, "train_ppl": 47.12340537413873, "lr": 0.00056, "grad_norm": 0.7291, "tokens_per_sec": 152640, "dt_s": 4.294, "eta_s": 11258, "world_size": 1, "timestamp": "2026-05-05T04:38:18.012625"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66770, "epoch": 0, "train_loss": 3.7654701620340347, "train_ppl": 43.184004567070374, "lr": 0.00056, "grad_norm": 0.7105, "tokens_per_sec": 152220, "dt_s": 4.305, "eta_s": 11210, "world_size": 1, "timestamp": "2026-05-05T04:38:22.317942"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66780, "epoch": 0, "train_loss": 3.6631481051445007, "train_ppl": 38.983875235934526, "lr": 0.00056, "grad_norm": 0.7762, "tokens_per_sec": 148206, "dt_s": 4.422, "eta_s": 11254, "world_size": 1, "timestamp": "2026-05-05T04:38:26.739889"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66790, "epoch": 0, "train_loss": 3.55931493639946, "train_ppl": 35.139116368037584, "lr": 0.00056, "grad_norm": 0.6724, "tokens_per_sec": 149570, "dt_s": 4.382, "eta_s": 11287, "world_size": 1, "timestamp": "2026-05-05T04:38:31.121532"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66800, "epoch": 0, "train_loss": 3.7269173562526703, "train_ppl": 41.550824150396366, "lr": 0.00056, "grad_norm": 0.69, "tokens_per_sec": 150058, "dt_s": 4.367, "eta_s": 11269, "world_size": 1, "timestamp": "2026-05-05T04:38:35.488907"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66810, "epoch": 0, "train_loss": 3.8210274279117584, "train_ppl": 45.651087435702934, "lr": 0.00056, "grad_norm": 0.7408, "tokens_per_sec": 148229, "dt_s": 4.421, "eta_s": 11330, "world_size": 1, "timestamp": "2026-05-05T04:38:39.910168"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66820, "epoch": 0, "train_loss": 3.6713644713163376, "train_ppl": 39.30550051583243, "lr": 0.00056, "grad_norm": 0.7366, "tokens_per_sec": 150715, "dt_s": 4.348, "eta_s": 11348, "world_size": 1, "timestamp": "2026-05-05T04:38:44.258514"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66830, "epoch": 0, "train_loss": 3.6433218717575073, "train_ppl": 38.21858332342861, "lr": 0.00056, "grad_norm": 0.6979, "tokens_per_sec": 150275, "dt_s": 4.361, "eta_s": 11312, "world_size": 1, "timestamp": "2026-05-05T04:38:48.619596"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66840, "epoch": 0, "train_loss": 3.668356716632843, "train_ppl": 39.187456824792164, "lr": 0.00056, "grad_norm": 0.6787, "tokens_per_sec": 150261, "dt_s": 4.361, "eta_s": 11298, "world_size": 1, "timestamp": "2026-05-05T04:38:52.981055"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66850, "epoch": 0, "train_loss": 3.715353846549988, "train_ppl": 41.073118094502206, "lr": 0.00056, "grad_norm": 0.7208, "tokens_per_sec": 151481, "dt_s": 4.326, "eta_s": 11272, "world_size": 1, "timestamp": "2026-05-05T04:38:57.307418"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66860, "epoch": 0, "train_loss": 3.6591500341892242, "train_ppl": 38.8283260920855, "lr": 0.00056, "grad_norm": 0.6668, "tokens_per_sec": 148061, "dt_s": 4.426, "eta_s": 11270, "world_size": 1, "timestamp": "2026-05-05T04:39:01.733717"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66870, "epoch": 0, "train_loss": 3.7860438227653503, "train_ppl": 44.08165999239981, "lr": 0.00056, "grad_norm": 0.8072, "tokens_per_sec": 152471, "dt_s": 4.298, "eta_s": 11240, "world_size": 1, "timestamp": "2026-05-05T04:39:06.031967"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66880, "epoch": 0, "train_loss": 3.691095933318138, "train_ppl": 40.0887574964034, "lr": 0.00056, "grad_norm": 0.6582, "tokens_per_sec": 151981, "dt_s": 4.312, "eta_s": 11210, "world_size": 1, "timestamp": "2026-05-05T04:39:10.344096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66890, "epoch": 0, "train_loss": 3.805179253220558, "train_ppl": 44.93330383058641, "lr": 0.00056, "grad_norm": 0.7041, "tokens_per_sec": 149560, "dt_s": 4.382, "eta_s": 11217, "world_size": 1, "timestamp": "2026-05-05T04:39:14.726003"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66900, "epoch": 0, "train_loss": 3.756105974316597, "train_ppl": 42.78150890554469, "lr": 0.00056, "grad_norm": 0.6579, "tokens_per_sec": 150655, "dt_s": 4.35, "eta_s": 11224, "world_size": 1, "timestamp": "2026-05-05T04:39:19.076079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66910, "epoch": 0, "train_loss": 3.6119361519813538, "train_ppl": 37.03769404299566, "lr": 0.00056, "grad_norm": 0.6757, "tokens_per_sec": 151709, "dt_s": 4.32, "eta_s": 11165, "world_size": 1, "timestamp": "2026-05-05T04:39:23.395944"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66920, "epoch": 0, "train_loss": 3.7578846216201782, "train_ppl": 42.85766983263801, "lr": 0.00056, "grad_norm": 0.6657, "tokens_per_sec": 148996, "dt_s": 4.399, "eta_s": 11213, "world_size": 1, "timestamp": "2026-05-05T04:39:27.794455"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66930, "epoch": 0, "train_loss": 3.6160862743854523, "train_ppl": 37.19172440819577, "lr": 0.00056, "grad_norm": 0.6691, "tokens_per_sec": 152008, "dt_s": 4.311, "eta_s": 11208, "world_size": 1, "timestamp": "2026-05-05T04:39:32.105790"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66940, "epoch": 0, "train_loss": 3.7388260811567307, "train_ppl": 42.0485995370449, "lr": 0.00056, "grad_norm": 0.6563, "tokens_per_sec": 150782, "dt_s": 4.346, "eta_s": 11185, "world_size": 1, "timestamp": "2026-05-05T04:39:36.452210"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66950, "epoch": 0, "train_loss": 3.6551346331834793, "train_ppl": 38.67272739708767, "lr": 0.00056, "grad_norm": 0.6684, "tokens_per_sec": 150424, "dt_s": 4.357, "eta_s": 11184, "world_size": 1, "timestamp": "2026-05-05T04:39:40.808964"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66960, "epoch": 0, "train_loss": 3.75833398103714, "train_ppl": 42.87693265780815, "lr": 0.00056, "grad_norm": 0.7088, "tokens_per_sec": 152440, "dt_s": 4.299, "eta_s": 11169, "world_size": 1, "timestamp": "2026-05-05T04:39:45.108108"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66970, "epoch": 0, "train_loss": 3.708977058529854, "train_ppl": 40.81203684169447, "lr": 0.00056, "grad_norm": 0.6824, "tokens_per_sec": 150223, "dt_s": 4.363, "eta_s": 11146, "world_size": 1, "timestamp": "2026-05-05T04:39:49.470677"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66980, "epoch": 0, "train_loss": 3.675279840826988, "train_ppl": 39.45969774673279, "lr": 0.00056, "grad_norm": 0.6607, "tokens_per_sec": 136135, "dt_s": 4.814, "eta_s": 11401, "world_size": 1, "timestamp": "2026-05-05T04:39:54.284720"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 66990, "epoch": 0, "train_loss": 3.643366366624832, "train_ppl": 38.22028389205594, "lr": 0.00056, "grad_norm": 0.6529, "tokens_per_sec": 153385, "dt_s": 4.273, "eta_s": 11358, "world_size": 1, "timestamp": "2026-05-05T04:39:58.557376"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67000, "epoch": 0, "train_loss": 3.715238109230995, "train_ppl": 41.068364677010486, "lr": 0.00056, "grad_norm": 1.4376, "tokens_per_sec": 148388, "dt_s": 4.417, "eta_s": 11385, "world_size": 1, "timestamp": "2026-05-05T04:40:02.973896"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67010, "epoch": 0, "train_loss": 3.7097092866897583, "train_ppl": 40.84193150785553, "lr": 0.00056, "grad_norm": 0.6368, "tokens_per_sec": 127411, "dt_s": 5.144, "eta_s": 11407, "world_size": 1, "timestamp": "2026-05-05T04:40:08.117552"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67020, "epoch": 0, "train_loss": 3.6464150696992874, "train_ppl": 38.33698399064893, "lr": 0.00056, "grad_norm": 0.6714, "tokens_per_sec": 150659, "dt_s": 4.35, "eta_s": 11396, "world_size": 1, "timestamp": "2026-05-05T04:40:12.467484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67030, "epoch": 0, "train_loss": 3.71951462328434, "train_ppl": 41.24437019248909, "lr": 0.00056, "grad_norm": 0.6631, "tokens_per_sec": 150665, "dt_s": 4.35, "eta_s": 11153, "world_size": 1, "timestamp": "2026-05-05T04:40:16.817286"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67040, "epoch": 0, "train_loss": 3.8486383110284805, "train_ppl": 46.92911684306569, "lr": 0.00056, "grad_norm": 0.7607, "tokens_per_sec": 150124, "dt_s": 4.365, "eta_s": 11196, "world_size": 1, "timestamp": "2026-05-05T04:40:21.182711"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67050, "epoch": 0, "train_loss": 3.712661921977997, "train_ppl": 40.96270104252073, "lr": 0.00056, "grad_norm": 0.6532, "tokens_per_sec": 150070, "dt_s": 4.367, "eta_s": 11167, "world_size": 1, "timestamp": "2026-05-05T04:40:25.549743"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67060, "epoch": 0, "train_loss": 3.7567976266145706, "train_ppl": 42.81110906982079, "lr": 0.00056, "grad_norm": 0.6507, "tokens_per_sec": 151066, "dt_s": 4.338, "eta_s": 11156, "world_size": 1, "timestamp": "2026-05-05T04:40:29.887979"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67070, "epoch": 0, "train_loss": 3.7570880204439163, "train_ppl": 42.82354295699679, "lr": 0.00056, "grad_norm": 0.6435, "tokens_per_sec": 151557, "dt_s": 4.324, "eta_s": 11138, "world_size": 1, "timestamp": "2026-05-05T04:40:34.212151"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67080, "epoch": 0, "train_loss": 3.6636220514774323, "train_ppl": 39.002355879716795, "lr": 0.00056, "grad_norm": 0.7039, "tokens_per_sec": 150102, "dt_s": 4.366, "eta_s": 11142, "world_size": 1, "timestamp": "2026-05-05T04:40:38.578272"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67090, "epoch": 0, "train_loss": 3.7233934104442596, "train_ppl": 41.40465898810997, "lr": 0.00056, "grad_norm": 0.6725, "tokens_per_sec": 151784, "dt_s": 4.318, "eta_s": 11113, "world_size": 1, "timestamp": "2026-05-05T04:40:42.895968"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67100, "epoch": 0, "train_loss": 3.8068353086709976, "train_ppl": 45.0077775225626, "lr": 0.00056, "grad_norm": 0.7131, "tokens_per_sec": 152580, "dt_s": 4.295, "eta_s": 11072, "world_size": 1, "timestamp": "2026-05-05T04:40:47.191160"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67110, "epoch": 0, "train_loss": 3.763435497879982, "train_ppl": 43.09622894818354, "lr": 0.00056, "grad_norm": 0.6552, "tokens_per_sec": 149624, "dt_s": 4.38, "eta_s": 11089, "world_size": 1, "timestamp": "2026-05-05T04:40:51.571212"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67120, "epoch": 0, "train_loss": 3.7545111030340195, "train_ppl": 42.713332286483094, "lr": 0.00056, "grad_norm": 0.6759, "tokens_per_sec": 151202, "dt_s": 4.334, "eta_s": 11090, "world_size": 1, "timestamp": "2026-05-05T04:40:55.905546"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67130, "epoch": 0, "train_loss": 3.6797005981206894, "train_ppl": 39.63452564448287, "lr": 0.00056, "grad_norm": 0.6233, "tokens_per_sec": 150780, "dt_s": 4.346, "eta_s": 11076, "world_size": 1, "timestamp": "2026-05-05T04:41:00.252021"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67140, "epoch": 0, "train_loss": 3.913482666015625, "train_ppl": 50.07303632052556, "lr": 0.00056, "grad_norm": 1.0733, "tokens_per_sec": 149481, "dt_s": 4.384, "eta_s": 11105, "world_size": 1, "timestamp": "2026-05-05T04:41:04.636276"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67150, "epoch": 0, "train_loss": 3.698225185275078, "train_ppl": 40.37558155487815, "lr": 0.00056, "grad_norm": 0.6446, "tokens_per_sec": 152989, "dt_s": 4.284, "eta_s": 11095, "world_size": 1, "timestamp": "2026-05-05T04:41:08.919962"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67160, "epoch": 0, "train_loss": 3.738352581858635, "train_ppl": 42.028694267615656, "lr": 0.00056, "grad_norm": 0.7247, "tokens_per_sec": 151345, "dt_s": 4.33, "eta_s": 11065, "world_size": 1, "timestamp": "2026-05-05T04:41:13.250199"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67170, "epoch": 0, "train_loss": 3.6961817890405655, "train_ppl": 40.29316247965484, "lr": 0.00056, "grad_norm": 0.6763, "tokens_per_sec": 151624, "dt_s": 4.322, "eta_s": 11055, "world_size": 1, "timestamp": "2026-05-05T04:41:17.572459"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67180, "epoch": 0, "train_loss": 3.6954742670059204, "train_ppl": 40.2646642621022, "lr": 0.00056, "grad_norm": 0.6372, "tokens_per_sec": 152785, "dt_s": 4.289, "eta_s": 11022, "world_size": 1, "timestamp": "2026-05-05T04:41:21.861908"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67190, "epoch": 0, "train_loss": 3.9531741589307785, "train_ppl": 52.10047985214268, "lr": 0.00056, "grad_norm": 0.7957, "tokens_per_sec": 151201, "dt_s": 4.334, "eta_s": 10992, "world_size": 1, "timestamp": "2026-05-05T04:41:26.196295"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67200, "epoch": 0, "train_loss": 3.6810625195503235, "train_ppl": 39.68854152865739, "lr": 0.00056, "grad_norm": 0.6646, "tokens_per_sec": 151416, "dt_s": 4.328, "eta_s": 11010, "world_size": 1, "timestamp": "2026-05-05T04:41:30.524494"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67210, "epoch": 0, "train_loss": 3.7375741600990295, "train_ppl": 41.99599094760925, "lr": 0.00056, "grad_norm": 0.6433, "tokens_per_sec": 152088, "dt_s": 4.309, "eta_s": 10995, "world_size": 1, "timestamp": "2026-05-05T04:41:34.833563"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67220, "epoch": 0, "train_loss": 3.7296861708164215, "train_ppl": 41.666030095788976, "lr": 0.00056, "grad_norm": 0.6369, "tokens_per_sec": 151470, "dt_s": 4.327, "eta_s": 10993, "world_size": 1, "timestamp": "2026-05-05T04:41:39.160237"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67230, "epoch": 0, "train_loss": 3.7240567058324814, "train_ppl": 41.432131617694694, "lr": 0.00056, "grad_norm": 0.6436, "tokens_per_sec": 152654, "dt_s": 4.293, "eta_s": 10991, "world_size": 1, "timestamp": "2026-05-05T04:41:43.453345"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67240, "epoch": 0, "train_loss": 3.8018382638692856, "train_ppl": 44.7834326394056, "lr": 0.00056, "grad_norm": 0.6684, "tokens_per_sec": 152792, "dt_s": 4.289, "eta_s": 10963, "world_size": 1, "timestamp": "2026-05-05T04:41:47.742547"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67250, "epoch": 0, "train_loss": 3.7922570258378983, "train_ppl": 44.3564009244707, "lr": 0.00056, "grad_norm": 0.7041, "tokens_per_sec": 151261, "dt_s": 4.333, "eta_s": 10961, "world_size": 1, "timestamp": "2026-05-05T04:41:52.075209"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67260, "epoch": 0, "train_loss": 3.6958498805761337, "train_ppl": 40.27979105713602, "lr": 0.00056, "grad_norm": 0.7433, "tokens_per_sec": 151129, "dt_s": 4.336, "eta_s": 10971, "world_size": 1, "timestamp": "2026-05-05T04:41:56.411649"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67270, "epoch": 0, "train_loss": 3.6151570230722427, "train_ppl": 37.157180002152046, "lr": 0.00056, "grad_norm": 0.6511, "tokens_per_sec": 148934, "dt_s": 4.4, "eta_s": 11004, "world_size": 1, "timestamp": "2026-05-05T04:42:00.811972"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67280, "epoch": 0, "train_loss": 3.6831716299057007, "train_ppl": 39.77233737887026, "lr": 0.00056, "grad_norm": 0.7396, "tokens_per_sec": 132447, "dt_s": 4.948, "eta_s": 11332, "world_size": 1, "timestamp": "2026-05-05T04:42:05.760064"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67290, "epoch": 0, "train_loss": 3.7276180386543274, "train_ppl": 41.57994828384523, "lr": 0.00056, "grad_norm": 0.6854, "tokens_per_sec": 152576, "dt_s": 4.295, "eta_s": 11331, "world_size": 1, "timestamp": "2026-05-05T04:42:10.055379"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67300, "epoch": 0, "train_loss": 3.76755154132843, "train_ppl": 43.273980464527995, "lr": 0.00056, "grad_norm": 0.6545, "tokens_per_sec": 150202, "dt_s": 4.363, "eta_s": 11342, "world_size": 1, "timestamp": "2026-05-05T04:42:14.418561"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67310, "epoch": 0, "train_loss": 3.707954853773117, "train_ppl": 40.770339898540364, "lr": 0.00056, "grad_norm": 0.6668, "tokens_per_sec": 153162, "dt_s": 4.279, "eta_s": 11308, "world_size": 1, "timestamp": "2026-05-05T04:42:18.697417"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67320, "epoch": 0, "train_loss": 3.7610469460487366, "train_ppl": 42.99341420965695, "lr": 0.00056, "grad_norm": 0.7248, "tokens_per_sec": 153392, "dt_s": 4.272, "eta_s": 11239, "world_size": 1, "timestamp": "2026-05-05T04:42:22.969885"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67330, "epoch": 0, "train_loss": 3.5726303458213806, "train_ppl": 35.61013704647735, "lr": 0.00056, "grad_norm": 0.633, "tokens_per_sec": 149654, "dt_s": 4.379, "eta_s": 10946, "world_size": 1, "timestamp": "2026-05-05T04:42:27.349067"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67340, "epoch": 0, "train_loss": 3.68747316300869, "train_ppl": 39.94378789034906, "lr": 0.00056, "grad_norm": 0.7848, "tokens_per_sec": 151825, "dt_s": 4.317, "eta_s": 10953, "world_size": 1, "timestamp": "2026-05-05T04:42:31.665601"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67350, "epoch": 0, "train_loss": 3.617502883076668, "train_ppl": 37.2444478636693, "lr": 0.00056, "grad_norm": 0.6282, "tokens_per_sec": 152463, "dt_s": 4.298, "eta_s": 10916, "world_size": 1, "timestamp": "2026-05-05T04:42:35.964078"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67360, "epoch": 0, "train_loss": 3.710402563214302, "train_ppl": 40.870256077432046, "lr": 0.00056, "grad_norm": 0.6669, "tokens_per_sec": 150773, "dt_s": 4.347, "eta_s": 10946, "world_size": 1, "timestamp": "2026-05-05T04:42:40.310734"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67370, "epoch": 0, "train_loss": 3.655359596014023, "train_ppl": 38.681428301961155, "lr": 0.00056, "grad_norm": 0.6337, "tokens_per_sec": 153096, "dt_s": 4.281, "eta_s": 10945, "world_size": 1, "timestamp": "2026-05-05T04:42:44.591441"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67380, "epoch": 0, "train_loss": 3.725519746541977, "train_ppl": 41.492792877058136, "lr": 0.00056, "grad_norm": 0.7281, "tokens_per_sec": 151525, "dt_s": 4.325, "eta_s": 10914, "world_size": 1, "timestamp": "2026-05-05T04:42:48.916544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67390, "epoch": 0, "train_loss": 3.7335463911294937, "train_ppl": 41.827180990395114, "lr": 0.00056, "grad_norm": 0.6981, "tokens_per_sec": 150116, "dt_s": 4.366, "eta_s": 10934, "world_size": 1, "timestamp": "2026-05-05T04:42:53.282255"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67400, "epoch": 0, "train_loss": 3.684497654438019, "train_ppl": 39.82511145606989, "lr": 0.00056, "grad_norm": 0.6446, "tokens_per_sec": 152695, "dt_s": 4.292, "eta_s": 10927, "world_size": 1, "timestamp": "2026-05-05T04:42:57.574221"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67410, "epoch": 0, "train_loss": 3.766572579741478, "train_ppl": 43.231637629324936, "lr": 0.00056, "grad_norm": 0.7231, "tokens_per_sec": 148078, "dt_s": 4.426, "eta_s": 10962, "world_size": 1, "timestamp": "2026-05-05T04:43:01.999972"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67420, "epoch": 0, "train_loss": 3.6326597332954407, "train_ppl": 37.81325616319505, "lr": 0.00056, "grad_norm": 0.6576, "tokens_per_sec": 149534, "dt_s": 4.383, "eta_s": 11010, "world_size": 1, "timestamp": "2026-05-05T04:43:06.382661"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67430, "epoch": 0, "train_loss": 3.7349960058927536, "train_ppl": 41.887858258170404, "lr": 0.00056, "grad_norm": 0.6636, "tokens_per_sec": 149748, "dt_s": 4.376, "eta_s": 11031, "world_size": 1, "timestamp": "2026-05-05T04:43:10.759089"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67440, "epoch": 0, "train_loss": 3.5101358890533447, "train_ppl": 33.45281334622918, "lr": 0.00056, "grad_norm": 0.6351, "tokens_per_sec": 148608, "dt_s": 4.41, "eta_s": 11049, "world_size": 1, "timestamp": "2026-05-05T04:43:15.169089"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67450, "epoch": 0, "train_loss": 3.789632946252823, "train_ppl": 44.24015877935894, "lr": 0.00056, "grad_norm": 0.6726, "tokens_per_sec": 153521, "dt_s": 4.269, "eta_s": 11033, "world_size": 1, "timestamp": "2026-05-05T04:43:19.437927"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67460, "epoch": 0, "train_loss": 3.683987721800804, "train_ppl": 39.80480850896536, "lr": 0.00056, "grad_norm": 0.7037, "tokens_per_sec": 152008, "dt_s": 4.311, "eta_s": 10971, "world_size": 1, "timestamp": "2026-05-05T04:43:23.749290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67470, "epoch": 0, "train_loss": 3.7743055671453476, "train_ppl": 43.56724328330075, "lr": 0.00056, "grad_norm": 0.6538, "tokens_per_sec": 148943, "dt_s": 4.4, "eta_s": 10975, "world_size": 1, "timestamp": "2026-05-05T04:43:28.149395"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67480, "epoch": 0, "train_loss": 3.7136696875095367, "train_ppl": 41.004002648382084, "lr": 0.00056, "grad_norm": 0.6217, "tokens_per_sec": 152457, "dt_s": 4.299, "eta_s": 10932, "world_size": 1, "timestamp": "2026-05-05T04:43:32.448013"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67490, "epoch": 0, "train_loss": 3.6976892948150635, "train_ppl": 40.35395046237109, "lr": 0.00056, "grad_norm": 0.6746, "tokens_per_sec": 150064, "dt_s": 4.367, "eta_s": 10906, "world_size": 1, "timestamp": "2026-05-05T04:43:36.815223"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67500, "epoch": 0, "train_loss": 3.6699071526527405, "train_ppl": 39.248261594153334, "lr": 0.00056, "grad_norm": 0.6432, "tokens_per_sec": 146966, "dt_s": 4.459, "eta_s": 10997, "world_size": 1, "timestamp": "2026-05-05T04:43:41.274483"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67510, "epoch": 0, "train_loss": 3.6875163316726685, "train_ppl": 39.94551224752536, "lr": 0.00056, "grad_norm": 0.6914, "tokens_per_sec": 128526, "dt_s": 5.099, "eta_s": 11004, "world_size": 1, "timestamp": "2026-05-05T04:43:46.373547"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67520, "epoch": 0, "train_loss": 3.7387708872556686, "train_ppl": 42.04627877484881, "lr": 0.00056, "grad_norm": 0.6726, "tokens_per_sec": 148257, "dt_s": 4.42, "eta_s": 11010, "world_size": 1, "timestamp": "2026-05-05T04:43:50.793979"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67530, "epoch": 0, "train_loss": 3.666289433836937, "train_ppl": 39.106528948707435, "lr": 0.00056, "grad_norm": 0.6842, "tokens_per_sec": 147392, "dt_s": 4.446, "eta_s": 11080, "world_size": 1, "timestamp": "2026-05-05T04:43:55.240349"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67540, "epoch": 0, "train_loss": 3.6637888848781586, "train_ppl": 39.0088633181985, "lr": 0.00056, "grad_norm": 0.6437, "tokens_per_sec": 143177, "dt_s": 4.577, "eta_s": 11181, "world_size": 1, "timestamp": "2026-05-05T04:43:59.817649"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67550, "epoch": 0, "train_loss": 3.7079039067029953, "train_ppl": 40.76826282208559, "lr": 0.00056, "grad_norm": 0.7363, "tokens_per_sec": 137542, "dt_s": 4.765, "eta_s": 11330, "world_size": 1, "timestamp": "2026-05-05T04:44:04.582428"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67560, "epoch": 0, "train_loss": 3.7252887040376663, "train_ppl": 41.48320738565159, "lr": 0.00056, "grad_norm": 0.7168, "tokens_per_sec": 141102, "dt_s": 4.645, "eta_s": 11482, "world_size": 1, "timestamp": "2026-05-05T04:44:09.227014"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67570, "epoch": 0, "train_loss": 4.015237718820572, "train_ppl": 55.43647212902505, "lr": 0.00056, "grad_norm": 1.1636, "tokens_per_sec": 126697, "dt_s": 5.173, "eta_s": 11855, "world_size": 1, "timestamp": "2026-05-05T04:44:14.399689"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67580, "epoch": 0, "train_loss": 3.6914676874876022, "train_ppl": 40.10366342965107, "lr": 0.00056, "grad_norm": 0.7557, "tokens_per_sec": 142160, "dt_s": 4.61, "eta_s": 11933, "world_size": 1, "timestamp": "2026-05-05T04:44:19.009683"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67590, "epoch": 0, "train_loss": 3.709890589118004, "train_ppl": 40.8493369205015, "lr": 0.00056, "grad_norm": 0.7062, "tokens_per_sec": 147326, "dt_s": 4.448, "eta_s": 11863, "world_size": 1, "timestamp": "2026-05-05T04:44:23.458040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67600, "epoch": 0, "train_loss": 3.7431854903697968, "train_ppl": 42.23230672573093, "lr": 0.00056, "grad_norm": 0.6566, "tokens_per_sec": 146558, "dt_s": 4.472, "eta_s": 11712, "world_size": 1, "timestamp": "2026-05-05T04:44:27.929706"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67610, "epoch": 0, "train_loss": 3.702949121594429, "train_ppl": 40.56676444336592, "lr": 0.00056, "grad_norm": 0.6798, "tokens_per_sec": 147646, "dt_s": 4.439, "eta_s": 11604, "world_size": 1, "timestamp": "2026-05-05T04:44:32.368439"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67620, "epoch": 0, "train_loss": 3.7556083649396896, "train_ppl": 42.760225721350224, "lr": 0.00056, "grad_norm": 0.7046, "tokens_per_sec": 147667, "dt_s": 4.438, "eta_s": 11231, "world_size": 1, "timestamp": "2026-05-05T04:44:36.806552"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67630, "epoch": 0, "train_loss": 3.944814011454582, "train_ppl": 51.66672779888084, "lr": 0.00056, "grad_norm": 0.6714, "tokens_per_sec": 145456, "dt_s": 4.506, "eta_s": 11174, "world_size": 1, "timestamp": "2026-05-05T04:44:41.312106"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67640, "epoch": 0, "train_loss": 3.727942183613777, "train_ppl": 41.5934283991333, "lr": 0.00056, "grad_norm": 0.6687, "tokens_per_sec": 148519, "dt_s": 4.413, "eta_s": 11152, "world_size": 1, "timestamp": "2026-05-05T04:44:45.724737"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67650, "epoch": 0, "train_loss": 3.735357865691185, "train_ppl": 41.903018532897974, "lr": 0.00056, "grad_norm": 0.6791, "tokens_per_sec": 147861, "dt_s": 4.432, "eta_s": 11128, "world_size": 1, "timestamp": "2026-05-05T04:44:50.156980"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67660, "epoch": 0, "train_loss": 3.7986875772476196, "train_ppl": 44.642556122732586, "lr": 0.00056, "grad_norm": 0.7172, "tokens_per_sec": 149173, "dt_s": 4.393, "eta_s": 11100, "world_size": 1, "timestamp": "2026-05-05T04:44:54.550293"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67670, "epoch": 0, "train_loss": 3.6858903765678406, "train_ppl": 39.88061541193956, "lr": 0.00056, "grad_norm": 0.791, "tokens_per_sec": 149377, "dt_s": 4.387, "eta_s": 11071, "world_size": 1, "timestamp": "2026-05-05T04:44:58.937570"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67680, "epoch": 0, "train_loss": 3.6141896843910217, "train_ppl": 37.12125380385299, "lr": 0.00056, "grad_norm": 0.6688, "tokens_per_sec": 146736, "dt_s": 4.466, "eta_s": 11046, "world_size": 1, "timestamp": "2026-05-05T04:45:03.403816"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67690, "epoch": 0, "train_loss": 3.7712672501802444, "train_ppl": 43.435073078074716, "lr": 0.00056, "grad_norm": 0.7202, "tokens_per_sec": 148904, "dt_s": 4.401, "eta_s": 11036, "world_size": 1, "timestamp": "2026-05-05T04:45:07.805036"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67700, "epoch": 0, "train_loss": 3.6992499232292175, "train_ppl": 40.41697715191519, "lr": 0.00056, "grad_norm": 0.7096, "tokens_per_sec": 149221, "dt_s": 4.392, "eta_s": 11012, "world_size": 1, "timestamp": "2026-05-05T04:45:12.196907"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67710, "epoch": 0, "train_loss": 3.807724416255951, "train_ppl": 45.04781207381407, "lr": 0.00056, "grad_norm": 0.832, "tokens_per_sec": 145614, "dt_s": 4.501, "eta_s": 11061, "world_size": 1, "timestamp": "2026-05-05T04:45:16.697574"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67720, "epoch": 0, "train_loss": 3.6413795202970505, "train_ppl": 38.14442144982157, "lr": 0.00056, "grad_norm": 0.666, "tokens_per_sec": 147588, "dt_s": 4.44, "eta_s": 11083, "world_size": 1, "timestamp": "2026-05-05T04:45:21.138024"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67730, "epoch": 0, "train_loss": 3.621107041835785, "train_ppl": 37.37892495949606, "lr": 0.00056, "grad_norm": 0.732, "tokens_per_sec": 149371, "dt_s": 4.387, "eta_s": 11039, "world_size": 1, "timestamp": "2026-05-05T04:45:25.525490"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67740, "epoch": 0, "train_loss": 3.6553623527288437, "train_ppl": 38.68153493577483, "lr": 0.00056, "grad_norm": 0.719, "tokens_per_sec": 146341, "dt_s": 4.478, "eta_s": 11073, "world_size": 1, "timestamp": "2026-05-05T04:45:30.003792"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67750, "epoch": 0, "train_loss": 3.695604130625725, "train_ppl": 40.2698935166911, "lr": 0.00056, "grad_norm": 0.657, "tokens_per_sec": 150089, "dt_s": 4.366, "eta_s": 11056, "world_size": 1, "timestamp": "2026-05-05T04:45:34.370283"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67760, "epoch": 0, "train_loss": 3.7056580632925034, "train_ppl": 40.6768064244841, "lr": 0.00056, "grad_norm": 0.6616, "tokens_per_sec": 148938, "dt_s": 4.4, "eta_s": 11002, "world_size": 1, "timestamp": "2026-05-05T04:45:38.770483"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67770, "epoch": 0, "train_loss": 3.752403885126114, "train_ppl": 42.62342075264982, "lr": 0.00056, "grad_norm": 0.6663, "tokens_per_sec": 146784, "dt_s": 4.465, "eta_s": 11010, "world_size": 1, "timestamp": "2026-05-05T04:45:43.235332"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67780, "epoch": 0, "train_loss": 3.721474528312683, "train_ppl": 41.32528450730558, "lr": 0.00056, "grad_norm": 0.6958, "tokens_per_sec": 150526, "dt_s": 4.354, "eta_s": 10988, "world_size": 1, "timestamp": "2026-05-05T04:45:47.589095"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67790, "epoch": 0, "train_loss": 3.688292533159256, "train_ppl": 39.97653004998926, "lr": 0.00056, "grad_norm": 0.6487, "tokens_per_sec": 146428, "dt_s": 4.476, "eta_s": 10983, "world_size": 1, "timestamp": "2026-05-05T04:45:52.064757"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67800, "epoch": 0, "train_loss": 3.6288789957761765, "train_ppl": 37.67056407752404, "lr": 0.00056, "grad_norm": 0.6903, "tokens_per_sec": 150416, "dt_s": 4.357, "eta_s": 10973, "world_size": 1, "timestamp": "2026-05-05T04:45:56.421732"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67810, "epoch": 0, "train_loss": 3.7563627511262894, "train_ppl": 42.79249561542107, "lr": 0.00056, "grad_norm": 0.6838, "tokens_per_sec": 146928, "dt_s": 4.46, "eta_s": 10999, "world_size": 1, "timestamp": "2026-05-05T04:46:00.882142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67820, "epoch": 0, "train_loss": 3.7637630701065063, "train_ppl": 43.110348388296856, "lr": 0.00056, "grad_norm": 0.6615, "tokens_per_sec": 147298, "dt_s": 4.449, "eta_s": 10987, "world_size": 1, "timestamp": "2026-05-05T04:46:05.331394"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67830, "epoch": 0, "train_loss": 3.837882176041603, "train_ppl": 46.42704593925227, "lr": 0.00056, "grad_norm": 0.721, "tokens_per_sec": 146838, "dt_s": 4.463, "eta_s": 11037, "world_size": 1, "timestamp": "2026-05-05T04:46:09.794508"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67840, "epoch": 0, "train_loss": 3.7030743807554245, "train_ppl": 40.571846120501064, "lr": 0.00056, "grad_norm": 0.6519, "tokens_per_sec": 148875, "dt_s": 4.402, "eta_s": 10996, "world_size": 1, "timestamp": "2026-05-05T04:46:14.196577"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67850, "epoch": 0, "train_loss": 3.6049817502498627, "train_ppl": 36.78101260682924, "lr": 0.00056, "grad_norm": 0.7867, "tokens_per_sec": 145312, "dt_s": 4.51, "eta_s": 11067, "world_size": 1, "timestamp": "2026-05-05T04:46:18.706605"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67860, "epoch": 0, "train_loss": 3.7606662660837173, "train_ppl": 42.97705059308766, "lr": 0.00056, "grad_norm": 0.7449, "tokens_per_sec": 149564, "dt_s": 4.382, "eta_s": 11024, "world_size": 1, "timestamp": "2026-05-05T04:46:23.088412"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67870, "epoch": 0, "train_loss": 3.7718153595924377, "train_ppl": 43.458886776107946, "lr": 0.00056, "grad_norm": 0.6564, "tokens_per_sec": 129533, "dt_s": 5.059, "eta_s": 11322, "world_size": 1, "timestamp": "2026-05-05T04:46:28.147799"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67880, "epoch": 0, "train_loss": 3.7334363758563995, "train_ppl": 41.822579614771136, "lr": 0.00056, "grad_norm": 0.6706, "tokens_per_sec": 149837, "dt_s": 4.374, "eta_s": 11273, "world_size": 1, "timestamp": "2026-05-05T04:46:32.521621"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67890, "epoch": 0, "train_loss": 3.7352966368198395, "train_ppl": 41.90045293691231, "lr": 0.00056, "grad_norm": 0.6543, "tokens_per_sec": 148156, "dt_s": 4.423, "eta_s": 11279, "world_size": 1, "timestamp": "2026-05-05T04:46:36.945057"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67900, "epoch": 0, "train_loss": 3.660103663802147, "train_ppl": 38.865371594703824, "lr": 0.00056, "grad_norm": 0.6304, "tokens_per_sec": 147085, "dt_s": 4.456, "eta_s": 11248, "world_size": 1, "timestamp": "2026-05-05T04:46:41.400710"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67910, "epoch": 0, "train_loss": 3.7460222393274307, "train_ppl": 42.35227926334611, "lr": 0.00056, "grad_norm": 0.7326, "tokens_per_sec": 146123, "dt_s": 4.485, "eta_s": 11294, "world_size": 1, "timestamp": "2026-05-05T04:46:45.885685"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67920, "epoch": 0, "train_loss": 3.6851606369018555, "train_ppl": 39.85152356099936, "lr": 0.00056, "grad_norm": 0.6941, "tokens_per_sec": 147375, "dt_s": 4.447, "eta_s": 10987, "world_size": 1, "timestamp": "2026-05-05T04:46:50.332591"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67930, "epoch": 0, "train_loss": 3.713871493935585, "train_ppl": 41.0122783546275, "lr": 0.00056, "grad_norm": 0.715, "tokens_per_sec": 146827, "dt_s": 4.463, "eta_s": 11026, "world_size": 1, "timestamp": "2026-05-05T04:46:54.796068"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67940, "epoch": 0, "train_loss": 3.773792788386345, "train_ppl": 43.54490865320811, "lr": 0.00056, "grad_norm": 0.7468, "tokens_per_sec": 148947, "dt_s": 4.4, "eta_s": 11010, "world_size": 1, "timestamp": "2026-05-05T04:46:59.195999"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67950, "epoch": 0, "train_loss": 3.7069236636161804, "train_ppl": 40.728319594527505, "lr": 0.00056, "grad_norm": 0.7791, "tokens_per_sec": 145961, "dt_s": 4.49, "eta_s": 11023, "world_size": 1, "timestamp": "2026-05-05T04:47:03.685979"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67960, "epoch": 0, "train_loss": 3.612816944718361, "train_ppl": 37.07033094596781, "lr": 0.00056, "grad_norm": 0.6319, "tokens_per_sec": 147383, "dt_s": 4.447, "eta_s": 11000, "world_size": 1, "timestamp": "2026-05-05T04:47:08.132618"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67970, "epoch": 0, "train_loss": 3.6397265046834946, "train_ppl": 38.081420210956416, "lr": 0.00056, "grad_norm": 0.6301, "tokens_per_sec": 149474, "dt_s": 4.384, "eta_s": 10964, "world_size": 1, "timestamp": "2026-05-05T04:47:12.517062"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67980, "epoch": 0, "train_loss": 3.7091856002807617, "train_ppl": 40.82054874282809, "lr": 0.00056, "grad_norm": 0.6949, "tokens_per_sec": 140554, "dt_s": 4.663, "eta_s": 11058, "world_size": 1, "timestamp": "2026-05-05T04:47:17.179768"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 67990, "epoch": 0, "train_loss": 3.7885172367095947, "train_ppl": 44.19082713702432, "lr": 0.00056, "grad_norm": 0.7156, "tokens_per_sec": 141450, "dt_s": 4.633, "eta_s": 11169, "world_size": 1, "timestamp": "2026-05-05T04:47:21.812920"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68000, "epoch": 0, "train_loss": 3.7501856088638306, "train_ppl": 42.528975022266444, "lr": 0.00056, "grad_norm": 0.6432, "tokens_per_sec": 144019, "dt_s": 4.551, "eta_s": 11194, "world_size": 1, "timestamp": "2026-05-05T04:47:26.363444"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68010, "epoch": 0, "train_loss": 3.7004831433296204, "train_ppl": 40.46685092688654, "lr": 0.00056, "grad_norm": 0.6597, "tokens_per_sec": 120558, "dt_s": 5.436, "eta_s": 11285, "world_size": 1, "timestamp": "2026-05-05T04:47:31.799536"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68020, "epoch": 0, "train_loss": 3.7285371720790863, "train_ppl": 41.618183372995105, "lr": 0.00056, "grad_norm": 0.7074, "tokens_per_sec": 144700, "dt_s": 4.529, "eta_s": 11352, "world_size": 1, "timestamp": "2026-05-05T04:47:36.328599"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68030, "epoch": 0, "train_loss": 3.7189644426107407, "train_ppl": 41.22168457827991, "lr": 0.00056, "grad_norm": 0.6749, "tokens_per_sec": 141367, "dt_s": 4.636, "eta_s": 11334, "world_size": 1, "timestamp": "2026-05-05T04:47:40.964519"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68040, "epoch": 0, "train_loss": 3.66622731089592, "train_ppl": 39.104099611575734, "lr": 0.00056, "grad_norm": 0.6857, "tokens_per_sec": 145213, "dt_s": 4.513, "eta_s": 11271, "world_size": 1, "timestamp": "2026-05-05T04:47:45.477588"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68050, "epoch": 0, "train_loss": 3.731426551938057, "train_ppl": 41.738608006274376, "lr": 0.00056, "grad_norm": 0.6681, "tokens_per_sec": 144323, "dt_s": 4.541, "eta_s": 11261, "world_size": 1, "timestamp": "2026-05-05T04:47:50.018509"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68060, "epoch": 0, "train_loss": 3.7132939249277115, "train_ppl": 40.988597772950904, "lr": 0.00056, "grad_norm": 0.6229, "tokens_per_sec": 144055, "dt_s": 4.549, "eta_s": 11212, "world_size": 1, "timestamp": "2026-05-05T04:47:54.567877"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68070, "epoch": 0, "train_loss": 3.6783254742622375, "train_ppl": 39.580060719242404, "lr": 0.00056, "grad_norm": 0.6951, "tokens_per_sec": 144192, "dt_s": 4.545, "eta_s": 11215, "world_size": 1, "timestamp": "2026-05-05T04:47:59.112923"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68080, "epoch": 0, "train_loss": 3.6970740407705307, "train_ppl": 40.32913016731265, "lr": 0.00056, "grad_norm": 0.695, "tokens_per_sec": 142549, "dt_s": 4.597, "eta_s": 11192, "world_size": 1, "timestamp": "2026-05-05T04:48:03.710380"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68090, "epoch": 0, "train_loss": 3.684336766600609, "train_ppl": 39.81870459541991, "lr": 0.00056, "grad_norm": 0.7622, "tokens_per_sec": 143264, "dt_s": 4.574, "eta_s": 11217, "world_size": 1, "timestamp": "2026-05-05T04:48:08.284877"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68100, "epoch": 0, "train_loss": 3.7515937834978104, "train_ppl": 42.588905432440384, "lr": 0.00056, "grad_norm": 0.6567, "tokens_per_sec": 145522, "dt_s": 4.504, "eta_s": 11194, "world_size": 1, "timestamp": "2026-05-05T04:48:12.788389"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68110, "epoch": 0, "train_loss": 3.6371482759714127, "train_ppl": 37.983364059880806, "lr": 0.00056, "grad_norm": 0.69, "tokens_per_sec": 143485, "dt_s": 4.567, "eta_s": 11199, "world_size": 1, "timestamp": "2026-05-05T04:48:17.355815"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68120, "epoch": 0, "train_loss": 3.7382857501506805, "train_ppl": 42.02588551205267, "lr": 0.00056, "grad_norm": 0.6608, "tokens_per_sec": 146157, "dt_s": 4.484, "eta_s": 11164, "world_size": 1, "timestamp": "2026-05-05T04:48:21.839752"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68130, "epoch": 0, "train_loss": 3.666015699505806, "train_ppl": 39.0958256141647, "lr": 0.00056, "grad_norm": 0.6951, "tokens_per_sec": 145268, "dt_s": 4.511, "eta_s": 11117, "world_size": 1, "timestamp": "2026-05-05T04:48:26.351118"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68140, "epoch": 0, "train_loss": 3.7326095551252365, "train_ppl": 41.78801413061354, "lr": 0.00056, "grad_norm": 0.6826, "tokens_per_sec": 143766, "dt_s": 4.559, "eta_s": 11105, "world_size": 1, "timestamp": "2026-05-05T04:48:30.909648"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68150, "epoch": 0, "train_loss": 3.7085749804973602, "train_ppl": 40.795630516751565, "lr": 0.00056, "grad_norm": 0.6354, "tokens_per_sec": 143321, "dt_s": 4.573, "eta_s": 11134, "world_size": 1, "timestamp": "2026-05-05T04:48:35.482348"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68160, "epoch": 0, "train_loss": 3.7263296097517014, "train_ppl": 41.52640997426571, "lr": 0.00056, "grad_norm": 0.6895, "tokens_per_sec": 145009, "dt_s": 4.519, "eta_s": 11106, "world_size": 1, "timestamp": "2026-05-05T04:48:40.001774"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68170, "epoch": 0, "train_loss": 3.783325657248497, "train_ppl": 43.96200144382707, "lr": 0.00056, "grad_norm": 0.6754, "tokens_per_sec": 129369, "dt_s": 5.066, "eta_s": 11387, "world_size": 1, "timestamp": "2026-05-05T04:48:45.067628"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68180, "epoch": 0, "train_loss": 3.7266235649585724, "train_ppl": 41.53861867301768, "lr": 0.00056, "grad_norm": 0.7223, "tokens_per_sec": 144861, "dt_s": 4.524, "eta_s": 11389, "world_size": 1, "timestamp": "2026-05-05T04:48:49.591638"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68190, "epoch": 0, "train_loss": 3.733276978135109, "train_ppl": 41.81591372216021, "lr": 0.00056, "grad_norm": 0.6697, "tokens_per_sec": 143243, "dt_s": 4.575, "eta_s": 11392, "world_size": 1, "timestamp": "2026-05-05T04:48:54.166803"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68200, "epoch": 0, "train_loss": 3.663027212023735, "train_ppl": 38.97916263846376, "lr": 0.00056, "grad_norm": 0.6248, "tokens_per_sec": 144810, "dt_s": 4.526, "eta_s": 11364, "world_size": 1, "timestamp": "2026-05-05T04:48:58.692446"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68210, "epoch": 0, "train_loss": 3.681662529706955, "train_ppl": 39.712362202284886, "lr": 0.00056, "grad_norm": 0.8742, "tokens_per_sec": 145300, "dt_s": 4.51, "eta_s": 11355, "world_size": 1, "timestamp": "2026-05-05T04:49:03.202841"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68220, "epoch": 0, "train_loss": 3.8203766494989395, "train_ppl": 45.62138835828546, "lr": 0.00056, "grad_norm": 0.6857, "tokens_per_sec": 142044, "dt_s": 4.614, "eta_s": 11129, "world_size": 1, "timestamp": "2026-05-05T04:49:07.816623"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68230, "epoch": 0, "train_loss": 3.610100418329239, "train_ppl": 36.969765070470864, "lr": 0.00056, "grad_norm": 0.6807, "tokens_per_sec": 143980, "dt_s": 4.552, "eta_s": 11139, "world_size": 1, "timestamp": "2026-05-05T04:49:12.368370"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68240, "epoch": 0, "train_loss": 3.851976066827774, "train_ppl": 47.08601647563045, "lr": 0.00056, "grad_norm": 0.6581, "tokens_per_sec": 145750, "dt_s": 4.496, "eta_s": 11095, "world_size": 1, "timestamp": "2026-05-05T04:49:16.864826"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68250, "epoch": 0, "train_loss": 3.7121605426073074, "train_ppl": 40.942168337017904, "lr": 0.00056, "grad_norm": 0.663, "tokens_per_sec": 144427, "dt_s": 4.538, "eta_s": 11097, "world_size": 1, "timestamp": "2026-05-05T04:49:21.402476"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68260, "epoch": 0, "train_loss": 3.72521910071373, "train_ppl": 41.480320117012916, "lr": 0.00056, "grad_norm": 0.6725, "tokens_per_sec": 145319, "dt_s": 4.51, "eta_s": 11092, "world_size": 1, "timestamp": "2026-05-05T04:49:25.912307"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68270, "epoch": 0, "train_loss": 3.550591692328453, "train_ppl": 34.8339223557575, "lr": 0.00056, "grad_norm": 0.765, "tokens_per_sec": 142417, "dt_s": 4.602, "eta_s": 11082, "world_size": 1, "timestamp": "2026-05-05T04:49:30.513960"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68280, "epoch": 0, "train_loss": 3.6569083780050278, "train_ppl": 38.741383818529, "lr": 0.00056, "grad_norm": 0.646, "tokens_per_sec": 145198, "dt_s": 4.514, "eta_s": 11058, "world_size": 1, "timestamp": "2026-05-05T04:49:35.027530"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68290, "epoch": 0, "train_loss": 3.695344939827919, "train_ppl": 40.2594572834092, "lr": 0.00056, "grad_norm": 0.6537, "tokens_per_sec": 142989, "dt_s": 4.583, "eta_s": 11096, "world_size": 1, "timestamp": "2026-05-05T04:49:39.610832"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68300, "epoch": 0, "train_loss": 3.6044664829969406, "train_ppl": 36.76206543735082, "lr": 0.00056, "grad_norm": 0.6615, "tokens_per_sec": 141884, "dt_s": 4.619, "eta_s": 11131, "world_size": 1, "timestamp": "2026-05-05T04:49:44.229822"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68310, "epoch": 0, "train_loss": 3.627128317952156, "train_ppl": 37.60467275044678, "lr": 0.00056, "grad_norm": 0.6572, "tokens_per_sec": 143416, "dt_s": 4.57, "eta_s": 11156, "world_size": 1, "timestamp": "2026-05-05T04:49:48.799475"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68320, "epoch": 0, "train_loss": 3.8292104601860046, "train_ppl": 46.026184379641165, "lr": 0.00056, "grad_norm": 0.7257, "tokens_per_sec": 145480, "dt_s": 4.505, "eta_s": 11104, "world_size": 1, "timestamp": "2026-05-05T04:49:53.304294"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68330, "epoch": 0, "train_loss": 3.8186143934726715, "train_ppl": 45.5410625897971, "lr": 0.00056, "grad_norm": 0.6664, "tokens_per_sec": 147205, "dt_s": 4.452, "eta_s": 11070, "world_size": 1, "timestamp": "2026-05-05T04:49:57.756313"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68340, "epoch": 0, "train_loss": 3.6438151448965073, "train_ppl": 38.237440174399374, "lr": 0.00056, "grad_norm": 0.7004, "tokens_per_sec": 145493, "dt_s": 4.504, "eta_s": 11027, "world_size": 1, "timestamp": "2026-05-05T04:50:02.260728"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68350, "epoch": 0, "train_loss": 3.6734911799430847, "train_ppl": 39.3891808131234, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 146675, "dt_s": 4.468, "eta_s": 10949, "world_size": 1, "timestamp": "2026-05-05T04:50:06.728817"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68360, "epoch": 0, "train_loss": 3.6690106242895126, "train_ppl": 39.21309018286647, "lr": 0.00056, "grad_norm": 0.6831, "tokens_per_sec": 151053, "dt_s": 4.339, "eta_s": 10832, "world_size": 1, "timestamp": "2026-05-05T04:50:11.067432"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68370, "epoch": 0, "train_loss": 3.6820837408304214, "train_ppl": 39.729093014338325, "lr": 0.00056, "grad_norm": 0.7221, "tokens_per_sec": 153250, "dt_s": 4.276, "eta_s": 10716, "world_size": 1, "timestamp": "2026-05-05T04:50:15.343830"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68380, "epoch": 0, "train_loss": 3.8581595569849014, "train_ppl": 47.3780744332164, "lr": 0.00056, "grad_norm": 0.6993, "tokens_per_sec": 149008, "dt_s": 4.398, "eta_s": 10686, "world_size": 1, "timestamp": "2026-05-05T04:50:19.741992"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68390, "epoch": 0, "train_loss": 3.768391251564026, "train_ppl": 43.31033332965942, "lr": 0.00056, "grad_norm": 0.7058, "tokens_per_sec": 151409, "dt_s": 4.328, "eta_s": 10596, "world_size": 1, "timestamp": "2026-05-05T04:50:24.070400"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68400, "epoch": 0, "train_loss": 3.75173382461071, "train_ppl": 42.594870047790266, "lr": 0.00056, "grad_norm": 0.6458, "tokens_per_sec": 151321, "dt_s": 4.331, "eta_s": 10525, "world_size": 1, "timestamp": "2026-05-05T04:50:28.401322"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68410, "epoch": 0, "train_loss": 3.723053902387619, "train_ppl": 41.390604158799945, "lr": 0.00056, "grad_norm": 0.699, "tokens_per_sec": 148478, "dt_s": 4.414, "eta_s": 10557, "world_size": 1, "timestamp": "2026-05-05T04:50:32.815163"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68420, "epoch": 0, "train_loss": 3.7640472799539566, "train_ppl": 43.122602515125095, "lr": 0.00056, "grad_norm": 0.7211, "tokens_per_sec": 151828, "dt_s": 4.316, "eta_s": 10572, "world_size": 1, "timestamp": "2026-05-05T04:50:37.131631"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68430, "epoch": 0, "train_loss": 3.7075599133968353, "train_ppl": 40.75424122437717, "lr": 0.00056, "grad_norm": 0.6181, "tokens_per_sec": 150256, "dt_s": 4.362, "eta_s": 10550, "world_size": 1, "timestamp": "2026-05-05T04:50:41.493277"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68440, "epoch": 0, "train_loss": 3.7749801725149155, "train_ppl": 43.59664389534731, "lr": 0.00056, "grad_norm": 0.7723, "tokens_per_sec": 150177, "dt_s": 4.364, "eta_s": 10563, "world_size": 1, "timestamp": "2026-05-05T04:50:45.857193"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68450, "epoch": 0, "train_loss": 3.7681992948055267, "train_ppl": 43.302020416349414, "lr": 0.00056, "grad_norm": 0.8585, "tokens_per_sec": 150863, "dt_s": 4.344, "eta_s": 10565, "world_size": 1, "timestamp": "2026-05-05T04:50:50.201258"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68460, "epoch": 0, "train_loss": 3.705862835049629, "train_ppl": 40.685136738487316, "lr": 0.00056, "grad_norm": 0.7172, "tokens_per_sec": 132894, "dt_s": 4.931, "eta_s": 10811, "world_size": 1, "timestamp": "2026-05-05T04:50:55.132694"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68470, "epoch": 0, "train_loss": 3.716699033975601, "train_ppl": 41.1284063146658, "lr": 0.00056, "grad_norm": 0.6316, "tokens_per_sec": 153244, "dt_s": 4.277, "eta_s": 10788, "world_size": 1, "timestamp": "2026-05-05T04:50:59.409293"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68480, "epoch": 0, "train_loss": 3.748570814728737, "train_ppl": 42.46035490142421, "lr": 0.00056, "grad_norm": 0.7818, "tokens_per_sec": 152298, "dt_s": 4.303, "eta_s": 10755, "world_size": 1, "timestamp": "2026-05-05T04:51:03.712415"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68490, "epoch": 0, "train_loss": 3.6859262734651566, "train_ppl": 39.882047027991035, "lr": 0.00056, "grad_norm": 0.6352, "tokens_per_sec": 148074, "dt_s": 4.426, "eta_s": 10780, "world_size": 1, "timestamp": "2026-05-05T04:51:08.138320"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68500, "epoch": 0, "train_loss": 3.7673865854740143, "train_ppl": 43.26684275682601, "lr": 0.00056, "grad_norm": 0.727, "tokens_per_sec": 152885, "dt_s": 4.287, "eta_s": 10748, "world_size": 1, "timestamp": "2026-05-05T04:51:12.424936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68510, "epoch": 0, "train_loss": 3.738856017589569, "train_ppl": 42.04985834096285, "lr": 0.00056, "grad_norm": 0.6864, "tokens_per_sec": 128083, "dt_s": 5.117, "eta_s": 10465, "world_size": 1, "timestamp": "2026-05-05T04:51:17.541641"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68520, "epoch": 0, "train_loss": 3.7731450498104095, "train_ppl": 43.51671206908133, "lr": 0.00056, "grad_norm": 0.686, "tokens_per_sec": 149765, "dt_s": 4.376, "eta_s": 10509, "world_size": 1, "timestamp": "2026-05-05T04:51:21.917556"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68530, "epoch": 0, "train_loss": 3.7041912227869034, "train_ppl": 40.61718377633034, "lr": 0.00056, "grad_norm": 0.6551, "tokens_per_sec": 151013, "dt_s": 4.34, "eta_s": 10522, "world_size": 1, "timestamp": "2026-05-05T04:51:26.257314"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68540, "epoch": 0, "train_loss": 3.680544897913933, "train_ppl": 39.6680031968464, "lr": 0.00056, "grad_norm": 0.6962, "tokens_per_sec": 145516, "dt_s": 4.504, "eta_s": 10555, "world_size": 1, "timestamp": "2026-05-05T04:51:30.761016"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68550, "epoch": 0, "train_loss": 3.713587075471878, "train_ppl": 41.0006153640883, "lr": 0.00056, "grad_norm": 0.7369, "tokens_per_sec": 153370, "dt_s": 4.273, "eta_s": 10544, "world_size": 1, "timestamp": "2026-05-05T04:51:35.034089"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68560, "epoch": 0, "train_loss": 3.6926295161247253, "train_ppl": 40.15028409164306, "lr": 0.00056, "grad_norm": 0.6453, "tokens_per_sec": 152581, "dt_s": 4.295, "eta_s": 10511, "world_size": 1, "timestamp": "2026-05-05T04:51:39.329270"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68570, "epoch": 0, "train_loss": 3.788094937801361, "train_ppl": 44.1721693388316, "lr": 0.00056, "grad_norm": 0.6963, "tokens_per_sec": 149956, "dt_s": 4.37, "eta_s": 10504, "world_size": 1, "timestamp": "2026-05-05T04:51:43.699638"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68580, "epoch": 0, "train_loss": 3.747686520218849, "train_ppl": 42.42282403931034, "lr": 0.00056, "grad_norm": 0.6678, "tokens_per_sec": 152002, "dt_s": 4.312, "eta_s": 10486, "world_size": 1, "timestamp": "2026-05-05T04:51:48.011142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68590, "epoch": 0, "train_loss": 3.7123918384313583, "train_ppl": 40.9516391848234, "lr": 0.00056, "grad_norm": 0.7538, "tokens_per_sec": 151909, "dt_s": 4.314, "eta_s": 10390, "world_size": 1, "timestamp": "2026-05-05T04:51:52.325301"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68600, "epoch": 0, "train_loss": 3.735764279961586, "train_ppl": 41.92005197868589, "lr": 0.00056, "grad_norm": 0.6806, "tokens_per_sec": 148559, "dt_s": 4.411, "eta_s": 10453, "world_size": 1, "timestamp": "2026-05-05T04:51:56.736736"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68610, "epoch": 0, "train_loss": 3.7445446252822876, "train_ppl": 42.289745152680666, "lr": 0.00056, "grad_norm": 0.677, "tokens_per_sec": 150512, "dt_s": 4.354, "eta_s": 10477, "world_size": 1, "timestamp": "2026-05-05T04:52:01.090943"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68620, "epoch": 0, "train_loss": 3.7248368859291077, "train_ppl": 41.46446875489878, "lr": 0.00056, "grad_norm": 0.6658, "tokens_per_sec": 149718, "dt_s": 4.377, "eta_s": 10476, "world_size": 1, "timestamp": "2026-05-05T04:52:05.468242"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68630, "epoch": 0, "train_loss": 3.8271253407001495, "train_ppl": 45.93031427078239, "lr": 0.00056, "grad_norm": 0.6607, "tokens_per_sec": 149636, "dt_s": 4.38, "eta_s": 10504, "world_size": 1, "timestamp": "2026-05-05T04:52:09.847937"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68640, "epoch": 0, "train_loss": 3.6552553921937943, "train_ppl": 38.677397759362805, "lr": 0.00056, "grad_norm": 0.6463, "tokens_per_sec": 152125, "dt_s": 4.308, "eta_s": 10497, "world_size": 1, "timestamp": "2026-05-05T04:52:14.155978"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68650, "epoch": 0, "train_loss": 3.764418825507164, "train_ppl": 43.13862750315449, "lr": 0.00056, "grad_norm": 0.6474, "tokens_per_sec": 150933, "dt_s": 4.342, "eta_s": 10459, "world_size": 1, "timestamp": "2026-05-05T04:52:18.498040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68660, "epoch": 0, "train_loss": 3.7592287957668304, "train_ppl": 42.91531673947397, "lr": 0.00056, "grad_norm": 0.7058, "tokens_per_sec": 152263, "dt_s": 4.304, "eta_s": 10431, "world_size": 1, "timestamp": "2026-05-05T04:52:22.802160"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68670, "epoch": 0, "train_loss": 3.5675108432769775, "train_ppl": 35.428296722421095, "lr": 0.00056, "grad_norm": 0.857, "tokens_per_sec": 152058, "dt_s": 4.31, "eta_s": 10394, "world_size": 1, "timestamp": "2026-05-05T04:52:27.112104"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68680, "epoch": 0, "train_loss": 3.7197727113962173, "train_ppl": 41.25501624786872, "lr": 0.00056, "grad_norm": 0.6993, "tokens_per_sec": 148745, "dt_s": 4.406, "eta_s": 10402, "world_size": 1, "timestamp": "2026-05-05T04:52:31.518016"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68690, "epoch": 0, "train_loss": 3.713698923587799, "train_ppl": 41.0052014621368, "lr": 0.00056, "grad_norm": 0.6515, "tokens_per_sec": 149863, "dt_s": 4.373, "eta_s": 10429, "world_size": 1, "timestamp": "2026-05-05T04:52:35.891086"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68700, "epoch": 0, "train_loss": 3.770650178194046, "train_ppl": 43.40827877911562, "lr": 0.00056, "grad_norm": 0.6283, "tokens_per_sec": 152972, "dt_s": 4.284, "eta_s": 10397, "world_size": 1, "timestamp": "2026-05-05T04:52:40.175290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68710, "epoch": 0, "train_loss": 3.69986329972744, "train_ppl": 40.441775580438495, "lr": 0.00056, "grad_norm": 0.6641, "tokens_per_sec": 149647, "dt_s": 4.379, "eta_s": 10429, "world_size": 1, "timestamp": "2026-05-05T04:52:44.554660"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68720, "epoch": 0, "train_loss": 3.64650522172451, "train_ppl": 38.34044030319107, "lr": 0.00056, "grad_norm": 0.746, "tokens_per_sec": 150586, "dt_s": 4.352, "eta_s": 10445, "world_size": 1, "timestamp": "2026-05-05T04:52:48.906723"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68730, "epoch": 0, "train_loss": 3.6621864438056946, "train_ppl": 38.94640397050469, "lr": 0.00056, "grad_norm": 0.7367, "tokens_per_sec": 151730, "dt_s": 4.319, "eta_s": 10399, "world_size": 1, "timestamp": "2026-05-05T04:52:53.225959"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68740, "epoch": 0, "train_loss": 3.729568064212799, "train_ppl": 41.66110935307975, "lr": 0.00056, "grad_norm": 0.7289, "tokens_per_sec": 148344, "dt_s": 4.418, "eta_s": 10416, "world_size": 1, "timestamp": "2026-05-05T04:52:57.643837"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68750, "epoch": 0, "train_loss": 3.586419001221657, "train_ppl": 36.10455379256214, "lr": 0.00056, "grad_norm": 0.7231, "tokens_per_sec": 150387, "dt_s": 4.358, "eta_s": 10447, "world_size": 1, "timestamp": "2026-05-05T04:53:02.001654"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68760, "epoch": 0, "train_loss": 3.7402231693267822, "train_ppl": 42.107386193534595, "lr": 0.00056, "grad_norm": 0.6993, "tokens_per_sec": 133335, "dt_s": 4.915, "eta_s": 10699, "world_size": 1, "timestamp": "2026-05-05T04:53:06.916798"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68770, "epoch": 0, "train_loss": 3.60676845908165, "train_ppl": 36.84678831041306, "lr": 0.00056, "grad_norm": 0.6546, "tokens_per_sec": 150477, "dt_s": 4.355, "eta_s": 10696, "world_size": 1, "timestamp": "2026-05-05T04:53:11.272039"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68780, "epoch": 0, "train_loss": 3.691684529185295, "train_ppl": 40.1123605190246, "lr": 0.00056, "grad_norm": 0.7296, "tokens_per_sec": 153451, "dt_s": 4.271, "eta_s": 10668, "world_size": 1, "timestamp": "2026-05-05T04:53:15.542805"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68790, "epoch": 0, "train_loss": 3.67203626036644, "train_ppl": 39.33191439197075, "lr": 0.00056, "grad_norm": 0.9001, "tokens_per_sec": 149868, "dt_s": 4.373, "eta_s": 10642, "world_size": 1, "timestamp": "2026-05-05T04:53:19.915725"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68800, "epoch": 0, "train_loss": 3.7879109531641006, "train_ppl": 44.164043085854566, "lr": 0.00056, "grad_norm": 0.7157, "tokens_per_sec": 153202, "dt_s": 4.278, "eta_s": 10600, "world_size": 1, "timestamp": "2026-05-05T04:53:24.193458"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68810, "epoch": 0, "train_loss": 3.7143751084804535, "train_ppl": 41.03293793632134, "lr": 0.00056, "grad_norm": 0.6333, "tokens_per_sec": 152624, "dt_s": 4.294, "eta_s": 10298, "world_size": 1, "timestamp": "2026-05-05T04:53:28.487405"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68820, "epoch": 0, "train_loss": 3.743441089987755, "train_ppl": 42.24310266685595, "lr": 0.00056, "grad_norm": 0.6997, "tokens_per_sec": 148162, "dt_s": 4.423, "eta_s": 10327, "world_size": 1, "timestamp": "2026-05-05T04:53:32.910688"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68830, "epoch": 0, "train_loss": 3.8503438383340836, "train_ppl": 47.00922402634547, "lr": 0.00056, "grad_norm": 0.7481, "tokens_per_sec": 151502, "dt_s": 4.326, "eta_s": 10349, "world_size": 1, "timestamp": "2026-05-05T04:53:37.236440"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68840, "epoch": 0, "train_loss": 3.7721939086914062, "train_ppl": 43.47534121274881, "lr": 0.00056, "grad_norm": 0.7053, "tokens_per_sec": 150461, "dt_s": 4.356, "eta_s": 10336, "world_size": 1, "timestamp": "2026-05-05T04:53:41.592115"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68850, "epoch": 0, "train_loss": 3.875961810350418, "train_ppl": 48.229063202577876, "lr": 0.00056, "grad_norm": 0.7711, "tokens_per_sec": 148915, "dt_s": 4.401, "eta_s": 10390, "world_size": 1, "timestamp": "2026-05-05T04:53:45.993026"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68860, "epoch": 0, "train_loss": 3.6343139708042145, "train_ppl": 37.87586003643043, "lr": 0.00056, "grad_norm": 0.7715, "tokens_per_sec": 152026, "dt_s": 4.311, "eta_s": 10394, "world_size": 1, "timestamp": "2026-05-05T04:53:50.303835"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68870, "epoch": 0, "train_loss": 3.752286121249199, "train_ppl": 42.61840154892086, "lr": 0.00056, "grad_norm": 0.6955, "tokens_per_sec": 153263, "dt_s": 4.276, "eta_s": 10320, "world_size": 1, "timestamp": "2026-05-05T04:53:54.579881"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68880, "epoch": 0, "train_loss": 3.6233988851308823, "train_ppl": 37.46468984033566, "lr": 0.00056, "grad_norm": 0.7238, "tokens_per_sec": 153997, "dt_s": 4.256, "eta_s": 10282, "world_size": 1, "timestamp": "2026-05-05T04:53:58.835537"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68890, "epoch": 0, "train_loss": 3.664013370871544, "train_ppl": 39.017621244610446, "lr": 0.00056, "grad_norm": 0.6547, "tokens_per_sec": 152617, "dt_s": 4.294, "eta_s": 10248, "world_size": 1, "timestamp": "2026-05-05T04:54:03.129694"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68900, "epoch": 0, "train_loss": 3.650939956307411, "train_ppl": 38.51084755591434, "lr": 0.00056, "grad_norm": 0.6694, "tokens_per_sec": 149858, "dt_s": 4.373, "eta_s": 10231, "world_size": 1, "timestamp": "2026-05-05T04:54:07.502905"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68910, "epoch": 0, "train_loss": 3.7963273972272873, "train_ppl": 44.537315895508904, "lr": 0.00056, "grad_norm": 0.6917, "tokens_per_sec": 153250, "dt_s": 4.276, "eta_s": 10210, "world_size": 1, "timestamp": "2026-05-05T04:54:11.779296"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68920, "epoch": 0, "train_loss": 3.660510316491127, "train_ppl": 38.88117951652014, "lr": 0.00056, "grad_norm": 0.6631, "tokens_per_sec": 150405, "dt_s": 4.357, "eta_s": 10244, "world_size": 1, "timestamp": "2026-05-05T04:54:16.136574"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68930, "epoch": 0, "train_loss": 3.6386542469263077, "train_ppl": 38.04060899671123, "lr": 0.00056, "grad_norm": 0.6555, "tokens_per_sec": 150416, "dt_s": 4.357, "eta_s": 10288, "world_size": 1, "timestamp": "2026-05-05T04:54:20.493583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68940, "epoch": 0, "train_loss": 3.664270967245102, "train_ppl": 39.027673336983604, "lr": 0.00056, "grad_norm": 0.6864, "tokens_per_sec": 152460, "dt_s": 4.299, "eta_s": 10286, "world_size": 1, "timestamp": "2026-05-05T04:54:24.792163"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68950, "epoch": 0, "train_loss": 3.743298754096031, "train_ppl": 42.23709038506058, "lr": 0.00056, "grad_norm": 0.6791, "tokens_per_sec": 152600, "dt_s": 4.295, "eta_s": 10244, "world_size": 1, "timestamp": "2026-05-05T04:54:29.086778"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68960, "epoch": 0, "train_loss": 3.7747399657964706, "train_ppl": 43.586172946228515, "lr": 0.00056, "grad_norm": 0.6946, "tokens_per_sec": 149978, "dt_s": 4.37, "eta_s": 10284, "world_size": 1, "timestamp": "2026-05-05T04:54:33.456488"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68970, "epoch": 0, "train_loss": 3.6614965200424194, "train_ppl": 38.91954318792289, "lr": 0.00056, "grad_norm": 0.6918, "tokens_per_sec": 151938, "dt_s": 4.313, "eta_s": 10259, "world_size": 1, "timestamp": "2026-05-05T04:54:37.769860"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68980, "epoch": 0, "train_loss": 3.7666545063257217, "train_ppl": 43.235179594815754, "lr": 0.00056, "grad_norm": 0.6335, "tokens_per_sec": 151810, "dt_s": 4.317, "eta_s": 10236, "world_size": 1, "timestamp": "2026-05-05T04:54:42.086835"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 68990, "epoch": 0, "train_loss": 3.7496453672647476, "train_ppl": 42.506005305949856, "lr": 0.00056, "grad_norm": 0.6967, "tokens_per_sec": 150260, "dt_s": 4.361, "eta_s": 10261, "world_size": 1, "timestamp": "2026-05-05T04:54:46.448334"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69000, "epoch": 0, "train_loss": 3.715586856007576, "train_ppl": 41.082689634557234, "lr": 0.00056, "grad_norm": 0.7601, "tokens_per_sec": 150244, "dt_s": 4.362, "eta_s": 10289, "world_size": 1, "timestamp": "2026-05-05T04:54:50.810309"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69010, "epoch": 0, "train_loss": 3.7506152540445328, "train_ppl": 42.54725131730486, "lr": 0.00056, "grad_norm": 0.6394, "tokens_per_sec": 127062, "dt_s": 5.158, "eta_s": 10296, "world_size": 1, "timestamp": "2026-05-05T04:54:55.968097"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69020, "epoch": 0, "train_loss": 3.6496847718954086, "train_ppl": 38.462539664367746, "lr": 0.00056, "grad_norm": 0.6844, "tokens_per_sec": 151987, "dt_s": 4.312, "eta_s": 10290, "world_size": 1, "timestamp": "2026-05-05T04:55:00.280023"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69030, "epoch": 0, "train_loss": 3.702707290649414, "train_ppl": 40.55695533050571, "lr": 0.00056, "grad_norm": 0.7114, "tokens_per_sec": 148818, "dt_s": 4.404, "eta_s": 10326, "world_size": 1, "timestamp": "2026-05-05T04:55:04.683822"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69040, "epoch": 0, "train_loss": 3.704215720295906, "train_ppl": 40.61817880834342, "lr": 0.00056, "grad_norm": 0.6752, "tokens_per_sec": 146753, "dt_s": 4.466, "eta_s": 10371, "world_size": 1, "timestamp": "2026-05-05T04:55:09.149539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69050, "epoch": 0, "train_loss": 3.8406335711479187, "train_ppl": 46.55496097796105, "lr": 0.00056, "grad_norm": 0.7479, "tokens_per_sec": 134427, "dt_s": 4.875, "eta_s": 10610, "world_size": 1, "timestamp": "2026-05-05T04:55:14.024759"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69060, "epoch": 0, "train_loss": 3.6343846917152405, "train_ppl": 37.87853874647739, "lr": 0.00056, "grad_norm": 0.6905, "tokens_per_sec": 151100, "dt_s": 4.337, "eta_s": 10578, "world_size": 1, "timestamp": "2026-05-05T04:55:18.362001"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69070, "epoch": 0, "train_loss": 3.6464725732803345, "train_ppl": 38.339188567899875, "lr": 0.00056, "grad_norm": 0.6495, "tokens_per_sec": 146279, "dt_s": 4.48, "eta_s": 10655, "world_size": 1, "timestamp": "2026-05-05T04:55:22.842254"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69080, "epoch": 0, "train_loss": 3.67388878762722, "train_ppl": 39.404845368053834, "lr": 0.00056, "grad_norm": 0.68, "tokens_per_sec": 150683, "dt_s": 4.349, "eta_s": 10624, "world_size": 1, "timestamp": "2026-05-05T04:55:27.191480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69090, "epoch": 0, "train_loss": 3.703765094280243, "train_ppl": 40.599879323685116, "lr": 0.00056, "grad_norm": 0.6884, "tokens_per_sec": 150285, "dt_s": 4.361, "eta_s": 10570, "world_size": 1, "timestamp": "2026-05-05T04:55:31.552280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69100, "epoch": 0, "train_loss": 3.686676263809204, "train_ppl": 39.91196939750441, "lr": 0.00056, "grad_norm": 0.6864, "tokens_per_sec": 151584, "dt_s": 4.323, "eta_s": 10306, "world_size": 1, "timestamp": "2026-05-05T04:55:35.875672"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69110, "epoch": 0, "train_loss": 3.7350166738033295, "train_ppl": 41.88872400162562, "lr": 0.00056, "grad_norm": 0.8824, "tokens_per_sec": 153094, "dt_s": 4.281, "eta_s": 10275, "world_size": 1, "timestamp": "2026-05-05T04:55:40.156455"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69120, "epoch": 0, "train_loss": 3.696968287229538, "train_ppl": 40.32486544450104, "lr": 0.00056, "grad_norm": 0.7322, "tokens_per_sec": 150221, "dt_s": 4.363, "eta_s": 10215, "world_size": 1, "timestamp": "2026-05-05T04:55:44.519074"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69130, "epoch": 0, "train_loss": 3.6498013138771057, "train_ppl": 38.46702242617123, "lr": 0.00056, "grad_norm": 0.6895, "tokens_per_sec": 150567, "dt_s": 4.353, "eta_s": 10212, "world_size": 1, "timestamp": "2026-05-05T04:55:48.871703"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69140, "epoch": 0, "train_loss": 3.7667135447263718, "train_ppl": 43.23773220602115, "lr": 0.00056, "grad_norm": 0.6933, "tokens_per_sec": 151546, "dt_s": 4.325, "eta_s": 10191, "world_size": 1, "timestamp": "2026-05-05T04:55:53.196222"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69150, "epoch": 0, "train_loss": 3.695572942495346, "train_ppl": 40.26863759358682, "lr": 0.00056, "grad_norm": 0.6552, "tokens_per_sec": 146597, "dt_s": 4.47, "eta_s": 10256, "world_size": 1, "timestamp": "2026-05-05T04:55:57.666694"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69160, "epoch": 0, "train_loss": 3.749410182237625, "train_ppl": 42.49600970539297, "lr": 0.00056, "grad_norm": 0.6392, "tokens_per_sec": 152267, "dt_s": 4.304, "eta_s": 10262, "world_size": 1, "timestamp": "2026-05-05T04:56:01.970746"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69170, "epoch": 0, "train_loss": 3.686579614877701, "train_ppl": 39.90811213471111, "lr": 0.00056, "grad_norm": 0.6523, "tokens_per_sec": 153464, "dt_s": 4.27, "eta_s": 10214, "world_size": 1, "timestamp": "2026-05-05T04:56:06.241161"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69180, "epoch": 0, "train_loss": 3.7216622680425644, "train_ppl": 41.33304363338158, "lr": 0.00056, "grad_norm": 0.6356, "tokens_per_sec": 150551, "dt_s": 4.353, "eta_s": 10210, "world_size": 1, "timestamp": "2026-05-05T04:56:10.594262"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69190, "epoch": 0, "train_loss": 3.7386437952518463, "train_ppl": 42.04093536858534, "lr": 0.00056, "grad_norm": 0.6968, "tokens_per_sec": 154487, "dt_s": 4.242, "eta_s": 10167, "world_size": 1, "timestamp": "2026-05-05T04:56:14.836412"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69200, "epoch": 0, "train_loss": 3.7388098537921906, "train_ppl": 42.04791720462806, "lr": 0.00056, "grad_norm": 0.6724, "tokens_per_sec": 150881, "dt_s": 4.344, "eta_s": 10103, "world_size": 1, "timestamp": "2026-05-05T04:56:19.179992"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69210, "epoch": 0, "train_loss": 3.750505179166794, "train_ppl": 42.542568191569956, "lr": 0.00056, "grad_norm": 0.6586, "tokens_per_sec": 150692, "dt_s": 4.349, "eta_s": 10120, "world_size": 1, "timestamp": "2026-05-05T04:56:23.528984"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69220, "epoch": 0, "train_loss": 3.839688926935196, "train_ppl": 46.5110038686829, "lr": 0.00056, "grad_norm": 0.7686, "tokens_per_sec": 148621, "dt_s": 4.41, "eta_s": 10181, "world_size": 1, "timestamp": "2026-05-05T04:56:27.938593"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69230, "epoch": 0, "train_loss": 3.7908138930797577, "train_ppl": 44.2924349160859, "lr": 0.00056, "grad_norm": 0.6881, "tokens_per_sec": 147912, "dt_s": 4.431, "eta_s": 10213, "world_size": 1, "timestamp": "2026-05-05T04:56:32.369344"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69240, "epoch": 0, "train_loss": 3.6710167080163956, "train_ppl": 39.291833881781784, "lr": 0.00056, "grad_norm": 0.8066, "tokens_per_sec": 149771, "dt_s": 4.376, "eta_s": 10272, "world_size": 1, "timestamp": "2026-05-05T04:56:36.745077"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69250, "epoch": 0, "train_loss": 3.709044486284256, "train_ppl": 40.81478879846937, "lr": 0.00056, "grad_norm": 0.6838, "tokens_per_sec": 149095, "dt_s": 4.396, "eta_s": 10291, "world_size": 1, "timestamp": "2026-05-05T04:56:41.140664"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69260, "epoch": 0, "train_loss": 3.7653105407953262, "train_ppl": 43.17711203288105, "lr": 0.00056, "grad_norm": 0.7233, "tokens_per_sec": 148104, "dt_s": 4.425, "eta_s": 10323, "world_size": 1, "timestamp": "2026-05-05T04:56:45.565668"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69270, "epoch": 0, "train_loss": 3.7119438499212265, "train_ppl": 40.93329742975205, "lr": 0.00056, "grad_norm": 0.6752, "tokens_per_sec": 150461, "dt_s": 4.356, "eta_s": 10293, "world_size": 1, "timestamp": "2026-05-05T04:56:49.921386"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69280, "epoch": 0, "train_loss": 3.724763438105583, "train_ppl": 41.46142339175415, "lr": 0.00056, "grad_norm": 0.6642, "tokens_per_sec": 150014, "dt_s": 4.369, "eta_s": 10260, "world_size": 1, "timestamp": "2026-05-05T04:56:54.289993"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69290, "epoch": 0, "train_loss": 3.748397186398506, "train_ppl": 42.45298322088648, "lr": 0.00056, "grad_norm": 0.6531, "tokens_per_sec": 148398, "dt_s": 4.416, "eta_s": 10274, "world_size": 1, "timestamp": "2026-05-05T04:56:58.706257"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69300, "epoch": 0, "train_loss": 3.5604020804166794, "train_ppl": 35.177338420841, "lr": 0.00056, "grad_norm": 0.6764, "tokens_per_sec": 148450, "dt_s": 4.415, "eta_s": 10279, "world_size": 1, "timestamp": "2026-05-05T04:57:03.120938"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69310, "epoch": 0, "train_loss": 3.727343872189522, "train_ppl": 41.56855001899984, "lr": 0.00056, "grad_norm": 0.6613, "tokens_per_sec": 149346, "dt_s": 4.388, "eta_s": 10257, "world_size": 1, "timestamp": "2026-05-05T04:57:07.509142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69320, "epoch": 0, "train_loss": 3.6039158701896667, "train_ppl": 36.74182934493626, "lr": 0.00056, "grad_norm": 0.6447, "tokens_per_sec": 148454, "dt_s": 4.415, "eta_s": 10280, "world_size": 1, "timestamp": "2026-05-05T04:57:11.923720"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69330, "epoch": 0, "train_loss": 3.7486881017684937, "train_ppl": 42.46533524281666, "lr": 0.00056, "grad_norm": 0.754, "tokens_per_sec": 151916, "dt_s": 4.314, "eta_s": 10250, "world_size": 1, "timestamp": "2026-05-05T04:57:16.237687"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69340, "epoch": 0, "train_loss": 3.6535986065864563, "train_ppl": 38.61337065766197, "lr": 0.00056, "grad_norm": 0.6707, "tokens_per_sec": 149919, "dt_s": 4.371, "eta_s": 10225, "world_size": 1, "timestamp": "2026-05-05T04:57:20.609083"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69350, "epoch": 0, "train_loss": 3.7830854803323746, "train_ppl": 43.951444053765165, "lr": 0.00056, "grad_norm": 0.6863, "tokens_per_sec": 133507, "dt_s": 4.909, "eta_s": 10451, "world_size": 1, "timestamp": "2026-05-05T04:57:25.517903"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69360, "epoch": 0, "train_loss": 3.55170214176178, "train_ppl": 34.87262514986886, "lr": 0.00056, "grad_norm": 0.6847, "tokens_per_sec": 150746, "dt_s": 4.347, "eta_s": 10428, "world_size": 1, "timestamp": "2026-05-05T04:57:29.865352"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69370, "epoch": 0, "train_loss": 3.6894984394311905, "train_ppl": 40.02476707712794, "lr": 0.00056, "grad_norm": 0.6407, "tokens_per_sec": 148790, "dt_s": 4.405, "eta_s": 10419, "world_size": 1, "timestamp": "2026-05-05T04:57:34.269938"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69380, "epoch": 0, "train_loss": 3.729724705219269, "train_ppl": 41.66763570231308, "lr": 0.00056, "grad_norm": 0.6806, "tokens_per_sec": 148557, "dt_s": 4.412, "eta_s": 10460, "world_size": 1, "timestamp": "2026-05-05T04:57:38.681429"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69390, "epoch": 0, "train_loss": 3.649693548679352, "train_ppl": 38.46287724324971, "lr": 0.00056, "grad_norm": 0.684, "tokens_per_sec": 151751, "dt_s": 4.319, "eta_s": 10431, "world_size": 1, "timestamp": "2026-05-05T04:57:43.000096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69400, "epoch": 0, "train_loss": 3.6422044038772583, "train_ppl": 38.175899137681114, "lr": 0.00056, "grad_norm": 0.6534, "tokens_per_sec": 146785, "dt_s": 4.465, "eta_s": 10219, "world_size": 1, "timestamp": "2026-05-05T04:57:47.464859"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69410, "epoch": 0, "train_loss": 3.610783413052559, "train_ppl": 36.99502384976096, "lr": 0.00056, "grad_norm": 0.6682, "tokens_per_sec": 148583, "dt_s": 4.411, "eta_s": 10244, "world_size": 1, "timestamp": "2026-05-05T04:57:51.875567"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69420, "epoch": 0, "train_loss": 3.76413457095623, "train_ppl": 43.126366894615124, "lr": 0.00056, "grad_norm": 0.7814, "tokens_per_sec": 148116, "dt_s": 4.425, "eta_s": 10249, "world_size": 1, "timestamp": "2026-05-05T04:57:56.300253"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69430, "epoch": 0, "train_loss": 3.6528308540582657, "train_ppl": 38.583736522014505, "lr": 0.00056, "grad_norm": 0.6579, "tokens_per_sec": 147503, "dt_s": 4.443, "eta_s": 10260, "world_size": 1, "timestamp": "2026-05-05T04:58:00.743255"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69440, "epoch": 0, "train_loss": 3.7004763036966324, "train_ppl": 40.46657414942455, "lr": 0.00056, "grad_norm": 0.7106, "tokens_per_sec": 150786, "dt_s": 4.346, "eta_s": 10268, "world_size": 1, "timestamp": "2026-05-05T04:58:05.089551"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69450, "epoch": 0, "train_loss": 3.747038811445236, "train_ppl": 42.39535530081043, "lr": 0.00056, "grad_norm": 0.6916, "tokens_per_sec": 146631, "dt_s": 4.469, "eta_s": 10266, "world_size": 1, "timestamp": "2026-05-05T04:58:09.558997"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69460, "epoch": 0, "train_loss": 3.6542993932962418, "train_ppl": 38.640439878406134, "lr": 0.00056, "grad_norm": 0.6803, "tokens_per_sec": 148621, "dt_s": 4.41, "eta_s": 10261, "world_size": 1, "timestamp": "2026-05-05T04:58:13.968608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69470, "epoch": 0, "train_loss": 3.6705725342035294, "train_ppl": 39.27438535348855, "lr": 0.00056, "grad_norm": 0.6828, "tokens_per_sec": 151053, "dt_s": 4.339, "eta_s": 10216, "world_size": 1, "timestamp": "2026-05-05T04:58:18.307228"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69480, "epoch": 0, "train_loss": 3.847299724817276, "train_ppl": 46.86634019970382, "lr": 0.00056, "grad_norm": 0.7212, "tokens_per_sec": 146709, "dt_s": 4.467, "eta_s": 10223, "world_size": 1, "timestamp": "2026-05-05T04:58:22.774313"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69490, "epoch": 0, "train_loss": 3.7979429811239243, "train_ppl": 44.60932782085934, "lr": 0.00056, "grad_norm": 0.739, "tokens_per_sec": 148944, "dt_s": 4.4, "eta_s": 10244, "world_size": 1, "timestamp": "2026-05-05T04:58:27.174349"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69500, "epoch": 0, "train_loss": 3.8285928070545197, "train_ppl": 45.99776494031012, "lr": 0.00056, "grad_norm": 0.7103, "tokens_per_sec": 148959, "dt_s": 4.4, "eta_s": 10207, "world_size": 1, "timestamp": "2026-05-05T04:58:31.573925"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69510, "epoch": 0, "train_loss": 3.7228028625249863, "train_ppl": 41.3802147713474, "lr": 0.00056, "grad_norm": 0.6732, "tokens_per_sec": 124700, "dt_s": 5.255, "eta_s": 10230, "world_size": 1, "timestamp": "2026-05-05T04:58:36.829433"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69520, "epoch": 0, "train_loss": 3.877321168780327, "train_ppl": 48.29466836657286, "lr": 0.00056, "grad_norm": 0.7412, "tokens_per_sec": 148225, "dt_s": 4.421, "eta_s": 10264, "world_size": 1, "timestamp": "2026-05-05T04:58:41.250843"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69530, "epoch": 0, "train_loss": 3.638284832239151, "train_ppl": 38.026558832367776, "lr": 0.00056, "grad_norm": 0.7231, "tokens_per_sec": 141707, "dt_s": 4.625, "eta_s": 10332, "world_size": 1, "timestamp": "2026-05-05T04:58:45.875583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69540, "epoch": 0, "train_loss": 3.653176784515381, "train_ppl": 38.597086120510006, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 147924, "dt_s": 4.43, "eta_s": 10342, "world_size": 1, "timestamp": "2026-05-05T04:58:50.305983"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69550, "epoch": 0, "train_loss": 3.8157902508974075, "train_ppl": 45.4126295779457, "lr": 0.00056, "grad_norm": 0.6467, "tokens_per_sec": 151880, "dt_s": 4.315, "eta_s": 10298, "world_size": 1, "timestamp": "2026-05-05T04:58:54.620965"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69560, "epoch": 0, "train_loss": 3.6794832348823547, "train_ppl": 39.6259114918729, "lr": 0.00056, "grad_norm": 0.6169, "tokens_per_sec": 149307, "dt_s": 4.389, "eta_s": 10257, "world_size": 1, "timestamp": "2026-05-05T04:58:59.010305"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69570, "epoch": 0, "train_loss": 3.665028616786003, "train_ppl": 39.05725384018213, "lr": 0.00056, "grad_norm": 0.6827, "tokens_per_sec": 150174, "dt_s": 4.364, "eta_s": 10226, "world_size": 1, "timestamp": "2026-05-05T04:59:03.374321"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69580, "epoch": 0, "train_loss": 3.7257190495729446, "train_ppl": 41.501063340578696, "lr": 0.00056, "grad_norm": 0.772, "tokens_per_sec": 148951, "dt_s": 4.4, "eta_s": 10118, "world_size": 1, "timestamp": "2026-05-05T04:59:07.774147"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69590, "epoch": 0, "train_loss": 3.8236421197652817, "train_ppl": 45.77060714764161, "lr": 0.00056, "grad_norm": 0.8026, "tokens_per_sec": 147442, "dt_s": 4.445, "eta_s": 10120, "world_size": 1, "timestamp": "2026-05-05T04:59:12.219031"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69600, "epoch": 0, "train_loss": 3.613900288939476, "train_ppl": 37.11051263614399, "lr": 0.00056, "grad_norm": 0.6709, "tokens_per_sec": 149524, "dt_s": 4.383, "eta_s": 10147, "world_size": 1, "timestamp": "2026-05-05T04:59:16.601998"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69610, "epoch": 0, "train_loss": 3.7661770433187485, "train_ppl": 43.21454132335704, "lr": 0.00056, "grad_norm": 0.6773, "tokens_per_sec": 148502, "dt_s": 4.413, "eta_s": 10154, "world_size": 1, "timestamp": "2026-05-05T04:59:21.015148"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69620, "epoch": 0, "train_loss": 3.6151939779520035, "train_ppl": 37.15855316664368, "lr": 0.00056, "grad_norm": 0.6792, "tokens_per_sec": 148780, "dt_s": 4.405, "eta_s": 10168, "world_size": 1, "timestamp": "2026-05-05T04:59:25.420042"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69630, "epoch": 0, "train_loss": 3.7324976325035095, "train_ppl": 41.783337368237916, "lr": 0.00056, "grad_norm": 0.719, "tokens_per_sec": 149442, "dt_s": 4.385, "eta_s": 10157, "world_size": 1, "timestamp": "2026-05-05T04:59:29.805419"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69640, "epoch": 0, "train_loss": 3.7447282671928406, "train_ppl": 42.297512035418066, "lr": 0.00056, "grad_norm": 0.722, "tokens_per_sec": 132784, "dt_s": 4.936, "eta_s": 10379, "world_size": 1, "timestamp": "2026-05-05T04:59:34.740970"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69650, "epoch": 0, "train_loss": 3.695404827594757, "train_ppl": 40.26186840459762, "lr": 0.00056, "grad_norm": 0.7123, "tokens_per_sec": 147249, "dt_s": 4.451, "eta_s": 10406, "world_size": 1, "timestamp": "2026-05-05T04:59:39.191669"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69660, "epoch": 0, "train_loss": 3.704061895608902, "train_ppl": 40.61193121023128, "lr": 0.00056, "grad_norm": 0.7497, "tokens_per_sec": 147842, "dt_s": 4.433, "eta_s": 10410, "world_size": 1, "timestamp": "2026-05-05T04:59:43.624514"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69670, "epoch": 0, "train_loss": 3.64042492210865, "train_ppl": 38.10802622837801, "lr": 0.00056, "grad_norm": 0.672, "tokens_per_sec": 146308, "dt_s": 4.479, "eta_s": 10440, "world_size": 1, "timestamp": "2026-05-05T04:59:48.103840"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69680, "epoch": 0, "train_loss": 3.7362287491559982, "train_ppl": 41.93952707389893, "lr": 0.00056, "grad_norm": 0.74, "tokens_per_sec": 147790, "dt_s": 4.434, "eta_s": 10458, "world_size": 1, "timestamp": "2026-05-05T04:59:52.538264"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69690, "epoch": 0, "train_loss": 3.700918808579445, "train_ppl": 40.48448476855232, "lr": 0.00056, "grad_norm": 0.6913, "tokens_per_sec": 149599, "dt_s": 4.381, "eta_s": 10198, "world_size": 1, "timestamp": "2026-05-05T04:59:56.919025"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69700, "epoch": 0, "train_loss": 3.8388755917549133, "train_ppl": 46.47319021263914, "lr": 0.00056, "grad_norm": 0.7384, "tokens_per_sec": 147914, "dt_s": 4.431, "eta_s": 10185, "world_size": 1, "timestamp": "2026-05-05T05:00:01.349723"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69710, "epoch": 0, "train_loss": 3.6790275424718857, "train_ppl": 39.607858378393935, "lr": 0.00056, "grad_norm": 0.888, "tokens_per_sec": 147540, "dt_s": 4.442, "eta_s": 10184, "world_size": 1, "timestamp": "2026-05-05T05:00:05.791639"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69720, "epoch": 0, "train_loss": 3.6730154305696487, "train_ppl": 39.37044589194828, "lr": 0.00056, "grad_norm": 0.6419, "tokens_per_sec": 144320, "dt_s": 4.541, "eta_s": 10208, "world_size": 1, "timestamp": "2026-05-05T05:00:10.332665"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69730, "epoch": 0, "train_loss": 3.7035608291625977, "train_ppl": 40.59158703150053, "lr": 0.00056, "grad_norm": 0.6732, "tokens_per_sec": 148556, "dt_s": 4.412, "eta_s": 10193, "world_size": 1, "timestamp": "2026-05-05T05:00:14.744219"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69740, "epoch": 0, "train_loss": 3.766037479043007, "train_ppl": 43.20851053804662, "lr": 0.00056, "grad_norm": 0.7668, "tokens_per_sec": 150275, "dt_s": 4.361, "eta_s": 10180, "world_size": 1, "timestamp": "2026-05-05T05:00:19.105287"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69750, "epoch": 0, "train_loss": 3.799019381403923, "train_ppl": 44.657371166112675, "lr": 0.00056, "grad_norm": 0.655, "tokens_per_sec": 145778, "dt_s": 4.496, "eta_s": 10205, "world_size": 1, "timestamp": "2026-05-05T05:00:23.600890"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69760, "epoch": 0, "train_loss": 3.7337291687726974, "train_ppl": 41.83482676267523, "lr": 0.00056, "grad_norm": 0.6459, "tokens_per_sec": 146248, "dt_s": 4.481, "eta_s": 10219, "world_size": 1, "timestamp": "2026-05-05T05:00:28.082032"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69770, "epoch": 0, "train_loss": 3.7237289994955063, "train_ppl": 41.4185562700944, "lr": 0.00056, "grad_norm": 0.6461, "tokens_per_sec": 149747, "dt_s": 4.376, "eta_s": 10139, "world_size": 1, "timestamp": "2026-05-05T05:00:32.458495"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69780, "epoch": 0, "train_loss": 3.679612308740616, "train_ppl": 39.63102649125552, "lr": 0.00056, "grad_norm": 0.6579, "tokens_per_sec": 145823, "dt_s": 4.494, "eta_s": 10172, "world_size": 1, "timestamp": "2026-05-05T05:00:36.952742"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69790, "epoch": 0, "train_loss": 3.7821979373693466, "train_ppl": 43.91245256475075, "lr": 0.00056, "grad_norm": 0.6893, "tokens_per_sec": 148793, "dt_s": 4.405, "eta_s": 10188, "world_size": 1, "timestamp": "2026-05-05T05:00:41.357271"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69800, "epoch": 0, "train_loss": 3.707561954855919, "train_ppl": 40.75432442257803, "lr": 0.00056, "grad_norm": 0.7232, "tokens_per_sec": 147982, "dt_s": 4.429, "eta_s": 10153, "world_size": 1, "timestamp": "2026-05-05T05:00:45.785897"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69810, "epoch": 0, "train_loss": 3.7327636927366257, "train_ppl": 41.79445573173008, "lr": 0.00056, "grad_norm": 0.6499, "tokens_per_sec": 146674, "dt_s": 4.468, "eta_s": 10142, "world_size": 1, "timestamp": "2026-05-05T05:00:50.254025"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69820, "epoch": 0, "train_loss": 3.6965061873197556, "train_ppl": 40.30623563256587, "lr": 0.00056, "grad_norm": 0.6745, "tokens_per_sec": 150795, "dt_s": 4.346, "eta_s": 10124, "world_size": 1, "timestamp": "2026-05-05T05:00:54.600061"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69830, "epoch": 0, "train_loss": 3.6484930217266083, "train_ppl": 38.416729228949244, "lr": 0.00056, "grad_norm": 0.6404, "tokens_per_sec": 144858, "dt_s": 4.524, "eta_s": 10133, "world_size": 1, "timestamp": "2026-05-05T05:00:59.124225"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69840, "epoch": 0, "train_loss": 3.618500307202339, "train_ppl": 37.28161490708741, "lr": 0.00056, "grad_norm": 1.332, "tokens_per_sec": 148160, "dt_s": 4.423, "eta_s": 10137, "world_size": 1, "timestamp": "2026-05-05T05:01:03.547534"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69850, "epoch": 0, "train_loss": 3.512928456068039, "train_ppl": 33.54636313055443, "lr": 0.00056, "grad_norm": 0.7153, "tokens_per_sec": 149056, "dt_s": 4.397, "eta_s": 10118, "world_size": 1, "timestamp": "2026-05-05T05:01:07.944303"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69860, "epoch": 0, "train_loss": 3.7051221132278442, "train_ppl": 40.65501152846044, "lr": 0.00056, "grad_norm": 0.7389, "tokens_per_sec": 146252, "dt_s": 4.481, "eta_s": 10120, "world_size": 1, "timestamp": "2026-05-05T05:01:12.425312"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69870, "epoch": 0, "train_loss": 3.8349898010492325, "train_ppl": 46.292955525968466, "lr": 0.00056, "grad_norm": 0.6766, "tokens_per_sec": 149017, "dt_s": 4.398, "eta_s": 10139, "world_size": 1, "timestamp": "2026-05-05T05:01:16.823171"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69880, "epoch": 0, "train_loss": 3.7785517126321793, "train_ppl": 43.75262944651317, "lr": 0.00056, "grad_norm": 0.6722, "tokens_per_sec": 147737, "dt_s": 4.436, "eta_s": 10094, "world_size": 1, "timestamp": "2026-05-05T05:01:21.259156"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69890, "epoch": 0, "train_loss": 3.7575293481349945, "train_ppl": 42.84244634332089, "lr": 0.00056, "grad_norm": 0.6731, "tokens_per_sec": 147864, "dt_s": 4.432, "eta_s": 10094, "world_size": 1, "timestamp": "2026-05-05T05:01:25.691346"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69900, "epoch": 0, "train_loss": 3.59945610165596, "train_ppl": 36.578334136931815, "lr": 0.00056, "grad_norm": 0.8148, "tokens_per_sec": 151293, "dt_s": 4.332, "eta_s": 10060, "world_size": 1, "timestamp": "2026-05-05T05:01:30.023079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69910, "epoch": 0, "train_loss": 3.6667670905590057, "train_ppl": 39.12521290704047, "lr": 0.00056, "grad_norm": 0.7503, "tokens_per_sec": 147073, "dt_s": 4.456, "eta_s": 10044, "world_size": 1, "timestamp": "2026-05-05T05:01:34.479086"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69920, "epoch": 0, "train_loss": 3.7001552879810333, "train_ppl": 40.45358582800532, "lr": 0.00056, "grad_norm": 0.7137, "tokens_per_sec": 147554, "dt_s": 4.441, "eta_s": 10059, "world_size": 1, "timestamp": "2026-05-05T05:01:38.920590"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69930, "epoch": 0, "train_loss": 3.746126115322113, "train_ppl": 42.35667887698482, "lr": 0.00056, "grad_norm": 0.718, "tokens_per_sec": 148113, "dt_s": 4.425, "eta_s": 10050, "world_size": 1, "timestamp": "2026-05-05T05:01:43.345336"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69940, "epoch": 0, "train_loss": 3.708416312932968, "train_ppl": 40.78915808691555, "lr": 0.00056, "grad_norm": 0.6585, "tokens_per_sec": 129195, "dt_s": 5.073, "eta_s": 10337, "world_size": 1, "timestamp": "2026-05-05T05:01:48.417958"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69950, "epoch": 0, "train_loss": 3.776260733604431, "train_ppl": 43.652507822104646, "lr": 0.00056, "grad_norm": 0.6855, "tokens_per_sec": 150419, "dt_s": 4.357, "eta_s": 10344, "world_size": 1, "timestamp": "2026-05-05T05:01:52.774852"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69960, "epoch": 0, "train_loss": 3.788551911711693, "train_ppl": 44.19235948061487, "lr": 0.00056, "grad_norm": 0.722, "tokens_per_sec": 151049, "dt_s": 4.339, "eta_s": 10286, "world_size": 1, "timestamp": "2026-05-05T05:01:57.113569"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69970, "epoch": 0, "train_loss": 3.721020668745041, "train_ppl": 41.306532887169396, "lr": 0.00056, "grad_norm": 0.7154, "tokens_per_sec": 146643, "dt_s": 4.469, "eta_s": 10294, "world_size": 1, "timestamp": "2026-05-05T05:02:01.582657"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69980, "epoch": 0, "train_loss": 3.6073914021253586, "train_ppl": 36.86974891170496, "lr": 0.00056, "grad_norm": 0.7146, "tokens_per_sec": 147491, "dt_s": 4.443, "eta_s": 10298, "world_size": 1, "timestamp": "2026-05-05T05:02:06.026040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 69990, "epoch": 0, "train_loss": 3.7719678431749344, "train_ppl": 43.46551404811713, "lr": 0.00056, "grad_norm": 0.6545, "tokens_per_sec": 147006, "dt_s": 4.458, "eta_s": 10014, "world_size": 1, "timestamp": "2026-05-05T05:02:10.484093"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70000, "epoch": 0, "train_loss": 3.7598626017570496, "train_ppl": 42.942525345873634, "lr": 0.00056, "grad_norm": 0.6907, "tokens_per_sec": 146101, "dt_s": 4.486, "eta_s": 10068, "world_size": 1, "timestamp": "2026-05-05T05:02:14.969750"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70010, "epoch": 0, "train_loss": 3.690022587776184, "train_ppl": 40.04575149154279, "lr": 0.00056, "grad_norm": 0.6974, "tokens_per_sec": 125733, "dt_s": 5.212, "eta_s": 10112, "world_size": 1, "timestamp": "2026-05-05T05:02:20.182096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70020, "epoch": 0, "train_loss": 3.650931254029274, "train_ppl": 38.51051242526581, "lr": 0.00056, "grad_norm": 0.6647, "tokens_per_sec": 144196, "dt_s": 4.545, "eta_s": 10142, "world_size": 1, "timestamp": "2026-05-05T05:02:24.726973"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70030, "epoch": 0, "train_loss": 3.5922350585460663, "train_ppl": 36.31515177842153, "lr": 0.00056, "grad_norm": 0.7066, "tokens_per_sec": 148203, "dt_s": 4.422, "eta_s": 10128, "world_size": 1, "timestamp": "2026-05-05T05:02:29.148999"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70040, "epoch": 0, "train_loss": 3.6985332369804382, "train_ppl": 40.38802123756541, "lr": 0.00056, "grad_norm": 0.713, "tokens_per_sec": 147653, "dt_s": 4.439, "eta_s": 10114, "world_size": 1, "timestamp": "2026-05-05T05:02:33.587526"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70050, "epoch": 0, "train_loss": 3.721670374274254, "train_ppl": 41.333378689967724, "lr": 0.00056, "grad_norm": 0.7504, "tokens_per_sec": 144844, "dt_s": 4.525, "eta_s": 10128, "world_size": 1, "timestamp": "2026-05-05T05:02:38.112124"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70060, "epoch": 0, "train_loss": 3.7038441449403763, "train_ppl": 40.603088897804795, "lr": 0.00056, "grad_norm": 0.6721, "tokens_per_sec": 149371, "dt_s": 4.387, "eta_s": 10097, "world_size": 1, "timestamp": "2026-05-05T05:02:42.499582"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70070, "epoch": 0, "train_loss": 3.6668549329042435, "train_ppl": 39.128649908455074, "lr": 0.00056, "grad_norm": 0.7045, "tokens_per_sec": 149871, "dt_s": 4.373, "eta_s": 10015, "world_size": 1, "timestamp": "2026-05-05T05:02:46.872402"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70080, "epoch": 0, "train_loss": 3.685469701886177, "train_ppl": 39.863842175032005, "lr": 0.00056, "grad_norm": 0.6838, "tokens_per_sec": 151276, "dt_s": 4.332, "eta_s": 9970, "world_size": 1, "timestamp": "2026-05-05T05:02:51.204645"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70090, "epoch": 0, "train_loss": 3.7548016905784607, "train_ppl": 42.72574605238265, "lr": 0.00056, "grad_norm": 0.7395, "tokens_per_sec": 149948, "dt_s": 4.371, "eta_s": 9935, "world_size": 1, "timestamp": "2026-05-05T05:02:55.575203"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70100, "epoch": 0, "train_loss": 3.5968176126480103, "train_ppl": 36.48194981478561, "lr": 0.00056, "grad_norm": 0.8215, "tokens_per_sec": 148340, "dt_s": 4.418, "eta_s": 9882, "world_size": 1, "timestamp": "2026-05-05T05:02:59.993157"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70110, "epoch": 0, "train_loss": 3.730491563677788, "train_ppl": 41.69960113611303, "lr": 0.00056, "grad_norm": 0.7043, "tokens_per_sec": 149485, "dt_s": 4.384, "eta_s": 9876, "world_size": 1, "timestamp": "2026-05-05T05:03:04.377330"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70120, "epoch": 0, "train_loss": 3.808683857321739, "train_ppl": 45.09105353514139, "lr": 0.00056, "grad_norm": 0.6597, "tokens_per_sec": 150041, "dt_s": 4.368, "eta_s": 9870, "world_size": 1, "timestamp": "2026-05-05T05:03:08.745142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70130, "epoch": 0, "train_loss": 3.784257709980011, "train_ppl": 44.00299544867501, "lr": 0.00056, "grad_norm": 0.6868, "tokens_per_sec": 144095, "dt_s": 4.548, "eta_s": 9963, "world_size": 1, "timestamp": "2026-05-05T05:03:13.293276"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70140, "epoch": 0, "train_loss": 3.703139007091522, "train_ppl": 40.57446821499182, "lr": 0.00056, "grad_norm": 0.6682, "tokens_per_sec": 149022, "dt_s": 4.398, "eta_s": 9971, "world_size": 1, "timestamp": "2026-05-05T05:03:17.690996"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70150, "epoch": 0, "train_loss": 3.7304254472255707, "train_ppl": 41.69684419756754, "lr": 0.00056, "grad_norm": 0.6319, "tokens_per_sec": 150291, "dt_s": 4.361, "eta_s": 9940, "world_size": 1, "timestamp": "2026-05-05T05:03:22.051601"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70160, "epoch": 0, "train_loss": 3.764937609434128, "train_ppl": 43.16101293583368, "lr": 0.00056, "grad_norm": 0.6418, "tokens_per_sec": 148332, "dt_s": 4.418, "eta_s": 9951, "world_size": 1, "timestamp": "2026-05-05T05:03:26.469833"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70170, "epoch": 0, "train_loss": 3.76124107837677, "train_ppl": 43.00176143145416, "lr": 0.00056, "grad_norm": 0.6488, "tokens_per_sec": 150391, "dt_s": 4.358, "eta_s": 9942, "world_size": 1, "timestamp": "2026-05-05T05:03:30.827535"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70180, "epoch": 0, "train_loss": 3.7178719341754913, "train_ppl": 41.17667413178613, "lr": 0.00056, "grad_norm": 0.6536, "tokens_per_sec": 151178, "dt_s": 4.335, "eta_s": 9842, "world_size": 1, "timestamp": "2026-05-05T05:03:35.162551"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70190, "epoch": 0, "train_loss": 3.767373725771904, "train_ppl": 43.266286361694455, "lr": 0.00056, "grad_norm": 0.6818, "tokens_per_sec": 149972, "dt_s": 4.37, "eta_s": 9825, "world_size": 1, "timestamp": "2026-05-05T05:03:39.532439"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70200, "epoch": 0, "train_loss": 3.7439459711313248, "train_ppl": 42.26443579773293, "lr": 0.00056, "grad_norm": 0.6957, "tokens_per_sec": 146627, "dt_s": 4.47, "eta_s": 9870, "world_size": 1, "timestamp": "2026-05-05T05:03:44.002031"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70210, "epoch": 0, "train_loss": 3.7462540566921234, "train_ppl": 42.362098395192334, "lr": 0.00056, "grad_norm": 0.6391, "tokens_per_sec": 145437, "dt_s": 4.506, "eta_s": 9905, "world_size": 1, "timestamp": "2026-05-05T05:03:48.508156"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70220, "epoch": 0, "train_loss": 3.7005331218242645, "train_ppl": 40.46887344971976, "lr": 0.00056, "grad_norm": 0.6627, "tokens_per_sec": 149163, "dt_s": 4.394, "eta_s": 9917, "world_size": 1, "timestamp": "2026-05-05T05:03:52.901756"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70230, "epoch": 0, "train_loss": 3.705075189471245, "train_ppl": 40.653103887352124, "lr": 0.00056, "grad_norm": 0.749, "tokens_per_sec": 133645, "dt_s": 4.904, "eta_s": 10168, "world_size": 1, "timestamp": "2026-05-05T05:03:57.805475"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70240, "epoch": 0, "train_loss": 3.666062593460083, "train_ppl": 39.09765901501085, "lr": 0.00056, "grad_norm": 0.7381, "tokens_per_sec": 144117, "dt_s": 4.547, "eta_s": 10243, "world_size": 1, "timestamp": "2026-05-05T05:04:02.352907"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70250, "epoch": 0, "train_loss": 3.687876284122467, "train_ppl": 39.95989332061328, "lr": 0.00056, "grad_norm": 0.6229, "tokens_per_sec": 148153, "dt_s": 4.424, "eta_s": 10218, "world_size": 1, "timestamp": "2026-05-05T05:04:06.776434"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70260, "epoch": 0, "train_loss": 3.7529095262289047, "train_ppl": 42.64497835586973, "lr": 0.00056, "grad_norm": 0.6704, "tokens_per_sec": 148017, "dt_s": 4.428, "eta_s": 10178, "world_size": 1, "timestamp": "2026-05-05T05:04:11.204036"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70270, "epoch": 0, "train_loss": 3.8243191987276077, "train_ppl": 45.80160795664764, "lr": 0.00056, "grad_norm": 0.6929, "tokens_per_sec": 148960, "dt_s": 4.4, "eta_s": 10176, "world_size": 1, "timestamp": "2026-05-05T05:04:15.603586"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70280, "epoch": 0, "train_loss": 3.761678695678711, "train_ppl": 43.02058386448129, "lr": 0.00056, "grad_norm": 0.6591, "tokens_per_sec": 148214, "dt_s": 4.422, "eta_s": 9955, "world_size": 1, "timestamp": "2026-05-05T05:04:20.025319"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70290, "epoch": 0, "train_loss": 3.69267301261425, "train_ppl": 40.15203052603608, "lr": 0.00056, "grad_norm": 0.6622, "tokens_per_sec": 147321, "dt_s": 4.449, "eta_s": 9907, "world_size": 1, "timestamp": "2026-05-05T05:04:24.473804"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70300, "epoch": 0, "train_loss": 3.6895647048950195, "train_ppl": 40.027419424761504, "lr": 0.00056, "grad_norm": 0.6973, "tokens_per_sec": 148915, "dt_s": 4.401, "eta_s": 9892, "world_size": 1, "timestamp": "2026-05-05T05:04:28.874707"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70310, "epoch": 0, "train_loss": 3.7559332698583603, "train_ppl": 42.774120986208274, "lr": 0.00056, "grad_norm": 0.6815, "tokens_per_sec": 148954, "dt_s": 4.4, "eta_s": 9875, "world_size": 1, "timestamp": "2026-05-05T05:04:33.274459"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70320, "epoch": 0, "train_loss": 3.7470176219940186, "train_ppl": 42.394456976014965, "lr": 0.00056, "grad_norm": 0.6317, "tokens_per_sec": 145529, "dt_s": 4.503, "eta_s": 9917, "world_size": 1, "timestamp": "2026-05-05T05:04:37.777765"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70330, "epoch": 0, "train_loss": 3.66155768930912, "train_ppl": 38.921923940653734, "lr": 0.00056, "grad_norm": 0.7367, "tokens_per_sec": 146150, "dt_s": 4.484, "eta_s": 9941, "world_size": 1, "timestamp": "2026-05-05T05:04:42.261913"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70340, "epoch": 0, "train_loss": 3.6133963465690613, "train_ppl": 37.09181578790152, "lr": 0.00056, "grad_norm": 0.6846, "tokens_per_sec": 147765, "dt_s": 4.435, "eta_s": 9930, "world_size": 1, "timestamp": "2026-05-05T05:04:46.697079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70350, "epoch": 0, "train_loss": 3.674401134252548, "train_ppl": 39.425039480350634, "lr": 0.00056, "grad_norm": 0.6502, "tokens_per_sec": 144141, "dt_s": 4.547, "eta_s": 9991, "world_size": 1, "timestamp": "2026-05-05T05:04:51.243724"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70360, "epoch": 0, "train_loss": 3.6931390315294266, "train_ppl": 40.1707464924024, "lr": 0.00056, "grad_norm": 0.7195, "tokens_per_sec": 149391, "dt_s": 4.387, "eta_s": 9981, "world_size": 1, "timestamp": "2026-05-05T05:04:55.630606"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70370, "epoch": 0, "train_loss": 3.6758051067590714, "train_ppl": 39.48043002615281, "lr": 0.00056, "grad_norm": 0.6155, "tokens_per_sec": 148669, "dt_s": 4.408, "eta_s": 9934, "world_size": 1, "timestamp": "2026-05-05T05:05:00.038815"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70380, "epoch": 0, "train_loss": 3.7545690834522247, "train_ppl": 42.71580889514871, "lr": 0.00056, "grad_norm": 0.7002, "tokens_per_sec": 148451, "dt_s": 4.415, "eta_s": 9898, "world_size": 1, "timestamp": "2026-05-05T05:05:04.453457"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70390, "epoch": 0, "train_loss": 3.6856826692819595, "train_ppl": 39.87233277776459, "lr": 0.00056, "grad_norm": 0.6784, "tokens_per_sec": 149290, "dt_s": 4.39, "eta_s": 9874, "world_size": 1, "timestamp": "2026-05-05T05:05:08.843318"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70400, "epoch": 0, "train_loss": 3.8014388233423233, "train_ppl": 44.76554789365687, "lr": 0.00056, "grad_norm": 0.6718, "tokens_per_sec": 144232, "dt_s": 4.544, "eta_s": 9868, "world_size": 1, "timestamp": "2026-05-05T05:05:13.387124"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70410, "epoch": 0, "train_loss": 3.673174425959587, "train_ppl": 39.37670610900455, "lr": 0.00056, "grad_norm": 0.6855, "tokens_per_sec": 148003, "dt_s": 4.428, "eta_s": 9882, "world_size": 1, "timestamp": "2026-05-05T05:05:17.815143"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70420, "epoch": 0, "train_loss": 3.683650940656662, "train_ppl": 39.79140525712128, "lr": 0.00056, "grad_norm": 0.6603, "tokens_per_sec": 150795, "dt_s": 4.346, "eta_s": 9850, "world_size": 1, "timestamp": "2026-05-05T05:05:22.161163"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70430, "epoch": 0, "train_loss": 3.6856218725442886, "train_ppl": 39.8699087436958, "lr": 0.00056, "grad_norm": 0.6462, "tokens_per_sec": 145421, "dt_s": 4.507, "eta_s": 9886, "world_size": 1, "timestamp": "2026-05-05T05:05:26.667813"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70440, "epoch": 0, "train_loss": 3.703207805752754, "train_ppl": 40.57725978011208, "lr": 0.00056, "grad_norm": 0.7589, "tokens_per_sec": 149837, "dt_s": 4.374, "eta_s": 9875, "world_size": 1, "timestamp": "2026-05-05T05:05:31.041650"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70450, "epoch": 0, "train_loss": 3.7018967419862747, "train_ppl": 40.524095263722224, "lr": 0.00056, "grad_norm": 0.6845, "tokens_per_sec": 149610, "dt_s": 4.38, "eta_s": 9798, "world_size": 1, "timestamp": "2026-05-05T05:05:35.422073"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70460, "epoch": 0, "train_loss": 3.6990924030542374, "train_ppl": 40.41061116400098, "lr": 0.00056, "grad_norm": 0.6346, "tokens_per_sec": 145959, "dt_s": 4.49, "eta_s": 9821, "world_size": 1, "timestamp": "2026-05-05T05:05:39.912126"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70470, "epoch": 0, "train_loss": 3.788926288485527, "train_ppl": 44.208907170927525, "lr": 0.00056, "grad_norm": 0.6511, "tokens_per_sec": 147735, "dt_s": 4.436, "eta_s": 9856, "world_size": 1, "timestamp": "2026-05-05T05:05:44.348148"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70480, "epoch": 0, "train_loss": 3.670403301715851, "train_ppl": 39.26773941392344, "lr": 0.00056, "grad_norm": 0.6286, "tokens_per_sec": 147384, "dt_s": 4.447, "eta_s": 9825, "world_size": 1, "timestamp": "2026-05-05T05:05:48.794780"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70490, "epoch": 0, "train_loss": 3.7397057712078094, "train_ppl": 42.08560554624206, "lr": 0.00056, "grad_norm": 0.6776, "tokens_per_sec": 149395, "dt_s": 4.387, "eta_s": 9827, "world_size": 1, "timestamp": "2026-05-05T05:05:53.181536"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70500, "epoch": 0, "train_loss": 3.6753692030906677, "train_ppl": 39.46322411220721, "lr": 0.00056, "grad_norm": 0.6398, "tokens_per_sec": 147437, "dt_s": 4.445, "eta_s": 9851, "world_size": 1, "timestamp": "2026-05-05T05:05:57.626547"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70510, "epoch": 0, "train_loss": 3.683539479970932, "train_ppl": 39.786970326969936, "lr": 0.00056, "grad_norm": 0.6413, "tokens_per_sec": 124333, "dt_s": 5.271, "eta_s": 9846, "world_size": 1, "timestamp": "2026-05-05T05:06:02.897571"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70520, "epoch": 0, "train_loss": 3.6317847222089767, "train_ppl": 37.78018361636967, "lr": 0.00056, "grad_norm": 0.6319, "tokens_per_sec": 151165, "dt_s": 4.335, "eta_s": 9797, "world_size": 1, "timestamp": "2026-05-05T05:06:07.232961"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70530, "epoch": 0, "train_loss": 3.748466908931732, "train_ppl": 42.455943253608936, "lr": 0.00056, "grad_norm": 0.6848, "tokens_per_sec": 133605, "dt_s": 4.905, "eta_s": 9996, "world_size": 1, "timestamp": "2026-05-05T05:06:12.138160"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70540, "epoch": 0, "train_loss": 3.692491590976715, "train_ppl": 40.14474673964589, "lr": 0.00056, "grad_norm": 0.636, "tokens_per_sec": 144365, "dt_s": 4.54, "eta_s": 10059, "world_size": 1, "timestamp": "2026-05-05T05:06:16.677771"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70550, "epoch": 0, "train_loss": 3.752799078822136, "train_ppl": 42.64026858869419, "lr": 0.00056, "grad_norm": 0.6361, "tokens_per_sec": 148882, "dt_s": 4.402, "eta_s": 10036, "world_size": 1, "timestamp": "2026-05-05T05:06:21.079671"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70560, "epoch": 0, "train_loss": 3.7572934180498123, "train_ppl": 42.83233971358158, "lr": 0.00056, "grad_norm": 0.6893, "tokens_per_sec": 146830, "dt_s": 4.463, "eta_s": 10019, "world_size": 1, "timestamp": "2026-05-05T05:06:25.543032"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70570, "epoch": 0, "train_loss": 3.6741739362478256, "train_ppl": 39.41608320750677, "lr": 0.00056, "grad_norm": 0.7445, "tokens_per_sec": 149973, "dt_s": 4.37, "eta_s": 10030, "world_size": 1, "timestamp": "2026-05-05T05:06:29.912877"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70580, "epoch": 0, "train_loss": 3.7700951993465424, "train_ppl": 43.38419478625813, "lr": 0.00056, "grad_norm": 0.6509, "tokens_per_sec": 145710, "dt_s": 4.498, "eta_s": 9845, "world_size": 1, "timestamp": "2026-05-05T05:06:34.410584"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70590, "epoch": 0, "train_loss": 3.6909832805395126, "train_ppl": 40.0842416408464, "lr": 0.00056, "grad_norm": 0.7101, "tokens_per_sec": 146958, "dt_s": 4.46, "eta_s": 9805, "world_size": 1, "timestamp": "2026-05-05T05:06:38.870082"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70600, "epoch": 0, "train_loss": 3.8150417804718018, "train_ppl": 45.37865228484333, "lr": 0.00056, "grad_norm": 0.7202, "tokens_per_sec": 140909, "dt_s": 4.651, "eta_s": 9911, "world_size": 1, "timestamp": "2026-05-05T05:06:43.521049"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70610, "epoch": 0, "train_loss": 3.534490615129471, "train_ppl": 34.277549801157726, "lr": 0.00056, "grad_norm": 0.7003, "tokens_per_sec": 139100, "dt_s": 4.711, "eta_s": 10016, "world_size": 1, "timestamp": "2026-05-05T05:06:48.232480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70620, "epoch": 0, "train_loss": 3.646291345357895, "train_ppl": 38.33224106596738, "lr": 0.00056, "grad_norm": 0.6753, "tokens_per_sec": 143061, "dt_s": 4.581, "eta_s": 10105, "world_size": 1, "timestamp": "2026-05-05T05:06:52.813465"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70630, "epoch": 0, "train_loss": 3.540806770324707, "train_ppl": 34.4947372983476, "lr": 0.00056, "grad_norm": 0.722, "tokens_per_sec": 144854, "dt_s": 4.524, "eta_s": 10112, "world_size": 1, "timestamp": "2026-05-05T05:06:57.337761"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70640, "epoch": 0, "train_loss": 3.683399870991707, "train_ppl": 39.781416096375374, "lr": 0.00056, "grad_norm": 0.6542, "tokens_per_sec": 140321, "dt_s": 4.67, "eta_s": 10200, "world_size": 1, "timestamp": "2026-05-05T05:07:02.008201"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70650, "epoch": 0, "train_loss": 3.7319881469011307, "train_ppl": 41.7620547814756, "lr": 0.00056, "grad_norm": 0.6683, "tokens_per_sec": 136042, "dt_s": 4.817, "eta_s": 10269, "world_size": 1, "timestamp": "2026-05-05T05:07:06.825532"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70660, "epoch": 0, "train_loss": 3.7266706824302673, "train_ppl": 41.54057591381701, "lr": 0.00056, "grad_norm": 0.6338, "tokens_per_sec": 141303, "dt_s": 4.638, "eta_s": 10232, "world_size": 1, "timestamp": "2026-05-05T05:07:11.463493"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70670, "epoch": 0, "train_loss": 3.7244144529104233, "train_ppl": 41.44695649333369, "lr": 0.00056, "grad_norm": 0.6202, "tokens_per_sec": 139814, "dt_s": 4.687, "eta_s": 10274, "world_size": 1, "timestamp": "2026-05-05T05:07:16.150887"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70680, "epoch": 0, "train_loss": 3.734243705868721, "train_ppl": 41.856357871752486, "lr": 0.00056, "grad_norm": 0.6486, "tokens_per_sec": 140949, "dt_s": 4.65, "eta_s": 10325, "world_size": 1, "timestamp": "2026-05-05T05:07:20.800518"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70690, "epoch": 0, "train_loss": 3.7227573543787003, "train_ppl": 41.378331677328624, "lr": 0.00056, "grad_norm": 0.6417, "tokens_per_sec": 140163, "dt_s": 4.676, "eta_s": 10322, "world_size": 1, "timestamp": "2026-05-05T05:07:25.476214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70700, "epoch": 0, "train_loss": 3.6796159744262695, "train_ppl": 39.631171766407036, "lr": 0.00056, "grad_norm": 0.704, "tokens_per_sec": 140294, "dt_s": 4.671, "eta_s": 10253, "world_size": 1, "timestamp": "2026-05-05T05:07:30.147525"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70710, "epoch": 0, "train_loss": 3.663645416498184, "train_ppl": 39.00326718121755, "lr": 0.00056, "grad_norm": Infinity, "tokens_per_sec": 139684, "dt_s": 4.692, "eta_s": 10272, "world_size": 1, "timestamp": "2026-05-05T05:07:34.839203"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70720, "epoch": 0, "train_loss": 3.696037545800209, "train_ppl": 40.28735088248644, "lr": 0.00056, "grad_norm": 0.7989, "tokens_per_sec": 139088, "dt_s": 4.712, "eta_s": 10278, "world_size": 1, "timestamp": "2026-05-05T05:07:39.551117"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70730, "epoch": 0, "train_loss": 3.7941757142543793, "train_ppl": 44.44158873542037, "lr": 0.00056, "grad_norm": 0.6561, "tokens_per_sec": 140313, "dt_s": 4.671, "eta_s": 10283, "world_size": 1, "timestamp": "2026-05-05T05:07:44.221782"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70740, "epoch": 0, "train_loss": 3.6786724776029587, "train_ppl": 39.59379751575723, "lr": 0.00056, "grad_norm": 0.672, "tokens_per_sec": 140879, "dt_s": 4.652, "eta_s": 10268, "world_size": 1, "timestamp": "2026-05-05T05:07:48.873706"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70750, "epoch": 0, "train_loss": 3.7316467463970184, "train_ppl": 41.74779962841712, "lr": 0.00056, "grad_norm": 0.7733, "tokens_per_sec": 139554, "dt_s": 4.696, "eta_s": 10274, "world_size": 1, "timestamp": "2026-05-05T05:07:53.569823"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70760, "epoch": 0, "train_loss": 3.727612778544426, "train_ppl": 41.579729569322794, "lr": 0.00056, "grad_norm": 0.6914, "tokens_per_sec": 142255, "dt_s": 4.607, "eta_s": 10232, "world_size": 1, "timestamp": "2026-05-05T05:07:58.176776"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70770, "epoch": 0, "train_loss": 3.714006468653679, "train_ppl": 41.01781434893803, "lr": 0.00056, "grad_norm": 0.6792, "tokens_per_sec": 139290, "dt_s": 4.705, "eta_s": 10224, "world_size": 1, "timestamp": "2026-05-05T05:08:02.881804"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70780, "epoch": 0, "train_loss": 3.6663858592510223, "train_ppl": 39.11029999376413, "lr": 0.00056, "grad_norm": 0.6754, "tokens_per_sec": 143454, "dt_s": 4.568, "eta_s": 10175, "world_size": 1, "timestamp": "2026-05-05T05:08:07.450254"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70790, "epoch": 0, "train_loss": 3.646473988890648, "train_ppl": 38.339242841289035, "lr": 0.00056, "grad_norm": 0.6339, "tokens_per_sec": 140377, "dt_s": 4.669, "eta_s": 10178, "world_size": 1, "timestamp": "2026-05-05T05:08:12.118788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70800, "epoch": 0, "train_loss": 3.7339948266744614, "train_ppl": 41.845941991332445, "lr": 0.00056, "grad_norm": 0.6999, "tokens_per_sec": 138530, "dt_s": 4.731, "eta_s": 10188, "world_size": 1, "timestamp": "2026-05-05T05:08:16.849630"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70810, "epoch": 0, "train_loss": 3.7256049066781998, "train_ppl": 41.496326559414115, "lr": 0.00056, "grad_norm": 0.6364, "tokens_per_sec": 142231, "dt_s": 4.608, "eta_s": 10184, "world_size": 1, "timestamp": "2026-05-05T05:08:21.457348"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70820, "epoch": 0, "train_loss": 3.7477849572896957, "train_ppl": 42.427000223388106, "lr": 0.00056, "grad_norm": 0.7781, "tokens_per_sec": 126015, "dt_s": 5.201, "eta_s": 10396, "world_size": 1, "timestamp": "2026-05-05T05:08:26.658096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70830, "epoch": 0, "train_loss": 3.6761996895074844, "train_ppl": 39.496011396608885, "lr": 0.00056, "grad_norm": 0.6557, "tokens_per_sec": 140687, "dt_s": 4.658, "eta_s": 10430, "world_size": 1, "timestamp": "2026-05-05T05:08:31.316274"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70840, "epoch": 0, "train_loss": 3.720022067427635, "train_ppl": 41.26530471768906, "lr": 0.00056, "grad_norm": 0.6637, "tokens_per_sec": 143430, "dt_s": 4.569, "eta_s": 10382, "world_size": 1, "timestamp": "2026-05-05T05:08:35.885465"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70850, "epoch": 0, "train_loss": 3.6442739069461823, "train_ppl": 38.254986085219386, "lr": 0.00056, "grad_norm": 0.6342, "tokens_per_sec": 139502, "dt_s": 4.698, "eta_s": 10363, "world_size": 1, "timestamp": "2026-05-05T05:08:40.583331"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70860, "epoch": 0, "train_loss": 3.7512183785438538, "train_ppl": 42.57292034698532, "lr": 0.00056, "grad_norm": 0.6449, "tokens_per_sec": 143309, "dt_s": 4.573, "eta_s": 10343, "world_size": 1, "timestamp": "2026-05-05T05:08:45.156384"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70870, "epoch": 0, "train_loss": 3.7200231552124023, "train_ppl": 41.26534960548336, "lr": 0.00056, "grad_norm": 0.6251, "tokens_per_sec": 140774, "dt_s": 4.655, "eta_s": 10101, "world_size": 1, "timestamp": "2026-05-05T05:08:49.811798"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70880, "epoch": 0, "train_loss": 3.7177844792604446, "train_ppl": 41.173073186710496, "lr": 0.00056, "grad_norm": 0.6671, "tokens_per_sec": 142492, "dt_s": 4.599, "eta_s": 10070, "world_size": 1, "timestamp": "2026-05-05T05:08:54.411058"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70890, "epoch": 0, "train_loss": 3.785909429192543, "train_ppl": 44.07573609869369, "lr": 0.00056, "grad_norm": 4.0779, "tokens_per_sec": 142526, "dt_s": 4.598, "eta_s": 10078, "world_size": 1, "timestamp": "2026-05-05T05:08:59.009280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70900, "epoch": 0, "train_loss": 3.5710769444704056, "train_ppl": 35.55486315386558, "lr": 0.00056, "grad_norm": 0.6554, "tokens_per_sec": 138525, "dt_s": 4.731, "eta_s": 10088, "world_size": 1, "timestamp": "2026-05-05T05:09:03.740256"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70910, "epoch": 0, "train_loss": 3.6797184199094772, "train_ppl": 39.63523200892194, "lr": 0.00056, "grad_norm": 0.6686, "tokens_per_sec": 142968, "dt_s": 4.584, "eta_s": 10088, "world_size": 1, "timestamp": "2026-05-05T05:09:08.324219"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70920, "epoch": 0, "train_loss": 3.6613594442605972, "train_ppl": 38.91420862674017, "lr": 0.00056, "grad_norm": 0.6881, "tokens_per_sec": 140345, "dt_s": 4.67, "eta_s": 10090, "world_size": 1, "timestamp": "2026-05-05T05:09:12.993824"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70930, "epoch": 0, "train_loss": 3.7068233639001846, "train_ppl": 40.72423476049646, "lr": 0.00056, "grad_norm": 0.6803, "tokens_per_sec": 142764, "dt_s": 4.591, "eta_s": 10081, "world_size": 1, "timestamp": "2026-05-05T05:09:17.584378"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70940, "epoch": 0, "train_loss": 3.7286774069070816, "train_ppl": 41.624020101028606, "lr": 0.00056, "grad_norm": 0.705, "tokens_per_sec": 139615, "dt_s": 4.694, "eta_s": 10118, "world_size": 1, "timestamp": "2026-05-05T05:09:22.278415"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70950, "epoch": 0, "train_loss": 3.6670270413160324, "train_ppl": 39.13538485780036, "lr": 0.00056, "grad_norm": 0.6901, "tokens_per_sec": 142806, "dt_s": 4.589, "eta_s": 10052, "world_size": 1, "timestamp": "2026-05-05T05:09:26.867565"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70960, "epoch": 0, "train_loss": 3.6564434468746185, "train_ppl": 38.723375929695074, "lr": 0.00056, "grad_norm": 0.6324, "tokens_per_sec": 143519, "dt_s": 4.566, "eta_s": 10040, "world_size": 1, "timestamp": "2026-05-05T05:09:31.433938"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70970, "epoch": 0, "train_loss": 3.7317796647548676, "train_ppl": 41.75334904618912, "lr": 0.00056, "grad_norm": 0.7381, "tokens_per_sec": 144281, "dt_s": 4.542, "eta_s": 9980, "world_size": 1, "timestamp": "2026-05-05T05:09:35.976170"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70980, "epoch": 0, "train_loss": 3.590743914246559, "train_ppl": 36.261041000376835, "lr": 0.00056, "grad_norm": 0.7129, "tokens_per_sec": 139483, "dt_s": 4.699, "eta_s": 10022, "world_size": 1, "timestamp": "2026-05-05T05:09:40.674694"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 70990, "epoch": 0, "train_loss": 3.6741746813058853, "train_ppl": 39.41611257478819, "lr": 0.00056, "grad_norm": 0.6505, "tokens_per_sec": 142157, "dt_s": 4.61, "eta_s": 9981, "world_size": 1, "timestamp": "2026-05-05T05:09:45.284784"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71000, "epoch": 0, "train_loss": 3.721176326274872, "train_ppl": 41.31296306048395, "lr": 0.00056, "grad_norm": 0.723, "tokens_per_sec": 141640, "dt_s": 4.627, "eta_s": 9993, "world_size": 1, "timestamp": "2026-05-05T05:09:49.911770"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71010, "epoch": 0, "train_loss": 3.7872314751148224, "train_ppl": 44.13404478075942, "lr": 0.00056, "grad_norm": 0.6698, "tokens_per_sec": 122002, "dt_s": 5.372, "eta_s": 9989, "world_size": 1, "timestamp": "2026-05-05T05:09:55.283440"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71020, "epoch": 0, "train_loss": 3.7716455459594727, "train_ppl": 43.4515074912292, "lr": 0.00056, "grad_norm": 0.6112, "tokens_per_sec": 143789, "dt_s": 4.558, "eta_s": 9991, "world_size": 1, "timestamp": "2026-05-05T05:09:59.841236"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71030, "epoch": 0, "train_loss": 3.705175891518593, "train_ppl": 40.65719794428113, "lr": 0.00056, "grad_norm": 0.6488, "tokens_per_sec": 143895, "dt_s": 4.554, "eta_s": 9924, "world_size": 1, "timestamp": "2026-05-05T05:10:04.395661"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71040, "epoch": 0, "train_loss": 3.6489825397729874, "train_ppl": 38.43553951480124, "lr": 0.00056, "grad_norm": 0.6177, "tokens_per_sec": 146734, "dt_s": 4.466, "eta_s": 9857, "world_size": 1, "timestamp": "2026-05-05T05:10:08.861956"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71050, "epoch": 0, "train_loss": 3.7642174065113068, "train_ppl": 43.129939439120086, "lr": 0.00056, "grad_norm": 0.6682, "tokens_per_sec": 145149, "dt_s": 4.515, "eta_s": 9804, "world_size": 1, "timestamp": "2026-05-05T05:10:13.377050"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71060, "epoch": 0, "train_loss": 3.6812146455049515, "train_ppl": 39.6945796451907, "lr": 0.00056, "grad_norm": 0.6529, "tokens_per_sec": 145051, "dt_s": 4.518, "eta_s": 9778, "world_size": 1, "timestamp": "2026-05-05T05:10:17.895221"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71070, "epoch": 0, "train_loss": 3.646504908800125, "train_ppl": 38.34042830553424, "lr": 0.00056, "grad_norm": 0.6676, "tokens_per_sec": 142965, "dt_s": 4.584, "eta_s": 9785, "world_size": 1, "timestamp": "2026-05-05T05:10:22.479288"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71080, "epoch": 0, "train_loss": 3.6266602277755737, "train_ppl": 37.587074491646064, "lr": 0.00056, "grad_norm": 0.6058, "tokens_per_sec": 141657, "dt_s": 4.626, "eta_s": 9812, "world_size": 1, "timestamp": "2026-05-05T05:10:27.105662"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71090, "epoch": 0, "train_loss": 3.774802476167679, "train_ppl": 43.588897619238196, "lr": 0.00056, "grad_norm": 0.6502, "tokens_per_sec": 138545, "dt_s": 4.73, "eta_s": 9921, "world_size": 1, "timestamp": "2026-05-05T05:10:31.835969"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71100, "epoch": 0, "train_loss": 3.7453405559062958, "train_ppl": 42.323418254878135, "lr": 0.00056, "grad_norm": 0.6685, "tokens_per_sec": 141726, "dt_s": 4.624, "eta_s": 9964, "world_size": 1, "timestamp": "2026-05-05T05:10:36.460101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71110, "epoch": 0, "train_loss": 3.736751988530159, "train_ppl": 41.9614772278893, "lr": 0.00056, "grad_norm": 0.7099, "tokens_per_sec": 140272, "dt_s": 4.672, "eta_s": 10025, "world_size": 1, "timestamp": "2026-05-05T05:10:41.132157"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71120, "epoch": 0, "train_loss": 3.7848724871873856, "train_ppl": 44.03005580452577, "lr": 0.00056, "grad_norm": 0.7435, "tokens_per_sec": 127140, "dt_s": 5.155, "eta_s": 10267, "world_size": 1, "timestamp": "2026-05-05T05:10:46.286788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71130, "epoch": 0, "train_loss": 3.7456433326005936, "train_ppl": 42.33623473971727, "lr": 0.00056, "grad_norm": 0.6999, "tokens_per_sec": 140806, "dt_s": 4.654, "eta_s": 10274, "world_size": 1, "timestamp": "2026-05-05T05:10:50.941092"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71140, "epoch": 0, "train_loss": 3.7307641357183456, "train_ppl": 41.71096883067262, "lr": 0.00056, "grad_norm": 0.6431, "tokens_per_sec": 139964, "dt_s": 4.682, "eta_s": 10249, "world_size": 1, "timestamp": "2026-05-05T05:10:55.623446"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71150, "epoch": 0, "train_loss": 3.6284831017255783, "train_ppl": 37.65565347702683, "lr": 0.00056, "grad_norm": 0.6472, "tokens_per_sec": 144237, "dt_s": 4.544, "eta_s": 10209, "world_size": 1, "timestamp": "2026-05-05T05:11:00.167075"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71160, "epoch": 0, "train_loss": 3.616035535931587, "train_ppl": 37.189837405474925, "lr": 0.00056, "grad_norm": 0.7025, "tokens_per_sec": 146116, "dt_s": 4.485, "eta_s": 10124, "world_size": 1, "timestamp": "2026-05-05T05:11:04.652279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71170, "epoch": 0, "train_loss": 3.6934187412261963, "train_ppl": 40.18198421129901, "lr": 0.00056, "grad_norm": 0.6409, "tokens_per_sec": 147431, "dt_s": 4.445, "eta_s": 9814, "world_size": 1, "timestamp": "2026-05-05T05:11:09.097473"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71180, "epoch": 0, "train_loss": 3.7053800225257874, "train_ppl": 40.66549818618675, "lr": 0.00056, "grad_norm": 0.6501, "tokens_per_sec": 148926, "dt_s": 4.401, "eta_s": 9700, "world_size": 1, "timestamp": "2026-05-05T05:11:13.498025"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71190, "epoch": 0, "train_loss": 3.7154353708028793, "train_ppl": 41.076466686262656, "lr": 0.00056, "grad_norm": 0.6705, "tokens_per_sec": 148013, "dt_s": 4.428, "eta_s": 9586, "world_size": 1, "timestamp": "2026-05-05T05:11:17.925781"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71200, "epoch": 0, "train_loss": 3.698074549436569, "train_ppl": 40.36950000335667, "lr": 0.00056, "grad_norm": 0.6882, "tokens_per_sec": 148622, "dt_s": 4.41, "eta_s": 9524, "world_size": 1, "timestamp": "2026-05-05T05:11:22.335330"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71210, "epoch": 0, "train_loss": 3.696021094918251, "train_ppl": 40.286688125484154, "lr": 0.00056, "grad_norm": 0.7474, "tokens_per_sec": 149768, "dt_s": 4.376, "eta_s": 9473, "world_size": 1, "timestamp": "2026-05-05T05:11:26.711160"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71220, "epoch": 0, "train_loss": 3.6322893649339676, "train_ppl": 37.79925392262143, "lr": 0.00056, "grad_norm": 0.635, "tokens_per_sec": 148328, "dt_s": 4.418, "eta_s": 9457, "world_size": 1, "timestamp": "2026-05-05T05:11:31.129481"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71230, "epoch": 0, "train_loss": 3.750579059123993, "train_ppl": 42.545711350793894, "lr": 0.00056, "grad_norm": 0.6921, "tokens_per_sec": 149607, "dt_s": 4.381, "eta_s": 9444, "world_size": 1, "timestamp": "2026-05-05T05:11:35.510045"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71240, "epoch": 0, "train_loss": 3.643330231308937, "train_ppl": 38.21890281497688, "lr": 0.00056, "grad_norm": 0.6207, "tokens_per_sec": 149369, "dt_s": 4.388, "eta_s": 9422, "world_size": 1, "timestamp": "2026-05-05T05:11:39.897545"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71250, "epoch": 0, "train_loss": 3.767793670296669, "train_ppl": 43.284459617371496, "lr": 0.00056, "grad_norm": 0.631, "tokens_per_sec": 146074, "dt_s": 4.486, "eta_s": 9451, "world_size": 1, "timestamp": "2026-05-05T05:11:44.384093"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71260, "epoch": 0, "train_loss": 3.7731604278087616, "train_ppl": 43.51738127415332, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 148179, "dt_s": 4.423, "eta_s": 9467, "world_size": 1, "timestamp": "2026-05-05T05:11:48.806793"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71270, "epoch": 0, "train_loss": 3.7105407118797302, "train_ppl": 40.87590263878833, "lr": 0.00056, "grad_norm": 0.6363, "tokens_per_sec": 146036, "dt_s": 4.488, "eta_s": 9492, "world_size": 1, "timestamp": "2026-05-05T05:11:53.294459"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71280, "epoch": 0, "train_loss": 3.8099596351385117, "train_ppl": 45.14861641188955, "lr": 0.00056, "grad_norm": 0.6772, "tokens_per_sec": 149433, "dt_s": 4.386, "eta_s": 9490, "world_size": 1, "timestamp": "2026-05-05T05:11:57.680126"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71290, "epoch": 0, "train_loss": 3.6261232793331146, "train_ppl": 37.566897588004444, "lr": 0.00056, "grad_norm": 0.68, "tokens_per_sec": 145693, "dt_s": 4.498, "eta_s": 9533, "world_size": 1, "timestamp": "2026-05-05T05:12:02.178374"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71300, "epoch": 0, "train_loss": 3.7121952027082443, "train_ppl": 40.9435874212977, "lr": 0.00056, "grad_norm": 0.6705, "tokens_per_sec": 146770, "dt_s": 4.465, "eta_s": 9519, "world_size": 1, "timestamp": "2026-05-05T05:12:06.643549"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71310, "epoch": 0, "train_loss": 3.753700390458107, "train_ppl": 42.67871808381988, "lr": 0.00056, "grad_norm": 0.7072, "tokens_per_sec": 149794, "dt_s": 4.375, "eta_s": 9494, "world_size": 1, "timestamp": "2026-05-05T05:12:11.018632"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71320, "epoch": 0, "train_loss": 3.7197336554527283, "train_ppl": 41.25340502574961, "lr": 0.00056, "grad_norm": 0.6667, "tokens_per_sec": 143621, "dt_s": 4.563, "eta_s": 9522, "world_size": 1, "timestamp": "2026-05-05T05:12:15.581772"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71330, "epoch": 0, "train_loss": 3.791123166680336, "train_ppl": 44.30613551541842, "lr": 0.00056, "grad_norm": 0.6691, "tokens_per_sec": 145343, "dt_s": 4.509, "eta_s": 9570, "world_size": 1, "timestamp": "2026-05-05T05:12:20.090814"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71340, "epoch": 0, "train_loss": 3.7799991965293884, "train_ppl": 43.81600653068626, "lr": 0.00056, "grad_norm": 0.6489, "tokens_per_sec": 146492, "dt_s": 4.474, "eta_s": 9555, "world_size": 1, "timestamp": "2026-05-05T05:12:24.564491"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71350, "epoch": 0, "train_loss": 3.7883077561855316, "train_ppl": 44.181570988922104, "lr": 0.00056, "grad_norm": 0.6896, "tokens_per_sec": 144681, "dt_s": 4.53, "eta_s": 9578, "world_size": 1, "timestamp": "2026-05-05T05:12:29.094216"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71360, "epoch": 0, "train_loss": 3.752100870013237, "train_ppl": 42.61050716860363, "lr": 0.00056, "grad_norm": 0.6577, "tokens_per_sec": 145043, "dt_s": 4.518, "eta_s": 9635, "world_size": 1, "timestamp": "2026-05-05T05:12:33.612591"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71370, "epoch": 0, "train_loss": 3.7162203937768936, "train_ppl": 41.108725316529, "lr": 0.00056, "grad_norm": 0.6294, "tokens_per_sec": 145755, "dt_s": 4.496, "eta_s": 9602, "world_size": 1, "timestamp": "2026-05-05T05:12:38.108897"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71380, "epoch": 0, "train_loss": 3.6740054339170456, "train_ppl": 39.40944206515579, "lr": 0.00056, "grad_norm": 0.6733, "tokens_per_sec": 144871, "dt_s": 4.524, "eta_s": 9604, "world_size": 1, "timestamp": "2026-05-05T05:12:42.632637"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71390, "epoch": 0, "train_loss": 3.7304667085409164, "train_ppl": 41.69856469969975, "lr": 0.00056, "grad_norm": 0.684, "tokens_per_sec": 145273, "dt_s": 4.511, "eta_s": 9615, "world_size": 1, "timestamp": "2026-05-05T05:12:47.143873"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71400, "epoch": 0, "train_loss": 3.8707790970802307, "train_ppl": 47.9797524079422, "lr": 0.00056, "grad_norm": 0.727, "tokens_per_sec": 145513, "dt_s": 4.504, "eta_s": 9600, "world_size": 1, "timestamp": "2026-05-05T05:12:51.647650"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71410, "epoch": 0, "train_loss": 3.748596578836441, "train_ppl": 42.46144886867353, "lr": 0.00056, "grad_norm": 0.6682, "tokens_per_sec": 129469, "dt_s": 5.062, "eta_s": 9826, "world_size": 1, "timestamp": "2026-05-05T05:12:56.709584"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71420, "epoch": 0, "train_loss": 3.6593465507030487, "train_ppl": 38.835957249166384, "lr": 0.00056, "grad_norm": 0.6897, "tokens_per_sec": 149608, "dt_s": 4.381, "eta_s": 9773, "world_size": 1, "timestamp": "2026-05-05T05:13:01.090058"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71430, "epoch": 0, "train_loss": 3.8018986135721207, "train_ppl": 44.78613538781154, "lr": 0.00056, "grad_norm": 0.7104, "tokens_per_sec": 148754, "dt_s": 4.406, "eta_s": 9718, "world_size": 1, "timestamp": "2026-05-05T05:13:05.495761"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71440, "epoch": 0, "train_loss": 3.7467190325260162, "train_ppl": 42.38180032732528, "lr": 0.00056, "grad_norm": 0.6936, "tokens_per_sec": 148757, "dt_s": 4.406, "eta_s": 9668, "world_size": 1, "timestamp": "2026-05-05T05:13:09.901326"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71450, "epoch": 0, "train_loss": 3.7390195727348328, "train_ppl": 42.05673637410565, "lr": 0.00056, "grad_norm": 0.7037, "tokens_per_sec": 148941, "dt_s": 4.4, "eta_s": 9620, "world_size": 1, "timestamp": "2026-05-05T05:13:14.301448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71460, "epoch": 0, "train_loss": 3.7532997131347656, "train_ppl": 42.66162111470705, "lr": 0.00056, "grad_norm": 0.669, "tokens_per_sec": 148879, "dt_s": 4.402, "eta_s": 9335, "world_size": 1, "timestamp": "2026-05-05T05:13:18.703418"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71470, "epoch": 0, "train_loss": 3.6356966495513916, "train_ppl": 37.92826640537092, "lr": 0.00056, "grad_norm": 0.6607, "tokens_per_sec": 149423, "dt_s": 4.386, "eta_s": 9333, "world_size": 1, "timestamp": "2026-05-05T05:13:23.089380"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71480, "epoch": 0, "train_loss": 3.614826574921608, "train_ppl": 37.144903509221436, "lr": 0.00056, "grad_norm": 0.7526, "tokens_per_sec": 151157, "dt_s": 4.336, "eta_s": 9299, "world_size": 1, "timestamp": "2026-05-05T05:13:27.425002"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71490, "epoch": 0, "train_loss": 3.6530566662549973, "train_ppl": 38.59245018410522, "lr": 0.00056, "grad_norm": 0.6504, "tokens_per_sec": 147905, "dt_s": 4.431, "eta_s": 9305, "world_size": 1, "timestamp": "2026-05-05T05:13:31.855973"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71500, "epoch": 0, "train_loss": 3.692489519715309, "train_ppl": 40.14466358946743, "lr": 0.00056, "grad_norm": 0.652, "tokens_per_sec": 148071, "dt_s": 4.426, "eta_s": 9312, "world_size": 1, "timestamp": "2026-05-05T05:13:36.281952"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71510, "epoch": 0, "train_loss": 3.7678215205669403, "train_ppl": 43.28566511805707, "lr": 0.00056, "grad_norm": 0.6646, "tokens_per_sec": 125948, "dt_s": 5.203, "eta_s": 9315, "world_size": 1, "timestamp": "2026-05-05T05:13:41.485359"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71520, "epoch": 0, "train_loss": 3.722752124071121, "train_ppl": 41.37811525649282, "lr": 0.00056, "grad_norm": 0.6493, "tokens_per_sec": 150111, "dt_s": 4.366, "eta_s": 9302, "world_size": 1, "timestamp": "2026-05-05T05:13:45.851162"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71530, "epoch": 0, "train_loss": 3.744908109307289, "train_ppl": 42.30511959347759, "lr": 0.00056, "grad_norm": 0.7208, "tokens_per_sec": 147712, "dt_s": 4.437, "eta_s": 9340, "world_size": 1, "timestamp": "2026-05-05T05:13:50.287942"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71540, "epoch": 0, "train_loss": 3.666331857442856, "train_ppl": 39.10818802387214, "lr": 0.00056, "grad_norm": 0.6593, "tokens_per_sec": 143596, "dt_s": 4.564, "eta_s": 9392, "world_size": 1, "timestamp": "2026-05-05T05:13:54.851832"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71550, "epoch": 0, "train_loss": 3.8406182527542114, "train_ppl": 46.554247836201874, "lr": 0.00056, "grad_norm": 0.6633, "tokens_per_sec": 147343, "dt_s": 4.448, "eta_s": 9397, "world_size": 1, "timestamp": "2026-05-05T05:13:59.299690"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71560, "epoch": 0, "train_loss": 3.751415401697159, "train_ppl": 42.58130902435219, "lr": 0.00056, "grad_norm": 0.6477, "tokens_per_sec": 151080, "dt_s": 4.338, "eta_s": 9358, "world_size": 1, "timestamp": "2026-05-05T05:14:03.637522"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71570, "epoch": 0, "train_loss": 3.635424092411995, "train_ppl": 37.91793019424526, "lr": 0.00056, "grad_norm": 0.6398, "tokens_per_sec": 146666, "dt_s": 4.468, "eta_s": 9397, "world_size": 1, "timestamp": "2026-05-05T05:14:08.105923"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71580, "epoch": 0, "train_loss": 3.7782967388629913, "train_ppl": 43.74147509576517, "lr": 0.00056, "grad_norm": 0.7218, "tokens_per_sec": 149998, "dt_s": 4.369, "eta_s": 9364, "world_size": 1, "timestamp": "2026-05-05T05:14:12.475052"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71590, "epoch": 0, "train_loss": 3.7211142778396606, "train_ppl": 41.3103997352981, "lr": 0.00056, "grad_norm": 0.66, "tokens_per_sec": 149769, "dt_s": 4.376, "eta_s": 9280, "world_size": 1, "timestamp": "2026-05-05T05:14:16.850854"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71600, "epoch": 0, "train_loss": 3.705541178584099, "train_ppl": 40.6720522056795, "lr": 0.00056, "grad_norm": 0.6333, "tokens_per_sec": 145982, "dt_s": 4.489, "eta_s": 9293, "world_size": 1, "timestamp": "2026-05-05T05:14:21.340193"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71610, "epoch": 0, "train_loss": 3.6752118319272995, "train_ppl": 39.45701422735962, "lr": 0.00056, "grad_norm": 0.6751, "tokens_per_sec": 150488, "dt_s": 4.355, "eta_s": 9296, "world_size": 1, "timestamp": "2026-05-05T05:14:25.695077"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71620, "epoch": 0, "train_loss": 3.7123513519763947, "train_ppl": 40.9499812316904, "lr": 0.00056, "grad_norm": 0.6519, "tokens_per_sec": 147957, "dt_s": 4.429, "eta_s": 9275, "world_size": 1, "timestamp": "2026-05-05T05:14:30.124491"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71630, "epoch": 0, "train_loss": 3.604460299015045, "train_ppl": 36.76183810210664, "lr": 0.00056, "grad_norm": 0.6173, "tokens_per_sec": 149278, "dt_s": 4.39, "eta_s": 9280, "world_size": 1, "timestamp": "2026-05-05T05:14:34.514679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71640, "epoch": 0, "train_loss": 3.70352540910244, "train_ppl": 40.590149300508386, "lr": 0.00056, "grad_norm": 0.7177, "tokens_per_sec": 149527, "dt_s": 4.383, "eta_s": 9278, "world_size": 1, "timestamp": "2026-05-05T05:14:38.897552"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71650, "epoch": 0, "train_loss": 3.7256885021924973, "train_ppl": 41.49979561117087, "lr": 0.00056, "grad_norm": 0.6435, "tokens_per_sec": 147457, "dt_s": 4.444, "eta_s": 9255, "world_size": 1, "timestamp": "2026-05-05T05:14:43.341946"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71660, "epoch": 0, "train_loss": 3.8324384093284607, "train_ppl": 46.17499460875309, "lr": 0.00056, "grad_norm": 0.6734, "tokens_per_sec": 148611, "dt_s": 4.41, "eta_s": 9274, "world_size": 1, "timestamp": "2026-05-05T05:14:47.751870"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71670, "epoch": 0, "train_loss": 3.7171046137809753, "train_ppl": 41.14509054885978, "lr": 0.00056, "grad_norm": 0.7368, "tokens_per_sec": 149939, "dt_s": 4.371, "eta_s": 9245, "world_size": 1, "timestamp": "2026-05-05T05:14:52.122715"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71680, "epoch": 0, "train_loss": 3.6606182008981705, "train_ppl": 38.88537441579547, "lr": 0.00056, "grad_norm": 0.6388, "tokens_per_sec": 145750, "dt_s": 4.496, "eta_s": 9285, "world_size": 1, "timestamp": "2026-05-05T05:14:56.619155"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71690, "epoch": 0, "train_loss": 3.7960177063941956, "train_ppl": 44.52352523257672, "lr": 0.00056, "grad_norm": 0.7332, "tokens_per_sec": 149487, "dt_s": 4.384, "eta_s": 9281, "world_size": 1, "timestamp": "2026-05-05T05:15:01.003230"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71700, "epoch": 0, "train_loss": 3.6171285808086395, "train_ppl": 37.2305097910513, "lr": 0.00056, "grad_norm": 0.6439, "tokens_per_sec": 149119, "dt_s": 4.395, "eta_s": 9256, "world_size": 1, "timestamp": "2026-05-05T05:15:05.398114"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71710, "epoch": 0, "train_loss": 3.721600651741028, "train_ppl": 41.3304969225619, "lr": 0.00056, "grad_norm": 0.6805, "tokens_per_sec": 131464, "dt_s": 4.985, "eta_s": 9493, "world_size": 1, "timestamp": "2026-05-05T05:15:10.383210"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71720, "epoch": 0, "train_loss": 3.6868010461330414, "train_ppl": 39.91695001653489, "lr": 0.00056, "grad_norm": 0.6449, "tokens_per_sec": 150478, "dt_s": 4.355, "eta_s": 9482, "world_size": 1, "timestamp": "2026-05-05T05:15:14.738390"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71730, "epoch": 0, "train_loss": 3.660120874643326, "train_ppl": 38.86604050619795, "lr": 0.00056, "grad_norm": 0.6878, "tokens_per_sec": 148432, "dt_s": 4.415, "eta_s": 9443, "world_size": 1, "timestamp": "2026-05-05T05:15:19.153592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71740, "epoch": 0, "train_loss": 3.8034854233264923, "train_ppl": 44.85725887910653, "lr": 0.00056, "grad_norm": 0.8184, "tokens_per_sec": 150106, "dt_s": 4.366, "eta_s": 9431, "world_size": 1, "timestamp": "2026-05-05T05:15:23.519562"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71750, "epoch": 0, "train_loss": 3.737654596567154, "train_ppl": 41.999369092657666, "lr": 0.00056, "grad_norm": 0.6804, "tokens_per_sec": 150258, "dt_s": 4.362, "eta_s": 9412, "world_size": 1, "timestamp": "2026-05-05T05:15:27.881134"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71760, "epoch": 0, "train_loss": 3.8307413458824158, "train_ppl": 46.096699168237365, "lr": 0.00056, "grad_norm": 0.6861, "tokens_per_sec": 146383, "dt_s": 4.477, "eta_s": 9195, "world_size": 1, "timestamp": "2026-05-05T05:15:32.358165"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71770, "epoch": 0, "train_loss": 3.6759234964847565, "train_ppl": 39.48510438012582, "lr": 0.00056, "grad_norm": 0.6881, "tokens_per_sec": 149763, "dt_s": 4.376, "eta_s": 9200, "world_size": 1, "timestamp": "2026-05-05T05:15:36.734165"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71780, "epoch": 0, "train_loss": 3.6564219892024994, "train_ppl": 38.7225450251057, "lr": 0.00056, "grad_norm": 0.7112, "tokens_per_sec": 149251, "dt_s": 4.391, "eta_s": 9185, "world_size": 1, "timestamp": "2026-05-05T05:15:41.125149"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71790, "epoch": 0, "train_loss": 3.7093297094106674, "train_ppl": 40.82643178047923, "lr": 0.00056, "grad_norm": 0.6714, "tokens_per_sec": 145387, "dt_s": 4.508, "eta_s": 9240, "world_size": 1, "timestamp": "2026-05-05T05:15:45.632927"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71800, "epoch": 0, "train_loss": 3.6881847381591797, "train_ppl": 39.97222101218001, "lr": 0.00056, "grad_norm": 0.6377, "tokens_per_sec": 146370, "dt_s": 4.477, "eta_s": 9284, "world_size": 1, "timestamp": "2026-05-05T05:15:50.110361"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71810, "epoch": 0, "train_loss": 3.754939943552017, "train_ppl": 42.73165342216812, "lr": 0.00056, "grad_norm": 0.6559, "tokens_per_sec": 147178, "dt_s": 4.453, "eta_s": 9269, "world_size": 1, "timestamp": "2026-05-05T05:15:54.563159"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71820, "epoch": 0, "train_loss": 3.7179096341133118, "train_ppl": 41.17822651910282, "lr": 0.00056, "grad_norm": 0.7201, "tokens_per_sec": 147360, "dt_s": 4.447, "eta_s": 9295, "world_size": 1, "timestamp": "2026-05-05T05:15:59.010500"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71830, "epoch": 0, "train_loss": 3.7529984414577484, "train_ppl": 42.648770312458126, "lr": 0.00056, "grad_norm": 0.6433, "tokens_per_sec": 146654, "dt_s": 4.469, "eta_s": 9323, "world_size": 1, "timestamp": "2026-05-05T05:16:03.479274"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71840, "epoch": 0, "train_loss": 3.678880289196968, "train_ppl": 39.6020264209332, "lr": 0.00056, "grad_norm": 0.6894, "tokens_per_sec": 146845, "dt_s": 4.463, "eta_s": 9299, "world_size": 1, "timestamp": "2026-05-05T05:16:07.942200"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71850, "epoch": 0, "train_loss": 3.6028824001550674, "train_ppl": 36.70387737977919, "lr": 0.00056, "grad_norm": 0.6735, "tokens_per_sec": 148136, "dt_s": 4.424, "eta_s": 9273, "world_size": 1, "timestamp": "2026-05-05T05:16:12.366240"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71860, "epoch": 0, "train_loss": 3.6513530015945435, "train_ppl": 38.52675756555183, "lr": 0.00056, "grad_norm": 0.6927, "tokens_per_sec": 143884, "dt_s": 4.555, "eta_s": 9311, "world_size": 1, "timestamp": "2026-05-05T05:16:16.921009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71870, "epoch": 0, "train_loss": 3.60631163418293, "train_ppl": 36.82995962424909, "lr": 0.00056, "grad_norm": 0.6673, "tokens_per_sec": 142513, "dt_s": 4.599, "eta_s": 9369, "world_size": 1, "timestamp": "2026-05-05T05:16:21.519626"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71880, "epoch": 0, "train_loss": 3.5928846150636673, "train_ppl": 36.33874818471697, "lr": 0.00056, "grad_norm": 0.6359, "tokens_per_sec": 144874, "dt_s": 4.524, "eta_s": 9388, "world_size": 1, "timestamp": "2026-05-05T05:16:26.043281"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71890, "epoch": 0, "train_loss": 3.6403692960739136, "train_ppl": 38.10590648894418, "lr": 0.00056, "grad_norm": 0.6589, "tokens_per_sec": 143130, "dt_s": 4.579, "eta_s": 9431, "world_size": 1, "timestamp": "2026-05-05T05:16:30.622032"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71900, "epoch": 0, "train_loss": 3.445584148168564, "train_ppl": 31.36159791695288, "lr": 0.00056, "grad_norm": 1.3663, "tokens_per_sec": 145364, "dt_s": 4.508, "eta_s": 9462, "world_size": 1, "timestamp": "2026-05-05T05:16:35.130423"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71910, "epoch": 0, "train_loss": 3.696847543120384, "train_ppl": 40.3199967484853, "lr": 0.00056, "grad_norm": 0.6332, "tokens_per_sec": 147423, "dt_s": 4.445, "eta_s": 9412, "world_size": 1, "timestamp": "2026-05-05T05:16:39.575871"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71920, "epoch": 0, "train_loss": 3.6849454045295715, "train_ppl": 39.84294714603838, "lr": 0.00056, "grad_norm": 0.6259, "tokens_per_sec": 146976, "dt_s": 4.459, "eta_s": 9349, "world_size": 1, "timestamp": "2026-05-05T05:16:44.034840"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71930, "epoch": 0, "train_loss": 3.777576595544815, "train_ppl": 43.70998630433001, "lr": 0.00056, "grad_norm": 0.6379, "tokens_per_sec": 144132, "dt_s": 4.547, "eta_s": 9354, "world_size": 1, "timestamp": "2026-05-05T05:16:48.581815"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71940, "epoch": 0, "train_loss": 3.7358108460903168, "train_ppl": 41.92200407867325, "lr": 0.00056, "grad_norm": 0.7272, "tokens_per_sec": 145266, "dt_s": 4.511, "eta_s": 9322, "world_size": 1, "timestamp": "2026-05-05T05:16:53.093270"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71950, "epoch": 0, "train_loss": 3.712876707315445, "train_ppl": 40.97150017501543, "lr": 0.00056, "grad_norm": 0.7511, "tokens_per_sec": 143785, "dt_s": 4.558, "eta_s": 9338, "world_size": 1, "timestamp": "2026-05-05T05:16:57.651194"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71960, "epoch": 0, "train_loss": 3.687485620379448, "train_ppl": 39.94428548802367, "lr": 0.00056, "grad_norm": 0.666, "tokens_per_sec": 143610, "dt_s": 4.563, "eta_s": 9383, "world_size": 1, "timestamp": "2026-05-05T05:17:02.214647"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71970, "epoch": 0, "train_loss": 3.6568746268749237, "train_ppl": 38.74007627510899, "lr": 0.00056, "grad_norm": 0.6314, "tokens_per_sec": 146161, "dt_s": 4.484, "eta_s": 9388, "world_size": 1, "timestamp": "2026-05-05T05:17:06.698466"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71980, "epoch": 0, "train_loss": 3.652748316526413, "train_ppl": 38.58055204705349, "lr": 0.00056, "grad_norm": 0.6548, "tokens_per_sec": 142394, "dt_s": 4.602, "eta_s": 9407, "world_size": 1, "timestamp": "2026-05-05T05:17:11.300913"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 71990, "epoch": 0, "train_loss": 3.77849180996418, "train_ppl": 43.7500086257752, "lr": 0.00056, "grad_norm": 0.6644, "tokens_per_sec": 145100, "dt_s": 4.517, "eta_s": 9404, "world_size": 1, "timestamp": "2026-05-05T05:17:15.817501"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72000, "epoch": 0, "train_loss": 3.7075749337673187, "train_ppl": 40.754853372776466, "lr": 0.00056, "grad_norm": 0.6515, "tokens_per_sec": 127619, "dt_s": 5.135, "eta_s": 9639, "world_size": 1, "timestamp": "2026-05-05T05:17:20.952795"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72010, "epoch": 0, "train_loss": 3.7318356931209564, "train_ppl": 41.75568848365173, "lr": 0.00056, "grad_norm": 0.7299, "tokens_per_sec": 125874, "dt_s": 5.206, "eta_s": 9578, "world_size": 1, "timestamp": "2026-05-05T05:17:26.159262"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72020, "epoch": 0, "train_loss": 3.7983518540859222, "train_ppl": 44.62757109819546, "lr": 0.00056, "grad_norm": 0.8554, "tokens_per_sec": 148121, "dt_s": 4.424, "eta_s": 9549, "world_size": 1, "timestamp": "2026-05-05T05:17:30.583749"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72030, "epoch": 0, "train_loss": 3.719172313809395, "train_ppl": 41.23025426992892, "lr": 0.00056, "grad_norm": 0.6985, "tokens_per_sec": 145337, "dt_s": 4.509, "eta_s": 9506, "world_size": 1, "timestamp": "2026-05-05T05:17:35.092980"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72040, "epoch": 0, "train_loss": 3.68079075217247, "train_ppl": 39.67775694331093, "lr": 0.00056, "grad_norm": 0.6811, "tokens_per_sec": 147803, "dt_s": 4.434, "eta_s": 9467, "world_size": 1, "timestamp": "2026-05-05T05:17:39.527026"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72050, "epoch": 0, "train_loss": 3.704269289970398, "train_ppl": 40.62035476924287, "lr": 0.00056, "grad_norm": 0.6365, "tokens_per_sec": 145211, "dt_s": 4.513, "eta_s": 9206, "world_size": 1, "timestamp": "2026-05-05T05:17:44.040188"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72060, "epoch": 0, "train_loss": 3.806534320116043, "train_ppl": 44.99423273515961, "lr": 0.00056, "grad_norm": 0.7175, "tokens_per_sec": 143881, "dt_s": 4.555, "eta_s": 9254, "world_size": 1, "timestamp": "2026-05-05T05:17:48.595054"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72070, "epoch": 0, "train_loss": 3.6583074778318405, "train_ppl": 38.79562481735805, "lr": 0.00056, "grad_norm": 0.6763, "tokens_per_sec": 149320, "dt_s": 4.389, "eta_s": 9234, "world_size": 1, "timestamp": "2026-05-05T05:17:52.984054"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72080, "epoch": 0, "train_loss": 3.635910242795944, "train_ppl": 37.936368492097394, "lr": 0.00056, "grad_norm": 0.6513, "tokens_per_sec": 144050, "dt_s": 4.55, "eta_s": 9247, "world_size": 1, "timestamp": "2026-05-05T05:17:57.533572"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72090, "epoch": 0, "train_loss": 3.67422516644001, "train_ppl": 39.41810255274993, "lr": 0.00056, "grad_norm": 0.6547, "tokens_per_sec": 145465, "dt_s": 4.505, "eta_s": 9271, "world_size": 1, "timestamp": "2026-05-05T05:18:02.038832"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72100, "epoch": 0, "train_loss": 3.672918424010277, "train_ppl": 39.36662688568868, "lr": 0.00056, "grad_norm": 0.6703, "tokens_per_sec": 150117, "dt_s": 4.366, "eta_s": 9206, "world_size": 1, "timestamp": "2026-05-05T05:18:06.404501"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72110, "epoch": 0, "train_loss": 3.6684024780988693, "train_ppl": 39.189250141298395, "lr": 0.00056, "grad_norm": 0.6334, "tokens_per_sec": 148515, "dt_s": 4.413, "eta_s": 9143, "world_size": 1, "timestamp": "2026-05-05T05:18:10.817260"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72120, "epoch": 0, "train_loss": 3.749234288930893, "train_ppl": 42.488535599064875, "lr": 0.00056, "grad_norm": 0.6297, "tokens_per_sec": 152461, "dt_s": 4.299, "eta_s": 9102, "world_size": 1, "timestamp": "2026-05-05T05:18:15.115788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72130, "epoch": 0, "train_loss": 3.6894458532333374, "train_ppl": 40.022662382146834, "lr": 0.00056, "grad_norm": 0.6714, "tokens_per_sec": 147187, "dt_s": 4.453, "eta_s": 9057, "world_size": 1, "timestamp": "2026-05-05T05:18:19.568373"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72140, "epoch": 0, "train_loss": 3.731767863035202, "train_ppl": 41.75285628777628, "lr": 0.00056, "grad_norm": 0.7349, "tokens_per_sec": 146978, "dt_s": 4.459, "eta_s": 9034, "world_size": 1, "timestamp": "2026-05-05T05:18:24.027250"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72150, "epoch": 0, "train_loss": 3.731156751513481, "train_ppl": 41.72734843110047, "lr": 0.00056, "grad_norm": 0.6756, "tokens_per_sec": 149877, "dt_s": 4.373, "eta_s": 9032, "world_size": 1, "timestamp": "2026-05-05T05:18:28.399884"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72160, "epoch": 0, "train_loss": 3.7276257276535034, "train_ppl": 41.580267993262446, "lr": 0.00056, "grad_norm": 0.6874, "tokens_per_sec": 149363, "dt_s": 4.388, "eta_s": 9018, "world_size": 1, "timestamp": "2026-05-05T05:18:32.787600"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72170, "epoch": 0, "train_loss": 3.736199989914894, "train_ppl": 41.938320942271815, "lr": 0.00056, "grad_norm": 0.6581, "tokens_per_sec": 149106, "dt_s": 4.395, "eta_s": 9053, "world_size": 1, "timestamp": "2026-05-05T05:18:37.182848"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72180, "epoch": 0, "train_loss": 3.773157224059105, "train_ppl": 43.51724185558133, "lr": 0.00056, "grad_norm": 0.6584, "tokens_per_sec": 146393, "dt_s": 4.477, "eta_s": 9058, "world_size": 1, "timestamp": "2026-05-05T05:18:41.659571"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72190, "epoch": 0, "train_loss": 3.6963294446468353, "train_ppl": 40.29911243025027, "lr": 0.00056, "grad_norm": 0.6682, "tokens_per_sec": 146677, "dt_s": 4.468, "eta_s": 9058, "world_size": 1, "timestamp": "2026-05-05T05:18:46.127635"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72200, "epoch": 0, "train_loss": 3.610168904066086, "train_ppl": 36.97229705877432, "lr": 0.00056, "grad_norm": 0.6481, "tokens_per_sec": 148580, "dt_s": 4.411, "eta_s": 9069, "world_size": 1, "timestamp": "2026-05-05T05:18:50.538474"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72210, "epoch": 0, "train_loss": 3.7255450636148453, "train_ppl": 41.49384336641652, "lr": 0.00056, "grad_norm": 0.6487, "tokens_per_sec": 148923, "dt_s": 4.401, "eta_s": 9070, "world_size": 1, "timestamp": "2026-05-05T05:18:54.939114"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72220, "epoch": 0, "train_loss": 3.7182210236787796, "train_ppl": 41.191050985764264, "lr": 0.00056, "grad_norm": 0.6825, "tokens_per_sec": 148717, "dt_s": 4.407, "eta_s": 9070, "world_size": 1, "timestamp": "2026-05-05T05:18:59.345855"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72230, "epoch": 0, "train_loss": 3.815588742494583, "train_ppl": 45.403479473433826, "lr": 0.00056, "grad_norm": 0.7283, "tokens_per_sec": 149954, "dt_s": 4.37, "eta_s": 9022, "world_size": 1, "timestamp": "2026-05-05T05:19:03.716284"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72240, "epoch": 0, "train_loss": 3.7119578570127487, "train_ppl": 40.933870790211, "lr": 0.00056, "grad_norm": 0.7071, "tokens_per_sec": 148190, "dt_s": 4.422, "eta_s": 8999, "world_size": 1, "timestamp": "2026-05-05T05:19:08.138691"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72250, "epoch": 0, "train_loss": 3.5977081656455994, "train_ppl": 36.5144533954835, "lr": 0.00056, "grad_norm": 0.7441, "tokens_per_sec": 149649, "dt_s": 4.379, "eta_s": 8982, "world_size": 1, "timestamp": "2026-05-05T05:19:12.518009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72260, "epoch": 0, "train_loss": 3.7377225905656815, "train_ppl": 42.002224894785506, "lr": 0.00056, "grad_norm": 0.7097, "tokens_per_sec": 150652, "dt_s": 4.35, "eta_s": 8957, "world_size": 1, "timestamp": "2026-05-05T05:19:16.868165"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72270, "epoch": 0, "train_loss": 3.6814576387405396, "train_ppl": 39.70422633152629, "lr": 0.00056, "grad_norm": 0.6451, "tokens_per_sec": 148995, "dt_s": 4.399, "eta_s": 8949, "world_size": 1, "timestamp": "2026-05-05T05:19:21.266705"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72280, "epoch": 0, "train_loss": 3.711875930428505, "train_ppl": 40.930517355366895, "lr": 0.00056, "grad_norm": 0.6371, "tokens_per_sec": 148455, "dt_s": 4.415, "eta_s": 8963, "world_size": 1, "timestamp": "2026-05-05T05:19:25.681278"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72290, "epoch": 0, "train_loss": 3.7077371031045914, "train_ppl": 40.76146309627132, "lr": 0.00056, "grad_norm": 0.7357, "tokens_per_sec": 152511, "dt_s": 4.297, "eta_s": 8907, "world_size": 1, "timestamp": "2026-05-05T05:19:29.978388"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72300, "epoch": 0, "train_loss": 3.7016258537769318, "train_ppl": 40.513119250824346, "lr": 0.00056, "grad_norm": 0.7708, "tokens_per_sec": 134347, "dt_s": 4.878, "eta_s": 9106, "world_size": 1, "timestamp": "2026-05-05T05:19:34.856485"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72310, "epoch": 0, "train_loss": 3.684840649366379, "train_ppl": 39.83877361021155, "lr": 0.00056, "grad_norm": 0.6338, "tokens_per_sec": 150984, "dt_s": 4.341, "eta_s": 9098, "world_size": 1, "timestamp": "2026-05-05T05:19:39.197059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72320, "epoch": 0, "train_loss": 3.8508069217205048, "train_ppl": 47.03099825825403, "lr": 0.00056, "grad_norm": 0.6596, "tokens_per_sec": 151244, "dt_s": 4.333, "eta_s": 9067, "world_size": 1, "timestamp": "2026-05-05T05:19:43.530217"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72330, "epoch": 0, "train_loss": 3.7122219055891037, "train_ppl": 40.944680747631985, "lr": 0.00056, "grad_norm": 0.6452, "tokens_per_sec": 147925, "dt_s": 4.43, "eta_s": 9069, "world_size": 1, "timestamp": "2026-05-05T05:19:47.960573"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72340, "epoch": 0, "train_loss": 3.74510994553566, "train_ppl": 42.31365916102517, "lr": 0.00056, "grad_norm": 0.6771, "tokens_per_sec": 147568, "dt_s": 4.441, "eta_s": 9123, "world_size": 1, "timestamp": "2026-05-05T05:19:52.401636"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72350, "epoch": 0, "train_loss": 3.6506552696228027, "train_ppl": 38.499885590838964, "lr": 0.00056, "grad_norm": 0.6727, "tokens_per_sec": 148600, "dt_s": 4.41, "eta_s": 8928, "world_size": 1, "timestamp": "2026-05-05T05:19:56.811944"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72360, "epoch": 0, "train_loss": 3.7016740143299103, "train_ppl": 40.51507043203496, "lr": 0.00056, "grad_norm": 0.693, "tokens_per_sec": 147791, "dt_s": 4.434, "eta_s": 8962, "world_size": 1, "timestamp": "2026-05-05T05:20:01.246279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72370, "epoch": 0, "train_loss": 3.7380851358175278, "train_ppl": 42.01745536268818, "lr": 0.00056, "grad_norm": 0.6498, "tokens_per_sec": 151575, "dt_s": 4.324, "eta_s": 8953, "world_size": 1, "timestamp": "2026-05-05T05:20:05.569917"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72380, "epoch": 0, "train_loss": 3.603520020842552, "train_ppl": 36.727287994060276, "lr": 0.00056, "grad_norm": 0.6137, "tokens_per_sec": 148984, "dt_s": 4.399, "eta_s": 8936, "world_size": 1, "timestamp": "2026-05-05T05:20:09.968795"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72390, "epoch": 0, "train_loss": 3.6283279955387115, "train_ppl": 37.649813305137144, "lr": 0.00056, "grad_norm": 0.7582, "tokens_per_sec": 149483, "dt_s": 4.384, "eta_s": 8909, "world_size": 1, "timestamp": "2026-05-05T05:20:14.352992"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72400, "epoch": 0, "train_loss": 3.824997588992119, "train_ppl": 45.83268986322378, "lr": 0.00056, "grad_norm": 0.8145, "tokens_per_sec": 146618, "dt_s": 4.47, "eta_s": 8929, "world_size": 1, "timestamp": "2026-05-05T05:20:18.822820"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72410, "epoch": 0, "train_loss": 3.7526582181453705, "train_ppl": 42.63426267461182, "lr": 0.00056, "grad_norm": 0.6868, "tokens_per_sec": 144816, "dt_s": 4.525, "eta_s": 8961, "world_size": 1, "timestamp": "2026-05-05T05:20:23.348317"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72420, "epoch": 0, "train_loss": 3.6661810278892517, "train_ppl": 39.10228979815479, "lr": 0.00056, "grad_norm": 0.6553, "tokens_per_sec": 145971, "dt_s": 4.49, "eta_s": 9024, "world_size": 1, "timestamp": "2026-05-05T05:20:27.837944"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72430, "epoch": 0, "train_loss": 3.682777911424637, "train_ppl": 39.75668135684394, "lr": 0.00056, "grad_norm": 0.7261, "tokens_per_sec": 146934, "dt_s": 4.46, "eta_s": 9044, "world_size": 1, "timestamp": "2026-05-05T05:20:32.298169"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72440, "epoch": 0, "train_loss": 3.649272456765175, "train_ppl": 38.44668424625593, "lr": 0.00056, "grad_norm": 0.6702, "tokens_per_sec": 144787, "dt_s": 4.526, "eta_s": 9098, "world_size": 1, "timestamp": "2026-05-05T05:20:36.824546"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72450, "epoch": 0, "train_loss": 3.78185074031353, "train_ppl": 43.89720893693106, "lr": 0.00056, "grad_norm": 0.7242, "tokens_per_sec": 145982, "dt_s": 4.489, "eta_s": 9101, "world_size": 1, "timestamp": "2026-05-05T05:20:41.313875"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72460, "epoch": 0, "train_loss": 3.7363858371973038, "train_ppl": 41.946115789550916, "lr": 0.00056, "grad_norm": 0.6631, "tokens_per_sec": 143473, "dt_s": 4.568, "eta_s": 9114, "world_size": 1, "timestamp": "2026-05-05T05:20:45.881691"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72470, "epoch": 0, "train_loss": 3.7191969603300095, "train_ppl": 41.23127046476351, "lr": 0.00056, "grad_norm": 0.6515, "tokens_per_sec": 144246, "dt_s": 4.543, "eta_s": 9131, "world_size": 1, "timestamp": "2026-05-05T05:20:50.425051"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72480, "epoch": 0, "train_loss": 3.6602058857679367, "train_ppl": 38.86934469245486, "lr": 0.00056, "grad_norm": 0.6816, "tokens_per_sec": 144684, "dt_s": 4.53, "eta_s": 9154, "world_size": 1, "timestamp": "2026-05-05T05:20:54.954651"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72490, "epoch": 0, "train_loss": 3.636045664548874, "train_ppl": 37.94150624949258, "lr": 0.00056, "grad_norm": 0.731, "tokens_per_sec": 144446, "dt_s": 4.537, "eta_s": 9154, "world_size": 1, "timestamp": "2026-05-05T05:20:59.491730"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72500, "epoch": 0, "train_loss": 3.7363151013851166, "train_ppl": 41.94314880191982, "lr": 0.00056, "grad_norm": 0.795, "tokens_per_sec": 147195, "dt_s": 4.452, "eta_s": 9135, "world_size": 1, "timestamp": "2026-05-05T05:21:03.944016"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72510, "epoch": 0, "train_loss": 3.80960875749588, "train_ppl": 45.13277755070851, "lr": 0.00056, "grad_norm": 0.6642, "tokens_per_sec": 122198, "dt_s": 5.363, "eta_s": 9135, "world_size": 1, "timestamp": "2026-05-05T05:21:09.307127"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72520, "epoch": 0, "train_loss": 3.641769826412201, "train_ppl": 38.15931235658926, "lr": 0.00056, "grad_norm": 0.6641, "tokens_per_sec": 141597, "dt_s": 4.628, "eta_s": 9165, "world_size": 1, "timestamp": "2026-05-05T05:21:13.935497"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72530, "epoch": 0, "train_loss": 3.6828393787145615, "train_ppl": 39.75912516740978, "lr": 0.00056, "grad_norm": 0.6307, "tokens_per_sec": 145763, "dt_s": 4.496, "eta_s": 9147, "world_size": 1, "timestamp": "2026-05-05T05:21:18.431540"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72540, "epoch": 0, "train_loss": 3.6375982761383057, "train_ppl": 38.00046042644231, "lr": 0.00056, "grad_norm": 0.6792, "tokens_per_sec": 145376, "dt_s": 4.508, "eta_s": 9131, "world_size": 1, "timestamp": "2026-05-05T05:21:22.939575"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72550, "epoch": 0, "train_loss": 3.609434723854065, "train_ppl": 36.94516269185728, "lr": 0.00056, "grad_norm": 0.6823, "tokens_per_sec": 148071, "dt_s": 4.426, "eta_s": 9116, "world_size": 1, "timestamp": "2026-05-05T05:21:27.365555"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72560, "epoch": 0, "train_loss": 3.7079565823078156, "train_ppl": 40.77041037154845, "lr": 0.00056, "grad_norm": 0.7197, "tokens_per_sec": 147576, "dt_s": 4.441, "eta_s": 9055, "world_size": 1, "timestamp": "2026-05-05T05:21:31.806398"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72570, "epoch": 0, "train_loss": 3.7095156610012054, "train_ppl": 40.83402422629661, "lr": 0.00056, "grad_norm": 0.6318, "tokens_per_sec": 141968, "dt_s": 4.616, "eta_s": 9045, "world_size": 1, "timestamp": "2026-05-05T05:21:36.422642"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72580, "epoch": 0, "train_loss": 3.748460754752159, "train_ppl": 42.45568197291419, "lr": 0.00056, "grad_norm": 0.6923, "tokens_per_sec": 143618, "dt_s": 4.563, "eta_s": 9068, "world_size": 1, "timestamp": "2026-05-05T05:21:40.985851"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72590, "epoch": 0, "train_loss": 3.679923191666603, "train_ppl": 39.64334901606304, "lr": 0.00056, "grad_norm": 0.6749, "tokens_per_sec": 131241, "dt_s": 4.994, "eta_s": 9258, "world_size": 1, "timestamp": "2026-05-05T05:21:45.979427"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72600, "epoch": 0, "train_loss": 3.651947021484375, "train_ppl": 38.54965002445129, "lr": 0.00056, "grad_norm": 0.6668, "tokens_per_sec": 144738, "dt_s": 4.528, "eta_s": 9295, "world_size": 1, "timestamp": "2026-05-05T05:21:50.507325"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72610, "epoch": 0, "train_loss": 3.798535495996475, "train_ppl": 44.63576734317928, "lr": 0.00056, "grad_norm": 0.6171, "tokens_per_sec": 146578, "dt_s": 4.471, "eta_s": 9302, "world_size": 1, "timestamp": "2026-05-05T05:21:54.978370"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72620, "epoch": 0, "train_loss": 3.6056604385375977, "train_ppl": 36.80598392220765, "lr": 0.00056, "grad_norm": 0.6414, "tokens_per_sec": 142495, "dt_s": 4.599, "eta_s": 9291, "world_size": 1, "timestamp": "2026-05-05T05:21:59.577532"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72630, "epoch": 0, "train_loss": 3.69081512093544, "train_ppl": 40.07750165735489, "lr": 0.00056, "grad_norm": 0.6784, "tokens_per_sec": 142450, "dt_s": 4.601, "eta_s": 9301, "world_size": 1, "timestamp": "2026-05-05T05:22:04.178193"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72640, "epoch": 0, "train_loss": 3.6070723086595535, "train_ppl": 36.857985892592446, "lr": 0.00056, "grad_norm": 0.6824, "tokens_per_sec": 145131, "dt_s": 4.516, "eta_s": 9105, "world_size": 1, "timestamp": "2026-05-05T05:22:08.693832"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72650, "epoch": 0, "train_loss": 3.6569962203502655, "train_ppl": 38.744787102015394, "lr": 0.00056, "grad_norm": 0.6224, "tokens_per_sec": 146463, "dt_s": 4.475, "eta_s": 9079, "world_size": 1, "timestamp": "2026-05-05T05:22:13.168403"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72660, "epoch": 0, "train_loss": 3.73041869699955, "train_ppl": 41.69656273539482, "lr": 0.00056, "grad_norm": 0.631, "tokens_per_sec": 148815, "dt_s": 4.404, "eta_s": 9048, "world_size": 1, "timestamp": "2026-05-05T05:22:17.572264"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72670, "epoch": 0, "train_loss": 3.7568701058626175, "train_ppl": 42.81421209926548, "lr": 0.00056, "grad_norm": 0.6967, "tokens_per_sec": 149763, "dt_s": 4.376, "eta_s": 8954, "world_size": 1, "timestamp": "2026-05-05T05:22:21.948258"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72680, "epoch": 0, "train_loss": 3.678379535675049, "train_ppl": 39.58220053108415, "lr": 0.00056, "grad_norm": 0.6914, "tokens_per_sec": 148391, "dt_s": 4.416, "eta_s": 8876, "world_size": 1, "timestamp": "2026-05-05T05:22:26.364669"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72690, "epoch": 0, "train_loss": 3.72115159034729, "train_ppl": 41.311941158660396, "lr": 0.00056, "grad_norm": 0.6696, "tokens_per_sec": 149711, "dt_s": 4.378, "eta_s": 8816, "world_size": 1, "timestamp": "2026-05-05T05:22:30.742192"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72700, "epoch": 0, "train_loss": 3.6932731568813324, "train_ppl": 40.1761347692562, "lr": 0.00056, "grad_norm": 0.706, "tokens_per_sec": 144885, "dt_s": 4.523, "eta_s": 8831, "world_size": 1, "timestamp": "2026-05-05T05:22:35.265475"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72710, "epoch": 0, "train_loss": 3.6422030478715897, "train_ppl": 38.17584737098058, "lr": 0.00056, "grad_norm": 0.6058, "tokens_per_sec": 148395, "dt_s": 4.416, "eta_s": 8832, "world_size": 1, "timestamp": "2026-05-05T05:22:39.681824"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72720, "epoch": 0, "train_loss": 3.6913485527038574, "train_ppl": 40.098885972967295, "lr": 0.00056, "grad_norm": 0.662, "tokens_per_sec": 148759, "dt_s": 4.406, "eta_s": 8839, "world_size": 1, "timestamp": "2026-05-05T05:22:44.087356"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72730, "epoch": 0, "train_loss": 3.69993694126606, "train_ppl": 40.444753884678875, "lr": 0.00056, "grad_norm": 0.6679, "tokens_per_sec": 145531, "dt_s": 4.503, "eta_s": 8869, "world_size": 1, "timestamp": "2026-05-05T05:22:48.590580"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72740, "epoch": 0, "train_loss": 3.6337767839431763, "train_ppl": 37.85551908600314, "lr": 0.00056, "grad_norm": 0.6801, "tokens_per_sec": 151311, "dt_s": 4.331, "eta_s": 8846, "world_size": 1, "timestamp": "2026-05-05T05:22:52.921787"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72750, "epoch": 0, "train_loss": 3.708657205104828, "train_ppl": 40.798985059368285, "lr": 0.00056, "grad_norm": 0.6523, "tokens_per_sec": 151117, "dt_s": 4.337, "eta_s": 8767, "world_size": 1, "timestamp": "2026-05-05T05:22:57.258572"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72760, "epoch": 0, "train_loss": 3.4728615283966064, "train_ppl": 32.228834344786755, "lr": 0.00056, "grad_norm": 0.6202, "tokens_per_sec": 146605, "dt_s": 4.47, "eta_s": 8785, "world_size": 1, "timestamp": "2026-05-05T05:23:01.728843"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72770, "epoch": 0, "train_loss": 3.776894047856331, "train_ppl": 43.68016233351333, "lr": 0.00056, "grad_norm": 0.6786, "tokens_per_sec": 149492, "dt_s": 4.384, "eta_s": 8772, "world_size": 1, "timestamp": "2026-05-05T05:23:06.112751"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72780, "epoch": 0, "train_loss": 3.758592650294304, "train_ppl": 42.88802503669507, "lr": 0.00056, "grad_norm": 0.5977, "tokens_per_sec": 147772, "dt_s": 4.435, "eta_s": 8740, "world_size": 1, "timestamp": "2026-05-05T05:23:10.547685"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72790, "epoch": 0, "train_loss": 3.6515311300754547, "train_ppl": 38.53362088961001, "lr": 0.00056, "grad_norm": 0.6258, "tokens_per_sec": 150091, "dt_s": 4.366, "eta_s": 8750, "world_size": 1, "timestamp": "2026-05-05T05:23:14.914095"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72800, "epoch": 0, "train_loss": 3.742941051721573, "train_ppl": 42.22198477935622, "lr": 0.00056, "grad_norm": 0.6593, "tokens_per_sec": 149863, "dt_s": 4.373, "eta_s": 8760, "world_size": 1, "timestamp": "2026-05-05T05:23:19.287151"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72810, "epoch": 0, "train_loss": 3.6609538346529007, "train_ppl": 38.89842785047884, "lr": 0.00056, "grad_norm": 0.7442, "tokens_per_sec": 147063, "dt_s": 4.456, "eta_s": 8750, "world_size": 1, "timestamp": "2026-05-05T05:23:23.743466"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72820, "epoch": 0, "train_loss": 3.6321422308683395, "train_ppl": 37.793692773841336, "lr": 0.00056, "grad_norm": 0.6785, "tokens_per_sec": 149567, "dt_s": 4.382, "eta_s": 8744, "world_size": 1, "timestamp": "2026-05-05T05:23:28.125195"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72830, "epoch": 0, "train_loss": 3.6202998608350754, "train_ppl": 37.348765575120346, "lr": 0.00056, "grad_norm": 0.6928, "tokens_per_sec": 151323, "dt_s": 4.331, "eta_s": 8699, "world_size": 1, "timestamp": "2026-05-05T05:23:32.456037"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72840, "epoch": 0, "train_loss": 3.7420945316553116, "train_ppl": 42.186258145792685, "lr": 0.00056, "grad_norm": 0.6391, "tokens_per_sec": 149678, "dt_s": 4.378, "eta_s": 8699, "world_size": 1, "timestamp": "2026-05-05T05:23:36.834501"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72850, "epoch": 0, "train_loss": 3.731126517057419, "train_ppl": 41.72608684648949, "lr": 0.00056, "grad_norm": 0.6837, "tokens_per_sec": 151124, "dt_s": 4.337, "eta_s": 8680, "world_size": 1, "timestamp": "2026-05-05T05:23:41.171065"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72860, "epoch": 0, "train_loss": 3.6355280578136444, "train_ppl": 37.92187255201847, "lr": 0.00056, "grad_norm": 0.6442, "tokens_per_sec": 151996, "dt_s": 4.312, "eta_s": 8618, "world_size": 1, "timestamp": "2026-05-05T05:23:45.482768"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72870, "epoch": 0, "train_loss": 3.6743817925453186, "train_ppl": 39.42427694015392, "lr": 0.00056, "grad_norm": 0.6499, "tokens_per_sec": 146346, "dt_s": 4.478, "eta_s": 8652, "world_size": 1, "timestamp": "2026-05-05T05:23:49.960944"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72880, "epoch": 0, "train_loss": 3.6892872750759125, "train_ppl": 40.01631616529002, "lr": 0.00056, "grad_norm": 0.6411, "tokens_per_sec": 150239, "dt_s": 4.362, "eta_s": 8660, "world_size": 1, "timestamp": "2026-05-05T05:23:54.323045"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72890, "epoch": 0, "train_loss": 3.7154787331819534, "train_ppl": 41.078247898200644, "lr": 0.00056, "grad_norm": 0.6506, "tokens_per_sec": 132567, "dt_s": 4.944, "eta_s": 8880, "world_size": 1, "timestamp": "2026-05-05T05:23:59.266639"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72900, "epoch": 0, "train_loss": 3.4712137393653393, "train_ppl": 32.17577175504589, "lr": 0.00056, "grad_norm": 0.7433, "tokens_per_sec": 147239, "dt_s": 4.451, "eta_s": 8921, "world_size": 1, "timestamp": "2026-05-05T05:24:03.717659"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72910, "epoch": 0, "train_loss": 3.770980656147003, "train_ppl": 43.422626628921364, "lr": 0.00056, "grad_norm": 0.6431, "tokens_per_sec": 139555, "dt_s": 4.696, "eta_s": 9068, "world_size": 1, "timestamp": "2026-05-05T05:24:08.413702"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72920, "epoch": 0, "train_loss": 3.6701536625623703, "train_ppl": 39.25793787217224, "lr": 0.00056, "grad_norm": 0.6835, "tokens_per_sec": 148452, "dt_s": 4.415, "eta_s": 9038, "world_size": 1, "timestamp": "2026-05-05T05:24:12.828352"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72930, "epoch": 0, "train_loss": 3.6554701775312424, "train_ppl": 38.685705989503234, "lr": 0.00056, "grad_norm": 0.6768, "tokens_per_sec": 148928, "dt_s": 4.401, "eta_s": 9049, "world_size": 1, "timestamp": "2026-05-05T05:24:17.228837"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72940, "epoch": 0, "train_loss": 3.6955738365650177, "train_ppl": 40.2686735965705, "lr": 0.00056, "grad_norm": 0.6879, "tokens_per_sec": 145017, "dt_s": 4.519, "eta_s": 8877, "world_size": 1, "timestamp": "2026-05-05T05:24:21.748036"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72950, "epoch": 0, "train_loss": 3.6903994530439377, "train_ppl": 40.0608461885559, "lr": 0.00056, "grad_norm": 0.6428, "tokens_per_sec": 145346, "dt_s": 4.509, "eta_s": 8895, "world_size": 1, "timestamp": "2026-05-05T05:24:26.257006"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72960, "epoch": 0, "train_loss": 3.7707556188106537, "train_ppl": 43.412856016105074, "lr": 0.00056, "grad_norm": 0.6497, "tokens_per_sec": 141560, "dt_s": 4.63, "eta_s": 8864, "world_size": 1, "timestamp": "2026-05-05T05:24:30.886563"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72970, "epoch": 0, "train_loss": 3.7710332572460175, "train_ppl": 43.424910766877694, "lr": 0.00056, "grad_norm": 0.67, "tokens_per_sec": 140609, "dt_s": 4.661, "eta_s": 8957, "world_size": 1, "timestamp": "2026-05-05T05:24:35.547442"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72980, "epoch": 0, "train_loss": 3.943317621946335, "train_ppl": 51.58947206622682, "lr": 0.00056, "grad_norm": 1.662, "tokens_per_sec": 139403, "dt_s": 4.701, "eta_s": 9071, "world_size": 1, "timestamp": "2026-05-05T05:24:40.248656"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 72990, "epoch": 0, "train_loss": 3.7356170266866684, "train_ppl": 41.913879568212295, "lr": 0.00056, "grad_norm": 0.7002, "tokens_per_sec": 140688, "dt_s": 4.658, "eta_s": 9121, "world_size": 1, "timestamp": "2026-05-05T05:24:44.906899"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73000, "epoch": 0, "train_loss": 3.614533916115761, "train_ppl": 37.13403431667715, "lr": 0.00056, "grad_norm": 0.6297, "tokens_per_sec": 138945, "dt_s": 4.717, "eta_s": 9198, "world_size": 1, "timestamp": "2026-05-05T05:24:49.623583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73010, "epoch": 0, "train_loss": 3.7961511313915253, "train_ppl": 44.52946618013863, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 121938, "dt_s": 5.375, "eta_s": 9175, "world_size": 1, "timestamp": "2026-05-05T05:24:54.998118"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73020, "epoch": 0, "train_loss": 4.016678884625435, "train_ppl": 55.51642287430185, "lr": 0.00056, "grad_norm": 1.1193, "tokens_per_sec": 143813, "dt_s": 4.557, "eta_s": 9130, "world_size": 1, "timestamp": "2026-05-05T05:24:59.555150"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73030, "epoch": 0, "train_loss": 3.631409227848053, "train_ppl": 37.76600003356139, "lr": 0.00056, "grad_norm": 0.6705, "tokens_per_sec": 141993, "dt_s": 4.615, "eta_s": 9091, "world_size": 1, "timestamp": "2026-05-05T05:25:04.170594"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73040, "epoch": 0, "train_loss": 3.6743248999118805, "train_ppl": 39.42203405301989, "lr": 0.00056, "grad_norm": 0.6426, "tokens_per_sec": 139970, "dt_s": 4.682, "eta_s": 9096, "world_size": 1, "timestamp": "2026-05-05T05:25:08.852735"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73050, "epoch": 0, "train_loss": 3.621425062417984, "train_ppl": 37.39081411737191, "lr": 0.00056, "grad_norm": 0.6523, "tokens_per_sec": 139779, "dt_s": 4.689, "eta_s": 9080, "world_size": 1, "timestamp": "2026-05-05T05:25:13.541285"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73060, "epoch": 0, "train_loss": 3.691949784755707, "train_ppl": 40.123001957382634, "lr": 0.00056, "grad_norm": 0.6288, "tokens_per_sec": 143100, "dt_s": 4.58, "eta_s": 9075, "world_size": 1, "timestamp": "2026-05-05T05:25:18.120999"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73070, "epoch": 0, "train_loss": 3.7741878777742386, "train_ppl": 43.56211618354627, "lr": 0.00056, "grad_norm": 0.663, "tokens_per_sec": 145860, "dt_s": 4.493, "eta_s": 9045, "world_size": 1, "timestamp": "2026-05-05T05:25:22.614057"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73080, "epoch": 0, "train_loss": 3.7445967197418213, "train_ppl": 42.29194827148286, "lr": 0.00056, "grad_norm": 0.6694, "tokens_per_sec": 142049, "dt_s": 4.614, "eta_s": 9040, "world_size": 1, "timestamp": "2026-05-05T05:25:27.227708"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73090, "epoch": 0, "train_loss": 3.656404361128807, "train_ppl": 38.72186242724491, "lr": 0.00056, "grad_norm": 0.6126, "tokens_per_sec": 144464, "dt_s": 4.536, "eta_s": 8978, "world_size": 1, "timestamp": "2026-05-05T05:25:31.764213"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73100, "epoch": 0, "train_loss": 3.7041322737932205, "train_ppl": 40.61478950479114, "lr": 0.00056, "grad_norm": 0.6496, "tokens_per_sec": 143297, "dt_s": 4.573, "eta_s": 8928, "world_size": 1, "timestamp": "2026-05-05T05:25:36.337634"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73110, "epoch": 0, "train_loss": 3.8326849043369293, "train_ppl": 46.18637791734695, "lr": 0.00056, "grad_norm": 0.7833, "tokens_per_sec": 140511, "dt_s": 4.664, "eta_s": 8957, "world_size": 1, "timestamp": "2026-05-05T05:25:41.001754"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73120, "epoch": 0, "train_loss": 3.753149837255478, "train_ppl": 42.655227645856, "lr": 0.00056, "grad_norm": 0.9829, "tokens_per_sec": 144107, "dt_s": 4.548, "eta_s": 8973, "world_size": 1, "timestamp": "2026-05-05T05:25:45.549454"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73130, "epoch": 0, "train_loss": 3.5906962901353836, "train_ppl": 36.25931414164928, "lr": 0.00056, "grad_norm": 0.6968, "tokens_per_sec": 143850, "dt_s": 4.556, "eta_s": 8946, "world_size": 1, "timestamp": "2026-05-05T05:25:50.105344"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73140, "epoch": 0, "train_loss": 3.631360724568367, "train_ppl": 37.76416830312197, "lr": 0.00056, "grad_norm": 0.6155, "tokens_per_sec": 147055, "dt_s": 4.457, "eta_s": 8910, "world_size": 1, "timestamp": "2026-05-05T05:25:54.561863"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73150, "epoch": 0, "train_loss": 3.6679611653089523, "train_ppl": 39.17195923961244, "lr": 0.00056, "grad_norm": 0.6722, "tokens_per_sec": 143514, "dt_s": 4.567, "eta_s": 8903, "world_size": 1, "timestamp": "2026-05-05T05:25:59.128425"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73160, "epoch": 0, "train_loss": 3.618045389652252, "train_ppl": 37.26465870330036, "lr": 0.00056, "grad_norm": 0.6588, "tokens_per_sec": 141056, "dt_s": 4.646, "eta_s": 8892, "world_size": 1, "timestamp": "2026-05-05T05:26:03.774508"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73170, "epoch": 0, "train_loss": 3.6921409964561462, "train_ppl": 40.130674678347276, "lr": 0.00056, "grad_norm": 0.6378, "tokens_per_sec": 143362, "dt_s": 4.571, "eta_s": 8896, "world_size": 1, "timestamp": "2026-05-05T05:26:08.345856"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73180, "epoch": 0, "train_loss": 3.7754951119422913, "train_ppl": 43.619099307282575, "lr": 0.00056, "grad_norm": 0.6826, "tokens_per_sec": 142045, "dt_s": 4.614, "eta_s": 8914, "world_size": 1, "timestamp": "2026-05-05T05:26:12.959634"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73190, "epoch": 0, "train_loss": 3.6498441100120544, "train_ppl": 38.46866870128091, "lr": 0.00056, "grad_norm": 0.6514, "tokens_per_sec": 128247, "dt_s": 5.11, "eta_s": 9165, "world_size": 1, "timestamp": "2026-05-05T05:26:18.069800"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73200, "epoch": 0, "train_loss": 3.701733872294426, "train_ppl": 40.51749565426694, "lr": 0.00056, "grad_norm": 0.67, "tokens_per_sec": 145133, "dt_s": 4.516, "eta_s": 9140, "world_size": 1, "timestamp": "2026-05-05T05:26:22.585364"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73210, "epoch": 0, "train_loss": 3.700045019388199, "train_ppl": 40.449125313952784, "lr": 0.00056, "grad_norm": 0.6143, "tokens_per_sec": 138395, "dt_s": 4.735, "eta_s": 9170, "world_size": 1, "timestamp": "2026-05-05T05:26:27.320773"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73220, "epoch": 0, "train_loss": 3.629220888018608, "train_ppl": 37.683445553062974, "lr": 0.00056, "grad_norm": 0.6647, "tokens_per_sec": 143706, "dt_s": 4.56, "eta_s": 9161, "world_size": 1, "timestamp": "2026-05-05T05:26:31.881214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73230, "epoch": 0, "train_loss": 3.80047270655632, "train_ppl": 44.72232003134622, "lr": 0.00056, "grad_norm": 0.6759, "tokens_per_sec": 144807, "dt_s": 4.526, "eta_s": 9122, "world_size": 1, "timestamp": "2026-05-05T05:26:36.406958"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73240, "epoch": 0, "train_loss": 3.5809065103530884, "train_ppl": 35.90607532743624, "lr": 0.00056, "grad_norm": 0.7028, "tokens_per_sec": 141295, "dt_s": 4.638, "eta_s": 8934, "world_size": 1, "timestamp": "2026-05-05T05:26:41.045198"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73250, "epoch": 0, "train_loss": 3.6545674800872803, "train_ppl": 38.650800258615945, "lr": 0.00056, "grad_norm": 0.6048, "tokens_per_sec": 145610, "dt_s": 4.501, "eta_s": 8924, "world_size": 1, "timestamp": "2026-05-05T05:26:45.545970"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73260, "epoch": 0, "train_loss": 3.698624610900879, "train_ppl": 40.39171181801357, "lr": 0.00056, "grad_norm": 0.6523, "tokens_per_sec": 140342, "dt_s": 4.67, "eta_s": 8894, "world_size": 1, "timestamp": "2026-05-05T05:26:50.215707"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73270, "epoch": 0, "train_loss": 3.738275170326233, "train_ppl": 42.02544088791372, "lr": 0.00056, "grad_norm": 0.6705, "tokens_per_sec": 145044, "dt_s": 4.518, "eta_s": 8873, "world_size": 1, "timestamp": "2026-05-05T05:26:54.734036"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73280, "epoch": 0, "train_loss": 3.6875238567590714, "train_ppl": 39.94581284208743, "lr": 0.00056, "grad_norm": 0.7042, "tokens_per_sec": 147658, "dt_s": 4.438, "eta_s": 8834, "world_size": 1, "timestamp": "2026-05-05T05:26:59.172432"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73290, "epoch": 0, "train_loss": 3.888635754585266, "train_ppl": 48.844205565998756, "lr": 0.00056, "grad_norm": 0.6945, "tokens_per_sec": 139846, "dt_s": 4.686, "eta_s": 8848, "world_size": 1, "timestamp": "2026-05-05T05:27:03.858729"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73300, "epoch": 0, "train_loss": 3.641365349292755, "train_ppl": 38.143880908891376, "lr": 0.00056, "grad_norm": 0.6462, "tokens_per_sec": 141751, "dt_s": 4.623, "eta_s": 8891, "world_size": 1, "timestamp": "2026-05-05T05:27:08.482025"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73310, "epoch": 0, "train_loss": 3.8175167739391327, "train_ppl": 45.49110325310813, "lr": 0.00056, "grad_norm": 0.7008, "tokens_per_sec": 142784, "dt_s": 4.59, "eta_s": 8856, "world_size": 1, "timestamp": "2026-05-05T05:27:13.071923"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73320, "epoch": 0, "train_loss": 3.675276145339012, "train_ppl": 39.45955192416368, "lr": 0.00056, "grad_norm": 0.6818, "tokens_per_sec": 142309, "dt_s": 4.605, "eta_s": 8885, "world_size": 1, "timestamp": "2026-05-05T05:27:17.677075"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73330, "epoch": 0, "train_loss": 3.7541280686855316, "train_ppl": 42.696974746028914, "lr": 0.00056, "grad_norm": 0.6878, "tokens_per_sec": 142785, "dt_s": 4.59, "eta_s": 8939, "world_size": 1, "timestamp": "2026-05-05T05:27:22.266928"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73340, "epoch": 0, "train_loss": 3.6878545582294464, "train_ppl": 39.95902516567664, "lr": 0.00056, "grad_norm": 0.6629, "tokens_per_sec": 141224, "dt_s": 4.641, "eta_s": 8916, "world_size": 1, "timestamp": "2026-05-05T05:27:26.907493"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73350, "epoch": 0, "train_loss": 3.70433446764946, "train_ppl": 40.62300239597155, "lr": 0.00056, "grad_norm": 0.655, "tokens_per_sec": 144672, "dt_s": 4.53, "eta_s": 8876, "world_size": 1, "timestamp": "2026-05-05T05:27:31.437483"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73360, "epoch": 0, "train_loss": 3.616554468870163, "train_ppl": 37.209141445402786, "lr": 0.00056, "grad_norm": 0.693, "tokens_per_sec": 142384, "dt_s": 4.603, "eta_s": 8876, "world_size": 1, "timestamp": "2026-05-05T05:27:36.040272"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73370, "epoch": 0, "train_loss": 3.6547845005989075, "train_ppl": 38.65918918531453, "lr": 0.00056, "grad_norm": 0.6313, "tokens_per_sec": 139601, "dt_s": 4.695, "eta_s": 8906, "world_size": 1, "timestamp": "2026-05-05T05:27:40.734808"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73380, "epoch": 0, "train_loss": 3.651369422674179, "train_ppl": 38.52739022170035, "lr": 0.00056, "grad_norm": 0.7143, "tokens_per_sec": 144165, "dt_s": 4.546, "eta_s": 8884, "world_size": 1, "timestamp": "2026-05-05T05:27:45.280695"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73390, "epoch": 0, "train_loss": 3.8591209650039673, "train_ppl": 47.42364599682757, "lr": 0.00056, "grad_norm": 0.7756, "tokens_per_sec": 143727, "dt_s": 4.56, "eta_s": 8849, "world_size": 1, "timestamp": "2026-05-05T05:27:49.840446"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73400, "epoch": 0, "train_loss": 3.5968368351459503, "train_ppl": 36.48265109573094, "lr": 0.00056, "grad_norm": 0.6705, "tokens_per_sec": 142693, "dt_s": 4.593, "eta_s": 8868, "world_size": 1, "timestamp": "2026-05-05T05:27:54.433256"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73410, "epoch": 0, "train_loss": 3.662262484431267, "train_ppl": 38.949365592026744, "lr": 0.00056, "grad_norm": 0.636, "tokens_per_sec": 144217, "dt_s": 4.544, "eta_s": 8841, "world_size": 1, "timestamp": "2026-05-05T05:27:58.977506"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73420, "epoch": 0, "train_loss": 3.7998809814453125, "train_ppl": 44.695864539522496, "lr": 0.00056, "grad_norm": 0.7496, "tokens_per_sec": 142481, "dt_s": 4.6, "eta_s": 8800, "world_size": 1, "timestamp": "2026-05-05T05:28:03.577136"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73430, "epoch": 0, "train_loss": 3.700086832046509, "train_ppl": 40.450816634767534, "lr": 0.00056, "grad_norm": 0.7151, "tokens_per_sec": 144843, "dt_s": 4.525, "eta_s": 8787, "world_size": 1, "timestamp": "2026-05-05T05:28:08.101769"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73440, "epoch": 0, "train_loss": 3.6861709505319595, "train_ppl": 39.89180644417991, "lr": 0.00056, "grad_norm": 0.6717, "tokens_per_sec": 146195, "dt_s": 4.483, "eta_s": 8753, "world_size": 1, "timestamp": "2026-05-05T05:28:12.584506"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73450, "epoch": 0, "train_loss": 3.7779500484466553, "train_ppl": 43.72631297398608, "lr": 0.00056, "grad_norm": 0.6844, "tokens_per_sec": 142819, "dt_s": 4.589, "eta_s": 8747, "world_size": 1, "timestamp": "2026-05-05T05:28:17.173280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73460, "epoch": 0, "train_loss": 3.687908411026001, "train_ppl": 39.9611771288735, "lr": 0.00056, "grad_norm": 0.6708, "tokens_per_sec": 144312, "dt_s": 4.541, "eta_s": 8741, "world_size": 1, "timestamp": "2026-05-05T05:28:21.714546"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73470, "epoch": 0, "train_loss": 3.8365324437618256, "train_ppl": 46.3644241275406, "lr": 0.00056, "grad_norm": 0.6255, "tokens_per_sec": 145614, "dt_s": 4.501, "eta_s": 8699, "world_size": 1, "timestamp": "2026-05-05T05:28:26.215223"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73480, "epoch": 0, "train_loss": 3.616667941212654, "train_ppl": 37.21336389340568, "lr": 0.00056, "grad_norm": 0.6543, "tokens_per_sec": 128897, "dt_s": 5.084, "eta_s": 8909, "world_size": 1, "timestamp": "2026-05-05T05:28:31.299598"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73490, "epoch": 0, "train_loss": 3.689175173640251, "train_ppl": 40.01183053022578, "lr": 0.00056, "grad_norm": 0.6123, "tokens_per_sec": 145564, "dt_s": 4.502, "eta_s": 8912, "world_size": 1, "timestamp": "2026-05-05T05:28:35.801798"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73500, "epoch": 0, "train_loss": 3.69465309381485, "train_ppl": 40.231613571290566, "lr": 0.00056, "grad_norm": 0.7259, "tokens_per_sec": 146287, "dt_s": 4.48, "eta_s": 8865, "world_size": 1, "timestamp": "2026-05-05T05:28:40.281750"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73510, "epoch": 0, "train_loss": 3.6595240235328674, "train_ppl": 38.8428501880347, "lr": 0.00056, "grad_norm": 0.6358, "tokens_per_sec": 123279, "dt_s": 5.316, "eta_s": 8861, "world_size": 1, "timestamp": "2026-05-05T05:28:45.597818"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73520, "epoch": 0, "train_loss": 3.6688847690820694, "train_ppl": 39.20815532181251, "lr": 0.00056, "grad_norm": 0.6609, "tokens_per_sec": 144570, "dt_s": 4.533, "eta_s": 8868, "world_size": 1, "timestamp": "2026-05-05T05:28:50.130991"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73530, "epoch": 0, "train_loss": 3.738506555557251, "train_ppl": 42.03516607935165, "lr": 0.00056, "grad_norm": 0.6719, "tokens_per_sec": 139055, "dt_s": 4.713, "eta_s": 8722, "world_size": 1, "timestamp": "2026-05-05T05:28:54.843973"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73540, "epoch": 0, "train_loss": 3.7692546993494034, "train_ppl": 43.34774569054258, "lr": 0.00056, "grad_norm": 0.7653, "tokens_per_sec": 145136, "dt_s": 4.516, "eta_s": 8722, "world_size": 1, "timestamp": "2026-05-05T05:28:59.359462"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73550, "epoch": 0, "train_loss": 3.6875929087400436, "train_ppl": 39.94857127483225, "lr": 0.00056, "grad_norm": 0.7013, "tokens_per_sec": 143261, "dt_s": 4.575, "eta_s": 8754, "world_size": 1, "timestamp": "2026-05-05T05:29:03.934059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73560, "epoch": 0, "train_loss": 3.7318004220724106, "train_ppl": 41.75421574270887, "lr": 0.00056, "grad_norm": 0.7167, "tokens_per_sec": 144992, "dt_s": 4.52, "eta_s": 8741, "world_size": 1, "timestamp": "2026-05-05T05:29:08.454017"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73570, "epoch": 0, "train_loss": 3.6183276623487473, "train_ppl": 37.275178983720615, "lr": 0.00056, "grad_norm": 0.664, "tokens_per_sec": 144359, "dt_s": 4.54, "eta_s": 8739, "world_size": 1, "timestamp": "2026-05-05T05:29:12.993804"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73580, "epoch": 0, "train_loss": 3.688601925969124, "train_ppl": 39.98890041450243, "lr": 0.00056, "grad_norm": 0.6684, "tokens_per_sec": 143658, "dt_s": 4.562, "eta_s": 8677, "world_size": 1, "timestamp": "2026-05-05T05:29:17.555744"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73590, "epoch": 0, "train_loss": 3.683479055762291, "train_ppl": 39.78456630340504, "lr": 0.00056, "grad_norm": 0.6567, "tokens_per_sec": 144058, "dt_s": 4.549, "eta_s": 8685, "world_size": 1, "timestamp": "2026-05-05T05:29:22.105027"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73600, "epoch": 0, "train_loss": 3.780395045876503, "train_ppl": 43.83335450162956, "lr": 0.00056, "grad_norm": 0.6648, "tokens_per_sec": 144134, "dt_s": 4.547, "eta_s": 8670, "world_size": 1, "timestamp": "2026-05-05T05:29:26.651943"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73610, "epoch": 0, "train_loss": 3.6804943084716797, "train_ppl": 39.66599646544951, "lr": 0.00056, "grad_norm": 0.6222, "tokens_per_sec": 141255, "dt_s": 4.64, "eta_s": 8711, "world_size": 1, "timestamp": "2026-05-05T05:29:31.291469"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73620, "epoch": 0, "train_loss": 3.656144380569458, "train_ppl": 38.711796804281875, "lr": 0.00056, "grad_norm": 0.693, "tokens_per_sec": 145478, "dt_s": 4.505, "eta_s": 8693, "world_size": 1, "timestamp": "2026-05-05T05:29:35.796344"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73630, "epoch": 0, "train_loss": 3.7365458756685257, "train_ppl": 41.95282931899274, "lr": 0.00056, "grad_norm": 0.6215, "tokens_per_sec": 142887, "dt_s": 4.587, "eta_s": 8698, "world_size": 1, "timestamp": "2026-05-05T05:29:40.382907"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73640, "epoch": 0, "train_loss": 3.6593177318573, "train_ppl": 38.83483805783189, "lr": 0.00056, "grad_norm": 0.6125, "tokens_per_sec": 147011, "dt_s": 4.458, "eta_s": 8659, "world_size": 1, "timestamp": "2026-05-05T05:29:44.840813"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73650, "epoch": 0, "train_loss": 3.6966675966978073, "train_ppl": 40.3127419620681, "lr": 0.00056, "grad_norm": 0.6399, "tokens_per_sec": 144786, "dt_s": 4.526, "eta_s": 8647, "world_size": 1, "timestamp": "2026-05-05T05:29:49.367246"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73660, "epoch": 0, "train_loss": 3.6917383521795273, "train_ppl": 40.11451954447554, "lr": 0.00056, "grad_norm": 0.6333, "tokens_per_sec": 142128, "dt_s": 4.611, "eta_s": 8631, "world_size": 1, "timestamp": "2026-05-05T05:29:53.978302"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73670, "epoch": 0, "train_loss": 3.6631685942411423, "train_ppl": 38.984673988504525, "lr": 0.00056, "grad_norm": 0.6782, "tokens_per_sec": 143524, "dt_s": 4.566, "eta_s": 8650, "world_size": 1, "timestamp": "2026-05-05T05:29:58.544464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73680, "epoch": 0, "train_loss": 3.7201652228832245, "train_ppl": 41.27121249404107, "lr": 0.00056, "grad_norm": 0.6518, "tokens_per_sec": 145530, "dt_s": 4.503, "eta_s": 8614, "world_size": 1, "timestamp": "2026-05-05T05:30:03.047758"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73690, "epoch": 0, "train_loss": 3.6977670937776566, "train_ppl": 40.3570900799815, "lr": 0.00056, "grad_norm": 0.6376, "tokens_per_sec": 145363, "dt_s": 4.508, "eta_s": 8628, "world_size": 1, "timestamp": "2026-05-05T05:30:07.556206"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73700, "epoch": 0, "train_loss": 3.8303513675928116, "train_ppl": 46.0787259611474, "lr": 0.00056, "grad_norm": 0.7379, "tokens_per_sec": 148722, "dt_s": 4.407, "eta_s": 8578, "world_size": 1, "timestamp": "2026-05-05T05:30:11.962799"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73710, "epoch": 0, "train_loss": 3.714573413133621, "train_ppl": 41.04107576570531, "lr": 0.00056, "grad_norm": 0.7423, "tokens_per_sec": 145680, "dt_s": 4.499, "eta_s": 8531, "world_size": 1, "timestamp": "2026-05-05T05:30:16.461408"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73720, "epoch": 0, "train_loss": 3.6883358508348465, "train_ppl": 39.97826177785614, "lr": 0.00056, "grad_norm": 0.6312, "tokens_per_sec": 147082, "dt_s": 4.456, "eta_s": 8485, "world_size": 1, "timestamp": "2026-05-05T05:30:20.917138"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73730, "epoch": 0, "train_loss": 3.7240283489227295, "train_ppl": 41.43095674713551, "lr": 0.00056, "grad_norm": 0.6581, "tokens_per_sec": 147011, "dt_s": 4.458, "eta_s": 8463, "world_size": 1, "timestamp": "2026-05-05T05:30:25.375052"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73740, "epoch": 0, "train_loss": 3.851042225956917, "train_ppl": 47.04206615349714, "lr": 0.00056, "grad_norm": 0.6687, "tokens_per_sec": 143774, "dt_s": 4.558, "eta_s": 8478, "world_size": 1, "timestamp": "2026-05-05T05:30:29.933367"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73750, "epoch": 0, "train_loss": 3.6481624990701675, "train_ppl": 38.404033727743695, "lr": 0.00056, "grad_norm": 0.6803, "tokens_per_sec": 149684, "dt_s": 4.378, "eta_s": 8462, "world_size": 1, "timestamp": "2026-05-05T05:30:34.311592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73760, "epoch": 0, "train_loss": 3.7491308450698853, "train_ppl": 42.484140648213206, "lr": 0.00056, "grad_norm": 0.6508, "tokens_per_sec": 145413, "dt_s": 4.507, "eta_s": 8461, "world_size": 1, "timestamp": "2026-05-05T05:30:38.818482"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73770, "epoch": 0, "train_loss": 3.7278264611959457, "train_ppl": 41.588615385525216, "lr": 0.00056, "grad_norm": 0.6786, "tokens_per_sec": 147083, "dt_s": 4.456, "eta_s": 8457, "world_size": 1, "timestamp": "2026-05-05T05:30:43.274202"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73780, "epoch": 0, "train_loss": 3.765882357954979, "train_ppl": 43.20180850670653, "lr": 0.00056, "grad_norm": 0.7083, "tokens_per_sec": 131494, "dt_s": 4.984, "eta_s": 8651, "world_size": 1, "timestamp": "2026-05-05T05:30:48.258162"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73790, "epoch": 0, "train_loss": 3.7498475164175034, "train_ppl": 42.51459872745666, "lr": 0.00056, "grad_norm": 0.7507, "tokens_per_sec": 142499, "dt_s": 4.599, "eta_s": 8662, "world_size": 1, "timestamp": "2026-05-05T05:30:52.857236"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73800, "epoch": 0, "train_loss": 3.6350472271442413, "train_ppl": 37.9036429356848, "lr": 0.00056, "grad_norm": 0.6258, "tokens_per_sec": 147078, "dt_s": 4.456, "eta_s": 8687, "world_size": 1, "timestamp": "2026-05-05T05:30:57.313088"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73810, "epoch": 0, "train_loss": 3.718214377760887, "train_ppl": 41.19077723433117, "lr": 0.00056, "grad_norm": 0.6576, "tokens_per_sec": 146851, "dt_s": 4.463, "eta_s": 8665, "world_size": 1, "timestamp": "2026-05-05T05:31:01.775829"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73820, "epoch": 0, "train_loss": 3.801148623228073, "train_ppl": 44.7525588113514, "lr": 0.00056, "grad_norm": 0.6432, "tokens_per_sec": 144383, "dt_s": 4.539, "eta_s": 8692, "world_size": 1, "timestamp": "2026-05-05T05:31:06.314839"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73830, "epoch": 0, "train_loss": 3.6607052385807037, "train_ppl": 38.88875905596254, "lr": 0.00056, "grad_norm": 0.6259, "tokens_per_sec": 148508, "dt_s": 4.413, "eta_s": 8472, "world_size": 1, "timestamp": "2026-05-05T05:31:10.727818"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73840, "epoch": 0, "train_loss": 3.7400962710380554, "train_ppl": 42.10204317730085, "lr": 0.00056, "grad_norm": 0.792, "tokens_per_sec": 148313, "dt_s": 4.419, "eta_s": 8400, "world_size": 1, "timestamp": "2026-05-05T05:31:15.146580"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73850, "epoch": 0, "train_loss": 3.7090925127267838, "train_ppl": 40.81674903464909, "lr": 0.00056, "grad_norm": 0.641, "tokens_per_sec": 144714, "dt_s": 4.529, "eta_s": 8423, "world_size": 1, "timestamp": "2026-05-05T05:31:19.675251"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73860, "epoch": 0, "train_loss": 3.686088427901268, "train_ppl": 39.88851460319666, "lr": 0.00056, "grad_norm": 0.6352, "tokens_per_sec": 146490, "dt_s": 4.474, "eta_s": 8422, "world_size": 1, "timestamp": "2026-05-05T05:31:24.148999"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73870, "epoch": 0, "train_loss": 3.685244932770729, "train_ppl": 39.854883021396205, "lr": 0.00056, "grad_norm": 0.643, "tokens_per_sec": 146600, "dt_s": 4.47, "eta_s": 8392, "world_size": 1, "timestamp": "2026-05-05T05:31:28.619398"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73880, "epoch": 0, "train_loss": 3.751804858446121, "train_ppl": 42.597895832243864, "lr": 0.00056, "grad_norm": 0.742, "tokens_per_sec": 149257, "dt_s": 4.391, "eta_s": 8379, "world_size": 1, "timestamp": "2026-05-05T05:31:33.010216"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73890, "epoch": 0, "train_loss": 3.7017468959093094, "train_ppl": 40.518023341962575, "lr": 0.00056, "grad_norm": 0.6582, "tokens_per_sec": 148024, "dt_s": 4.427, "eta_s": 8378, "world_size": 1, "timestamp": "2026-05-05T05:31:37.437590"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73900, "epoch": 0, "train_loss": 3.7649655640125275, "train_ppl": 43.16221950061803, "lr": 0.00056, "grad_norm": 0.6544, "tokens_per_sec": 144688, "dt_s": 4.529, "eta_s": 8374, "world_size": 1, "timestamp": "2026-05-05T05:31:41.967053"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73910, "epoch": 0, "train_loss": 3.8266957253217697, "train_ppl": 45.91058613949557, "lr": 0.00056, "grad_norm": 0.6586, "tokens_per_sec": 147955, "dt_s": 4.429, "eta_s": 8353, "world_size": 1, "timestamp": "2026-05-05T05:31:46.396500"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73920, "epoch": 0, "train_loss": 3.639964669942856, "train_ppl": 38.09049096240386, "lr": 0.00056, "grad_norm": 0.626, "tokens_per_sec": 145217, "dt_s": 4.513, "eta_s": 8364, "world_size": 1, "timestamp": "2026-05-05T05:31:50.909490"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73930, "epoch": 0, "train_loss": 3.763289839029312, "train_ppl": 43.08995205816013, "lr": 0.00056, "grad_norm": 0.9645, "tokens_per_sec": 147663, "dt_s": 4.438, "eta_s": 8378, "world_size": 1, "timestamp": "2026-05-05T05:31:55.347693"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73940, "epoch": 0, "train_loss": 3.7265371680259705, "train_ppl": 41.535030018806395, "lr": 0.00056, "grad_norm": 0.6572, "tokens_per_sec": 148817, "dt_s": 4.404, "eta_s": 8364, "world_size": 1, "timestamp": "2026-05-05T05:31:59.751487"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73950, "epoch": 0, "train_loss": 3.775447443127632, "train_ppl": 43.6170200860795, "lr": 0.00056, "grad_norm": 0.6689, "tokens_per_sec": 145683, "dt_s": 4.499, "eta_s": 8348, "world_size": 1, "timestamp": "2026-05-05T05:32:04.250014"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73960, "epoch": 0, "train_loss": 3.7065481543540955, "train_ppl": 40.71302860442449, "lr": 0.00056, "grad_norm": 0.6888, "tokens_per_sec": 145743, "dt_s": 4.497, "eta_s": 8369, "world_size": 1, "timestamp": "2026-05-05T05:32:08.746714"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73970, "epoch": 0, "train_loss": 3.7057657539844513, "train_ppl": 40.681187173792914, "lr": 0.00056, "grad_norm": 0.676, "tokens_per_sec": 148070, "dt_s": 4.426, "eta_s": 8332, "world_size": 1, "timestamp": "2026-05-05T05:32:13.172713"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73980, "epoch": 0, "train_loss": 3.781156614422798, "train_ppl": 43.866749320304315, "lr": 0.00056, "grad_norm": 0.7163, "tokens_per_sec": 148108, "dt_s": 4.425, "eta_s": 8323, "world_size": 1, "timestamp": "2026-05-05T05:32:17.597577"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 73990, "epoch": 0, "train_loss": 3.8112727999687195, "train_ppl": 45.207942931318364, "lr": 0.00056, "grad_norm": 0.654, "tokens_per_sec": 148749, "dt_s": 4.406, "eta_s": 8319, "world_size": 1, "timestamp": "2026-05-05T05:32:22.003402"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74000, "epoch": 0, "train_loss": 3.6534045189619064, "train_ppl": 38.60587700751106, "lr": 0.00056, "grad_norm": 0.6571, "tokens_per_sec": 146555, "dt_s": 4.472, "eta_s": 8304, "world_size": 1, "timestamp": "2026-05-05T05:32:26.475171"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74010, "epoch": 0, "train_loss": 3.9197145253419876, "train_ppl": 50.38605878184517, "lr": 0.00056, "grad_norm": 0.861, "tokens_per_sec": 122558, "dt_s": 5.347, "eta_s": 8326, "world_size": 1, "timestamp": "2026-05-05T05:32:31.822500"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74020, "epoch": 0, "train_loss": 3.7828238904476166, "train_ppl": 43.93994830423165, "lr": 0.00056, "grad_norm": 0.707, "tokens_per_sec": 147389, "dt_s": 4.446, "eta_s": 8329, "world_size": 1, "timestamp": "2026-05-05T05:32:36.268952"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74030, "epoch": 0, "train_loss": 3.7152353078126907, "train_ppl": 41.0682496275031, "lr": 0.00056, "grad_norm": 0.66, "tokens_per_sec": 143844, "dt_s": 4.556, "eta_s": 8373, "world_size": 1, "timestamp": "2026-05-05T05:32:40.825008"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74040, "epoch": 0, "train_loss": 3.8054751604795456, "train_ppl": 44.946601888759105, "lr": 0.00056, "grad_norm": 0.6432, "tokens_per_sec": 148028, "dt_s": 4.427, "eta_s": 8377, "world_size": 1, "timestamp": "2026-05-05T05:32:45.252293"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74050, "epoch": 0, "train_loss": 3.7689846009016037, "train_ppl": 43.336039112751365, "lr": 0.00056, "grad_norm": 0.6594, "tokens_per_sec": 145735, "dt_s": 4.497, "eta_s": 8382, "world_size": 1, "timestamp": "2026-05-05T05:32:49.749224"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74060, "epoch": 0, "train_loss": 3.771371841430664, "train_ppl": 43.4396162442636, "lr": 0.00056, "grad_norm": 0.7154, "tokens_per_sec": 144282, "dt_s": 4.542, "eta_s": 8369, "world_size": 1, "timestamp": "2026-05-05T05:32:54.291431"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74070, "epoch": 0, "train_loss": 3.718433767557144, "train_ppl": 41.19981506192361, "lr": 0.00056, "grad_norm": 0.6682, "tokens_per_sec": 132689, "dt_s": 4.939, "eta_s": 8547, "world_size": 1, "timestamp": "2026-05-05T05:32:59.230463"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74080, "epoch": 0, "train_loss": 3.5912643671035767, "train_ppl": 36.27991807465143, "lr": 0.00056, "grad_norm": 0.6404, "tokens_per_sec": 147503, "dt_s": 4.443, "eta_s": 8501, "world_size": 1, "timestamp": "2026-05-05T05:33:03.673492"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74090, "epoch": 0, "train_loss": 3.6375788897275925, "train_ppl": 37.999723741050055, "lr": 0.00056, "grad_norm": 0.6732, "tokens_per_sec": 145587, "dt_s": 4.502, "eta_s": 8524, "world_size": 1, "timestamp": "2026-05-05T05:33:08.175025"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74100, "epoch": 0, "train_loss": 3.723304823040962, "train_ppl": 41.400991219347254, "lr": 0.00056, "grad_norm": 0.7535, "tokens_per_sec": 147948, "dt_s": 4.43, "eta_s": 8494, "world_size": 1, "timestamp": "2026-05-05T05:33:12.604681"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74110, "epoch": 0, "train_loss": 3.7126921117305756, "train_ppl": 40.963937714997485, "lr": 0.00056, "grad_norm": 0.6189, "tokens_per_sec": 146445, "dt_s": 4.475, "eta_s": 8465, "world_size": 1, "timestamp": "2026-05-05T05:33:17.079824"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74120, "epoch": 0, "train_loss": 3.6970867663621902, "train_ppl": 40.329643382620624, "lr": 0.00056, "grad_norm": 0.6503, "tokens_per_sec": 141453, "dt_s": 4.633, "eta_s": 8347, "world_size": 1, "timestamp": "2026-05-05T05:33:21.712893"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74130, "epoch": 0, "train_loss": 3.6751381009817123, "train_ppl": 39.4541051316371, "lr": 0.00056, "grad_norm": 0.6431, "tokens_per_sec": 145851, "dt_s": 4.493, "eta_s": 8361, "world_size": 1, "timestamp": "2026-05-05T05:33:26.206222"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74140, "epoch": 0, "train_loss": 3.6232195794582367, "train_ppl": 37.45797281114209, "lr": 0.00056, "grad_norm": 0.6154, "tokens_per_sec": 145885, "dt_s": 4.492, "eta_s": 8353, "world_size": 1, "timestamp": "2026-05-05T05:33:30.698527"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74150, "epoch": 0, "train_loss": 3.71145398914814, "train_ppl": 40.91325072347652, "lr": 0.00056, "grad_norm": 0.7243, "tokens_per_sec": 150427, "dt_s": 4.357, "eta_s": 8321, "world_size": 1, "timestamp": "2026-05-05T05:33:35.055188"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74160, "epoch": 0, "train_loss": 3.776407763361931, "train_ppl": 43.6589265116016, "lr": 0.00056, "grad_norm": 0.6515, "tokens_per_sec": 149802, "dt_s": 4.375, "eta_s": 8280, "world_size": 1, "timestamp": "2026-05-05T05:33:39.430025"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74170, "epoch": 0, "train_loss": 3.6578414142131805, "train_ppl": 38.77754780091457, "lr": 0.00056, "grad_norm": 0.6535, "tokens_per_sec": 143583, "dt_s": 4.564, "eta_s": 8250, "world_size": 1, "timestamp": "2026-05-05T05:33:43.994361"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74180, "epoch": 0, "train_loss": 3.712516114115715, "train_ppl": 40.9567287940594, "lr": 0.00056, "grad_norm": 0.643, "tokens_per_sec": 148646, "dt_s": 4.409, "eta_s": 8214, "world_size": 1, "timestamp": "2026-05-05T05:33:48.403226"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74190, "epoch": 0, "train_loss": 3.7827616930007935, "train_ppl": 43.93721543662316, "lr": 0.00056, "grad_norm": 0.6692, "tokens_per_sec": 146764, "dt_s": 4.465, "eta_s": 8200, "world_size": 1, "timestamp": "2026-05-05T05:33:52.868617"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74200, "epoch": 0, "train_loss": 3.6381116658449173, "train_ppl": 38.01997448040036, "lr": 0.00056, "grad_norm": 0.6228, "tokens_per_sec": 148013, "dt_s": 4.428, "eta_s": 8221, "world_size": 1, "timestamp": "2026-05-05T05:33:57.296352"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74210, "epoch": 0, "train_loss": 3.6930387914180756, "train_ppl": 40.16671997411364, "lr": 0.00056, "grad_norm": 0.6965, "tokens_per_sec": 149383, "dt_s": 4.387, "eta_s": 8222, "world_size": 1, "timestamp": "2026-05-05T05:34:01.683459"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74220, "epoch": 0, "train_loss": 3.5313849449157715, "train_ppl": 34.17126017144213, "lr": 0.00056, "grad_norm": 0.6157, "tokens_per_sec": 145409, "dt_s": 4.507, "eta_s": 8196, "world_size": 1, "timestamp": "2026-05-05T05:34:06.190453"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74230, "epoch": 0, "train_loss": 3.7213250249624252, "train_ppl": 41.319106700634286, "lr": 0.00056, "grad_norm": 0.6429, "tokens_per_sec": 146526, "dt_s": 4.473, "eta_s": 8215, "world_size": 1, "timestamp": "2026-05-05T05:34:10.663097"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74240, "epoch": 0, "train_loss": 3.738951861858368, "train_ppl": 42.05388877203241, "lr": 0.00056, "grad_norm": 0.687, "tokens_per_sec": 150754, "dt_s": 4.347, "eta_s": 8167, "world_size": 1, "timestamp": "2026-05-05T05:34:15.010324"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74250, "epoch": 0, "train_loss": 3.7477365136146545, "train_ppl": 42.42494495335913, "lr": 0.00056, "grad_norm": 0.6679, "tokens_per_sec": 146174, "dt_s": 4.483, "eta_s": 8183, "world_size": 1, "timestamp": "2026-05-05T05:34:19.493716"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74260, "epoch": 0, "train_loss": 3.790719613432884, "train_ppl": 44.28825923780668, "lr": 0.00056, "grad_norm": 1.429, "tokens_per_sec": 148439, "dt_s": 4.415, "eta_s": 8189, "world_size": 1, "timestamp": "2026-05-05T05:34:23.908738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74270, "epoch": 0, "train_loss": 3.700746104121208, "train_ppl": 40.47749352127054, "lr": 0.00056, "grad_norm": 0.669, "tokens_per_sec": 146048, "dt_s": 4.487, "eta_s": 8177, "world_size": 1, "timestamp": "2026-05-05T05:34:28.396017"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74280, "epoch": 0, "train_loss": 3.6890721917152405, "train_ppl": 40.00771024705556, "lr": 0.00056, "grad_norm": 0.6797, "tokens_per_sec": 147218, "dt_s": 4.452, "eta_s": 8165, "world_size": 1, "timestamp": "2026-05-05T05:34:32.847636"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74290, "epoch": 0, "train_loss": 3.682404085993767, "train_ppl": 39.741822075867354, "lr": 0.00056, "grad_norm": 0.6517, "tokens_per_sec": 143743, "dt_s": 4.559, "eta_s": 8239, "world_size": 1, "timestamp": "2026-05-05T05:34:37.406876"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74300, "epoch": 0, "train_loss": 3.6511932760477066, "train_ppl": 38.5206043495578, "lr": 0.00056, "grad_norm": 0.6449, "tokens_per_sec": 142266, "dt_s": 4.607, "eta_s": 8279, "world_size": 1, "timestamp": "2026-05-05T05:34:42.013473"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74310, "epoch": 0, "train_loss": 3.626666247844696, "train_ppl": 37.58730076911371, "lr": 0.00056, "grad_norm": 0.6917, "tokens_per_sec": 142673, "dt_s": 4.593, "eta_s": 8341, "world_size": 1, "timestamp": "2026-05-05T05:34:46.606922"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74320, "epoch": 0, "train_loss": 3.692184701561928, "train_ppl": 40.13242863205727, "lr": 0.00056, "grad_norm": 0.6165, "tokens_per_sec": 142802, "dt_s": 4.589, "eta_s": 8373, "world_size": 1, "timestamp": "2026-05-05T05:34:51.196210"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74330, "epoch": 0, "train_loss": 3.749492198228836, "train_ppl": 42.499495200682695, "lr": 0.00056, "grad_norm": 0.6761, "tokens_per_sec": 143046, "dt_s": 4.581, "eta_s": 8417, "world_size": 1, "timestamp": "2026-05-05T05:34:55.777666"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74340, "epoch": 0, "train_loss": 3.6830729097127914, "train_ppl": 39.768411239849556, "lr": 0.00056, "grad_norm": 0.6399, "tokens_per_sec": 147290, "dt_s": 4.449, "eta_s": 8372, "world_size": 1, "timestamp": "2026-05-05T05:35:00.227119"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74350, "epoch": 0, "train_loss": 3.8211619555950165, "train_ppl": 45.657229183842176, "lr": 0.00056, "grad_norm": 0.7008, "tokens_per_sec": 147019, "dt_s": 4.458, "eta_s": 8313, "world_size": 1, "timestamp": "2026-05-05T05:35:04.684795"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74360, "epoch": 0, "train_loss": 3.7113366574048996, "train_ppl": 40.90845058205749, "lr": 0.00056, "grad_norm": 0.6356, "tokens_per_sec": 143941, "dt_s": 4.553, "eta_s": 8293, "world_size": 1, "timestamp": "2026-05-05T05:35:09.237784"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74370, "epoch": 0, "train_loss": 3.7127090096473694, "train_ppl": 40.964629926056986, "lr": 0.00056, "grad_norm": 0.626, "tokens_per_sec": 132059, "dt_s": 4.963, "eta_s": 8425, "world_size": 1, "timestamp": "2026-05-05T05:35:14.200412"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74380, "epoch": 0, "train_loss": 3.8097680658102036, "train_ppl": 45.13996815016678, "lr": 0.00056, "grad_norm": 0.6678, "tokens_per_sec": 143773, "dt_s": 4.558, "eta_s": 8412, "world_size": 1, "timestamp": "2026-05-05T05:35:18.758698"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74390, "epoch": 0, "train_loss": 3.5628005117177963, "train_ppl": 35.26181010967739, "lr": 0.00056, "grad_norm": 0.6704, "tokens_per_sec": 147733, "dt_s": 4.436, "eta_s": 8403, "world_size": 1, "timestamp": "2026-05-05T05:35:23.194802"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74400, "epoch": 0, "train_loss": 3.7116974741220474, "train_ppl": 40.92321369812949, "lr": 0.00056, "grad_norm": 0.8168, "tokens_per_sec": 148448, "dt_s": 4.415, "eta_s": 8383, "world_size": 1, "timestamp": "2026-05-05T05:35:27.609527"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74410, "epoch": 0, "train_loss": 3.7483667135238647, "train_ppl": 42.45168957616127, "lr": 0.00056, "grad_norm": 0.7273, "tokens_per_sec": 145241, "dt_s": 4.512, "eta_s": 8363, "world_size": 1, "timestamp": "2026-05-05T05:35:32.121766"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74420, "epoch": 0, "train_loss": 3.6866201907396317, "train_ppl": 39.90973147361182, "lr": 0.00056, "grad_norm": 0.7523, "tokens_per_sec": 147042, "dt_s": 4.457, "eta_s": 8174, "world_size": 1, "timestamp": "2026-05-05T05:35:36.578732"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74430, "epoch": 0, "train_loss": 3.7049876302480698, "train_ppl": 40.649544488987445, "lr": 0.00056, "grad_norm": 0.6937, "tokens_per_sec": 147102, "dt_s": 4.455, "eta_s": 8132, "world_size": 1, "timestamp": "2026-05-05T05:35:41.033895"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74440, "epoch": 0, "train_loss": 3.8238560408353806, "train_ppl": 45.78039949225879, "lr": 0.00056, "grad_norm": 0.6792, "tokens_per_sec": 145871, "dt_s": 4.493, "eta_s": 8148, "world_size": 1, "timestamp": "2026-05-05T05:35:45.526633"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74450, "epoch": 0, "train_loss": 3.6156915575265884, "train_ppl": 37.17704710444181, "lr": 0.00056, "grad_norm": 0.669, "tokens_per_sec": 149564, "dt_s": 4.382, "eta_s": 8131, "world_size": 1, "timestamp": "2026-05-05T05:35:49.908430"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74460, "epoch": 0, "train_loss": 3.670917421579361, "train_ppl": 39.28793292925015, "lr": 0.00056, "grad_norm": 0.6271, "tokens_per_sec": 147175, "dt_s": 4.453, "eta_s": 8105, "world_size": 1, "timestamp": "2026-05-05T05:35:54.361394"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74470, "epoch": 0, "train_loss": 3.7233096808195114, "train_ppl": 41.401192336682804, "lr": 0.00056, "grad_norm": 0.6473, "tokens_per_sec": 147599, "dt_s": 4.44, "eta_s": 8095, "world_size": 1, "timestamp": "2026-05-05T05:35:58.801497"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74480, "epoch": 0, "train_loss": 3.7127913534641266, "train_ppl": 40.96800324892138, "lr": 0.00056, "grad_norm": 0.6249, "tokens_per_sec": 148158, "dt_s": 4.423, "eta_s": 8079, "world_size": 1, "timestamp": "2026-05-05T05:36:03.224910"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74490, "epoch": 0, "train_loss": 3.6366013139486313, "train_ppl": 37.962594282899985, "lr": 0.00056, "grad_norm": 0.6423, "tokens_per_sec": 147290, "dt_s": 4.449, "eta_s": 8059, "world_size": 1, "timestamp": "2026-05-05T05:36:07.674359"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74500, "epoch": 0, "train_loss": 3.7411864399909973, "train_ppl": 42.147966545194905, "lr": 0.00056, "grad_norm": 0.6939, "tokens_per_sec": 146648, "dt_s": 4.469, "eta_s": 8086, "world_size": 1, "timestamp": "2026-05-05T05:36:12.143285"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74510, "epoch": 0, "train_loss": 3.7527536302804947, "train_ppl": 42.638330694709204, "lr": 0.00056, "grad_norm": 0.6513, "tokens_per_sec": 125075, "dt_s": 5.24, "eta_s": 8088, "world_size": 1, "timestamp": "2026-05-05T05:36:17.383026"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74520, "epoch": 0, "train_loss": 3.779154747724533, "train_ppl": 43.77902177440231, "lr": 0.00056, "grad_norm": 0.703, "tokens_per_sec": 142219, "dt_s": 4.608, "eta_s": 8145, "world_size": 1, "timestamp": "2026-05-05T05:36:21.991110"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74530, "epoch": 0, "train_loss": 3.5888556391000748, "train_ppl": 36.19263478307584, "lr": 0.00056, "grad_norm": 0.6747, "tokens_per_sec": 145499, "dt_s": 4.504, "eta_s": 8170, "world_size": 1, "timestamp": "2026-05-05T05:36:26.495347"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74540, "epoch": 0, "train_loss": 4.025417670607567, "train_ppl": 56.00369499475168, "lr": 0.00056, "grad_norm": 1.1995, "tokens_per_sec": 146117, "dt_s": 4.485, "eta_s": 8178, "world_size": 1, "timestamp": "2026-05-05T05:36:30.980506"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74550, "epoch": 0, "train_loss": 3.6226508170366287, "train_ppl": 37.4366741813218, "lr": 0.00056, "grad_norm": 0.6131, "tokens_per_sec": 145656, "dt_s": 4.499, "eta_s": 8185, "world_size": 1, "timestamp": "2026-05-05T05:36:35.479880"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74560, "epoch": 0, "train_loss": 3.694113254547119, "train_ppl": 40.20990082770373, "lr": 0.00056, "grad_norm": 0.6872, "tokens_per_sec": 149211, "dt_s": 4.392, "eta_s": 8151, "world_size": 1, "timestamp": "2026-05-05T05:36:39.872068"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74570, "epoch": 0, "train_loss": 3.697236865758896, "train_ppl": 40.335697292094416, "lr": 0.00056, "grad_norm": 0.678, "tokens_per_sec": 148258, "dt_s": 4.42, "eta_s": 8079, "world_size": 1, "timestamp": "2026-05-05T05:36:44.292448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74580, "epoch": 0, "train_loss": 3.7360928058624268, "train_ppl": 41.933826063973484, "lr": 0.00056, "grad_norm": 0.6226, "tokens_per_sec": 150312, "dt_s": 4.36, "eta_s": 8022, "world_size": 1, "timestamp": "2026-05-05T05:36:48.652428"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74590, "epoch": 0, "train_loss": 3.7567795664072037, "train_ppl": 42.81033589929521, "lr": 0.00056, "grad_norm": 0.6372, "tokens_per_sec": 150652, "dt_s": 4.35, "eta_s": 7969, "world_size": 1, "timestamp": "2026-05-05T05:36:53.002609"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74600, "epoch": 0, "train_loss": 3.6620593070983887, "train_ppl": 38.94145276768895, "lr": 0.00056, "grad_norm": 0.6525, "tokens_per_sec": 144163, "dt_s": 4.546, "eta_s": 7981, "world_size": 1, "timestamp": "2026-05-05T05:36:57.548581"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74610, "epoch": 0, "train_loss": 3.7159773409366608, "train_ppl": 41.09873493822655, "lr": 0.00056, "grad_norm": 0.6461, "tokens_per_sec": 147250, "dt_s": 4.451, "eta_s": 7998, "world_size": 1, "timestamp": "2026-05-05T05:37:01.999235"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74620, "epoch": 0, "train_loss": 3.7154343128204346, "train_ppl": 41.076423228105, "lr": 0.00056, "grad_norm": 0.706, "tokens_per_sec": 147988, "dt_s": 4.428, "eta_s": 7996, "world_size": 1, "timestamp": "2026-05-05T05:37:06.427701"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74630, "epoch": 0, "train_loss": 3.708693638443947, "train_ppl": 40.800471529705035, "lr": 0.00056, "grad_norm": 0.6292, "tokens_per_sec": 144750, "dt_s": 4.528, "eta_s": 8052, "world_size": 1, "timestamp": "2026-05-05T05:37:10.955232"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74640, "epoch": 0, "train_loss": 3.674119472503662, "train_ppl": 39.41393651849392, "lr": 0.00056, "grad_norm": 0.7148, "tokens_per_sec": 149912, "dt_s": 4.372, "eta_s": 8056, "world_size": 1, "timestamp": "2026-05-05T05:37:15.326833"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74650, "epoch": 0, "train_loss": 3.6786295771598816, "train_ppl": 39.59209896073534, "lr": 0.00056, "grad_norm": 0.6545, "tokens_per_sec": 147194, "dt_s": 4.452, "eta_s": 8018, "world_size": 1, "timestamp": "2026-05-05T05:37:19.779215"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74660, "epoch": 0, "train_loss": 3.763374909758568, "train_ppl": 43.09361790773134, "lr": 0.00056, "grad_norm": 0.7303, "tokens_per_sec": 133723, "dt_s": 4.901, "eta_s": 8175, "world_size": 1, "timestamp": "2026-05-05T05:37:24.680081"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74670, "epoch": 0, "train_loss": 3.7344503700733185, "train_ppl": 41.865008976565456, "lr": 0.00056, "grad_norm": 0.6926, "tokens_per_sec": 147940, "dt_s": 4.43, "eta_s": 8171, "world_size": 1, "timestamp": "2026-05-05T05:37:29.109987"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74680, "epoch": 0, "train_loss": 3.6839121878147125, "train_ppl": 39.80180200666105, "lr": 0.00056, "grad_norm": 0.6322, "tokens_per_sec": 144149, "dt_s": 4.546, "eta_s": 8174, "world_size": 1, "timestamp": "2026-05-05T05:37:33.656397"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74690, "epoch": 0, "train_loss": 3.6481942385435104, "train_ppl": 38.40525267089266, "lr": 0.00056, "grad_norm": 0.6354, "tokens_per_sec": 148603, "dt_s": 4.41, "eta_s": 8183, "world_size": 1, "timestamp": "2026-05-05T05:37:38.066557"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74700, "epoch": 0, "train_loss": 3.8031495213508606, "train_ppl": 44.84219376756963, "lr": 0.00056, "grad_norm": 0.7262, "tokens_per_sec": 147286, "dt_s": 4.45, "eta_s": 8177, "world_size": 1, "timestamp": "2026-05-05T05:37:42.516119"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74710, "epoch": 0, "train_loss": 3.7173905968666077, "train_ppl": 41.15685903152685, "lr": 0.00056, "grad_norm": 0.6358, "tokens_per_sec": 145412, "dt_s": 4.507, "eta_s": 8031, "world_size": 1, "timestamp": "2026-05-05T05:37:47.023046"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74720, "epoch": 0, "train_loss": 3.8073936253786087, "train_ppl": 45.03291313288855, "lr": 0.00056, "grad_norm": 0.6739, "tokens_per_sec": 146932, "dt_s": 4.46, "eta_s": 8038, "world_size": 1, "timestamp": "2026-05-05T05:37:51.483359"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74730, "epoch": 0, "train_loss": 3.7230992764234543, "train_ppl": 41.39248226016449, "lr": 0.00056, "grad_norm": 0.6603, "tokens_per_sec": 144317, "dt_s": 4.541, "eta_s": 8031, "world_size": 1, "timestamp": "2026-05-05T05:37:56.024461"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74740, "epoch": 0, "train_loss": 3.675141289830208, "train_ppl": 39.45423094500149, "lr": 0.00056, "grad_norm": 0.6766, "tokens_per_sec": 148031, "dt_s": 4.427, "eta_s": 8033, "world_size": 1, "timestamp": "2026-05-05T05:38:00.451663"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74750, "epoch": 0, "train_loss": 3.735057845711708, "train_ppl": 41.89044867583613, "lr": 0.00056, "grad_norm": 0.6585, "tokens_per_sec": 147772, "dt_s": 4.435, "eta_s": 8023, "world_size": 1, "timestamp": "2026-05-05T05:38:04.886600"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74760, "epoch": 0, "train_loss": 3.6733291149139404, "train_ppl": 39.38279772163888, "lr": 0.00056, "grad_norm": 0.6729, "tokens_per_sec": 145731, "dt_s": 4.497, "eta_s": 8015, "world_size": 1, "timestamp": "2026-05-05T05:38:09.383657"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74770, "epoch": 0, "train_loss": 3.6872508972883224, "train_ppl": 39.934910742138825, "lr": 0.00056, "grad_norm": 0.6441, "tokens_per_sec": 145893, "dt_s": 4.492, "eta_s": 8022, "world_size": 1, "timestamp": "2026-05-05T05:38:13.875703"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74780, "epoch": 0, "train_loss": 3.7037027180194855, "train_ppl": 40.59734693400702, "lr": 0.00056, "grad_norm": 0.6526, "tokens_per_sec": 145198, "dt_s": 4.514, "eta_s": 8008, "world_size": 1, "timestamp": "2026-05-05T05:38:18.389280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74790, "epoch": 0, "train_loss": 3.661094516515732, "train_ppl": 38.903900538715014, "lr": 0.00056, "grad_norm": 0.6748, "tokens_per_sec": 144188, "dt_s": 4.545, "eta_s": 8046, "world_size": 1, "timestamp": "2026-05-05T05:38:22.934460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74800, "epoch": 0, "train_loss": 3.689399868249893, "train_ppl": 40.020821982995486, "lr": 0.00056, "grad_norm": 0.6549, "tokens_per_sec": 146497, "dt_s": 4.474, "eta_s": 8055, "world_size": 1, "timestamp": "2026-05-05T05:38:27.408007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74810, "epoch": 0, "train_loss": 3.7918500751256943, "train_ppl": 44.338353727932486, "lr": 0.00056, "grad_norm": 0.6381, "tokens_per_sec": 147144, "dt_s": 4.454, "eta_s": 8035, "world_size": 1, "timestamp": "2026-05-05T05:38:31.861862"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74820, "epoch": 0, "train_loss": 3.7466065883636475, "train_ppl": 42.37703500920892, "lr": 0.00056, "grad_norm": 0.6787, "tokens_per_sec": 146927, "dt_s": 4.46, "eta_s": 8019, "world_size": 1, "timestamp": "2026-05-05T05:38:36.322348"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74830, "epoch": 0, "train_loss": 3.7963484674692154, "train_ppl": 44.538254317416005, "lr": 0.00056, "grad_norm": 0.661, "tokens_per_sec": 147255, "dt_s": 4.451, "eta_s": 7992, "world_size": 1, "timestamp": "2026-05-05T05:38:40.772834"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74840, "epoch": 0, "train_loss": 3.695446640253067, "train_ppl": 40.263551895539514, "lr": 0.00056, "grad_norm": 0.6702, "tokens_per_sec": 145822, "dt_s": 4.494, "eta_s": 7970, "world_size": 1, "timestamp": "2026-05-05T05:38:45.267076"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74850, "epoch": 0, "train_loss": 3.7639498710632324, "train_ppl": 43.118402194826544, "lr": 0.00056, "grad_norm": 0.7267, "tokens_per_sec": 149909, "dt_s": 4.372, "eta_s": 7929, "world_size": 1, "timestamp": "2026-05-05T05:38:49.638794"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74860, "epoch": 0, "train_loss": 3.7602125853300095, "train_ppl": 42.957557154615735, "lr": 0.00056, "grad_norm": 0.6741, "tokens_per_sec": 149813, "dt_s": 4.375, "eta_s": 7896, "world_size": 1, "timestamp": "2026-05-05T05:38:54.013342"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74870, "epoch": 0, "train_loss": 3.7324931770563126, "train_ppl": 41.78315120519928, "lr": 0.00056, "grad_norm": 0.6696, "tokens_per_sec": 147608, "dt_s": 4.44, "eta_s": 7884, "world_size": 1, "timestamp": "2026-05-05T05:38:58.453212"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74880, "epoch": 0, "train_loss": 3.657608136534691, "train_ppl": 38.76850291961152, "lr": 0.00056, "grad_norm": 0.6787, "tokens_per_sec": 149590, "dt_s": 4.381, "eta_s": 7855, "world_size": 1, "timestamp": "2026-05-05T05:39:02.834251"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74890, "epoch": 0, "train_loss": 3.6440068930387497, "train_ppl": 38.24477283550653, "lr": 0.00056, "grad_norm": 0.6332, "tokens_per_sec": 148786, "dt_s": 4.405, "eta_s": 7819, "world_size": 1, "timestamp": "2026-05-05T05:39:07.238970"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74900, "epoch": 0, "train_loss": 3.7944045662879944, "train_ppl": 44.45176044724363, "lr": 0.00056, "grad_norm": 0.6337, "tokens_per_sec": 147041, "dt_s": 4.457, "eta_s": 7845, "world_size": 1, "timestamp": "2026-05-05T05:39:11.695943"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74910, "epoch": 0, "train_loss": 3.573420524597168, "train_ppl": 35.63828654107627, "lr": 0.00056, "grad_norm": 0.6436, "tokens_per_sec": 147728, "dt_s": 4.436, "eta_s": 7862, "world_size": 1, "timestamp": "2026-05-05T05:39:16.132220"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74920, "epoch": 0, "train_loss": 3.684887111186981, "train_ppl": 39.8406246351647, "lr": 0.00056, "grad_norm": 0.68, "tokens_per_sec": 145062, "dt_s": 4.518, "eta_s": 7886, "world_size": 1, "timestamp": "2026-05-05T05:39:20.650009"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74930, "epoch": 0, "train_loss": 3.646583467721939, "train_ppl": 38.34344040655596, "lr": 0.00056, "grad_norm": 0.6168, "tokens_per_sec": 147064, "dt_s": 4.456, "eta_s": 7908, "world_size": 1, "timestamp": "2026-05-05T05:39:25.106313"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74940, "epoch": 0, "train_loss": 3.712677523493767, "train_ppl": 40.96334012773236, "lr": 0.00056, "grad_norm": 0.7053, "tokens_per_sec": 151153, "dt_s": 4.336, "eta_s": 7879, "world_size": 1, "timestamp": "2026-05-05T05:39:29.442023"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74950, "epoch": 0, "train_loss": 3.67568276822567, "train_ppl": 39.47560034367951, "lr": 0.00056, "grad_norm": 0.6613, "tokens_per_sec": 146488, "dt_s": 4.474, "eta_s": 7880, "world_size": 1, "timestamp": "2026-05-05T05:39:33.915837"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74960, "epoch": 0, "train_loss": 3.7737689912319183, "train_ppl": 43.543872420622144, "lr": 0.00056, "grad_norm": 0.6565, "tokens_per_sec": 133719, "dt_s": 4.901, "eta_s": 8041, "world_size": 1, "timestamp": "2026-05-05T05:39:38.816863"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74970, "epoch": 0, "train_loss": 3.6601574420928955, "train_ppl": 38.86746176415986, "lr": 0.00056, "grad_norm": 0.7082, "tokens_per_sec": 148210, "dt_s": 4.422, "eta_s": 8002, "world_size": 1, "timestamp": "2026-05-05T05:39:43.238697"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74980, "epoch": 0, "train_loss": 3.7898037433624268, "train_ppl": 44.24771551592289, "lr": 0.00056, "grad_norm": 0.7185, "tokens_per_sec": 146429, "dt_s": 4.476, "eta_s": 8004, "world_size": 1, "timestamp": "2026-05-05T05:39:47.714309"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 74990, "epoch": 0, "train_loss": 3.699265331029892, "train_ppl": 40.41759989344054, "lr": 0.00056, "grad_norm": 0.6119, "tokens_per_sec": 149263, "dt_s": 4.391, "eta_s": 8019, "world_size": 1, "timestamp": "2026-05-05T05:39:52.104922"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75000, "epoch": 0, "train_loss": 3.702608957886696, "train_ppl": 40.55296744911353, "lr": 0.00056, "grad_norm": 0.8262, "tokens_per_sec": 147275, "dt_s": 4.45, "eta_s": 8006, "world_size": 1, "timestamp": "2026-05-05T05:39:56.554838"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75010, "epoch": 0, "train_loss": 3.7765374928712845, "train_ppl": 43.664590730117055, "lr": 0.00056, "grad_norm": 0.6261, "tokens_per_sec": 126557, "dt_s": 5.178, "eta_s": 7821, "world_size": 1, "timestamp": "2026-05-05T05:40:01.733223"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75020, "epoch": 0, "train_loss": 3.634949952363968, "train_ppl": 37.899956046470244, "lr": 0.00056, "grad_norm": 0.6632, "tokens_per_sec": 151557, "dt_s": 4.324, "eta_s": 7782, "world_size": 1, "timestamp": "2026-05-05T05:40:06.057402"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75030, "epoch": 0, "train_loss": 3.5557267367839813, "train_ppl": 35.013256145038774, "lr": 0.00056, "grad_norm": 0.6425, "tokens_per_sec": 147427, "dt_s": 4.445, "eta_s": 7767, "world_size": 1, "timestamp": "2026-05-05T05:40:10.502729"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75040, "epoch": 0, "train_loss": 3.7705686390399933, "train_ppl": 43.40473944908403, "lr": 0.00056, "grad_norm": 0.6344, "tokens_per_sec": 149098, "dt_s": 4.395, "eta_s": 7764, "world_size": 1, "timestamp": "2026-05-05T05:40:14.898241"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75050, "epoch": 0, "train_loss": 3.688227415084839, "train_ppl": 39.973926940086194, "lr": 0.00056, "grad_norm": 0.6545, "tokens_per_sec": 150507, "dt_s": 4.354, "eta_s": 7726, "world_size": 1, "timestamp": "2026-05-05T05:40:19.252560"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75060, "epoch": 0, "train_loss": 3.7022827565670013, "train_ppl": 40.539741174945426, "lr": 0.00056, "grad_norm": 0.7095, "tokens_per_sec": 147158, "dt_s": 4.453, "eta_s": 7744, "world_size": 1, "timestamp": "2026-05-05T05:40:23.706034"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75070, "epoch": 0, "train_loss": 3.650388538837433, "train_ppl": 38.4896178555401, "lr": 0.00056, "grad_norm": 0.6936, "tokens_per_sec": 149914, "dt_s": 4.372, "eta_s": 7757, "world_size": 1, "timestamp": "2026-05-05T05:40:28.077569"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75080, "epoch": 0, "train_loss": 3.7404390424489975, "train_ppl": 42.11647702765858, "lr": 0.00056, "grad_norm": 0.6635, "tokens_per_sec": 147617, "dt_s": 4.44, "eta_s": 7750, "world_size": 1, "timestamp": "2026-05-05T05:40:32.517163"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75090, "epoch": 0, "train_loss": 3.657660201191902, "train_ppl": 38.770521440972985, "lr": 0.00056, "grad_norm": 0.6311, "tokens_per_sec": 149101, "dt_s": 4.395, "eta_s": 7746, "world_size": 1, "timestamp": "2026-05-05T05:40:36.912590"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75100, "epoch": 0, "train_loss": 3.669553577899933, "train_ppl": 39.23438685278572, "lr": 0.00056, "grad_norm": 0.6245, "tokens_per_sec": 150001, "dt_s": 4.369, "eta_s": 7747, "world_size": 1, "timestamp": "2026-05-05T05:40:41.281623"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75110, "epoch": 0, "train_loss": 3.78360952436924, "train_ppl": 43.97448258200759, "lr": 0.00056, "grad_norm": 0.7585, "tokens_per_sec": 149782, "dt_s": 4.375, "eta_s": 7715, "world_size": 1, "timestamp": "2026-05-05T05:40:45.657053"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75120, "epoch": 0, "train_loss": 3.6065897345542908, "train_ppl": 36.84020347404099, "lr": 0.00056, "grad_norm": 0.6617, "tokens_per_sec": 150421, "dt_s": 4.357, "eta_s": 7705, "world_size": 1, "timestamp": "2026-05-05T05:40:50.013878"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75130, "epoch": 0, "train_loss": 3.737252250313759, "train_ppl": 41.98247420288399, "lr": 0.00056, "grad_norm": 0.6387, "tokens_per_sec": 151177, "dt_s": 4.335, "eta_s": 7664, "world_size": 1, "timestamp": "2026-05-05T05:40:54.348931"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75140, "epoch": 0, "train_loss": 3.6905935257673264, "train_ppl": 40.068621660556076, "lr": 0.00056, "grad_norm": 0.6434, "tokens_per_sec": 146191, "dt_s": 4.483, "eta_s": 7690, "world_size": 1, "timestamp": "2026-05-05T05:40:58.831844"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75150, "epoch": 0, "train_loss": 3.66829352080822, "train_ppl": 39.184980419393305, "lr": 0.00056, "grad_norm": 0.6687, "tokens_per_sec": 149219, "dt_s": 4.392, "eta_s": 7694, "world_size": 1, "timestamp": "2026-05-05T05:41:03.223770"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75160, "epoch": 0, "train_loss": 3.7630732655525208, "train_ppl": 43.08062092790225, "lr": 0.00056, "grad_norm": 0.7102, "tokens_per_sec": 150145, "dt_s": 4.365, "eta_s": 7686, "world_size": 1, "timestamp": "2026-05-05T05:41:07.588627"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75170, "epoch": 0, "train_loss": 3.7775886058807373, "train_ppl": 43.71051127910123, "lr": 0.00056, "grad_norm": 0.6499, "tokens_per_sec": 148645, "dt_s": 4.409, "eta_s": 7700, "world_size": 1, "timestamp": "2026-05-05T05:41:11.997519"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75180, "epoch": 0, "train_loss": 3.6400386840105057, "train_ppl": 38.09331029891276, "lr": 0.00056, "grad_norm": 0.6437, "tokens_per_sec": 150639, "dt_s": 4.351, "eta_s": 7701, "world_size": 1, "timestamp": "2026-05-05T05:41:16.348034"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75190, "epoch": 0, "train_loss": 3.74392469227314, "train_ppl": 42.263536468365714, "lr": 0.00056, "grad_norm": 0.6704, "tokens_per_sec": 148160, "dt_s": 4.423, "eta_s": 7676, "world_size": 1, "timestamp": "2026-05-05T05:41:20.771364"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75200, "epoch": 0, "train_loss": 3.808406263589859, "train_ppl": 45.07853827847493, "lr": 0.00056, "grad_norm": 0.7615, "tokens_per_sec": 148038, "dt_s": 4.427, "eta_s": 7684, "world_size": 1, "timestamp": "2026-05-05T05:41:25.198323"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75210, "epoch": 0, "train_loss": 3.623738318681717, "train_ppl": 37.47740877153295, "lr": 0.00056, "grad_norm": 0.7409, "tokens_per_sec": 149701, "dt_s": 4.378, "eta_s": 7684, "world_size": 1, "timestamp": "2026-05-05T05:41:29.576107"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75220, "epoch": 0, "train_loss": 3.750081852078438, "train_ppl": 42.52456258144638, "lr": 0.00056, "grad_norm": 0.6916, "tokens_per_sec": 147982, "dt_s": 4.429, "eta_s": 7686, "world_size": 1, "timestamp": "2026-05-05T05:41:34.004748"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75230, "epoch": 0, "train_loss": 3.7366557866334915, "train_ppl": 41.95744064835942, "lr": 0.00056, "grad_norm": 0.6286, "tokens_per_sec": 151021, "dt_s": 4.34, "eta_s": 7678, "world_size": 1, "timestamp": "2026-05-05T05:41:38.344287"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75240, "epoch": 0, "train_loss": 3.6848665475845337, "train_ppl": 39.83980537682193, "lr": 0.00056, "grad_norm": 0.6636, "tokens_per_sec": 150666, "dt_s": 4.35, "eta_s": 7648, "world_size": 1, "timestamp": "2026-05-05T05:41:42.694060"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75250, "epoch": 0, "train_loss": 3.6353027671575546, "train_ppl": 37.9133300707778, "lr": 0.00056, "grad_norm": 0.6593, "tokens_per_sec": 147973, "dt_s": 4.429, "eta_s": 7644, "world_size": 1, "timestamp": "2026-05-05T05:41:47.122953"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75260, "epoch": 0, "train_loss": 3.710788682103157, "train_ppl": 40.88603990231625, "lr": 0.00056, "grad_norm": 0.6421, "tokens_per_sec": 133977, "dt_s": 4.892, "eta_s": 7819, "world_size": 1, "timestamp": "2026-05-05T05:41:52.014563"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75270, "epoch": 0, "train_loss": 3.6669553965330124, "train_ppl": 39.132581112081894, "lr": 0.00056, "grad_norm": 0.6274, "tokens_per_sec": 149491, "dt_s": 4.384, "eta_s": 7799, "world_size": 1, "timestamp": "2026-05-05T05:41:56.398476"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75280, "epoch": 0, "train_loss": 3.6731783002614975, "train_ppl": 39.37685866654778, "lr": 0.00056, "grad_norm": 0.6385, "tokens_per_sec": 150042, "dt_s": 4.368, "eta_s": 7804, "world_size": 1, "timestamp": "2026-05-05T05:42:00.766341"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75290, "epoch": 0, "train_loss": 3.691625639796257, "train_ppl": 40.109998396173424, "lr": 0.00056, "grad_norm": 0.6298, "tokens_per_sec": 150313, "dt_s": 4.36, "eta_s": 7803, "world_size": 1, "timestamp": "2026-05-05T05:42:05.126302"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75300, "epoch": 0, "train_loss": 3.6006796956062317, "train_ppl": 36.623118558678996, "lr": 0.00056, "grad_norm": 0.6324, "tokens_per_sec": 148743, "dt_s": 4.406, "eta_s": 7791, "world_size": 1, "timestamp": "2026-05-05T05:42:09.532279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75310, "epoch": 0, "train_loss": 3.642493799328804, "train_ppl": 38.18694866801502, "lr": 0.00056, "grad_norm": 0.6633, "tokens_per_sec": 149285, "dt_s": 4.39, "eta_s": 7612, "world_size": 1, "timestamp": "2026-05-05T05:42:13.922286"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75320, "epoch": 0, "train_loss": 3.7270173877477646, "train_ppl": 41.55498074935047, "lr": 0.00056, "grad_norm": 0.6608, "tokens_per_sec": 150637, "dt_s": 4.351, "eta_s": 7596, "world_size": 1, "timestamp": "2026-05-05T05:42:18.272887"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75330, "epoch": 0, "train_loss": 3.815061166882515, "train_ppl": 45.379532022561584, "lr": 0.00056, "grad_norm": 0.6931, "tokens_per_sec": 147407, "dt_s": 4.446, "eta_s": 7619, "world_size": 1, "timestamp": "2026-05-05T05:42:22.718781"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75340, "epoch": 0, "train_loss": 3.7054507732391357, "train_ppl": 40.66837540097339, "lr": 0.00056, "grad_norm": 0.6554, "tokens_per_sec": 151192, "dt_s": 4.335, "eta_s": 7606, "world_size": 1, "timestamp": "2026-05-05T05:42:27.053404"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75350, "epoch": 0, "train_loss": 3.71019646525383, "train_ppl": 40.861833668960934, "lr": 0.00056, "grad_norm": 0.6917, "tokens_per_sec": 149992, "dt_s": 4.369, "eta_s": 7588, "world_size": 1, "timestamp": "2026-05-05T05:42:31.422702"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75360, "epoch": 0, "train_loss": 3.7292192578315735, "train_ppl": 41.64658022636183, "lr": 0.00056, "grad_norm": 0.7104, "tokens_per_sec": 149667, "dt_s": 4.379, "eta_s": 7580, "world_size": 1, "timestamp": "2026-05-05T05:42:35.801481"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75370, "epoch": 0, "train_loss": 3.5403784811496735, "train_ppl": 34.47996673902821, "lr": 0.00056, "grad_norm": 0.6695, "tokens_per_sec": 150788, "dt_s": 4.346, "eta_s": 7574, "world_size": 1, "timestamp": "2026-05-05T05:42:40.147733"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75380, "epoch": 0, "train_loss": 3.669126719236374, "train_ppl": 39.21764288875548, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 151370, "dt_s": 4.33, "eta_s": 7530, "world_size": 1, "timestamp": "2026-05-05T05:42:44.477281"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75390, "epoch": 0, "train_loss": 3.5716643035411835, "train_ppl": 35.57575275949839, "lr": 0.00056, "grad_norm": 0.6787, "tokens_per_sec": 149703, "dt_s": 4.378, "eta_s": 7540, "world_size": 1, "timestamp": "2026-05-05T05:42:48.855003"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75400, "epoch": 0, "train_loss": 3.701859176158905, "train_ppl": 40.5225729711485, "lr": 0.00056, "grad_norm": 0.6626, "tokens_per_sec": 149983, "dt_s": 4.37, "eta_s": 7536, "world_size": 1, "timestamp": "2026-05-05T05:42:53.224557"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75410, "epoch": 0, "train_loss": 3.6590402722358704, "train_ppl": 38.82406445305536, "lr": 0.00056, "grad_norm": 0.6353, "tokens_per_sec": 148742, "dt_s": 4.406, "eta_s": 7541, "world_size": 1, "timestamp": "2026-05-05T05:42:57.630564"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75420, "epoch": 0, "train_loss": 3.699078783392906, "train_ppl": 40.41006078891071, "lr": 0.00056, "grad_norm": 0.6398, "tokens_per_sec": 148172, "dt_s": 4.423, "eta_s": 7563, "world_size": 1, "timestamp": "2026-05-05T05:43:02.053553"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75430, "epoch": 0, "train_loss": 3.7140839099884033, "train_ppl": 41.020990946227066, "lr": 0.00056, "grad_norm": 0.6415, "tokens_per_sec": 151039, "dt_s": 4.339, "eta_s": 7562, "world_size": 1, "timestamp": "2026-05-05T05:43:06.392545"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75440, "epoch": 0, "train_loss": 3.6375642120838165, "train_ppl": 37.99916599873458, "lr": 0.00056, "grad_norm": 0.6245, "tokens_per_sec": 148208, "dt_s": 4.422, "eta_s": 7573, "world_size": 1, "timestamp": "2026-05-05T05:43:10.814429"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75450, "epoch": 0, "train_loss": 3.7138424664735794, "train_ppl": 41.01108788955396, "lr": 0.00056, "grad_norm": 0.6726, "tokens_per_sec": 151158, "dt_s": 4.336, "eta_s": 7557, "world_size": 1, "timestamp": "2026-05-05T05:43:15.150033"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75460, "epoch": 0, "train_loss": 3.7177511155605316, "train_ppl": 41.17169952356757, "lr": 0.00056, "grad_norm": 0.6578, "tokens_per_sec": 151195, "dt_s": 4.335, "eta_s": 7528, "world_size": 1, "timestamp": "2026-05-05T05:43:19.484569"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75470, "epoch": 0, "train_loss": 3.7625135332345963, "train_ppl": 43.056514059418724, "lr": 0.00056, "grad_norm": 0.6687, "tokens_per_sec": 148014, "dt_s": 4.428, "eta_s": 7525, "world_size": 1, "timestamp": "2026-05-05T05:43:23.912271"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75480, "epoch": 0, "train_loss": 3.663373664021492, "train_ppl": 38.99266938681563, "lr": 0.00056, "grad_norm": 0.6077, "tokens_per_sec": 149722, "dt_s": 4.377, "eta_s": 7534, "world_size": 1, "timestamp": "2026-05-05T05:43:28.289441"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75490, "epoch": 0, "train_loss": 3.7465096712112427, "train_ppl": 42.37292814666439, "lr": 0.00056, "grad_norm": 0.6301, "tokens_per_sec": 150033, "dt_s": 4.368, "eta_s": 7511, "world_size": 1, "timestamp": "2026-05-05T05:43:32.657551"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75500, "epoch": 0, "train_loss": 3.729994475841522, "train_ppl": 41.67887792266653, "lr": 0.00056, "grad_norm": 0.6587, "tokens_per_sec": 146739, "dt_s": 4.466, "eta_s": 7551, "world_size": 1, "timestamp": "2026-05-05T05:43:37.123711"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75510, "epoch": 0, "train_loss": 3.7674516290426254, "train_ppl": 43.26965707820721, "lr": 0.00056, "grad_norm": 0.6306, "tokens_per_sec": 127378, "dt_s": 5.145, "eta_s": 7561, "world_size": 1, "timestamp": "2026-05-05T05:43:42.268725"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75520, "epoch": 0, "train_loss": 3.6882699131965637, "train_ppl": 39.97562579259813, "lr": 0.00056, "grad_norm": 0.6711, "tokens_per_sec": 149050, "dt_s": 4.397, "eta_s": 7546, "world_size": 1, "timestamp": "2026-05-05T05:43:46.665626"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75530, "epoch": 0, "train_loss": 3.6665263921022415, "train_ppl": 39.115796661956466, "lr": 0.00056, "grad_norm": 0.7181, "tokens_per_sec": 147388, "dt_s": 4.446, "eta_s": 7566, "world_size": 1, "timestamp": "2026-05-05T05:43:51.112147"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75540, "epoch": 0, "train_loss": 3.7647928446531296, "train_ppl": 43.1547651934858, "lr": 0.00056, "grad_norm": 0.8488, "tokens_per_sec": 148169, "dt_s": 4.423, "eta_s": 7580, "world_size": 1, "timestamp": "2026-05-05T05:43:55.535190"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75550, "epoch": 0, "train_loss": 3.721986800432205, "train_ppl": 41.34645972166289, "lr": 0.00056, "grad_norm": 0.6741, "tokens_per_sec": 133079, "dt_s": 4.925, "eta_s": 7733, "world_size": 1, "timestamp": "2026-05-05T05:44:00.459770"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75560, "epoch": 0, "train_loss": 3.8553062975406647, "train_ppl": 47.243085166170566, "lr": 0.00056, "grad_norm": 0.9846, "tokens_per_sec": 151983, "dt_s": 4.312, "eta_s": 7706, "world_size": 1, "timestamp": "2026-05-05T05:44:04.771823"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75570, "epoch": 0, "train_loss": 3.845029130578041, "train_ppl": 46.76004647825545, "lr": 0.00056, "grad_norm": 0.7134, "tokens_per_sec": 149557, "dt_s": 4.382, "eta_s": 7697, "world_size": 1, "timestamp": "2026-05-05T05:44:09.153852"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75580, "epoch": 0, "train_loss": 3.745027244091034, "train_ppl": 42.310159904983905, "lr": 0.00056, "grad_norm": 0.6572, "tokens_per_sec": 147948, "dt_s": 4.43, "eta_s": 7687, "world_size": 1, "timestamp": "2026-05-05T05:44:13.583499"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75590, "epoch": 0, "train_loss": 3.709423065185547, "train_ppl": 40.83024334156651, "lr": 0.00056, "grad_norm": 0.671, "tokens_per_sec": 149707, "dt_s": 4.378, "eta_s": 7666, "world_size": 1, "timestamp": "2026-05-05T05:44:17.961130"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75600, "epoch": 0, "train_loss": 3.6599111109972, "train_ppl": 38.8578886788393, "lr": 0.00056, "grad_norm": 0.6094, "tokens_per_sec": 149232, "dt_s": 4.392, "eta_s": 7480, "world_size": 1, "timestamp": "2026-05-05T05:44:22.352679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75610, "epoch": 0, "train_loss": 3.664203479886055, "train_ppl": 39.02503955125496, "lr": 0.00056, "grad_norm": 0.6298, "tokens_per_sec": 148857, "dt_s": 4.403, "eta_s": 7506, "world_size": 1, "timestamp": "2026-05-05T05:44:26.755304"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75620, "epoch": 0, "train_loss": 3.600478768348694, "train_ppl": 36.615760715124935, "lr": 0.00056, "grad_norm": 0.6316, "tokens_per_sec": 150668, "dt_s": 4.35, "eta_s": 7491, "world_size": 1, "timestamp": "2026-05-05T05:44:31.105000"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75630, "epoch": 0, "train_loss": 3.8005476891994476, "train_ppl": 44.725673554835424, "lr": 0.00056, "grad_norm": 0.6869, "tokens_per_sec": 148412, "dt_s": 4.416, "eta_s": 7482, "world_size": 1, "timestamp": "2026-05-05T05:44:35.520827"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75640, "epoch": 0, "train_loss": 3.7235244512557983, "train_ppl": 41.41008504373486, "lr": 0.00056, "grad_norm": 0.7097, "tokens_per_sec": 149309, "dt_s": 4.389, "eta_s": 7481, "world_size": 1, "timestamp": "2026-05-05T05:44:39.910108"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75650, "epoch": 0, "train_loss": 3.572879984974861, "train_ppl": 35.619027840643945, "lr": 0.00056, "grad_norm": 0.6694, "tokens_per_sec": 149485, "dt_s": 4.384, "eta_s": 7475, "world_size": 1, "timestamp": "2026-05-05T05:44:44.294263"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75660, "epoch": 0, "train_loss": 3.788808509707451, "train_ppl": 44.20370060647827, "lr": 0.00056, "grad_norm": 0.6832, "tokens_per_sec": 148003, "dt_s": 4.428, "eta_s": 7479, "world_size": 1, "timestamp": "2026-05-05T05:44:48.722249"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75670, "epoch": 0, "train_loss": 3.6891571432352066, "train_ppl": 40.01110910721854, "lr": 0.00056, "grad_norm": 0.6662, "tokens_per_sec": 151111, "dt_s": 4.337, "eta_s": 7470, "world_size": 1, "timestamp": "2026-05-05T05:44:53.059199"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75680, "epoch": 0, "train_loss": 3.6679607778787613, "train_ppl": 39.17194406321573, "lr": 0.00056, "grad_norm": 0.6961, "tokens_per_sec": 150620, "dt_s": 4.351, "eta_s": 7444, "world_size": 1, "timestamp": "2026-05-05T05:44:57.410281"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75690, "epoch": 0, "train_loss": 3.631922274827957, "train_ppl": 37.78538073700224, "lr": 0.00056, "grad_norm": 0.638, "tokens_per_sec": 146656, "dt_s": 4.469, "eta_s": 7466, "world_size": 1, "timestamp": "2026-05-05T05:45:01.878970"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75700, "epoch": 0, "train_loss": 3.7997382432222366, "train_ppl": 44.68948518653887, "lr": 0.00056, "grad_norm": 0.6751, "tokens_per_sec": 151104, "dt_s": 4.337, "eta_s": 7446, "world_size": 1, "timestamp": "2026-05-05T05:45:06.216111"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75710, "epoch": 0, "train_loss": 3.69614839553833, "train_ppl": 40.29181697230923, "lr": 0.00056, "grad_norm": 0.6809, "tokens_per_sec": 148053, "dt_s": 4.427, "eta_s": 7441, "world_size": 1, "timestamp": "2026-05-05T05:45:10.642637"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75720, "epoch": 0, "train_loss": 3.675365701317787, "train_ppl": 39.463085921201184, "lr": 0.00056, "grad_norm": 0.6459, "tokens_per_sec": 149465, "dt_s": 4.385, "eta_s": 7453, "world_size": 1, "timestamp": "2026-05-05T05:45:15.027350"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75730, "epoch": 0, "train_loss": 3.729367256164551, "train_ppl": 41.652744306935105, "lr": 0.00056, "grad_norm": 0.7006, "tokens_per_sec": 150768, "dt_s": 4.347, "eta_s": 7447, "world_size": 1, "timestamp": "2026-05-05T05:45:19.374156"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75740, "epoch": 0, "train_loss": 3.7792330384254456, "train_ppl": 43.7824493988761, "lr": 0.00056, "grad_norm": 0.8108, "tokens_per_sec": 148139, "dt_s": 4.424, "eta_s": 7427, "world_size": 1, "timestamp": "2026-05-05T05:45:23.798128"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75750, "epoch": 0, "train_loss": 3.7902449518442154, "train_ppl": 44.26724229068084, "lr": 0.00056, "grad_norm": 0.7299, "tokens_per_sec": 151324, "dt_s": 4.331, "eta_s": 7421, "world_size": 1, "timestamp": "2026-05-05T05:45:28.128973"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75760, "epoch": 0, "train_loss": 3.685695067048073, "train_ppl": 39.87282710868505, "lr": 0.00056, "grad_norm": 0.6443, "tokens_per_sec": 151046, "dt_s": 4.339, "eta_s": 7387, "world_size": 1, "timestamp": "2026-05-05T05:45:32.467791"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75770, "epoch": 0, "train_loss": 3.725130006670952, "train_ppl": 41.47662463222325, "lr": 0.00056, "grad_norm": 0.6542, "tokens_per_sec": 149912, "dt_s": 4.372, "eta_s": 7378, "world_size": 1, "timestamp": "2026-05-05T05:45:36.839415"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75780, "epoch": 0, "train_loss": 3.7351578027009964, "train_ppl": 41.894636128244784, "lr": 0.00056, "grad_norm": 0.6158, "tokens_per_sec": 151165, "dt_s": 4.335, "eta_s": 7370, "world_size": 1, "timestamp": "2026-05-05T05:45:41.174811"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75790, "epoch": 0, "train_loss": 3.6861854195594788, "train_ppl": 39.8923836440009, "lr": 0.00056, "grad_norm": 0.6672, "tokens_per_sec": 151018, "dt_s": 4.34, "eta_s": 7337, "world_size": 1, "timestamp": "2026-05-05T05:45:45.514412"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75800, "epoch": 0, "train_loss": 3.6189538687467575, "train_ppl": 37.29852824925393, "lr": 0.00056, "grad_norm": 0.7126, "tokens_per_sec": 149160, "dt_s": 4.394, "eta_s": 7354, "world_size": 1, "timestamp": "2026-05-05T05:45:49.908086"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75810, "epoch": 0, "train_loss": 3.591049239039421, "train_ppl": 36.27211408556693, "lr": 0.00056, "grad_norm": 0.7056, "tokens_per_sec": 148812, "dt_s": 4.404, "eta_s": 7371, "world_size": 1, "timestamp": "2026-05-05T05:45:54.312022"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75820, "epoch": 0, "train_loss": 3.6687383502721786, "train_ppl": 39.20241493063316, "lr": 0.00056, "grad_norm": 0.7194, "tokens_per_sec": 148610, "dt_s": 4.41, "eta_s": 7380, "world_size": 1, "timestamp": "2026-05-05T05:45:58.721956"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75830, "epoch": 0, "train_loss": 3.6096759140491486, "train_ppl": 36.95407457754084, "lr": 0.00056, "grad_norm": 0.6473, "tokens_per_sec": 148799, "dt_s": 4.404, "eta_s": 7399, "world_size": 1, "timestamp": "2026-05-05T05:46:03.126291"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75840, "epoch": 0, "train_loss": 3.6881337016820908, "train_ppl": 39.9701810228955, "lr": 0.00056, "grad_norm": 0.642, "tokens_per_sec": 151252, "dt_s": 4.333, "eta_s": 7392, "world_size": 1, "timestamp": "2026-05-05T05:46:07.459195"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75850, "epoch": 0, "train_loss": 3.686090797185898, "train_ppl": 39.888609110553176, "lr": 0.00056, "grad_norm": 0.657, "tokens_per_sec": 133856, "dt_s": 4.896, "eta_s": 7557, "world_size": 1, "timestamp": "2026-05-05T05:46:12.355209"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75860, "epoch": 0, "train_loss": 3.655232846736908, "train_ppl": 38.67652576958889, "lr": 0.00056, "grad_norm": 0.6776, "tokens_per_sec": 151478, "dt_s": 4.326, "eta_s": 7526, "world_size": 1, "timestamp": "2026-05-05T05:46:16.681662"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75870, "epoch": 0, "train_loss": 3.7714962661266327, "train_ppl": 43.44502154157705, "lr": 0.00056, "grad_norm": 0.6884, "tokens_per_sec": 149295, "dt_s": 4.39, "eta_s": 7515, "world_size": 1, "timestamp": "2026-05-05T05:46:21.071354"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75880, "epoch": 0, "train_loss": 3.629991203546524, "train_ppl": 37.71248487960399, "lr": 0.00056, "grad_norm": 0.6586, "tokens_per_sec": 147730, "dt_s": 4.436, "eta_s": 7521, "world_size": 1, "timestamp": "2026-05-05T05:46:25.507556"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75890, "epoch": 0, "train_loss": 3.6318391114473343, "train_ppl": 37.78223850766305, "lr": 0.00056, "grad_norm": 0.8505, "tokens_per_sec": 150586, "dt_s": 4.352, "eta_s": 7523, "world_size": 1, "timestamp": "2026-05-05T05:46:29.859622"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75900, "epoch": 0, "train_loss": 3.67012856900692, "train_ppl": 39.25695276329137, "lr": 0.00056, "grad_norm": 0.6942, "tokens_per_sec": 149425, "dt_s": 4.386, "eta_s": 7348, "world_size": 1, "timestamp": "2026-05-05T05:46:34.245460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75910, "epoch": 0, "train_loss": 3.6651827096939087, "train_ppl": 39.063272749724895, "lr": 0.00056, "grad_norm": 0.6952, "tokens_per_sec": 148569, "dt_s": 4.411, "eta_s": 7372, "world_size": 1, "timestamp": "2026-05-05T05:46:38.656639"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75920, "epoch": 0, "train_loss": 3.6598161458969116, "train_ppl": 38.85419871075579, "lr": 0.00056, "grad_norm": 0.6629, "tokens_per_sec": 151372, "dt_s": 4.329, "eta_s": 7347, "world_size": 1, "timestamp": "2026-05-05T05:46:42.986080"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75930, "epoch": 0, "train_loss": 3.660011202096939, "train_ppl": 38.8617782023008, "lr": 0.00056, "grad_norm": 0.6508, "tokens_per_sec": 149277, "dt_s": 4.39, "eta_s": 7327, "world_size": 1, "timestamp": "2026-05-05T05:46:47.376353"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75940, "epoch": 0, "train_loss": 3.7225194424390793, "train_ppl": 41.36848844913817, "lr": 0.00056, "grad_norm": 0.6527, "tokens_per_sec": 148440, "dt_s": 4.415, "eta_s": 7344, "world_size": 1, "timestamp": "2026-05-05T05:46:51.791323"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75950, "epoch": 0, "train_loss": 3.750073343515396, "train_ppl": 42.524200760064126, "lr": 0.00056, "grad_norm": 0.6939, "tokens_per_sec": 150230, "dt_s": 4.362, "eta_s": 7332, "world_size": 1, "timestamp": "2026-05-05T05:46:56.153701"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75960, "epoch": 0, "train_loss": 3.731168895959854, "train_ppl": 41.727855189722916, "lr": 0.00056, "grad_norm": 0.622, "tokens_per_sec": 146768, "dt_s": 4.465, "eta_s": 7345, "world_size": 1, "timestamp": "2026-05-05T05:47:00.618970"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75970, "epoch": 0, "train_loss": 3.7139556258916855, "train_ppl": 41.01572894297995, "lr": 0.00056, "grad_norm": 0.777, "tokens_per_sec": 151037, "dt_s": 4.339, "eta_s": 7344, "world_size": 1, "timestamp": "2026-05-05T05:47:04.958026"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75980, "epoch": 0, "train_loss": 3.695720225572586, "train_ppl": 40.27456891922908, "lr": 0.00056, "grad_norm": 0.7168, "tokens_per_sec": 150381, "dt_s": 4.358, "eta_s": 7329, "world_size": 1, "timestamp": "2026-05-05T05:47:09.316021"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 75990, "epoch": 0, "train_loss": 3.646338641643524, "train_ppl": 38.33405408146376, "lr": 0.00056, "grad_norm": 0.6351, "tokens_per_sec": 149766, "dt_s": 4.376, "eta_s": 7312, "world_size": 1, "timestamp": "2026-05-05T05:47:13.691920"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76000, "epoch": 0, "train_loss": 3.672642767429352, "train_ppl": 39.3557767114485, "lr": 0.00056, "grad_norm": 0.6672, "tokens_per_sec": 152602, "dt_s": 4.295, "eta_s": 7285, "world_size": 1, "timestamp": "2026-05-05T05:47:17.986496"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76010, "epoch": 0, "train_loss": 3.709777608513832, "train_ppl": 40.84472199843946, "lr": 0.00056, "grad_norm": 0.6719, "tokens_per_sec": 128755, "dt_s": 5.09, "eta_s": 7234, "world_size": 1, "timestamp": "2026-05-05T05:47:23.076479"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76020, "epoch": 0, "train_loss": 3.7179086208343506, "train_ppl": 41.17818479409337, "lr": 0.00056, "grad_norm": 0.6621, "tokens_per_sec": 149108, "dt_s": 4.395, "eta_s": 7249, "world_size": 1, "timestamp": "2026-05-05T05:47:27.471670"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76030, "epoch": 0, "train_loss": 3.8207317888736725, "train_ppl": 45.63759318693751, "lr": 0.00056, "grad_norm": 0.669, "tokens_per_sec": 146941, "dt_s": 4.46, "eta_s": 7278, "world_size": 1, "timestamp": "2026-05-05T05:47:31.931684"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76040, "epoch": 0, "train_loss": 3.8134175837039948, "train_ppl": 45.30500824686858, "lr": 0.00056, "grad_norm": 0.6579, "tokens_per_sec": 149732, "dt_s": 4.377, "eta_s": 7274, "world_size": 1, "timestamp": "2026-05-05T05:47:36.308584"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76050, "epoch": 0, "train_loss": 3.715815395116806, "train_ppl": 41.09207970880983, "lr": 0.00056, "grad_norm": 0.6455, "tokens_per_sec": 150328, "dt_s": 4.36, "eta_s": 7292, "world_size": 1, "timestamp": "2026-05-05T05:47:40.668140"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76060, "epoch": 0, "train_loss": 3.7062689810991287, "train_ppl": 40.70166420210184, "lr": 0.00056, "grad_norm": 0.6286, "tokens_per_sec": 150972, "dt_s": 4.341, "eta_s": 7292, "world_size": 1, "timestamp": "2026-05-05T05:47:45.009058"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76070, "epoch": 0, "train_loss": 4.142921939492226, "train_ppl": 62.98659585150783, "lr": 0.00056, "grad_norm": 1.4529, "tokens_per_sec": 148499, "dt_s": 4.413, "eta_s": 7293, "world_size": 1, "timestamp": "2026-05-05T05:47:49.422295"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76080, "epoch": 0, "train_loss": 3.65392629802227, "train_ppl": 38.626026001945114, "lr": 0.00056, "grad_norm": 0.6737, "tokens_per_sec": 150265, "dt_s": 4.361, "eta_s": 7256, "world_size": 1, "timestamp": "2026-05-05T05:47:53.783632"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76090, "epoch": 0, "train_loss": 3.6954673528671265, "train_ppl": 40.26438586758743, "lr": 0.00056, "grad_norm": 0.6886, "tokens_per_sec": 148142, "dt_s": 4.424, "eta_s": 7267, "world_size": 1, "timestamp": "2026-05-05T05:47:58.207501"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76100, "epoch": 0, "train_loss": 3.768054187297821, "train_ppl": 43.29573742395389, "lr": 0.00056, "grad_norm": 0.69, "tokens_per_sec": 147333, "dt_s": 4.448, "eta_s": 7292, "world_size": 1, "timestamp": "2026-05-05T05:48:02.655673"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76110, "epoch": 0, "train_loss": 3.7642712593078613, "train_ppl": 43.132262169516316, "lr": 0.00056, "grad_norm": 0.646, "tokens_per_sec": 150325, "dt_s": 4.36, "eta_s": 7294, "world_size": 1, "timestamp": "2026-05-05T05:48:07.015294"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76120, "epoch": 0, "train_loss": 3.8087010830640793, "train_ppl": 45.091830268701315, "lr": 0.00056, "grad_norm": 0.6515, "tokens_per_sec": 146689, "dt_s": 4.468, "eta_s": 7308, "world_size": 1, "timestamp": "2026-05-05T05:48:11.482958"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76130, "epoch": 0, "train_loss": 3.709856703877449, "train_ppl": 40.84795275434498, "lr": 0.00056, "grad_norm": 0.6458, "tokens_per_sec": 147647, "dt_s": 4.439, "eta_s": 7329, "world_size": 1, "timestamp": "2026-05-05T05:48:15.921663"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76140, "epoch": 0, "train_loss": 3.7087566256523132, "train_ppl": 40.80304151844408, "lr": 0.00056, "grad_norm": 0.7318, "tokens_per_sec": 134985, "dt_s": 4.855, "eta_s": 7467, "world_size": 1, "timestamp": "2026-05-05T05:48:20.776729"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76150, "epoch": 0, "train_loss": 3.715379536151886, "train_ppl": 41.07417326010811, "lr": 0.00056, "grad_norm": 0.6989, "tokens_per_sec": 147756, "dt_s": 4.435, "eta_s": 7459, "world_size": 1, "timestamp": "2026-05-05T05:48:25.212140"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76160, "epoch": 0, "train_loss": 3.6799146085977554, "train_ppl": 39.64300875592932, "lr": 0.00056, "grad_norm": 0.6926, "tokens_per_sec": 149456, "dt_s": 4.385, "eta_s": 7462, "world_size": 1, "timestamp": "2026-05-05T05:48:29.597131"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76170, "epoch": 0, "train_loss": 3.6216707676649094, "train_ppl": 37.4000023653415, "lr": 0.00056, "grad_norm": 0.6322, "tokens_per_sec": 149469, "dt_s": 4.385, "eta_s": 7430, "world_size": 1, "timestamp": "2026-05-05T05:48:33.981732"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76180, "epoch": 0, "train_loss": 3.6748476773500443, "train_ppl": 39.44264839087521, "lr": 0.00056, "grad_norm": 0.6989, "tokens_per_sec": 148236, "dt_s": 4.421, "eta_s": 7420, "world_size": 1, "timestamp": "2026-05-05T05:48:38.402770"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76190, "epoch": 0, "train_loss": 3.682577520608902, "train_ppl": 39.74871528122679, "lr": 0.00056, "grad_norm": 0.6178, "tokens_per_sec": 150081, "dt_s": 4.367, "eta_s": 7255, "world_size": 1, "timestamp": "2026-05-05T05:48:42.769495"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76200, "epoch": 0, "train_loss": 3.6733709275722504, "train_ppl": 39.38444445553023, "lr": 0.00056, "grad_norm": 0.652, "tokens_per_sec": 149696, "dt_s": 4.378, "eta_s": 7231, "world_size": 1, "timestamp": "2026-05-05T05:48:47.147421"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76210, "epoch": 0, "train_loss": 3.6941647231578827, "train_ppl": 40.211970428697555, "lr": 0.00056, "grad_norm": 0.7045, "tokens_per_sec": 147459, "dt_s": 4.444, "eta_s": 7246, "world_size": 1, "timestamp": "2026-05-05T05:48:51.591794"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76220, "epoch": 0, "train_loss": 3.6422620862722397, "train_ppl": 38.178101278485705, "lr": 0.00056, "grad_norm": 0.7397, "tokens_per_sec": 148794, "dt_s": 4.404, "eta_s": 7248, "world_size": 1, "timestamp": "2026-05-05T05:48:55.996310"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76230, "epoch": 0, "train_loss": 3.7361443489789963, "train_ppl": 41.93598751976211, "lr": 0.00056, "grad_norm": 0.6501, "tokens_per_sec": 149129, "dt_s": 4.395, "eta_s": 7235, "world_size": 1, "timestamp": "2026-05-05T05:49:00.390881"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76240, "epoch": 0, "train_loss": 3.682907685637474, "train_ppl": 39.76184108366455, "lr": 0.00056, "grad_norm": 0.6317, "tokens_per_sec": 149597, "dt_s": 4.381, "eta_s": 7236, "world_size": 1, "timestamp": "2026-05-05T05:49:04.771713"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76250, "epoch": 0, "train_loss": 3.7324277460575104, "train_ppl": 41.78041738132221, "lr": 0.00056, "grad_norm": 0.6389, "tokens_per_sec": 149917, "dt_s": 4.371, "eta_s": 7229, "world_size": 1, "timestamp": "2026-05-05T05:49:09.143213"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76260, "epoch": 0, "train_loss": 3.706443175673485, "train_ppl": 40.708754828729525, "lr": 0.00056, "grad_norm": 0.707, "tokens_per_sec": 146498, "dt_s": 4.474, "eta_s": 7234, "world_size": 1, "timestamp": "2026-05-05T05:49:13.616724"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76270, "epoch": 0, "train_loss": 3.6595364212989807, "train_ppl": 38.843331755591684, "lr": 0.00056, "grad_norm": 0.6927, "tokens_per_sec": 150543, "dt_s": 4.353, "eta_s": 7213, "world_size": 1, "timestamp": "2026-05-05T05:49:17.970032"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76280, "epoch": 0, "train_loss": 3.665185436606407, "train_ppl": 39.06337927199683, "lr": 0.00056, "grad_norm": 0.6492, "tokens_per_sec": 150320, "dt_s": 4.36, "eta_s": 7197, "world_size": 1, "timestamp": "2026-05-05T05:49:22.329783"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76290, "epoch": 0, "train_loss": 3.7326283156871796, "train_ppl": 41.78879810459499, "lr": 0.00056, "grad_norm": 0.663, "tokens_per_sec": 148157, "dt_s": 4.423, "eta_s": 7207, "world_size": 1, "timestamp": "2026-05-05T05:49:26.753214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76300, "epoch": 0, "train_loss": 3.6785134375095367, "train_ppl": 39.58750101521269, "lr": 0.00056, "grad_norm": 0.7053, "tokens_per_sec": 151488, "dt_s": 4.326, "eta_s": 7188, "world_size": 1, "timestamp": "2026-05-05T05:49:31.079370"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76310, "epoch": 0, "train_loss": 3.7762258797883987, "train_ppl": 43.65098639214163, "lr": 0.00056, "grad_norm": 0.6566, "tokens_per_sec": 150312, "dt_s": 4.36, "eta_s": 7146, "world_size": 1, "timestamp": "2026-05-05T05:49:35.439396"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76320, "epoch": 0, "train_loss": 3.6922023743391037, "train_ppl": 40.13313788979327, "lr": 0.00056, "grad_norm": 0.6848, "tokens_per_sec": 149211, "dt_s": 4.392, "eta_s": 7154, "world_size": 1, "timestamp": "2026-05-05T05:49:39.831570"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76330, "epoch": 0, "train_loss": 3.7730354964733124, "train_ppl": 43.51194492918748, "lr": 0.00056, "grad_norm": 0.7261, "tokens_per_sec": 150400, "dt_s": 4.357, "eta_s": 7149, "world_size": 1, "timestamp": "2026-05-05T05:49:44.189010"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76340, "epoch": 0, "train_loss": 3.581901863217354, "train_ppl": 35.941832334825676, "lr": 0.00056, "grad_norm": 0.6331, "tokens_per_sec": 148561, "dt_s": 4.411, "eta_s": 7141, "world_size": 1, "timestamp": "2026-05-05T05:49:48.600407"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76350, "epoch": 0, "train_loss": 3.8059235215187073, "train_ppl": 44.96675871231824, "lr": 0.00056, "grad_norm": 0.6212, "tokens_per_sec": 148969, "dt_s": 4.399, "eta_s": 7160, "world_size": 1, "timestamp": "2026-05-05T05:49:52.999693"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76360, "epoch": 0, "train_loss": 3.808829128742218, "train_ppl": 45.09760445235833, "lr": 0.00056, "grad_norm": 0.6826, "tokens_per_sec": 150557, "dt_s": 4.353, "eta_s": 7154, "world_size": 1, "timestamp": "2026-05-05T05:49:57.352613"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76370, "epoch": 0, "train_loss": 3.5895954072475433, "train_ppl": 36.21941884723861, "lr": 0.00056, "grad_norm": 0.6165, "tokens_per_sec": 147561, "dt_s": 4.441, "eta_s": 7165, "world_size": 1, "timestamp": "2026-05-05T05:50:01.793893"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76380, "epoch": 0, "train_loss": 3.662864610552788, "train_ppl": 38.97282508454421, "lr": 0.00056, "grad_norm": 0.6963, "tokens_per_sec": 150185, "dt_s": 4.364, "eta_s": 7163, "world_size": 1, "timestamp": "2026-05-05T05:50:06.157573"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76390, "epoch": 0, "train_loss": 3.746876999735832, "train_ppl": 42.388495790887895, "lr": 0.00056, "grad_norm": 0.7126, "tokens_per_sec": 149501, "dt_s": 4.384, "eta_s": 7150, "world_size": 1, "timestamp": "2026-05-05T05:50:10.541244"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76400, "epoch": 0, "train_loss": 3.7077421844005585, "train_ppl": 40.76167021785559, "lr": 0.00056, "grad_norm": 0.6724, "tokens_per_sec": 147656, "dt_s": 4.438, "eta_s": 7158, "world_size": 1, "timestamp": "2026-05-05T05:50:14.979674"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76410, "epoch": 0, "train_loss": 3.7636510729789734, "train_ppl": 43.105520423474566, "lr": 0.00056, "grad_norm": 0.6576, "tokens_per_sec": 149799, "dt_s": 4.375, "eta_s": 7161, "world_size": 1, "timestamp": "2026-05-05T05:50:19.354574"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76420, "epoch": 0, "train_loss": 3.8410544097423553, "train_ppl": 46.574557225442476, "lr": 0.00056, "grad_norm": 0.6882, "tokens_per_sec": 148908, "dt_s": 4.401, "eta_s": 7143, "world_size": 1, "timestamp": "2026-05-05T05:50:23.755691"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76430, "epoch": 0, "train_loss": 3.7033539563417435, "train_ppl": 40.58319060391465, "lr": 0.00056, "grad_norm": 0.642, "tokens_per_sec": 147686, "dt_s": 4.438, "eta_s": 7163, "world_size": 1, "timestamp": "2026-05-05T05:50:28.193248"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76440, "epoch": 0, "train_loss": 3.7549222260713577, "train_ppl": 42.73089633163195, "lr": 0.00056, "grad_norm": 0.6587, "tokens_per_sec": 134490, "dt_s": 4.873, "eta_s": 7317, "world_size": 1, "timestamp": "2026-05-05T05:50:33.066171"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76450, "epoch": 0, "train_loss": 3.6646775752305984, "train_ppl": 39.04354552727875, "lr": 0.00056, "grad_norm": 0.661, "tokens_per_sec": 147429, "dt_s": 4.445, "eta_s": 7315, "world_size": 1, "timestamp": "2026-05-05T05:50:37.511429"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76460, "epoch": 0, "train_loss": 3.6167415976524353, "train_ppl": 37.216104998251126, "lr": 0.00056, "grad_norm": 0.6556, "tokens_per_sec": 148815, "dt_s": 4.404, "eta_s": 7320, "world_size": 1, "timestamp": "2026-05-05T05:50:41.915301"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76470, "epoch": 0, "train_loss": 3.6918881088495255, "train_ppl": 40.120527411188924, "lr": 0.00056, "grad_norm": 0.7043, "tokens_per_sec": 150057, "dt_s": 4.367, "eta_s": 7305, "world_size": 1, "timestamp": "2026-05-05T05:50:46.282694"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76480, "epoch": 0, "train_loss": 3.7112399637699127, "train_ppl": 40.9044951865029, "lr": 0.00056, "grad_norm": 0.6352, "tokens_per_sec": 147138, "dt_s": 4.454, "eta_s": 7305, "world_size": 1, "timestamp": "2026-05-05T05:50:50.736734"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76490, "epoch": 0, "train_loss": 3.604617655277252, "train_ppl": 36.76762326269593, "lr": 0.00056, "grad_norm": 0.6082, "tokens_per_sec": 150919, "dt_s": 4.342, "eta_s": 7129, "world_size": 1, "timestamp": "2026-05-05T05:50:55.079224"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76500, "epoch": 0, "train_loss": 3.71314138174057, "train_ppl": 40.982345718476445, "lr": 0.00056, "grad_norm": 0.6359, "tokens_per_sec": 150378, "dt_s": 4.358, "eta_s": 7096, "world_size": 1, "timestamp": "2026-05-05T05:50:59.437306"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76510, "epoch": 0, "train_loss": 3.7518363296985626, "train_ppl": 42.59923646247262, "lr": 0.00056, "grad_norm": 0.6682, "tokens_per_sec": 126972, "dt_s": 5.161, "eta_s": 7087, "world_size": 1, "timestamp": "2026-05-05T05:51:04.598747"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76520, "epoch": 0, "train_loss": 3.6911110281944275, "train_ppl": 40.08936263580564, "lr": 0.00056, "grad_norm": 0.6754, "tokens_per_sec": 149653, "dt_s": 4.379, "eta_s": 7086, "world_size": 1, "timestamp": "2026-05-05T05:51:08.977937"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76530, "epoch": 0, "train_loss": 3.637658029794693, "train_ppl": 38.002731160738854, "lr": 0.00056, "grad_norm": 0.6285, "tokens_per_sec": 148218, "dt_s": 4.422, "eta_s": 7071, "world_size": 1, "timestamp": "2026-05-05T05:51:13.399525"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76540, "epoch": 0, "train_loss": 3.7442096322774887, "train_ppl": 42.27558075649907, "lr": 0.00056, "grad_norm": 0.623, "tokens_per_sec": 147885, "dt_s": 4.432, "eta_s": 7096, "world_size": 1, "timestamp": "2026-05-05T05:51:17.831080"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76550, "epoch": 0, "train_loss": 3.621155336499214, "train_ppl": 37.380730205687854, "lr": 0.00056, "grad_norm": 0.6806, "tokens_per_sec": 150149, "dt_s": 4.365, "eta_s": 7093, "world_size": 1, "timestamp": "2026-05-05T05:51:22.195811"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76560, "epoch": 0, "train_loss": 3.70428729057312, "train_ppl": 40.62108596669248, "lr": 0.00056, "grad_norm": 0.6631, "tokens_per_sec": 146709, "dt_s": 4.467, "eta_s": 7115, "world_size": 1, "timestamp": "2026-05-05T05:51:26.662882"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76570, "epoch": 0, "train_loss": 3.62014839053154, "train_ppl": 37.34310877469146, "lr": 0.00056, "grad_norm": 0.6503, "tokens_per_sec": 151270, "dt_s": 4.332, "eta_s": 7095, "world_size": 1, "timestamp": "2026-05-05T05:51:30.995284"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76580, "epoch": 0, "train_loss": 3.60647152364254, "train_ppl": 36.83584881738828, "lr": 0.00056, "grad_norm": 0.7138, "tokens_per_sec": 152222, "dt_s": 4.305, "eta_s": 7053, "world_size": 1, "timestamp": "2026-05-05T05:51:35.300539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76590, "epoch": 0, "train_loss": 3.7336137890815735, "train_ppl": 41.83000015173692, "lr": 0.00056, "grad_norm": 0.6876, "tokens_per_sec": 151288, "dt_s": 4.332, "eta_s": 7017, "world_size": 1, "timestamp": "2026-05-05T05:51:39.632403"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76600, "epoch": 0, "train_loss": 3.645879775285721, "train_ppl": 38.31646790884763, "lr": 0.00056, "grad_norm": 0.6305, "tokens_per_sec": 151441, "dt_s": 4.328, "eta_s": 7001, "world_size": 1, "timestamp": "2026-05-05T05:51:43.959913"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76610, "epoch": 0, "train_loss": 3.7418391555547714, "train_ppl": 42.17548615920354, "lr": 0.00056, "grad_norm": 0.6396, "tokens_per_sec": 149892, "dt_s": 4.372, "eta_s": 6966, "world_size": 1, "timestamp": "2026-05-05T05:51:48.332160"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76620, "epoch": 0, "train_loss": 3.6704562604427338, "train_ppl": 39.26981903847702, "lr": 0.00056, "grad_norm": 0.679, "tokens_per_sec": 146063, "dt_s": 4.487, "eta_s": 7011, "world_size": 1, "timestamp": "2026-05-05T05:51:52.818960"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76630, "epoch": 0, "train_loss": 3.7389565259218216, "train_ppl": 42.054084914495526, "lr": 0.00056, "grad_norm": 0.6167, "tokens_per_sec": 150140, "dt_s": 4.365, "eta_s": 7026, "world_size": 1, "timestamp": "2026-05-05T05:51:57.183949"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76640, "epoch": 0, "train_loss": 3.7543561458587646, "train_ppl": 42.706714061950194, "lr": 0.00056, "grad_norm": 0.6004, "tokens_per_sec": 149148, "dt_s": 4.394, "eta_s": 7041, "world_size": 1, "timestamp": "2026-05-05T05:52:01.577981"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76650, "epoch": 0, "train_loss": 3.762843891978264, "train_ppl": 43.070740505096076, "lr": 0.00056, "grad_norm": 0.7328, "tokens_per_sec": 150920, "dt_s": 4.342, "eta_s": 7042, "world_size": 1, "timestamp": "2026-05-05T05:52:05.920438"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76660, "epoch": 0, "train_loss": 3.772351861000061, "train_ppl": 43.48220878562313, "lr": 0.00056, "grad_norm": 0.7743, "tokens_per_sec": 151782, "dt_s": 4.318, "eta_s": 7020, "world_size": 1, "timestamp": "2026-05-05T05:52:10.238207"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76670, "epoch": 0, "train_loss": 3.6964191198349, "train_ppl": 40.302726422776594, "lr": 0.00056, "grad_norm": 0.6589, "tokens_per_sec": 148143, "dt_s": 4.424, "eta_s": 6995, "world_size": 1, "timestamp": "2026-05-05T05:52:14.662014"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76680, "epoch": 0, "train_loss": 3.7351838797330856, "train_ppl": 41.89572863026001, "lr": 0.00056, "grad_norm": 0.679, "tokens_per_sec": 150703, "dt_s": 4.349, "eta_s": 6986, "world_size": 1, "timestamp": "2026-05-05T05:52:19.010692"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76690, "epoch": 0, "train_loss": 3.61942158639431, "train_ppl": 37.315977509489215, "lr": 0.00056, "grad_norm": 0.7844, "tokens_per_sec": 150210, "dt_s": 4.363, "eta_s": 6972, "world_size": 1, "timestamp": "2026-05-05T05:52:23.373659"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76700, "epoch": 0, "train_loss": 3.716043159365654, "train_ppl": 41.10144008141695, "lr": 0.00056, "grad_norm": 0.6871, "tokens_per_sec": 147921, "dt_s": 4.43, "eta_s": 6995, "world_size": 1, "timestamp": "2026-05-05T05:52:27.804093"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76710, "epoch": 0, "train_loss": 3.7403846830129623, "train_ppl": 42.11418766194445, "lr": 0.00056, "grad_norm": 0.7163, "tokens_per_sec": 150536, "dt_s": 4.354, "eta_s": 7002, "world_size": 1, "timestamp": "2026-05-05T05:52:32.157636"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76720, "epoch": 0, "train_loss": 3.665805295109749, "train_ppl": 39.0876005459136, "lr": 0.00056, "grad_norm": 0.6651, "tokens_per_sec": 150856, "dt_s": 4.344, "eta_s": 6973, "world_size": 1, "timestamp": "2026-05-05T05:52:36.501894"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76730, "epoch": 0, "train_loss": 3.6574227064847946, "train_ppl": 38.761314740653525, "lr": 0.00056, "grad_norm": 0.6579, "tokens_per_sec": 132368, "dt_s": 4.951, "eta_s": 7160, "world_size": 1, "timestamp": "2026-05-05T05:52:41.452945"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76740, "epoch": 0, "train_loss": 3.6686677634716034, "train_ppl": 39.19964785524903, "lr": 0.00056, "grad_norm": 0.697, "tokens_per_sec": 149957, "dt_s": 4.37, "eta_s": 7158, "world_size": 1, "timestamp": "2026-05-05T05:52:45.823280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76750, "epoch": 0, "train_loss": 3.677684411406517, "train_ppl": 39.55469554369275, "lr": 0.00056, "grad_norm": 0.657, "tokens_per_sec": 148221, "dt_s": 4.421, "eta_s": 7151, "world_size": 1, "timestamp": "2026-05-05T05:52:50.244745"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76760, "epoch": 0, "train_loss": 3.7863558530807495, "train_ppl": 44.09541695285329, "lr": 0.00056, "grad_norm": 0.6194, "tokens_per_sec": 149411, "dt_s": 4.386, "eta_s": 7157, "world_size": 1, "timestamp": "2026-05-05T05:52:54.631021"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76770, "epoch": 0, "train_loss": 3.669599801301956, "train_ppl": 39.23620044153715, "lr": 0.00056, "grad_norm": 0.6305, "tokens_per_sec": 151270, "dt_s": 4.332, "eta_s": 7149, "world_size": 1, "timestamp": "2026-05-05T05:52:58.963420"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76780, "epoch": 0, "train_loss": 3.78530253469944, "train_ppl": 44.04899489254208, "lr": 0.00056, "grad_norm": 0.6394, "tokens_per_sec": 147455, "dt_s": 4.444, "eta_s": 6983, "world_size": 1, "timestamp": "2026-05-05T05:53:03.407877"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76790, "epoch": 0, "train_loss": 3.7512544691562653, "train_ppl": 42.574456857479426, "lr": 0.00056, "grad_norm": 0.7525, "tokens_per_sec": 149901, "dt_s": 4.372, "eta_s": 6979, "world_size": 1, "timestamp": "2026-05-05T05:53:07.779833"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76800, "epoch": 0, "train_loss": 3.7478531897068024, "train_ppl": 42.429895218929076, "lr": 0.00056, "grad_norm": 0.703, "tokens_per_sec": 149349, "dt_s": 4.388, "eta_s": 6964, "world_size": 1, "timestamp": "2026-05-05T05:53:12.167954"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76810, "epoch": 0, "train_loss": 3.8676600009202957, "train_ppl": 47.83033209575246, "lr": 0.00056, "grad_norm": 0.7568, "tokens_per_sec": 147691, "dt_s": 4.437, "eta_s": 6976, "world_size": 1, "timestamp": "2026-05-05T05:53:16.605359"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76820, "epoch": 0, "train_loss": 3.420940935611725, "train_ppl": 30.598192408984477, "lr": 0.00056, "grad_norm": 0.7007, "tokens_per_sec": 148229, "dt_s": 4.421, "eta_s": 7000, "world_size": 1, "timestamp": "2026-05-05T05:53:21.026618"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76830, "epoch": 0, "train_loss": 3.6991301625967026, "train_ppl": 40.41213707899802, "lr": 0.00056, "grad_norm": 0.6681, "tokens_per_sec": 149150, "dt_s": 4.394, "eta_s": 6979, "world_size": 1, "timestamp": "2026-05-05T05:53:25.420593"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76840, "epoch": 0, "train_loss": 3.6191363781690598, "train_ppl": 37.30533620333641, "lr": 0.00056, "grad_norm": 0.6305, "tokens_per_sec": 147084, "dt_s": 4.456, "eta_s": 7001, "world_size": 1, "timestamp": "2026-05-05T05:53:29.876259"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76850, "epoch": 0, "train_loss": 3.781398221850395, "train_ppl": 43.87734913320984, "lr": 0.00056, "grad_norm": 0.669, "tokens_per_sec": 151313, "dt_s": 4.331, "eta_s": 6979, "world_size": 1, "timestamp": "2026-05-05T05:53:34.207399"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76860, "epoch": 0, "train_loss": 3.704055890440941, "train_ppl": 40.61168732949541, "lr": 0.00056, "grad_norm": 0.7175, "tokens_per_sec": 148943, "dt_s": 4.4, "eta_s": 6963, "world_size": 1, "timestamp": "2026-05-05T05:53:38.607501"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76870, "epoch": 0, "train_loss": 3.7560160160064697, "train_ppl": 42.777660526398286, "lr": 0.00056, "grad_norm": 0.6335, "tokens_per_sec": 149890, "dt_s": 4.372, "eta_s": 6943, "world_size": 1, "timestamp": "2026-05-05T05:53:42.979800"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76880, "epoch": 0, "train_loss": 3.654222786426544, "train_ppl": 38.637479868643304, "lr": 0.00056, "grad_norm": 0.6728, "tokens_per_sec": 151161, "dt_s": 4.336, "eta_s": 6920, "world_size": 1, "timestamp": "2026-05-05T05:53:47.315309"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76890, "epoch": 0, "train_loss": 3.6571090817451477, "train_ppl": 38.74916013950105, "lr": 0.00056, "grad_norm": 0.6281, "tokens_per_sec": 148144, "dt_s": 4.424, "eta_s": 6906, "world_size": 1, "timestamp": "2026-05-05T05:53:51.739096"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76900, "epoch": 0, "train_loss": 3.591747596859932, "train_ppl": 36.297453847185544, "lr": 0.00056, "grad_norm": 0.6691, "tokens_per_sec": 151229, "dt_s": 4.334, "eta_s": 6902, "world_size": 1, "timestamp": "2026-05-05T05:53:56.072654"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76910, "epoch": 0, "train_loss": 3.703265354037285, "train_ppl": 40.579594998996654, "lr": 0.00056, "grad_norm": 0.7041, "tokens_per_sec": 149962, "dt_s": 4.37, "eta_s": 6888, "world_size": 1, "timestamp": "2026-05-05T05:54:00.442863"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76920, "epoch": 0, "train_loss": 3.581663116812706, "train_ppl": 35.93325237583722, "lr": 0.00056, "grad_norm": 0.6937, "tokens_per_sec": 147059, "dt_s": 4.456, "eta_s": 6910, "world_size": 1, "timestamp": "2026-05-05T05:54:04.899290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76930, "epoch": 0, "train_loss": 3.672309175133705, "train_ppl": 39.34265011713546, "lr": 0.00056, "grad_norm": 0.6472, "tokens_per_sec": 149330, "dt_s": 4.389, "eta_s": 6923, "world_size": 1, "timestamp": "2026-05-05T05:54:09.287947"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76940, "epoch": 0, "train_loss": 3.7060734927654266, "train_ppl": 40.69370827925845, "lr": 0.00056, "grad_norm": 0.6579, "tokens_per_sec": 148892, "dt_s": 4.402, "eta_s": 6911, "world_size": 1, "timestamp": "2026-05-05T05:54:13.689512"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76950, "epoch": 0, "train_loss": 3.696237102150917, "train_ppl": 40.2953912814379, "lr": 0.00056, "grad_norm": 0.6006, "tokens_per_sec": 148747, "dt_s": 4.406, "eta_s": 6930, "world_size": 1, "timestamp": "2026-05-05T05:54:18.095401"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76960, "epoch": 0, "train_loss": 3.771711602807045, "train_ppl": 43.4543778556389, "lr": 0.00056, "grad_norm": 0.6567, "tokens_per_sec": 152279, "dt_s": 4.304, "eta_s": 6904, "world_size": 1, "timestamp": "2026-05-05T05:54:22.399057"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76970, "epoch": 0, "train_loss": 3.694717228412628, "train_ppl": 40.23419389238798, "lr": 0.00056, "grad_norm": 1.0195, "tokens_per_sec": 151080, "dt_s": 4.338, "eta_s": 6863, "world_size": 1, "timestamp": "2026-05-05T05:54:26.736874"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76980, "epoch": 0, "train_loss": 3.7720285803079605, "train_ppl": 43.46815409899967, "lr": 0.00056, "grad_norm": 0.7568, "tokens_per_sec": 149779, "dt_s": 4.376, "eta_s": 6854, "world_size": 1, "timestamp": "2026-05-05T05:54:31.112412"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 76990, "epoch": 0, "train_loss": 3.7362342923879623, "train_ppl": 41.93975955507031, "lr": 0.00056, "grad_norm": 0.6627, "tokens_per_sec": 149856, "dt_s": 4.373, "eta_s": 6841, "world_size": 1, "timestamp": "2026-05-05T05:54:35.485671"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77000, "epoch": 0, "train_loss": 3.756608083844185, "train_ppl": 42.802995302581664, "lr": 0.00056, "grad_norm": 0.6447, "tokens_per_sec": 149981, "dt_s": 4.37, "eta_s": 6825, "world_size": 1, "timestamp": "2026-05-05T05:54:39.855310"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77010, "epoch": 0, "train_loss": 3.553222805261612, "train_ppl": 34.925695018557086, "lr": 0.00056, "grad_norm": 0.6235, "tokens_per_sec": 127298, "dt_s": 5.148, "eta_s": 6837, "world_size": 1, "timestamp": "2026-05-05T05:54:45.003531"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77020, "epoch": 0, "train_loss": 3.666526585817337, "train_ppl": 39.11580423927749, "lr": 0.00056, "grad_norm": 0.6746, "tokens_per_sec": 149692, "dt_s": 4.378, "eta_s": 6845, "world_size": 1, "timestamp": "2026-05-05T05:54:49.381579"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77030, "epoch": 0, "train_loss": 3.7896167784929276, "train_ppl": 44.23944352087615, "lr": 0.00056, "grad_norm": 0.7004, "tokens_per_sec": 131128, "dt_s": 4.998, "eta_s": 7036, "world_size": 1, "timestamp": "2026-05-05T05:54:54.379431"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77040, "epoch": 0, "train_loss": 3.7084244042634964, "train_ppl": 40.78948812681083, "lr": 0.00056, "grad_norm": 0.6466, "tokens_per_sec": 149298, "dt_s": 4.39, "eta_s": 7036, "world_size": 1, "timestamp": "2026-05-05T05:54:58.769038"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77050, "epoch": 0, "train_loss": 3.5174458622932434, "train_ppl": 33.698248485824976, "lr": 0.00056, "grad_norm": 0.7164, "tokens_per_sec": 148336, "dt_s": 4.418, "eta_s": 7047, "world_size": 1, "timestamp": "2026-05-05T05:55:03.187124"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77060, "epoch": 0, "train_loss": 3.7951730638742447, "train_ppl": 44.48593464756523, "lr": 0.00056, "grad_norm": 0.6894, "tokens_per_sec": 148204, "dt_s": 4.422, "eta_s": 7063, "world_size": 1, "timestamp": "2026-05-05T05:55:07.609130"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77070, "epoch": 0, "train_loss": 3.7453987151384354, "train_ppl": 42.32587982396615, "lr": 0.00056, "grad_norm": 0.7534, "tokens_per_sec": 149270, "dt_s": 4.39, "eta_s": 7063, "world_size": 1, "timestamp": "2026-05-05T05:55:11.999566"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77080, "epoch": 0, "train_loss": 3.7366814464330673, "train_ppl": 41.95851728169021, "lr": 0.00056, "grad_norm": 0.6887, "tokens_per_sec": 147557, "dt_s": 4.441, "eta_s": 6885, "world_size": 1, "timestamp": "2026-05-05T05:55:16.440966"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77090, "epoch": 0, "train_loss": 3.6558692306280136, "train_ppl": 38.70114672091007, "lr": 0.00056, "grad_norm": 0.6792, "tokens_per_sec": 149871, "dt_s": 4.373, "eta_s": 6875, "world_size": 1, "timestamp": "2026-05-05T05:55:20.813795"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77100, "epoch": 0, "train_loss": 3.691288262605667, "train_ppl": 40.09646848007083, "lr": 0.00056, "grad_norm": 0.6827, "tokens_per_sec": 148822, "dt_s": 4.404, "eta_s": 6866, "world_size": 1, "timestamp": "2026-05-05T05:55:25.217434"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77110, "epoch": 0, "train_loss": 3.650007262825966, "train_ppl": 38.47494548485048, "lr": 0.00056, "grad_norm": 0.6468, "tokens_per_sec": 145960, "dt_s": 4.49, "eta_s": 6883, "world_size": 1, "timestamp": "2026-05-05T05:55:29.707450"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77120, "epoch": 0, "train_loss": 3.736391067504883, "train_ppl": 41.946335181211985, "lr": 0.00056, "grad_norm": 0.6865, "tokens_per_sec": 149808, "dt_s": 4.375, "eta_s": 6873, "world_size": 1, "timestamp": "2026-05-05T05:55:34.082132"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77130, "epoch": 0, "train_loss": 3.6173678785562515, "train_ppl": 37.23942003424479, "lr": 0.00056, "grad_norm": 0.7038, "tokens_per_sec": 147597, "dt_s": 4.44, "eta_s": 6869, "world_size": 1, "timestamp": "2026-05-05T05:55:38.522356"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77140, "epoch": 0, "train_loss": 3.675690397620201, "train_ppl": 39.475901519757784, "lr": 0.00056, "grad_norm": 0.7064, "tokens_per_sec": 146629, "dt_s": 4.469, "eta_s": 6894, "world_size": 1, "timestamp": "2026-05-05T05:55:42.991821"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77150, "epoch": 0, "train_loss": 3.6549244672060013, "train_ppl": 38.664600559554806, "lr": 0.00056, "grad_norm": 0.6592, "tokens_per_sec": 149602, "dt_s": 4.381, "eta_s": 6883, "world_size": 1, "timestamp": "2026-05-05T05:55:47.372507"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77160, "epoch": 0, "train_loss": 3.6610018610954285, "train_ppl": 38.90029604844952, "lr": 0.00056, "grad_norm": 0.6637, "tokens_per_sec": 148717, "dt_s": 4.407, "eta_s": 6852, "world_size": 1, "timestamp": "2026-05-05T05:55:51.779286"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77170, "epoch": 0, "train_loss": 3.6288623958826065, "train_ppl": 37.66993875535979, "lr": 0.00056, "grad_norm": 0.6612, "tokens_per_sec": 148712, "dt_s": 4.407, "eta_s": 6858, "world_size": 1, "timestamp": "2026-05-05T05:55:56.186199"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77180, "epoch": 0, "train_loss": 3.678250417113304, "train_ppl": 39.57709006421605, "lr": 0.00056, "grad_norm": 0.6373, "tokens_per_sec": 150159, "dt_s": 4.364, "eta_s": 6830, "world_size": 1, "timestamp": "2026-05-05T05:56:00.550640"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77190, "epoch": 0, "train_loss": 3.64543579518795, "train_ppl": 38.29945993555943, "lr": 0.00056, "grad_norm": 0.6187, "tokens_per_sec": 148036, "dt_s": 4.427, "eta_s": 6813, "world_size": 1, "timestamp": "2026-05-05T05:56:04.977666"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77200, "epoch": 0, "train_loss": 3.6732905507087708, "train_ppl": 39.38127898463202, "lr": 0.00056, "grad_norm": 0.6144, "tokens_per_sec": 150563, "dt_s": 4.353, "eta_s": 6800, "world_size": 1, "timestamp": "2026-05-05T05:56:09.330410"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77210, "epoch": 0, "train_loss": 3.6685339510440826, "train_ppl": 39.19440280614576, "lr": 0.00056, "grad_norm": 0.6566, "tokens_per_sec": 149450, "dt_s": 4.385, "eta_s": 6788, "world_size": 1, "timestamp": "2026-05-05T05:56:13.715540"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77220, "epoch": 0, "train_loss": 3.557396739721298, "train_ppl": 35.07177723724312, "lr": 0.00056, "grad_norm": 0.6506, "tokens_per_sec": 147258, "dt_s": 4.45, "eta_s": 6798, "world_size": 1, "timestamp": "2026-05-05T05:56:18.165978"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77230, "epoch": 0, "train_loss": 3.7385197281837463, "train_ppl": 42.03571979654103, "lr": 0.00056, "grad_norm": 0.6767, "tokens_per_sec": 149849, "dt_s": 4.373, "eta_s": 6796, "world_size": 1, "timestamp": "2026-05-05T05:56:22.539443"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77240, "epoch": 0, "train_loss": 3.6696563959121704, "train_ppl": 39.238421061844406, "lr": 0.00056, "grad_norm": 0.6401, "tokens_per_sec": 150766, "dt_s": 4.347, "eta_s": 6767, "world_size": 1, "timestamp": "2026-05-05T05:56:26.886320"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77250, "epoch": 0, "train_loss": 3.5895240008831024, "train_ppl": 36.2168326425534, "lr": 0.00056, "grad_norm": 0.6763, "tokens_per_sec": 148779, "dt_s": 4.405, "eta_s": 6778, "world_size": 1, "timestamp": "2026-05-05T05:56:31.291270"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77260, "epoch": 0, "train_loss": 3.55728280544281, "train_ppl": 35.06778158723337, "lr": 0.00056, "grad_norm": 0.7412, "tokens_per_sec": 151548, "dt_s": 4.324, "eta_s": 6755, "world_size": 1, "timestamp": "2026-05-05T05:56:35.615709"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77270, "epoch": 0, "train_loss": 3.6832039803266525, "train_ppl": 39.773624051538796, "lr": 0.00056, "grad_norm": 0.6461, "tokens_per_sec": 148885, "dt_s": 4.402, "eta_s": 6736, "world_size": 1, "timestamp": "2026-05-05T05:56:40.017472"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77280, "epoch": 0, "train_loss": 3.805431991815567, "train_ppl": 44.94466164588434, "lr": 0.00056, "grad_norm": 0.6857, "tokens_per_sec": 148818, "dt_s": 4.404, "eta_s": 6741, "world_size": 1, "timestamp": "2026-05-05T05:56:44.421261"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77290, "epoch": 0, "train_loss": 3.6760000437498093, "train_ppl": 39.48812697256056, "lr": 0.00056, "grad_norm": 0.6532, "tokens_per_sec": 148703, "dt_s": 4.407, "eta_s": 6755, "world_size": 1, "timestamp": "2026-05-05T05:56:48.828408"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77300, "epoch": 0, "train_loss": 3.6976760923862457, "train_ppl": 40.353417695729505, "lr": 0.00056, "grad_norm": 0.6845, "tokens_per_sec": 146197, "dt_s": 4.483, "eta_s": 6775, "world_size": 1, "timestamp": "2026-05-05T05:56:53.311119"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77310, "epoch": 0, "train_loss": 3.639459252357483, "train_ppl": 38.071244222666046, "lr": 0.00056, "grad_norm": 0.6243, "tokens_per_sec": 148866, "dt_s": 4.402, "eta_s": 6794, "world_size": 1, "timestamp": "2026-05-05T05:56:57.713466"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77320, "epoch": 0, "train_loss": 3.6508832573890686, "train_ppl": 38.508664094413994, "lr": 0.00056, "grad_norm": 0.6536, "tokens_per_sec": 148006, "dt_s": 4.428, "eta_s": 6798, "world_size": 1, "timestamp": "2026-05-05T05:57:02.141407"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77330, "epoch": 0, "train_loss": 3.775908961892128, "train_ppl": 43.63715480521857, "lr": 0.00056, "grad_norm": 0.7119, "tokens_per_sec": 132496, "dt_s": 4.946, "eta_s": 6960, "world_size": 1, "timestamp": "2026-05-05T05:57:07.087684"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77340, "epoch": 0, "train_loss": 3.6012881249189377, "train_ppl": 36.645407917594866, "lr": 0.00056, "grad_norm": 0.7868, "tokens_per_sec": 150060, "dt_s": 4.367, "eta_s": 6943, "world_size": 1, "timestamp": "2026-05-05T05:57:11.454980"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77350, "epoch": 0, "train_loss": 3.6801286041736603, "train_ppl": 39.65149309219148, "lr": 0.00056, "grad_norm": 0.6654, "tokens_per_sec": 149696, "dt_s": 4.378, "eta_s": 6907, "world_size": 1, "timestamp": "2026-05-05T05:57:15.832927"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77360, "epoch": 0, "train_loss": 3.7491599768400192, "train_ppl": 42.48537830446038, "lr": 0.00056, "grad_norm": 0.6408, "tokens_per_sec": 149695, "dt_s": 4.378, "eta_s": 6895, "world_size": 1, "timestamp": "2026-05-05T05:57:20.210889"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77370, "epoch": 0, "train_loss": 3.720237076282501, "train_ppl": 41.27417807749339, "lr": 0.00056, "grad_norm": 0.6606, "tokens_per_sec": 152986, "dt_s": 4.284, "eta_s": 6846, "world_size": 1, "timestamp": "2026-05-05T05:57:24.494679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77380, "epoch": 0, "train_loss": 3.6406602412462234, "train_ppl": 38.11699483144527, "lr": 0.00056, "grad_norm": 0.6053, "tokens_per_sec": 151723, "dt_s": 4.319, "eta_s": 6650, "world_size": 1, "timestamp": "2026-05-05T05:57:28.814139"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77390, "epoch": 0, "train_loss": 3.6191479712724686, "train_ppl": 37.305768690463644, "lr": 0.00056, "grad_norm": 0.7206, "tokens_per_sec": 152908, "dt_s": 4.286, "eta_s": 6620, "world_size": 1, "timestamp": "2026-05-05T05:57:33.100107"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77400, "epoch": 0, "train_loss": 3.7776887267827988, "train_ppl": 43.71488783400872, "lr": 0.00056, "grad_norm": 0.6614, "tokens_per_sec": 152362, "dt_s": 4.301, "eta_s": 6593, "world_size": 1, "timestamp": "2026-05-05T05:57:37.401446"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77410, "epoch": 0, "train_loss": 3.668689265847206, "train_ppl": 39.200490749862794, "lr": 0.00056, "grad_norm": 0.6575, "tokens_per_sec": 145758, "dt_s": 4.496, "eta_s": 6625, "world_size": 1, "timestamp": "2026-05-05T05:57:41.897674"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77420, "epoch": 0, "train_loss": 3.6322270035743713, "train_ppl": 37.79689678325307, "lr": 0.00056, "grad_norm": 0.6916, "tokens_per_sec": 151721, "dt_s": 4.32, "eta_s": 6631, "world_size": 1, "timestamp": "2026-05-05T05:57:46.217194"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77430, "epoch": 0, "train_loss": 3.6913079917430878, "train_ppl": 40.097259556611164, "lr": 0.00056, "grad_norm": 0.6358, "tokens_per_sec": 149349, "dt_s": 4.388, "eta_s": 6648, "world_size": 1, "timestamp": "2026-05-05T05:57:50.605337"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77440, "epoch": 0, "train_loss": 3.705120235681534, "train_ppl": 40.6549351968652, "lr": 0.00056, "grad_norm": 0.6384, "tokens_per_sec": 148649, "dt_s": 4.409, "eta_s": 6681, "world_size": 1, "timestamp": "2026-05-05T05:57:55.014068"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77450, "epoch": 0, "train_loss": 3.7460927069187164, "train_ppl": 42.355263831607694, "lr": 0.00056, "grad_norm": 0.6105, "tokens_per_sec": 149990, "dt_s": 4.369, "eta_s": 6697, "world_size": 1, "timestamp": "2026-05-05T05:57:59.383422"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77460, "epoch": 0, "train_loss": 3.7201396375894547, "train_ppl": 41.27015657145328, "lr": 0.00056, "grad_norm": 0.6883, "tokens_per_sec": 149070, "dt_s": 4.396, "eta_s": 6662, "world_size": 1, "timestamp": "2026-05-05T05:58:03.779761"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77470, "epoch": 0, "train_loss": 3.7627771198749542, "train_ppl": 43.06786467717506, "lr": 0.00056, "grad_norm": 0.6543, "tokens_per_sec": 148764, "dt_s": 4.405, "eta_s": 6684, "world_size": 1, "timestamp": "2026-05-05T05:58:08.185097"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77480, "epoch": 0, "train_loss": 3.641744151711464, "train_ppl": 38.15833264024119, "lr": 0.00056, "grad_norm": 0.6528, "tokens_per_sec": 151720, "dt_s": 4.32, "eta_s": 6659, "world_size": 1, "timestamp": "2026-05-05T05:58:12.504632"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77490, "epoch": 0, "train_loss": 3.7001113295555115, "train_ppl": 40.45180759115014, "lr": 0.00056, "grad_norm": 0.708, "tokens_per_sec": 149332, "dt_s": 4.389, "eta_s": 6648, "world_size": 1, "timestamp": "2026-05-05T05:58:16.893271"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77500, "epoch": 0, "train_loss": 3.704943925142288, "train_ppl": 40.6477679351681, "lr": 0.00056, "grad_norm": 0.6123, "tokens_per_sec": 149653, "dt_s": 4.379, "eta_s": 6647, "world_size": 1, "timestamp": "2026-05-05T05:58:21.272460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77510, "epoch": 0, "train_loss": 3.63031305372715, "train_ppl": 37.724624603155654, "lr": 0.00056, "grad_norm": 0.6326, "tokens_per_sec": 127009, "dt_s": 5.16, "eta_s": 6640, "world_size": 1, "timestamp": "2026-05-05T05:58:26.432418"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77520, "epoch": 0, "train_loss": 3.6239464432001114, "train_ppl": 37.48520955092277, "lr": 0.00056, "grad_norm": 0.6539, "tokens_per_sec": 144981, "dt_s": 4.52, "eta_s": 6670, "world_size": 1, "timestamp": "2026-05-05T05:58:30.952721"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77530, "epoch": 0, "train_loss": 3.7433658093214035, "train_ppl": 42.239922697635045, "lr": 0.00056, "grad_norm": 0.6709, "tokens_per_sec": 149225, "dt_s": 4.392, "eta_s": 6688, "world_size": 1, "timestamp": "2026-05-05T05:58:35.344469"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77540, "epoch": 0, "train_loss": 3.617006614804268, "train_ppl": 37.22596921143517, "lr": 0.00056, "grad_norm": 0.6386, "tokens_per_sec": 149439, "dt_s": 4.385, "eta_s": 6683, "world_size": 1, "timestamp": "2026-05-05T05:58:39.729953"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77550, "epoch": 0, "train_loss": 3.7071643024683, "train_ppl": 40.73812158992665, "lr": 0.00056, "grad_norm": 0.65, "tokens_per_sec": 148367, "dt_s": 4.417, "eta_s": 6690, "world_size": 1, "timestamp": "2026-05-05T05:58:44.147101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77560, "epoch": 0, "train_loss": 3.6773230880498886, "train_ppl": 39.54040609004063, "lr": 0.00056, "grad_norm": 0.665, "tokens_per_sec": 150928, "dt_s": 4.342, "eta_s": 6671, "world_size": 1, "timestamp": "2026-05-05T05:58:48.489336"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77570, "epoch": 0, "train_loss": 3.6959648579359055, "train_ppl": 40.28442258741939, "lr": 0.00056, "grad_norm": 0.661, "tokens_per_sec": 149949, "dt_s": 4.371, "eta_s": 6622, "world_size": 1, "timestamp": "2026-05-05T05:58:52.859909"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77580, "epoch": 0, "train_loss": 3.702372208237648, "train_ppl": 40.54336768471737, "lr": 0.00056, "grad_norm": 0.6834, "tokens_per_sec": 150476, "dt_s": 4.355, "eta_s": 6606, "world_size": 1, "timestamp": "2026-05-05T05:58:57.215125"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77590, "epoch": 0, "train_loss": 3.715341717004776, "train_ppl": 41.07261989928074, "lr": 0.00056, "grad_norm": 0.6631, "tokens_per_sec": 149383, "dt_s": 4.387, "eta_s": 6602, "world_size": 1, "timestamp": "2026-05-05T05:59:01.602254"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77600, "epoch": 0, "train_loss": 3.6768398731946945, "train_ppl": 39.52130419396917, "lr": 0.00056, "grad_norm": 0.6403, "tokens_per_sec": 149037, "dt_s": 4.397, "eta_s": 6592, "world_size": 1, "timestamp": "2026-05-05T05:59:05.999535"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77610, "epoch": 0, "train_loss": 3.6698570251464844, "train_ppl": 39.24629422598477, "lr": 0.00056, "grad_norm": 0.6506, "tokens_per_sec": 150993, "dt_s": 4.34, "eta_s": 6587, "world_size": 1, "timestamp": "2026-05-05T05:59:10.339880"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77620, "epoch": 0, "train_loss": 3.7735378742218018, "train_ppl": 43.533809853879355, "lr": 0.00056, "grad_norm": 0.6904, "tokens_per_sec": 135047, "dt_s": 4.853, "eta_s": 6728, "world_size": 1, "timestamp": "2026-05-05T05:59:15.192726"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77630, "epoch": 0, "train_loss": 3.6375704258680344, "train_ppl": 37.99940211808615, "lr": 0.00056, "grad_norm": 0.6854, "tokens_per_sec": 150793, "dt_s": 4.346, "eta_s": 6721, "world_size": 1, "timestamp": "2026-05-05T05:59:19.538821"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77640, "epoch": 0, "train_loss": 3.7519589960575104, "train_ppl": 42.604462276212644, "lr": 0.00056, "grad_norm": 0.7835, "tokens_per_sec": 153414, "dt_s": 4.272, "eta_s": 6682, "world_size": 1, "timestamp": "2026-05-05T05:59:23.810654"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77650, "epoch": 0, "train_loss": 3.5856795758008957, "train_ppl": 36.07786703533195, "lr": 0.00056, "grad_norm": 0.6324, "tokens_per_sec": 150881, "dt_s": 4.344, "eta_s": 6661, "world_size": 1, "timestamp": "2026-05-05T05:59:28.154225"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77660, "epoch": 0, "train_loss": 3.713533952832222, "train_ppl": 40.99843736102378, "lr": 0.00056, "grad_norm": 0.6602, "tokens_per_sec": 149407, "dt_s": 4.386, "eta_s": 6671, "world_size": 1, "timestamp": "2026-05-05T05:59:32.540621"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77670, "epoch": 0, "train_loss": 3.798605352640152, "train_ppl": 44.63888555698652, "lr": 0.00056, "grad_norm": 0.6711, "tokens_per_sec": 150764, "dt_s": 4.347, "eta_s": 6514, "world_size": 1, "timestamp": "2026-05-05T05:59:36.887530"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77680, "epoch": 0, "train_loss": 3.6381433308124542, "train_ppl": 38.02117840071898, "lr": 0.00056, "grad_norm": 0.6508, "tokens_per_sec": 147965, "dt_s": 4.429, "eta_s": 6535, "world_size": 1, "timestamp": "2026-05-05T05:59:41.316704"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77690, "epoch": 0, "train_loss": 3.70750430226326, "train_ppl": 40.75197489784175, "lr": 0.00056, "grad_norm": 0.7515, "tokens_per_sec": 148780, "dt_s": 4.405, "eta_s": 6570, "world_size": 1, "timestamp": "2026-05-05T05:59:45.721583"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77700, "epoch": 0, "train_loss": 3.709745094180107, "train_ppl": 40.84339398110741, "lr": 0.00056, "grad_norm": 0.6602, "tokens_per_sec": 150416, "dt_s": 4.357, "eta_s": 6570, "world_size": 1, "timestamp": "2026-05-05T05:59:50.078568"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77710, "epoch": 0, "train_loss": 3.682282656431198, "train_ppl": 39.73699653678448, "lr": 0.00056, "grad_norm": 0.7395, "tokens_per_sec": 147975, "dt_s": 4.429, "eta_s": 6578, "world_size": 1, "timestamp": "2026-05-05T05:59:54.507405"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77720, "epoch": 0, "train_loss": 3.6284434348344803, "train_ppl": 37.65415982394562, "lr": 0.00056, "grad_norm": 0.7146, "tokens_per_sec": 150627, "dt_s": 4.351, "eta_s": 6575, "world_size": 1, "timestamp": "2026-05-05T05:59:58.858317"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77730, "epoch": 0, "train_loss": 3.753574773669243, "train_ppl": 42.67335725701332, "lr": 0.00056, "grad_norm": 0.7223, "tokens_per_sec": 149491, "dt_s": 4.384, "eta_s": 6557, "world_size": 1, "timestamp": "2026-05-05T06:00:03.242256"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77740, "epoch": 0, "train_loss": 3.6461966931819916, "train_ppl": 38.32861300764775, "lr": 0.00056, "grad_norm": 0.7393, "tokens_per_sec": 146735, "dt_s": 4.466, "eta_s": 6571, "world_size": 1, "timestamp": "2026-05-05T06:00:07.708530"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77750, "epoch": 0, "train_loss": 3.747885689139366, "train_ppl": 42.431274188855184, "lr": 0.00056, "grad_norm": 0.6304, "tokens_per_sec": 149089, "dt_s": 4.396, "eta_s": 6578, "world_size": 1, "timestamp": "2026-05-05T06:00:12.104320"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77760, "epoch": 0, "train_loss": 3.632003888487816, "train_ppl": 37.788464666057, "lr": 0.00056, "grad_norm": 0.6731, "tokens_per_sec": 149867, "dt_s": 4.373, "eta_s": 6557, "world_size": 1, "timestamp": "2026-05-05T06:00:16.477276"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77770, "epoch": 0, "train_loss": 3.623112067580223, "train_ppl": 37.453945850615504, "lr": 0.00056, "grad_norm": 0.6517, "tokens_per_sec": 145051, "dt_s": 4.518, "eta_s": 6603, "world_size": 1, "timestamp": "2026-05-05T06:00:20.995400"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77780, "epoch": 0, "train_loss": 3.725630834698677, "train_ppl": 41.497402490967204, "lr": 0.00056, "grad_norm": 0.647, "tokens_per_sec": 150188, "dt_s": 4.364, "eta_s": 6592, "world_size": 1, "timestamp": "2026-05-05T06:00:25.358996"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77790, "epoch": 0, "train_loss": 3.584802284836769, "train_ppl": 36.046230127992445, "lr": 0.00056, "grad_norm": 0.6901, "tokens_per_sec": 147168, "dt_s": 4.453, "eta_s": 6584, "world_size": 1, "timestamp": "2026-05-05T06:00:29.812132"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77800, "epoch": 0, "train_loss": 3.79238823056221, "train_ppl": 44.362221075632775, "lr": 0.00056, "grad_norm": 0.6953, "tokens_per_sec": 149142, "dt_s": 4.394, "eta_s": 6579, "world_size": 1, "timestamp": "2026-05-05T06:00:34.206315"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77810, "epoch": 0, "train_loss": 3.765669733285904, "train_ppl": 43.19262371296082, "lr": 0.00056, "grad_norm": 0.6605, "tokens_per_sec": 149488, "dt_s": 4.384, "eta_s": 6578, "world_size": 1, "timestamp": "2026-05-05T06:00:38.590352"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77820, "epoch": 0, "train_loss": 3.7689029574394226, "train_ppl": 43.3325011529086, "lr": 0.00056, "grad_norm": 0.7081, "tokens_per_sec": 146330, "dt_s": 4.479, "eta_s": 6562, "world_size": 1, "timestamp": "2026-05-05T06:00:43.069004"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77830, "epoch": 0, "train_loss": 3.716550439596176, "train_ppl": 41.12229531869381, "lr": 0.00056, "grad_norm": 0.6701, "tokens_per_sec": 148854, "dt_s": 4.403, "eta_s": 6569, "world_size": 1, "timestamp": "2026-05-05T06:00:47.471728"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77840, "epoch": 0, "train_loss": 3.675225615501404, "train_ppl": 39.457558089787334, "lr": 0.00056, "grad_norm": 0.6887, "tokens_per_sec": 150488, "dt_s": 4.355, "eta_s": 6535, "world_size": 1, "timestamp": "2026-05-05T06:00:51.826646"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77850, "epoch": 0, "train_loss": 3.708295002579689, "train_ppl": 40.78421023985667, "lr": 0.00056, "grad_norm": 0.734, "tokens_per_sec": 147302, "dt_s": 4.449, "eta_s": 6547, "world_size": 1, "timestamp": "2026-05-05T06:00:56.275722"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77860, "epoch": 0, "train_loss": 3.748590186238289, "train_ppl": 42.46117743056155, "lr": 0.00056, "grad_norm": 0.6987, "tokens_per_sec": 148119, "dt_s": 4.425, "eta_s": 6555, "world_size": 1, "timestamp": "2026-05-05T06:01:00.700290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77870, "epoch": 0, "train_loss": 3.611745238304138, "train_ppl": 37.0306237155629, "lr": 0.00056, "grad_norm": 0.6411, "tokens_per_sec": 148935, "dt_s": 4.4, "eta_s": 6527, "world_size": 1, "timestamp": "2026-05-05T06:01:05.100604"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77880, "epoch": 0, "train_loss": 3.712057501077652, "train_ppl": 40.93794981071048, "lr": 0.00056, "grad_norm": 0.8242, "tokens_per_sec": 148651, "dt_s": 4.409, "eta_s": 6525, "world_size": 1, "timestamp": "2026-05-05T06:01:09.509320"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77890, "epoch": 0, "train_loss": 3.7951406091451645, "train_ppl": 44.48449089203684, "lr": 0.00056, "grad_norm": 0.6691, "tokens_per_sec": 149659, "dt_s": 4.379, "eta_s": 6527, "world_size": 1, "timestamp": "2026-05-05T06:01:13.888328"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77900, "epoch": 0, "train_loss": 3.671082839369774, "train_ppl": 39.29443238985358, "lr": 0.00056, "grad_norm": 0.755, "tokens_per_sec": 148244, "dt_s": 4.421, "eta_s": 6514, "world_size": 1, "timestamp": "2026-05-05T06:01:18.309207"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77910, "epoch": 0, "train_loss": 3.672291651368141, "train_ppl": 39.341960691798825, "lr": 0.00056, "grad_norm": 0.6694, "tokens_per_sec": 150025, "dt_s": 4.368, "eta_s": 6493, "world_size": 1, "timestamp": "2026-05-05T06:01:22.677492"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77920, "epoch": 0, "train_loss": 3.74993696808815, "train_ppl": 42.51840189943721, "lr": 0.00056, "grad_norm": 0.709, "tokens_per_sec": 134654, "dt_s": 4.867, "eta_s": 6627, "world_size": 1, "timestamp": "2026-05-05T06:01:27.544458"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77930, "epoch": 0, "train_loss": 3.7379473596811295, "train_ppl": 42.011666758801915, "lr": 0.00056, "grad_norm": 0.6319, "tokens_per_sec": 148230, "dt_s": 4.421, "eta_s": 6626, "world_size": 1, "timestamp": "2026-05-05T06:01:31.965695"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77940, "epoch": 0, "train_loss": 3.6267013400793076, "train_ppl": 37.588619814634704, "lr": 0.00056, "grad_norm": 0.6939, "tokens_per_sec": 149403, "dt_s": 4.387, "eta_s": 6624, "world_size": 1, "timestamp": "2026-05-05T06:01:36.352256"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77950, "epoch": 0, "train_loss": 3.7342672795057297, "train_ppl": 41.857344589969685, "lr": 0.00056, "grad_norm": 0.6271, "tokens_per_sec": 149034, "dt_s": 4.397, "eta_s": 6612, "world_size": 1, "timestamp": "2026-05-05T06:01:40.749602"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77960, "epoch": 0, "train_loss": 3.724597007036209, "train_ppl": 41.454523496915655, "lr": 0.00056, "grad_norm": 0.6534, "tokens_per_sec": 145939, "dt_s": 4.491, "eta_s": 6644, "world_size": 1, "timestamp": "2026-05-05T06:01:45.240257"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77970, "epoch": 0, "train_loss": 3.661429911851883, "train_ppl": 38.916950913908934, "lr": 0.00056, "grad_norm": 0.6729, "tokens_per_sec": 148171, "dt_s": 4.423, "eta_s": 6509, "world_size": 1, "timestamp": "2026-05-05T06:01:49.663270"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77980, "epoch": 0, "train_loss": 3.6301815658807755, "train_ppl": 37.719664599608535, "lr": 0.00056, "grad_norm": 0.6331, "tokens_per_sec": 147527, "dt_s": 4.442, "eta_s": 6511, "world_size": 1, "timestamp": "2026-05-05T06:01:54.105570"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 77990, "epoch": 0, "train_loss": 3.761081635951996, "train_ppl": 42.99490567290587, "lr": 0.00056, "grad_norm": 0.7148, "tokens_per_sec": 145703, "dt_s": 4.498, "eta_s": 6539, "world_size": 1, "timestamp": "2026-05-05T06:01:58.603503"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78000, "epoch": 0, "train_loss": 3.7555234730243683, "train_ppl": 42.75659587796353, "lr": 0.00056, "grad_norm": 0.7193, "tokens_per_sec": 149560, "dt_s": 4.382, "eta_s": 6530, "world_size": 1, "timestamp": "2026-05-05T06:02:02.985413"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78010, "epoch": 0, "train_loss": 3.763359308242798, "train_ppl": 43.0929455872166, "lr": 0.00056, "grad_norm": 0.7793, "tokens_per_sec": 125083, "dt_s": 5.239, "eta_s": 6520, "world_size": 1, "timestamp": "2026-05-05T06:02:08.224825"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78020, "epoch": 0, "train_loss": 3.7074937373399734, "train_ppl": 40.75154435862749, "lr": 0.00056, "grad_norm": 0.9629, "tokens_per_sec": 144770, "dt_s": 4.527, "eta_s": 6546, "world_size": 1, "timestamp": "2026-05-05T06:02:12.751714"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78030, "epoch": 0, "train_loss": 3.6474927216768265, "train_ppl": 38.378320186275396, "lr": 0.00056, "grad_norm": 0.6248, "tokens_per_sec": 149206, "dt_s": 4.392, "eta_s": 6527, "world_size": 1, "timestamp": "2026-05-05T06:02:17.144026"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78040, "epoch": 0, "train_loss": 3.6453386694192886, "train_ppl": 38.29574025171537, "lr": 0.00056, "grad_norm": 0.6492, "tokens_per_sec": 150366, "dt_s": 4.358, "eta_s": 6482, "world_size": 1, "timestamp": "2026-05-05T06:02:21.502467"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78050, "epoch": 0, "train_loss": 3.6842246055603027, "train_ppl": 39.81423873854111, "lr": 0.00056, "grad_norm": 0.8202, "tokens_per_sec": 150088, "dt_s": 4.367, "eta_s": 6473, "world_size": 1, "timestamp": "2026-05-05T06:02:25.868980"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78060, "epoch": 0, "train_loss": 3.7835945188999176, "train_ppl": 43.97382272920896, "lr": 0.00056, "grad_norm": 0.6697, "tokens_per_sec": 149415, "dt_s": 4.386, "eta_s": 6443, "world_size": 1, "timestamp": "2026-05-05T06:02:30.255149"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78070, "epoch": 0, "train_loss": 3.800689920783043, "train_ppl": 44.732035410630175, "lr": 0.00056, "grad_norm": 0.6584, "tokens_per_sec": 148074, "dt_s": 4.426, "eta_s": 6409, "world_size": 1, "timestamp": "2026-05-05T06:02:34.681046"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78080, "epoch": 0, "train_loss": 3.75820754468441, "train_ppl": 42.87151179753077, "lr": 0.00056, "grad_norm": 0.7564, "tokens_per_sec": 150508, "dt_s": 4.354, "eta_s": 6394, "world_size": 1, "timestamp": "2026-05-05T06:02:39.035385"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78090, "epoch": 0, "train_loss": 3.7714749574661255, "train_ppl": 43.44409579622552, "lr": 0.00056, "grad_norm": 0.6604, "tokens_per_sec": 147333, "dt_s": 4.448, "eta_s": 6416, "world_size": 1, "timestamp": "2026-05-05T06:02:43.483512"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78100, "epoch": 0, "train_loss": 3.7495990693569183, "train_ppl": 42.50403741238904, "lr": 0.00056, "grad_norm": 0.6944, "tokens_per_sec": 149882, "dt_s": 4.373, "eta_s": 6413, "world_size": 1, "timestamp": "2026-05-05T06:02:47.856027"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78110, "epoch": 0, "train_loss": 3.7043569684028625, "train_ppl": 40.6239164544144, "lr": 0.00056, "grad_norm": 0.6171, "tokens_per_sec": 150445, "dt_s": 4.356, "eta_s": 6400, "world_size": 1, "timestamp": "2026-05-05T06:02:52.212161"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78120, "epoch": 0, "train_loss": 3.6855270713567734, "train_ppl": 39.86612920815583, "lr": 0.00056, "grad_norm": 0.6733, "tokens_per_sec": 147029, "dt_s": 4.457, "eta_s": 6404, "world_size": 1, "timestamp": "2026-05-05T06:02:56.669522"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78130, "epoch": 0, "train_loss": 3.723153308033943, "train_ppl": 41.39471882306514, "lr": 0.00056, "grad_norm": 0.6725, "tokens_per_sec": 150139, "dt_s": 4.365, "eta_s": 6403, "world_size": 1, "timestamp": "2026-05-05T06:03:01.034552"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78140, "epoch": 0, "train_loss": 3.677002489566803, "train_ppl": 39.52773152765887, "lr": 0.00056, "grad_norm": 0.6812, "tokens_per_sec": 149348, "dt_s": 4.388, "eta_s": 6381, "world_size": 1, "timestamp": "2026-05-05T06:03:05.422680"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78150, "epoch": 0, "train_loss": 3.747004896402359, "train_ppl": 42.39391748489954, "lr": 0.00056, "grad_norm": 0.6724, "tokens_per_sec": 148483, "dt_s": 4.414, "eta_s": 6389, "world_size": 1, "timestamp": "2026-05-05T06:03:09.836387"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78160, "epoch": 0, "train_loss": 3.6427070051431656, "train_ppl": 38.195091215492155, "lr": 0.00056, "grad_norm": 0.6261, "tokens_per_sec": 149980, "dt_s": 4.37, "eta_s": 6388, "world_size": 1, "timestamp": "2026-05-05T06:03:14.206018"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78170, "epoch": 0, "train_loss": 3.6806375682353973, "train_ppl": 39.671679413789974, "lr": 0.00056, "grad_norm": 0.6483, "tokens_per_sec": 149840, "dt_s": 4.374, "eta_s": 6360, "world_size": 1, "timestamp": "2026-05-05T06:03:18.579752"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78180, "epoch": 0, "train_loss": 3.6782220751047134, "train_ppl": 39.57596838588484, "lr": 0.00056, "grad_norm": 0.655, "tokens_per_sec": 148862, "dt_s": 4.402, "eta_s": 6366, "world_size": 1, "timestamp": "2026-05-05T06:03:22.982232"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78190, "epoch": 0, "train_loss": 3.6859235912561417, "train_ppl": 39.88194005614842, "lr": 0.00056, "grad_norm": 0.706, "tokens_per_sec": 147728, "dt_s": 4.436, "eta_s": 6376, "world_size": 1, "timestamp": "2026-05-05T06:03:27.418496"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78200, "epoch": 0, "train_loss": 3.723869413137436, "train_ppl": 41.4243724087468, "lr": 0.00056, "grad_norm": 0.7169, "tokens_per_sec": 147688, "dt_s": 4.437, "eta_s": 6378, "world_size": 1, "timestamp": "2026-05-05T06:03:31.855968"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78210, "epoch": 0, "train_loss": 3.596502408385277, "train_ppl": 36.47045236080961, "lr": 0.00056, "grad_norm": 0.6901, "tokens_per_sec": 135530, "dt_s": 4.836, "eta_s": 6509, "world_size": 1, "timestamp": "2026-05-05T06:03:36.691495"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78220, "epoch": 0, "train_loss": 3.716734543442726, "train_ppl": 41.12986678838791, "lr": 0.00056, "grad_norm": 0.7059, "tokens_per_sec": 151333, "dt_s": 4.331, "eta_s": 6492, "world_size": 1, "timestamp": "2026-05-05T06:03:41.022065"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78230, "epoch": 0, "train_loss": 3.676825538277626, "train_ppl": 39.52073766341171, "lr": 0.00056, "grad_norm": 0.6884, "tokens_per_sec": 149316, "dt_s": 4.389, "eta_s": 6483, "world_size": 1, "timestamp": "2026-05-05T06:03:45.411139"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78240, "epoch": 0, "train_loss": 3.647347867488861, "train_ppl": 38.37276132849097, "lr": 0.00056, "grad_norm": 0.731, "tokens_per_sec": 150975, "dt_s": 4.341, "eta_s": 6451, "world_size": 1, "timestamp": "2026-05-05T06:03:49.751978"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78250, "epoch": 0, "train_loss": 3.7410785406827927, "train_ppl": 42.14341905410243, "lr": 0.00056, "grad_norm": 0.6431, "tokens_per_sec": 150062, "dt_s": 4.367, "eta_s": 6427, "world_size": 1, "timestamp": "2026-05-05T06:03:54.119271"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78260, "epoch": 0, "train_loss": 3.6948967576026917, "train_ppl": 40.24141775305786, "lr": 0.00056, "grad_norm": 0.7057, "tokens_per_sec": 149767, "dt_s": 4.376, "eta_s": 6290, "world_size": 1, "timestamp": "2026-05-05T06:03:58.495139"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78270, "epoch": 0, "train_loss": 3.7737224996089935, "train_ppl": 43.541848042383556, "lr": 0.00056, "grad_norm": 0.7365, "tokens_per_sec": 149918, "dt_s": 4.371, "eta_s": 6297, "world_size": 1, "timestamp": "2026-05-05T06:04:02.866592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78280, "epoch": 0, "train_loss": 3.7022612392902374, "train_ppl": 40.53886887949937, "lr": 0.00056, "grad_norm": 0.6542, "tokens_per_sec": 148544, "dt_s": 4.412, "eta_s": 6299, "world_size": 1, "timestamp": "2026-05-05T06:04:07.278484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78290, "epoch": 0, "train_loss": 3.763151004910469, "train_ppl": 43.08397011789345, "lr": 0.00056, "grad_norm": 0.6881, "tokens_per_sec": 148615, "dt_s": 4.41, "eta_s": 6315, "world_size": 1, "timestamp": "2026-05-05T06:04:11.688293"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78300, "epoch": 0, "train_loss": 3.6011611074209213, "train_ppl": 36.640753605163205, "lr": 0.00056, "grad_norm": 0.6891, "tokens_per_sec": 149712, "dt_s": 4.377, "eta_s": 6313, "world_size": 1, "timestamp": "2026-05-05T06:04:16.065758"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78310, "epoch": 0, "train_loss": 3.6807811409235, "train_ppl": 39.67737559234301, "lr": 0.00056, "grad_norm": 0.647, "tokens_per_sec": 147056, "dt_s": 4.457, "eta_s": 6332, "world_size": 1, "timestamp": "2026-05-05T06:04:20.522291"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78320, "epoch": 0, "train_loss": 3.687133803963661, "train_ppl": 39.93023490443001, "lr": 0.00056, "grad_norm": 0.7126, "tokens_per_sec": 150164, "dt_s": 4.364, "eta_s": 6326, "world_size": 1, "timestamp": "2026-05-05T06:04:24.886587"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78330, "epoch": 0, "train_loss": 3.682897448539734, "train_ppl": 39.761434039894525, "lr": 0.00056, "grad_norm": 0.6683, "tokens_per_sec": 148962, "dt_s": 4.4, "eta_s": 6318, "world_size": 1, "timestamp": "2026-05-05T06:04:29.286095"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78340, "epoch": 0, "train_loss": 3.714660182595253, "train_ppl": 41.04463703225664, "lr": 0.00056, "grad_norm": 0.6588, "tokens_per_sec": 146392, "dt_s": 4.477, "eta_s": 6332, "world_size": 1, "timestamp": "2026-05-05T06:04:33.762841"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78350, "epoch": 0, "train_loss": 3.7651704400777817, "train_ppl": 43.17106331222868, "lr": 0.00056, "grad_norm": 0.643, "tokens_per_sec": 150015, "dt_s": 4.369, "eta_s": 6326, "world_size": 1, "timestamp": "2026-05-05T06:04:38.131475"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78360, "epoch": 0, "train_loss": 3.736381411552429, "train_ppl": 41.945930151349344, "lr": 0.00056, "grad_norm": 0.7146, "tokens_per_sec": 149646, "dt_s": 4.379, "eta_s": 6299, "world_size": 1, "timestamp": "2026-05-05T06:04:42.510876"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78370, "epoch": 0, "train_loss": 3.67334021627903, "train_ppl": 39.383234926881414, "lr": 0.00056, "grad_norm": 0.7003, "tokens_per_sec": 149322, "dt_s": 4.389, "eta_s": 6302, "world_size": 1, "timestamp": "2026-05-05T06:04:46.899793"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78380, "epoch": 0, "train_loss": 3.7171765118837357, "train_ppl": 41.14804890915711, "lr": 0.00056, "grad_norm": 0.6679, "tokens_per_sec": 151232, "dt_s": 4.333, "eta_s": 6278, "world_size": 1, "timestamp": "2026-05-05T06:04:51.233260"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78390, "epoch": 0, "train_loss": 3.6183622926473618, "train_ppl": 37.27646985665126, "lr": 0.00056, "grad_norm": 0.6275, "tokens_per_sec": 149708, "dt_s": 4.378, "eta_s": 6246, "world_size": 1, "timestamp": "2026-05-05T06:04:55.610819"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78400, "epoch": 0, "train_loss": 3.5625113993883133, "train_ppl": 35.25161695916854, "lr": 0.00056, "grad_norm": 0.6811, "tokens_per_sec": 149516, "dt_s": 4.383, "eta_s": 6245, "world_size": 1, "timestamp": "2026-05-05T06:04:59.994076"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78410, "epoch": 0, "train_loss": 3.7141436636447906, "train_ppl": 41.02344217365891, "lr": 0.00056, "grad_norm": 0.6725, "tokens_per_sec": 149768, "dt_s": 4.376, "eta_s": 6240, "world_size": 1, "timestamp": "2026-05-05T06:05:04.369873"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78420, "epoch": 0, "train_loss": 3.670370116829872, "train_ppl": 39.26643634008964, "lr": 0.00056, "grad_norm": 0.7214, "tokens_per_sec": 147594, "dt_s": 4.44, "eta_s": 6250, "world_size": 1, "timestamp": "2026-05-05T06:05:08.810166"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78430, "epoch": 0, "train_loss": 3.7400598376989365, "train_ppl": 42.100509287226714, "lr": 0.00056, "grad_norm": 0.6467, "tokens_per_sec": 150527, "dt_s": 4.354, "eta_s": 6252, "world_size": 1, "timestamp": "2026-05-05T06:05:13.163949"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78440, "epoch": 0, "train_loss": 3.706916853785515, "train_ppl": 40.72804224251213, "lr": 0.00056, "grad_norm": 0.6637, "tokens_per_sec": 150438, "dt_s": 4.356, "eta_s": 6241, "world_size": 1, "timestamp": "2026-05-05T06:05:17.520328"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78450, "epoch": 0, "train_loss": 3.7022231072187424, "train_ppl": 40.53732307792542, "lr": 0.00056, "grad_norm": 0.667, "tokens_per_sec": 147067, "dt_s": 4.456, "eta_s": 6258, "world_size": 1, "timestamp": "2026-05-05T06:05:21.976506"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78460, "epoch": 0, "train_loss": 3.6663919389247894, "train_ppl": 39.11053777235183, "lr": 0.00056, "grad_norm": 0.6588, "tokens_per_sec": 150518, "dt_s": 4.354, "eta_s": 6247, "world_size": 1, "timestamp": "2026-05-05T06:05:26.330530"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78470, "epoch": 0, "train_loss": 3.7569015324115753, "train_ppl": 42.815557623340595, "lr": 0.00056, "grad_norm": 0.6344, "tokens_per_sec": 150151, "dt_s": 4.365, "eta_s": 6221, "world_size": 1, "timestamp": "2026-05-05T06:05:30.695222"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78480, "epoch": 0, "train_loss": 3.6663714945316315, "train_ppl": 39.10973818931451, "lr": 0.00056, "grad_norm": 0.673, "tokens_per_sec": 148035, "dt_s": 4.427, "eta_s": 6238, "world_size": 1, "timestamp": "2026-05-05T06:05:35.122293"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78490, "epoch": 0, "train_loss": 3.721761003136635, "train_ppl": 41.33712485680961, "lr": 0.00056, "grad_norm": 0.6512, "tokens_per_sec": 152293, "dt_s": 4.303, "eta_s": 6218, "world_size": 1, "timestamp": "2026-05-05T06:05:39.425551"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78500, "epoch": 0, "train_loss": 3.6087824255228043, "train_ppl": 36.92107128213263, "lr": 0.00056, "grad_norm": 0.6869, "tokens_per_sec": 148655, "dt_s": 4.409, "eta_s": 6200, "world_size": 1, "timestamp": "2026-05-05T06:05:43.834153"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78510, "epoch": 0, "train_loss": 3.6557850539684296, "train_ppl": 38.69788912476573, "lr": 0.00056, "grad_norm": 0.7649, "tokens_per_sec": 115372, "dt_s": 5.68, "eta_s": 6352, "world_size": 1, "timestamp": "2026-05-05T06:05:49.514553"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78520, "epoch": 0, "train_loss": 3.821828246116638, "train_ppl": 45.68766029974712, "lr": 0.00056, "grad_norm": 0.9111, "tokens_per_sec": 148257, "dt_s": 4.42, "eta_s": 6364, "world_size": 1, "timestamp": "2026-05-05T06:05:53.934973"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78530, "epoch": 0, "train_loss": 3.7205406725406647, "train_ppl": 41.28671066584369, "lr": 0.00056, "grad_norm": 0.655, "tokens_per_sec": 148672, "dt_s": 4.408, "eta_s": 6354, "world_size": 1, "timestamp": "2026-05-05T06:05:58.343066"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78540, "epoch": 0, "train_loss": 3.679295152425766, "train_ppl": 39.61845925393452, "lr": 0.00056, "grad_norm": 0.6962, "tokens_per_sec": 151042, "dt_s": 4.339, "eta_s": 6359, "world_size": 1, "timestamp": "2026-05-05T06:06:02.681998"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78550, "epoch": 0, "train_loss": 3.5548326820135117, "train_ppl": 34.98196636582506, "lr": 0.00056, "grad_norm": 0.7057, "tokens_per_sec": 150582, "dt_s": 4.352, "eta_s": 6339, "world_size": 1, "timestamp": "2026-05-05T06:06:07.034190"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78560, "epoch": 0, "train_loss": 3.625630721449852, "train_ppl": 37.548398272813536, "lr": 0.00056, "grad_norm": 0.6283, "tokens_per_sec": 148087, "dt_s": 4.425, "eta_s": 6199, "world_size": 1, "timestamp": "2026-05-05T06:06:11.459681"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78570, "epoch": 0, "train_loss": 3.584687262773514, "train_ppl": 36.04208425466857, "lr": 0.00056, "grad_norm": 0.7382, "tokens_per_sec": 148366, "dt_s": 4.417, "eta_s": 6193, "world_size": 1, "timestamp": "2026-05-05T06:06:15.876849"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78580, "epoch": 0, "train_loss": 3.727650061249733, "train_ppl": 41.581279803025346, "lr": 0.00056, "grad_norm": 0.6628, "tokens_per_sec": 147105, "dt_s": 4.455, "eta_s": 6202, "world_size": 1, "timestamp": "2026-05-05T06:06:20.331914"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78590, "epoch": 0, "train_loss": 3.602734684944153, "train_ppl": 36.69845605920627, "lr": 0.00056, "grad_norm": 0.6848, "tokens_per_sec": 150987, "dt_s": 4.341, "eta_s": 6198, "world_size": 1, "timestamp": "2026-05-05T06:06:24.672435"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78600, "epoch": 0, "train_loss": 3.75446318089962, "train_ppl": 42.711285421478074, "lr": 0.00056, "grad_norm": 0.7893, "tokens_per_sec": 150581, "dt_s": 4.352, "eta_s": 6194, "world_size": 1, "timestamp": "2026-05-05T06:06:29.024622"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78610, "epoch": 0, "train_loss": 3.665741801261902, "train_ppl": 39.085118802540386, "lr": 0.00056, "grad_norm": 0.6759, "tokens_per_sec": 146351, "dt_s": 4.478, "eta_s": 6204, "world_size": 1, "timestamp": "2026-05-05T06:06:33.502631"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78620, "epoch": 0, "train_loss": 3.6560627073049545, "train_ppl": 38.7086352145725, "lr": 0.00056, "grad_norm": 0.6845, "tokens_per_sec": 148873, "dt_s": 4.402, "eta_s": 6196, "world_size": 1, "timestamp": "2026-05-05T06:06:37.904776"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78630, "epoch": 0, "train_loss": 3.700544908642769, "train_ppl": 40.469350451797354, "lr": 0.00056, "grad_norm": 0.6628, "tokens_per_sec": 147628, "dt_s": 4.439, "eta_s": 6187, "world_size": 1, "timestamp": "2026-05-05T06:06:42.344040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78640, "epoch": 0, "train_loss": 3.617782846093178, "train_ppl": 37.25487639137439, "lr": 0.00056, "grad_norm": 0.6328, "tokens_per_sec": 145498, "dt_s": 4.504, "eta_s": 6228, "world_size": 1, "timestamp": "2026-05-05T06:06:46.848314"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78650, "epoch": 0, "train_loss": 3.725793346762657, "train_ppl": 41.50414686750226, "lr": 0.00056, "grad_norm": 0.6673, "tokens_per_sec": 147461, "dt_s": 4.444, "eta_s": 6250, "world_size": 1, "timestamp": "2026-05-05T06:06:51.292619"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78660, "epoch": 0, "train_loss": 3.6861590892076492, "train_ppl": 39.891333277332556, "lr": 0.00056, "grad_norm": 0.7374, "tokens_per_sec": 148119, "dt_s": 4.425, "eta_s": 6230, "world_size": 1, "timestamp": "2026-05-05T06:06:55.717201"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78670, "epoch": 0, "train_loss": 3.7048078030347824, "train_ppl": 40.64223525190015, "lr": 0.00056, "grad_norm": 0.7317, "tokens_per_sec": 147003, "dt_s": 4.458, "eta_s": 6242, "world_size": 1, "timestamp": "2026-05-05T06:07:00.175310"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78680, "epoch": 0, "train_loss": 3.7225494980812073, "train_ppl": 41.3697318243075, "lr": 0.00056, "grad_norm": 0.6598, "tokens_per_sec": 147675, "dt_s": 4.438, "eta_s": 6237, "world_size": 1, "timestamp": "2026-05-05T06:07:04.613191"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78690, "epoch": 0, "train_loss": 3.730771780014038, "train_ppl": 41.711287682870676, "lr": 0.00056, "grad_norm": 0.6714, "tokens_per_sec": 146386, "dt_s": 4.477, "eta_s": 6225, "world_size": 1, "timestamp": "2026-05-05T06:07:09.090093"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78700, "epoch": 0, "train_loss": 3.694352775812149, "train_ppl": 40.219533107538574, "lr": 0.00056, "grad_norm": 0.651, "tokens_per_sec": 147657, "dt_s": 4.438, "eta_s": 6219, "world_size": 1, "timestamp": "2026-05-05T06:07:13.528497"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78710, "epoch": 0, "train_loss": 3.6879570186138153, "train_ppl": 39.96311959250881, "lr": 0.00056, "grad_norm": 0.7747, "tokens_per_sec": 149903, "dt_s": 4.372, "eta_s": 6199, "world_size": 1, "timestamp": "2026-05-05T06:07:17.900401"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78720, "epoch": 0, "train_loss": 3.7583755999803543, "train_ppl": 42.878717187568505, "lr": 0.00056, "grad_norm": 0.6626, "tokens_per_sec": 146033, "dt_s": 4.488, "eta_s": 6203, "world_size": 1, "timestamp": "2026-05-05T06:07:22.388122"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78730, "epoch": 0, "train_loss": 3.6858911216259003, "train_ppl": 39.88064512532457, "lr": 0.00056, "grad_norm": 0.6516, "tokens_per_sec": 147115, "dt_s": 4.455, "eta_s": 6204, "world_size": 1, "timestamp": "2026-05-05T06:07:26.842875"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78740, "epoch": 0, "train_loss": 3.6261686384677887, "train_ppl": 37.56860162861804, "lr": 0.00056, "grad_norm": 0.7171, "tokens_per_sec": 147211, "dt_s": 4.452, "eta_s": 6192, "world_size": 1, "timestamp": "2026-05-05T06:07:31.294701"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78750, "epoch": 0, "train_loss": 3.7567827105522156, "train_ppl": 42.81047050141089, "lr": 0.00056, "grad_norm": 0.7394, "tokens_per_sec": 145123, "dt_s": 4.516, "eta_s": 6209, "world_size": 1, "timestamp": "2026-05-05T06:07:35.810624"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78760, "epoch": 0, "train_loss": 3.765588700771332, "train_ppl": 43.189123847853736, "lr": 0.00056, "grad_norm": 0.6719, "tokens_per_sec": 146755, "dt_s": 4.466, "eta_s": 6231, "world_size": 1, "timestamp": "2026-05-05T06:07:40.276304"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78770, "epoch": 0, "train_loss": 3.6261132806539536, "train_ppl": 37.566521970526225, "lr": 0.00056, "grad_norm": 0.698, "tokens_per_sec": 146397, "dt_s": 4.477, "eta_s": 6223, "world_size": 1, "timestamp": "2026-05-05T06:07:44.752878"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78780, "epoch": 0, "train_loss": 3.6858748495578766, "train_ppl": 39.879996190034035, "lr": 0.00056, "grad_norm": 0.6824, "tokens_per_sec": 147771, "dt_s": 4.435, "eta_s": 6213, "world_size": 1, "timestamp": "2026-05-05T06:07:49.187834"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78790, "epoch": 0, "train_loss": 3.6656826734542847, "train_ppl": 39.08280785347649, "lr": 0.0005597462418183125, "grad_norm": 0.6847, "tokens_per_sec": 146987, "dt_s": 4.459, "eta_s": 6211, "world_size": 1, "timestamp": "2026-05-05T06:07:53.646464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78800, "epoch": 0, "train_loss": 3.6791960299015045, "train_ppl": 39.61453236687061, "lr": 0.0005593837301301876, "grad_norm": 0.6473, "tokens_per_sec": 131561, "dt_s": 4.981, "eta_s": 6336, "world_size": 1, "timestamp": "2026-05-05T06:07:58.627891"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78810, "epoch": 0, "train_loss": 3.701318860054016, "train_ppl": 40.50068388640568, "lr": 0.0005590212184420628, "grad_norm": 0.6398, "tokens_per_sec": 147847, "dt_s": 4.433, "eta_s": 6322, "world_size": 1, "timestamp": "2026-05-05T06:08:03.060602"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78820, "epoch": 0, "train_loss": 3.750977411866188, "train_ppl": 42.56266292770854, "lr": 0.000558658706753938, "grad_norm": 0.6561, "tokens_per_sec": 148597, "dt_s": 4.41, "eta_s": 6299, "world_size": 1, "timestamp": "2026-05-05T06:08:07.470914"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78830, "epoch": 0, "train_loss": 3.755327582359314, "train_ppl": 42.748221080260656, "lr": 0.0005582961950658131, "grad_norm": 1.1339, "tokens_per_sec": 146792, "dt_s": 4.465, "eta_s": 6303, "world_size": 1, "timestamp": "2026-05-05T06:08:11.935470"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78840, "epoch": 0, "train_loss": 3.684532344341278, "train_ppl": 39.82649300929643, "lr": 0.0005579336833776882, "grad_norm": 0.6668, "tokens_per_sec": 149183, "dt_s": 4.393, "eta_s": 6280, "world_size": 1, "timestamp": "2026-05-05T06:08:16.328464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78850, "epoch": 0, "train_loss": 3.658101186156273, "train_ppl": 38.787622428351284, "lr": 0.0005575711716895634, "grad_norm": 0.6261, "tokens_per_sec": 143566, "dt_s": 4.565, "eta_s": 6160, "world_size": 1, "timestamp": "2026-05-05T06:08:20.893354"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78860, "epoch": 0, "train_loss": 3.6772001683712006, "train_ppl": 39.5355460947295, "lr": 0.0005572086600014385, "grad_norm": 0.7446, "tokens_per_sec": 148866, "dt_s": 4.402, "eta_s": 6147, "world_size": 1, "timestamp": "2026-05-05T06:08:25.295689"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78870, "epoch": 0, "train_loss": 3.844531923532486, "train_ppl": 46.736802832627006, "lr": 0.0005568461483133136, "grad_norm": 0.654, "tokens_per_sec": 149617, "dt_s": 4.38, "eta_s": 6135, "world_size": 1, "timestamp": "2026-05-05T06:08:29.675934"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78880, "epoch": 0, "train_loss": 3.6841988414525986, "train_ppl": 39.81321297342011, "lr": 0.0005564836366251887, "grad_norm": 0.6597, "tokens_per_sec": 146712, "dt_s": 4.467, "eta_s": 6131, "world_size": 1, "timestamp": "2026-05-05T06:08:34.142944"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78890, "epoch": 0, "train_loss": 3.615575522184372, "train_ppl": 37.17273350332838, "lr": 0.0005561211249370639, "grad_norm": 0.6637, "tokens_per_sec": 148737, "dt_s": 4.406, "eta_s": 6130, "world_size": 1, "timestamp": "2026-05-05T06:08:38.549118"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78900, "epoch": 0, "train_loss": 3.7549643963575363, "train_ppl": 42.73269834375434, "lr": 0.000555758613248939, "grad_norm": 0.7079, "tokens_per_sec": 148233, "dt_s": 4.421, "eta_s": 6086, "world_size": 1, "timestamp": "2026-05-05T06:08:42.970252"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78910, "epoch": 0, "train_loss": 3.773595318198204, "train_ppl": 43.53631068085333, "lr": 0.0005553961015608141, "grad_norm": 0.6764, "tokens_per_sec": 144951, "dt_s": 4.521, "eta_s": 6114, "world_size": 1, "timestamp": "2026-05-05T06:08:47.491505"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78920, "epoch": 0, "train_loss": 3.751356914639473, "train_ppl": 42.57881864170323, "lr": 0.0005550335898726893, "grad_norm": 0.6418, "tokens_per_sec": 147304, "dt_s": 4.449, "eta_s": 6129, "world_size": 1, "timestamp": "2026-05-05T06:08:51.940544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78930, "epoch": 0, "train_loss": 3.708961322903633, "train_ppl": 40.81139464379013, "lr": 0.0005546710781845645, "grad_norm": 0.6472, "tokens_per_sec": 147679, "dt_s": 4.438, "eta_s": 6116, "world_size": 1, "timestamp": "2026-05-05T06:08:56.378253"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78940, "epoch": 0, "train_loss": 3.604815810918808, "train_ppl": 36.77490969657218, "lr": 0.0005543085664964396, "grad_norm": 0.6595, "tokens_per_sec": 146527, "dt_s": 4.473, "eta_s": 6130, "world_size": 1, "timestamp": "2026-05-05T06:09:00.850917"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78950, "epoch": 0, "train_loss": 3.568019613623619, "train_ppl": 35.44632617526206, "lr": 0.0005539460548083147, "grad_norm": 0.6647, "tokens_per_sec": 149425, "dt_s": 4.386, "eta_s": 6116, "world_size": 1, "timestamp": "2026-05-05T06:09:05.236793"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78960, "epoch": 0, "train_loss": 3.6669755429029465, "train_ppl": 39.133369499478995, "lr": 0.0005535835431201898, "grad_norm": 0.7174, "tokens_per_sec": 146551, "dt_s": 4.472, "eta_s": 6098, "world_size": 1, "timestamp": "2026-05-05T06:09:09.708669"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78970, "epoch": 0, "train_loss": 3.7049840092658997, "train_ppl": 40.64939729797811, "lr": 0.000553221031432065, "grad_norm": 0.6717, "tokens_per_sec": 148245, "dt_s": 4.421, "eta_s": 6086, "world_size": 1, "timestamp": "2026-05-05T06:09:14.129440"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78980, "epoch": 0, "train_loss": 3.721596822142601, "train_ppl": 41.330338643658976, "lr": 0.0005528585197439401, "grad_norm": 0.7191, "tokens_per_sec": 147642, "dt_s": 4.439, "eta_s": 6082, "world_size": 1, "timestamp": "2026-05-05T06:09:18.568309"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 78990, "epoch": 0, "train_loss": 3.658053398132324, "train_ppl": 38.785768888810615, "lr": 0.0005524960080558152, "grad_norm": 0.6447, "tokens_per_sec": 147723, "dt_s": 4.436, "eta_s": 6067, "world_size": 1, "timestamp": "2026-05-05T06:09:23.004738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79000, "epoch": 0, "train_loss": 3.739175245165825, "train_ppl": 42.06328395812253, "lr": 0.0005521334963676903, "grad_norm": 0.6742, "tokens_per_sec": 148503, "dt_s": 4.413, "eta_s": 6070, "world_size": 1, "timestamp": "2026-05-05T06:09:27.417826"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79010, "epoch": 0, "train_loss": 3.5864913016557693, "train_ppl": 36.10716426184268, "lr": 0.0005517709846795655, "grad_norm": 0.6591, "tokens_per_sec": 125873, "dt_s": 5.206, "eta_s": 6050, "world_size": 1, "timestamp": "2026-05-05T06:09:32.624361"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79020, "epoch": 0, "train_loss": 3.575705111026764, "train_ppl": 35.719798361828715, "lr": 0.0005514084729914407, "grad_norm": 0.8489, "tokens_per_sec": 149227, "dt_s": 4.392, "eta_s": 6038, "world_size": 1, "timestamp": "2026-05-05T06:09:37.016033"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79030, "epoch": 0, "train_loss": 3.6779503226280212, "train_ppl": 39.56521497965705, "lr": 0.0005510459613033158, "grad_norm": 0.7087, "tokens_per_sec": 148888, "dt_s": 4.402, "eta_s": 6023, "world_size": 1, "timestamp": "2026-05-05T06:09:41.417727"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79040, "epoch": 0, "train_loss": 3.708777591586113, "train_ppl": 40.80389700127935, "lr": 0.0005506834496151909, "grad_norm": 0.7064, "tokens_per_sec": 146986, "dt_s": 4.459, "eta_s": 6025, "world_size": 1, "timestamp": "2026-05-05T06:09:45.876394"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79050, "epoch": 0, "train_loss": 3.777454987168312, "train_ppl": 43.70467112705016, "lr": 0.0005503209379270661, "grad_norm": 0.7262, "tokens_per_sec": 149888, "dt_s": 4.372, "eta_s": 6010, "world_size": 1, "timestamp": "2026-05-05T06:09:50.248745"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79060, "epoch": 0, "train_loss": 3.789386346936226, "train_ppl": 44.22925053147633, "lr": 0.0005499584262389412, "grad_norm": 0.6383, "tokens_per_sec": 149697, "dt_s": 4.378, "eta_s": 5995, "world_size": 1, "timestamp": "2026-05-05T06:09:54.626653"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79070, "epoch": 0, "train_loss": 3.6796186715364456, "train_ppl": 39.631278656187845, "lr": 0.0005495959145508163, "grad_norm": 0.6403, "tokens_per_sec": 146562, "dt_s": 4.472, "eta_s": 6012, "world_size": 1, "timestamp": "2026-05-05T06:09:59.098203"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79080, "epoch": 0, "train_loss": 3.730225130915642, "train_ppl": 41.688492476123145, "lr": 0.0005492334028626914, "grad_norm": 0.6442, "tokens_per_sec": 148561, "dt_s": 4.411, "eta_s": 6011, "world_size": 1, "timestamp": "2026-05-05T06:10:03.509613"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79090, "epoch": 0, "train_loss": 3.652339592576027, "train_ppl": 38.564786473515966, "lr": 0.0005488708911745666, "grad_norm": 0.682, "tokens_per_sec": 133600, "dt_s": 4.905, "eta_s": 6128, "world_size": 1, "timestamp": "2026-05-05T06:10:08.414965"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79100, "epoch": 0, "train_loss": 3.6159818172454834, "train_ppl": 37.187839669931435, "lr": 0.0005485083794864417, "grad_norm": 0.6709, "tokens_per_sec": 147814, "dt_s": 4.434, "eta_s": 6140, "world_size": 1, "timestamp": "2026-05-05T06:10:12.848728"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79110, "epoch": 0, "train_loss": 3.6231767684221268, "train_ppl": 37.45636923084118, "lr": 0.0005481458677983168, "grad_norm": 0.6214, "tokens_per_sec": 149352, "dt_s": 4.388, "eta_s": 6138, "world_size": 1, "timestamp": "2026-05-05T06:10:17.236636"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79120, "epoch": 0, "train_loss": 3.573303073644638, "train_ppl": 35.63410103617607, "lr": 0.000547783356110192, "grad_norm": 0.7197, "tokens_per_sec": 146354, "dt_s": 4.478, "eta_s": 6135, "world_size": 1, "timestamp": "2026-05-05T06:10:21.714537"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79130, "epoch": 0, "train_loss": 3.8119328916072845, "train_ppl": 45.2377941676359, "lr": 0.0005474208444220672, "grad_norm": 0.7242, "tokens_per_sec": 148380, "dt_s": 4.417, "eta_s": 6132, "world_size": 1, "timestamp": "2026-05-05T06:10:26.131334"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79140, "epoch": 0, "train_loss": 3.6609376072883606, "train_ppl": 38.89779663663156, "lr": 0.0005470583327339423, "grad_norm": 0.6853, "tokens_per_sec": 149397, "dt_s": 4.387, "eta_s": 5987, "world_size": 1, "timestamp": "2026-05-05T06:10:30.518025"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79150, "epoch": 0, "train_loss": 3.6677122712135315, "train_ppl": 39.16221078346855, "lr": 0.0005466958210458174, "grad_norm": 0.7162, "tokens_per_sec": 146165, "dt_s": 4.484, "eta_s": 5996, "world_size": 1, "timestamp": "2026-05-05T06:10:35.001722"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79160, "epoch": 0, "train_loss": 3.565130889415741, "train_ppl": 35.34407926742004, "lr": 0.0005463333093576925, "grad_norm": 0.6569, "tokens_per_sec": 149658, "dt_s": 4.379, "eta_s": 5989, "world_size": 1, "timestamp": "2026-05-05T06:10:39.380786"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79170, "epoch": 0, "train_loss": 3.6618080139160156, "train_ppl": 38.9316682755363, "lr": 0.0005459707976695677, "grad_norm": 0.6875, "tokens_per_sec": 149007, "dt_s": 4.398, "eta_s": 5963, "world_size": 1, "timestamp": "2026-05-05T06:10:43.778971"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79180, "epoch": 0, "train_loss": 3.8389778584241867, "train_ppl": 46.47794311404025, "lr": 0.0005456082859814428, "grad_norm": 0.6733, "tokens_per_sec": 148253, "dt_s": 4.421, "eta_s": 5960, "world_size": 1, "timestamp": "2026-05-05T06:10:48.199524"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79190, "epoch": 0, "train_loss": 3.638620972633362, "train_ppl": 38.03934324340227, "lr": 0.0005452457742933179, "grad_norm": 0.7204, "tokens_per_sec": 151375, "dt_s": 4.329, "eta_s": 5940, "world_size": 1, "timestamp": "2026-05-05T06:10:52.528895"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79200, "epoch": 0, "train_loss": 3.6231644600629807, "train_ppl": 37.455908207233605, "lr": 0.000544883262605193, "grad_norm": 0.6194, "tokens_per_sec": 150262, "dt_s": 4.361, "eta_s": 5903, "world_size": 1, "timestamp": "2026-05-05T06:10:56.890354"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79210, "epoch": 0, "train_loss": 3.682535335421562, "train_ppl": 39.74703850959384, "lr": 0.0005445207509170682, "grad_norm": 0.6747, "tokens_per_sec": 146473, "dt_s": 4.474, "eta_s": 5924, "world_size": 1, "timestamp": "2026-05-05T06:11:01.364595"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79220, "epoch": 0, "train_loss": 3.6618266850709915, "train_ppl": 38.93239518153421, "lr": 0.0005441582392289434, "grad_norm": 0.7147, "tokens_per_sec": 147413, "dt_s": 4.446, "eta_s": 5932, "world_size": 1, "timestamp": "2026-05-05T06:11:05.810346"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79230, "epoch": 0, "train_loss": 3.7368338257074356, "train_ppl": 41.964911377258616, "lr": 0.0005437957275408185, "grad_norm": 0.691, "tokens_per_sec": 147826, "dt_s": 4.433, "eta_s": 5931, "world_size": 1, "timestamp": "2026-05-05T06:11:10.243691"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79240, "epoch": 0, "train_loss": 3.8185494244098663, "train_ppl": 45.53810392575334, "lr": 0.0005434332158526936, "grad_norm": 0.7066, "tokens_per_sec": 149861, "dt_s": 4.373, "eta_s": 5939, "world_size": 1, "timestamp": "2026-05-05T06:11:14.616796"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79250, "epoch": 0, "train_loss": 3.662011280655861, "train_ppl": 38.93958259315493, "lr": 0.0005430707041645688, "grad_norm": 0.7047, "tokens_per_sec": 151194, "dt_s": 4.335, "eta_s": 5927, "world_size": 1, "timestamp": "2026-05-05T06:11:18.951378"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79260, "epoch": 0, "train_loss": 3.8943146765232086, "train_ppl": 49.12237710617069, "lr": 0.0005427081924764439, "grad_norm": 0.8408, "tokens_per_sec": 148978, "dt_s": 4.399, "eta_s": 5903, "world_size": 1, "timestamp": "2026-05-05T06:11:23.350417"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79270, "epoch": 0, "train_loss": 3.620983988046646, "train_ppl": 37.374325624134485, "lr": 0.000542345680788319, "grad_norm": 0.6935, "tokens_per_sec": 150227, "dt_s": 4.362, "eta_s": 5876, "world_size": 1, "timestamp": "2026-05-05T06:11:27.712900"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79280, "epoch": 0, "train_loss": 3.6315152049064636, "train_ppl": 37.77000257523793, "lr": 0.0005419831691001941, "grad_norm": 0.6915, "tokens_per_sec": 149578, "dt_s": 4.381, "eta_s": 5857, "world_size": 1, "timestamp": "2026-05-05T06:11:32.094326"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79290, "epoch": 0, "train_loss": 3.5995214730501175, "train_ppl": 36.58072539178928, "lr": 0.0005416206574120693, "grad_norm": 0.689, "tokens_per_sec": 146716, "dt_s": 4.467, "eta_s": 5878, "world_size": 1, "timestamp": "2026-05-05T06:11:36.561158"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79300, "epoch": 0, "train_loss": 3.732434630393982, "train_ppl": 41.780705012763455, "lr": 0.0005412581457239444, "grad_norm": 0.6678, "tokens_per_sec": 149647, "dt_s": 4.379, "eta_s": 5886, "world_size": 1, "timestamp": "2026-05-05T06:11:40.940551"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79310, "epoch": 0, "train_loss": 3.715913236141205, "train_ppl": 41.09610039667412, "lr": 0.0005408956340358195, "grad_norm": 0.6535, "tokens_per_sec": 148712, "dt_s": 4.407, "eta_s": 5884, "world_size": 1, "timestamp": "2026-05-05T06:11:45.347450"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79320, "epoch": 0, "train_loss": 3.685906410217285, "train_ppl": 39.88125484887295, "lr": 0.0005405331223476947, "grad_norm": 0.6618, "tokens_per_sec": 147293, "dt_s": 4.449, "eta_s": 5902, "world_size": 1, "timestamp": "2026-05-05T06:11:49.796822"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79330, "epoch": 0, "train_loss": 3.651981994509697, "train_ppl": 38.550998245913306, "lr": 0.0005401706106595699, "grad_norm": 0.7023, "tokens_per_sec": 150156, "dt_s": 4.365, "eta_s": 5893, "world_size": 1, "timestamp": "2026-05-05T06:11:54.161344"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79340, "epoch": 0, "train_loss": 3.69199575483799, "train_ppl": 40.12484645747965, "lr": 0.000539808098971445, "grad_norm": 0.7018, "tokens_per_sec": 148150, "dt_s": 4.424, "eta_s": 5878, "world_size": 1, "timestamp": "2026-05-05T06:11:58.584972"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79350, "epoch": 0, "train_loss": 3.6200890988111496, "train_ppl": 37.34089470316619, "lr": 0.0005394455872833201, "grad_norm": 0.6242, "tokens_per_sec": 149785, "dt_s": 4.375, "eta_s": 5872, "world_size": 1, "timestamp": "2026-05-05T06:12:02.960312"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79360, "epoch": 0, "train_loss": 3.777865916490555, "train_ppl": 43.72263434848967, "lr": 0.0005390830755951952, "grad_norm": 0.6721, "tokens_per_sec": 151005, "dt_s": 4.34, "eta_s": 5850, "world_size": 1, "timestamp": "2026-05-05T06:12:07.300321"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79370, "epoch": 0, "train_loss": 3.713292494416237, "train_ppl": 40.9885391383334, "lr": 0.0005387205639070704, "grad_norm": 0.6388, "tokens_per_sec": 145452, "dt_s": 4.506, "eta_s": 5860, "world_size": 1, "timestamp": "2026-05-05T06:12:11.805953"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79380, "epoch": 0, "train_loss": 3.7848882526159286, "train_ppl": 44.03074996269614, "lr": 0.0005383580522189455, "grad_norm": 0.76, "tokens_per_sec": 150591, "dt_s": 4.352, "eta_s": 5853, "world_size": 1, "timestamp": "2026-05-05T06:12:16.157875"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79390, "epoch": 0, "train_loss": 3.727680206298828, "train_ppl": 41.58253329163959, "lr": 0.0005379955405308206, "grad_norm": 0.6781, "tokens_per_sec": 134133, "dt_s": 4.886, "eta_s": 5971, "world_size": 1, "timestamp": "2026-05-05T06:12:21.043757"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79400, "epoch": 0, "train_loss": 3.601213052868843, "train_ppl": 36.64265697495669, "lr": 0.0005376330288426957, "grad_norm": 0.6597, "tokens_per_sec": 149314, "dt_s": 4.389, "eta_s": 5970, "world_size": 1, "timestamp": "2026-05-05T06:12:25.432903"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79410, "epoch": 0, "train_loss": 3.6444889158010483, "train_ppl": 38.26321213027506, "lr": 0.0005372705171545709, "grad_norm": 0.6385, "tokens_per_sec": 150912, "dt_s": 4.343, "eta_s": 5967, "world_size": 1, "timestamp": "2026-05-05T06:12:29.775564"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79420, "epoch": 0, "train_loss": 3.6308760195970535, "train_ppl": 37.74586825842753, "lr": 0.0005369080054664461, "grad_norm": 0.8881, "tokens_per_sec": 149038, "dt_s": 4.397, "eta_s": 5933, "world_size": 1, "timestamp": "2026-05-05T06:12:34.172820"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79430, "epoch": 0, "train_loss": 3.6955691426992416, "train_ppl": 40.26848458126526, "lr": 0.0005365454937783212, "grad_norm": 0.6498, "tokens_per_sec": 149075, "dt_s": 4.396, "eta_s": 5941, "world_size": 1, "timestamp": "2026-05-05T06:12:38.569001"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79440, "epoch": 0, "train_loss": 3.6800256073474884, "train_ppl": 39.647409324561174, "lr": 0.0005361829820901963, "grad_norm": 0.6833, "tokens_per_sec": 149655, "dt_s": 4.379, "eta_s": 5802, "world_size": 1, "timestamp": "2026-05-05T06:12:42.948150"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79450, "epoch": 0, "train_loss": 3.772969350218773, "train_ppl": 43.509066872190076, "lr": 0.0005358204704020715, "grad_norm": 0.83, "tokens_per_sec": 146271, "dt_s": 4.48, "eta_s": 5822, "world_size": 1, "timestamp": "2026-05-05T06:12:47.428589"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79460, "epoch": 0, "train_loss": 3.702747255563736, "train_ppl": 40.55857621813975, "lr": 0.0005354579587139466, "grad_norm": 0.6921, "tokens_per_sec": 149629, "dt_s": 4.38, "eta_s": 5827, "world_size": 1, "timestamp": "2026-05-05T06:12:51.808499"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79470, "epoch": 0, "train_loss": 3.6273671239614487, "train_ppl": 37.61365404462789, "lr": 0.0005350954470258217, "grad_norm": 0.6402, "tokens_per_sec": 149441, "dt_s": 4.385, "eta_s": 5820, "world_size": 1, "timestamp": "2026-05-05T06:12:56.193904"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79480, "epoch": 0, "train_loss": 3.647852048277855, "train_ppl": 38.39211301553793, "lr": 0.0005347329353376968, "grad_norm": 0.6742, "tokens_per_sec": 145742, "dt_s": 4.497, "eta_s": 5842, "world_size": 1, "timestamp": "2026-05-05T06:13:00.690633"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79490, "epoch": 0, "train_loss": 3.6925113648176193, "train_ppl": 40.145540563329504, "lr": 0.000534370423649572, "grad_norm": 0.6843, "tokens_per_sec": 149067, "dt_s": 4.396, "eta_s": 5842, "world_size": 1, "timestamp": "2026-05-05T06:13:05.087058"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79500, "epoch": 0, "train_loss": 3.6689730137586594, "train_ppl": 39.21161538546243, "lr": 0.0005340079119614471, "grad_norm": 0.6284, "tokens_per_sec": 148807, "dt_s": 4.404, "eta_s": 5817, "world_size": 1, "timestamp": "2026-05-05T06:13:09.491152"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79510, "epoch": 0, "train_loss": 3.7514169961214066, "train_ppl": 42.58137691707792, "lr": 0.0005336454002733222, "grad_norm": 0.6572, "tokens_per_sec": 126726, "dt_s": 5.171, "eta_s": 5818, "world_size": 1, "timestamp": "2026-05-05T06:13:14.662662"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79520, "epoch": 0, "train_loss": 3.677818715572357, "train_ppl": 39.56000826083484, "lr": 0.0005332828885851973, "grad_norm": 0.6463, "tokens_per_sec": 149820, "dt_s": 4.374, "eta_s": 5811, "world_size": 1, "timestamp": "2026-05-05T06:13:19.036948"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79530, "epoch": 0, "train_loss": 3.7597424387931824, "train_ppl": 42.93736555476624, "lr": 0.0005329203768970726, "grad_norm": 0.735, "tokens_per_sec": 145416, "dt_s": 4.507, "eta_s": 5809, "world_size": 1, "timestamp": "2026-05-05T06:13:23.543742"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79540, "epoch": 0, "train_loss": 3.655355766415596, "train_ppl": 38.68128016790783, "lr": 0.0005325578652089477, "grad_norm": 0.664, "tokens_per_sec": 150067, "dt_s": 4.367, "eta_s": 5797, "world_size": 1, "timestamp": "2026-05-05T06:13:27.910853"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79550, "epoch": 0, "train_loss": 3.65670345723629, "train_ppl": 38.73344571774358, "lr": 0.0005321953535208228, "grad_norm": 0.6376, "tokens_per_sec": 150887, "dt_s": 4.343, "eta_s": 5777, "world_size": 1, "timestamp": "2026-05-05T06:13:32.254228"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79560, "epoch": 0, "train_loss": 3.6439542174339294, "train_ppl": 38.24275832202453, "lr": 0.0005318328418326979, "grad_norm": 0.6604, "tokens_per_sec": 146809, "dt_s": 4.464, "eta_s": 5789, "world_size": 1, "timestamp": "2026-05-05T06:13:36.718263"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79570, "epoch": 0, "train_loss": 3.604446068406105, "train_ppl": 36.76131496248699, "lr": 0.0005314703301445731, "grad_norm": 0.6948, "tokens_per_sec": 149058, "dt_s": 4.397, "eta_s": 5790, "world_size": 1, "timestamp": "2026-05-05T06:13:41.114935"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79580, "epoch": 0, "train_loss": 3.7350134551525116, "train_ppl": 41.88858917666683, "lr": 0.0005311078184564482, "grad_norm": 0.6816, "tokens_per_sec": 149601, "dt_s": 4.381, "eta_s": 5753, "world_size": 1, "timestamp": "2026-05-05T06:13:45.495662"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79590, "epoch": 0, "train_loss": 3.651636302471161, "train_ppl": 38.53767377595636, "lr": 0.0005307453067683233, "grad_norm": 0.6822, "tokens_per_sec": 146413, "dt_s": 4.476, "eta_s": 5777, "world_size": 1, "timestamp": "2026-05-05T06:13:49.971789"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79600, "epoch": 0, "train_loss": 3.641232907772064, "train_ppl": 38.13882940982024, "lr": 0.0005303827950801984, "grad_norm": 0.6905, "tokens_per_sec": 150995, "dt_s": 4.34, "eta_s": 5772, "world_size": 1, "timestamp": "2026-05-05T06:13:54.312037"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79610, "epoch": 0, "train_loss": 3.6506872475147247, "train_ppl": 38.501116755704324, "lr": 0.0005300202833920736, "grad_norm": 0.645, "tokens_per_sec": 147368, "dt_s": 4.447, "eta_s": 5763, "world_size": 1, "timestamp": "2026-05-05T06:13:58.759137"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79620, "epoch": 0, "train_loss": 3.743714213371277, "train_ppl": 42.25464182172157, "lr": 0.0005296577717039488, "grad_norm": 0.6673, "tokens_per_sec": 147727, "dt_s": 4.436, "eta_s": 5769, "world_size": 1, "timestamp": "2026-05-05T06:14:03.195427"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79630, "epoch": 0, "train_loss": 3.6538611352443695, "train_ppl": 38.62350910479646, "lr": 0.0005292952600158239, "grad_norm": 0.6783, "tokens_per_sec": 149533, "dt_s": 4.383, "eta_s": 5765, "world_size": 1, "timestamp": "2026-05-05T06:14:07.578147"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79640, "epoch": 0, "train_loss": 3.6772042363882065, "train_ppl": 39.535706926330484, "lr": 0.000528932748327699, "grad_norm": 0.6242, "tokens_per_sec": 144943, "dt_s": 4.521, "eta_s": 5773, "world_size": 1, "timestamp": "2026-05-05T06:14:12.099621"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79650, "epoch": 0, "train_loss": 3.7464872896671295, "train_ppl": 42.3719797857168, "lr": 0.0005285702366395742, "grad_norm": 0.6724, "tokens_per_sec": 150344, "dt_s": 4.359, "eta_s": 5773, "world_size": 1, "timestamp": "2026-05-05T06:14:16.458685"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79660, "epoch": 0, "train_loss": 3.6223377138376236, "train_ppl": 37.42495447371023, "lr": 0.0005282077249514493, "grad_norm": 0.6397, "tokens_per_sec": 148853, "dt_s": 4.403, "eta_s": 5757, "world_size": 1, "timestamp": "2026-05-05T06:14:20.861416"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79670, "epoch": 0, "train_loss": 3.704949602484703, "train_ppl": 40.64799870712015, "lr": 0.0005278452132633244, "grad_norm": 0.6575, "tokens_per_sec": 147662, "dt_s": 4.438, "eta_s": 5753, "world_size": 1, "timestamp": "2026-05-05T06:14:25.299667"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79680, "epoch": 0, "train_loss": 3.697441875934601, "train_ppl": 40.343967368179364, "lr": 0.0005274827015751995, "grad_norm": 0.6261, "tokens_per_sec": 134245, "dt_s": 4.882, "eta_s": 5879, "world_size": 1, "timestamp": "2026-05-05T06:14:30.181476"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79690, "epoch": 0, "train_loss": 3.6780739426612854, "train_ppl": 39.57010633517748, "lr": 0.0005271201898870747, "grad_norm": 0.6282, "tokens_per_sec": 147079, "dt_s": 4.456, "eta_s": 5857, "world_size": 1, "timestamp": "2026-05-05T06:14:34.637328"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79700, "epoch": 0, "train_loss": 3.8316764384508133, "train_ppl": 46.13982400877383, "lr": 0.0005267576781989498, "grad_norm": 0.6584, "tokens_per_sec": 147183, "dt_s": 4.453, "eta_s": 5877, "world_size": 1, "timestamp": "2026-05-05T06:14:39.090012"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79710, "epoch": 0, "train_loss": 3.7576999366283417, "train_ppl": 42.849755395096246, "lr": 0.0005263951665108249, "grad_norm": 0.6481, "tokens_per_sec": 148800, "dt_s": 4.404, "eta_s": 5873, "world_size": 1, "timestamp": "2026-05-05T06:14:43.494329"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79720, "epoch": 0, "train_loss": 3.665182799100876, "train_ppl": 39.0632762422538, "lr": 0.0005260326548227, "grad_norm": 0.7134, "tokens_per_sec": 147790, "dt_s": 4.434, "eta_s": 5867, "world_size": 1, "timestamp": "2026-05-05T06:14:47.928728"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79730, "epoch": 0, "train_loss": 3.6775417923927307, "train_ppl": 39.54905469427944, "lr": 0.0005256701431345753, "grad_norm": 0.6389, "tokens_per_sec": 148045, "dt_s": 4.427, "eta_s": 5745, "world_size": 1, "timestamp": "2026-05-05T06:14:52.355477"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79740, "epoch": 0, "train_loss": 3.7384023666381836, "train_ppl": 42.03078670897992, "lr": 0.0005253076314464504, "grad_norm": 0.6492, "tokens_per_sec": 149473, "dt_s": 4.384, "eta_s": 5722, "world_size": 1, "timestamp": "2026-05-05T06:14:56.739936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79750, "epoch": 0, "train_loss": 3.7365012913942337, "train_ppl": 41.95095892423846, "lr": 0.0005249451197583255, "grad_norm": 0.7196, "tokens_per_sec": 145017, "dt_s": 4.519, "eta_s": 5735, "world_size": 1, "timestamp": "2026-05-05T06:15:01.259155"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79760, "epoch": 0, "train_loss": 3.659949168562889, "train_ppl": 38.85936754363105, "lr": 0.0005245826080702006, "grad_norm": 0.6508, "tokens_per_sec": 147844, "dt_s": 4.433, "eta_s": 5737, "world_size": 1, "timestamp": "2026-05-05T06:15:05.691890"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79770, "epoch": 0, "train_loss": 3.7451425343751907, "train_ppl": 42.315038136543016, "lr": 0.0005242200963820758, "grad_norm": 0.6843, "tokens_per_sec": 149164, "dt_s": 4.394, "eta_s": 5722, "world_size": 1, "timestamp": "2026-05-05T06:15:10.085464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79780, "epoch": 0, "train_loss": 3.609969899058342, "train_ppl": 36.964940118570084, "lr": 0.0005238575846939509, "grad_norm": 0.6847, "tokens_per_sec": 148144, "dt_s": 4.424, "eta_s": 5717, "world_size": 1, "timestamp": "2026-05-05T06:15:14.509261"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79790, "epoch": 0, "train_loss": 3.7035408169031143, "train_ppl": 40.590774710256234, "lr": 0.000523495073005826, "grad_norm": 0.6878, "tokens_per_sec": 148810, "dt_s": 4.404, "eta_s": 5718, "world_size": 1, "timestamp": "2026-05-05T06:15:18.913274"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79800, "epoch": 0, "train_loss": 3.6359328776597977, "train_ppl": 37.937227186351485, "lr": 0.0005231325613177011, "grad_norm": 0.6664, "tokens_per_sec": 148740, "dt_s": 4.406, "eta_s": 5684, "world_size": 1, "timestamp": "2026-05-05T06:15:23.319341"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79810, "epoch": 0, "train_loss": 3.8846840858459473, "train_ppl": 48.651570311800356, "lr": 0.0005227700496295763, "grad_norm": 0.7723, "tokens_per_sec": 146902, "dt_s": 4.461, "eta_s": 5687, "world_size": 1, "timestamp": "2026-05-05T06:15:27.780565"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79820, "epoch": 0, "train_loss": 3.7475360929965973, "train_ppl": 42.4164429716852, "lr": 0.0005224075379414515, "grad_norm": 0.696, "tokens_per_sec": 149121, "dt_s": 4.395, "eta_s": 5683, "world_size": 1, "timestamp": "2026-05-05T06:15:32.175391"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79830, "epoch": 0, "train_loss": 3.7457534670829773, "train_ppl": 42.34089767578672, "lr": 0.0005220450262533266, "grad_norm": 0.6652, "tokens_per_sec": 147771, "dt_s": 4.435, "eta_s": 5682, "world_size": 1, "timestamp": "2026-05-05T06:15:36.610330"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79840, "epoch": 0, "train_loss": 3.6730008721351624, "train_ppl": 39.369872724063285, "lr": 0.0005216825145652017, "grad_norm": 0.7112, "tokens_per_sec": 149032, "dt_s": 4.397, "eta_s": 5675, "world_size": 1, "timestamp": "2026-05-05T06:15:41.007756"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79850, "epoch": 0, "train_loss": 3.7174413204193115, "train_ppl": 41.15894670658176, "lr": 0.0005213200028770769, "grad_norm": 0.7889, "tokens_per_sec": 150899, "dt_s": 4.343, "eta_s": 5655, "world_size": 1, "timestamp": "2026-05-05T06:15:45.350801"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79860, "epoch": 0, "train_loss": 3.760057181119919, "train_ppl": 42.95088188807439, "lr": 0.000520957491188952, "grad_norm": 0.6904, "tokens_per_sec": 147632, "dt_s": 4.439, "eta_s": 5645, "world_size": 1, "timestamp": "2026-05-05T06:15:49.789965"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79870, "epoch": 0, "train_loss": 3.651275485754013, "train_ppl": 38.523771247301205, "lr": 0.0005205949795008271, "grad_norm": 0.742, "tokens_per_sec": 149188, "dt_s": 4.393, "eta_s": 5640, "world_size": 1, "timestamp": "2026-05-05T06:15:54.182793"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79880, "epoch": 0, "train_loss": 3.6426256895065308, "train_ppl": 38.19198548360764, "lr": 0.0005202324678127022, "grad_norm": 0.658, "tokens_per_sec": 148792, "dt_s": 4.405, "eta_s": 5628, "world_size": 1, "timestamp": "2026-05-05T06:15:58.587346"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79890, "epoch": 0, "train_loss": 3.6124318689107895, "train_ppl": 37.05605880644604, "lr": 0.0005198699561245774, "grad_norm": 0.6396, "tokens_per_sec": 148285, "dt_s": 4.42, "eta_s": 5629, "world_size": 1, "timestamp": "2026-05-05T06:16:03.006938"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79900, "epoch": 0, "train_loss": 3.6558077931404114, "train_ppl": 38.69876909272671, "lr": 0.0005195074444364525, "grad_norm": 0.6406, "tokens_per_sec": 149875, "dt_s": 4.373, "eta_s": 5632, "world_size": 1, "timestamp": "2026-05-05T06:16:07.379652"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79910, "epoch": 0, "train_loss": 3.6434546262025833, "train_ppl": 38.223657347041446, "lr": 0.0005191449327483276, "grad_norm": 0.6835, "tokens_per_sec": 147517, "dt_s": 4.443, "eta_s": 5629, "world_size": 1, "timestamp": "2026-05-05T06:16:11.822284"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79920, "epoch": 0, "train_loss": 3.7214286625385284, "train_ppl": 41.323389134606195, "lr": 0.0005187824210602027, "grad_norm": 0.676, "tokens_per_sec": 149634, "dt_s": 4.38, "eta_s": 5621, "world_size": 1, "timestamp": "2026-05-05T06:16:16.202029"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79930, "epoch": 0, "train_loss": 3.6566268652677536, "train_ppl": 38.73047916049654, "lr": 0.000518419909372078, "grad_norm": 0.6527, "tokens_per_sec": 150782, "dt_s": 4.346, "eta_s": 5602, "world_size": 1, "timestamp": "2026-05-05T06:16:20.548443"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79940, "epoch": 0, "train_loss": 3.7080273926258087, "train_ppl": 40.773297439487465, "lr": 0.0005180573976839531, "grad_norm": 0.6372, "tokens_per_sec": 149127, "dt_s": 4.395, "eta_s": 5591, "world_size": 1, "timestamp": "2026-05-05T06:16:24.943067"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79950, "epoch": 0, "train_loss": 3.7269607931375504, "train_ppl": 41.552629027960506, "lr": 0.0005176948859958282, "grad_norm": 0.663, "tokens_per_sec": 150165, "dt_s": 4.364, "eta_s": 5584, "world_size": 1, "timestamp": "2026-05-05T06:16:29.307365"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79960, "epoch": 0, "train_loss": 3.760081246495247, "train_ppl": 42.95191552960514, "lr": 0.0005173323743077033, "grad_norm": 0.68, "tokens_per_sec": 148889, "dt_s": 4.402, "eta_s": 5570, "world_size": 1, "timestamp": "2026-05-05T06:16:33.709034"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79970, "epoch": 0, "train_loss": 3.7444652915000916, "train_ppl": 42.28639028032871, "lr": 0.0005169698626195785, "grad_norm": 0.6277, "tokens_per_sec": 148977, "dt_s": 4.399, "eta_s": 5570, "world_size": 1, "timestamp": "2026-05-05T06:16:38.108093"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79980, "epoch": 0, "train_loss": 3.7392994314432144, "train_ppl": 42.06850796514035, "lr": 0.0005166073509314536, "grad_norm": 0.6881, "tokens_per_sec": 135287, "dt_s": 4.844, "eta_s": 5692, "world_size": 1, "timestamp": "2026-05-05T06:16:42.952316"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 79990, "epoch": 0, "train_loss": 3.721988797187805, "train_ppl": 41.3465422805203, "lr": 0.0005162448392433287, "grad_norm": 0.6555, "tokens_per_sec": 149512, "dt_s": 4.383, "eta_s": 5685, "world_size": 1, "timestamp": "2026-05-05T06:16:47.335637"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80000, "epoch": 0, "train_loss": 3.714726909995079, "train_ppl": 41.0473759255412, "lr": 0.0005158823275552038, "grad_norm": 0.6727, "tokens_per_sec": 149679, "dt_s": 4.378, "eta_s": 5684, "world_size": 1, "timestamp": "2026-05-05T06:16:51.714059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80010, "epoch": 0, "train_loss": 3.5745506286621094, "train_ppl": 35.67858427959413, "lr": 0.000515519815867079, "grad_norm": 0.6881, "tokens_per_sec": 127783, "dt_s": 5.129, "eta_s": 5661, "world_size": 1, "timestamp": "2026-05-05T06:16:56.842783"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80020, "epoch": 0, "train_loss": 3.7366984486579895, "train_ppl": 41.959230675903065, "lr": 0.0005151573041789542, "grad_norm": 0.7014, "tokens_per_sec": 148004, "dt_s": 4.428, "eta_s": 5664, "world_size": 1, "timestamp": "2026-05-05T06:17:01.270781"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80030, "epoch": 0, "train_loss": 3.6118910759687424, "train_ppl": 37.03602456905878, "lr": 0.0005147947924908293, "grad_norm": 0.6276, "tokens_per_sec": 147221, "dt_s": 4.452, "eta_s": 5560, "world_size": 1, "timestamp": "2026-05-05T06:17:05.722333"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80040, "epoch": 0, "train_loss": 3.7173410058021545, "train_ppl": 41.15481806968508, "lr": 0.0005144322808027044, "grad_norm": 0.6261, "tokens_per_sec": 148979, "dt_s": 4.399, "eta_s": 5560, "world_size": 1, "timestamp": "2026-05-05T06:17:10.121351"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80050, "epoch": 0, "train_loss": 3.725380375981331, "train_ppl": 41.48701040621455, "lr": 0.0005140697691145796, "grad_norm": 0.7487, "tokens_per_sec": 146793, "dt_s": 4.465, "eta_s": 5577, "world_size": 1, "timestamp": "2026-05-05T06:17:14.585860"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80060, "epoch": 0, "train_loss": 3.656165197491646, "train_ppl": 38.71260267313152, "lr": 0.0005137072574264547, "grad_norm": 0.65, "tokens_per_sec": 151025, "dt_s": 4.339, "eta_s": 5575, "world_size": 1, "timestamp": "2026-05-05T06:17:18.925280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80070, "epoch": 0, "train_loss": 3.7557225823402405, "train_ppl": 42.765109962106415, "lr": 0.0005133447457383298, "grad_norm": 0.7145, "tokens_per_sec": 150985, "dt_s": 4.341, "eta_s": 5549, "world_size": 1, "timestamp": "2026-05-05T06:17:23.265818"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80080, "epoch": 0, "train_loss": 3.7298336774110794, "train_ppl": 41.67217656331242, "lr": 0.0005129822340502049, "grad_norm": 0.6617, "tokens_per_sec": 149185, "dt_s": 4.393, "eta_s": 5530, "world_size": 1, "timestamp": "2026-05-05T06:17:27.658774"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80090, "epoch": 0, "train_loss": 3.666024759411812, "train_ppl": 39.09617982027452, "lr": 0.0005126197223620801, "grad_norm": 0.6752, "tokens_per_sec": 151593, "dt_s": 4.323, "eta_s": 5506, "world_size": 1, "timestamp": "2026-05-05T06:17:31.981907"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80100, "epoch": 0, "train_loss": 3.7812161594629288, "train_ppl": 43.86936144542177, "lr": 0.0005122572106739552, "grad_norm": 0.6567, "tokens_per_sec": 149456, "dt_s": 4.385, "eta_s": 5482, "world_size": 1, "timestamp": "2026-05-05T06:17:36.366886"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80110, "epoch": 0, "train_loss": 3.690228968858719, "train_ppl": 40.05401702998258, "lr": 0.0005118946989858303, "grad_norm": 0.638, "tokens_per_sec": 149118, "dt_s": 4.395, "eta_s": 5491, "world_size": 1, "timestamp": "2026-05-05T06:17:40.761821"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80120, "epoch": 0, "train_loss": 3.623507872223854, "train_ppl": 37.468773230484814, "lr": 0.0005115321872977054, "grad_norm": 0.6458, "tokens_per_sec": 150783, "dt_s": 4.346, "eta_s": 5488, "world_size": 1, "timestamp": "2026-05-05T06:17:45.108160"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80130, "epoch": 0, "train_loss": 3.760701686143875, "train_ppl": 42.978572869764484, "lr": 0.0005111696756095807, "grad_norm": 0.709, "tokens_per_sec": 147379, "dt_s": 4.447, "eta_s": 5498, "world_size": 1, "timestamp": "2026-05-05T06:17:49.554985"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80140, "epoch": 0, "train_loss": 3.643842399120331, "train_ppl": 38.238482320353704, "lr": 0.0005108071639214558, "grad_norm": 0.6922, "tokens_per_sec": 149565, "dt_s": 4.382, "eta_s": 5508, "world_size": 1, "timestamp": "2026-05-05T06:17:53.936738"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80150, "epoch": 0, "train_loss": 3.6915720105171204, "train_ppl": 40.10784738355241, "lr": 0.0005104446522333309, "grad_norm": 0.6132, "tokens_per_sec": 150187, "dt_s": 4.364, "eta_s": 5498, "world_size": 1, "timestamp": "2026-05-05T06:17:58.300367"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80160, "epoch": 0, "train_loss": 3.6271646320819855, "train_ppl": 37.606038356210476, "lr": 0.000510082140545206, "grad_norm": 0.8045, "tokens_per_sec": 147644, "dt_s": 4.439, "eta_s": 5505, "world_size": 1, "timestamp": "2026-05-05T06:18:02.739136"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80170, "epoch": 0, "train_loss": 3.651817634701729, "train_ppl": 38.54466253192725, "lr": 0.0005097196288570812, "grad_norm": 0.6365, "tokens_per_sec": 152309, "dt_s": 4.303, "eta_s": 5489, "world_size": 1, "timestamp": "2026-05-05T06:18:07.041976"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80180, "epoch": 0, "train_loss": 3.839212864637375, "train_ppl": 46.48886700298887, "lr": 0.0005093571171689563, "grad_norm": 0.6697, "tokens_per_sec": 150452, "dt_s": 4.356, "eta_s": 5462, "world_size": 1, "timestamp": "2026-05-05T06:18:11.397904"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80190, "epoch": 0, "train_loss": 3.744972303509712, "train_ppl": 42.307835424057664, "lr": 0.0005089946054808314, "grad_norm": 0.6855, "tokens_per_sec": 147612, "dt_s": 4.44, "eta_s": 5472, "world_size": 1, "timestamp": "2026-05-05T06:18:15.837651"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80200, "epoch": 0, "train_loss": 3.7405507564544678, "train_ppl": 42.12118229082064, "lr": 0.0005086320937927065, "grad_norm": 0.6454, "tokens_per_sec": 150792, "dt_s": 4.346, "eta_s": 5464, "world_size": 1, "timestamp": "2026-05-05T06:18:20.183818"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80210, "epoch": 0, "train_loss": 3.730904296040535, "train_ppl": 41.71681546322613, "lr": 0.0005082695821045817, "grad_norm": 0.6941, "tokens_per_sec": 148507, "dt_s": 4.413, "eta_s": 5453, "world_size": 1, "timestamp": "2026-05-05T06:18:24.596762"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80220, "epoch": 0, "train_loss": 3.6279134154319763, "train_ppl": 37.63420767663362, "lr": 0.0005079070704164569, "grad_norm": 0.6647, "tokens_per_sec": 151342, "dt_s": 4.33, "eta_s": 5455, "world_size": 1, "timestamp": "2026-05-05T06:18:28.927090"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80230, "epoch": 0, "train_loss": 3.6883564889431, "train_ppl": 39.979086862064555, "lr": 0.000507544558728332, "grad_norm": 0.6602, "tokens_per_sec": 152160, "dt_s": 4.307, "eta_s": 5439, "world_size": 1, "timestamp": "2026-05-05T06:18:33.234150"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80240, "epoch": 0, "train_loss": 3.6327499002218246, "train_ppl": 37.81666582199677, "lr": 0.0005071820470402071, "grad_norm": 0.7651, "tokens_per_sec": 148815, "dt_s": 4.404, "eta_s": 5426, "world_size": 1, "timestamp": "2026-05-05T06:18:37.638001"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80250, "epoch": 0, "train_loss": 3.678294986486435, "train_ppl": 39.57885402961968, "lr": 0.0005068195353520823, "grad_norm": 0.6975, "tokens_per_sec": 150149, "dt_s": 4.365, "eta_s": 5426, "world_size": 1, "timestamp": "2026-05-05T06:18:42.002745"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80260, "epoch": 0, "train_loss": 3.748530939221382, "train_ppl": 42.458661806986775, "lr": 0.0005064570236639574, "grad_norm": 0.6232, "tokens_per_sec": 149440, "dt_s": 4.385, "eta_s": 5415, "world_size": 1, "timestamp": "2026-05-05T06:18:46.388155"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80270, "epoch": 0, "train_loss": 3.7478389143943787, "train_ppl": 42.42928952324198, "lr": 0.0005060945119758325, "grad_norm": 0.7055, "tokens_per_sec": 147567, "dt_s": 4.441, "eta_s": 5438, "world_size": 1, "timestamp": "2026-05-05T06:18:50.829272"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80280, "epoch": 0, "train_loss": 3.680028945207596, "train_ppl": 39.647541662287985, "lr": 0.0005057320002877076, "grad_norm": 0.6447, "tokens_per_sec": 133816, "dt_s": 4.897, "eta_s": 5580, "world_size": 1, "timestamp": "2026-05-05T06:18:55.726751"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80290, "epoch": 0, "train_loss": 3.5841682702302933, "train_ppl": 36.02338353488509, "lr": 0.0005053694885995828, "grad_norm": 0.6451, "tokens_per_sec": 149540, "dt_s": 4.383, "eta_s": 5570, "world_size": 1, "timestamp": "2026-05-05T06:19:00.109260"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80300, "epoch": 0, "train_loss": 3.7364753037691116, "train_ppl": 41.949868732610234, "lr": 0.0005050069769114579, "grad_norm": 0.6604, "tokens_per_sec": 148248, "dt_s": 4.421, "eta_s": 5579, "world_size": 1, "timestamp": "2026-05-05T06:19:04.529965"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80310, "epoch": 0, "train_loss": 3.766941100358963, "train_ppl": 43.24757231506876, "lr": 0.000504644465223333, "grad_norm": 0.6673, "tokens_per_sec": 150701, "dt_s": 4.349, "eta_s": 5566, "world_size": 1, "timestamp": "2026-05-05T06:19:08.878709"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80320, "epoch": 0, "train_loss": 3.669856086373329, "train_ppl": 39.2462573826346, "lr": 0.0005042819535352081, "grad_norm": 0.6284, "tokens_per_sec": 147220, "dt_s": 4.452, "eta_s": 5564, "world_size": 1, "timestamp": "2026-05-05T06:19:13.330285"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80330, "epoch": 0, "train_loss": 3.7453152537345886, "train_ppl": 42.322347394029826, "lr": 0.0005039194418470834, "grad_norm": 0.9066, "tokens_per_sec": 149058, "dt_s": 4.397, "eta_s": 5436, "world_size": 1, "timestamp": "2026-05-05T06:19:17.726941"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80340, "epoch": 0, "train_loss": 3.6083589643239975, "train_ppl": 36.90543995089099, "lr": 0.0005035569301589585, "grad_norm": 0.6686, "tokens_per_sec": 150573, "dt_s": 4.352, "eta_s": 5424, "world_size": 1, "timestamp": "2026-05-05T06:19:22.079391"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80350, "epoch": 0, "train_loss": 3.714286655187607, "train_ppl": 41.02930859836152, "lr": 0.0005031944184708336, "grad_norm": 0.6793, "tokens_per_sec": 146669, "dt_s": 4.468, "eta_s": 5431, "world_size": 1, "timestamp": "2026-05-05T06:19:26.547662"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80360, "epoch": 0, "train_loss": 3.660103976726532, "train_ppl": 38.86538375662823, "lr": 0.0005028319067827087, "grad_norm": 0.6661, "tokens_per_sec": 150025, "dt_s": 4.368, "eta_s": 5432, "world_size": 1, "timestamp": "2026-05-05T06:19:30.916025"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80370, "epoch": 0, "train_loss": 3.5876298546791077, "train_ppl": 36.14829759467396, "lr": 0.0005024693950945839, "grad_norm": 0.6614, "tokens_per_sec": 150004, "dt_s": 4.369, "eta_s": 5407, "world_size": 1, "timestamp": "2026-05-05T06:19:35.284930"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80380, "epoch": 0, "train_loss": 3.6054291129112244, "train_ppl": 36.79747073961917, "lr": 0.000502106883406459, "grad_norm": 0.702, "tokens_per_sec": 147894, "dt_s": 4.431, "eta_s": 5411, "world_size": 1, "timestamp": "2026-05-05T06:19:39.716222"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80390, "epoch": 0, "train_loss": 3.6794515401124954, "train_ppl": 39.62465557763076, "lr": 0.0005017443717183341, "grad_norm": 0.6827, "tokens_per_sec": 149168, "dt_s": 4.393, "eta_s": 5417, "world_size": 1, "timestamp": "2026-05-05T06:19:44.109645"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80400, "epoch": 0, "train_loss": 3.5454807430505753, "train_ppl": 34.65634213376083, "lr": 0.0005013818600302092, "grad_norm": 0.6309, "tokens_per_sec": 149558, "dt_s": 4.382, "eta_s": 5391, "world_size": 1, "timestamp": "2026-05-05T06:19:48.491613"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80410, "epoch": 0, "train_loss": 3.6426879167556763, "train_ppl": 38.19436213974931, "lr": 0.0005010193483420844, "grad_norm": 0.6253, "tokens_per_sec": 148556, "dt_s": 4.412, "eta_s": 5397, "world_size": 1, "timestamp": "2026-05-05T06:19:52.903159"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80420, "epoch": 0, "train_loss": 3.717734396457672, "train_ppl": 41.17101117544263, "lr": 0.0005006568366539596, "grad_norm": 0.6474, "tokens_per_sec": 152000, "dt_s": 4.312, "eta_s": 5379, "world_size": 1, "timestamp": "2026-05-05T06:19:57.214743"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80430, "epoch": 0, "train_loss": 3.705772116780281, "train_ppl": 40.681446020704506, "lr": 0.0005002943249658347, "grad_norm": 0.7356, "tokens_per_sec": 144855, "dt_s": 4.524, "eta_s": 5397, "world_size": 1, "timestamp": "2026-05-05T06:20:01.738975"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80440, "epoch": 0, "train_loss": 3.580528348684311, "train_ppl": 35.89249959314412, "lr": 0.0004999318132777098, "grad_norm": 0.6829, "tokens_per_sec": 147298, "dt_s": 4.449, "eta_s": 5406, "world_size": 1, "timestamp": "2026-05-05T06:20:06.188198"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80450, "epoch": 0, "train_loss": 3.7630360573530197, "train_ppl": 43.07901800538525, "lr": 0.000499569301589585, "grad_norm": 0.7168, "tokens_per_sec": 147880, "dt_s": 4.432, "eta_s": 5414, "world_size": 1, "timestamp": "2026-05-05T06:20:10.619871"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80460, "epoch": 0, "train_loss": 3.7596465051174164, "train_ppl": 42.93324661303663, "lr": 0.0004992067899014601, "grad_norm": 0.7061, "tokens_per_sec": 147813, "dt_s": 4.434, "eta_s": 5415, "world_size": 1, "timestamp": "2026-05-05T06:20:15.053574"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80470, "epoch": 0, "train_loss": 3.7856244146823883, "train_ppl": 44.06317566439785, "lr": 0.0004988442782133352, "grad_norm": 0.6549, "tokens_per_sec": 150694, "dt_s": 4.349, "eta_s": 5420, "world_size": 1, "timestamp": "2026-05-05T06:20:19.402516"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80480, "epoch": 0, "train_loss": 3.691703885793686, "train_ppl": 40.11313696579346, "lr": 0.0004984817665252103, "grad_norm": 0.6753, "tokens_per_sec": 149846, "dt_s": 4.374, "eta_s": 5379, "world_size": 1, "timestamp": "2026-05-05T06:20:23.776062"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80490, "epoch": 0, "train_loss": 3.670359343290329, "train_ppl": 39.266013303863815, "lr": 0.0004981192548370855, "grad_norm": 0.704, "tokens_per_sec": 148935, "dt_s": 4.4, "eta_s": 5362, "world_size": 1, "timestamp": "2026-05-05T06:20:28.176385"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80500, "epoch": 0, "train_loss": 3.9159855097532272, "train_ppl": 50.198518271256454, "lr": 0.0004977567431489606, "grad_norm": 0.7332, "tokens_per_sec": 149701, "dt_s": 4.378, "eta_s": 5345, "world_size": 1, "timestamp": "2026-05-05T06:20:32.554156"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80510, "epoch": 0, "train_loss": 3.6100863963365555, "train_ppl": 36.969246684329946, "lr": 0.0004973942314608357, "grad_norm": 0.7906, "tokens_per_sec": 125532, "dt_s": 5.221, "eta_s": 5346, "world_size": 1, "timestamp": "2026-05-05T06:20:37.774822"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80520, "epoch": 0, "train_loss": 3.6893929839134216, "train_ppl": 40.02054646713946, "lr": 0.0004970317197727108, "grad_norm": 0.8028, "tokens_per_sec": 146608, "dt_s": 4.47, "eta_s": 5371, "world_size": 1, "timestamp": "2026-05-05T06:20:42.244975"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80530, "epoch": 0, "train_loss": 3.700476363301277, "train_ppl": 40.4665765614204, "lr": 0.0004966692080845861, "grad_norm": 0.6904, "tokens_per_sec": 148956, "dt_s": 4.4, "eta_s": 5373, "world_size": 1, "timestamp": "2026-05-05T06:20:46.644666"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80540, "epoch": 0, "train_loss": 3.818197250366211, "train_ppl": 45.52206941118905, "lr": 0.0004963066963964612, "grad_norm": 0.8015, "tokens_per_sec": 147125, "dt_s": 4.454, "eta_s": 5382, "world_size": 1, "timestamp": "2026-05-05T06:20:51.099098"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80550, "epoch": 0, "train_loss": 3.7491351515054703, "train_ppl": 42.48432360382224, "lr": 0.0004959441847083363, "grad_norm": 0.7166, "tokens_per_sec": 150418, "dt_s": 4.357, "eta_s": 5372, "world_size": 1, "timestamp": "2026-05-05T06:20:55.456034"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80560, "epoch": 0, "train_loss": 3.7197535187005997, "train_ppl": 41.25422446049746, "lr": 0.0004955816730202114, "grad_norm": 0.6813, "tokens_per_sec": 150222, "dt_s": 4.363, "eta_s": 5345, "world_size": 1, "timestamp": "2026-05-05T06:20:59.818635"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80570, "epoch": 0, "train_loss": 3.80360609292984, "train_ppl": 44.86267211334258, "lr": 0.0004952191613320866, "grad_norm": 0.6455, "tokens_per_sec": 132296, "dt_s": 4.954, "eta_s": 5458, "world_size": 1, "timestamp": "2026-05-05T06:21:04.772358"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80580, "epoch": 0, "train_loss": 3.5872992277145386, "train_ppl": 36.136347968309096, "lr": 0.0004948566496439617, "grad_norm": 0.6422, "tokens_per_sec": 149150, "dt_s": 4.394, "eta_s": 5452, "world_size": 1, "timestamp": "2026-05-05T06:21:09.166323"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80590, "epoch": 0, "train_loss": 3.7262625992298126, "train_ppl": 41.52362736109439, "lr": 0.0004944941379558368, "grad_norm": 0.6678, "tokens_per_sec": 150025, "dt_s": 4.368, "eta_s": 5427, "world_size": 1, "timestamp": "2026-05-05T06:21:13.534652"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80600, "epoch": 0, "train_loss": 3.815188020467758, "train_ppl": 45.38528894403065, "lr": 0.0004941316262677119, "grad_norm": 0.7295, "tokens_per_sec": 149881, "dt_s": 4.373, "eta_s": 5426, "world_size": 1, "timestamp": "2026-05-05T06:21:17.907214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80610, "epoch": 0, "train_loss": 3.6628230065107346, "train_ppl": 38.97120369121895, "lr": 0.000493769114579587, "grad_norm": 0.6866, "tokens_per_sec": 151144, "dt_s": 4.336, "eta_s": 5415, "world_size": 1, "timestamp": "2026-05-05T06:21:22.243218"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80620, "epoch": 0, "train_loss": 3.616885185241699, "train_ppl": 37.22144915271756, "lr": 0.0004934066028914622, "grad_norm": 0.701, "tokens_per_sec": 148548, "dt_s": 4.412, "eta_s": 5280, "world_size": 1, "timestamp": "2026-05-05T06:21:26.654948"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80630, "epoch": 0, "train_loss": 3.6877196580171585, "train_ppl": 39.95363504827114, "lr": 0.0004930440912033374, "grad_norm": 0.7046, "tokens_per_sec": 149153, "dt_s": 4.394, "eta_s": 5275, "world_size": 1, "timestamp": "2026-05-05T06:21:31.048818"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80640, "epoch": 0, "train_loss": 3.625943437218666, "train_ppl": 37.560142085189106, "lr": 0.0004926815795152125, "grad_norm": 0.619, "tokens_per_sec": 150861, "dt_s": 4.344, "eta_s": 5265, "world_size": 1, "timestamp": "2026-05-05T06:21:35.392953"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80650, "epoch": 0, "train_loss": 3.710668995976448, "train_ppl": 40.881146703393725, "lr": 0.0004923190678270877, "grad_norm": 0.6648, "tokens_per_sec": 145709, "dt_s": 4.498, "eta_s": 5291, "world_size": 1, "timestamp": "2026-05-05T06:21:39.890676"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80660, "epoch": 0, "train_loss": 3.6492466777563095, "train_ppl": 38.4456931416168, "lr": 0.0004919565561389628, "grad_norm": 0.7125, "tokens_per_sec": 150040, "dt_s": 4.368, "eta_s": 5294, "world_size": 1, "timestamp": "2026-05-05T06:21:44.258568"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80670, "epoch": 0, "train_loss": 3.609560415148735, "train_ppl": 36.949806669035446, "lr": 0.0004915940444508379, "grad_norm": 0.6818, "tokens_per_sec": 147221, "dt_s": 4.452, "eta_s": 5299, "world_size": 1, "timestamp": "2026-05-05T06:21:48.710099"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80680, "epoch": 0, "train_loss": 3.7774082124233246, "train_ppl": 43.7026269000129, "lr": 0.000491231532762713, "grad_norm": 0.69, "tokens_per_sec": 147949, "dt_s": 4.43, "eta_s": 5303, "world_size": 1, "timestamp": "2026-05-05T06:21:53.139749"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80690, "epoch": 0, "train_loss": 3.7720076888799667, "train_ppl": 43.4672459966741, "lr": 0.0004908690210745882, "grad_norm": 0.6531, "tokens_per_sec": 150908, "dt_s": 4.343, "eta_s": 5299, "world_size": 1, "timestamp": "2026-05-05T06:21:57.482525"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80700, "epoch": 0, "train_loss": 3.620174154639244, "train_ppl": 37.34407089896202, "lr": 0.0004905065093864633, "grad_norm": 0.6707, "tokens_per_sec": 146878, "dt_s": 4.462, "eta_s": 5286, "world_size": 1, "timestamp": "2026-05-05T06:22:01.944457"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80710, "epoch": 0, "train_loss": 3.826737493276596, "train_ppl": 45.91250377083099, "lr": 0.0004901439976983384, "grad_norm": 0.7351, "tokens_per_sec": 148259, "dt_s": 4.42, "eta_s": 5294, "world_size": 1, "timestamp": "2026-05-05T06:22:06.364818"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80720, "epoch": 0, "train_loss": 3.6401521116495132, "train_ppl": 38.09763137822223, "lr": 0.0004897814860102135, "grad_norm": 0.6908, "tokens_per_sec": 149629, "dt_s": 4.38, "eta_s": 5272, "world_size": 1, "timestamp": "2026-05-05T06:22:10.744716"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80730, "epoch": 0, "train_loss": 3.632924973964691, "train_ppl": 37.823287106814604, "lr": 0.0004894189743220888, "grad_norm": 0.6521, "tokens_per_sec": 146679, "dt_s": 4.468, "eta_s": 5277, "world_size": 1, "timestamp": "2026-05-05T06:22:15.212708"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80740, "epoch": 0, "train_loss": 3.603345289826393, "train_ppl": 36.72087115833475, "lr": 0.0004890564626339639, "grad_norm": 0.6949, "tokens_per_sec": 150050, "dt_s": 4.368, "eta_s": 5279, "world_size": 1, "timestamp": "2026-05-05T06:22:19.580340"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80750, "epoch": 0, "train_loss": 3.561590865254402, "train_ppl": 35.21918157371223, "lr": 0.000488693950945839, "grad_norm": 0.6664, "tokens_per_sec": 149698, "dt_s": 4.378, "eta_s": 5254, "world_size": 1, "timestamp": "2026-05-05T06:22:23.958247"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80760, "epoch": 0, "train_loss": 3.595314145088196, "train_ppl": 36.42714159823221, "lr": 0.0004883314392577141, "grad_norm": 0.6275, "tokens_per_sec": 147156, "dt_s": 4.454, "eta_s": 5258, "world_size": 1, "timestamp": "2026-05-05T06:22:28.411709"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80770, "epoch": 0, "train_loss": 3.72693994641304, "train_ppl": 41.55176280077952, "lr": 0.00048796892756958925, "grad_norm": 0.6981, "tokens_per_sec": 149578, "dt_s": 4.381, "eta_s": 5254, "world_size": 1, "timestamp": "2026-05-05T06:22:32.793120"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80780, "epoch": 0, "train_loss": 3.681845635175705, "train_ppl": 39.719634418752086, "lr": 0.0004876064158814644, "grad_norm": 0.69, "tokens_per_sec": 145138, "dt_s": 4.515, "eta_s": 5260, "world_size": 1, "timestamp": "2026-05-05T06:22:37.308539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80790, "epoch": 0, "train_loss": 3.607121095061302, "train_ppl": 36.859784104963644, "lr": 0.0004872439041933395, "grad_norm": 0.7041, "tokens_per_sec": 146213, "dt_s": 4.482, "eta_s": 5283, "world_size": 1, "timestamp": "2026-05-05T06:22:41.790771"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80800, "epoch": 0, "train_loss": 3.7993935495615005, "train_ppl": 44.674083658850485, "lr": 0.0004868813925052146, "grad_norm": 0.7182, "tokens_per_sec": 150765, "dt_s": 4.347, "eta_s": 5272, "world_size": 1, "timestamp": "2026-05-05T06:22:46.137696"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80810, "epoch": 0, "train_loss": 3.819931909441948, "train_ppl": 45.601103210570194, "lr": 0.0004865188808170898, "grad_norm": 0.6947, "tokens_per_sec": 147968, "dt_s": 4.429, "eta_s": 5261, "world_size": 1, "timestamp": "2026-05-05T06:22:50.566741"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80820, "epoch": 0, "train_loss": 3.7780513763427734, "train_ppl": 43.73074389376884, "lr": 0.00048615636912896493, "grad_norm": 0.6646, "tokens_per_sec": 148849, "dt_s": 4.403, "eta_s": 5262, "world_size": 1, "timestamp": "2026-05-05T06:22:54.969610"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80830, "epoch": 0, "train_loss": 3.6423596292734146, "train_ppl": 38.18182546669487, "lr": 0.00048579385744084005, "grad_norm": 0.656, "tokens_per_sec": 149577, "dt_s": 4.381, "eta_s": 5226, "world_size": 1, "timestamp": "2026-05-05T06:22:59.351033"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80840, "epoch": 0, "train_loss": 3.6653275787830353, "train_ppl": 39.068932220397734, "lr": 0.0004854313457527152, "grad_norm": 0.6677, "tokens_per_sec": 142944, "dt_s": 4.585, "eta_s": 5246, "world_size": 1, "timestamp": "2026-05-05T06:23:03.935772"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80850, "epoch": 0, "train_loss": 3.6809088736772537, "train_ppl": 39.68244401648413, "lr": 0.00048506883406459035, "grad_norm": 0.6571, "tokens_per_sec": 148152, "dt_s": 4.424, "eta_s": 5259, "world_size": 1, "timestamp": "2026-05-05T06:23:08.359346"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80860, "epoch": 0, "train_loss": 3.5515245646238327, "train_ppl": 34.8664331186999, "lr": 0.0004847063223764655, "grad_norm": 0.6892, "tokens_per_sec": 148018, "dt_s": 4.428, "eta_s": 5255, "world_size": 1, "timestamp": "2026-05-05T06:23:12.786916"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80870, "epoch": 0, "train_loss": 3.626905918121338, "train_ppl": 37.5963104075154, "lr": 0.0004843438106883406, "grad_norm": 0.6498, "tokens_per_sec": 134424, "dt_s": 4.875, "eta_s": 5362, "world_size": 1, "timestamp": "2026-05-05T06:23:17.662234"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80880, "epoch": 0, "train_loss": 3.761749967932701, "train_ppl": 43.02365014773044, "lr": 0.0004839812990002157, "grad_norm": 0.6929, "tokens_per_sec": 151773, "dt_s": 4.318, "eta_s": 5342, "world_size": 1, "timestamp": "2026-05-05T06:23:21.980241"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80890, "epoch": 0, "train_loss": 3.6232875883579254, "train_ppl": 37.460520373285014, "lr": 0.00048361878731209085, "grad_norm": 0.6345, "tokens_per_sec": 149485, "dt_s": 4.384, "eta_s": 5290, "world_size": 1, "timestamp": "2026-05-05T06:23:26.364357"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80900, "epoch": 0, "train_loss": 3.548967406153679, "train_ppl": 34.77738802365847, "lr": 0.000483256275623966, "grad_norm": 0.6766, "tokens_per_sec": 149752, "dt_s": 4.376, "eta_s": 5275, "world_size": 1, "timestamp": "2026-05-05T06:23:30.740653"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80910, "epoch": 0, "train_loss": 3.684955045580864, "train_ppl": 39.84333127578716, "lr": 0.00048289376393584115, "grad_norm": 0.6144, "tokens_per_sec": 151426, "dt_s": 4.328, "eta_s": 5247, "world_size": 1, "timestamp": "2026-05-05T06:23:35.068580"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80920, "epoch": 0, "train_loss": 3.7477328181266785, "train_ppl": 42.42478817277485, "lr": 0.0004825312522477163, "grad_norm": 0.7023, "tokens_per_sec": 147140, "dt_s": 4.454, "eta_s": 5143, "world_size": 1, "timestamp": "2026-05-05T06:23:39.522561"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80930, "epoch": 0, "train_loss": 3.582462415099144, "train_ppl": 35.96198524442395, "lr": 0.0004821687405595914, "grad_norm": 0.7548, "tokens_per_sec": 151076, "dt_s": 4.338, "eta_s": 5144, "world_size": 1, "timestamp": "2026-05-05T06:23:43.860507"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80940, "epoch": 0, "train_loss": 3.658581405878067, "train_ppl": 38.80625348274472, "lr": 0.0004818062288714665, "grad_norm": 0.6687, "tokens_per_sec": 148885, "dt_s": 4.402, "eta_s": 5143, "world_size": 1, "timestamp": "2026-05-05T06:23:48.262299"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80950, "epoch": 0, "train_loss": 3.677649423480034, "train_ppl": 39.55331163112326, "lr": 0.0004814437171833417, "grad_norm": 0.6728, "tokens_per_sec": 149046, "dt_s": 4.397, "eta_s": 5144, "world_size": 1, "timestamp": "2026-05-05T06:23:52.659325"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80960, "epoch": 0, "train_loss": 3.586775988340378, "train_ppl": 36.11744495404565, "lr": 0.00048108120549521683, "grad_norm": 0.6542, "tokens_per_sec": 152082, "dt_s": 4.309, "eta_s": 5135, "world_size": 1, "timestamp": "2026-05-05T06:23:56.968563"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80970, "epoch": 0, "train_loss": 3.629767417907715, "train_ppl": 37.70404631133454, "lr": 0.00048071869380709195, "grad_norm": 0.6685, "tokens_per_sec": 151053, "dt_s": 4.339, "eta_s": 5104, "world_size": 1, "timestamp": "2026-05-05T06:24:01.307170"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80980, "epoch": 0, "train_loss": 3.5700792372226715, "train_ppl": 35.51940749932764, "lr": 0.0004803561821189671, "grad_norm": 0.6358, "tokens_per_sec": 150235, "dt_s": 4.362, "eta_s": 5105, "world_size": 1, "timestamp": "2026-05-05T06:24:05.669397"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 80990, "epoch": 0, "train_loss": 3.6616513580083847, "train_ppl": 38.92556987739454, "lr": 0.0004799936704308422, "grad_norm": 0.6408, "tokens_per_sec": 152067, "dt_s": 4.31, "eta_s": 5079, "world_size": 1, "timestamp": "2026-05-05T06:24:09.979082"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81000, "epoch": 0, "train_loss": 3.653942286968231, "train_ppl": 38.626643596324875, "lr": 0.0004796311587427173, "grad_norm": 0.6846, "tokens_per_sec": 149076, "dt_s": 4.396, "eta_s": 5074, "world_size": 1, "timestamp": "2026-05-05T06:24:14.375240"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81010, "epoch": 0, "train_loss": 3.6606761664152145, "train_ppl": 38.88762849195765, "lr": 0.0004792686470545925, "grad_norm": 0.6686, "tokens_per_sec": 127667, "dt_s": 5.133, "eta_s": 5081, "world_size": 1, "timestamp": "2026-05-05T06:24:19.508553"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81020, "epoch": 0, "train_loss": 3.80593778192997, "train_ppl": 44.96739996136285, "lr": 0.0004789061353664676, "grad_norm": 0.7431, "tokens_per_sec": 150802, "dt_s": 4.346, "eta_s": 5078, "world_size": 1, "timestamp": "2026-05-05T06:24:23.854400"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81030, "epoch": 0, "train_loss": 3.7863349616527557, "train_ppl": 44.09449574624786, "lr": 0.00047854362367834275, "grad_norm": 0.7594, "tokens_per_sec": 146891, "dt_s": 4.462, "eta_s": 5097, "world_size": 1, "timestamp": "2026-05-05T06:24:28.315943"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81040, "epoch": 0, "train_loss": 3.6429758071899414, "train_ppl": 38.20535951419569, "lr": 0.0004781811119902179, "grad_norm": 0.7057, "tokens_per_sec": 150881, "dt_s": 4.344, "eta_s": 5101, "world_size": 1, "timestamp": "2026-05-05T06:24:32.659488"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81050, "epoch": 0, "train_loss": 3.5217643827199936, "train_ppl": 33.844089742823506, "lr": 0.00047781860030209305, "grad_norm": 0.6608, "tokens_per_sec": 150786, "dt_s": 4.346, "eta_s": 5085, "world_size": 1, "timestamp": "2026-05-05T06:24:37.005816"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81060, "epoch": 0, "train_loss": 3.623600870370865, "train_ppl": 37.472257918998295, "lr": 0.0004774560886139682, "grad_norm": 0.6616, "tokens_per_sec": 149011, "dt_s": 4.398, "eta_s": 5090, "world_size": 1, "timestamp": "2026-05-05T06:24:41.403879"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81070, "epoch": 0, "train_loss": 3.6526577919721603, "train_ppl": 38.577059717849764, "lr": 0.0004770935769258433, "grad_norm": 0.7179, "tokens_per_sec": 150529, "dt_s": 4.354, "eta_s": 5088, "world_size": 1, "timestamp": "2026-05-05T06:24:45.757596"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81080, "epoch": 0, "train_loss": 3.6861688792705536, "train_ppl": 39.89172381790638, "lr": 0.0004767310652377184, "grad_norm": 0.6445, "tokens_per_sec": 149383, "dt_s": 4.387, "eta_s": 5066, "world_size": 1, "timestamp": "2026-05-05T06:24:50.144693"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81090, "epoch": 0, "train_loss": 3.6987734138965607, "train_ppl": 40.39772267293832, "lr": 0.00047636855354959355, "grad_norm": 0.7016, "tokens_per_sec": 149172, "dt_s": 4.393, "eta_s": 5073, "world_size": 1, "timestamp": "2026-05-05T06:24:54.538016"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81100, "epoch": 0, "train_loss": 3.7382585555315018, "train_ppl": 42.024742649640444, "lr": 0.0004760060418614687, "grad_norm": 0.8006, "tokens_per_sec": 150585, "dt_s": 4.352, "eta_s": 5070, "world_size": 1, "timestamp": "2026-05-05T06:24:58.890127"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81110, "epoch": 0, "train_loss": 3.5741868913173676, "train_ppl": 35.66560900602297, "lr": 0.00047564353017334385, "grad_norm": 0.668, "tokens_per_sec": 149057, "dt_s": 4.397, "eta_s": 5065, "world_size": 1, "timestamp": "2026-05-05T06:25:03.286817"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81120, "epoch": 0, "train_loss": 3.7043635696172714, "train_ppl": 40.62418462248216, "lr": 0.000475281018485219, "grad_norm": 0.8677, "tokens_per_sec": 149625, "dt_s": 4.38, "eta_s": 5067, "world_size": 1, "timestamp": "2026-05-05T06:25:07.666829"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81130, "epoch": 0, "train_loss": 3.667996406555176, "train_ppl": 39.173339732598066, "lr": 0.0004749185067970941, "grad_norm": 0.666, "tokens_per_sec": 151437, "dt_s": 4.328, "eta_s": 5049, "world_size": 1, "timestamp": "2026-05-05T06:25:11.994431"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81140, "epoch": 0, "train_loss": 3.8506226241588593, "train_ppl": 47.02233135862205, "lr": 0.0004745559951089692, "grad_norm": 0.6807, "tokens_per_sec": 150024, "dt_s": 4.368, "eta_s": 5039, "world_size": 1, "timestamp": "2026-05-05T06:25:16.362795"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81150, "epoch": 0, "train_loss": 3.701398104429245, "train_ppl": 40.50389346496545, "lr": 0.0004741934834208444, "grad_norm": 0.7054, "tokens_per_sec": 149407, "dt_s": 4.386, "eta_s": 5042, "world_size": 1, "timestamp": "2026-05-05T06:25:20.749198"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81160, "epoch": 0, "train_loss": 3.7214027047157288, "train_ppl": 41.32231648331546, "lr": 0.0004738309717327195, "grad_norm": 0.7693, "tokens_per_sec": 135088, "dt_s": 4.851, "eta_s": 5143, "world_size": 1, "timestamp": "2026-05-05T06:25:25.600590"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81170, "epoch": 0, "train_loss": 3.7439886331558228, "train_ppl": 42.26623892259053, "lr": 0.00047346846004459465, "grad_norm": 0.7097, "tokens_per_sec": 147383, "dt_s": 4.447, "eta_s": 5154, "world_size": 1, "timestamp": "2026-05-05T06:25:30.047259"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81180, "epoch": 0, "train_loss": 3.7186297476291656, "train_ppl": 41.20789019590354, "lr": 0.0004731059483564698, "grad_norm": 0.6908, "tokens_per_sec": 149859, "dt_s": 4.373, "eta_s": 5160, "world_size": 1, "timestamp": "2026-05-05T06:25:34.420420"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81190, "epoch": 0, "train_loss": 3.643665298819542, "train_ppl": 38.23171087326342, "lr": 0.0004727434366683449, "grad_norm": 0.6806, "tokens_per_sec": 148049, "dt_s": 4.427, "eta_s": 5169, "world_size": 1, "timestamp": "2026-05-05T06:25:38.847053"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81200, "epoch": 0, "train_loss": 3.7244159430265427, "train_ppl": 41.44701825415767, "lr": 0.00047238092498022, "grad_norm": 0.7223, "tokens_per_sec": 150815, "dt_s": 4.345, "eta_s": 5155, "world_size": 1, "timestamp": "2026-05-05T06:25:43.192507"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81210, "epoch": 0, "train_loss": 3.6046643555164337, "train_ppl": 36.769340359590544, "lr": 0.0004720184132920952, "grad_norm": 0.7001, "tokens_per_sec": 151912, "dt_s": 4.314, "eta_s": 5027, "world_size": 1, "timestamp": "2026-05-05T06:25:47.506591"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81220, "epoch": 0, "train_loss": 3.563395604491234, "train_ppl": 35.28280040301847, "lr": 0.0004716559016039703, "grad_norm": 0.6803, "tokens_per_sec": 149092, "dt_s": 4.396, "eta_s": 5011, "world_size": 1, "timestamp": "2026-05-05T06:25:51.902282"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81230, "epoch": 0, "train_loss": 3.73168121278286, "train_ppl": 41.749238548984195, "lr": 0.00047129338991584545, "grad_norm": 0.7409, "tokens_per_sec": 150794, "dt_s": 4.346, "eta_s": 5000, "world_size": 1, "timestamp": "2026-05-05T06:25:56.248338"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81240, "epoch": 0, "train_loss": 3.6588881611824036, "train_ppl": 38.818159332839954, "lr": 0.0004709308782277206, "grad_norm": 0.6521, "tokens_per_sec": 150966, "dt_s": 4.341, "eta_s": 4976, "world_size": 1, "timestamp": "2026-05-05T06:26:00.589433"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81250, "epoch": 0, "train_loss": 3.780524119734764, "train_ppl": 43.839012606964495, "lr": 0.00047056836653959575, "grad_norm": 0.6919, "tokens_per_sec": 149771, "dt_s": 4.376, "eta_s": 4979, "world_size": 1, "timestamp": "2026-05-05T06:26:04.965156"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81260, "epoch": 0, "train_loss": 3.7200179249048233, "train_ppl": 41.26513377557699, "lr": 0.0004702058548514709, "grad_norm": 0.6717, "tokens_per_sec": 150988, "dt_s": 4.34, "eta_s": 4981, "world_size": 1, "timestamp": "2026-05-05T06:26:09.305647"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81270, "epoch": 0, "train_loss": 3.602700486779213, "train_ppl": 36.69720106081236, "lr": 0.000469843343163346, "grad_norm": 0.6992, "tokens_per_sec": 151033, "dt_s": 4.339, "eta_s": 4963, "world_size": 1, "timestamp": "2026-05-05T06:26:13.644860"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81280, "epoch": 0, "train_loss": 3.7746614515781403, "train_ppl": 43.58275094626921, "lr": 0.0004694808314752211, "grad_norm": 0.7336, "tokens_per_sec": 148801, "dt_s": 4.404, "eta_s": 4972, "world_size": 1, "timestamp": "2026-05-05T06:26:18.049115"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81290, "epoch": 0, "train_loss": 3.7296204417943954, "train_ppl": 41.66329151838206, "lr": 0.00046911831978709625, "grad_norm": 0.7178, "tokens_per_sec": 150677, "dt_s": 4.349, "eta_s": 4970, "world_size": 1, "timestamp": "2026-05-05T06:26:22.398535"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81300, "epoch": 0, "train_loss": 3.6432906836271286, "train_ppl": 38.21739137585644, "lr": 0.00046875580809897137, "grad_norm": 0.6669, "tokens_per_sec": 148577, "dt_s": 4.411, "eta_s": 4973, "world_size": 1, "timestamp": "2026-05-05T06:26:26.809467"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81310, "epoch": 0, "train_loss": 3.682658851146698, "train_ppl": 39.75194819708292, "lr": 0.00046839329641084655, "grad_norm": 0.6691, "tokens_per_sec": 149611, "dt_s": 4.38, "eta_s": 4978, "world_size": 1, "timestamp": "2026-05-05T06:26:31.189909"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81320, "epoch": 0, "train_loss": 3.743831589818001, "train_ppl": 42.25960181252355, "lr": 0.0004680307847227217, "grad_norm": 0.8148, "tokens_per_sec": 151446, "dt_s": 4.327, "eta_s": 4971, "world_size": 1, "timestamp": "2026-05-05T06:26:35.517269"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81330, "epoch": 0, "train_loss": 3.7034154385328293, "train_ppl": 40.58568582409925, "lr": 0.0004676682730345968, "grad_norm": 0.6893, "tokens_per_sec": 147823, "dt_s": 4.433, "eta_s": 4973, "world_size": 1, "timestamp": "2026-05-05T06:26:39.950709"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81340, "epoch": 0, "train_loss": 3.8390640020370483, "train_ppl": 46.48194706443337, "lr": 0.0004673057613464719, "grad_norm": 0.7736, "tokens_per_sec": 150280, "dt_s": 4.361, "eta_s": 4972, "world_size": 1, "timestamp": "2026-05-05T06:26:44.311596"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81350, "epoch": 0, "train_loss": 3.674143061041832, "train_ppl": 39.414866246605335, "lr": 0.0004669432496583471, "grad_norm": 0.7476, "tokens_per_sec": 150685, "dt_s": 4.349, "eta_s": 4953, "world_size": 1, "timestamp": "2026-05-05T06:26:48.660795"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81360, "epoch": 0, "train_loss": 3.722981035709381, "train_ppl": 41.387588272844766, "lr": 0.0004665807379702222, "grad_norm": 0.6872, "tokens_per_sec": 149952, "dt_s": 4.37, "eta_s": 4947, "world_size": 1, "timestamp": "2026-05-05T06:26:53.031282"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81370, "epoch": 0, "train_loss": 3.734901413321495, "train_ppl": 41.883896165348474, "lr": 0.00046621822628209735, "grad_norm": 0.6533, "tokens_per_sec": 150363, "dt_s": 4.359, "eta_s": 4949, "world_size": 1, "timestamp": "2026-05-05T06:26:57.389802"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81380, "epoch": 0, "train_loss": 3.679299384355545, "train_ppl": 39.618626916826805, "lr": 0.0004658557145939725, "grad_norm": 0.675, "tokens_per_sec": 150919, "dt_s": 4.342, "eta_s": 4924, "world_size": 1, "timestamp": "2026-05-05T06:27:01.732261"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81390, "epoch": 0, "train_loss": 3.6410226076841354, "train_ppl": 38.13080965394909, "lr": 0.0004654932029058476, "grad_norm": 0.6756, "tokens_per_sec": 148363, "dt_s": 4.417, "eta_s": 4933, "world_size": 1, "timestamp": "2026-05-05T06:27:06.149535"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81400, "epoch": 0, "train_loss": 3.6415965259075165, "train_ppl": 38.152699901486905, "lr": 0.0004651306912177227, "grad_norm": 0.724, "tokens_per_sec": 150926, "dt_s": 4.342, "eta_s": 4927, "world_size": 1, "timestamp": "2026-05-05T06:27:10.491767"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81410, "epoch": 0, "train_loss": 3.8486613780260086, "train_ppl": 46.93019936937317, "lr": 0.0004647681795295979, "grad_norm": 0.8409, "tokens_per_sec": 151091, "dt_s": 4.338, "eta_s": 4915, "world_size": 1, "timestamp": "2026-05-05T06:27:14.829331"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81420, "epoch": 0, "train_loss": 3.744506612420082, "train_ppl": 42.28813762897895, "lr": 0.000464405667841473, "grad_norm": 0.6567, "tokens_per_sec": 148354, "dt_s": 4.418, "eta_s": 4924, "world_size": 1, "timestamp": "2026-05-05T06:27:19.246847"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81430, "epoch": 0, "train_loss": 3.7286239862442017, "train_ppl": 41.62179657767466, "lr": 0.00046404315615334815, "grad_norm": 0.6935, "tokens_per_sec": 150362, "dt_s": 4.359, "eta_s": 4923, "world_size": 1, "timestamp": "2026-05-05T06:27:23.605377"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81440, "epoch": 0, "train_loss": 3.591883569955826, "train_ppl": 36.30238965991948, "lr": 0.00046368064446522327, "grad_norm": 0.6491, "tokens_per_sec": 148763, "dt_s": 4.405, "eta_s": 4916, "world_size": 1, "timestamp": "2026-05-05T06:27:28.010751"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81450, "epoch": 0, "train_loss": 3.581229194998741, "train_ppl": 35.91766353619803, "lr": 0.00046331813277709845, "grad_norm": 0.6805, "tokens_per_sec": 148906, "dt_s": 4.401, "eta_s": 4925, "world_size": 1, "timestamp": "2026-05-05T06:27:32.411918"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81460, "epoch": 0, "train_loss": 3.7541347444057465, "train_ppl": 42.69725978003774, "lr": 0.0004629556210889736, "grad_norm": 0.7034, "tokens_per_sec": 134484, "dt_s": 4.873, "eta_s": 5041, "world_size": 1, "timestamp": "2026-05-05T06:27:37.285071"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81470, "epoch": 0, "train_loss": 3.7294343262910843, "train_ppl": 41.655538055453725, "lr": 0.0004625931094008487, "grad_norm": 0.6385, "tokens_per_sec": 147502, "dt_s": 4.443, "eta_s": 5042, "world_size": 1, "timestamp": "2026-05-05T06:27:41.728112"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81480, "epoch": 0, "train_loss": 3.7479959577322006, "train_ppl": 42.43595328372602, "lr": 0.0004622305977127238, "grad_norm": 0.8596, "tokens_per_sec": 151213, "dt_s": 4.334, "eta_s": 5032, "world_size": 1, "timestamp": "2026-05-05T06:27:46.062137"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81490, "epoch": 0, "train_loss": 3.665031746029854, "train_ppl": 39.05737606004476, "lr": 0.00046186808602459895, "grad_norm": 0.663, "tokens_per_sec": 151815, "dt_s": 4.317, "eta_s": 5008, "world_size": 1, "timestamp": "2026-05-05T06:27:50.378949"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81500, "epoch": 0, "train_loss": 3.623459205031395, "train_ppl": 37.46694977485839, "lr": 0.00046150557433647407, "grad_norm": 0.6452, "tokens_per_sec": 148453, "dt_s": 4.415, "eta_s": 5006, "world_size": 1, "timestamp": "2026-05-05T06:27:54.793565"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81510, "epoch": 0, "train_loss": 3.645747646689415, "train_ppl": 38.311405542176495, "lr": 0.00046114306264834925, "grad_norm": 0.6565, "tokens_per_sec": 127898, "dt_s": 5.124, "eta_s": 4885, "world_size": 1, "timestamp": "2026-05-05T06:27:59.917636"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81520, "epoch": 0, "train_loss": 3.7084817588329315, "train_ppl": 40.79182765743057, "lr": 0.0004607805509602244, "grad_norm": 0.7005, "tokens_per_sec": 145818, "dt_s": 4.494, "eta_s": 4892, "world_size": 1, "timestamp": "2026-05-05T06:28:04.411991"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81530, "epoch": 0, "train_loss": 3.6646698862314224, "train_ppl": 39.0432453226435, "lr": 0.0004604180392720995, "grad_norm": 0.8915, "tokens_per_sec": 145612, "dt_s": 4.501, "eta_s": 4925, "world_size": 1, "timestamp": "2026-05-05T06:28:08.912732"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81540, "epoch": 0, "train_loss": 3.769098609685898, "train_ppl": 43.34098008353842, "lr": 0.0004600555275839746, "grad_norm": 0.6655, "tokens_per_sec": 148954, "dt_s": 4.4, "eta_s": 4939, "world_size": 1, "timestamp": "2026-05-05T06:28:13.312469"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81550, "epoch": 0, "train_loss": 3.6712471544742584, "train_ppl": 39.30088958911054, "lr": 0.0004596930158958498, "grad_norm": 0.6811, "tokens_per_sec": 147446, "dt_s": 4.445, "eta_s": 4941, "world_size": 1, "timestamp": "2026-05-05T06:28:17.757225"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81560, "epoch": 0, "train_loss": 3.626888945698738, "train_ppl": 37.59567231246199, "lr": 0.0004593305042077249, "grad_norm": 0.6741, "tokens_per_sec": 149578, "dt_s": 4.381, "eta_s": 4944, "world_size": 1, "timestamp": "2026-05-05T06:28:22.138598"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81570, "epoch": 0, "train_loss": 3.6668121963739395, "train_ppl": 39.1269777214545, "lr": 0.00045896799251960005, "grad_norm": 0.6885, "tokens_per_sec": 150224, "dt_s": 4.363, "eta_s": 4910, "world_size": 1, "timestamp": "2026-05-05T06:28:26.501155"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81580, "epoch": 0, "train_loss": 3.6373549103736877, "train_ppl": 37.99121354056816, "lr": 0.00045860548083147517, "grad_norm": 0.7312, "tokens_per_sec": 147208, "dt_s": 4.452, "eta_s": 4895, "world_size": 1, "timestamp": "2026-05-05T06:28:30.953076"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81590, "epoch": 0, "train_loss": 3.7129265666007996, "train_ppl": 40.97354303566142, "lr": 0.0004582429691433503, "grad_norm": 0.7434, "tokens_per_sec": 151054, "dt_s": 4.339, "eta_s": 4877, "world_size": 1, "timestamp": "2026-05-05T06:28:35.291683"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81600, "epoch": 0, "train_loss": 3.7765463143587112, "train_ppl": 43.66497591845414, "lr": 0.0004578804574552254, "grad_norm": 0.7773, "tokens_per_sec": 147426, "dt_s": 4.445, "eta_s": 4872, "world_size": 1, "timestamp": "2026-05-05T06:28:39.737013"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81610, "epoch": 0, "train_loss": 3.6695363372564316, "train_ppl": 39.23371043253998, "lr": 0.00045751794576710054, "grad_norm": 0.6374, "tokens_per_sec": 148946, "dt_s": 4.4, "eta_s": 4872, "world_size": 1, "timestamp": "2026-05-05T06:28:44.136993"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81620, "epoch": 0, "train_loss": 3.6473096013069153, "train_ppl": 38.371292977518486, "lr": 0.0004571554340789757, "grad_norm": 0.6524, "tokens_per_sec": 149514, "dt_s": 4.383, "eta_s": 4872, "world_size": 1, "timestamp": "2026-05-05T06:28:48.520280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81630, "epoch": 0, "train_loss": 3.5673695877194405, "train_ppl": 35.42329263205109, "lr": 0.00045679292239085085, "grad_norm": 0.7078, "tokens_per_sec": 148576, "dt_s": 4.411, "eta_s": 4859, "world_size": 1, "timestamp": "2026-05-05T06:28:52.931225"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81640, "epoch": 0, "train_loss": 3.619526281952858, "train_ppl": 37.31988453111767, "lr": 0.00045643041070272597, "grad_norm": 0.7291, "tokens_per_sec": 152245, "dt_s": 4.305, "eta_s": 4847, "world_size": 1, "timestamp": "2026-05-05T06:28:57.235853"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81650, "epoch": 0, "train_loss": 3.6616032868623734, "train_ppl": 38.92369872561595, "lr": 0.00045606789901460115, "grad_norm": 0.7165, "tokens_per_sec": 150746, "dt_s": 4.347, "eta_s": 4821, "world_size": 1, "timestamp": "2026-05-05T06:29:01.583338"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81660, "epoch": 0, "train_loss": 3.7115001529455185, "train_ppl": 40.915139478088726, "lr": 0.0004557053873264763, "grad_norm": 0.6722, "tokens_per_sec": 148664, "dt_s": 4.408, "eta_s": 4818, "world_size": 1, "timestamp": "2026-05-05T06:29:05.991632"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81670, "epoch": 0, "train_loss": 3.6629574447870255, "train_ppl": 38.97644326485992, "lr": 0.0004553428756383514, "grad_norm": 0.6836, "tokens_per_sec": 150130, "dt_s": 4.365, "eta_s": 4810, "world_size": 1, "timestamp": "2026-05-05T06:29:10.356936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81680, "epoch": 0, "train_loss": 3.6617034524679184, "train_ppl": 38.927597736738996, "lr": 0.0004549803639502265, "grad_norm": 0.6982, "tokens_per_sec": 151065, "dt_s": 4.338, "eta_s": 4790, "world_size": 1, "timestamp": "2026-05-05T06:29:14.695219"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81690, "epoch": 0, "train_loss": 3.65877041220665, "train_ppl": 38.813588803430726, "lr": 0.00045461785226210165, "grad_norm": 0.6548, "tokens_per_sec": 147955, "dt_s": 4.429, "eta_s": 4813, "world_size": 1, "timestamp": "2026-05-05T06:29:19.124665"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81700, "epoch": 0, "train_loss": 3.6480915397405624, "train_ppl": 38.40130869994046, "lr": 0.00045425534057397677, "grad_norm": 0.6685, "tokens_per_sec": 150777, "dt_s": 4.347, "eta_s": 4808, "world_size": 1, "timestamp": "2026-05-05T06:29:23.471226"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81710, "epoch": 0, "train_loss": 3.641873076558113, "train_ppl": 38.16325251456543, "lr": 0.0004538928288858519, "grad_norm": 0.6744, "tokens_per_sec": 149991, "dt_s": 4.369, "eta_s": 4795, "world_size": 1, "timestamp": "2026-05-05T06:29:27.840556"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81720, "epoch": 0, "train_loss": 3.705004319548607, "train_ppl": 40.65022290711326, "lr": 0.00045353031719772707, "grad_norm": 0.6688, "tokens_per_sec": 149797, "dt_s": 4.375, "eta_s": 4793, "world_size": 1, "timestamp": "2026-05-05T06:29:32.215531"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81730, "epoch": 0, "train_loss": 3.499556079506874, "train_ppl": 33.10075459340091, "lr": 0.0004531678055096022, "grad_norm": 0.6571, "tokens_per_sec": 150869, "dt_s": 4.344, "eta_s": 4790, "world_size": 1, "timestamp": "2026-05-05T06:29:36.559434"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81740, "epoch": 0, "train_loss": 3.603410020470619, "train_ppl": 36.72324820091429, "lr": 0.0004528052938214773, "grad_norm": 0.6841, "tokens_per_sec": 146970, "dt_s": 4.459, "eta_s": 4792, "world_size": 1, "timestamp": "2026-05-05T06:29:41.018573"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81750, "epoch": 0, "train_loss": 3.593021869659424, "train_ppl": 36.34373618721466, "lr": 0.0004524427821333525, "grad_norm": 0.6893, "tokens_per_sec": 132809, "dt_s": 4.935, "eta_s": 4916, "world_size": 1, "timestamp": "2026-05-05T06:29:45.953189"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81760, "epoch": 0, "train_loss": 3.684838369488716, "train_ppl": 39.83868278278502, "lr": 0.0004520802704452276, "grad_norm": 0.649, "tokens_per_sec": 150207, "dt_s": 4.363, "eta_s": 4910, "world_size": 1, "timestamp": "2026-05-05T06:29:50.316223"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81770, "epoch": 0, "train_loss": 3.610943153500557, "train_ppl": 37.00093392347077, "lr": 0.00045171775875710275, "grad_norm": 0.7622, "tokens_per_sec": 147542, "dt_s": 4.442, "eta_s": 4921, "world_size": 1, "timestamp": "2026-05-05T06:29:54.758065"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81780, "epoch": 0, "train_loss": 3.7151350528001785, "train_ppl": 41.064132536005815, "lr": 0.00045135524706897787, "grad_norm": 0.7111, "tokens_per_sec": 148911, "dt_s": 4.401, "eta_s": 4928, "world_size": 1, "timestamp": "2026-05-05T06:29:59.159067"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81790, "epoch": 0, "train_loss": 3.8263669461011887, "train_ppl": 45.8954941738665, "lr": 0.000450992735380853, "grad_norm": 0.7138, "tokens_per_sec": 149168, "dt_s": 4.393, "eta_s": 4910, "world_size": 1, "timestamp": "2026-05-05T06:30:03.552502"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81800, "epoch": 0, "train_loss": 3.6909951865673065, "train_ppl": 40.08471888778253, "lr": 0.0004506302236927281, "grad_norm": 0.6636, "tokens_per_sec": 148677, "dt_s": 4.408, "eta_s": 4790, "world_size": 1, "timestamp": "2026-05-05T06:30:07.960466"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81810, "epoch": 0, "train_loss": 3.606060117483139, "train_ppl": 36.820697439196906, "lr": 0.00045026771200460324, "grad_norm": 0.7114, "tokens_per_sec": 150698, "dt_s": 4.349, "eta_s": 4783, "world_size": 1, "timestamp": "2026-05-05T06:30:12.309286"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81820, "epoch": 0, "train_loss": 3.81881807744503, "train_ppl": 45.55033951907966, "lr": 0.0004499052003164784, "grad_norm": 0.6892, "tokens_per_sec": 149578, "dt_s": 4.381, "eta_s": 4765, "world_size": 1, "timestamp": "2026-05-05T06:30:16.690667"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81830, "epoch": 0, "train_loss": 3.657913938164711, "train_ppl": 38.78036020389381, "lr": 0.00044954268862835355, "grad_norm": 0.6837, "tokens_per_sec": 148157, "dt_s": 4.423, "eta_s": 4766, "world_size": 1, "timestamp": "2026-05-05T06:30:21.114090"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81840, "epoch": 0, "train_loss": 3.6671667397022247, "train_ppl": 39.140852389801815, "lr": 0.00044918017694022867, "grad_norm": 0.687, "tokens_per_sec": 149187, "dt_s": 4.393, "eta_s": 4761, "world_size": 1, "timestamp": "2026-05-05T06:30:25.506968"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81850, "epoch": 0, "train_loss": 3.6656389385461807, "train_ppl": 39.08109860784369, "lr": 0.00044881766525210385, "grad_norm": 0.6669, "tokens_per_sec": 148435, "dt_s": 4.415, "eta_s": 4759, "world_size": 1, "timestamp": "2026-05-05T06:30:29.922114"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81860, "epoch": 0, "train_loss": 3.644529312849045, "train_ppl": 38.26475788231368, "lr": 0.00044845515356397897, "grad_norm": 0.7359, "tokens_per_sec": 150630, "dt_s": 4.351, "eta_s": 4755, "world_size": 1, "timestamp": "2026-05-05T06:30:34.272870"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81870, "epoch": 0, "train_loss": 3.5698053538799286, "train_ppl": 35.509680657340276, "lr": 0.0004480926418758541, "grad_norm": 0.7145, "tokens_per_sec": 150875, "dt_s": 4.344, "eta_s": 4742, "world_size": 1, "timestamp": "2026-05-05T06:30:38.616616"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81880, "epoch": 0, "train_loss": 3.602553352713585, "train_ppl": 36.69180204962208, "lr": 0.0004477301301877292, "grad_norm": 0.6813, "tokens_per_sec": 146723, "dt_s": 4.467, "eta_s": 4747, "world_size": 1, "timestamp": "2026-05-05T06:30:43.083250"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81890, "epoch": 0, "train_loss": 3.5158654153347015, "train_ppl": 33.64503225528923, "lr": 0.00044736761849960434, "grad_norm": 0.7093, "tokens_per_sec": 150597, "dt_s": 4.352, "eta_s": 4734, "world_size": 1, "timestamp": "2026-05-05T06:30:47.434978"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81900, "epoch": 0, "train_loss": 3.6019522845745087, "train_ppl": 36.669754403177464, "lr": 0.00044700510681147947, "grad_norm": 0.6453, "tokens_per_sec": 148671, "dt_s": 4.408, "eta_s": 4728, "world_size": 1, "timestamp": "2026-05-05T06:30:51.843093"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81910, "epoch": 0, "train_loss": 3.713289812207222, "train_ppl": 40.98842919865166, "lr": 0.0004466425951233546, "grad_norm": 0.6693, "tokens_per_sec": 146700, "dt_s": 4.467, "eta_s": 4749, "world_size": 1, "timestamp": "2026-05-05T06:30:56.310448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81920, "epoch": 0, "train_loss": 3.7521200329065323, "train_ppl": 42.61132371702945, "lr": 0.00044628008343522977, "grad_norm": 0.7112, "tokens_per_sec": 150654, "dt_s": 4.35, "eta_s": 4746, "world_size": 1, "timestamp": "2026-05-05T06:31:00.660544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81930, "epoch": 0, "train_loss": 3.6945641040802, "train_ppl": 40.22803352997016, "lr": 0.0004459175717471049, "grad_norm": 0.6643, "tokens_per_sec": 150123, "dt_s": 4.365, "eta_s": 4719, "world_size": 1, "timestamp": "2026-05-05T06:31:05.026021"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81940, "epoch": 0, "train_loss": 3.5720872581005096, "train_ppl": 35.59080286886131, "lr": 0.00044555506005898, "grad_norm": 0.7167, "tokens_per_sec": 150189, "dt_s": 4.364, "eta_s": 4718, "world_size": 1, "timestamp": "2026-05-05T06:31:09.389602"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81950, "epoch": 0, "train_loss": 3.751992776989937, "train_ppl": 42.6059015189832, "lr": 0.0004451925483708552, "grad_norm": 0.7896, "tokens_per_sec": 150243, "dt_s": 4.362, "eta_s": 4703, "world_size": 1, "timestamp": "2026-05-05T06:31:13.751605"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81960, "epoch": 0, "train_loss": 3.647364556789398, "train_ppl": 38.37340174838128, "lr": 0.0004448300366827303, "grad_norm": 0.7484, "tokens_per_sec": 148624, "dt_s": 4.41, "eta_s": 4687, "world_size": 1, "timestamp": "2026-05-05T06:31:18.161122"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81970, "epoch": 0, "train_loss": 3.6629447788000107, "train_ppl": 38.97594959286207, "lr": 0.00044446752499460544, "grad_norm": 0.6764, "tokens_per_sec": 150842, "dt_s": 4.345, "eta_s": 4681, "world_size": 1, "timestamp": "2026-05-05T06:31:22.505794"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81980, "epoch": 0, "train_loss": 3.6627930402755737, "train_ppl": 38.97003588846206, "lr": 0.00044410501330648057, "grad_norm": 0.717, "tokens_per_sec": 151710, "dt_s": 4.32, "eta_s": 4667, "world_size": 1, "timestamp": "2026-05-05T06:31:26.825619"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 81990, "epoch": 0, "train_loss": 3.693933829665184, "train_ppl": 40.202686818200846, "lr": 0.0004437425016183557, "grad_norm": 0.6954, "tokens_per_sec": 148541, "dt_s": 4.412, "eta_s": 4673, "world_size": 1, "timestamp": "2026-05-05T06:31:31.237604"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82000, "epoch": 0, "train_loss": 3.6230524629354477, "train_ppl": 37.4517134880079, "lr": 0.0004433799899302308, "grad_norm": 0.6476, "tokens_per_sec": 150357, "dt_s": 4.359, "eta_s": 4668, "world_size": 1, "timestamp": "2026-05-05T06:31:35.596306"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82010, "epoch": 0, "train_loss": 3.6998208463191986, "train_ppl": 40.440058725673204, "lr": 0.00044301747824210594, "grad_norm": 0.6905, "tokens_per_sec": 126611, "dt_s": 5.176, "eta_s": 4664, "world_size": 1, "timestamp": "2026-05-05T06:31:40.772451"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82020, "epoch": 0, "train_loss": 3.6699801087379456, "train_ppl": 39.25112509812409, "lr": 0.0004426549665539811, "grad_norm": 0.6765, "tokens_per_sec": 148074, "dt_s": 4.426, "eta_s": 4677, "world_size": 1, "timestamp": "2026-05-05T06:31:45.198383"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82030, "epoch": 0, "train_loss": 3.7248449474573135, "train_ppl": 41.464803023230544, "lr": 0.00044229245486585624, "grad_norm": 0.7842, "tokens_per_sec": 150421, "dt_s": 4.357, "eta_s": 4680, "world_size": 1, "timestamp": "2026-05-05T06:31:49.555210"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82040, "epoch": 0, "train_loss": 3.624847576022148, "train_ppl": 37.51900392791432, "lr": 0.00044192994317773137, "grad_norm": 0.6624, "tokens_per_sec": 145911, "dt_s": 4.492, "eta_s": 4693, "world_size": 1, "timestamp": "2026-05-05T06:31:54.046722"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82050, "epoch": 0, "train_loss": 3.734827294945717, "train_ppl": 41.88079191403589, "lr": 0.00044156743148960655, "grad_norm": 0.683, "tokens_per_sec": 134850, "dt_s": 4.86, "eta_s": 4795, "world_size": 1, "timestamp": "2026-05-05T06:31:58.906665"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82060, "epoch": 0, "train_loss": 3.7142594903707504, "train_ppl": 41.02819405984588, "lr": 0.00044120491980148167, "grad_norm": 0.7099, "tokens_per_sec": 150145, "dt_s": 4.365, "eta_s": 4781, "world_size": 1, "timestamp": "2026-05-05T06:32:03.271533"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82070, "epoch": 0, "train_loss": 3.6210083663463593, "train_ppl": 37.37523675775203, "lr": 0.0004408424081133568, "grad_norm": 0.7021, "tokens_per_sec": 147917, "dt_s": 4.431, "eta_s": 4777, "world_size": 1, "timestamp": "2026-05-05T06:32:07.702106"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82080, "epoch": 0, "train_loss": 3.609932169318199, "train_ppl": 36.9635454672951, "lr": 0.0004404798964252319, "grad_norm": 0.7431, "tokens_per_sec": 150639, "dt_s": 4.351, "eta_s": 4771, "world_size": 1, "timestamp": "2026-05-05T06:32:12.052641"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82090, "epoch": 0, "train_loss": 3.753660038113594, "train_ppl": 42.67699593223105, "lr": 0.00044011738473710704, "grad_norm": 0.7749, "tokens_per_sec": 150170, "dt_s": 4.364, "eta_s": 4740, "world_size": 1, "timestamp": "2026-05-05T06:32:16.416751"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82100, "epoch": 0, "train_loss": 3.6794838309288025, "train_ppl": 39.62593511076372, "lr": 0.00043975487304898217, "grad_norm": 0.692, "tokens_per_sec": 149165, "dt_s": 4.394, "eta_s": 4637, "world_size": 1, "timestamp": "2026-05-05T06:32:20.810288"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82110, "epoch": 0, "train_loss": 3.6419342309236526, "train_ppl": 38.165586435423876, "lr": 0.0004393923613608573, "grad_norm": 0.7179, "tokens_per_sec": 149649, "dt_s": 4.379, "eta_s": 4635, "world_size": 1, "timestamp": "2026-05-05T06:32:25.189592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82120, "epoch": 0, "train_loss": 3.615654692053795, "train_ppl": 37.1756765802859, "lr": 0.00043902984967273247, "grad_norm": 0.6666, "tokens_per_sec": 147658, "dt_s": 4.438, "eta_s": 4632, "world_size": 1, "timestamp": "2026-05-05T06:32:29.627970"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82130, "epoch": 0, "train_loss": 3.5374848693609238, "train_ppl": 34.38033931186026, "lr": 0.0004386673379846076, "grad_norm": 0.6994, "tokens_per_sec": 148908, "dt_s": 4.401, "eta_s": 4639, "world_size": 1, "timestamp": "2026-05-05T06:32:34.029094"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82140, "epoch": 0, "train_loss": 3.6893945783376694, "train_ppl": 40.020610276920024, "lr": 0.0004383048262964827, "grad_norm": 0.7549, "tokens_per_sec": 149944, "dt_s": 4.371, "eta_s": 4636, "world_size": 1, "timestamp": "2026-05-05T06:32:38.399805"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82150, "epoch": 0, "train_loss": 3.7905670404434204, "train_ppl": 44.281502561152905, "lr": 0.0004379423146083579, "grad_norm": 0.7168, "tokens_per_sec": 148770, "dt_s": 4.405, "eta_s": 4634, "world_size": 1, "timestamp": "2026-05-05T06:32:42.804976"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82160, "epoch": 0, "train_loss": 3.7732872664928436, "train_ppl": 43.522901311598574, "lr": 0.000437579802920233, "grad_norm": 0.7283, "tokens_per_sec": 150564, "dt_s": 4.353, "eta_s": 4624, "world_size": 1, "timestamp": "2026-05-05T06:32:47.157702"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82170, "epoch": 0, "train_loss": 3.6241065114736557, "train_ppl": 37.491210223944904, "lr": 0.00043721729123210814, "grad_norm": 0.6976, "tokens_per_sec": 150199, "dt_s": 4.363, "eta_s": 4604, "world_size": 1, "timestamp": "2026-05-05T06:32:51.520971"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82180, "epoch": 0, "train_loss": 3.6227611303329468, "train_ppl": 37.44080417204618, "lr": 0.00043685477954398327, "grad_norm": 0.7299, "tokens_per_sec": 149109, "dt_s": 4.395, "eta_s": 4598, "world_size": 1, "timestamp": "2026-05-05T06:32:55.916145"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82190, "epoch": 0, "train_loss": 3.703952729701996, "train_ppl": 40.607498013910856, "lr": 0.0004364922678558584, "grad_norm": 0.7297, "tokens_per_sec": 149032, "dt_s": 4.397, "eta_s": 4599, "world_size": 1, "timestamp": "2026-05-05T06:33:00.313592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82200, "epoch": 0, "train_loss": 3.7136306762695312, "train_ppl": 41.0024030625947, "lr": 0.0004361297561677335, "grad_norm": 0.7289, "tokens_per_sec": 148797, "dt_s": 4.404, "eta_s": 4595, "world_size": 1, "timestamp": "2026-05-05T06:33:04.717991"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82210, "epoch": 0, "train_loss": 3.6564908921718597, "train_ppl": 38.72521321536118, "lr": 0.00043576724447960864, "grad_norm": 0.7081, "tokens_per_sec": 149262, "dt_s": 4.391, "eta_s": 4598, "world_size": 1, "timestamp": "2026-05-05T06:33:09.108661"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82220, "epoch": 0, "train_loss": 3.5818191170692444, "train_ppl": 35.9388584096861, "lr": 0.0004354047327914838, "grad_norm": 0.6925, "tokens_per_sec": 150357, "dt_s": 4.359, "eta_s": 4593, "world_size": 1, "timestamp": "2026-05-05T06:33:13.467370"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82230, "epoch": 0, "train_loss": 3.764690324664116, "train_ppl": 43.15034119421033, "lr": 0.00043504222110335894, "grad_norm": 0.7475, "tokens_per_sec": 148856, "dt_s": 4.403, "eta_s": 4590, "world_size": 1, "timestamp": "2026-05-05T06:33:17.870019"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82240, "epoch": 0, "train_loss": 3.6600436121225357, "train_ppl": 38.86303773393768, "lr": 0.00043467970941523407, "grad_norm": 0.7168, "tokens_per_sec": 149998, "dt_s": 4.369, "eta_s": 4580, "world_size": 1, "timestamp": "2026-05-05T06:33:22.239113"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82250, "epoch": 0, "train_loss": 3.7002297937870026, "train_ppl": 40.45659996730583, "lr": 0.00043431719772710924, "grad_norm": 0.6784, "tokens_per_sec": 150716, "dt_s": 4.348, "eta_s": 4564, "world_size": 1, "timestamp": "2026-05-05T06:33:26.587435"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82260, "epoch": 0, "train_loss": 3.5986236333847046, "train_ppl": 36.547896505286516, "lr": 0.00043395468603898437, "grad_norm": 0.7065, "tokens_per_sec": 149098, "dt_s": 4.395, "eta_s": 4560, "world_size": 1, "timestamp": "2026-05-05T06:33:30.982945"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82270, "epoch": 0, "train_loss": 3.8004802018404007, "train_ppl": 44.722655239095836, "lr": 0.0004335921743508595, "grad_norm": 0.7101, "tokens_per_sec": 150612, "dt_s": 4.351, "eta_s": 4554, "world_size": 1, "timestamp": "2026-05-05T06:33:35.334263"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82280, "epoch": 0, "train_loss": 3.6547039449214935, "train_ppl": 38.656075093572, "lr": 0.0004332296626627346, "grad_norm": 0.7357, "tokens_per_sec": 150936, "dt_s": 4.342, "eta_s": 4537, "world_size": 1, "timestamp": "2026-05-05T06:33:39.676234"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82290, "epoch": 0, "train_loss": 3.6267805099487305, "train_ppl": 37.591595818560585, "lr": 0.00043286715097460974, "grad_norm": 0.6819, "tokens_per_sec": 148248, "dt_s": 4.421, "eta_s": 4544, "world_size": 1, "timestamp": "2026-05-05T06:33:44.096916"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82300, "epoch": 0, "train_loss": 3.7738392800092697, "train_ppl": 43.54693317374277, "lr": 0.00043250463928648487, "grad_norm": 0.7045, "tokens_per_sec": 150653, "dt_s": 4.35, "eta_s": 4540, "world_size": 1, "timestamp": "2026-05-05T06:33:48.447048"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82310, "epoch": 0, "train_loss": 3.6841016709804535, "train_ppl": 39.80934449267196, "lr": 0.00043214212759836, "grad_norm": 0.7367, "tokens_per_sec": 150073, "dt_s": 4.367, "eta_s": 4529, "world_size": 1, "timestamp": "2026-05-05T06:33:52.813976"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82320, "epoch": 0, "train_loss": 3.7009539753198624, "train_ppl": 40.48590850095298, "lr": 0.00043177961591023517, "grad_norm": 0.6667, "tokens_per_sec": 148700, "dt_s": 4.407, "eta_s": 4537, "world_size": 1, "timestamp": "2026-05-05T06:33:57.221246"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82330, "epoch": 0, "train_loss": 3.509482815861702, "train_ppl": 33.430973342984274, "lr": 0.00043141710422211024, "grad_norm": 0.8951, "tokens_per_sec": 150561, "dt_s": 4.353, "eta_s": 4535, "world_size": 1, "timestamp": "2026-05-05T06:34:01.574017"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82340, "epoch": 0, "train_loss": 3.5400378853082657, "train_ppl": 34.46822500545134, "lr": 0.0004310545925339854, "grad_norm": 0.68, "tokens_per_sec": 133109, "dt_s": 4.924, "eta_s": 4634, "world_size": 1, "timestamp": "2026-05-05T06:34:06.497514"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82350, "epoch": 0, "train_loss": 3.531952887773514, "train_ppl": 34.19067300676532, "lr": 0.0004306920808458606, "grad_norm": 0.676, "tokens_per_sec": 148760, "dt_s": 4.405, "eta_s": 4641, "world_size": 1, "timestamp": "2026-05-05T06:34:10.903018"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82360, "epoch": 0, "train_loss": 3.6348406821489334, "train_ppl": 37.89581493637736, "lr": 0.0004303295691577357, "grad_norm": 0.7531, "tokens_per_sec": 150030, "dt_s": 4.368, "eta_s": 4637, "world_size": 1, "timestamp": "2026-05-05T06:34:15.271208"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82370, "epoch": 0, "train_loss": 3.646297499537468, "train_ppl": 38.33247697018824, "lr": 0.00042996705746961084, "grad_norm": 0.6653, "tokens_per_sec": 148759, "dt_s": 4.406, "eta_s": 4632, "world_size": 1, "timestamp": "2026-05-05T06:34:19.676703"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82380, "epoch": 0, "train_loss": 3.720603123307228, "train_ppl": 41.28928913308644, "lr": 0.00042960454578148597, "grad_norm": 0.6772, "tokens_per_sec": 150459, "dt_s": 4.356, "eta_s": 4628, "world_size": 1, "timestamp": "2026-05-05T06:34:24.032445"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82390, "epoch": 0, "train_loss": 3.6653234362602234, "train_ppl": 39.06877037679, "lr": 0.0004292420340933611, "grad_norm": 0.6875, "tokens_per_sec": 150872, "dt_s": 4.344, "eta_s": 4504, "world_size": 1, "timestamp": "2026-05-05T06:34:28.376258"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82400, "epoch": 0, "train_loss": 3.776622638106346, "train_ppl": 43.668308720240965, "lr": 0.0004288795224052362, "grad_norm": 0.7159, "tokens_per_sec": 148219, "dt_s": 4.422, "eta_s": 4503, "world_size": 1, "timestamp": "2026-05-05T06:34:32.797833"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82410, "epoch": 0, "train_loss": 3.7430760711431503, "train_ppl": 42.22768595219485, "lr": 0.00042851701071711134, "grad_norm": 0.6949, "tokens_per_sec": 149455, "dt_s": 4.385, "eta_s": 4502, "world_size": 1, "timestamp": "2026-05-05T06:34:37.182836"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82420, "epoch": 0, "train_loss": 3.6248816549777985, "train_ppl": 37.52028255817229, "lr": 0.0004281544990289865, "grad_norm": 0.6791, "tokens_per_sec": 148568, "dt_s": 4.411, "eta_s": 4499, "world_size": 1, "timestamp": "2026-05-05T06:34:41.594006"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82430, "epoch": 0, "train_loss": 3.693085089325905, "train_ppl": 40.16857965226208, "lr": 0.0004277919873408616, "grad_norm": 0.7411, "tokens_per_sec": 147874, "dt_s": 4.432, "eta_s": 4510, "world_size": 1, "timestamp": "2026-05-05T06:34:46.025874"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82440, "epoch": 0, "train_loss": 3.65084171295166, "train_ppl": 38.50706430686021, "lr": 0.00042742947565273676, "grad_norm": 0.7244, "tokens_per_sec": 150057, "dt_s": 4.367, "eta_s": 4511, "world_size": 1, "timestamp": "2026-05-05T06:34:50.393285"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82450, "epoch": 0, "train_loss": 3.6307450383901596, "train_ppl": 37.74092458281915, "lr": 0.00042706696396461194, "grad_norm": 0.6632, "tokens_per_sec": 147872, "dt_s": 4.432, "eta_s": 4509, "world_size": 1, "timestamp": "2026-05-05T06:34:54.825247"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82460, "epoch": 0, "train_loss": 3.684160500764847, "train_ppl": 39.8116865367156, "lr": 0.00042670445227648707, "grad_norm": 0.751, "tokens_per_sec": 149556, "dt_s": 4.382, "eta_s": 4504, "world_size": 1, "timestamp": "2026-05-05T06:34:59.207275"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82470, "epoch": 0, "train_loss": 3.894349381327629, "train_ppl": 49.12408191824324, "lr": 0.0004263419405883622, "grad_norm": 0.7254, "tokens_per_sec": 150066, "dt_s": 4.367, "eta_s": 4490, "world_size": 1, "timestamp": "2026-05-05T06:35:03.574398"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82480, "epoch": 0, "train_loss": 3.625595510005951, "train_ppl": 37.54707616277101, "lr": 0.0004259794289002373, "grad_norm": 0.7229, "tokens_per_sec": 147130, "dt_s": 4.454, "eta_s": 4490, "world_size": 1, "timestamp": "2026-05-05T06:35:08.028697"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82490, "epoch": 0, "train_loss": 3.704402595758438, "train_ppl": 40.625770058582546, "lr": 0.00042561691721211244, "grad_norm": 0.7275, "tokens_per_sec": 149204, "dt_s": 4.392, "eta_s": 4491, "world_size": 1, "timestamp": "2026-05-05T06:35:12.421046"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82500, "epoch": 0, "train_loss": 3.630537360906601, "train_ppl": 37.73308745640013, "lr": 0.00042525440552398756, "grad_norm": 0.7076, "tokens_per_sec": 150428, "dt_s": 4.357, "eta_s": 4471, "world_size": 1, "timestamp": "2026-05-05T06:35:16.777692"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82510, "epoch": 0, "train_loss": 3.597399726510048, "train_ppl": 36.50319264576029, "lr": 0.0004248918938358627, "grad_norm": 0.6937, "tokens_per_sec": 106321, "dt_s": 6.164, "eta_s": 4492, "world_size": 1, "timestamp": "2026-05-05T06:35:22.941660"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82520, "epoch": 0, "train_loss": 3.7022873163223267, "train_ppl": 40.53992602666757, "lr": 0.00042452938214773787, "grad_norm": 0.7078, "tokens_per_sec": 146528, "dt_s": 4.473, "eta_s": 4509, "world_size": 1, "timestamp": "2026-05-05T06:35:27.414232"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82530, "epoch": 0, "train_loss": 3.6139347851276398, "train_ppl": 37.1117928294515, "lr": 0.00042416687045961294, "grad_norm": 0.7053, "tokens_per_sec": 146924, "dt_s": 4.461, "eta_s": 4506, "world_size": 1, "timestamp": "2026-05-05T06:35:31.874769"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82540, "epoch": 0, "train_loss": 3.6433295905590057, "train_ppl": 38.21887832622537, "lr": 0.0004238043587714881, "grad_norm": 0.6597, "tokens_per_sec": 150742, "dt_s": 4.348, "eta_s": 4492, "world_size": 1, "timestamp": "2026-05-05T06:35:36.222326"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82550, "epoch": 0, "train_loss": 3.7202146351337433, "train_ppl": 41.27325184791616, "lr": 0.0004234418470833633, "grad_norm": 0.7402, "tokens_per_sec": 150521, "dt_s": 4.354, "eta_s": 4487, "world_size": 1, "timestamp": "2026-05-05T06:35:40.576294"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82560, "epoch": 0, "train_loss": 3.7261673361063004, "train_ppl": 41.51967187906119, "lr": 0.0004230793353952384, "grad_norm": 0.7283, "tokens_per_sec": 147278, "dt_s": 4.45, "eta_s": 4472, "world_size": 1, "timestamp": "2026-05-05T06:35:45.026082"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82570, "epoch": 0, "train_loss": 3.667358085513115, "train_ppl": 39.14834254452329, "lr": 0.00042271682370711354, "grad_norm": 0.703, "tokens_per_sec": 149885, "dt_s": 4.372, "eta_s": 4447, "world_size": 1, "timestamp": "2026-05-05T06:35:49.398490"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82580, "epoch": 0, "train_loss": 3.641693264245987, "train_ppl": 38.156390908811616, "lr": 0.00042235431201898866, "grad_norm": 0.6776, "tokens_per_sec": 149109, "dt_s": 4.395, "eta_s": 4429, "world_size": 1, "timestamp": "2026-05-05T06:35:53.793685"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82590, "epoch": 0, "train_loss": 3.786439672112465, "train_ppl": 44.099113142908735, "lr": 0.0004219918003308638, "grad_norm": 0.7585, "tokens_per_sec": 148031, "dt_s": 4.427, "eta_s": 4441, "world_size": 1, "timestamp": "2026-05-05T06:35:58.220885"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82600, "epoch": 0, "train_loss": 3.6544821709394455, "train_ppl": 38.647503132422266, "lr": 0.0004216292886427389, "grad_norm": 0.7169, "tokens_per_sec": 150643, "dt_s": 4.35, "eta_s": 4436, "world_size": 1, "timestamp": "2026-05-05T06:36:02.571278"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82610, "epoch": 0, "train_loss": 3.8093560934066772, "train_ppl": 45.12137555907304, "lr": 0.00042126677695461404, "grad_norm": 0.7376, "tokens_per_sec": 150124, "dt_s": 4.365, "eta_s": 4415, "world_size": 1, "timestamp": "2026-05-05T06:36:06.936712"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82620, "epoch": 0, "train_loss": 3.705399915575981, "train_ppl": 40.66630715502973, "lr": 0.0004209042652664892, "grad_norm": 0.7746, "tokens_per_sec": 148640, "dt_s": 4.409, "eta_s": 4418, "world_size": 1, "timestamp": "2026-05-05T06:36:11.345788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82630, "epoch": 0, "train_loss": 3.578615427017212, "train_ppl": 35.82390568129313, "lr": 0.0004205417535783643, "grad_norm": 0.6774, "tokens_per_sec": 149935, "dt_s": 4.371, "eta_s": 4408, "world_size": 1, "timestamp": "2026-05-05T06:36:15.716723"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82640, "epoch": 0, "train_loss": 3.5627404898405075, "train_ppl": 35.25969369315428, "lr": 0.00042017924189023946, "grad_norm": 0.8092, "tokens_per_sec": 131739, "dt_s": 4.975, "eta_s": 4514, "world_size": 1, "timestamp": "2026-05-05T06:36:20.691422"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82650, "epoch": 0, "train_loss": 3.694627493619919, "train_ppl": 40.23058364732394, "lr": 0.00041981673020211464, "grad_norm": 0.7145, "tokens_per_sec": 150228, "dt_s": 4.362, "eta_s": 4512, "world_size": 1, "timestamp": "2026-05-05T06:36:25.053842"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82660, "epoch": 0, "train_loss": 3.6791288554668427, "train_ppl": 39.611871372430876, "lr": 0.0004194542185139897, "grad_norm": 0.7801, "tokens_per_sec": 150914, "dt_s": 4.343, "eta_s": 4503, "world_size": 1, "timestamp": "2026-05-05T06:36:29.396472"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82670, "epoch": 0, "train_loss": 3.639956384897232, "train_ppl": 38.09017538225571, "lr": 0.0004190917068258649, "grad_norm": 0.7181, "tokens_per_sec": 147194, "dt_s": 4.452, "eta_s": 4507, "world_size": 1, "timestamp": "2026-05-05T06:36:33.848815"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82680, "epoch": 0, "train_loss": 3.737374424934387, "train_ppl": 41.98760370908413, "lr": 0.00041872919513774, "grad_norm": 0.722, "tokens_per_sec": 150805, "dt_s": 4.346, "eta_s": 4497, "world_size": 1, "timestamp": "2026-05-05T06:36:38.194571"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82690, "epoch": 0, "train_loss": 3.7663680016994476, "train_ppl": 43.22279429015231, "lr": 0.00041836668344961514, "grad_norm": 0.7378, "tokens_per_sec": 147632, "dt_s": 4.439, "eta_s": 4386, "world_size": 1, "timestamp": "2026-05-05T06:36:42.633712"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82700, "epoch": 0, "train_loss": 3.6948570907115936, "train_ppl": 40.23982153278098, "lr": 0.00041800417176149026, "grad_norm": 0.6952, "tokens_per_sec": 148732, "dt_s": 4.406, "eta_s": 4390, "world_size": 1, "timestamp": "2026-05-05T06:36:47.040016"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82710, "epoch": 0, "train_loss": 3.7153764963150024, "train_ppl": 41.07404840151104, "lr": 0.0004176416600733654, "grad_norm": 0.7713, "tokens_per_sec": 149448, "dt_s": 4.385, "eta_s": 4394, "world_size": 1, "timestamp": "2026-05-05T06:36:51.425225"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82720, "epoch": 0, "train_loss": 3.5680351853370667, "train_ppl": 35.44687813959353, "lr": 0.00041727914838524056, "grad_norm": 0.7743, "tokens_per_sec": 148690, "dt_s": 4.408, "eta_s": 4381, "world_size": 1, "timestamp": "2026-05-05T06:36:55.832825"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82730, "epoch": 0, "train_loss": 3.5736413300037384, "train_ppl": 35.64615653626222, "lr": 0.00041691663669711563, "grad_norm": 0.694, "tokens_per_sec": 149601, "dt_s": 4.381, "eta_s": 4384, "world_size": 1, "timestamp": "2026-05-05T06:37:00.213485"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82740, "epoch": 0, "train_loss": 3.5909010767936707, "train_ppl": 36.26674032578989, "lr": 0.0004165541250089908, "grad_norm": 0.6937, "tokens_per_sec": 150176, "dt_s": 4.364, "eta_s": 4364, "world_size": 1, "timestamp": "2026-05-05T06:37:04.577442"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82750, "epoch": 0, "train_loss": 3.785030335187912, "train_ppl": 44.03700640935269, "lr": 0.000416191613320866, "grad_norm": 0.694, "tokens_per_sec": 145513, "dt_s": 4.504, "eta_s": 4379, "world_size": 1, "timestamp": "2026-05-05T06:37:09.081248"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82760, "epoch": 0, "train_loss": 3.6089189797639847, "train_ppl": 36.92611335525553, "lr": 0.00041582910163274106, "grad_norm": 0.7933, "tokens_per_sec": 149477, "dt_s": 4.384, "eta_s": 4375, "world_size": 1, "timestamp": "2026-05-05T06:37:13.465588"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82770, "epoch": 0, "train_loss": 3.7498297840356827, "train_ppl": 42.51384484904312, "lr": 0.00041546658994461624, "grad_norm": 0.7303, "tokens_per_sec": 149786, "dt_s": 4.375, "eta_s": 4364, "world_size": 1, "timestamp": "2026-05-05T06:37:17.840887"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82780, "epoch": 0, "train_loss": 3.6492556631565094, "train_ppl": 38.446038593107644, "lr": 0.00041510407825649136, "grad_norm": 0.7143, "tokens_per_sec": 147488, "dt_s": 4.443, "eta_s": 4372, "world_size": 1, "timestamp": "2026-05-05T06:37:22.284367"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82790, "epoch": 0, "train_loss": 3.693030923604965, "train_ppl": 40.16640395111084, "lr": 0.0004147415665683665, "grad_norm": 0.667, "tokens_per_sec": 149741, "dt_s": 4.377, "eta_s": 4370, "world_size": 1, "timestamp": "2026-05-05T06:37:26.660983"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82800, "epoch": 0, "train_loss": 3.5594119876623154, "train_ppl": 35.142526829148835, "lr": 0.0004143790548802416, "grad_norm": 0.7013, "tokens_per_sec": 149180, "dt_s": 4.393, "eta_s": 4344, "world_size": 1, "timestamp": "2026-05-05T06:37:31.054065"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82810, "epoch": 0, "train_loss": 3.689318284392357, "train_ppl": 40.01755706314053, "lr": 0.00041401654319211674, "grad_norm": 0.7326, "tokens_per_sec": 147124, "dt_s": 4.454, "eta_s": 4353, "world_size": 1, "timestamp": "2026-05-05T06:37:35.508550"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82820, "epoch": 0, "train_loss": 3.7445302456617355, "train_ppl": 42.289137046564306, "lr": 0.0004136540315039919, "grad_norm": 0.6897, "tokens_per_sec": 148536, "dt_s": 4.412, "eta_s": 4356, "world_size": 1, "timestamp": "2026-05-05T06:37:39.920680"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82830, "epoch": 0, "train_loss": 3.717950239777565, "train_ppl": 41.179898622291596, "lr": 0.000413291519815867, "grad_norm": 0.7645, "tokens_per_sec": 146286, "dt_s": 4.48, "eta_s": 4359, "world_size": 1, "timestamp": "2026-05-05T06:37:44.400674"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82840, "epoch": 0, "train_loss": 3.7001256197690964, "train_ppl": 40.45238566025087, "lr": 0.00041292900812774216, "grad_norm": 0.7195, "tokens_per_sec": 148343, "dt_s": 4.418, "eta_s": 4362, "world_size": 1, "timestamp": "2026-05-05T06:37:48.818544"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82850, "epoch": 0, "train_loss": 3.5972190648317337, "train_ppl": 36.49659851338453, "lr": 0.00041256649643961734, "grad_norm": 0.6874, "tokens_per_sec": 150069, "dt_s": 4.367, "eta_s": 4353, "world_size": 1, "timestamp": "2026-05-05T06:37:53.185594"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82860, "epoch": 0, "train_loss": 3.753301292657852, "train_ppl": 42.661688499775735, "lr": 0.0004122039847514924, "grad_norm": 0.7809, "tokens_per_sec": 146495, "dt_s": 4.474, "eta_s": 4352, "world_size": 1, "timestamp": "2026-05-05T06:37:57.659210"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82870, "epoch": 0, "train_loss": 3.7210569083690643, "train_ppl": 41.308029847515414, "lr": 0.0004118414730633676, "grad_norm": 0.6849, "tokens_per_sec": 149030, "dt_s": 4.398, "eta_s": 4345, "world_size": 1, "timestamp": "2026-05-05T06:38:02.056722"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82880, "epoch": 0, "train_loss": 3.6436435133218765, "train_ppl": 38.23087798548791, "lr": 0.0004114789613752427, "grad_norm": 0.718, "tokens_per_sec": 149623, "dt_s": 4.38, "eta_s": 4321, "world_size": 1, "timestamp": "2026-05-05T06:38:06.436811"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82890, "epoch": 0, "train_loss": 3.5949998646974564, "train_ppl": 36.41569506074255, "lr": 0.00041111644968711784, "grad_norm": 0.7429, "tokens_per_sec": 146783, "dt_s": 4.465, "eta_s": 4326, "world_size": 1, "timestamp": "2026-05-05T06:38:10.901623"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82900, "epoch": 0, "train_loss": 3.634142145514488, "train_ppl": 37.86935256489614, "lr": 0.00041075393799899296, "grad_norm": 0.7115, "tokens_per_sec": 149927, "dt_s": 4.371, "eta_s": 4322, "world_size": 1, "timestamp": "2026-05-05T06:38:15.272835"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82910, "epoch": 0, "train_loss": 3.62519334256649, "train_ppl": 37.53197898729108, "lr": 0.0004103914263108681, "grad_norm": 0.6905, "tokens_per_sec": 148631, "dt_s": 4.409, "eta_s": 4305, "world_size": 1, "timestamp": "2026-05-05T06:38:19.682138"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82920, "epoch": 0, "train_loss": 3.6563240736722946, "train_ppl": 38.718753672197884, "lr": 0.00041002891462274326, "grad_norm": 0.7018, "tokens_per_sec": 149661, "dt_s": 4.379, "eta_s": 4297, "world_size": 1, "timestamp": "2026-05-05T06:38:24.061104"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82930, "epoch": 0, "train_loss": 3.734922334551811, "train_ppl": 41.884772437152996, "lr": 0.00040966640293461833, "grad_norm": 0.7354, "tokens_per_sec": 149692, "dt_s": 4.378, "eta_s": 4292, "world_size": 1, "timestamp": "2026-05-05T06:38:28.439172"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82940, "epoch": 0, "train_loss": 3.6196839064359665, "train_ppl": 37.3257675222661, "lr": 0.0004093038912464935, "grad_norm": 0.662, "tokens_per_sec": 132089, "dt_s": 4.962, "eta_s": 4385, "world_size": 1, "timestamp": "2026-05-05T06:38:33.400688"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82950, "epoch": 0, "train_loss": 3.675806701183319, "train_ppl": 39.48049297475794, "lr": 0.0004089413795583687, "grad_norm": 0.7162, "tokens_per_sec": 150921, "dt_s": 4.342, "eta_s": 4375, "world_size": 1, "timestamp": "2026-05-05T06:38:37.743078"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82960, "epoch": 0, "train_loss": 3.588173344731331, "train_ppl": 36.16794917455683, "lr": 0.00040857886787024376, "grad_norm": 0.7571, "tokens_per_sec": 150062, "dt_s": 4.367, "eta_s": 4362, "world_size": 1, "timestamp": "2026-05-05T06:38:42.110353"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82970, "epoch": 0, "train_loss": 3.766640529036522, "train_ppl": 43.23457528843024, "lr": 0.00040821635618211894, "grad_norm": 0.8603, "tokens_per_sec": 147576, "dt_s": 4.441, "eta_s": 4369, "world_size": 1, "timestamp": "2026-05-05T06:38:46.551189"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82980, "epoch": 0, "train_loss": 3.6215467154979706, "train_ppl": 37.39536310176586, "lr": 0.00040785384449399406, "grad_norm": 0.6792, "tokens_per_sec": 149329, "dt_s": 4.389, "eta_s": 4367, "world_size": 1, "timestamp": "2026-05-05T06:38:50.939882"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 82990, "epoch": 0, "train_loss": 3.753826454281807, "train_ppl": 42.68409866535335, "lr": 0.0004074913328058692, "grad_norm": 0.6817, "tokens_per_sec": 150378, "dt_s": 4.358, "eta_s": 4246, "world_size": 1, "timestamp": "2026-05-05T06:38:55.297986"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83000, "epoch": 0, "train_loss": 3.812087520956993, "train_ppl": 45.244789799181326, "lr": 0.0004071288211177443, "grad_norm": 0.7009, "tokens_per_sec": 149364, "dt_s": 4.388, "eta_s": 4250, "world_size": 1, "timestamp": "2026-05-05T06:38:59.685633"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83010, "epoch": 0, "train_loss": 3.681627631187439, "train_ppl": 39.71097632382023, "lr": 0.00040676630942961943, "grad_norm": 0.7439, "tokens_per_sec": 127375, "dt_s": 5.145, "eta_s": 4246, "world_size": 1, "timestamp": "2026-05-05T06:39:04.830753"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83020, "epoch": 0, "train_loss": 3.6500632017850876, "train_ppl": 38.477097793451556, "lr": 0.0004064037977414946, "grad_norm": 0.7254, "tokens_per_sec": 149235, "dt_s": 4.391, "eta_s": 4233, "world_size": 1, "timestamp": "2026-05-05T06:39:09.222202"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83030, "epoch": 0, "train_loss": 3.7783842235803604, "train_ppl": 43.745301973745455, "lr": 0.0004060412860533697, "grad_norm": 0.8082, "tokens_per_sec": 150977, "dt_s": 4.341, "eta_s": 4219, "world_size": 1, "timestamp": "2026-05-05T06:39:13.562978"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83040, "epoch": 0, "train_loss": 3.6263810843229294, "train_ppl": 37.57658377017389, "lr": 0.00040567877436524486, "grad_norm": 0.782, "tokens_per_sec": 150751, "dt_s": 4.347, "eta_s": 4212, "world_size": 1, "timestamp": "2026-05-05T06:39:17.910314"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83050, "epoch": 0, "train_loss": 3.645959258079529, "train_ppl": 38.319513529801505, "lr": 0.00040531626267712004, "grad_norm": 0.7508, "tokens_per_sec": 147559, "dt_s": 4.441, "eta_s": 4218, "world_size": 1, "timestamp": "2026-05-05T06:39:22.351661"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83060, "epoch": 0, "train_loss": 3.688849091529846, "train_ppl": 39.99878551507394, "lr": 0.0004049537509889951, "grad_norm": 0.6925, "tokens_per_sec": 151482, "dt_s": 4.326, "eta_s": 4205, "world_size": 1, "timestamp": "2026-05-05T06:39:26.677962"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83070, "epoch": 0, "train_loss": 3.6323365718126297, "train_ppl": 37.80103834953316, "lr": 0.0004045912393008703, "grad_norm": 0.6827, "tokens_per_sec": 150695, "dt_s": 4.349, "eta_s": 4193, "world_size": 1, "timestamp": "2026-05-05T06:39:31.026878"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83080, "epoch": 0, "train_loss": 3.6035104393959045, "train_ppl": 36.726936095195704, "lr": 0.0004042287276127454, "grad_norm": 0.7171, "tokens_per_sec": 147829, "dt_s": 4.433, "eta_s": 4206, "world_size": 1, "timestamp": "2026-05-05T06:39:35.460119"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83090, "epoch": 0, "train_loss": 3.606108009815216, "train_ppl": 36.82246091049401, "lr": 0.00040386621592462054, "grad_norm": 0.7282, "tokens_per_sec": 149516, "dt_s": 4.383, "eta_s": 4209, "world_size": 1, "timestamp": "2026-05-05T06:39:39.843326"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83100, "epoch": 0, "train_loss": 3.652507245540619, "train_ppl": 38.57125251630758, "lr": 0.00040350370423649566, "grad_norm": 1.0988, "tokens_per_sec": 147629, "dt_s": 4.439, "eta_s": 4204, "world_size": 1, "timestamp": "2026-05-05T06:39:44.282553"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83110, "epoch": 0, "train_loss": 3.6647621989250183, "train_ppl": 39.04684967614719, "lr": 0.0004031411925483708, "grad_norm": 0.7051, "tokens_per_sec": 149434, "dt_s": 4.386, "eta_s": 4211, "world_size": 1, "timestamp": "2026-05-05T06:39:48.668146"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83120, "epoch": 0, "train_loss": 3.577100858092308, "train_ppl": 35.769688974813036, "lr": 0.00040277868086024596, "grad_norm": 0.6891, "tokens_per_sec": 151992, "dt_s": 4.312, "eta_s": 4199, "world_size": 1, "timestamp": "2026-05-05T06:39:52.979950"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83130, "epoch": 0, "train_loss": 3.645891308784485, "train_ppl": 38.31690983433136, "lr": 0.00040241616917212103, "grad_norm": 0.6718, "tokens_per_sec": 148047, "dt_s": 4.427, "eta_s": 4194, "world_size": 1, "timestamp": "2026-05-05T06:39:57.406647"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83140, "epoch": 0, "train_loss": 3.5740770995616913, "train_ppl": 35.661693431145785, "lr": 0.0004020536574839962, "grad_norm": 0.729, "tokens_per_sec": 150792, "dt_s": 4.346, "eta_s": 4182, "world_size": 1, "timestamp": "2026-05-05T06:40:01.752763"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83150, "epoch": 0, "train_loss": 3.70015811920166, "train_ppl": 40.45370036119408, "lr": 0.0004016911457958714, "grad_norm": 0.7606, "tokens_per_sec": 150743, "dt_s": 4.348, "eta_s": 4160, "world_size": 1, "timestamp": "2026-05-05T06:40:06.100318"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83160, "epoch": 0, "train_loss": 3.6091015487909317, "train_ppl": 36.93285553527763, "lr": 0.00040132863410774646, "grad_norm": 0.7081, "tokens_per_sec": 147902, "dt_s": 4.431, "eta_s": 4165, "world_size": 1, "timestamp": "2026-05-05T06:40:10.531355"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83170, "epoch": 0, "train_loss": 3.643012970685959, "train_ppl": 38.20677938530537, "lr": 0.00040096612241962164, "grad_norm": 0.7359, "tokens_per_sec": 150565, "dt_s": 4.353, "eta_s": 4168, "world_size": 1, "timestamp": "2026-05-05T06:40:14.884022"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83180, "epoch": 0, "train_loss": 3.733609601855278, "train_ppl": 41.82982500042704, "lr": 0.00040060361073149676, "grad_norm": 0.6857, "tokens_per_sec": 150616, "dt_s": 4.351, "eta_s": 4149, "world_size": 1, "timestamp": "2026-05-05T06:40:19.235234"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83190, "epoch": 0, "train_loss": 3.628072425723076, "train_ppl": 37.64019237875359, "lr": 0.0004002410990433719, "grad_norm": 0.7055, "tokens_per_sec": 148531, "dt_s": 4.412, "eta_s": 4157, "world_size": 1, "timestamp": "2026-05-05T06:40:23.647483"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83200, "epoch": 0, "train_loss": 3.591285839676857, "train_ppl": 36.28069710621477, "lr": 0.000399878587355247, "grad_norm": 0.7568, "tokens_per_sec": 150446, "dt_s": 4.356, "eta_s": 4155, "world_size": 1, "timestamp": "2026-05-05T06:40:28.003616"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83210, "epoch": 0, "train_loss": 3.739785224199295, "train_ppl": 42.08894950634325, "lr": 0.00039951607566712213, "grad_norm": 0.751, "tokens_per_sec": 150410, "dt_s": 4.357, "eta_s": 4136, "world_size": 1, "timestamp": "2026-05-05T06:40:32.360782"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83220, "epoch": 0, "train_loss": 3.6885377764701843, "train_ppl": 39.98633522885626, "lr": 0.0003991535639789973, "grad_norm": 0.7398, "tokens_per_sec": 149806, "dt_s": 4.375, "eta_s": 4136, "world_size": 1, "timestamp": "2026-05-05T06:40:36.735514"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83230, "epoch": 0, "train_loss": 3.565562143921852, "train_ppl": 35.35932484799484, "lr": 0.0003987910522908724, "grad_norm": 0.742, "tokens_per_sec": 134030, "dt_s": 4.89, "eta_s": 4234, "world_size": 1, "timestamp": "2026-05-05T06:40:41.625160"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83240, "epoch": 0, "train_loss": 3.719780743122101, "train_ppl": 41.255347598181196, "lr": 0.00039842854060274756, "grad_norm": 0.686, "tokens_per_sec": 147911, "dt_s": 4.431, "eta_s": 4233, "world_size": 1, "timestamp": "2026-05-05T06:40:46.055937"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83250, "epoch": 0, "train_loss": 3.5769655853509903, "train_ppl": 35.764850638184235, "lr": 0.00039806602891462274, "grad_norm": 0.7532, "tokens_per_sec": 149741, "dt_s": 4.377, "eta_s": 4232, "world_size": 1, "timestamp": "2026-05-05T06:40:50.432558"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83260, "epoch": 0, "train_loss": 3.6515747010707855, "train_ppl": 38.53529987440313, "lr": 0.0003977035172264978, "grad_norm": 0.7032, "tokens_per_sec": 150089, "dt_s": 4.366, "eta_s": 4229, "world_size": 1, "timestamp": "2026-05-05T06:40:54.799058"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83270, "epoch": 0, "train_loss": 3.6241520047187805, "train_ppl": 37.492915859558806, "lr": 0.000397341005538373, "grad_norm": 0.7257, "tokens_per_sec": 150372, "dt_s": 4.358, "eta_s": 4222, "world_size": 1, "timestamp": "2026-05-05T06:40:59.157322"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83280, "epoch": 0, "train_loss": 3.5906856656074524, "train_ppl": 36.258928905599895, "lr": 0.0003969784938502481, "grad_norm": 0.6884, "tokens_per_sec": 150667, "dt_s": 4.35, "eta_s": 4116, "world_size": 1, "timestamp": "2026-05-05T06:41:03.507034"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83290, "epoch": 0, "train_loss": 3.716120809316635, "train_ppl": 41.104631730138614, "lr": 0.00039661598216212323, "grad_norm": 0.7716, "tokens_per_sec": 150529, "dt_s": 4.354, "eta_s": 4097, "world_size": 1, "timestamp": "2026-05-05T06:41:07.860746"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83300, "epoch": 0, "train_loss": 3.6306479275226593, "train_ppl": 37.737259706845165, "lr": 0.00039625347047399836, "grad_norm": 0.7336, "tokens_per_sec": 147290, "dt_s": 4.449, "eta_s": 4106, "world_size": 1, "timestamp": "2026-05-05T06:41:12.310197"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83310, "epoch": 0, "train_loss": 3.783901959657669, "train_ppl": 43.98734415300194, "lr": 0.0003958909587858735, "grad_norm": 0.7569, "tokens_per_sec": 149611, "dt_s": 4.38, "eta_s": 4104, "world_size": 1, "timestamp": "2026-05-05T06:41:16.690636"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83320, "epoch": 0, "train_loss": 3.6135798394680023, "train_ppl": 37.09862249717973, "lr": 0.00039552844709774866, "grad_norm": 0.743, "tokens_per_sec": 149814, "dt_s": 4.374, "eta_s": 4103, "world_size": 1, "timestamp": "2026-05-05T06:41:21.065104"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83330, "epoch": 0, "train_loss": 3.6375132352113724, "train_ppl": 37.99722896946875, "lr": 0.00039516593540962373, "grad_norm": 0.7869, "tokens_per_sec": 151235, "dt_s": 4.333, "eta_s": 4096, "world_size": 1, "timestamp": "2026-05-05T06:41:25.398483"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83340, "epoch": 0, "train_loss": 3.5837049782276154, "train_ppl": 36.00669805480717, "lr": 0.0003948034237214989, "grad_norm": 0.7546, "tokens_per_sec": 151613, "dt_s": 4.323, "eta_s": 4085, "world_size": 1, "timestamp": "2026-05-05T06:41:29.721080"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83350, "epoch": 0, "train_loss": 3.6466302573680878, "train_ppl": 38.34523452453746, "lr": 0.0003944409120333741, "grad_norm": 0.7406, "tokens_per_sec": 148845, "dt_s": 4.403, "eta_s": 4072, "world_size": 1, "timestamp": "2026-05-05T06:41:34.124049"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83360, "epoch": 0, "train_loss": 3.602218449115753, "train_ppl": 36.67951589055901, "lr": 0.00039407840034524916, "grad_norm": 0.7253, "tokens_per_sec": 151200, "dt_s": 4.334, "eta_s": 4059, "world_size": 1, "timestamp": "2026-05-05T06:41:38.458448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83370, "epoch": 0, "train_loss": 3.6549696773290634, "train_ppl": 38.6663486304192, "lr": 0.00039371588865712433, "grad_norm": 0.7207, "tokens_per_sec": 151149, "dt_s": 4.336, "eta_s": 4048, "world_size": 1, "timestamp": "2026-05-05T06:41:42.794304"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83380, "epoch": 0, "train_loss": 3.5850499868392944, "train_ppl": 36.05515995730128, "lr": 0.00039335337696899946, "grad_norm": 0.7632, "tokens_per_sec": 147722, "dt_s": 4.436, "eta_s": 4063, "world_size": 1, "timestamp": "2026-05-05T06:41:47.230758"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83390, "epoch": 0, "train_loss": 3.5496575087308884, "train_ppl": 34.80139627188639, "lr": 0.0003929908652808746, "grad_norm": 0.7136, "tokens_per_sec": 148424, "dt_s": 4.415, "eta_s": 4076, "world_size": 1, "timestamp": "2026-05-05T06:41:51.646236"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83400, "epoch": 0, "train_loss": 3.5623197853565216, "train_ppl": 35.24486290182256, "lr": 0.0003926283535927497, "grad_norm": 0.8451, "tokens_per_sec": 150367, "dt_s": 4.358, "eta_s": 4063, "world_size": 1, "timestamp": "2026-05-05T06:41:56.004631"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83410, "epoch": 0, "train_loss": 3.627855822443962, "train_ppl": 37.63204027257621, "lr": 0.00039226584190462483, "grad_norm": 0.771, "tokens_per_sec": 149295, "dt_s": 4.39, "eta_s": 4069, "world_size": 1, "timestamp": "2026-05-05T06:42:00.394351"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83420, "epoch": 0, "train_loss": 3.632231280207634, "train_ppl": 37.79705842706472, "lr": 0.0003919033302165, "grad_norm": 0.8, "tokens_per_sec": 150958, "dt_s": 4.341, "eta_s": 4065, "world_size": 1, "timestamp": "2026-05-05T06:42:04.735679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83430, "epoch": 0, "train_loss": 3.5731256157159805, "train_ppl": 35.6277780434658, "lr": 0.0003915408185283751, "grad_norm": 0.685, "tokens_per_sec": 151244, "dt_s": 4.333, "eta_s": 4042, "world_size": 1, "timestamp": "2026-05-05T06:42:09.068799"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83440, "epoch": 0, "train_loss": 3.651687353849411, "train_ppl": 38.5396412275373, "lr": 0.00039117830684025026, "grad_norm": 0.6802, "tokens_per_sec": 149939, "dt_s": 4.371, "eta_s": 4029, "world_size": 1, "timestamp": "2026-05-05T06:42:13.439647"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83450, "epoch": 0, "train_loss": 3.533250331878662, "train_ppl": 34.2350622839851, "lr": 0.00039081579515212544, "grad_norm": 0.7008, "tokens_per_sec": 151533, "dt_s": 4.325, "eta_s": 4019, "world_size": 1, "timestamp": "2026-05-05T06:42:17.764502"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83460, "epoch": 0, "train_loss": 3.7238942831754684, "train_ppl": 41.42540264727506, "lr": 0.0003904532834640005, "grad_norm": 0.7292, "tokens_per_sec": 148096, "dt_s": 4.425, "eta_s": 4021, "world_size": 1, "timestamp": "2026-05-05T06:42:22.189727"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83470, "epoch": 0, "train_loss": 3.646504729986191, "train_ppl": 38.34042144973203, "lr": 0.0003900907717758757, "grad_norm": 0.7271, "tokens_per_sec": 150398, "dt_s": 4.358, "eta_s": 4020, "world_size": 1, "timestamp": "2026-05-05T06:42:26.547269"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83480, "epoch": 0, "train_loss": 3.7246828228235245, "train_ppl": 41.45808110213449, "lr": 0.0003897282600877508, "grad_norm": 0.7141, "tokens_per_sec": 150051, "dt_s": 4.368, "eta_s": 4022, "world_size": 1, "timestamp": "2026-05-05T06:42:30.914852"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83490, "epoch": 0, "train_loss": 3.6205991208553314, "train_ppl": 37.35994424004321, "lr": 0.00038936574839962593, "grad_norm": 0.7636, "tokens_per_sec": 148522, "dt_s": 4.413, "eta_s": 4025, "world_size": 1, "timestamp": "2026-05-05T06:42:35.327398"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83500, "epoch": 0, "train_loss": 3.6275247633457184, "train_ppl": 37.61958390526914, "lr": 0.00038900323671150106, "grad_norm": 0.7388, "tokens_per_sec": 151402, "dt_s": 4.329, "eta_s": 4021, "world_size": 1, "timestamp": "2026-05-05T06:42:39.656019"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83510, "epoch": 0, "train_loss": 3.769845128059387, "train_ppl": 43.37334700124205, "lr": 0.0003886407250233762, "grad_norm": 0.728, "tokens_per_sec": 127464, "dt_s": 5.142, "eta_s": 4007, "world_size": 1, "timestamp": "2026-05-05T06:42:44.797592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83520, "epoch": 0, "train_loss": 3.7019717693328857, "train_ppl": 40.52713579312367, "lr": 0.0003882782133352513, "grad_norm": 0.7386, "tokens_per_sec": 150546, "dt_s": 4.353, "eta_s": 4002, "world_size": 1, "timestamp": "2026-05-05T06:42:49.150791"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83530, "epoch": 0, "train_loss": 3.619175463914871, "train_ppl": 37.306794338720636, "lr": 0.00038791570164712643, "grad_norm": 0.7521, "tokens_per_sec": 133486, "dt_s": 4.91, "eta_s": 4097, "world_size": 1, "timestamp": "2026-05-05T06:42:54.060421"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83540, "epoch": 0, "train_loss": 3.5616889148950577, "train_ppl": 35.22263497110913, "lr": 0.0003875531899590016, "grad_norm": 0.8432, "tokens_per_sec": 149893, "dt_s": 4.372, "eta_s": 4085, "world_size": 1, "timestamp": "2026-05-05T06:42:58.432553"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83550, "epoch": 0, "train_loss": 3.574302986264229, "train_ppl": 35.669749843365786, "lr": 0.0003871906782708768, "grad_norm": 0.7731, "tokens_per_sec": 151225, "dt_s": 4.334, "eta_s": 4081, "world_size": 1, "timestamp": "2026-05-05T06:43:02.766239"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83560, "epoch": 0, "train_loss": 3.6157290935516357, "train_ppl": 37.1784426092038, "lr": 0.00038682816658275186, "grad_norm": 0.8261, "tokens_per_sec": 150685, "dt_s": 4.349, "eta_s": 4073, "world_size": 1, "timestamp": "2026-05-05T06:43:07.115434"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83570, "epoch": 0, "train_loss": 3.709344834089279, "train_ppl": 40.82704927180844, "lr": 0.00038646565489462703, "grad_norm": 0.7291, "tokens_per_sec": 148878, "dt_s": 4.402, "eta_s": 4077, "world_size": 1, "timestamp": "2026-05-05T06:43:11.517442"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83580, "epoch": 0, "train_loss": 3.6372767984867096, "train_ppl": 37.988246091087966, "lr": 0.00038610314320650216, "grad_norm": 0.6871, "tokens_per_sec": 150821, "dt_s": 4.345, "eta_s": 3970, "world_size": 1, "timestamp": "2026-05-05T06:43:15.862721"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83590, "epoch": 0, "train_loss": 3.5910096019506454, "train_ppl": 36.270676393054, "lr": 0.0003857406315183773, "grad_norm": 0.7174, "tokens_per_sec": 151195, "dt_s": 4.335, "eta_s": 3959, "world_size": 1, "timestamp": "2026-05-05T06:43:20.197267"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83600, "epoch": 0, "train_loss": 3.5729674845933914, "train_ppl": 35.622144628349304, "lr": 0.0003853781198302524, "grad_norm": 0.7743, "tokens_per_sec": 146900, "dt_s": 4.461, "eta_s": 3978, "world_size": 1, "timestamp": "2026-05-05T06:43:24.658514"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83610, "epoch": 0, "train_loss": 3.6077001243829727, "train_ppl": 36.88113318102489, "lr": 0.00038501560814212753, "grad_norm": 0.7626, "tokens_per_sec": 149470, "dt_s": 4.385, "eta_s": 3980, "world_size": 1, "timestamp": "2026-05-05T06:43:29.043067"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83620, "epoch": 0, "train_loss": 3.5738575607538223, "train_ppl": 35.653865164818896, "lr": 0.00038465309645400265, "grad_norm": 0.7341, "tokens_per_sec": 148719, "dt_s": 4.407, "eta_s": 3976, "world_size": 1, "timestamp": "2026-05-05T06:43:33.449761"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83630, "epoch": 0, "train_loss": 3.6492944061756134, "train_ppl": 38.44752813756987, "lr": 0.0003842905847658778, "grad_norm": 0.7513, "tokens_per_sec": 147329, "dt_s": 4.448, "eta_s": 3990, "world_size": 1, "timestamp": "2026-05-05T06:43:37.898030"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83640, "epoch": 0, "train_loss": 3.5953833609819412, "train_ppl": 36.429663022654836, "lr": 0.00038392807307775296, "grad_norm": 0.7629, "tokens_per_sec": 149596, "dt_s": 4.381, "eta_s": 3994, "world_size": 1, "timestamp": "2026-05-05T06:43:42.278910"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83650, "epoch": 0, "train_loss": 3.6274795085191727, "train_ppl": 37.61788147604664, "lr": 0.00038356556138962813, "grad_norm": 0.7336, "tokens_per_sec": 147340, "dt_s": 4.448, "eta_s": 3987, "world_size": 1, "timestamp": "2026-05-05T06:43:46.726843"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83660, "epoch": 0, "train_loss": 3.801817536354065, "train_ppl": 44.78250439974402, "lr": 0.0003832030497015032, "grad_norm": 0.9499, "tokens_per_sec": 150169, "dt_s": 4.364, "eta_s": 3979, "world_size": 1, "timestamp": "2026-05-05T06:43:51.090984"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83670, "epoch": 0, "train_loss": 3.7180727124214172, "train_ppl": 41.18494234220199, "lr": 0.0003828405380133784, "grad_norm": 0.7671, "tokens_per_sec": 151118, "dt_s": 4.337, "eta_s": 3962, "world_size": 1, "timestamp": "2026-05-05T06:43:55.427753"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83680, "epoch": 0, "train_loss": 3.623282343149185, "train_ppl": 37.460323885551446, "lr": 0.0003824780263252535, "grad_norm": 0.7628, "tokens_per_sec": 147741, "dt_s": 4.436, "eta_s": 3956, "world_size": 1, "timestamp": "2026-05-05T06:43:59.863606"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83690, "epoch": 0, "train_loss": 3.6142240166664124, "train_ppl": 37.122528282839184, "lr": 0.00038211551463712863, "grad_norm": 0.6877, "tokens_per_sec": 149186, "dt_s": 4.393, "eta_s": 3953, "world_size": 1, "timestamp": "2026-05-05T06:44:04.256494"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83700, "epoch": 0, "train_loss": 3.6351496130228043, "train_ppl": 37.90752393214373, "lr": 0.00038175300294900376, "grad_norm": 0.7799, "tokens_per_sec": 148954, "dt_s": 4.4, "eta_s": 3940, "world_size": 1, "timestamp": "2026-05-05T06:44:08.656259"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83710, "epoch": 0, "train_loss": 3.7350468039512634, "train_ppl": 41.88998613409058, "lr": 0.0003813904912608789, "grad_norm": 0.7123, "tokens_per_sec": 147786, "dt_s": 4.435, "eta_s": 3949, "world_size": 1, "timestamp": "2026-05-05T06:44:13.090784"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83720, "epoch": 0, "train_loss": 3.6672977954149246, "train_ppl": 39.145982358255935, "lr": 0.000381027979572754, "grad_norm": 0.78, "tokens_per_sec": 151843, "dt_s": 4.316, "eta_s": 3941, "world_size": 1, "timestamp": "2026-05-05T06:44:17.406822"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83730, "epoch": 0, "train_loss": 3.7031165808439255, "train_ppl": 40.57355829212464, "lr": 0.00038066546788462913, "grad_norm": 0.7376, "tokens_per_sec": 149376, "dt_s": 4.387, "eta_s": 3927, "world_size": 1, "timestamp": "2026-05-05T06:44:21.794142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83740, "epoch": 0, "train_loss": 3.774095207452774, "train_ppl": 43.55807945528121, "lr": 0.0003803029561965043, "grad_norm": 0.8606, "tokens_per_sec": 148744, "dt_s": 4.406, "eta_s": 3925, "world_size": 1, "timestamp": "2026-05-05T06:44:26.200101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83750, "epoch": 0, "train_loss": 3.721075788140297, "train_ppl": 41.308809741031084, "lr": 0.0003799404445083795, "grad_norm": 0.999, "tokens_per_sec": 150750, "dt_s": 4.347, "eta_s": 3912, "world_size": 1, "timestamp": "2026-05-05T06:44:30.547437"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83760, "epoch": 0, "train_loss": 3.6406408697366714, "train_ppl": 38.11625645486755, "lr": 0.00037957793282025455, "grad_norm": 0.7493, "tokens_per_sec": 146724, "dt_s": 4.467, "eta_s": 3913, "world_size": 1, "timestamp": "2026-05-05T06:44:35.014057"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83770, "epoch": 0, "train_loss": 3.5756704956293106, "train_ppl": 35.718561928211415, "lr": 0.00037921542113212973, "grad_norm": 0.7697, "tokens_per_sec": 151049, "dt_s": 4.339, "eta_s": 3913, "world_size": 1, "timestamp": "2026-05-05T06:44:39.352794"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83780, "epoch": 0, "train_loss": 3.625907763838768, "train_ppl": 37.55880221187053, "lr": 0.00037885290944400486, "grad_norm": 0.7555, "tokens_per_sec": 149568, "dt_s": 4.382, "eta_s": 3907, "world_size": 1, "timestamp": "2026-05-05T06:44:43.734487"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83790, "epoch": 0, "train_loss": 3.6336109042167664, "train_ppl": 37.849240143643065, "lr": 0.00037849039775588, "grad_norm": 0.716, "tokens_per_sec": 147594, "dt_s": 4.44, "eta_s": 3909, "world_size": 1, "timestamp": "2026-05-05T06:44:48.174770"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83800, "epoch": 0, "train_loss": 3.6283209919929504, "train_ppl": 37.649549623870115, "lr": 0.0003781278860677551, "grad_norm": 0.7382, "tokens_per_sec": 150643, "dt_s": 4.35, "eta_s": 3905, "world_size": 1, "timestamp": "2026-05-05T06:44:52.525224"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83810, "epoch": 0, "train_loss": 3.7157891541719437, "train_ppl": 41.09100142795952, "lr": 0.00037776537437963023, "grad_norm": 0.7639, "tokens_per_sec": 149808, "dt_s": 4.375, "eta_s": 3884, "world_size": 1, "timestamp": "2026-05-05T06:44:56.899871"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83820, "epoch": 0, "train_loss": 3.6664240807294846, "train_ppl": 39.11179487582111, "lr": 0.00037740286269150535, "grad_norm": 0.7726, "tokens_per_sec": 133107, "dt_s": 4.924, "eta_s": 3984, "world_size": 1, "timestamp": "2026-05-05T06:45:01.823437"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83830, "epoch": 0, "train_loss": 3.686767026782036, "train_ppl": 39.91559209089922, "lr": 0.0003770403510033805, "grad_norm": 0.7445, "tokens_per_sec": 149421, "dt_s": 4.386, "eta_s": 3980, "world_size": 1, "timestamp": "2026-05-05T06:45:06.209416"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83840, "epoch": 0, "train_loss": 3.634305387735367, "train_ppl": 37.875534946711205, "lr": 0.00037667783931525565, "grad_norm": 0.7257, "tokens_per_sec": 147917, "dt_s": 4.431, "eta_s": 3974, "world_size": 1, "timestamp": "2026-05-05T06:45:10.640015"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83850, "epoch": 0, "train_loss": 3.6478930860757828, "train_ppl": 38.39368857564242, "lr": 0.00037631532762713083, "grad_norm": 0.6971, "tokens_per_sec": 150015, "dt_s": 4.369, "eta_s": 3973, "world_size": 1, "timestamp": "2026-05-05T06:45:15.008646"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83860, "epoch": 0, "train_loss": 3.675247550010681, "train_ppl": 39.45842358145335, "lr": 0.0003759528159390059, "grad_norm": 0.7844, "tokens_per_sec": 150921, "dt_s": 4.342, "eta_s": 3962, "world_size": 1, "timestamp": "2026-05-05T06:45:19.351054"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83870, "epoch": 0, "train_loss": 3.7336430102586746, "train_ppl": 41.83122249143851, "lr": 0.0003755903042508811, "grad_norm": 0.7603, "tokens_per_sec": 147990, "dt_s": 4.428, "eta_s": 3871, "world_size": 1, "timestamp": "2026-05-05T06:45:23.779467"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83880, "epoch": 0, "train_loss": 3.641693577170372, "train_ppl": 38.15640284887864, "lr": 0.0003752277925627562, "grad_norm": 0.7489, "tokens_per_sec": 151110, "dt_s": 4.337, "eta_s": 3858, "world_size": 1, "timestamp": "2026-05-05T06:45:28.116450"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83890, "epoch": 0, "train_loss": 3.633708670735359, "train_ppl": 37.852940712976235, "lr": 0.00037486528087463133, "grad_norm": 0.7601, "tokens_per_sec": 149699, "dt_s": 4.378, "eta_s": 3844, "world_size": 1, "timestamp": "2026-05-05T06:45:32.494326"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83900, "epoch": 0, "train_loss": 3.6614784747362137, "train_ppl": 38.91884087918538, "lr": 0.00037450276918650645, "grad_norm": 0.763, "tokens_per_sec": 146686, "dt_s": 4.468, "eta_s": 3857, "world_size": 1, "timestamp": "2026-05-05T06:45:36.962098"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83910, "epoch": 0, "train_loss": 3.745335727930069, "train_ppl": 42.32321391891423, "lr": 0.0003741402574983816, "grad_norm": 0.7505, "tokens_per_sec": 149761, "dt_s": 4.376, "eta_s": 3858, "world_size": 1, "timestamp": "2026-05-05T06:45:41.338115"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83920, "epoch": 0, "train_loss": 3.621056005358696, "train_ppl": 37.37701731952878, "lr": 0.0003737777458102567, "grad_norm": 0.738, "tokens_per_sec": 150157, "dt_s": 4.364, "eta_s": 3843, "world_size": 1, "timestamp": "2026-05-05T06:45:45.702608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83930, "epoch": 0, "train_loss": 3.599318891763687, "train_ppl": 36.57331557195162, "lr": 0.0003734152341221318, "grad_norm": 0.7573, "tokens_per_sec": 150551, "dt_s": 4.353, "eta_s": 3841, "world_size": 1, "timestamp": "2026-05-05T06:45:50.055681"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83940, "epoch": 0, "train_loss": 3.6458952575922012, "train_ppl": 38.31706114073933, "lr": 0.000373052722434007, "grad_norm": 0.9272, "tokens_per_sec": 151653, "dt_s": 4.321, "eta_s": 3827, "world_size": 1, "timestamp": "2026-05-05T06:45:54.377126"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83950, "epoch": 0, "train_loss": 3.6772687882184982, "train_ppl": 39.53825911094766, "lr": 0.00037269021074588213, "grad_norm": 0.8063, "tokens_per_sec": 149894, "dt_s": 4.372, "eta_s": 3806, "world_size": 1, "timestamp": "2026-05-05T06:45:58.749305"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83960, "epoch": 0, "train_loss": 3.688883438706398, "train_ppl": 40.00015938401601, "lr": 0.00037232769905775725, "grad_norm": 0.7354, "tokens_per_sec": 151006, "dt_s": 4.34, "eta_s": 3795, "world_size": 1, "timestamp": "2026-05-05T06:46:03.089279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83970, "epoch": 0, "train_loss": 3.5913736522197723, "train_ppl": 36.2838831463715, "lr": 0.00037196518736963243, "grad_norm": 0.7344, "tokens_per_sec": 150575, "dt_s": 4.352, "eta_s": 3789, "world_size": 1, "timestamp": "2026-05-05T06:46:07.441638"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83980, "epoch": 0, "train_loss": 3.530364878475666, "train_ppl": 34.13642098788705, "lr": 0.00037160267568150755, "grad_norm": 0.751, "tokens_per_sec": 149128, "dt_s": 4.395, "eta_s": 3792, "world_size": 1, "timestamp": "2026-05-05T06:46:11.836261"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 83990, "epoch": 0, "train_loss": 3.708760902285576, "train_ppl": 40.80321601846179, "lr": 0.0003712401639933827, "grad_norm": 0.7235, "tokens_per_sec": 149192, "dt_s": 4.393, "eta_s": 3800, "world_size": 1, "timestamp": "2026-05-05T06:46:16.228973"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84000, "epoch": 0, "train_loss": 3.598513647913933, "train_ppl": 36.543876988731924, "lr": 0.0003708776523052578, "grad_norm": 0.7143, "tokens_per_sec": 149941, "dt_s": 4.371, "eta_s": 3795, "world_size": 1, "timestamp": "2026-05-05T06:46:20.599777"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84010, "epoch": 0, "train_loss": 3.5679481774568558, "train_ppl": 35.44379411603563, "lr": 0.0003705151406171329, "grad_norm": 0.7467, "tokens_per_sec": 125619, "dt_s": 5.217, "eta_s": 3810, "world_size": 1, "timestamp": "2026-05-05T06:46:25.816853"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84020, "epoch": 0, "train_loss": 3.703653931617737, "train_ppl": 40.59536638384192, "lr": 0.00037015262892900805, "grad_norm": 0.7203, "tokens_per_sec": 149890, "dt_s": 4.372, "eta_s": 3809, "world_size": 1, "timestamp": "2026-05-05T06:46:30.189092"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84030, "epoch": 0, "train_loss": 3.6299438029527664, "train_ppl": 37.71069732779446, "lr": 0.0003697901172408832, "grad_norm": 0.7336, "tokens_per_sec": 149327, "dt_s": 4.389, "eta_s": 3803, "world_size": 1, "timestamp": "2026-05-05T06:46:34.577853"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84040, "epoch": 0, "train_loss": 3.5679651647806168, "train_ppl": 35.44439621635562, "lr": 0.00036942760555275835, "grad_norm": 0.7679, "tokens_per_sec": 149253, "dt_s": 4.391, "eta_s": 3799, "world_size": 1, "timestamp": "2026-05-05T06:46:38.968792"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84050, "epoch": 0, "train_loss": 3.6984487920999527, "train_ppl": 40.384610819937144, "lr": 0.0003690650938646335, "grad_norm": 0.7373, "tokens_per_sec": 148602, "dt_s": 4.41, "eta_s": 3801, "world_size": 1, "timestamp": "2026-05-05T06:46:43.378945"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84060, "epoch": 0, "train_loss": 3.6633803099393845, "train_ppl": 38.9929285297559, "lr": 0.0003687025821765086, "grad_norm": 0.7651, "tokens_per_sec": 146848, "dt_s": 4.463, "eta_s": 3799, "world_size": 1, "timestamp": "2026-05-05T06:46:47.841788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84070, "epoch": 0, "train_loss": 3.617723226547241, "train_ppl": 37.25265533876974, "lr": 0.0003683400704883838, "grad_norm": 0.8482, "tokens_per_sec": 149901, "dt_s": 4.372, "eta_s": 3795, "world_size": 1, "timestamp": "2026-05-05T06:46:52.213719"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84080, "epoch": 0, "train_loss": 3.735146552324295, "train_ppl": 41.89416480045789, "lr": 0.0003679775588002589, "grad_norm": 0.7589, "tokens_per_sec": 150179, "dt_s": 4.364, "eta_s": 3786, "world_size": 1, "timestamp": "2026-05-05T06:46:56.577564"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84090, "epoch": 0, "train_loss": 3.6355300694704056, "train_ppl": 37.92194883788652, "lr": 0.00036761504711213403, "grad_norm": 0.9203, "tokens_per_sec": 148153, "dt_s": 4.424, "eta_s": 3787, "world_size": 1, "timestamp": "2026-05-05T06:47:01.001106"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84100, "epoch": 0, "train_loss": 3.6131070852279663, "train_ppl": 37.081088111148944, "lr": 0.00036725253542400915, "grad_norm": 0.7436, "tokens_per_sec": 151407, "dt_s": 4.328, "eta_s": 3769, "world_size": 1, "timestamp": "2026-05-05T06:47:05.329565"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84110, "epoch": 0, "train_loss": 3.6084692776203156, "train_ppl": 36.909511336183876, "lr": 0.0003668900237358843, "grad_norm": 0.7614, "tokens_per_sec": 149415, "dt_s": 4.386, "eta_s": 3751, "world_size": 1, "timestamp": "2026-05-05T06:47:09.715731"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84120, "epoch": 0, "train_loss": 3.719770446419716, "train_ppl": 41.254922806332175, "lr": 0.0003665275120477594, "grad_norm": 0.7286, "tokens_per_sec": 132279, "dt_s": 4.954, "eta_s": 3847, "world_size": 1, "timestamp": "2026-05-05T06:47:14.670108"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84130, "epoch": 0, "train_loss": 3.571735292673111, "train_ppl": 35.57827834094781, "lr": 0.0003661650003596345, "grad_norm": 0.8082, "tokens_per_sec": 148354, "dt_s": 4.418, "eta_s": 3851, "world_size": 1, "timestamp": "2026-05-05T06:47:19.087639"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84140, "epoch": 0, "train_loss": 3.6801122575998306, "train_ppl": 39.650844931429816, "lr": 0.0003658024886715097, "grad_norm": 0.7796, "tokens_per_sec": 147092, "dt_s": 4.455, "eta_s": 3852, "world_size": 1, "timestamp": "2026-05-05T06:47:23.543088"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84150, "epoch": 0, "train_loss": 3.6717496812343597, "train_ppl": 39.32064430104482, "lr": 0.0003654399769833848, "grad_norm": 0.7847, "tokens_per_sec": 149371, "dt_s": 4.387, "eta_s": 3858, "world_size": 1, "timestamp": "2026-05-05T06:47:27.930572"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84160, "epoch": 0, "train_loss": 3.616973251104355, "train_ppl": 37.224727236087986, "lr": 0.00036507746529525995, "grad_norm": 0.7891, "tokens_per_sec": 149791, "dt_s": 4.375, "eta_s": 3851, "world_size": 1, "timestamp": "2026-05-05T06:47:32.305757"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84170, "epoch": 0, "train_loss": 3.719824939966202, "train_ppl": 41.257170994641214, "lr": 0.00036471495360713513, "grad_norm": 0.8266, "tokens_per_sec": 147433, "dt_s": 4.445, "eta_s": 3760, "world_size": 1, "timestamp": "2026-05-05T06:47:36.750876"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84180, "epoch": 0, "train_loss": 3.6116801500320435, "train_ppl": 37.028213534688774, "lr": 0.00036435244191901025, "grad_norm": 0.6968, "tokens_per_sec": 150341, "dt_s": 4.359, "eta_s": 3746, "world_size": 1, "timestamp": "2026-05-05T06:47:41.110057"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84190, "epoch": 0, "train_loss": 3.68806229531765, "train_ppl": 39.96732699948154, "lr": 0.0003639899302308854, "grad_norm": 0.8232, "tokens_per_sec": 150171, "dt_s": 4.364, "eta_s": 3726, "world_size": 1, "timestamp": "2026-05-05T06:47:45.474134"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84200, "epoch": 0, "train_loss": 3.7043711245059967, "train_ppl": 40.62449153483588, "lr": 0.0003636274185427605, "grad_norm": 0.7795, "tokens_per_sec": 146161, "dt_s": 4.484, "eta_s": 3738, "world_size": 1, "timestamp": "2026-05-05T06:47:49.957938"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84210, "epoch": 0, "train_loss": 3.7559455037117004, "train_ppl": 42.77464428173213, "lr": 0.0003632649068546356, "grad_norm": 0.73, "tokens_per_sec": 149229, "dt_s": 4.392, "eta_s": 3736, "world_size": 1, "timestamp": "2026-05-05T06:47:54.349584"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84220, "epoch": 0, "train_loss": 3.6108446270227432, "train_ppl": 36.99728853136224, "lr": 0.00036290239516651075, "grad_norm": 0.7348, "tokens_per_sec": 148208, "dt_s": 4.422, "eta_s": 3728, "world_size": 1, "timestamp": "2026-05-05T06:47:58.771491"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84230, "epoch": 0, "train_loss": 3.6795131117105484, "train_ppl": 39.627095406108275, "lr": 0.0003625398834783859, "grad_norm": 0.7711, "tokens_per_sec": 146824, "dt_s": 4.464, "eta_s": 3741, "world_size": 1, "timestamp": "2026-05-05T06:48:03.235055"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84240, "epoch": 0, "train_loss": 3.583690896630287, "train_ppl": 36.006191026553935, "lr": 0.00036217737179026105, "grad_norm": 0.737, "tokens_per_sec": 150423, "dt_s": 4.357, "eta_s": 3735, "world_size": 1, "timestamp": "2026-05-05T06:48:07.591826"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84250, "epoch": 0, "train_loss": 3.6135466247797012, "train_ppl": 37.09739029846075, "lr": 0.0003618148601021362, "grad_norm": 0.8388, "tokens_per_sec": 148830, "dt_s": 4.403, "eta_s": 3717, "world_size": 1, "timestamp": "2026-05-05T06:48:11.995253"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84260, "epoch": 0, "train_loss": 3.852016568183899, "train_ppl": 47.08792356177176, "lr": 0.0003614523484140113, "grad_norm": 1.2179, "tokens_per_sec": 150178, "dt_s": 4.364, "eta_s": 3708, "world_size": 1, "timestamp": "2026-05-05T06:48:16.359148"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84270, "epoch": 0, "train_loss": 3.61156103014946, "train_ppl": 37.02380300093659, "lr": 0.0003610898367258865, "grad_norm": 0.8125, "tokens_per_sec": 150341, "dt_s": 4.359, "eta_s": 3693, "world_size": 1, "timestamp": "2026-05-05T06:48:20.718332"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84280, "epoch": 0, "train_loss": 3.599799856543541, "train_ppl": 36.590910279502346, "lr": 0.0003607273250377616, "grad_norm": 0.7837, "tokens_per_sec": 146304, "dt_s": 4.479, "eta_s": 3692, "world_size": 1, "timestamp": "2026-05-05T06:48:25.197749"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84290, "epoch": 0, "train_loss": 3.7034434974193573, "train_ppl": 40.58682462922918, "lr": 0.0003603648133496367, "grad_norm": 0.7928, "tokens_per_sec": 150668, "dt_s": 4.35, "eta_s": 3686, "world_size": 1, "timestamp": "2026-05-05T06:48:29.547443"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84300, "epoch": 0, "train_loss": 3.6353582590818405, "train_ppl": 37.915434012794876, "lr": 0.00036000230166151185, "grad_norm": 0.7429, "tokens_per_sec": 150930, "dt_s": 4.342, "eta_s": 3671, "world_size": 1, "timestamp": "2026-05-05T06:48:33.889579"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84310, "epoch": 0, "train_loss": 3.621713787317276, "train_ppl": 37.40161133505018, "lr": 0.000359639789973387, "grad_norm": 0.7976, "tokens_per_sec": 147721, "dt_s": 4.436, "eta_s": 3679, "world_size": 1, "timestamp": "2026-05-05T06:48:38.326042"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84320, "epoch": 0, "train_loss": 3.580356791615486, "train_ppl": 35.88634250928186, "lr": 0.0003592772782852621, "grad_norm": 0.8051, "tokens_per_sec": 149738, "dt_s": 4.377, "eta_s": 3678, "world_size": 1, "timestamp": "2026-05-05T06:48:42.702740"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84330, "epoch": 0, "train_loss": 3.6892623007297516, "train_ppl": 40.01531679643737, "lr": 0.0003589147665971372, "grad_norm": 0.7642, "tokens_per_sec": 148610, "dt_s": 4.41, "eta_s": 3662, "world_size": 1, "timestamp": "2026-05-05T06:48:47.112670"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84340, "epoch": 0, "train_loss": 3.666203811764717, "train_ppl": 39.103180710005134, "lr": 0.0003585522549090124, "grad_norm": 0.8591, "tokens_per_sec": 147603, "dt_s": 4.44, "eta_s": 3672, "world_size": 1, "timestamp": "2026-05-05T06:48:51.552694"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84350, "epoch": 0, "train_loss": 3.687080428004265, "train_ppl": 39.928103646712536, "lr": 0.0003581897432208875, "grad_norm": 0.7728, "tokens_per_sec": 149651, "dt_s": 4.379, "eta_s": 3674, "world_size": 1, "timestamp": "2026-05-05T06:48:55.931957"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84360, "epoch": 0, "train_loss": 3.62115778028965, "train_ppl": 37.38082155647044, "lr": 0.00035782723153276265, "grad_norm": 0.7361, "tokens_per_sec": 147731, "dt_s": 4.436, "eta_s": 3670, "world_size": 1, "timestamp": "2026-05-05T06:49:00.368121"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84370, "epoch": 0, "train_loss": 3.5889464765787125, "train_ppl": 36.19592258009013, "lr": 0.00035746471984463783, "grad_norm": 0.778, "tokens_per_sec": 151879, "dt_s": 4.315, "eta_s": 3655, "world_size": 1, "timestamp": "2026-05-05T06:49:04.683126"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84380, "epoch": 0, "train_loss": 3.667796567082405, "train_ppl": 39.16551213519676, "lr": 0.00035710220815651295, "grad_norm": 0.8225, "tokens_per_sec": 152606, "dt_s": 4.294, "eta_s": 3632, "world_size": 1, "timestamp": "2026-05-05T06:49:08.977592"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84390, "epoch": 0, "train_loss": 3.6630782783031464, "train_ppl": 38.981153210099386, "lr": 0.0003567396964683881, "grad_norm": 0.7794, "tokens_per_sec": 149429, "dt_s": 4.386, "eta_s": 3618, "world_size": 1, "timestamp": "2026-05-05T06:49:13.363372"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84400, "epoch": 0, "train_loss": 3.8271990567445755, "train_ppl": 45.93370019666667, "lr": 0.0003563771847802632, "grad_norm": 1.0247, "tokens_per_sec": 150849, "dt_s": 4.344, "eta_s": 3608, "world_size": 1, "timestamp": "2026-05-05T06:49:17.707839"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84410, "epoch": 0, "train_loss": 3.657724380493164, "train_ppl": 38.77300978579788, "lr": 0.0003560146730921383, "grad_norm": 0.8078, "tokens_per_sec": 134910, "dt_s": 4.858, "eta_s": 3673, "world_size": 1, "timestamp": "2026-05-05T06:49:22.565616"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84420, "epoch": 0, "train_loss": 3.6820905804634094, "train_ppl": 39.729364747682766, "lr": 0.00035565216140401345, "grad_norm": 0.7478, "tokens_per_sec": 149337, "dt_s": 4.388, "eta_s": 3681, "world_size": 1, "timestamp": "2026-05-05T06:49:26.954073"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84430, "epoch": 0, "train_loss": 3.6045341342687607, "train_ppl": 36.76455252195866, "lr": 0.00035528964971588857, "grad_norm": 0.7811, "tokens_per_sec": 150546, "dt_s": 4.353, "eta_s": 3686, "world_size": 1, "timestamp": "2026-05-05T06:49:31.307304"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84440, "epoch": 0, "train_loss": 3.641125351190567, "train_ppl": 38.13472754830167, "lr": 0.00035492713802776375, "grad_norm": 0.7943, "tokens_per_sec": 151515, "dt_s": 4.325, "eta_s": 3672, "world_size": 1, "timestamp": "2026-05-05T06:49:35.632657"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84450, "epoch": 0, "train_loss": 3.6688932478427887, "train_ppl": 39.20848775978906, "lr": 0.0003545646263396389, "grad_norm": 0.862, "tokens_per_sec": 150089, "dt_s": 4.366, "eta_s": 3671, "world_size": 1, "timestamp": "2026-05-05T06:49:39.999131"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84460, "epoch": 0, "train_loss": 3.5752233266830444, "train_ppl": 35.702593267122595, "lr": 0.000354202114651514, "grad_norm": 0.928, "tokens_per_sec": 151382, "dt_s": 4.329, "eta_s": 3580, "world_size": 1, "timestamp": "2026-05-05T06:49:44.328318"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84470, "epoch": 0, "train_loss": 3.666742816567421, "train_ppl": 39.12426319347833, "lr": 0.0003538396029633892, "grad_norm": 0.8015, "tokens_per_sec": 149490, "dt_s": 4.384, "eta_s": 3575, "world_size": 1, "timestamp": "2026-05-05T06:49:48.712290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84480, "epoch": 0, "train_loss": 3.6031541377305984, "train_ppl": 36.71385255768491, "lr": 0.0003534770912752643, "grad_norm": 0.7514, "tokens_per_sec": 151954, "dt_s": 4.313, "eta_s": 3564, "world_size": 1, "timestamp": "2026-05-05T06:49:53.025155"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84490, "epoch": 0, "train_loss": 3.6843169182538986, "train_ppl": 39.81791426780892, "lr": 0.0003531145795871394, "grad_norm": 0.7256, "tokens_per_sec": 152178, "dt_s": 4.307, "eta_s": 3556, "world_size": 1, "timestamp": "2026-05-05T06:49:57.331711"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84500, "epoch": 0, "train_loss": 3.541205868124962, "train_ppl": 34.5085068196338, "lr": 0.00035275206789901455, "grad_norm": 0.7194, "tokens_per_sec": 148738, "dt_s": 4.406, "eta_s": 3558, "world_size": 1, "timestamp": "2026-05-05T06:50:01.737865"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84510, "epoch": 0, "train_loss": 3.6770975440740585, "train_ppl": 39.53148899528145, "lr": 0.0003523895562108897, "grad_norm": 0.9338, "tokens_per_sec": 127339, "dt_s": 5.147, "eta_s": 3561, "world_size": 1, "timestamp": "2026-05-05T06:50:06.884398"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84520, "epoch": 0, "train_loss": 3.4967023581266403, "train_ppl": 33.00642891593063, "lr": 0.0003520270445227648, "grad_norm": 0.725, "tokens_per_sec": 149635, "dt_s": 4.38, "eta_s": 3556, "world_size": 1, "timestamp": "2026-05-05T06:50:11.264146"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84530, "epoch": 0, "train_loss": 3.5862308889627457, "train_ppl": 36.09776272215307, "lr": 0.0003516645328346399, "grad_norm": 0.7323, "tokens_per_sec": 147770, "dt_s": 4.435, "eta_s": 3572, "world_size": 1, "timestamp": "2026-05-05T06:50:15.699134"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84540, "epoch": 0, "train_loss": 3.65760438144207, "train_ppl": 38.768357340565615, "lr": 0.0003513020211465151, "grad_norm": 0.7922, "tokens_per_sec": 151272, "dt_s": 4.332, "eta_s": 3572, "world_size": 1, "timestamp": "2026-05-05T06:50:20.031457"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84550, "epoch": 0, "train_loss": 3.612496793270111, "train_ppl": 37.058464725423555, "lr": 0.0003509395094583902, "grad_norm": 0.8186, "tokens_per_sec": 150372, "dt_s": 4.358, "eta_s": 3559, "world_size": 1, "timestamp": "2026-05-05T06:50:24.389714"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84560, "epoch": 0, "train_loss": 3.627026751637459, "train_ppl": 37.60085357637312, "lr": 0.00035057699777026535, "grad_norm": 0.789, "tokens_per_sec": 150769, "dt_s": 4.347, "eta_s": 3551, "world_size": 1, "timestamp": "2026-05-05T06:50:28.736490"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84570, "epoch": 0, "train_loss": 3.720096781849861, "train_ppl": 41.268387946268405, "lr": 0.0003502144860821405, "grad_norm": 0.7623, "tokens_per_sec": 151827, "dt_s": 4.316, "eta_s": 3536, "world_size": 1, "timestamp": "2026-05-05T06:50:33.052975"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84580, "epoch": 0, "train_loss": 3.648702308535576, "train_ppl": 38.42477018502421, "lr": 0.00034985197439401565, "grad_norm": 0.7685, "tokens_per_sec": 149169, "dt_s": 4.393, "eta_s": 3525, "world_size": 1, "timestamp": "2026-05-05T06:50:37.446388"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84590, "epoch": 0, "train_loss": 3.6893053352832794, "train_ppl": 40.01703887478415, "lr": 0.0003494894627058908, "grad_norm": 0.7762, "tokens_per_sec": 149205, "dt_s": 4.392, "eta_s": 3530, "world_size": 1, "timestamp": "2026-05-05T06:50:41.838706"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84600, "epoch": 0, "train_loss": 3.6967710852622986, "train_ppl": 40.31691408574429, "lr": 0.0003491269510177659, "grad_norm": 0.859, "tokens_per_sec": 149892, "dt_s": 4.372, "eta_s": 3528, "world_size": 1, "timestamp": "2026-05-05T06:50:46.210903"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84610, "epoch": 0, "train_loss": 3.638801336288452, "train_ppl": 38.046204777153946, "lr": 0.000348764439329641, "grad_norm": 0.7623, "tokens_per_sec": 148642, "dt_s": 4.409, "eta_s": 3534, "world_size": 1, "timestamp": "2026-05-05T06:50:50.619898"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84620, "epoch": 0, "train_loss": 3.5277576744556427, "train_ppl": 34.04753629447278, "lr": 0.00034840192764151615, "grad_norm": 0.7195, "tokens_per_sec": 150247, "dt_s": 4.362, "eta_s": 3537, "world_size": 1, "timestamp": "2026-05-05T06:50:54.981775"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84630, "epoch": 0, "train_loss": 3.6403557807207108, "train_ppl": 38.10539147763915, "lr": 0.00034803941595339127, "grad_norm": 0.7721, "tokens_per_sec": 148897, "dt_s": 4.401, "eta_s": 3534, "world_size": 1, "timestamp": "2026-05-05T06:50:59.383205"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84640, "epoch": 0, "train_loss": 3.5338292717933655, "train_ppl": 34.254888066438305, "lr": 0.00034767690426526645, "grad_norm": 0.809, "tokens_per_sec": 149452, "dt_s": 4.385, "eta_s": 3528, "world_size": 1, "timestamp": "2026-05-05T06:51:03.768299"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84650, "epoch": 0, "train_loss": 3.632710352540016, "train_ppl": 37.81517029010239, "lr": 0.0003473143925771416, "grad_norm": 0.7533, "tokens_per_sec": 150593, "dt_s": 4.352, "eta_s": 3521, "world_size": 1, "timestamp": "2026-05-05T06:51:08.120136"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84660, "epoch": 0, "train_loss": 3.714081421494484, "train_ppl": 41.02088886586755, "lr": 0.0003469518808890167, "grad_norm": 0.747, "tokens_per_sec": 149259, "dt_s": 4.391, "eta_s": 3513, "world_size": 1, "timestamp": "2026-05-05T06:51:12.510894"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84670, "epoch": 0, "train_loss": 3.6316671818494797, "train_ppl": 37.77574318097583, "lr": 0.0003465893692008919, "grad_norm": 0.7497, "tokens_per_sec": 150223, "dt_s": 4.363, "eta_s": 3509, "world_size": 1, "timestamp": "2026-05-05T06:51:16.873459"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84680, "epoch": 0, "train_loss": 3.5790398865938187, "train_ppl": 35.83911470871133, "lr": 0.000346226857512767, "grad_norm": 0.7226, "tokens_per_sec": 150705, "dt_s": 4.349, "eta_s": 3496, "world_size": 1, "timestamp": "2026-05-05T06:51:21.222094"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84690, "epoch": 0, "train_loss": 3.7473862916231155, "train_ppl": 42.41008940616813, "lr": 0.0003458643458246421, "grad_norm": 0.79, "tokens_per_sec": 148859, "dt_s": 4.403, "eta_s": 3495, "world_size": 1, "timestamp": "2026-05-05T06:51:25.624671"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84700, "epoch": 0, "train_loss": 5.416746065020561, "train_ppl": 225.14532104278015, "lr": 0.00034550183413651725, "grad_norm": 6.2666, "tokens_per_sec": 148722, "dt_s": 4.407, "eta_s": 3499, "world_size": 1, "timestamp": "2026-05-05T06:51:30.031286"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84710, "epoch": 0, "train_loss": 3.649176597595215, "train_ppl": 38.442998955653636, "lr": 0.00034513932244839237, "grad_norm": 0.8967, "tokens_per_sec": 134632, "dt_s": 4.868, "eta_s": 3571, "world_size": 1, "timestamp": "2026-05-05T06:51:34.899080"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84720, "epoch": 0, "train_loss": 3.639407590031624, "train_ppl": 38.06927742444628, "lr": 0.0003447768107602675, "grad_norm": 0.8345, "tokens_per_sec": 145831, "dt_s": 4.494, "eta_s": 3587, "world_size": 1, "timestamp": "2026-05-05T06:51:39.393034"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84730, "epoch": 0, "train_loss": 3.6526876091957092, "train_ppl": 38.57820999581219, "lr": 0.0003444142990721426, "grad_norm": 0.7519, "tokens_per_sec": 149136, "dt_s": 4.394, "eta_s": 3590, "world_size": 1, "timestamp": "2026-05-05T06:51:43.787436"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84740, "epoch": 0, "train_loss": 3.6161329448223114, "train_ppl": 37.19346020272627, "lr": 0.0003440517873840178, "grad_norm": 0.7715, "tokens_per_sec": 149217, "dt_s": 4.392, "eta_s": 3584, "world_size": 1, "timestamp": "2026-05-05T06:51:48.179430"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84750, "epoch": 0, "train_loss": 3.6408242881298065, "train_ppl": 38.12324831857743, "lr": 0.0003436892756958929, "grad_norm": 1.0992, "tokens_per_sec": 148915, "dt_s": 4.401, "eta_s": 3578, "world_size": 1, "timestamp": "2026-05-05T06:51:52.580349"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84760, "epoch": 0, "train_loss": 3.678012177348137, "train_ppl": 39.567662350645875, "lr": 0.00034332676400776805, "grad_norm": 0.8742, "tokens_per_sec": 150254, "dt_s": 4.362, "eta_s": 3494, "world_size": 1, "timestamp": "2026-05-05T06:51:56.942019"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84770, "epoch": 0, "train_loss": 3.5920072942972183, "train_ppl": 36.306881427035854, "lr": 0.0003429642523196432, "grad_norm": 0.8033, "tokens_per_sec": 149954, "dt_s": 4.37, "eta_s": 3470, "world_size": 1, "timestamp": "2026-05-05T06:52:01.312428"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84780, "epoch": 0, "train_loss": 3.744212418794632, "train_ppl": 42.27569855829372, "lr": 0.00034260174063151835, "grad_norm": 0.8029, "tokens_per_sec": 149502, "dt_s": 4.384, "eta_s": 3464, "world_size": 1, "timestamp": "2026-05-05T06:52:05.696064"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84790, "epoch": 0, "train_loss": 3.635031744837761, "train_ppl": 37.90305610441091, "lr": 0.0003422392289433935, "grad_norm": 0.7723, "tokens_per_sec": 149369, "dt_s": 4.388, "eta_s": 3458, "world_size": 1, "timestamp": "2026-05-05T06:52:10.083597"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84800, "epoch": 0, "train_loss": 3.7766288071870804, "train_ppl": 43.668578114393945, "lr": 0.0003418767172552686, "grad_norm": 0.7536, "tokens_per_sec": 145386, "dt_s": 4.508, "eta_s": 3471, "world_size": 1, "timestamp": "2026-05-05T06:52:14.591345"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84810, "epoch": 0, "train_loss": 3.673152595758438, "train_ppl": 39.375846516972175, "lr": 0.0003415142055671437, "grad_norm": 0.887, "tokens_per_sec": 149646, "dt_s": 4.379, "eta_s": 3469, "world_size": 1, "timestamp": "2026-05-05T06:52:18.970755"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84820, "epoch": 0, "train_loss": 3.6771211326122284, "train_ppl": 39.53242149631665, "lr": 0.00034115169387901885, "grad_norm": 0.8784, "tokens_per_sec": 150308, "dt_s": 4.36, "eta_s": 3463, "world_size": 1, "timestamp": "2026-05-05T06:52:23.330836"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84830, "epoch": 0, "train_loss": 3.647289678454399, "train_ppl": 38.37052851952276, "lr": 0.00034078918219089397, "grad_norm": 0.8135, "tokens_per_sec": 147038, "dt_s": 4.457, "eta_s": 3470, "world_size": 1, "timestamp": "2026-05-05T06:52:27.787923"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84840, "epoch": 0, "train_loss": 3.601308062672615, "train_ppl": 36.646138551994916, "lr": 0.00034042667050276915, "grad_norm": 0.8438, "tokens_per_sec": 150279, "dt_s": 4.361, "eta_s": 3462, "world_size": 1, "timestamp": "2026-05-05T06:52:32.148851"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84850, "epoch": 0, "train_loss": 3.6268937289714813, "train_ppl": 37.595852143246724, "lr": 0.00034006415881464427, "grad_norm": 0.8082, "tokens_per_sec": 149027, "dt_s": 4.398, "eta_s": 3440, "world_size": 1, "timestamp": "2026-05-05T06:52:36.546440"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84860, "epoch": 0, "train_loss": 3.6953479200601578, "train_ppl": 40.259577266120495, "lr": 0.0003397016471265194, "grad_norm": 0.8022, "tokens_per_sec": 148888, "dt_s": 4.402, "eta_s": 3439, "world_size": 1, "timestamp": "2026-05-05T06:52:40.948138"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84870, "epoch": 0, "train_loss": 3.666824609041214, "train_ppl": 39.127463394624655, "lr": 0.0003393391354383946, "grad_norm": 0.8584, "tokens_per_sec": 151000, "dt_s": 4.34, "eta_s": 3432, "world_size": 1, "timestamp": "2026-05-05T06:52:45.288268"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84880, "epoch": 0, "train_loss": 3.6034228056669235, "train_ppl": 36.7237177178529, "lr": 0.0003389766237502697, "grad_norm": 0.7909, "tokens_per_sec": 147456, "dt_s": 4.444, "eta_s": 3425, "world_size": 1, "timestamp": "2026-05-05T06:52:49.732694"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84890, "epoch": 0, "train_loss": 3.7033671736717224, "train_ppl": 40.583727008881375, "lr": 0.0003386141120621448, "grad_norm": 0.7834, "tokens_per_sec": 148076, "dt_s": 4.426, "eta_s": 3431, "world_size": 1, "timestamp": "2026-05-05T06:52:54.158533"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84900, "epoch": 0, "train_loss": 3.671426236629486, "train_ppl": 39.30792830735613, "lr": 0.00033825160037401995, "grad_norm": 0.7769, "tokens_per_sec": 149285, "dt_s": 4.39, "eta_s": 3426, "world_size": 1, "timestamp": "2026-05-05T06:52:58.548534"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84910, "epoch": 0, "train_loss": 3.608749821782112, "train_ppl": 36.91986753672188, "lr": 0.00033788908868589507, "grad_norm": 0.7569, "tokens_per_sec": 146719, "dt_s": 4.467, "eta_s": 3431, "world_size": 1, "timestamp": "2026-05-05T06:53:03.015329"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84920, "epoch": 0, "train_loss": 3.6342716962099075, "train_ppl": 37.87425888365764, "lr": 0.0003375265769977702, "grad_norm": 0.7348, "tokens_per_sec": 148457, "dt_s": 4.414, "eta_s": 3438, "world_size": 1, "timestamp": "2026-05-05T06:53:07.429810"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84930, "epoch": 0, "train_loss": 3.628286361694336, "train_ppl": 37.648245831299434, "lr": 0.0003371640653096453, "grad_norm": 0.7774, "tokens_per_sec": 147705, "dt_s": 4.437, "eta_s": 3433, "world_size": 1, "timestamp": "2026-05-05T06:53:11.866730"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84940, "epoch": 0, "train_loss": 3.6559211760759354, "train_ppl": 38.703157121526694, "lr": 0.0003368015536215205, "grad_norm": 0.817, "tokens_per_sec": 146319, "dt_s": 4.479, "eta_s": 3437, "world_size": 1, "timestamp": "2026-05-05T06:53:16.345737"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84950, "epoch": 0, "train_loss": 3.5993493646383286, "train_ppl": 36.57443008299335, "lr": 0.0003364390419333956, "grad_norm": 0.7761, "tokens_per_sec": 149524, "dt_s": 4.383, "eta_s": 3431, "world_size": 1, "timestamp": "2026-05-05T06:53:20.728701"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84960, "epoch": 0, "train_loss": 3.653369128704071, "train_ppl": 38.60451075974588, "lr": 0.00033607653024527075, "grad_norm": 0.7712, "tokens_per_sec": 147427, "dt_s": 4.445, "eta_s": 3423, "world_size": 1, "timestamp": "2026-05-05T06:53:25.174008"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84970, "epoch": 0, "train_loss": 3.6987593919038773, "train_ppl": 40.397156220337976, "lr": 0.0003357140185571459, "grad_norm": 0.8627, "tokens_per_sec": 148221, "dt_s": 4.422, "eta_s": 3420, "world_size": 1, "timestamp": "2026-05-05T06:53:29.595562"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84980, "epoch": 0, "train_loss": 3.6151032745838165, "train_ppl": 37.15518291356347, "lr": 0.00033535150686902105, "grad_norm": 0.8058, "tokens_per_sec": 149406, "dt_s": 4.386, "eta_s": 3408, "world_size": 1, "timestamp": "2026-05-05T06:53:33.981983"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 84990, "epoch": 0, "train_loss": 3.562528058886528, "train_ppl": 35.252204238310206, "lr": 0.00033498899518089617, "grad_norm": 0.7734, "tokens_per_sec": 147921, "dt_s": 4.43, "eta_s": 3396, "world_size": 1, "timestamp": "2026-05-05T06:53:38.412435"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85000, "epoch": 0, "train_loss": 3.6194284707307816, "train_ppl": 37.31623440611843, "lr": 0.0003346264834927713, "grad_norm": 0.8636, "tokens_per_sec": 148535, "dt_s": 4.412, "eta_s": 3396, "world_size": 1, "timestamp": "2026-05-05T06:53:42.824597"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85010, "epoch": 0, "train_loss": 3.601536050438881, "train_ppl": 36.65449437574226, "lr": 0.0003342639718046464, "grad_norm": 0.788, "tokens_per_sec": 115176, "dt_s": 5.69, "eta_s": 3464, "world_size": 1, "timestamp": "2026-05-05T06:53:48.514670"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85020, "epoch": 0, "train_loss": 3.602931037545204, "train_ppl": 36.705662603996814, "lr": 0.00033390146011652154, "grad_norm": 0.7584, "tokens_per_sec": 144144, "dt_s": 4.547, "eta_s": 3479, "world_size": 1, "timestamp": "2026-05-05T06:53:53.061254"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85030, "epoch": 0, "train_loss": 3.5649349838495255, "train_ppl": 35.33715584374955, "lr": 0.00033353894842839667, "grad_norm": 0.7428, "tokens_per_sec": 147252, "dt_s": 4.451, "eta_s": 3484, "world_size": 1, "timestamp": "2026-05-05T06:53:57.511866"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85040, "epoch": 0, "train_loss": 3.5831703692674637, "train_ppl": 35.987453695963275, "lr": 0.00033317643674027185, "grad_norm": 0.9568, "tokens_per_sec": 148237, "dt_s": 4.421, "eta_s": 3478, "world_size": 1, "timestamp": "2026-05-05T06:54:01.932881"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85050, "epoch": 0, "train_loss": 3.678601920604706, "train_ppl": 39.59100399480747, "lr": 0.00033281392505214697, "grad_norm": 0.7875, "tokens_per_sec": 148965, "dt_s": 4.399, "eta_s": 3472, "world_size": 1, "timestamp": "2026-05-05T06:54:06.332288"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85060, "epoch": 0, "train_loss": 3.602513238787651, "train_ppl": 36.69033022691278, "lr": 0.0003324514133640221, "grad_norm": 0.8059, "tokens_per_sec": 149106, "dt_s": 4.395, "eta_s": 3387, "world_size": 1, "timestamp": "2026-05-05T06:54:10.727545"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85070, "epoch": 0, "train_loss": 3.6423540264368057, "train_ppl": 38.18161154076464, "lr": 0.00033208890167589727, "grad_norm": 0.8244, "tokens_per_sec": 146261, "dt_s": 4.481, "eta_s": 3373, "world_size": 1, "timestamp": "2026-05-05T06:54:15.208309"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85080, "epoch": 0, "train_loss": 3.6626377254724503, "train_ppl": 38.963983735017, "lr": 0.0003317263899877724, "grad_norm": 0.7984, "tokens_per_sec": 150645, "dt_s": 4.35, "eta_s": 3353, "world_size": 1, "timestamp": "2026-05-05T06:54:19.558640"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85090, "epoch": 0, "train_loss": 3.6190431267023087, "train_ppl": 37.30185758821331, "lr": 0.0003313638782996475, "grad_norm": 0.7865, "tokens_per_sec": 148191, "dt_s": 4.422, "eta_s": 3349, "world_size": 1, "timestamp": "2026-05-05T06:54:23.981040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85100, "epoch": 0, "train_loss": 3.6083193123340607, "train_ppl": 36.903976605769905, "lr": 0.00033100136661152265, "grad_norm": 0.8371, "tokens_per_sec": 147519, "dt_s": 4.443, "eta_s": 3351, "world_size": 1, "timestamp": "2026-05-05T06:54:28.423567"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85110, "epoch": 0, "train_loss": 3.8003844618797302, "train_ppl": 44.71837369880269, "lr": 0.00033063885492339777, "grad_norm": 0.7986, "tokens_per_sec": 149409, "dt_s": 4.386, "eta_s": 3345, "world_size": 1, "timestamp": "2026-05-05T06:54:32.809974"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85120, "epoch": 0, "train_loss": 3.6677258610725403, "train_ppl": 39.16274299600791, "lr": 0.0003302763432352729, "grad_norm": 0.8067, "tokens_per_sec": 150914, "dt_s": 4.343, "eta_s": 3320, "world_size": 1, "timestamp": "2026-05-05T06:54:37.152511"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85130, "epoch": 0, "train_loss": 3.7107207477092743, "train_ppl": 40.883262428321295, "lr": 0.000329913831547148, "grad_norm": 0.7896, "tokens_per_sec": 148680, "dt_s": 4.408, "eta_s": 3324, "world_size": 1, "timestamp": "2026-05-05T06:54:41.560371"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85140, "epoch": 0, "train_loss": 3.6947107762098312, "train_ppl": 40.23393429404711, "lr": 0.0003295513198590232, "grad_norm": 0.7899, "tokens_per_sec": 150306, "dt_s": 4.36, "eta_s": 3310, "world_size": 1, "timestamp": "2026-05-05T06:54:45.920565"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85150, "epoch": 0, "train_loss": 3.5744218230247498, "train_ppl": 35.67398897276298, "lr": 0.0003291888081708983, "grad_norm": 0.758, "tokens_per_sec": 150235, "dt_s": 4.362, "eta_s": 3294, "world_size": 1, "timestamp": "2026-05-05T06:54:50.282802"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85160, "epoch": 0, "train_loss": 3.641959920525551, "train_ppl": 38.16656690673952, "lr": 0.00032882629648277344, "grad_norm": 0.8483, "tokens_per_sec": 149496, "dt_s": 4.384, "eta_s": 3289, "world_size": 1, "timestamp": "2026-05-05T06:54:54.666595"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85170, "epoch": 0, "train_loss": 3.6287563145160675, "train_ppl": 37.66594288872642, "lr": 0.0003284637847946486, "grad_norm": 0.855, "tokens_per_sec": 150697, "dt_s": 4.349, "eta_s": 3286, "world_size": 1, "timestamp": "2026-05-05T06:54:59.015472"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85180, "epoch": 0, "train_loss": 3.479418560862541, "train_ppl": 32.440854208847576, "lr": 0.00032810127310652375, "grad_norm": 0.7814, "tokens_per_sec": 147634, "dt_s": 4.439, "eta_s": 3286, "world_size": 1, "timestamp": "2026-05-05T06:55:03.454535"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85190, "epoch": 0, "train_loss": 3.5811355113983154, "train_ppl": 35.91429879777194, "lr": 0.00032773876141839887, "grad_norm": 0.7901, "tokens_per_sec": 150702, "dt_s": 4.349, "eta_s": 3280, "world_size": 1, "timestamp": "2026-05-05T06:55:07.803262"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85200, "epoch": 0, "train_loss": 3.5287022590637207, "train_ppl": 34.079712267271454, "lr": 0.000327376249730274, "grad_norm": 0.7494, "tokens_per_sec": 150176, "dt_s": 4.364, "eta_s": 3276, "world_size": 1, "timestamp": "2026-05-05T06:55:12.167196"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85210, "epoch": 0, "train_loss": 3.6555925756692886, "train_ppl": 38.690441337678344, "lr": 0.0003270137380421491, "grad_norm": 0.7763, "tokens_per_sec": 147531, "dt_s": 4.442, "eta_s": 3280, "world_size": 1, "timestamp": "2026-05-05T06:55:16.609383"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85220, "epoch": 0, "train_loss": 3.6072309017181396, "train_ppl": 36.86383177685455, "lr": 0.00032665122635402424, "grad_norm": 0.7657, "tokens_per_sec": 150861, "dt_s": 4.344, "eta_s": 3275, "world_size": 1, "timestamp": "2026-05-05T06:55:20.953524"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85230, "epoch": 0, "train_loss": 3.6568098813295364, "train_ppl": 38.737568108939385, "lr": 0.00032628871466589937, "grad_norm": 0.8156, "tokens_per_sec": 150445, "dt_s": 4.356, "eta_s": 3258, "world_size": 1, "timestamp": "2026-05-05T06:55:25.309672"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85240, "epoch": 0, "train_loss": 3.6324206441640854, "train_ppl": 37.804216505310315, "lr": 0.0003259262029777745, "grad_norm": 0.7403, "tokens_per_sec": 146908, "dt_s": 4.461, "eta_s": 3271, "world_size": 1, "timestamp": "2026-05-05T06:55:29.770687"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85250, "epoch": 0, "train_loss": 3.6927120983600616, "train_ppl": 40.153599928765466, "lr": 0.00032556369128964967, "grad_norm": 0.8006, "tokens_per_sec": 149698, "dt_s": 4.378, "eta_s": 3268, "world_size": 1, "timestamp": "2026-05-05T06:55:34.148546"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85260, "epoch": 0, "train_loss": 3.6696577221155167, "train_ppl": 39.23847310000423, "lr": 0.0003252011796015248, "grad_norm": 0.8434, "tokens_per_sec": 149807, "dt_s": 4.375, "eta_s": 3254, "world_size": 1, "timestamp": "2026-05-05T06:55:38.523261"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85270, "epoch": 0, "train_loss": 3.6774217933416367, "train_ppl": 39.54430912998176, "lr": 0.00032483866791339997, "grad_norm": 0.8191, "tokens_per_sec": 150856, "dt_s": 4.344, "eta_s": 3250, "world_size": 1, "timestamp": "2026-05-05T06:55:42.867528"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85280, "epoch": 0, "train_loss": 3.6337143033742905, "train_ppl": 37.85315392552423, "lr": 0.0003244761562252751, "grad_norm": 0.7777, "tokens_per_sec": 149030, "dt_s": 4.398, "eta_s": 3251, "world_size": 1, "timestamp": "2026-05-05T06:55:47.265039"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85290, "epoch": 0, "train_loss": 3.6924035102128983, "train_ppl": 40.141210915411165, "lr": 0.0003241136445371502, "grad_norm": 0.8543, "tokens_per_sec": 147600, "dt_s": 4.44, "eta_s": 3244, "world_size": 1, "timestamp": "2026-05-05T06:55:51.705161"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85300, "epoch": 0, "train_loss": 3.6819698363542557, "train_ppl": 39.72456795052739, "lr": 0.00032375113284902534, "grad_norm": 0.7562, "tokens_per_sec": 135250, "dt_s": 4.846, "eta_s": 3309, "world_size": 1, "timestamp": "2026-05-05T06:55:56.550702"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85310, "epoch": 0, "train_loss": 3.664704978466034, "train_ppl": 39.04461546140883, "lr": 0.00032338862116090047, "grad_norm": 0.8795, "tokens_per_sec": 150200, "dt_s": 4.363, "eta_s": 3302, "world_size": 1, "timestamp": "2026-05-05T06:56:00.913934"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85320, "epoch": 0, "train_loss": 3.599818468093872, "train_ppl": 36.59159129940807, "lr": 0.0003230261094727756, "grad_norm": 0.7573, "tokens_per_sec": 147752, "dt_s": 4.436, "eta_s": 3311, "world_size": 1, "timestamp": "2026-05-05T06:56:05.349474"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85330, "epoch": 0, "train_loss": 3.686036691069603, "train_ppl": 39.88645095121514, "lr": 0.0003226635977846507, "grad_norm": 0.8012, "tokens_per_sec": 150217, "dt_s": 4.363, "eta_s": 3302, "world_size": 1, "timestamp": "2026-05-05T06:56:09.712238"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85340, "epoch": 0, "train_loss": 3.6357664614915848, "train_ppl": 37.930914343664625, "lr": 0.00032230108609652584, "grad_norm": 0.7718, "tokens_per_sec": 150081, "dt_s": 4.367, "eta_s": 3287, "world_size": 1, "timestamp": "2026-05-05T06:56:14.078919"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85350, "epoch": 0, "train_loss": 3.761929601430893, "train_ppl": 43.031379330700766, "lr": 0.000321938574408401, "grad_norm": 0.8612, "tokens_per_sec": 147102, "dt_s": 4.455, "eta_s": 3225, "world_size": 1, "timestamp": "2026-05-05T06:56:18.534062"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85360, "epoch": 0, "train_loss": 3.735384240746498, "train_ppl": 41.90412374190448, "lr": 0.00032157606272027614, "grad_norm": 0.7754, "tokens_per_sec": 151289, "dt_s": 4.332, "eta_s": 3216, "world_size": 1, "timestamp": "2026-05-05T06:56:22.865914"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85370, "epoch": 0, "train_loss": 3.761312499642372, "train_ppl": 43.00483278135726, "lr": 0.0003212135510321513, "grad_norm": 0.806, "tokens_per_sec": 149094, "dt_s": 4.396, "eta_s": 3206, "world_size": 1, "timestamp": "2026-05-05T06:56:27.261511"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85380, "epoch": 0, "train_loss": 3.6047830879688263, "train_ppl": 36.77370633273051, "lr": 0.00032085103934402644, "grad_norm": 0.7691, "tokens_per_sec": 149714, "dt_s": 4.377, "eta_s": 3203, "world_size": 1, "timestamp": "2026-05-05T06:56:31.638939"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85390, "epoch": 0, "train_loss": 3.6684450656175613, "train_ppl": 39.190919149760525, "lr": 0.00032048852765590157, "grad_norm": 0.8371, "tokens_per_sec": 150922, "dt_s": 4.342, "eta_s": 3195, "world_size": 1, "timestamp": "2026-05-05T06:56:35.981317"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85400, "epoch": 0, "train_loss": 3.6764493882656097, "train_ppl": 39.505874732985696, "lr": 0.0003201260159677767, "grad_norm": 0.8765, "tokens_per_sec": 147386, "dt_s": 4.447, "eta_s": 3190, "world_size": 1, "timestamp": "2026-05-05T06:56:40.427864"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85410, "epoch": 0, "train_loss": 3.6892722100019455, "train_ppl": 40.01571332106806, "lr": 0.0003197635042796518, "grad_norm": 0.8121, "tokens_per_sec": 150651, "dt_s": 4.35, "eta_s": 3188, "world_size": 1, "timestamp": "2026-05-05T06:56:44.778044"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85420, "epoch": 0, "train_loss": 3.7051542550325394, "train_ppl": 40.656318274901345, "lr": 0.00031940099259152694, "grad_norm": 0.8037, "tokens_per_sec": 149658, "dt_s": 4.379, "eta_s": 3181, "world_size": 1, "timestamp": "2026-05-05T06:56:49.157081"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85430, "epoch": 0, "train_loss": 3.6559814363718033, "train_ppl": 38.70548945549872, "lr": 0.00031903848090340207, "grad_norm": 0.8157, "tokens_per_sec": 147452, "dt_s": 4.445, "eta_s": 3187, "world_size": 1, "timestamp": "2026-05-05T06:56:53.601658"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85440, "epoch": 0, "train_loss": 3.666017398238182, "train_ppl": 39.09589202756585, "lr": 0.0003186759692152772, "grad_norm": 0.8114, "tokens_per_sec": 150533, "dt_s": 4.354, "eta_s": 3184, "world_size": 1, "timestamp": "2026-05-05T06:56:57.955250"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85450, "epoch": 0, "train_loss": 3.6084239929914474, "train_ppl": 36.90783994050586, "lr": 0.00031831345752715237, "grad_norm": 0.7792, "tokens_per_sec": 148588, "dt_s": 4.411, "eta_s": 3174, "world_size": 1, "timestamp": "2026-05-05T06:57:02.365831"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85460, "epoch": 0, "train_loss": 3.6376627534627914, "train_ppl": 38.00291067345167, "lr": 0.0003179509458390275, "grad_norm": 0.82, "tokens_per_sec": 147065, "dt_s": 4.456, "eta_s": 3185, "world_size": 1, "timestamp": "2026-05-05T06:57:06.822110"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85470, "epoch": 0, "train_loss": 3.6631187051534653, "train_ppl": 38.98272912719994, "lr": 0.00031758843415090267, "grad_norm": 0.8037, "tokens_per_sec": 147222, "dt_s": 4.452, "eta_s": 3191, "world_size": 1, "timestamp": "2026-05-05T06:57:11.273617"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85480, "epoch": 0, "train_loss": 3.716162085533142, "train_ppl": 41.10632840883334, "lr": 0.0003172259224627778, "grad_norm": 0.8045, "tokens_per_sec": 147343, "dt_s": 4.448, "eta_s": 3187, "world_size": 1, "timestamp": "2026-05-05T06:57:15.721461"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85490, "epoch": 0, "train_loss": 3.6756937205791473, "train_ppl": 39.476032696775846, "lr": 0.0003168634107746529, "grad_norm": 0.7965, "tokens_per_sec": 149341, "dt_s": 4.388, "eta_s": 3188, "world_size": 1, "timestamp": "2026-05-05T06:57:20.109812"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85500, "epoch": 0, "train_loss": 3.588272050023079, "train_ppl": 36.1715193187252, "lr": 0.00031650089908652804, "grad_norm": 0.812, "tokens_per_sec": 150253, "dt_s": 4.362, "eta_s": 3176, "world_size": 1, "timestamp": "2026-05-05T06:57:24.471521"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85510, "epoch": 0, "train_loss": 3.660460516810417, "train_ppl": 38.87924329440661, "lr": 0.00031613838739840317, "grad_norm": 0.788, "tokens_per_sec": 126291, "dt_s": 5.189, "eta_s": 3167, "world_size": 1, "timestamp": "2026-05-05T06:57:29.660798"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85520, "epoch": 0, "train_loss": 3.6351838409900665, "train_ppl": 37.908821451837476, "lr": 0.0003157758757102783, "grad_norm": 0.8169, "tokens_per_sec": 150546, "dt_s": 4.353, "eta_s": 3148, "world_size": 1, "timestamp": "2026-05-05T06:57:34.014022"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85530, "epoch": 0, "train_loss": 3.5815713554620743, "train_ppl": 35.9299552433436, "lr": 0.00031541336402215347, "grad_norm": 0.7856, "tokens_per_sec": 150380, "dt_s": 4.358, "eta_s": 3131, "world_size": 1, "timestamp": "2026-05-05T06:57:38.372028"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85540, "epoch": 0, "train_loss": 3.5552298724651337, "train_ppl": 34.99586362859186, "lr": 0.0003150508523340286, "grad_norm": 0.8147, "tokens_per_sec": 148084, "dt_s": 4.426, "eta_s": 3132, "world_size": 1, "timestamp": "2026-05-05T06:57:42.797617"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85550, "epoch": 0, "train_loss": 3.6198804825544357, "train_ppl": 37.3331055979861, "lr": 0.0003146883406459037, "grad_norm": 0.7769, "tokens_per_sec": 149983, "dt_s": 4.37, "eta_s": 3129, "world_size": 1, "timestamp": "2026-05-05T06:57:47.167189"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85560, "epoch": 0, "train_loss": 3.619969829916954, "train_ppl": 37.336441361524514, "lr": 0.00031432582895777884, "grad_norm": 0.8514, "tokens_per_sec": 150187, "dt_s": 4.364, "eta_s": 3116, "world_size": 1, "timestamp": "2026-05-05T06:57:51.530808"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85570, "epoch": 0, "train_loss": 3.5986026972532272, "train_ppl": 36.54713134172987, "lr": 0.000313963317269654, "grad_norm": 0.8215, "tokens_per_sec": 151032, "dt_s": 4.339, "eta_s": 3110, "world_size": 1, "timestamp": "2026-05-05T06:57:55.870017"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85580, "epoch": 0, "train_loss": 3.5849811285734177, "train_ppl": 36.05267734698581, "lr": 0.0003136008055815291, "grad_norm": 0.84, "tokens_per_sec": 152249, "dt_s": 4.305, "eta_s": 3098, "world_size": 1, "timestamp": "2026-05-05T06:58:00.174551"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85590, "epoch": 0, "train_loss": 3.6365056335926056, "train_ppl": 37.958962182126456, "lr": 0.00031323829389340427, "grad_norm": 0.7798, "tokens_per_sec": 148830, "dt_s": 4.403, "eta_s": 3090, "world_size": 1, "timestamp": "2026-05-05T06:58:04.577961"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85600, "epoch": 0, "train_loss": 3.653176337480545, "train_ppl": 38.597068866271805, "lr": 0.0003128757822052794, "grad_norm": 0.8571, "tokens_per_sec": 135137, "dt_s": 4.85, "eta_s": 3154, "world_size": 1, "timestamp": "2026-05-05T06:58:09.427571"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85610, "epoch": 0, "train_loss": 3.5907210260629654, "train_ppl": 36.26021106051107, "lr": 0.0003125132705171545, "grad_norm": 0.8239, "tokens_per_sec": 150102, "dt_s": 4.366, "eta_s": 3150, "world_size": 1, "timestamp": "2026-05-05T06:58:13.793691"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85620, "epoch": 0, "train_loss": 3.5625077337026596, "train_ppl": 35.25148773805882, "lr": 0.00031215075882902964, "grad_norm": 0.9052, "tokens_per_sec": 146602, "dt_s": 4.47, "eta_s": 3164, "world_size": 1, "timestamp": "2026-05-05T06:58:18.263986"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85630, "epoch": 0, "train_loss": 3.5432801097631454, "train_ppl": 34.58016008875724, "lr": 0.0003117882471409048, "grad_norm": 0.8313, "tokens_per_sec": 149863, "dt_s": 4.373, "eta_s": 3169, "world_size": 1, "timestamp": "2026-05-05T06:58:22.637060"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85640, "epoch": 0, "train_loss": 3.62388613820076, "train_ppl": 37.48294907354484, "lr": 0.00031142573545277994, "grad_norm": 0.804, "tokens_per_sec": 150323, "dt_s": 4.36, "eta_s": 3159, "world_size": 1, "timestamp": "2026-05-05T06:58:26.996731"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85650, "epoch": 0, "train_loss": 3.693189710378647, "train_ppl": 40.17278235119401, "lr": 0.00031106322376465507, "grad_norm": 0.8812, "tokens_per_sec": 148375, "dt_s": 4.417, "eta_s": 3093, "world_size": 1, "timestamp": "2026-05-05T06:58:31.413643"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85660, "epoch": 0, "train_loss": 3.8091296553611755, "train_ppl": 45.111159519674736, "lr": 0.0003107007120765302, "grad_norm": 0.8698, "tokens_per_sec": 151918, "dt_s": 4.314, "eta_s": 3082, "world_size": 1, "timestamp": "2026-05-05T06:58:35.727534"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85670, "epoch": 0, "train_loss": 3.6135987490415573, "train_ppl": 37.09932402294339, "lr": 0.0003103382003884053, "grad_norm": 0.8124, "tokens_per_sec": 150365, "dt_s": 4.358, "eta_s": 3061, "world_size": 1, "timestamp": "2026-05-05T06:58:40.086055"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85680, "epoch": 0, "train_loss": 3.6261172592639923, "train_ppl": 37.56667143336499, "lr": 0.00030997568870028044, "grad_norm": 0.8708, "tokens_per_sec": 149222, "dt_s": 4.392, "eta_s": 3060, "world_size": 1, "timestamp": "2026-05-05T06:58:44.477863"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85690, "epoch": 0, "train_loss": 3.7834967523813248, "train_ppl": 43.96952377180198, "lr": 0.0003096131770121556, "grad_norm": 0.841, "tokens_per_sec": 149062, "dt_s": 4.397, "eta_s": 3061, "world_size": 1, "timestamp": "2026-05-05T06:58:48.874412"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85700, "epoch": 0, "train_loss": 3.6526898443698883, "train_ppl": 38.57829622492741, "lr": 0.00030925066532403074, "grad_norm": 0.7852, "tokens_per_sec": 147543, "dt_s": 4.442, "eta_s": 3060, "world_size": 1, "timestamp": "2026-05-05T06:58:53.316260"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85710, "epoch": 0, "train_loss": 3.581068217754364, "train_ppl": 35.91188207505183, "lr": 0.00030888815363590586, "grad_norm": 0.8529, "tokens_per_sec": 150988, "dt_s": 4.34, "eta_s": 3059, "world_size": 1, "timestamp": "2026-05-05T06:58:57.656715"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85720, "epoch": 0, "train_loss": 3.588342398405075, "train_ppl": 36.174064016090234, "lr": 0.000308525641947781, "grad_norm": 0.8137, "tokens_per_sec": 150717, "dt_s": 4.348, "eta_s": 3053, "world_size": 1, "timestamp": "2026-05-05T06:59:02.005024"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85730, "epoch": 0, "train_loss": 3.5729664713144302, "train_ppl": 35.62210853319788, "lr": 0.00030816313025965617, "grad_norm": 0.877, "tokens_per_sec": 147989, "dt_s": 4.428, "eta_s": 3054, "world_size": 1, "timestamp": "2026-05-05T06:59:06.433447"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85740, "epoch": 0, "train_loss": 3.569048374891281, "train_ppl": 35.48281074645115, "lr": 0.0003078006185715313, "grad_norm": 0.8624, "tokens_per_sec": 149554, "dt_s": 4.382, "eta_s": 3047, "world_size": 1, "timestamp": "2026-05-05T06:59:10.815539"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85750, "epoch": 0, "train_loss": 3.6533937752246857, "train_ppl": 38.6054622383414, "lr": 0.0003074381068834064, "grad_norm": 0.8076, "tokens_per_sec": 150329, "dt_s": 4.36, "eta_s": 3032, "world_size": 1, "timestamp": "2026-05-05T06:59:15.175073"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85760, "epoch": 0, "train_loss": 3.630370944738388, "train_ppl": 37.726808583038306, "lr": 0.00030707559519528154, "grad_norm": 0.7742, "tokens_per_sec": 150016, "dt_s": 4.369, "eta_s": 3031, "world_size": 1, "timestamp": "2026-05-05T06:59:19.543653"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85770, "epoch": 0, "train_loss": 3.5495531409978867, "train_ppl": 34.79776431858486, "lr": 0.0003067130835071567, "grad_norm": 0.8457, "tokens_per_sec": 150016, "dt_s": 4.369, "eta_s": 3030, "world_size": 1, "timestamp": "2026-05-05T06:59:23.912272"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85780, "epoch": 0, "train_loss": 3.6492663472890854, "train_ppl": 38.44644935787532, "lr": 0.0003063505718190318, "grad_norm": 0.9138, "tokens_per_sec": 150766, "dt_s": 4.347, "eta_s": 3014, "world_size": 1, "timestamp": "2026-05-05T06:59:28.259127"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85790, "epoch": 0, "train_loss": 3.5737514048814774, "train_ppl": 35.65008049854568, "lr": 0.0003059880601309069, "grad_norm": 0.8683, "tokens_per_sec": 152902, "dt_s": 4.286, "eta_s": 2996, "world_size": 1, "timestamp": "2026-05-05T06:59:32.545283"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85800, "epoch": 0, "train_loss": 3.6815480142831802, "train_ppl": 39.70781478467789, "lr": 0.0003056255484427821, "grad_norm": 0.8618, "tokens_per_sec": 152136, "dt_s": 4.308, "eta_s": 2985, "world_size": 1, "timestamp": "2026-05-05T06:59:36.853018"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85810, "epoch": 0, "train_loss": 3.7274891287088394, "train_ppl": 41.57458856044681, "lr": 0.0003052630367546572, "grad_norm": 0.9059, "tokens_per_sec": 150035, "dt_s": 4.368, "eta_s": 2981, "world_size": 1, "timestamp": "2026-05-05T06:59:41.221057"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85820, "epoch": 0, "train_loss": 3.593633398413658, "train_ppl": 36.36596822400149, "lr": 0.00030490052506653234, "grad_norm": 0.8041, "tokens_per_sec": 152146, "dt_s": 4.307, "eta_s": 2968, "world_size": 1, "timestamp": "2026-05-05T06:59:45.528498"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85830, "epoch": 0, "train_loss": 3.5866325050592422, "train_ppl": 36.11226307630271, "lr": 0.0003045380133784075, "grad_norm": 0.7864, "tokens_per_sec": 151028, "dt_s": 4.339, "eta_s": 2962, "world_size": 1, "timestamp": "2026-05-05T06:59:49.867831"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85840, "epoch": 0, "train_loss": 3.6118717342615128, "train_ppl": 37.03530823604219, "lr": 0.00030417550169028264, "grad_norm": 0.872, "tokens_per_sec": 149671, "dt_s": 4.379, "eta_s": 2971, "world_size": 1, "timestamp": "2026-05-05T06:59:54.246484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85850, "epoch": 0, "train_loss": 3.698456421494484, "train_ppl": 40.38491893124143, "lr": 0.00030381299000215776, "grad_norm": 0.9522, "tokens_per_sec": 150926, "dt_s": 4.342, "eta_s": 2971, "world_size": 1, "timestamp": "2026-05-05T06:59:58.588750"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85860, "epoch": 0, "train_loss": 3.62904092669487, "train_ppl": 37.67666460049113, "lr": 0.0003034504783140329, "grad_norm": 0.8043, "tokens_per_sec": 151083, "dt_s": 4.338, "eta_s": 2963, "world_size": 1, "timestamp": "2026-05-05T07:00:02.926505"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85870, "epoch": 0, "train_loss": 3.596271798014641, "train_ppl": 36.462042865969046, "lr": 0.000303087966625908, "grad_norm": 0.844, "tokens_per_sec": 148758, "dt_s": 4.406, "eta_s": 2972, "world_size": 1, "timestamp": "2026-05-05T07:00:07.332047"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85880, "epoch": 0, "train_loss": 3.529059886932373, "train_ppl": 34.091902301751986, "lr": 0.0003027254549377832, "grad_norm": 0.8438, "tokens_per_sec": 151680, "dt_s": 4.321, "eta_s": 2965, "world_size": 1, "timestamp": "2026-05-05T07:00:11.652719"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85890, "epoch": 0, "train_loss": 3.591368183493614, "train_ppl": 36.28368472029319, "lr": 0.0003023629432496583, "grad_norm": 0.8259, "tokens_per_sec": 132652, "dt_s": 4.94, "eta_s": 3037, "world_size": 1, "timestamp": "2026-05-05T07:00:16.593148"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85900, "epoch": 0, "train_loss": 3.6326584815979004, "train_ppl": 37.813208832464944, "lr": 0.00030200043156153344, "grad_norm": 0.8189, "tokens_per_sec": 148505, "dt_s": 4.413, "eta_s": 3042, "world_size": 1, "timestamp": "2026-05-05T07:00:21.006208"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85910, "epoch": 0, "train_loss": 3.640152081847191, "train_ppl": 38.09763024282436, "lr": 0.00030163791987340856, "grad_norm": 0.9168, "tokens_per_sec": 151130, "dt_s": 4.336, "eta_s": 3037, "world_size": 1, "timestamp": "2026-05-05T07:00:25.342598"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85920, "epoch": 0, "train_loss": 3.521145850419998, "train_ppl": 33.823162552892285, "lr": 0.0003012754081852837, "grad_norm": 0.7832, "tokens_per_sec": 146406, "dt_s": 4.476, "eta_s": 3042, "world_size": 1, "timestamp": "2026-05-05T07:00:29.818932"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85930, "epoch": 0, "train_loss": 3.6340412497520447, "train_ppl": 37.86553190044353, "lr": 0.0003009128964971588, "grad_norm": 0.8101, "tokens_per_sec": 149400, "dt_s": 4.387, "eta_s": 3047, "world_size": 1, "timestamp": "2026-05-05T07:00:34.205555"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85940, "epoch": 0, "train_loss": 3.6752723455429077, "train_ppl": 39.45940198619686, "lr": 0.000300550384809034, "grad_norm": 0.8455, "tokens_per_sec": 151507, "dt_s": 4.326, "eta_s": 2959, "world_size": 1, "timestamp": "2026-05-05T07:00:38.531138"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85950, "epoch": 0, "train_loss": 3.6864888668060303, "train_ppl": 39.90449071481179, "lr": 0.0003001878731209091, "grad_norm": 0.939, "tokens_per_sec": 147814, "dt_s": 4.434, "eta_s": 2958, "world_size": 1, "timestamp": "2026-05-05T07:00:42.964844"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85960, "epoch": 0, "train_loss": 3.628562793135643, "train_ppl": 37.65865442872278, "lr": 0.00029982536143278424, "grad_norm": 0.8691, "tokens_per_sec": 148031, "dt_s": 4.427, "eta_s": 2966, "world_size": 1, "timestamp": "2026-05-05T07:00:47.392003"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85970, "epoch": 0, "train_loss": 3.609147757291794, "train_ppl": 36.934562186595066, "lr": 0.0002994628497446594, "grad_norm": 0.8654, "tokens_per_sec": 150084, "dt_s": 4.367, "eta_s": 2946, "world_size": 1, "timestamp": "2026-05-05T07:00:51.758636"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85980, "epoch": 0, "train_loss": 3.650299549102783, "train_ppl": 38.486192827058815, "lr": 0.0002991003380565345, "grad_norm": 0.951, "tokens_per_sec": 148662, "dt_s": 4.408, "eta_s": 2945, "world_size": 1, "timestamp": "2026-05-05T07:00:56.167025"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 85990, "epoch": 0, "train_loss": 3.7705047875642776, "train_ppl": 43.401968080896076, "lr": 0.0002987378263684096, "grad_norm": 0.8943, "tokens_per_sec": 149278, "dt_s": 4.39, "eta_s": 2949, "world_size": 1, "timestamp": "2026-05-05T07:01:00.557237"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86000, "epoch": 0, "train_loss": 3.547435998916626, "train_ppl": 34.72417043925071, "lr": 0.0002983753146802848, "grad_norm": 0.9135, "tokens_per_sec": 150091, "dt_s": 4.366, "eta_s": 2936, "world_size": 1, "timestamp": "2026-05-05T07:01:04.923633"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86010, "epoch": 0, "train_loss": 3.6681614816188812, "train_ppl": 39.17980680791173, "lr": 0.0002980128029921599, "grad_norm": 0.8201, "tokens_per_sec": 128527, "dt_s": 5.099, "eta_s": 2919, "world_size": 1, "timestamp": "2026-05-05T07:01:10.022627"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86020, "epoch": 0, "train_loss": 3.6024189442396164, "train_ppl": 36.686870691917, "lr": 0.00029765029130403504, "grad_norm": 0.8154, "tokens_per_sec": 150662, "dt_s": 4.35, "eta_s": 2912, "world_size": 1, "timestamp": "2026-05-05T07:01:14.372492"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86030, "epoch": 0, "train_loss": 3.6096162497997284, "train_ppl": 36.95186980619182, "lr": 0.0002972877796159102, "grad_norm": 0.9317, "tokens_per_sec": 144509, "dt_s": 4.535, "eta_s": 2925, "world_size": 1, "timestamp": "2026-05-05T07:01:18.907565"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86040, "epoch": 0, "train_loss": 3.623555362224579, "train_ppl": 37.47055266480502, "lr": 0.00029692526792778534, "grad_norm": 0.8605, "tokens_per_sec": 150127, "dt_s": 4.365, "eta_s": 2917, "world_size": 1, "timestamp": "2026-05-05T07:01:23.272950"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86050, "epoch": 0, "train_loss": 3.663074314594269, "train_ppl": 38.980998700462564, "lr": 0.00029656275623966046, "grad_norm": 0.9118, "tokens_per_sec": 149869, "dt_s": 4.373, "eta_s": 2914, "world_size": 1, "timestamp": "2026-05-05T07:01:27.645847"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86060, "epoch": 0, "train_loss": 3.6594032496213913, "train_ppl": 38.83815926836068, "lr": 0.0002962002445515356, "grad_norm": 0.96, "tokens_per_sec": 148342, "dt_s": 4.418, "eta_s": 2920, "world_size": 1, "timestamp": "2026-05-05T07:01:32.063762"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86070, "epoch": 0, "train_loss": 3.556772395968437, "train_ppl": 35.04988722638015, "lr": 0.0002958377328634107, "grad_norm": 0.8486, "tokens_per_sec": 150446, "dt_s": 4.356, "eta_s": 2917, "world_size": 1, "timestamp": "2026-05-05T07:01:36.419865"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86080, "epoch": 0, "train_loss": 3.562633976340294, "train_ppl": 35.25593825976829, "lr": 0.0002954752211752859, "grad_norm": 0.7946, "tokens_per_sec": 148638, "dt_s": 4.409, "eta_s": 2896, "world_size": 1, "timestamp": "2026-05-05T07:01:40.828971"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86090, "epoch": 0, "train_loss": 3.7014382034540176, "train_ppl": 40.505517664157075, "lr": 0.000295112709487161, "grad_norm": 0.9342, "tokens_per_sec": 148477, "dt_s": 4.414, "eta_s": 2898, "world_size": 1, "timestamp": "2026-05-05T07:01:45.242840"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86100, "epoch": 0, "train_loss": 3.5325950235128403, "train_ppl": 34.21263511042577, "lr": 0.00029475019779903614, "grad_norm": 0.909, "tokens_per_sec": 150742, "dt_s": 4.348, "eta_s": 2890, "world_size": 1, "timestamp": "2026-05-05T07:01:49.590421"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86110, "epoch": 0, "train_loss": 3.5367369204759598, "train_ppl": 34.35463418966436, "lr": 0.00029438768611091126, "grad_norm": 0.8492, "tokens_per_sec": 148309, "dt_s": 4.419, "eta_s": 2886, "world_size": 1, "timestamp": "2026-05-05T07:01:54.009293"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86120, "epoch": 0, "train_loss": 3.607493132352829, "train_ppl": 36.87349987043824, "lr": 0.0002940251744227864, "grad_norm": 0.8776, "tokens_per_sec": 150014, "dt_s": 4.369, "eta_s": 2883, "world_size": 1, "timestamp": "2026-05-05T07:01:58.377969"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86130, "epoch": 0, "train_loss": 3.6196208000183105, "train_ppl": 37.323412101113384, "lr": 0.0002936626627346615, "grad_norm": 0.8189, "tokens_per_sec": 150139, "dt_s": 4.365, "eta_s": 2873, "world_size": 1, "timestamp": "2026-05-05T07:02:02.742989"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86140, "epoch": 0, "train_loss": 3.6452003121376038, "train_ppl": 38.29044212371978, "lr": 0.0002933001510465367, "grad_norm": 0.8394, "tokens_per_sec": 145686, "dt_s": 4.498, "eta_s": 2880, "world_size": 1, "timestamp": "2026-05-05T07:02:07.241445"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86150, "epoch": 0, "train_loss": 3.6226529479026794, "train_ppl": 37.43675395394485, "lr": 0.0002929376393584118, "grad_norm": 0.839, "tokens_per_sec": 147980, "dt_s": 4.429, "eta_s": 2886, "world_size": 1, "timestamp": "2026-05-05T07:02:11.670163"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86160, "epoch": 0, "train_loss": 3.6241392493247986, "train_ppl": 37.49243762569552, "lr": 0.00029257512767028694, "grad_norm": 0.8457, "tokens_per_sec": 149984, "dt_s": 4.37, "eta_s": 2875, "world_size": 1, "timestamp": "2026-05-05T07:02:16.039695"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86170, "epoch": 0, "train_loss": 3.56302972137928, "train_ppl": 35.26989338358278, "lr": 0.0002922126159821621, "grad_norm": 0.9173, "tokens_per_sec": 148742, "dt_s": 4.406, "eta_s": 2875, "world_size": 1, "timestamp": "2026-05-05T07:02:20.445743"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86180, "epoch": 0, "train_loss": 3.544568032026291, "train_ppl": 34.62472533892742, "lr": 0.0002918501042940372, "grad_norm": 0.888, "tokens_per_sec": 150113, "dt_s": 4.366, "eta_s": 2871, "world_size": 1, "timestamp": "2026-05-05T07:02:24.811511"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86190, "epoch": 0, "train_loss": 3.6604441553354263, "train_ppl": 38.878607177843705, "lr": 0.0002914875926059123, "grad_norm": 0.9142, "tokens_per_sec": 132374, "dt_s": 4.951, "eta_s": 2925, "world_size": 1, "timestamp": "2026-05-05T07:02:29.762381"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86200, "epoch": 0, "train_loss": 3.6672788560390472, "train_ppl": 39.14524096480275, "lr": 0.0002911250809177875, "grad_norm": 0.837, "tokens_per_sec": 146598, "dt_s": 4.47, "eta_s": 2926, "world_size": 1, "timestamp": "2026-05-05T07:02:34.232799"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86210, "epoch": 0, "train_loss": 3.588275730609894, "train_ppl": 36.17165245138728, "lr": 0.0002907625692296626, "grad_norm": 0.8569, "tokens_per_sec": 150593, "dt_s": 4.352, "eta_s": 2919, "world_size": 1, "timestamp": "2026-05-05T07:02:38.584657"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86220, "epoch": 0, "train_loss": 3.6496018916368484, "train_ppl": 38.45935201123406, "lr": 0.00029040005754153774, "grad_norm": 0.9232, "tokens_per_sec": 147673, "dt_s": 4.438, "eta_s": 2919, "world_size": 1, "timestamp": "2026-05-05T07:02:43.022564"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86230, "epoch": 0, "train_loss": 3.673792749643326, "train_ppl": 39.40106118786445, "lr": 0.0002900375458534129, "grad_norm": 0.9079, "tokens_per_sec": 150211, "dt_s": 4.363, "eta_s": 2914, "world_size": 1, "timestamp": "2026-05-05T07:02:47.385496"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86240, "epoch": 0, "train_loss": 3.614168033003807, "train_ppl": 37.120450085913824, "lr": 0.00028967503416528804, "grad_norm": 0.8552, "tokens_per_sec": 149894, "dt_s": 4.372, "eta_s": 2835, "world_size": 1, "timestamp": "2026-05-05T07:02:51.757643"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86250, "epoch": 0, "train_loss": 3.687677577137947, "train_ppl": 39.95195379955508, "lr": 0.0002893125224771631, "grad_norm": 0.8874, "tokens_per_sec": 147730, "dt_s": 4.436, "eta_s": 2826, "world_size": 1, "timestamp": "2026-05-05T07:02:56.193862"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86260, "epoch": 0, "train_loss": 3.620815232396126, "train_ppl": 37.36801902765289, "lr": 0.0002889500107890383, "grad_norm": 0.8907, "tokens_per_sec": 148880, "dt_s": 4.402, "eta_s": 2828, "world_size": 1, "timestamp": "2026-05-05T07:03:00.595813"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86270, "epoch": 0, "train_loss": 3.632934659719467, "train_ppl": 37.82365345567253, "lr": 0.0002885874991009134, "grad_norm": 0.9416, "tokens_per_sec": 149352, "dt_s": 4.388, "eta_s": 2818, "world_size": 1, "timestamp": "2026-05-05T07:03:04.983811"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86280, "epoch": 0, "train_loss": 3.666774272918701, "train_ppl": 39.1254939194019, "lr": 0.0002882249874127886, "grad_norm": 0.9035, "tokens_per_sec": 149875, "dt_s": 4.373, "eta_s": 2814, "world_size": 1, "timestamp": "2026-05-05T07:03:09.356536"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86290, "epoch": 0, "train_loss": 3.707087129354477, "train_ppl": 40.73497782354094, "lr": 0.0002878624757246637, "grad_norm": 0.9384, "tokens_per_sec": 149095, "dt_s": 4.396, "eta_s": 2813, "world_size": 1, "timestamp": "2026-05-05T07:03:13.752095"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86300, "epoch": 0, "train_loss": 3.74075448513031, "train_ppl": 42.12976445770073, "lr": 0.00028749996403653884, "grad_norm": 0.8287, "tokens_per_sec": 147293, "dt_s": 4.449, "eta_s": 2810, "world_size": 1, "timestamp": "2026-05-05T07:03:18.201487"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86310, "epoch": 0, "train_loss": 3.6495430916547775, "train_ppl": 38.45709066850945, "lr": 0.00028713745234841396, "grad_norm": 1.0179, "tokens_per_sec": 149349, "dt_s": 4.388, "eta_s": 2804, "world_size": 1, "timestamp": "2026-05-05T07:03:22.589569"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86320, "epoch": 0, "train_loss": 3.5413122922182083, "train_ppl": 34.51217955161149, "lr": 0.0002867749406602891, "grad_norm": 0.8909, "tokens_per_sec": 150559, "dt_s": 4.353, "eta_s": 2795, "world_size": 1, "timestamp": "2026-05-05T07:03:26.942437"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86330, "epoch": 0, "train_loss": 3.5735746026039124, "train_ppl": 35.64377804027907, "lr": 0.0002864124289721642, "grad_norm": 0.894, "tokens_per_sec": 147268, "dt_s": 4.45, "eta_s": 2801, "world_size": 1, "timestamp": "2026-05-05T07:03:31.392531"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86340, "epoch": 0, "train_loss": 3.6287782788276672, "train_ppl": 37.666770204318404, "lr": 0.0002860499172840394, "grad_norm": 0.9977, "tokens_per_sec": 149961, "dt_s": 4.37, "eta_s": 2793, "world_size": 1, "timestamp": "2026-05-05T07:03:35.762756"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86350, "epoch": 0, "train_loss": 3.6505075246095657, "train_ppl": 38.49419784491112, "lr": 0.0002856874055959145, "grad_norm": 0.9351, "tokens_per_sec": 150076, "dt_s": 4.367, "eta_s": 2778, "world_size": 1, "timestamp": "2026-05-05T07:03:40.129606"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86360, "epoch": 0, "train_loss": 3.6645212322473526, "train_ppl": 39.03744182004286, "lr": 0.00028532489390778964, "grad_norm": 0.9127, "tokens_per_sec": 148012, "dt_s": 4.428, "eta_s": 2779, "world_size": 1, "timestamp": "2026-05-05T07:03:44.557354"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86370, "epoch": 0, "train_loss": 3.6247416138648987, "train_ppl": 37.51502854394411, "lr": 0.0002849623822196648, "grad_norm": 0.9765, "tokens_per_sec": 149630, "dt_s": 4.38, "eta_s": 2778, "world_size": 1, "timestamp": "2026-05-05T07:03:48.937242"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86380, "epoch": 0, "train_loss": 3.6647495925426483, "train_ppl": 39.046357439732496, "lr": 0.0002845998705315399, "grad_norm": 0.972, "tokens_per_sec": 148851, "dt_s": 4.403, "eta_s": 2767, "world_size": 1, "timestamp": "2026-05-05T07:03:53.340041"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86390, "epoch": 0, "train_loss": 3.6648569852113724, "train_ppl": 39.0505509574344, "lr": 0.000284237358843415, "grad_norm": 0.9462, "tokens_per_sec": 146649, "dt_s": 4.469, "eta_s": 2776, "world_size": 1, "timestamp": "2026-05-05T07:03:57.808929"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86400, "epoch": 0, "train_loss": 3.4401884377002716, "train_ppl": 31.19283552072516, "lr": 0.0002838748471552902, "grad_norm": 1.242, "tokens_per_sec": 148623, "dt_s": 4.41, "eta_s": 2776, "world_size": 1, "timestamp": "2026-05-05T07:04:02.218497"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86410, "epoch": 0, "train_loss": 3.6105593889951706, "train_ppl": 36.98673700267617, "lr": 0.0002835123354671653, "grad_norm": 0.937, "tokens_per_sec": 148568, "dt_s": 4.411, "eta_s": 2770, "world_size": 1, "timestamp": "2026-05-05T07:04:06.629663"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86420, "epoch": 0, "train_loss": 3.7654504626989365, "train_ppl": 43.183153879272545, "lr": 0.00028314982377904043, "grad_norm": 1.0233, "tokens_per_sec": 149589, "dt_s": 4.381, "eta_s": 2766, "world_size": 1, "timestamp": "2026-05-05T07:04:11.010737"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86430, "epoch": 0, "train_loss": 3.4343462884426117, "train_ppl": 31.011133601510643, "lr": 0.0002827873120909156, "grad_norm": 0.8451, "tokens_per_sec": 151612, "dt_s": 4.323, "eta_s": 2751, "world_size": 1, "timestamp": "2026-05-05T07:04:15.333358"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86440, "epoch": 0, "train_loss": 3.591160371899605, "train_ppl": 36.276145333348076, "lr": 0.00028242480040279074, "grad_norm": 0.911, "tokens_per_sec": 145365, "dt_s": 4.508, "eta_s": 2752, "world_size": 1, "timestamp": "2026-05-05T07:04:19.841701"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86450, "epoch": 0, "train_loss": 3.6979218274354935, "train_ppl": 40.36333516329907, "lr": 0.0002820622887146658, "grad_norm": 0.8376, "tokens_per_sec": 150803, "dt_s": 4.346, "eta_s": 2739, "world_size": 1, "timestamp": "2026-05-05T07:04:24.187489"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86460, "epoch": 0, "train_loss": 3.6051055639982224, "train_ppl": 36.785566883803796, "lr": 0.000281699777026541, "grad_norm": 0.9369, "tokens_per_sec": 150932, "dt_s": 4.342, "eta_s": 2726, "world_size": 1, "timestamp": "2026-05-05T07:04:28.529580"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86470, "epoch": 0, "train_loss": 3.615912303328514, "train_ppl": 37.18525468737952, "lr": 0.0002813372653384161, "grad_norm": 0.8819, "tokens_per_sec": 147249, "dt_s": 4.451, "eta_s": 2731, "world_size": 1, "timestamp": "2026-05-05T07:04:32.980312"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86480, "epoch": 0, "train_loss": 3.6042693108320236, "train_ppl": 36.75481769587164, "lr": 0.0002809747536502913, "grad_norm": 0.9237, "tokens_per_sec": 134309, "dt_s": 4.879, "eta_s": 2795, "world_size": 1, "timestamp": "2026-05-05T07:04:37.859788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86490, "epoch": 0, "train_loss": 3.7221062779426575, "train_ppl": 41.35139998885599, "lr": 0.0002806122419621664, "grad_norm": 1.0576, "tokens_per_sec": 148462, "dt_s": 4.414, "eta_s": 2779, "world_size": 1, "timestamp": "2026-05-05T07:04:42.274070"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86500, "epoch": 0, "train_loss": 3.5148922950029373, "train_ppl": 33.612307515476004, "lr": 0.00028024973027404153, "grad_norm": 0.9529, "tokens_per_sec": 148526, "dt_s": 4.412, "eta_s": 2783, "world_size": 1, "timestamp": "2026-05-05T07:04:46.686498"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86510, "epoch": 0, "train_loss": 3.528364971280098, "train_ppl": 34.068219534942386, "lr": 0.00027988721858591666, "grad_norm": 0.8605, "tokens_per_sec": 127267, "dt_s": 5.149, "eta_s": 2783, "world_size": 1, "timestamp": "2026-05-05T07:04:51.835979"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86520, "epoch": 0, "train_loss": 3.654358297586441, "train_ppl": 38.642716033127144, "lr": 0.0002795247068977918, "grad_norm": 0.8564, "tokens_per_sec": 145352, "dt_s": 4.509, "eta_s": 2786, "world_size": 1, "timestamp": "2026-05-05T07:04:56.344749"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86530, "epoch": 0, "train_loss": 3.592895582318306, "train_ppl": 36.339146723206994, "lr": 0.0002791621952096669, "grad_norm": 0.9853, "tokens_per_sec": 147399, "dt_s": 4.446, "eta_s": 2728, "world_size": 1, "timestamp": "2026-05-05T07:05:00.790898"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86540, "epoch": 0, "train_loss": 3.722684621810913, "train_ppl": 41.37532223445856, "lr": 0.0002787996835215421, "grad_norm": 0.8581, "tokens_per_sec": 150136, "dt_s": 4.365, "eta_s": 2717, "world_size": 1, "timestamp": "2026-05-05T07:05:05.156016"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86550, "epoch": 0, "train_loss": 3.484782189130783, "train_ppl": 32.61532236442814, "lr": 0.0002784371718334172, "grad_norm": 0.8455, "tokens_per_sec": 146546, "dt_s": 4.472, "eta_s": 2720, "world_size": 1, "timestamp": "2026-05-05T07:05:09.628058"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86560, "epoch": 0, "train_loss": 3.5196970403194427, "train_ppl": 33.77419469448236, "lr": 0.00027807466014529233, "grad_norm": 0.8742, "tokens_per_sec": 150277, "dt_s": 4.361, "eta_s": 2714, "world_size": 1, "timestamp": "2026-05-05T07:05:13.989069"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86570, "epoch": 0, "train_loss": 3.6120884716510773, "train_ppl": 37.043336042002416, "lr": 0.0002777121484571675, "grad_norm": 0.8784, "tokens_per_sec": 148148, "dt_s": 4.424, "eta_s": 2699, "world_size": 1, "timestamp": "2026-05-05T07:05:18.412754"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86580, "epoch": 0, "train_loss": 3.5639276057481766, "train_ppl": 35.30157589103012, "lr": 0.0002773496367690426, "grad_norm": 0.9391, "tokens_per_sec": 149076, "dt_s": 4.396, "eta_s": 2688, "world_size": 1, "timestamp": "2026-05-05T07:05:22.808894"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86590, "epoch": 0, "train_loss": 3.6915714740753174, "train_ppl": 40.107825868032215, "lr": 0.0002769871250809177, "grad_norm": 0.9124, "tokens_per_sec": 149896, "dt_s": 4.372, "eta_s": 2685, "world_size": 1, "timestamp": "2026-05-05T07:05:27.180993"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86600, "epoch": 0, "train_loss": 3.734683036804199, "train_ppl": 41.87475070458638, "lr": 0.0002766246133927929, "grad_norm": 0.8916, "tokens_per_sec": 149740, "dt_s": 4.377, "eta_s": 2669, "world_size": 1, "timestamp": "2026-05-05T07:05:31.557630"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86610, "epoch": 0, "train_loss": 3.764851436018944, "train_ppl": 43.15729376419536, "lr": 0.000276262101704668, "grad_norm": 0.9276, "tokens_per_sec": 150192, "dt_s": 4.363, "eta_s": 2665, "world_size": 1, "timestamp": "2026-05-05T07:05:35.921092"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86620, "epoch": 0, "train_loss": 3.49733804166317, "train_ppl": 33.027417229648094, "lr": 0.00027589959001654313, "grad_norm": 0.9897, "tokens_per_sec": 151608, "dt_s": 4.323, "eta_s": 2648, "world_size": 1, "timestamp": "2026-05-05T07:05:40.243838"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86630, "epoch": 0, "train_loss": 3.5826929956674576, "train_ppl": 35.97027833549537, "lr": 0.0002755370783284183, "grad_norm": 0.9161, "tokens_per_sec": 148371, "dt_s": 4.417, "eta_s": 2646, "world_size": 1, "timestamp": "2026-05-05T07:05:44.660876"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86640, "epoch": 0, "train_loss": 3.5609349459409714, "train_ppl": 35.19608820683543, "lr": 0.00027517456664029343, "grad_norm": 1.3509, "tokens_per_sec": 150242, "dt_s": 4.362, "eta_s": 2641, "world_size": 1, "timestamp": "2026-05-05T07:05:49.022923"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86650, "epoch": 0, "train_loss": 3.617679074406624, "train_ppl": 37.2510105906027, "lr": 0.0002748120549521685, "grad_norm": 0.9262, "tokens_per_sec": 151299, "dt_s": 4.332, "eta_s": 2631, "world_size": 1, "timestamp": "2026-05-05T07:05:53.354467"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86660, "epoch": 0, "train_loss": 3.7296510338783264, "train_ppl": 41.66456610478906, "lr": 0.0002744495432640437, "grad_norm": 0.8517, "tokens_per_sec": 147798, "dt_s": 4.434, "eta_s": 2635, "world_size": 1, "timestamp": "2026-05-05T07:05:57.788628"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86670, "epoch": 0, "train_loss": 3.6608284562826157, "train_ppl": 38.89355113471203, "lr": 0.0002740870315759188, "grad_norm": 0.9926, "tokens_per_sec": 150438, "dt_s": 4.356, "eta_s": 2635, "world_size": 1, "timestamp": "2026-05-05T07:06:02.144971"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86680, "epoch": 0, "train_loss": 3.647650122642517, "train_ppl": 38.38436144637191, "lr": 0.00027372451988779393, "grad_norm": 0.9069, "tokens_per_sec": 151914, "dt_s": 4.314, "eta_s": 2618, "world_size": 1, "timestamp": "2026-05-05T07:06:06.459015"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86690, "epoch": 0, "train_loss": 3.441301316022873, "train_ppl": 31.227568674498855, "lr": 0.0002733620081996691, "grad_norm": 1.0046, "tokens_per_sec": 149478, "dt_s": 4.384, "eta_s": 2616, "world_size": 1, "timestamp": "2026-05-05T07:06:10.843335"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86700, "epoch": 0, "train_loss": 3.4992120265960693, "train_ppl": 33.08936814131653, "lr": 0.00027299949651154423, "grad_norm": 0.8968, "tokens_per_sec": 150823, "dt_s": 4.345, "eta_s": 2613, "world_size": 1, "timestamp": "2026-05-05T07:06:15.188555"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86710, "epoch": 0, "train_loss": 3.6365154832601547, "train_ppl": 37.95933606712578, "lr": 0.00027263698482341936, "grad_norm": 0.9089, "tokens_per_sec": 148751, "dt_s": 4.406, "eta_s": 2606, "world_size": 1, "timestamp": "2026-05-05T07:06:19.594344"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86720, "epoch": 0, "train_loss": 3.690368413925171, "train_ppl": 40.059602754490804, "lr": 0.0002722744731352945, "grad_norm": 0.9495, "tokens_per_sec": 150630, "dt_s": 4.351, "eta_s": 2601, "world_size": 1, "timestamp": "2026-05-05T07:06:23.945105"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86730, "epoch": 0, "train_loss": 3.543592855334282, "train_ppl": 34.59097657198964, "lr": 0.0002719119614471696, "grad_norm": 0.8855, "tokens_per_sec": 151266, "dt_s": 4.333, "eta_s": 2599, "world_size": 1, "timestamp": "2026-05-05T07:06:28.277607"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86740, "epoch": 0, "train_loss": 3.6558533161878586, "train_ppl": 38.70053081872753, "lr": 0.0002715494497590448, "grad_norm": 0.9237, "tokens_per_sec": 146189, "dt_s": 4.483, "eta_s": 2606, "world_size": 1, "timestamp": "2026-05-05T07:06:32.760599"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86750, "epoch": 0, "train_loss": 3.622072920203209, "train_ppl": 37.41504589591939, "lr": 0.0002711869380709199, "grad_norm": 0.9702, "tokens_per_sec": 149980, "dt_s": 4.37, "eta_s": 2604, "world_size": 1, "timestamp": "2026-05-05T07:06:37.130207"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86760, "epoch": 0, "train_loss": 3.6501997858285904, "train_ppl": 38.482353509965776, "lr": 0.00027082442638279503, "grad_norm": 1.0209, "tokens_per_sec": 148905, "dt_s": 4.401, "eta_s": 2599, "world_size": 1, "timestamp": "2026-05-05T07:06:41.531426"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86770, "epoch": 0, "train_loss": 3.676773965358734, "train_ppl": 39.5186995161707, "lr": 0.0002704619146946702, "grad_norm": 0.9538, "tokens_per_sec": 146800, "dt_s": 4.464, "eta_s": 2609, "world_size": 1, "timestamp": "2026-05-05T07:06:45.995728"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86780, "epoch": 0, "train_loss": 3.5970382392406464, "train_ppl": 36.48999959103067, "lr": 0.0002700994030065453, "grad_norm": 0.9079, "tokens_per_sec": 132189, "dt_s": 4.958, "eta_s": 2678, "world_size": 1, "timestamp": "2026-05-05T07:06:50.953483"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86790, "epoch": 0, "train_loss": 3.698378711938858, "train_ppl": 40.38178075907185, "lr": 0.0002697368913184204, "grad_norm": 0.9056, "tokens_per_sec": 148847, "dt_s": 4.403, "eta_s": 2664, "world_size": 1, "timestamp": "2026-05-05T07:06:55.356397"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86800, "epoch": 0, "train_loss": 3.52727735042572, "train_ppl": 34.03118637157566, "lr": 0.0002693743796302956, "grad_norm": 0.864, "tokens_per_sec": 147022, "dt_s": 4.458, "eta_s": 2670, "world_size": 1, "timestamp": "2026-05-05T07:06:59.813986"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86810, "epoch": 0, "train_loss": 3.647244170308113, "train_ppl": 38.36878238762973, "lr": 0.0002690118679421707, "grad_norm": 0.9683, "tokens_per_sec": 146495, "dt_s": 4.474, "eta_s": 2674, "world_size": 1, "timestamp": "2026-05-05T07:07:04.287590"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86820, "epoch": 0, "train_loss": 3.5998159497976303, "train_ppl": 36.591499151057256, "lr": 0.00026864935625404583, "grad_norm": 0.9396, "tokens_per_sec": 145717, "dt_s": 4.497, "eta_s": 2673, "world_size": 1, "timestamp": "2026-05-05T07:07:08.785084"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86830, "epoch": 0, "train_loss": 3.848767563700676, "train_ppl": 46.93518294884318, "lr": 0.000268286844565921, "grad_norm": 1.1211, "tokens_per_sec": 148448, "dt_s": 4.415, "eta_s": 2605, "world_size": 1, "timestamp": "2026-05-05T07:07:13.199830"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86840, "epoch": 0, "train_loss": 3.651009276509285, "train_ppl": 38.51351722817128, "lr": 0.00026792433287779613, "grad_norm": 0.955, "tokens_per_sec": 147279, "dt_s": 4.45, "eta_s": 2606, "world_size": 1, "timestamp": "2026-05-05T07:07:17.649646"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86850, "epoch": 0, "train_loss": 3.7271054685115814, "train_ppl": 41.55864110499636, "lr": 0.0002675618211896712, "grad_norm": 1.2388, "tokens_per_sec": 146741, "dt_s": 4.466, "eta_s": 2603, "world_size": 1, "timestamp": "2026-05-05T07:07:22.115732"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86860, "epoch": 0, "train_loss": 3.616186112165451, "train_ppl": 37.195437732756965, "lr": 0.0002671993095015464, "grad_norm": 0.8511, "tokens_per_sec": 148244, "dt_s": 4.421, "eta_s": 2592, "world_size": 1, "timestamp": "2026-05-05T07:07:26.536552"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86870, "epoch": 0, "train_loss": 3.6354440450668335, "train_ppl": 37.918686765166385, "lr": 0.0002668367978134215, "grad_norm": 0.8427, "tokens_per_sec": 147392, "dt_s": 4.446, "eta_s": 2582, "world_size": 1, "timestamp": "2026-05-05T07:07:30.982925"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86880, "epoch": 0, "train_loss": 3.610182464122772, "train_ppl": 36.97279840861743, "lr": 0.00026647428612529663, "grad_norm": 0.9134, "tokens_per_sec": 146928, "dt_s": 4.46, "eta_s": 2582, "world_size": 1, "timestamp": "2026-05-05T07:07:35.443371"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86890, "epoch": 0, "train_loss": 3.650124952197075, "train_ppl": 38.47947384345266, "lr": 0.0002661117744371718, "grad_norm": 0.8918, "tokens_per_sec": 147687, "dt_s": 4.437, "eta_s": 2577, "world_size": 1, "timestamp": "2026-05-05T07:07:39.880845"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86900, "epoch": 0, "train_loss": 3.8340814113616943, "train_ppl": 46.25092257660226, "lr": 0.00026574926274904693, "grad_norm": 0.9651, "tokens_per_sec": 147280, "dt_s": 4.45, "eta_s": 2570, "world_size": 1, "timestamp": "2026-05-05T07:07:44.330602"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86910, "epoch": 0, "train_loss": 3.5751873701810837, "train_ppl": 35.701309549836914, "lr": 0.00026538675106092206, "grad_norm": 0.9281, "tokens_per_sec": 146052, "dt_s": 4.487, "eta_s": 2573, "world_size": 1, "timestamp": "2026-05-05T07:07:48.817772"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86920, "epoch": 0, "train_loss": 3.7209962010383606, "train_ppl": 41.305522223403095, "lr": 0.0002650242393727972, "grad_norm": 1.1582, "tokens_per_sec": 147592, "dt_s": 4.44, "eta_s": 2568, "world_size": 1, "timestamp": "2026-05-05T07:07:53.258112"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86930, "epoch": 0, "train_loss": 3.6542123407125473, "train_ppl": 38.637076274686954, "lr": 0.0002646617276846723, "grad_norm": 1.0746, "tokens_per_sec": 145204, "dt_s": 4.513, "eta_s": 2570, "world_size": 1, "timestamp": "2026-05-05T07:07:57.771506"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86940, "epoch": 0, "train_loss": 3.4960329681634903, "train_ppl": 32.98434213685372, "lr": 0.0002642992159965475, "grad_norm": 0.9463, "tokens_per_sec": 148315, "dt_s": 4.419, "eta_s": 2563, "world_size": 1, "timestamp": "2026-05-05T07:08:02.190240"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86950, "epoch": 0, "train_loss": 3.6727058738470078, "train_ppl": 39.358260391898085, "lr": 0.0002639367043084226, "grad_norm": 0.9722, "tokens_per_sec": 149305, "dt_s": 4.389, "eta_s": 2552, "world_size": 1, "timestamp": "2026-05-05T07:08:06.579587"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86960, "epoch": 0, "train_loss": 3.7164986580610275, "train_ppl": 41.120165998243586, "lr": 0.00026357419262029773, "grad_norm": 0.9706, "tokens_per_sec": 144173, "dt_s": 4.546, "eta_s": 2554, "world_size": 1, "timestamp": "2026-05-05T07:08:11.125267"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86970, "epoch": 0, "train_loss": 3.6152627021074295, "train_ppl": 37.16110694457902, "lr": 0.0002632116809321729, "grad_norm": 0.903, "tokens_per_sec": 146793, "dt_s": 4.465, "eta_s": 2552, "world_size": 1, "timestamp": "2026-05-05T07:08:15.589769"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86980, "epoch": 0, "train_loss": 3.7082312256097794, "train_ppl": 40.78160922945058, "lr": 0.000262849169244048, "grad_norm": 0.9034, "tokens_per_sec": 147719, "dt_s": 4.437, "eta_s": 2539, "world_size": 1, "timestamp": "2026-05-05T07:08:20.026306"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 86990, "epoch": 0, "train_loss": 3.616606503725052, "train_ppl": 37.211077668053534, "lr": 0.0002624866575559231, "grad_norm": 0.9813, "tokens_per_sec": 147536, "dt_s": 4.442, "eta_s": 2537, "world_size": 1, "timestamp": "2026-05-05T07:08:24.468334"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87000, "epoch": 0, "train_loss": 3.566494509577751, "train_ppl": 35.39230804190813, "lr": 0.0002621241458677983, "grad_norm": 0.9715, "tokens_per_sec": 149348, "dt_s": 4.388, "eta_s": 2533, "world_size": 1, "timestamp": "2026-05-05T07:08:28.856506"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87010, "epoch": 0, "train_loss": 3.6526722759008408, "train_ppl": 38.577618469277866, "lr": 0.0002617616341796734, "grad_norm": 0.9206, "tokens_per_sec": 106980, "dt_s": 6.126, "eta_s": 2521, "world_size": 1, "timestamp": "2026-05-05T07:08:34.982478"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87020, "epoch": 0, "train_loss": 3.615863174200058, "train_ppl": 37.183427853101065, "lr": 0.00026139912249154853, "grad_norm": 0.9297, "tokens_per_sec": 144490, "dt_s": 4.536, "eta_s": 2525, "world_size": 1, "timestamp": "2026-05-05T07:08:39.518132"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87030, "epoch": 0, "train_loss": 3.7702466994524, "train_ppl": 43.39076799426908, "lr": 0.0002610366108034237, "grad_norm": 1.1283, "tokens_per_sec": 146559, "dt_s": 4.472, "eta_s": 2524, "world_size": 1, "timestamp": "2026-05-05T07:08:43.989777"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87040, "epoch": 0, "train_loss": 3.6287846863269806, "train_ppl": 37.66701155489585, "lr": 0.00026067409911529883, "grad_norm": 0.8743, "tokens_per_sec": 146613, "dt_s": 4.47, "eta_s": 2523, "world_size": 1, "timestamp": "2026-05-05T07:08:48.459784"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87050, "epoch": 0, "train_loss": 3.603497877717018, "train_ppl": 36.726474746115656, "lr": 0.0002603115874271739, "grad_norm": 0.9975, "tokens_per_sec": 148884, "dt_s": 4.402, "eta_s": 2520, "world_size": 1, "timestamp": "2026-05-05T07:08:52.861572"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87060, "epoch": 0, "train_loss": 3.649935260415077, "train_ppl": 38.4721752957481, "lr": 0.0002599490757390491, "grad_norm": 0.9764, "tokens_per_sec": 148402, "dt_s": 4.416, "eta_s": 2508, "world_size": 1, "timestamp": "2026-05-05T07:08:57.277694"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87070, "epoch": 0, "train_loss": 3.710585579276085, "train_ppl": 40.877736675257296, "lr": 0.0002595865640509242, "grad_norm": 0.9351, "tokens_per_sec": 133426, "dt_s": 4.912, "eta_s": 2546, "world_size": 1, "timestamp": "2026-05-05T07:09:02.189480"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87080, "epoch": 0, "train_loss": 3.510921448469162, "train_ppl": 33.47910284336795, "lr": 0.00025922405236279933, "grad_norm": 0.9253, "tokens_per_sec": 148911, "dt_s": 4.401, "eta_s": 2534, "world_size": 1, "timestamp": "2026-05-05T07:09:06.590502"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87090, "epoch": 0, "train_loss": 3.688696652650833, "train_ppl": 39.99268860976265, "lr": 0.0002588615406746745, "grad_norm": 0.9837, "tokens_per_sec": 148159, "dt_s": 4.423, "eta_s": 2524, "world_size": 1, "timestamp": "2026-05-05T07:09:11.013888"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87100, "epoch": 0, "train_loss": 3.6543397307395935, "train_ppl": 38.64199856639735, "lr": 0.00025849902898654963, "grad_norm": 0.9467, "tokens_per_sec": 150025, "dt_s": 4.368, "eta_s": 2516, "world_size": 1, "timestamp": "2026-05-05T07:09:15.382204"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87110, "epoch": 0, "train_loss": 3.5201588571071625, "train_ppl": 33.78979578672986, "lr": 0.0002581365172984247, "grad_norm": 0.9981, "tokens_per_sec": 149037, "dt_s": 4.397, "eta_s": 2509, "world_size": 1, "timestamp": "2026-05-05T07:09:19.779497"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87120, "epoch": 0, "train_loss": 3.5513522028923035, "train_ppl": 34.86042399780143, "lr": 0.0002577740056102999, "grad_norm": 1.0087, "tokens_per_sec": 148535, "dt_s": 4.412, "eta_s": 2449, "world_size": 1, "timestamp": "2026-05-05T07:09:24.191666"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87130, "epoch": 0, "train_loss": 3.6829029321670532, "train_ppl": 39.761652077378294, "lr": 0.000257411493922175, "grad_norm": 0.9707, "tokens_per_sec": 150877, "dt_s": 4.344, "eta_s": 2438, "world_size": 1, "timestamp": "2026-05-05T07:09:28.535338"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87140, "epoch": 0, "train_loss": 3.6183650493621826, "train_ppl": 37.276572617389824, "lr": 0.0002570489822340502, "grad_norm": 0.9607, "tokens_per_sec": 149314, "dt_s": 4.389, "eta_s": 2430, "world_size": 1, "timestamp": "2026-05-05T07:09:32.924473"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87150, "epoch": 0, "train_loss": 3.5499092042446136, "train_ppl": 34.81015672963711, "lr": 0.0002566864705459253, "grad_norm": 0.9718, "tokens_per_sec": 146976, "dt_s": 4.459, "eta_s": 2436, "world_size": 1, "timestamp": "2026-05-05T07:09:37.383438"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87160, "epoch": 0, "train_loss": 3.5767745971679688, "train_ppl": 35.758020626591325, "lr": 0.00025632395885780043, "grad_norm": 1.0617, "tokens_per_sec": 149658, "dt_s": 4.379, "eta_s": 2429, "world_size": 1, "timestamp": "2026-05-05T07:09:41.762462"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87170, "epoch": 0, "train_loss": 3.7021960616111755, "train_ppl": 40.53622673621933, "lr": 0.0002559614471696756, "grad_norm": 1.1623, "tokens_per_sec": 148288, "dt_s": 4.419, "eta_s": 2426, "world_size": 1, "timestamp": "2026-05-05T07:09:46.181986"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87180, "epoch": 0, "train_loss": 3.6085617393255234, "train_ppl": 36.91292421031805, "lr": 0.0002555989354815507, "grad_norm": 0.9907, "tokens_per_sec": 148662, "dt_s": 4.408, "eta_s": 2428, "world_size": 1, "timestamp": "2026-05-05T07:09:50.590393"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87190, "epoch": 0, "train_loss": 3.5506486147642136, "train_ppl": 34.83590524389996, "lr": 0.0002552364237934258, "grad_norm": 0.9595, "tokens_per_sec": 151327, "dt_s": 4.331, "eta_s": 2417, "world_size": 1, "timestamp": "2026-05-05T07:09:54.921100"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87200, "epoch": 0, "train_loss": 3.6293223202228546, "train_ppl": 37.687268061868494, "lr": 0.000254873912105301, "grad_norm": 0.9364, "tokens_per_sec": 148673, "dt_s": 4.408, "eta_s": 2407, "world_size": 1, "timestamp": "2026-05-05T07:09:59.329162"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87210, "epoch": 0, "train_loss": 3.588136225938797, "train_ppl": 36.16660668887093, "lr": 0.0002545114004171761, "grad_norm": 0.9148, "tokens_per_sec": 150323, "dt_s": 4.36, "eta_s": 2401, "world_size": 1, "timestamp": "2026-05-05T07:10:03.688836"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87220, "epoch": 0, "train_loss": 3.549112692475319, "train_ppl": 34.782441069501004, "lr": 0.00025414888872905123, "grad_norm": 1.1334, "tokens_per_sec": 150515, "dt_s": 4.354, "eta_s": 2389, "world_size": 1, "timestamp": "2026-05-05T07:10:08.042936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87230, "epoch": 0, "train_loss": 3.678678646683693, "train_ppl": 39.594041773844104, "lr": 0.0002537863770409264, "grad_norm": 0.9697, "tokens_per_sec": 147898, "dt_s": 4.431, "eta_s": 2387, "world_size": 1, "timestamp": "2026-05-05T07:10:12.474112"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87240, "epoch": 0, "train_loss": 3.6439605206251144, "train_ppl": 38.242999374201375, "lr": 0.00025342386535280153, "grad_norm": 0.9132, "tokens_per_sec": 149169, "dt_s": 4.393, "eta_s": 2390, "world_size": 1, "timestamp": "2026-05-05T07:10:16.867512"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87250, "epoch": 0, "train_loss": 3.6139313876628876, "train_ppl": 37.11166674365766, "lr": 0.0002530613536646766, "grad_norm": 0.8904, "tokens_per_sec": 146282, "dt_s": 4.48, "eta_s": 2393, "world_size": 1, "timestamp": "2026-05-05T07:10:21.347620"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87260, "epoch": 0, "train_loss": 3.6493489295244217, "train_ppl": 38.44962448270669, "lr": 0.0002526988419765518, "grad_norm": 0.9341, "tokens_per_sec": 147643, "dt_s": 4.439, "eta_s": 2398, "world_size": 1, "timestamp": "2026-05-05T07:10:25.786444"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87270, "epoch": 0, "train_loss": 3.551148936152458, "train_ppl": 34.85333875318746, "lr": 0.0002523363302884269, "grad_norm": 0.9223, "tokens_per_sec": 149126, "dt_s": 4.395, "eta_s": 2398, "world_size": 1, "timestamp": "2026-05-05T07:10:30.181122"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87280, "epoch": 0, "train_loss": 3.5929696559906006, "train_ppl": 36.34183859695009, "lr": 0.000251973818600302, "grad_norm": 1.055, "tokens_per_sec": 149507, "dt_s": 4.383, "eta_s": 2388, "world_size": 1, "timestamp": "2026-05-05T07:10:34.564625"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87290, "epoch": 0, "train_loss": 3.626916855573654, "train_ppl": 37.596721617616545, "lr": 0.0002516113069121772, "grad_norm": 0.898, "tokens_per_sec": 148787, "dt_s": 4.405, "eta_s": 2385, "world_size": 1, "timestamp": "2026-05-05T07:10:38.969293"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87300, "epoch": 0, "train_loss": 3.53238545358181, "train_ppl": 34.20546592209471, "lr": 0.00025124879522405233, "grad_norm": 0.979, "tokens_per_sec": 148401, "dt_s": 4.416, "eta_s": 2373, "world_size": 1, "timestamp": "2026-05-05T07:10:43.385424"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87310, "epoch": 0, "train_loss": 3.6391297727823257, "train_ppl": 38.05870259151297, "lr": 0.0002508862835359274, "grad_norm": 0.8619, "tokens_per_sec": 146778, "dt_s": 4.465, "eta_s": 2372, "world_size": 1, "timestamp": "2026-05-05T07:10:47.850422"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87320, "epoch": 0, "train_loss": 3.6012426912784576, "train_ppl": 36.64374302112773, "lr": 0.0002505237718478026, "grad_norm": 1.0597, "tokens_per_sec": 149980, "dt_s": 4.37, "eta_s": 2365, "world_size": 1, "timestamp": "2026-05-05T07:10:52.220059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87330, "epoch": 0, "train_loss": 3.589424341917038, "train_ppl": 36.21322349030329, "lr": 0.0002501612601596777, "grad_norm": 0.9286, "tokens_per_sec": 148584, "dt_s": 4.411, "eta_s": 2363, "world_size": 1, "timestamp": "2026-05-05T07:10:56.630746"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87340, "epoch": 0, "train_loss": 3.682443082332611, "train_ppl": 39.74337189164569, "lr": 0.0002497987484715529, "grad_norm": 1.1363, "tokens_per_sec": 148243, "dt_s": 4.421, "eta_s": 2361, "world_size": 1, "timestamp": "2026-05-05T07:11:01.051628"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87350, "epoch": 0, "train_loss": 3.5721485018730164, "train_ppl": 35.59298265064389, "lr": 0.000249436236783428, "grad_norm": 0.9104, "tokens_per_sec": 151347, "dt_s": 4.33, "eta_s": 2347, "world_size": 1, "timestamp": "2026-05-05T07:11:05.381781"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87360, "epoch": 0, "train_loss": 3.556658700108528, "train_ppl": 35.04590242584418, "lr": 0.00024907372509530313, "grad_norm": 0.9894, "tokens_per_sec": 149373, "dt_s": 4.387, "eta_s": 2334, "world_size": 1, "timestamp": "2026-05-05T07:11:09.769212"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87370, "epoch": 0, "train_loss": 3.556864768266678, "train_ppl": 35.05312501455499, "lr": 0.0002487112134071783, "grad_norm": 0.9442, "tokens_per_sec": 131987, "dt_s": 4.965, "eta_s": 2393, "world_size": 1, "timestamp": "2026-05-05T07:11:14.734560"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87380, "epoch": 0, "train_loss": 3.652629092335701, "train_ppl": 38.5759525861474, "lr": 0.0002483487017190534, "grad_norm": 1.0041, "tokens_per_sec": 147952, "dt_s": 4.43, "eta_s": 2391, "world_size": 1, "timestamp": "2026-05-05T07:11:19.164101"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87390, "epoch": 0, "train_loss": 3.6522938311100006, "train_ppl": 38.563021732728814, "lr": 0.0002479861900309285, "grad_norm": 0.9728, "tokens_per_sec": 145853, "dt_s": 4.493, "eta_s": 2394, "world_size": 1, "timestamp": "2026-05-05T07:11:23.657404"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87400, "epoch": 0, "train_loss": 3.593237057328224, "train_ppl": 36.35155775260231, "lr": 0.0002476236783428037, "grad_norm": 0.976, "tokens_per_sec": 149182, "dt_s": 4.393, "eta_s": 2396, "world_size": 1, "timestamp": "2026-05-05T07:11:28.050442"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87410, "epoch": 0, "train_loss": 3.537140801548958, "train_ppl": 34.368512178523204, "lr": 0.0002472611666546788, "grad_norm": 1.0126, "tokens_per_sec": 148942, "dt_s": 4.4, "eta_s": 2393, "world_size": 1, "timestamp": "2026-05-05T07:11:32.450571"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87420, "epoch": 0, "train_loss": 3.5793733298778534, "train_ppl": 35.851067013413825, "lr": 0.0002468986549665539, "grad_norm": 0.9925, "tokens_per_sec": 147641, "dt_s": 4.439, "eta_s": 2333, "world_size": 1, "timestamp": "2026-05-05T07:11:36.889403"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87430, "epoch": 0, "train_loss": 3.7045837938785553, "train_ppl": 40.63313203871381, "lr": 0.0002465361432784291, "grad_norm": 0.9212, "tokens_per_sec": 150285, "dt_s": 4.361, "eta_s": 2321, "world_size": 1, "timestamp": "2026-05-05T07:11:41.250211"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87440, "epoch": 0, "train_loss": 3.546810030937195, "train_ppl": 34.70244102211264, "lr": 0.00024617363159030423, "grad_norm": 0.9924, "tokens_per_sec": 150894, "dt_s": 4.343, "eta_s": 2301, "world_size": 1, "timestamp": "2026-05-05T07:11:45.593402"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87450, "epoch": 0, "train_loss": 3.642026364803314, "train_ppl": 38.169102940963896, "lr": 0.0002458111199021793, "grad_norm": 0.9756, "tokens_per_sec": 147621, "dt_s": 4.439, "eta_s": 2302, "world_size": 1, "timestamp": "2026-05-05T07:11:50.032858"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87460, "epoch": 0, "train_loss": 3.5647924542427063, "train_ppl": 35.33211961173564, "lr": 0.0002454486082140545, "grad_norm": 1.1117, "tokens_per_sec": 151120, "dt_s": 4.337, "eta_s": 2291, "world_size": 1, "timestamp": "2026-05-05T07:11:54.369534"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87470, "epoch": 0, "train_loss": 3.6274583637714386, "train_ppl": 37.61708606384197, "lr": 0.0002450860965259296, "grad_norm": 1.1429, "tokens_per_sec": 148421, "dt_s": 4.416, "eta_s": 2284, "world_size": 1, "timestamp": "2026-05-05T07:11:58.785064"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87480, "epoch": 0, "train_loss": 3.6459416449069977, "train_ppl": 38.31883860754217, "lr": 0.0002447235848378047, "grad_norm": 1.0012, "tokens_per_sec": 147329, "dt_s": 4.448, "eta_s": 2288, "world_size": 1, "timestamp": "2026-05-05T07:12:03.233372"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87490, "epoch": 0, "train_loss": 3.5791035592556, "train_ppl": 35.84139675319187, "lr": 0.0002443610731496799, "grad_norm": 1.0281, "tokens_per_sec": 150513, "dt_s": 4.354, "eta_s": 2285, "world_size": 1, "timestamp": "2026-05-05T07:12:07.587537"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87500, "epoch": 0, "train_loss": 3.62740021944046, "train_ppl": 37.61489890712542, "lr": 0.00024399856146155503, "grad_norm": 0.9297, "tokens_per_sec": 147909, "dt_s": 4.431, "eta_s": 2280, "world_size": 1, "timestamp": "2026-05-05T07:12:12.018410"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87510, "epoch": 0, "train_loss": 3.4447221010923386, "train_ppl": 31.334574392611383, "lr": 0.0002436360497734301, "grad_norm": 1.0006, "tokens_per_sec": 106966, "dt_s": 6.127, "eta_s": 2296, "world_size": 1, "timestamp": "2026-05-05T07:12:18.145221"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87520, "epoch": 0, "train_loss": 3.571276158094406, "train_ppl": 35.56194687256852, "lr": 0.00024327353808530528, "grad_norm": 1.0291, "tokens_per_sec": 148065, "dt_s": 4.426, "eta_s": 2293, "world_size": 1, "timestamp": "2026-05-05T07:12:22.571383"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87530, "epoch": 0, "train_loss": 3.5300370156764984, "train_ppl": 34.12523075987838, "lr": 0.0002429110263971804, "grad_norm": 0.9568, "tokens_per_sec": 149945, "dt_s": 4.371, "eta_s": 2280, "world_size": 1, "timestamp": "2026-05-05T07:12:26.942074"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87540, "epoch": 0, "train_loss": 3.63647498190403, "train_ppl": 37.957798693670526, "lr": 0.00024254851470905552, "grad_norm": 0.947, "tokens_per_sec": 150139, "dt_s": 4.365, "eta_s": 2277, "world_size": 1, "timestamp": "2026-05-05T07:12:31.307098"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87550, "epoch": 0, "train_loss": 3.6205465346574783, "train_ppl": 37.3579796742786, "lr": 0.0002421860030209307, "grad_norm": 1.065, "tokens_per_sec": 148310, "dt_s": 4.419, "eta_s": 2271, "world_size": 1, "timestamp": "2026-05-05T07:12:35.725939"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87560, "epoch": 0, "train_loss": 3.5900344848632812, "train_ppl": 36.23532547517605, "lr": 0.00024182349133280583, "grad_norm": 0.8748, "tokens_per_sec": 150884, "dt_s": 4.343, "eta_s": 2247, "world_size": 1, "timestamp": "2026-05-05T07:12:40.069409"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87570, "epoch": 0, "train_loss": 3.8049309998750687, "train_ppl": 44.92215037208555, "lr": 0.000241460979644681, "grad_norm": 1.3766, "tokens_per_sec": 151519, "dt_s": 4.325, "eta_s": 2233, "world_size": 1, "timestamp": "2026-05-05T07:12:44.394671"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87580, "epoch": 0, "train_loss": 3.633892849087715, "train_ppl": 37.859913047285396, "lr": 0.00024109846795655607, "grad_norm": 0.9446, "tokens_per_sec": 149818, "dt_s": 4.374, "eta_s": 2229, "world_size": 1, "timestamp": "2026-05-05T07:12:48.769049"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87590, "epoch": 0, "train_loss": 3.607836365699768, "train_ppl": 36.88615825747813, "lr": 0.0002407359562684312, "grad_norm": 1.0732, "tokens_per_sec": 151284, "dt_s": 4.332, "eta_s": 2221, "world_size": 1, "timestamp": "2026-05-05T07:12:53.101043"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87600, "epoch": 0, "train_loss": 3.6765483170747757, "train_ppl": 39.50978319545467, "lr": 0.00024037344458030638, "grad_norm": 0.9955, "tokens_per_sec": 151567, "dt_s": 4.324, "eta_s": 2207, "world_size": 1, "timestamp": "2026-05-05T07:12:57.424924"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87610, "epoch": 0, "train_loss": 3.6027996987104416, "train_ppl": 36.70084204161167, "lr": 0.0002400109328921815, "grad_norm": 0.9269, "tokens_per_sec": 149243, "dt_s": 4.391, "eta_s": 2207, "world_size": 1, "timestamp": "2026-05-05T07:13:01.816160"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87620, "epoch": 0, "train_loss": 3.6723881363868713, "train_ppl": 39.34575678474316, "lr": 0.00023964842120405663, "grad_norm": 1.0055, "tokens_per_sec": 150508, "dt_s": 4.354, "eta_s": 2206, "world_size": 1, "timestamp": "2026-05-05T07:13:06.170484"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87630, "epoch": 0, "train_loss": 3.611647352576256, "train_ppl": 37.026999123407386, "lr": 0.0002392859095159318, "grad_norm": 0.9721, "tokens_per_sec": 149146, "dt_s": 4.394, "eta_s": 2204, "world_size": 1, "timestamp": "2026-05-05T07:13:10.564586"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87640, "epoch": 0, "train_loss": 3.5619188845157623, "train_ppl": 35.23073603857765, "lr": 0.00023892339782780693, "grad_norm": 1.0762, "tokens_per_sec": 149521, "dt_s": 4.383, "eta_s": 2204, "world_size": 1, "timestamp": "2026-05-05T07:13:14.947653"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87650, "epoch": 0, "train_loss": 3.61601459980011, "train_ppl": 37.18905880229988, "lr": 0.000238560886139682, "grad_norm": 0.9758, "tokens_per_sec": 150919, "dt_s": 4.342, "eta_s": 2202, "world_size": 1, "timestamp": "2026-05-05T07:13:19.290090"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87660, "epoch": 0, "train_loss": 3.5835664570331573, "train_ppl": 36.001710709418624, "lr": 0.00023819837445155718, "grad_norm": 0.8683, "tokens_per_sec": 149141, "dt_s": 4.394, "eta_s": 2198, "world_size": 1, "timestamp": "2026-05-05T07:13:23.684341"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87670, "epoch": 0, "train_loss": 3.678219437599182, "train_ppl": 39.57586400418697, "lr": 0.0002378358627634323, "grad_norm": 1.086, "tokens_per_sec": 134687, "dt_s": 4.866, "eta_s": 2245, "world_size": 1, "timestamp": "2026-05-05T07:13:28.550107"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87680, "epoch": 0, "train_loss": 3.634237065911293, "train_ppl": 37.872947309472934, "lr": 0.00023747335107530742, "grad_norm": 0.994, "tokens_per_sec": 151443, "dt_s": 4.327, "eta_s": 2234, "world_size": 1, "timestamp": "2026-05-05T07:13:32.877575"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87690, "epoch": 0, "train_loss": 3.572581574320793, "train_ppl": 35.608400329008965, "lr": 0.0002371108393871826, "grad_norm": 0.9055, "tokens_per_sec": 149851, "dt_s": 4.373, "eta_s": 2228, "world_size": 1, "timestamp": "2026-05-05T07:13:37.250968"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87700, "epoch": 0, "train_loss": 3.501211166381836, "train_ppl": 33.15558457955863, "lr": 0.00023674832769905773, "grad_norm": 1.0044, "tokens_per_sec": 150049, "dt_s": 4.368, "eta_s": 2226, "world_size": 1, "timestamp": "2026-05-05T07:13:41.618608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87710, "epoch": 0, "train_loss": 3.6670795679092407, "train_ppl": 39.137440560229884, "lr": 0.0002363858160109328, "grad_norm": 0.9876, "tokens_per_sec": 151107, "dt_s": 4.337, "eta_s": 2216, "world_size": 1, "timestamp": "2026-05-05T07:13:45.955679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87720, "epoch": 0, "train_loss": 3.655515819787979, "train_ppl": 38.68747173272399, "lr": 0.00023602330432280797, "grad_norm": 0.9219, "tokens_per_sec": 147231, "dt_s": 4.451, "eta_s": 2170, "world_size": 1, "timestamp": "2026-05-05T07:13:50.406900"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87730, "epoch": 0, "train_loss": 3.604948103427887, "train_ppl": 36.77977506346579, "lr": 0.0002356607926346831, "grad_norm": 0.9255, "tokens_per_sec": 150620, "dt_s": 4.351, "eta_s": 2168, "world_size": 1, "timestamp": "2026-05-05T07:13:54.757988"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87740, "epoch": 0, "train_loss": 3.614191234111786, "train_ppl": 37.121311331475376, "lr": 0.00023529828094655822, "grad_norm": 1.0102, "tokens_per_sec": 149430, "dt_s": 4.386, "eta_s": 2165, "world_size": 1, "timestamp": "2026-05-05T07:13:59.143733"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87750, "epoch": 0, "train_loss": 3.7079276144504547, "train_ppl": 40.76922935722208, "lr": 0.0002349357692584334, "grad_norm": 1.051, "tokens_per_sec": 150944, "dt_s": 4.342, "eta_s": 2158, "world_size": 1, "timestamp": "2026-05-05T07:14:03.485471"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87760, "epoch": 0, "train_loss": 3.552782654762268, "train_ppl": 34.91032583907887, "lr": 0.00023457325757030853, "grad_norm": 0.9911, "tokens_per_sec": 152455, "dt_s": 4.299, "eta_s": 2150, "world_size": 1, "timestamp": "2026-05-05T07:14:07.784203"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87770, "epoch": 0, "train_loss": 3.6536981612443924, "train_ppl": 38.61721498992697, "lr": 0.0002342107458821837, "grad_norm": 0.964, "tokens_per_sec": 149099, "dt_s": 4.395, "eta_s": 2140, "world_size": 1, "timestamp": "2026-05-05T07:14:12.179669"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87780, "epoch": 0, "train_loss": 3.442131459712982, "train_ppl": 31.253502806630166, "lr": 0.00023384823419405877, "grad_norm": 1.095, "tokens_per_sec": 150481, "dt_s": 4.355, "eta_s": 2136, "world_size": 1, "timestamp": "2026-05-05T07:14:16.534774"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87790, "epoch": 0, "train_loss": 3.6446900814771652, "train_ppl": 38.27091014947631, "lr": 0.0002334857225059339, "grad_norm": 1.0215, "tokens_per_sec": 150590, "dt_s": 4.352, "eta_s": 2129, "world_size": 1, "timestamp": "2026-05-05T07:14:20.886714"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87800, "epoch": 0, "train_loss": 3.7083829641342163, "train_ppl": 40.78779784017267, "lr": 0.00023312321081780908, "grad_norm": 1.0364, "tokens_per_sec": 145858, "dt_s": 4.493, "eta_s": 2139, "world_size": 1, "timestamp": "2026-05-05T07:14:25.379839"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87810, "epoch": 0, "train_loss": 3.6029423773288727, "train_ppl": 36.70607884063017, "lr": 0.0002327606991296842, "grad_norm": 0.9821, "tokens_per_sec": 149958, "dt_s": 4.37, "eta_s": 2142, "world_size": 1, "timestamp": "2026-05-05T07:14:29.750153"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87820, "epoch": 0, "train_loss": 3.569864258170128, "train_ppl": 35.51177239148005, "lr": 0.00023239818744155932, "grad_norm": 0.9547, "tokens_per_sec": 149208, "dt_s": 4.392, "eta_s": 2137, "world_size": 1, "timestamp": "2026-05-05T07:14:34.142416"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87830, "epoch": 0, "train_loss": 3.55206860601902, "train_ppl": 34.88540706245643, "lr": 0.0002320356757534345, "grad_norm": 1.0136, "tokens_per_sec": 147198, "dt_s": 4.452, "eta_s": 2142, "world_size": 1, "timestamp": "2026-05-05T07:14:38.594667"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87840, "epoch": 0, "train_loss": 3.560186579823494, "train_ppl": 35.16975850031264, "lr": 0.00023167316406530963, "grad_norm": 0.9633, "tokens_per_sec": 147881, "dt_s": 4.432, "eta_s": 2145, "world_size": 1, "timestamp": "2026-05-05T07:14:43.026355"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87850, "epoch": 0, "train_loss": 3.597313776612282, "train_ppl": 36.500055334911956, "lr": 0.0002313106523771847, "grad_norm": 0.9432, "tokens_per_sec": 146649, "dt_s": 4.469, "eta_s": 2139, "world_size": 1, "timestamp": "2026-05-05T07:14:47.495253"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87860, "epoch": 0, "train_loss": 3.6275368481874466, "train_ppl": 37.62003853473357, "lr": 0.00023094814068905987, "grad_norm": 1.0162, "tokens_per_sec": 147244, "dt_s": 4.451, "eta_s": 2142, "world_size": 1, "timestamp": "2026-05-05T07:14:51.946067"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87870, "epoch": 0, "train_loss": 3.63057479262352, "train_ppl": 37.73449989708315, "lr": 0.000230585629000935, "grad_norm": 0.99, "tokens_per_sec": 150431, "dt_s": 4.357, "eta_s": 2134, "world_size": 1, "timestamp": "2026-05-05T07:14:56.302628"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87880, "epoch": 0, "train_loss": 3.6777122020721436, "train_ppl": 39.55579481028517, "lr": 0.00023022311731281012, "grad_norm": 1.0306, "tokens_per_sec": 148146, "dt_s": 4.424, "eta_s": 2127, "world_size": 1, "timestamp": "2026-05-05T07:15:00.726363"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87890, "epoch": 0, "train_loss": 3.631241723895073, "train_ppl": 37.75967460904912, "lr": 0.0002298606056246853, "grad_norm": 0.9899, "tokens_per_sec": 149885, "dt_s": 4.372, "eta_s": 2117, "world_size": 1, "timestamp": "2026-05-05T07:15:05.098781"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87900, "epoch": 0, "train_loss": 3.5302252769470215, "train_ppl": 34.13165582395451, "lr": 0.00022949809393656042, "grad_norm": 0.902, "tokens_per_sec": 149129, "dt_s": 4.395, "eta_s": 2105, "world_size": 1, "timestamp": "2026-05-05T07:15:09.493395"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87910, "epoch": 0, "train_loss": 3.621540129184723, "train_ppl": 37.39511680500156, "lr": 0.0002291355822484355, "grad_norm": 0.9847, "tokens_per_sec": 146237, "dt_s": 4.482, "eta_s": 2104, "world_size": 1, "timestamp": "2026-05-05T07:15:13.974893"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87920, "epoch": 0, "train_loss": 3.6368914395570755, "train_ppl": 37.97360980152919, "lr": 0.00022877307056031067, "grad_norm": 1.0338, "tokens_per_sec": 148226, "dt_s": 4.421, "eta_s": 2106, "world_size": 1, "timestamp": "2026-05-05T07:15:18.396283"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87930, "epoch": 0, "train_loss": 3.5946608930826187, "train_ppl": 36.40335326565965, "lr": 0.0002284105588721858, "grad_norm": 0.9501, "tokens_per_sec": 149012, "dt_s": 4.398, "eta_s": 2099, "world_size": 1, "timestamp": "2026-05-05T07:15:22.794282"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87940, "epoch": 0, "train_loss": 3.561121240258217, "train_ppl": 35.20264564884576, "lr": 0.00022804804718406092, "grad_norm": 1.1003, "tokens_per_sec": 148064, "dt_s": 4.426, "eta_s": 2099, "world_size": 1, "timestamp": "2026-05-05T07:15:27.220511"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87950, "epoch": 0, "train_loss": 3.5757313668727875, "train_ppl": 35.72073622766667, "lr": 0.0002276855354959361, "grad_norm": 0.9589, "tokens_per_sec": 150437, "dt_s": 4.356, "eta_s": 2091, "world_size": 1, "timestamp": "2026-05-05T07:15:31.576845"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87960, "epoch": 0, "train_loss": 3.5831152349710464, "train_ppl": 35.985469607720056, "lr": 0.00022732302380781122, "grad_norm": 0.9778, "tokens_per_sec": 132758, "dt_s": 4.936, "eta_s": 2130, "world_size": 1, "timestamp": "2026-05-05T07:15:36.513355"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87970, "epoch": 0, "train_loss": 3.5472564697265625, "train_ppl": 34.71793699661533, "lr": 0.00022696051211968635, "grad_norm": 0.9085, "tokens_per_sec": 150086, "dt_s": 4.367, "eta_s": 2120, "world_size": 1, "timestamp": "2026-05-05T07:15:40.879923"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87980, "epoch": 0, "train_loss": 3.63243006169796, "train_ppl": 37.80457252947628, "lr": 0.00022659800043156147, "grad_norm": 0.9963, "tokens_per_sec": 151539, "dt_s": 4.325, "eta_s": 2109, "world_size": 1, "timestamp": "2026-05-05T07:15:45.204626"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 87990, "epoch": 0, "train_loss": 3.6588174998760223, "train_ppl": 38.81541648789783, "lr": 0.0002262354887434366, "grad_norm": 0.9652, "tokens_per_sec": 148740, "dt_s": 4.406, "eta_s": 2103, "world_size": 1, "timestamp": "2026-05-05T07:15:49.610721"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88000, "epoch": 0, "train_loss": 3.633192539215088, "train_ppl": 37.83340865812749, "lr": 0.00022587297705531177, "grad_norm": 0.9689, "tokens_per_sec": 148246, "dt_s": 4.421, "eta_s": 2104, "world_size": 1, "timestamp": "2026-05-05T07:15:54.031486"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88010, "epoch": 0, "train_loss": 3.567500352859497, "train_ppl": 35.42792506674727, "lr": 0.0002255104653671869, "grad_norm": 0.988, "tokens_per_sec": 128118, "dt_s": 5.115, "eta_s": 2044, "world_size": 1, "timestamp": "2026-05-05T07:15:59.146794"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88020, "epoch": 0, "train_loss": 3.5778583735227585, "train_ppl": 35.79679533159943, "lr": 0.00022514795367906202, "grad_norm": 0.9572, "tokens_per_sec": 147527, "dt_s": 4.442, "eta_s": 2047, "world_size": 1, "timestamp": "2026-05-05T07:16:03.589102"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88030, "epoch": 0, "train_loss": 3.6730851978063583, "train_ppl": 39.37319275498559, "lr": 0.0002247854419909372, "grad_norm": 0.9736, "tokens_per_sec": 149011, "dt_s": 4.398, "eta_s": 2050, "world_size": 1, "timestamp": "2026-05-05T07:16:07.987147"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88040, "epoch": 0, "train_loss": 3.5706443786621094, "train_ppl": 35.53948666165873, "lr": 0.00022442293030281232, "grad_norm": 0.9427, "tokens_per_sec": 149173, "dt_s": 4.393, "eta_s": 2044, "world_size": 1, "timestamp": "2026-05-05T07:16:12.380463"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88050, "epoch": 0, "train_loss": 3.6821862757205963, "train_ppl": 39.733166841378434, "lr": 0.0002240604186146874, "grad_norm": 0.9395, "tokens_per_sec": 150043, "dt_s": 4.368, "eta_s": 2035, "world_size": 1, "timestamp": "2026-05-05T07:16:16.748269"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88060, "epoch": 0, "train_loss": 3.637559235095978, "train_ppl": 37.99897687781815, "lr": 0.00022369790692656257, "grad_norm": 0.9247, "tokens_per_sec": 152143, "dt_s": 4.308, "eta_s": 2027, "world_size": 1, "timestamp": "2026-05-05T07:16:21.055785"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88070, "epoch": 0, "train_loss": 3.5316433161497116, "train_ppl": 34.18009018275925, "lr": 0.0002233353952384377, "grad_norm": 0.9382, "tokens_per_sec": 147297, "dt_s": 4.449, "eta_s": 2023, "world_size": 1, "timestamp": "2026-05-05T07:16:25.505026"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88080, "epoch": 0, "train_loss": 3.607678472995758, "train_ppl": 36.88033466197403, "lr": 0.00022297288355031282, "grad_norm": 0.9477, "tokens_per_sec": 151085, "dt_s": 4.338, "eta_s": 2013, "world_size": 1, "timestamp": "2026-05-05T07:16:29.842697"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88090, "epoch": 0, "train_loss": 3.4749330580234528, "train_ppl": 32.29566652849561, "lr": 0.000222610371862188, "grad_norm": 0.8975, "tokens_per_sec": 151386, "dt_s": 4.329, "eta_s": 2003, "world_size": 1, "timestamp": "2026-05-05T07:16:34.171765"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88100, "epoch": 0, "train_loss": 3.476855307817459, "train_ppl": 32.35780657203607, "lr": 0.00022224786017406312, "grad_norm": 0.9527, "tokens_per_sec": 148887, "dt_s": 4.402, "eta_s": 2001, "world_size": 1, "timestamp": "2026-05-05T07:16:38.573510"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88110, "epoch": 0, "train_loss": 3.590248078107834, "train_ppl": 36.24306592253592, "lr": 0.0002218853484859382, "grad_norm": 0.9405, "tokens_per_sec": 151100, "dt_s": 4.337, "eta_s": 2000, "world_size": 1, "timestamp": "2026-05-05T07:16:42.910772"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88120, "epoch": 0, "train_loss": 3.488107368350029, "train_ppl": 32.72395466742764, "lr": 0.00022152283679781337, "grad_norm": 0.9086, "tokens_per_sec": 151286, "dt_s": 4.332, "eta_s": 1985, "world_size": 1, "timestamp": "2026-05-05T07:16:47.242706"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88130, "epoch": 0, "train_loss": 3.7590073943138123, "train_ppl": 42.90581627773794, "lr": 0.0002211603251096885, "grad_norm": 1.4689, "tokens_per_sec": 148875, "dt_s": 4.402, "eta_s": 1986, "world_size": 1, "timestamp": "2026-05-05T07:16:51.644784"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88140, "epoch": 0, "train_loss": 3.707047939300537, "train_ppl": 40.733381448844, "lr": 0.00022079781342156362, "grad_norm": 1.0809, "tokens_per_sec": 149643, "dt_s": 4.379, "eta_s": 1986, "world_size": 1, "timestamp": "2026-05-05T07:16:56.024276"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88150, "epoch": 0, "train_loss": 3.677375540137291, "train_ppl": 39.542480121269946, "lr": 0.0002204353017334388, "grad_norm": 0.9501, "tokens_per_sec": 147500, "dt_s": 4.443, "eta_s": 1986, "world_size": 1, "timestamp": "2026-05-05T07:17:00.467385"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88160, "epoch": 0, "train_loss": 3.68602691590786, "train_ppl": 39.88606105661137, "lr": 0.00022007279004531392, "grad_norm": 0.9268, "tokens_per_sec": 148824, "dt_s": 4.404, "eta_s": 1987, "world_size": 1, "timestamp": "2026-05-05T07:17:04.870956"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88170, "epoch": 0, "train_loss": 3.6890674978494644, "train_ppl": 40.00752245667439, "lr": 0.00021971027835718905, "grad_norm": 0.9906, "tokens_per_sec": 149848, "dt_s": 4.374, "eta_s": 1987, "world_size": 1, "timestamp": "2026-05-05T07:17:09.244464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88180, "epoch": 0, "train_loss": 3.6012394577264786, "train_ppl": 36.64362453187154, "lr": 0.00021934776666906417, "grad_norm": 0.9291, "tokens_per_sec": 148092, "dt_s": 4.425, "eta_s": 1985, "world_size": 1, "timestamp": "2026-05-05T07:17:13.669816"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88190, "epoch": 0, "train_loss": 3.580305203795433, "train_ppl": 35.88449125885352, "lr": 0.0002189852549809393, "grad_norm": 1.0471, "tokens_per_sec": 151469, "dt_s": 4.327, "eta_s": 1975, "world_size": 1, "timestamp": "2026-05-05T07:17:17.996523"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88200, "epoch": 0, "train_loss": 3.504573315382004, "train_ppl": 33.267246201422275, "lr": 0.00021862274329281447, "grad_norm": 1.0762, "tokens_per_sec": 151520, "dt_s": 4.325, "eta_s": 1960, "world_size": 1, "timestamp": "2026-05-05T07:17:22.321748"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88210, "epoch": 0, "train_loss": 3.557406723499298, "train_ppl": 35.07212738782903, "lr": 0.0002182602316046896, "grad_norm": 0.9089, "tokens_per_sec": 148067, "dt_s": 4.426, "eta_s": 1958, "world_size": 1, "timestamp": "2026-05-05T07:17:26.747862"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88220, "epoch": 0, "train_loss": 3.6075868010520935, "train_ppl": 36.8769539249743, "lr": 0.00021789771991656472, "grad_norm": 0.9355, "tokens_per_sec": 150392, "dt_s": 4.358, "eta_s": 1952, "world_size": 1, "timestamp": "2026-05-05T07:17:31.105543"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88230, "epoch": 0, "train_loss": 3.6265156120061874, "train_ppl": 37.58163920097332, "lr": 0.0002175352082284399, "grad_norm": 0.9689, "tokens_per_sec": 151408, "dt_s": 4.328, "eta_s": 1939, "world_size": 1, "timestamp": "2026-05-05T07:17:35.433980"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88240, "epoch": 0, "train_loss": 3.5861169695854187, "train_ppl": 36.09365072172358, "lr": 0.00021717269654031502, "grad_norm": 1.0576, "tokens_per_sec": 149016, "dt_s": 4.398, "eta_s": 1941, "world_size": 1, "timestamp": "2026-05-05T07:17:39.831915"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88250, "epoch": 0, "train_loss": 3.5812221616506577, "train_ppl": 35.91741091565641, "lr": 0.0002168101848521901, "grad_norm": 1.0406, "tokens_per_sec": 150380, "dt_s": 4.358, "eta_s": 1940, "world_size": 1, "timestamp": "2026-05-05T07:17:44.189929"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88260, "epoch": 0, "train_loss": 3.431286185979843, "train_ppl": 30.916381404858658, "lr": 0.00021644767316406527, "grad_norm": 0.9557, "tokens_per_sec": 133851, "dt_s": 4.896, "eta_s": 1977, "world_size": 1, "timestamp": "2026-05-05T07:17:49.086154"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88270, "epoch": 0, "train_loss": 3.555866166949272, "train_ppl": 35.01813838948861, "lr": 0.0002160851614759404, "grad_norm": 1.0049, "tokens_per_sec": 149265, "dt_s": 4.391, "eta_s": 1975, "world_size": 1, "timestamp": "2026-05-05T07:17:53.476723"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88280, "epoch": 0, "train_loss": 3.529961436986923, "train_ppl": 34.122651717117655, "lr": 0.00021572264978781552, "grad_norm": 0.9726, "tokens_per_sec": 150934, "dt_s": 4.342, "eta_s": 1972, "world_size": 1, "timestamp": "2026-05-05T07:17:57.818768"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88290, "epoch": 0, "train_loss": 3.5400720834732056, "train_ppl": 34.46940377565103, "lr": 0.0002153601380996907, "grad_norm": 1.0149, "tokens_per_sec": 148679, "dt_s": 4.408, "eta_s": 1969, "world_size": 1, "timestamp": "2026-05-05T07:18:02.226644"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88300, "epoch": 0, "train_loss": 3.636056572198868, "train_ppl": 37.941920104420085, "lr": 0.00021499762641156582, "grad_norm": 1.0578, "tokens_per_sec": 149212, "dt_s": 4.392, "eta_s": 1967, "world_size": 1, "timestamp": "2026-05-05T07:18:06.618801"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88310, "epoch": 0, "train_loss": 3.648483380675316, "train_ppl": 38.41635885307767, "lr": 0.0002146351147234409, "grad_norm": 0.9102, "tokens_per_sec": 149722, "dt_s": 4.377, "eta_s": 1917, "world_size": 1, "timestamp": "2026-05-05T07:18:10.995969"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88320, "epoch": 0, "train_loss": 3.6286632865667343, "train_ppl": 37.66243906627902, "lr": 0.00021427260303531607, "grad_norm": 0.9929, "tokens_per_sec": 147758, "dt_s": 4.435, "eta_s": 1917, "world_size": 1, "timestamp": "2026-05-05T07:18:15.431327"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88330, "epoch": 0, "train_loss": 3.5562057346105576, "train_ppl": 35.03003143597682, "lr": 0.0002139100913471912, "grad_norm": 1.0929, "tokens_per_sec": 150067, "dt_s": 4.367, "eta_s": 1915, "world_size": 1, "timestamp": "2026-05-05T07:18:19.798436"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88340, "epoch": 0, "train_loss": 3.553221106529236, "train_ppl": 34.92563568919859, "lr": 0.00021354757965906632, "grad_norm": 1.061, "tokens_per_sec": 149203, "dt_s": 4.392, "eta_s": 1909, "world_size": 1, "timestamp": "2026-05-05T07:18:24.190840"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88350, "epoch": 0, "train_loss": 3.509814068675041, "train_ppl": 33.4420492813234, "lr": 0.0002131850679709415, "grad_norm": 0.9441, "tokens_per_sec": 148254, "dt_s": 4.421, "eta_s": 1907, "world_size": 1, "timestamp": "2026-05-05T07:18:28.611378"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88360, "epoch": 0, "train_loss": 3.5265971571207047, "train_ppl": 34.00804645714358, "lr": 0.00021282255628281662, "grad_norm": 0.9194, "tokens_per_sec": 149539, "dt_s": 4.383, "eta_s": 1903, "world_size": 1, "timestamp": "2026-05-05T07:18:32.993899"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88370, "epoch": 0, "train_loss": 3.6415434181690216, "train_ppl": 38.15067375168025, "lr": 0.00021246004459469174, "grad_norm": 1.0129, "tokens_per_sec": 148234, "dt_s": 4.421, "eta_s": 1897, "world_size": 1, "timestamp": "2026-05-05T07:18:37.415024"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88380, "epoch": 0, "train_loss": 3.574934706091881, "train_ppl": 35.69229025045067, "lr": 0.00021209753290656687, "grad_norm": 0.9429, "tokens_per_sec": 150274, "dt_s": 4.361, "eta_s": 1892, "world_size": 1, "timestamp": "2026-05-05T07:18:41.776119"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88390, "epoch": 0, "train_loss": 3.5631613433361053, "train_ppl": 35.274535981494154, "lr": 0.000211735021218442, "grad_norm": 0.9414, "tokens_per_sec": 152930, "dt_s": 4.285, "eta_s": 1879, "world_size": 1, "timestamp": "2026-05-05T07:18:46.061459"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88400, "epoch": 0, "train_loss": 3.5754882246255875, "train_ppl": 35.71205206337503, "lr": 0.00021137250953031712, "grad_norm": 0.9617, "tokens_per_sec": 149446, "dt_s": 4.385, "eta_s": 1871, "world_size": 1, "timestamp": "2026-05-05T07:18:50.446732"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88410, "epoch": 0, "train_loss": 3.6425155103206635, "train_ppl": 38.18777775354679, "lr": 0.0002110099978421923, "grad_norm": 0.9323, "tokens_per_sec": 152386, "dt_s": 4.301, "eta_s": 1860, "world_size": 1, "timestamp": "2026-05-05T07:18:54.747400"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88420, "epoch": 0, "train_loss": 3.5126085579395294, "train_ppl": 33.53563342806804, "lr": 0.00021064748615406742, "grad_norm": 1.0593, "tokens_per_sec": 152593, "dt_s": 4.295, "eta_s": 1845, "world_size": 1, "timestamp": "2026-05-05T07:18:59.042239"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88430, "epoch": 0, "train_loss": 3.711745470762253, "train_ppl": 40.925177922031104, "lr": 0.0002102849744659426, "grad_norm": 0.9861, "tokens_per_sec": 148641, "dt_s": 4.409, "eta_s": 1845, "world_size": 1, "timestamp": "2026-05-05T07:19:03.451248"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88440, "epoch": 0, "train_loss": 3.739809513092041, "train_ppl": 42.089971812738895, "lr": 0.00020992246277781772, "grad_norm": 0.9713, "tokens_per_sec": 150185, "dt_s": 4.364, "eta_s": 1847, "world_size": 1, "timestamp": "2026-05-05T07:19:07.814934"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88450, "epoch": 0, "train_loss": 3.5457902401685715, "train_ppl": 34.6670698317817, "lr": 0.0002095599510896928, "grad_norm": 0.9122, "tokens_per_sec": 148658, "dt_s": 4.409, "eta_s": 1845, "world_size": 1, "timestamp": "2026-05-05T07:19:12.223439"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88460, "epoch": 0, "train_loss": 3.5900443494319916, "train_ppl": 36.23568292279697, "lr": 0.00020919743940156797, "grad_norm": 0.9282, "tokens_per_sec": 149190, "dt_s": 4.393, "eta_s": 1848, "world_size": 1, "timestamp": "2026-05-05T07:19:16.616235"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88470, "epoch": 0, "train_loss": 3.615968272089958, "train_ppl": 37.18733595827089, "lr": 0.0002088349277134431, "grad_norm": 0.9658, "tokens_per_sec": 150116, "dt_s": 4.366, "eta_s": 1850, "world_size": 1, "timestamp": "2026-05-05T07:19:20.981926"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88480, "epoch": 0, "train_loss": 3.5675883293151855, "train_ppl": 35.43104202713459, "lr": 0.00020847241602531822, "grad_norm": 0.931, "tokens_per_sec": 151467, "dt_s": 4.327, "eta_s": 1838, "world_size": 1, "timestamp": "2026-05-05T07:19:25.308650"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88490, "epoch": 0, "train_loss": 3.58572755753994, "train_ppl": 36.07959815566406, "lr": 0.0002081099043371934, "grad_norm": 0.9375, "tokens_per_sec": 151075, "dt_s": 4.338, "eta_s": 1832, "world_size": 1, "timestamp": "2026-05-05T07:19:29.646647"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88500, "epoch": 0, "train_loss": 3.7033550441265106, "train_ppl": 40.5832347497152, "lr": 0.00020774739264906852, "grad_norm": 0.9835, "tokens_per_sec": 151598, "dt_s": 4.323, "eta_s": 1820, "world_size": 1, "timestamp": "2026-05-05T07:19:33.969669"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88510, "epoch": 0, "train_loss": 3.7122892588377, "train_ppl": 40.94743859776711, "lr": 0.0002073848809609436, "grad_norm": 1.1575, "tokens_per_sec": 127332, "dt_s": 5.147, "eta_s": 1815, "world_size": 1, "timestamp": "2026-05-05T07:19:39.116529"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88520, "epoch": 0, "train_loss": 3.599992021918297, "train_ppl": 36.59794246113814, "lr": 0.00020702236927281877, "grad_norm": 0.9858, "tokens_per_sec": 149514, "dt_s": 4.383, "eta_s": 1813, "world_size": 1, "timestamp": "2026-05-05T07:19:43.499818"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88530, "epoch": 0, "train_loss": 3.5292592644691467, "train_ppl": 34.09870013890233, "lr": 0.0002066598575846939, "grad_norm": 0.9752, "tokens_per_sec": 149019, "dt_s": 4.398, "eta_s": 1814, "world_size": 1, "timestamp": "2026-05-05T07:19:47.897642"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88540, "epoch": 0, "train_loss": 3.605610102415085, "train_ppl": 36.80413129831908, "lr": 0.00020629734589656902, "grad_norm": 0.9029, "tokens_per_sec": 147008, "dt_s": 4.458, "eta_s": 1820, "world_size": 1, "timestamp": "2026-05-05T07:19:52.355640"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88550, "epoch": 0, "train_loss": 3.544515237212181, "train_ppl": 34.62289738124331, "lr": 0.0002059348342084442, "grad_norm": 0.9712, "tokens_per_sec": 134825, "dt_s": 4.861, "eta_s": 1860, "world_size": 1, "timestamp": "2026-05-05T07:19:57.216442"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88560, "epoch": 0, "train_loss": 3.619980275630951, "train_ppl": 37.33683136934959, "lr": 0.00020557232252031932, "grad_norm": 0.9337, "tokens_per_sec": 149091, "dt_s": 4.396, "eta_s": 1856, "world_size": 1, "timestamp": "2026-05-05T07:20:01.612158"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88570, "epoch": 0, "train_loss": 3.558615133166313, "train_ppl": 35.11453450303082, "lr": 0.00020520981083219444, "grad_norm": 0.9021, "tokens_per_sec": 150328, "dt_s": 4.36, "eta_s": 1850, "world_size": 1, "timestamp": "2026-05-05T07:20:05.971702"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88580, "epoch": 0, "train_loss": 3.53999325633049, "train_ppl": 34.466686758129065, "lr": 0.00020484729914406957, "grad_norm": 0.9695, "tokens_per_sec": 153056, "dt_s": 4.282, "eta_s": 1836, "world_size": 1, "timestamp": "2026-05-05T07:20:10.253534"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88590, "epoch": 0, "train_loss": 3.598363146185875, "train_ppl": 36.538377485947905, "lr": 0.0002044847874559447, "grad_norm": 0.9617, "tokens_per_sec": 148627, "dt_s": 4.409, "eta_s": 1827, "world_size": 1, "timestamp": "2026-05-05T07:20:14.663014"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88600, "epoch": 0, "train_loss": 3.569671392440796, "train_ppl": 35.504924048024506, "lr": 0.00020412227576781982, "grad_norm": 0.9619, "tokens_per_sec": 150187, "dt_s": 4.364, "eta_s": 1782, "world_size": 1, "timestamp": "2026-05-05T07:20:19.026608"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88610, "epoch": 0, "train_loss": 3.64822818338871, "train_ppl": 38.40655635337595, "lr": 0.000203759764079695, "grad_norm": 0.968, "tokens_per_sec": 150712, "dt_s": 4.348, "eta_s": 1774, "world_size": 1, "timestamp": "2026-05-05T07:20:23.375038"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88620, "epoch": 0, "train_loss": 3.5485899746418, "train_ppl": 34.76426441830439, "lr": 0.00020339725239157012, "grad_norm": 1.0111, "tokens_per_sec": 148729, "dt_s": 4.406, "eta_s": 1773, "world_size": 1, "timestamp": "2026-05-05T07:20:27.781432"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88630, "epoch": 0, "train_loss": 3.6467660069465637, "train_ppl": 38.35044022728871, "lr": 0.0002030347407034453, "grad_norm": 1.001, "tokens_per_sec": 149354, "dt_s": 4.388, "eta_s": 1777, "world_size": 1, "timestamp": "2026-05-05T07:20:32.169403"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88640, "epoch": 0, "train_loss": 3.6855844408273697, "train_ppl": 39.86841637248931, "lr": 0.00020267222901532042, "grad_norm": 1.1172, "tokens_per_sec": 150393, "dt_s": 4.358, "eta_s": 1769, "world_size": 1, "timestamp": "2026-05-05T07:20:36.527086"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88650, "epoch": 0, "train_loss": 3.5692373365163803, "train_ppl": 35.489516269556205, "lr": 0.0002023097173271955, "grad_norm": 0.9917, "tokens_per_sec": 147394, "dt_s": 4.446, "eta_s": 1771, "world_size": 1, "timestamp": "2026-05-05T07:20:40.973388"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88660, "epoch": 0, "train_loss": 3.6445820927619934, "train_ppl": 38.266777546202086, "lr": 0.00020194720563907067, "grad_norm": 0.9936, "tokens_per_sec": 150107, "dt_s": 4.366, "eta_s": 1768, "world_size": 1, "timestamp": "2026-05-05T07:20:45.339329"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88670, "epoch": 0, "train_loss": 3.639546647667885, "train_ppl": 38.07457161626945, "lr": 0.0002015846939509458, "grad_norm": 0.9312, "tokens_per_sec": 148613, "dt_s": 4.41, "eta_s": 1764, "world_size": 1, "timestamp": "2026-05-05T07:20:49.749152"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88680, "epoch": 0, "train_loss": 3.625444859266281, "train_ppl": 37.54142009403082, "lr": 0.00020122218226282092, "grad_norm": 0.9582, "tokens_per_sec": 148179, "dt_s": 4.423, "eta_s": 1763, "world_size": 1, "timestamp": "2026-05-05T07:20:54.171907"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88690, "epoch": 0, "train_loss": 3.5787577778100967, "train_ppl": 35.829005605651616, "lr": 0.0002008596705746961, "grad_norm": 1.0534, "tokens_per_sec": 149889, "dt_s": 4.372, "eta_s": 1759, "world_size": 1, "timestamp": "2026-05-05T07:20:58.544219"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88700, "epoch": 0, "train_loss": 3.644320383667946, "train_ppl": 38.2567640928814, "lr": 0.00020049715888657122, "grad_norm": 0.9772, "tokens_per_sec": 145780, "dt_s": 4.496, "eta_s": 1759, "world_size": 1, "timestamp": "2026-05-05T07:21:03.039765"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88710, "epoch": 0, "train_loss": 3.4866082817316055, "train_ppl": 32.674935376145534, "lr": 0.0002001346471984463, "grad_norm": 0.9437, "tokens_per_sec": 147912, "dt_s": 4.431, "eta_s": 1760, "world_size": 1, "timestamp": "2026-05-05T07:21:07.470482"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88720, "epoch": 0, "train_loss": 3.5746401101350784, "train_ppl": 35.68177699471122, "lr": 0.00019977213551032147, "grad_norm": 0.9872, "tokens_per_sec": 149500, "dt_s": 4.384, "eta_s": 1753, "world_size": 1, "timestamp": "2026-05-05T07:21:11.854144"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88730, "epoch": 0, "train_loss": 3.5594292283058167, "train_ppl": 35.143132714148535, "lr": 0.0001994096238221966, "grad_norm": 1.0, "tokens_per_sec": 147039, "dt_s": 4.457, "eta_s": 1751, "world_size": 1, "timestamp": "2026-05-05T07:21:16.311202"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88740, "epoch": 0, "train_loss": 3.6510483473539352, "train_ppl": 38.51502201321626, "lr": 0.00019904711213407172, "grad_norm": 0.9971, "tokens_per_sec": 149832, "dt_s": 4.374, "eta_s": 1747, "world_size": 1, "timestamp": "2026-05-05T07:21:20.685136"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88750, "epoch": 0, "train_loss": 3.5525757670402527, "train_ppl": 34.90310406836481, "lr": 0.0001986846004459469, "grad_norm": 0.9269, "tokens_per_sec": 149425, "dt_s": 4.386, "eta_s": 1734, "world_size": 1, "timestamp": "2026-05-05T07:21:25.071015"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88760, "epoch": 0, "train_loss": 3.6133603155612946, "train_ppl": 37.09047935647541, "lr": 0.00019832208875782202, "grad_norm": 1.1036, "tokens_per_sec": 148791, "dt_s": 4.405, "eta_s": 1728, "world_size": 1, "timestamp": "2026-05-05T07:21:29.475580"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88770, "epoch": 0, "train_loss": 3.5726234018802643, "train_ppl": 35.60988977264108, "lr": 0.00019795957706969714, "grad_norm": 0.9179, "tokens_per_sec": 149147, "dt_s": 4.394, "eta_s": 1724, "world_size": 1, "timestamp": "2026-05-05T07:21:33.869637"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88780, "epoch": 0, "train_loss": 3.463792607188225, "train_ppl": 31.937874923676784, "lr": 0.00019759706538157227, "grad_norm": 1.0224, "tokens_per_sec": 147385, "dt_s": 4.447, "eta_s": 1719, "world_size": 1, "timestamp": "2026-05-05T07:21:38.316232"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88790, "epoch": 0, "train_loss": 3.7334602028131485, "train_ppl": 41.82357613143868, "lr": 0.0001972345536934474, "grad_norm": 1.0868, "tokens_per_sec": 148718, "dt_s": 4.407, "eta_s": 1717, "world_size": 1, "timestamp": "2026-05-05T07:21:42.722960"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88800, "epoch": 0, "train_loss": 3.627757966518402, "train_ppl": 37.628357934616865, "lr": 0.00019687204200532251, "grad_norm": 0.9628, "tokens_per_sec": 149995, "dt_s": 4.369, "eta_s": 1711, "world_size": 1, "timestamp": "2026-05-05T07:21:47.092217"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88810, "epoch": 0, "train_loss": 3.5453625917434692, "train_ppl": 34.6522476835253, "lr": 0.0001965095303171977, "grad_norm": 0.9425, "tokens_per_sec": 146938, "dt_s": 4.46, "eta_s": 1711, "world_size": 1, "timestamp": "2026-05-05T07:21:51.552321"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88820, "epoch": 0, "train_loss": 3.5073326975107193, "train_ppl": 33.359170014198035, "lr": 0.00019614701862907282, "grad_norm": 0.9544, "tokens_per_sec": 147326, "dt_s": 4.448, "eta_s": 1711, "world_size": 1, "timestamp": "2026-05-05T07:21:56.000689"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88830, "epoch": 0, "train_loss": 3.5920769423246384, "train_ppl": 36.30941021777064, "lr": 0.00019578450694094794, "grad_norm": 0.9897, "tokens_per_sec": 147470, "dt_s": 4.444, "eta_s": 1706, "world_size": 1, "timestamp": "2026-05-05T07:22:00.444722"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88840, "epoch": 0, "train_loss": 3.717859372496605, "train_ppl": 41.17615688687682, "lr": 0.00019542199525282312, "grad_norm": 0.9169, "tokens_per_sec": 146777, "dt_s": 4.465, "eta_s": 1706, "world_size": 1, "timestamp": "2026-05-05T07:22:04.909727"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88850, "epoch": 0, "train_loss": 3.6910243034362793, "train_ppl": 40.08588604628211, "lr": 0.0001950594835646982, "grad_norm": 0.9858, "tokens_per_sec": 135012, "dt_s": 4.854, "eta_s": 1739, "world_size": 1, "timestamp": "2026-05-05T07:22:09.763813"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88860, "epoch": 0, "train_loss": 3.5882535874843597, "train_ppl": 36.17085150681402, "lr": 0.00019469697187657337, "grad_norm": 1.0174, "tokens_per_sec": 147233, "dt_s": 4.451, "eta_s": 1734, "world_size": 1, "timestamp": "2026-05-05T07:22:14.215000"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88870, "epoch": 0, "train_loss": 3.599892646074295, "train_ppl": 36.5943056904239, "lr": 0.0001943344601884485, "grad_norm": 0.9738, "tokens_per_sec": 148648, "dt_s": 4.409, "eta_s": 1726, "world_size": 1, "timestamp": "2026-05-05T07:22:18.623806"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88880, "epoch": 0, "train_loss": 3.6635347604751587, "train_ppl": 38.9989514735702, "lr": 0.00019397194850032362, "grad_norm": 1.1149, "tokens_per_sec": 149201, "dt_s": 4.392, "eta_s": 1718, "world_size": 1, "timestamp": "2026-05-05T07:22:23.016265"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88890, "epoch": 0, "train_loss": 3.608293831348419, "train_ppl": 36.90303626805231, "lr": 0.0001936094368121988, "grad_norm": 1.0365, "tokens_per_sec": 146420, "dt_s": 4.476, "eta_s": 1714, "world_size": 1, "timestamp": "2026-05-05T07:22:27.492133"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88900, "epoch": 0, "train_loss": 3.6532847583293915, "train_ppl": 38.6012538201053, "lr": 0.00019324692512407392, "grad_norm": 0.978, "tokens_per_sec": 146827, "dt_s": 4.463, "eta_s": 1680, "world_size": 1, "timestamp": "2026-05-05T07:22:31.955610"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88910, "epoch": 0, "train_loss": 3.5637588500976562, "train_ppl": 35.29561905326539, "lr": 0.000192884413435949, "grad_norm": 0.9689, "tokens_per_sec": 148894, "dt_s": 4.402, "eta_s": 1672, "world_size": 1, "timestamp": "2026-05-05T07:22:36.357126"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88920, "epoch": 0, "train_loss": 3.584784060716629, "train_ppl": 36.04557322314977, "lr": 0.00019252190174782417, "grad_norm": 0.962, "tokens_per_sec": 146980, "dt_s": 4.459, "eta_s": 1671, "world_size": 1, "timestamp": "2026-05-05T07:22:40.815993"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88930, "epoch": 0, "train_loss": 3.6695701330900192, "train_ppl": 39.235036390894585, "lr": 0.0001921593900596993, "grad_norm": 1.0113, "tokens_per_sec": 149167, "dt_s": 4.393, "eta_s": 1667, "world_size": 1, "timestamp": "2026-05-05T07:22:45.209432"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88940, "epoch": 0, "train_loss": 3.591865912079811, "train_ppl": 36.30174864248334, "lr": 0.00019179687837157441, "grad_norm": 0.9739, "tokens_per_sec": 150293, "dt_s": 4.361, "eta_s": 1654, "world_size": 1, "timestamp": "2026-05-05T07:22:49.570015"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88950, "epoch": 0, "train_loss": 3.648917004466057, "train_ppl": 38.43302071245824, "lr": 0.0001914343666834496, "grad_norm": 1.1939, "tokens_per_sec": 148135, "dt_s": 4.424, "eta_s": 1646, "world_size": 1, "timestamp": "2026-05-05T07:22:53.994061"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88960, "epoch": 0, "train_loss": 3.6009343564510345, "train_ppl": 36.632446220635195, "lr": 0.00019107185499532472, "grad_norm": 1.0005, "tokens_per_sec": 149861, "dt_s": 4.373, "eta_s": 1640, "world_size": 1, "timestamp": "2026-05-05T07:22:58.367171"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88970, "epoch": 0, "train_loss": 3.6363944113254547, "train_ppl": 37.9547405350688, "lr": 0.00019070934330719984, "grad_norm": 1.0253, "tokens_per_sec": 148733, "dt_s": 4.406, "eta_s": 1632, "world_size": 1, "timestamp": "2026-05-05T07:23:02.773464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88980, "epoch": 0, "train_loss": 3.551956295967102, "train_ppl": 34.88148930058421, "lr": 0.00019034683161907496, "grad_norm": 0.9548, "tokens_per_sec": 150475, "dt_s": 4.355, "eta_s": 1624, "world_size": 1, "timestamp": "2026-05-05T07:23:07.128728"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 88990, "epoch": 0, "train_loss": 3.6121072322130203, "train_ppl": 37.04403100232171, "lr": 0.0001899843199309501, "grad_norm": 1.0303, "tokens_per_sec": 150603, "dt_s": 4.352, "eta_s": 1619, "world_size": 1, "timestamp": "2026-05-05T07:23:11.480333"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89000, "epoch": 0, "train_loss": 3.625276818871498, "train_ppl": 37.53511214898714, "lr": 0.0001896218082428252, "grad_norm": 0.9575, "tokens_per_sec": 148581, "dt_s": 4.411, "eta_s": 1614, "world_size": 1, "timestamp": "2026-05-05T07:23:15.891106"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89010, "epoch": 0, "train_loss": 3.675257131457329, "train_ppl": 39.458801652044926, "lr": 0.0001892592965547004, "grad_norm": 0.9843, "tokens_per_sec": 126292, "dt_s": 5.189, "eta_s": 1612, "world_size": 1, "timestamp": "2026-05-05T07:23:21.080364"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89020, "epoch": 0, "train_loss": 3.575012728571892, "train_ppl": 35.695075160094646, "lr": 0.00018889678486657552, "grad_norm": 1.0865, "tokens_per_sec": 147496, "dt_s": 4.443, "eta_s": 1611, "world_size": 1, "timestamp": "2026-05-05T07:23:25.523594"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89030, "epoch": 0, "train_loss": 3.6723732501268387, "train_ppl": 39.34517107793599, "lr": 0.00018853427317845064, "grad_norm": 1.0088, "tokens_per_sec": 147398, "dt_s": 4.446, "eta_s": 1613, "world_size": 1, "timestamp": "2026-05-05T07:23:29.969786"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89040, "epoch": 0, "train_loss": 3.5400453060865402, "train_ppl": 34.46848078745567, "lr": 0.00018817176149032582, "grad_norm": 0.9153, "tokens_per_sec": 150468, "dt_s": 4.355, "eta_s": 1609, "world_size": 1, "timestamp": "2026-05-05T07:23:34.325273"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89050, "epoch": 0, "train_loss": 3.6583616584539413, "train_ppl": 38.797726845389526, "lr": 0.0001878092498022009, "grad_norm": 0.9624, "tokens_per_sec": 150581, "dt_s": 4.352, "eta_s": 1600, "world_size": 1, "timestamp": "2026-05-05T07:23:38.677449"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89060, "epoch": 0, "train_loss": 3.4999291002750397, "train_ppl": 33.11310416548643, "lr": 0.00018744673811407607, "grad_norm": 1.0409, "tokens_per_sec": 149148, "dt_s": 4.394, "eta_s": 1594, "world_size": 1, "timestamp": "2026-05-05T07:23:43.071489"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89070, "epoch": 0, "train_loss": 3.6530130356550217, "train_ppl": 38.59076640908148, "lr": 0.0001870842264259512, "grad_norm": 0.9969, "tokens_per_sec": 150970, "dt_s": 4.341, "eta_s": 1583, "world_size": 1, "timestamp": "2026-05-05T07:23:47.412494"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89080, "epoch": 0, "train_loss": 3.5205315351486206, "train_ppl": 33.80239084845062, "lr": 0.00018672171473782631, "grad_norm": 0.9588, "tokens_per_sec": 150122, "dt_s": 4.366, "eta_s": 1573, "world_size": 1, "timestamp": "2026-05-05T07:23:51.777997"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89090, "epoch": 0, "train_loss": 3.536604106426239, "train_ppl": 34.35007171455806, "lr": 0.0001863592030497015, "grad_norm": 1.0618, "tokens_per_sec": 149416, "dt_s": 4.386, "eta_s": 1570, "world_size": 1, "timestamp": "2026-05-05T07:23:56.164144"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89100, "epoch": 0, "train_loss": 3.5743662416934967, "train_ppl": 35.672006220067296, "lr": 0.00018599669136157662, "grad_norm": 0.95, "tokens_per_sec": 151559, "dt_s": 4.324, "eta_s": 1564, "world_size": 1, "timestamp": "2026-05-05T07:24:00.488278"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89110, "epoch": 0, "train_loss": 3.584493577480316, "train_ppl": 36.03510410900985, "lr": 0.00018563417967345169, "grad_norm": 1.1124, "tokens_per_sec": 147019, "dt_s": 4.458, "eta_s": 1564, "world_size": 1, "timestamp": "2026-05-05T07:24:04.945938"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89120, "epoch": 0, "train_loss": 3.570554867386818, "train_ppl": 35.53630561925614, "lr": 0.00018527166798532686, "grad_norm": 1.0014, "tokens_per_sec": 149741, "dt_s": 4.377, "eta_s": 1562, "world_size": 1, "timestamp": "2026-05-05T07:24:09.322540"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89130, "epoch": 0, "train_loss": 3.619534358382225, "train_ppl": 37.32018594374624, "lr": 0.000184909156297202, "grad_norm": 0.9344, "tokens_per_sec": 150873, "dt_s": 4.344, "eta_s": 1556, "world_size": 1, "timestamp": "2026-05-05T07:24:13.666334"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89140, "epoch": 0, "train_loss": 3.4960701912641525, "train_ppl": 32.98556993919252, "lr": 0.0001845466446090771, "grad_norm": 0.9361, "tokens_per_sec": 133340, "dt_s": 4.915, "eta_s": 1590, "world_size": 1, "timestamp": "2026-05-05T07:24:18.581292"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89150, "epoch": 0, "train_loss": 3.5977598428726196, "train_ppl": 36.51634040993856, "lr": 0.0001841841329209523, "grad_norm": 1.0868, "tokens_per_sec": 151834, "dt_s": 4.316, "eta_s": 1584, "world_size": 1, "timestamp": "2026-05-05T07:24:22.897589"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89160, "epoch": 0, "train_loss": 3.536930650472641, "train_ppl": 34.36129035756078, "lr": 0.00018382162123282742, "grad_norm": 1.1611, "tokens_per_sec": 150229, "dt_s": 4.362, "eta_s": 1573, "world_size": 1, "timestamp": "2026-05-05T07:24:27.259976"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89170, "epoch": 0, "train_loss": 3.6419455856084824, "train_ppl": 38.16601979608952, "lr": 0.00018345910954470254, "grad_norm": 0.942, "tokens_per_sec": 148776, "dt_s": 4.405, "eta_s": 1571, "world_size": 1, "timestamp": "2026-05-05T07:24:31.664969"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89180, "epoch": 0, "train_loss": 3.5385612696409225, "train_ppl": 34.41736624304649, "lr": 0.00018309659785657766, "grad_norm": 1.1355, "tokens_per_sec": 151446, "dt_s": 4.327, "eta_s": 1565, "world_size": 1, "timestamp": "2026-05-05T07:24:35.992324"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89190, "epoch": 0, "train_loss": 3.6597703099250793, "train_ppl": 38.85241783161259, "lr": 0.0001827340861684528, "grad_norm": 1.1399, "tokens_per_sec": 149572, "dt_s": 4.382, "eta_s": 1523, "world_size": 1, "timestamp": "2026-05-05T07:24:40.373919"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89200, "epoch": 0, "train_loss": 3.5651481449604034, "train_ppl": 35.34468915402034, "lr": 0.0001823715744803279, "grad_norm": 1.0723, "tokens_per_sec": 150788, "dt_s": 4.346, "eta_s": 1521, "world_size": 1, "timestamp": "2026-05-05T07:24:44.720154"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89210, "epoch": 0, "train_loss": 3.7000048011541367, "train_ppl": 40.447498554276216, "lr": 0.0001820090627922031, "grad_norm": 1.0192, "tokens_per_sec": 150700, "dt_s": 4.349, "eta_s": 1516, "world_size": 1, "timestamp": "2026-05-05T07:24:49.068939"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89220, "epoch": 0, "train_loss": 3.608378440141678, "train_ppl": 36.90615872151018, "lr": 0.00018164655110407821, "grad_norm": 1.1014, "tokens_per_sec": 149362, "dt_s": 4.388, "eta_s": 1510, "world_size": 1, "timestamp": "2026-05-05T07:24:53.456646"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89230, "epoch": 0, "train_loss": 3.5746061205863953, "train_ppl": 35.680564207826116, "lr": 0.00018128403941595334, "grad_norm": 1.0836, "tokens_per_sec": 151507, "dt_s": 4.326, "eta_s": 1506, "world_size": 1, "timestamp": "2026-05-05T07:24:57.782281"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89240, "epoch": 0, "train_loss": 3.520367130637169, "train_ppl": 33.796834039692015, "lr": 0.00018092152772782852, "grad_norm": 0.9732, "tokens_per_sec": 149762, "dt_s": 4.376, "eta_s": 1501, "world_size": 1, "timestamp": "2026-05-05T07:25:02.158287"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89250, "epoch": 0, "train_loss": 3.577537417411804, "train_ppl": 35.785307974952445, "lr": 0.00018055901603970359, "grad_norm": 1.0637, "tokens_per_sec": 148460, "dt_s": 4.414, "eta_s": 1501, "world_size": 1, "timestamp": "2026-05-05T07:25:06.572666"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89260, "epoch": 0, "train_loss": 3.682282343506813, "train_ppl": 39.73698410211122, "lr": 0.0001801965043515787, "grad_norm": 0.9511, "tokens_per_sec": 148900, "dt_s": 4.401, "eta_s": 1501, "world_size": 1, "timestamp": "2026-05-05T07:25:10.974006"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89270, "epoch": 0, "train_loss": 3.567257896065712, "train_ppl": 35.419336366861515, "lr": 0.0001798339926634539, "grad_norm": 1.0089, "tokens_per_sec": 149732, "dt_s": 4.377, "eta_s": 1496, "world_size": 1, "timestamp": "2026-05-05T07:25:15.350896"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89280, "epoch": 0, "train_loss": 3.6983691602945328, "train_ppl": 40.38139504850691, "lr": 0.000179471480975329, "grad_norm": 1.0418, "tokens_per_sec": 147457, "dt_s": 4.444, "eta_s": 1499, "world_size": 1, "timestamp": "2026-05-05T07:25:19.795311"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89290, "epoch": 0, "train_loss": 3.600987657904625, "train_ppl": 36.63439883530546, "lr": 0.0001791089692872042, "grad_norm": 1.0567, "tokens_per_sec": 150822, "dt_s": 4.345, "eta_s": 1493, "world_size": 1, "timestamp": "2026-05-05T07:25:24.140550"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89300, "epoch": 0, "train_loss": 3.5289757400751114, "train_ppl": 34.08903369600899, "lr": 0.00017874645759907931, "grad_norm": 0.9777, "tokens_per_sec": 148823, "dt_s": 4.404, "eta_s": 1488, "world_size": 1, "timestamp": "2026-05-05T07:25:28.544171"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89310, "epoch": 0, "train_loss": 3.480582058429718, "train_ppl": 32.47862103033867, "lr": 0.00017838394591095438, "grad_norm": 1.0455, "tokens_per_sec": 150875, "dt_s": 4.344, "eta_s": 1479, "world_size": 1, "timestamp": "2026-05-05T07:25:32.887887"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89320, "epoch": 0, "train_loss": 3.624532327055931, "train_ppl": 37.5071779648718, "lr": 0.00017802143422282956, "grad_norm": 1.0072, "tokens_per_sec": 151540, "dt_s": 4.325, "eta_s": 1471, "world_size": 1, "timestamp": "2026-05-05T07:25:37.212547"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89330, "epoch": 0, "train_loss": 3.6040810644626617, "train_ppl": 36.74789938607761, "lr": 0.0001776589225347047, "grad_norm": 1.15, "tokens_per_sec": 146825, "dt_s": 4.464, "eta_s": 1468, "world_size": 1, "timestamp": "2026-05-05T07:25:41.676123"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89340, "epoch": 0, "train_loss": 3.6773266196250916, "train_ppl": 39.54054573020487, "lr": 0.0001772964108465798, "grad_norm": 1.1455, "tokens_per_sec": 150591, "dt_s": 4.352, "eta_s": 1464, "world_size": 1, "timestamp": "2026-05-05T07:25:46.028008"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89350, "epoch": 0, "train_loss": 3.5423552095890045, "train_ppl": 34.54819167874224, "lr": 0.000176933899158455, "grad_norm": 1.0762, "tokens_per_sec": 151367, "dt_s": 4.33, "eta_s": 1455, "world_size": 1, "timestamp": "2026-05-05T07:25:50.357630"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89360, "epoch": 0, "train_loss": 3.5549894124269485, "train_ppl": 34.98744953355476, "lr": 0.00017657138747033011, "grad_norm": 1.0998, "tokens_per_sec": 148156, "dt_s": 4.423, "eta_s": 1456, "world_size": 1, "timestamp": "2026-05-05T07:25:54.781071"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89370, "epoch": 0, "train_loss": 3.6122251003980637, "train_ppl": 37.04839757235743, "lr": 0.00017620887578220524, "grad_norm": 1.0622, "tokens_per_sec": 150893, "dt_s": 4.343, "eta_s": 1453, "world_size": 1, "timestamp": "2026-05-05T07:25:59.124269"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89380, "epoch": 0, "train_loss": 3.5602328926324844, "train_ppl": 35.17138734833828, "lr": 0.00017584636409408036, "grad_norm": 1.0614, "tokens_per_sec": 149465, "dt_s": 4.385, "eta_s": 1443, "world_size": 1, "timestamp": "2026-05-05T07:26:03.508950"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89390, "epoch": 0, "train_loss": 3.6334837675094604, "train_ppl": 37.844428421756895, "lr": 0.00017548385240595549, "grad_norm": 0.9981, "tokens_per_sec": 151000, "dt_s": 4.34, "eta_s": 1438, "world_size": 1, "timestamp": "2026-05-05T07:26:07.849094"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89400, "epoch": 0, "train_loss": 3.7945083677768707, "train_ppl": 44.456374845647794, "lr": 0.0001751213407178306, "grad_norm": 1.066, "tokens_per_sec": 151895, "dt_s": 4.315, "eta_s": 1433, "world_size": 1, "timestamp": "2026-05-05T07:26:12.163653"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89410, "epoch": 0, "train_loss": 3.6052028089761734, "train_ppl": 36.789144269382895, "lr": 0.0001747588290297058, "grad_norm": 1.053, "tokens_per_sec": 148771, "dt_s": 4.405, "eta_s": 1427, "world_size": 1, "timestamp": "2026-05-05T07:26:16.568831"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89420, "epoch": 0, "train_loss": 3.5610341131687164, "train_ppl": 35.19957867839771, "lr": 0.0001743963173415809, "grad_norm": 1.1026, "tokens_per_sec": 149986, "dt_s": 4.369, "eta_s": 1425, "world_size": 1, "timestamp": "2026-05-05T07:26:20.938339"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89430, "epoch": 0, "train_loss": 3.5969249308109283, "train_ppl": 36.48586520071166, "lr": 0.00017403380565345604, "grad_norm": 1.028, "tokens_per_sec": 152042, "dt_s": 4.31, "eta_s": 1415, "world_size": 1, "timestamp": "2026-05-05T07:26:25.248709"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89440, "epoch": 0, "train_loss": 3.6616805344820023, "train_ppl": 38.926705604825294, "lr": 0.00017367129396533121, "grad_norm": 1.0219, "tokens_per_sec": 134519, "dt_s": 4.872, "eta_s": 1446, "world_size": 1, "timestamp": "2026-05-05T07:26:30.120606"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89450, "epoch": 0, "train_loss": 3.548302561044693, "train_ppl": 34.754274131758024, "lr": 0.00017330878227720628, "grad_norm": 0.9931, "tokens_per_sec": 151347, "dt_s": 4.33, "eta_s": 1442, "world_size": 1, "timestamp": "2026-05-05T07:26:34.450774"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89460, "epoch": 0, "train_loss": 3.6667975336313248, "train_ppl": 39.12640401685693, "lr": 0.0001729462705890814, "grad_norm": 0.9674, "tokens_per_sec": 150569, "dt_s": 4.353, "eta_s": 1434, "world_size": 1, "timestamp": "2026-05-05T07:26:38.803346"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89470, "epoch": 0, "train_loss": 3.594742327928543, "train_ppl": 36.406317887834106, "lr": 0.0001725837589009566, "grad_norm": 1.0466, "tokens_per_sec": 149451, "dt_s": 4.385, "eta_s": 1431, "world_size": 1, "timestamp": "2026-05-05T07:26:43.188464"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89480, "epoch": 0, "train_loss": 3.508873865008354, "train_ppl": 33.41062172043668, "lr": 0.0001722212472128317, "grad_norm": 0.9782, "tokens_per_sec": 150166, "dt_s": 4.364, "eta_s": 1430, "world_size": 1, "timestamp": "2026-05-05T07:26:47.552679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89490, "epoch": 0, "train_loss": 3.6093418449163437, "train_ppl": 36.94173142374125, "lr": 0.0001718587355247069, "grad_norm": 1.0622, "tokens_per_sec": 149651, "dt_s": 4.379, "eta_s": 1394, "world_size": 1, "timestamp": "2026-05-05T07:26:51.931950"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89500, "epoch": 0, "train_loss": 3.676307201385498, "train_ppl": 39.500257915239686, "lr": 0.000171496223836582, "grad_norm": 0.9269, "tokens_per_sec": 149690, "dt_s": 4.378, "eta_s": 1393, "world_size": 1, "timestamp": "2026-05-05T07:26:56.310059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89510, "epoch": 0, "train_loss": 3.4076937437057495, "train_ppl": 30.195525275154814, "lr": 0.00017113371214845708, "grad_norm": 1.0792, "tokens_per_sec": 127728, "dt_s": 5.131, "eta_s": 1389, "world_size": 1, "timestamp": "2026-05-05T07:27:01.440945"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89520, "epoch": 0, "train_loss": 3.673082262277603, "train_ppl": 39.37307717401572, "lr": 0.00017077120046033226, "grad_norm": 1.047, "tokens_per_sec": 148587, "dt_s": 4.411, "eta_s": 1386, "world_size": 1, "timestamp": "2026-05-05T07:27:05.851551"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89530, "epoch": 0, "train_loss": 3.5966532677412033, "train_ppl": 36.4759546847912, "lr": 0.00017040868877220739, "grad_norm": 1.0707, "tokens_per_sec": 148334, "dt_s": 4.418, "eta_s": 1385, "world_size": 1, "timestamp": "2026-05-05T07:27:10.269700"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89540, "epoch": 0, "train_loss": 3.739325761795044, "train_ppl": 42.069615658338925, "lr": 0.0001700461770840825, "grad_norm": 1.1245, "tokens_per_sec": 150130, "dt_s": 4.365, "eta_s": 1380, "world_size": 1, "timestamp": "2026-05-05T07:27:14.634969"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89550, "epoch": 0, "train_loss": 3.577118933200836, "train_ppl": 35.77033552166645, "lr": 0.0001696836653959577, "grad_norm": 1.0294, "tokens_per_sec": 148516, "dt_s": 4.413, "eta_s": 1378, "world_size": 1, "timestamp": "2026-05-05T07:27:19.047691"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89560, "epoch": 0, "train_loss": 3.566112518310547, "train_ppl": 35.37879107115668, "lr": 0.0001693211537078328, "grad_norm": 1.1157, "tokens_per_sec": 150469, "dt_s": 4.355, "eta_s": 1373, "world_size": 1, "timestamp": "2026-05-05T07:27:23.403118"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89570, "epoch": 0, "train_loss": 3.589556246995926, "train_ppl": 36.21800051345445, "lr": 0.00016895864201970794, "grad_norm": 0.9984, "tokens_per_sec": 148593, "dt_s": 4.41, "eta_s": 1368, "world_size": 1, "timestamp": "2026-05-05T07:27:27.813573"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89580, "epoch": 0, "train_loss": 3.5594568252563477, "train_ppl": 35.144102570826036, "lr": 0.00016859613033158306, "grad_norm": 1.02, "tokens_per_sec": 148900, "dt_s": 4.401, "eta_s": 1363, "world_size": 1, "timestamp": "2026-05-05T07:27:32.214920"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89590, "epoch": 0, "train_loss": 3.7138253301382065, "train_ppl": 41.01038511581938, "lr": 0.00016823361864345818, "grad_norm": 1.0228, "tokens_per_sec": 151226, "dt_s": 4.334, "eta_s": 1357, "world_size": 1, "timestamp": "2026-05-05T07:27:36.548556"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89600, "epoch": 0, "train_loss": 3.515778511762619, "train_ppl": 33.64210850884726, "lr": 0.0001678711069553333, "grad_norm": 1.0019, "tokens_per_sec": 149476, "dt_s": 4.384, "eta_s": 1351, "world_size": 1, "timestamp": "2026-05-05T07:27:40.932946"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89610, "epoch": 0, "train_loss": 3.608499839901924, "train_ppl": 36.91063939230126, "lr": 0.0001675085952672085, "grad_norm": 0.962, "tokens_per_sec": 148344, "dt_s": 4.418, "eta_s": 1350, "world_size": 1, "timestamp": "2026-05-05T07:27:45.350799"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89620, "epoch": 0, "train_loss": 3.4679657965898514, "train_ppl": 32.07143621927393, "lr": 0.0001671460835790836, "grad_norm": 1.0604, "tokens_per_sec": 150532, "dt_s": 4.354, "eta_s": 1342, "world_size": 1, "timestamp": "2026-05-05T07:27:49.704414"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89630, "epoch": 0, "train_loss": 3.596528083086014, "train_ppl": 36.4713887407803, "lr": 0.00016678357189095873, "grad_norm": 1.0722, "tokens_per_sec": 147837, "dt_s": 4.433, "eta_s": 1340, "world_size": 1, "timestamp": "2026-05-05T07:27:54.137402"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89640, "epoch": 0, "train_loss": 3.533313199877739, "train_ppl": 34.237214641505666, "lr": 0.0001664210602028339, "grad_norm": 1.0667, "tokens_per_sec": 149162, "dt_s": 4.394, "eta_s": 1339, "world_size": 1, "timestamp": "2026-05-05T07:27:58.531013"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89650, "epoch": 0, "train_loss": 3.597841128706932, "train_ppl": 36.51930879177688, "lr": 0.00016605854851470898, "grad_norm": 1.2476, "tokens_per_sec": 150440, "dt_s": 4.356, "eta_s": 1333, "world_size": 1, "timestamp": "2026-05-05T07:28:02.887356"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89660, "epoch": 0, "train_loss": 3.6599970906972885, "train_ppl": 38.86122981208674, "lr": 0.0001656960368265841, "grad_norm": 1.0262, "tokens_per_sec": 147836, "dt_s": 4.433, "eta_s": 1329, "world_size": 1, "timestamp": "2026-05-05T07:28:07.320354"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89670, "epoch": 0, "train_loss": 3.682354748249054, "train_ppl": 39.73986135236459, "lr": 0.00016533352513845929, "grad_norm": 1.0666, "tokens_per_sec": 149699, "dt_s": 4.378, "eta_s": 1326, "world_size": 1, "timestamp": "2026-05-05T07:28:11.698217"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89680, "epoch": 0, "train_loss": 3.6523717790842056, "train_ppl": 38.566027759307424, "lr": 0.0001649710134503344, "grad_norm": 1.0342, "tokens_per_sec": 150316, "dt_s": 4.36, "eta_s": 1318, "world_size": 1, "timestamp": "2026-05-05T07:28:16.058073"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89690, "epoch": 0, "train_loss": 3.6691503524780273, "train_ppl": 39.21856973973915, "lr": 0.00016460850176220953, "grad_norm": 1.0875, "tokens_per_sec": 147957, "dt_s": 4.429, "eta_s": 1315, "world_size": 1, "timestamp": "2026-05-05T07:28:20.487460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89700, "epoch": 0, "train_loss": 3.6028097569942474, "train_ppl": 36.70121119095334, "lr": 0.0001642459900740847, "grad_norm": 1.0263, "tokens_per_sec": 151046, "dt_s": 4.339, "eta_s": 1310, "world_size": 1, "timestamp": "2026-05-05T07:28:24.826295"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89710, "epoch": 0, "train_loss": 3.6515633910894394, "train_ppl": 38.53486404334501, "lr": 0.00016388347838595978, "grad_norm": 0.9921, "tokens_per_sec": 148370, "dt_s": 4.417, "eta_s": 1305, "world_size": 1, "timestamp": "2026-05-05T07:28:29.243343"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89720, "epoch": 0, "train_loss": 3.5643890649080276, "train_ppl": 35.31786988579964, "lr": 0.00016352096669783496, "grad_norm": 0.9816, "tokens_per_sec": 149025, "dt_s": 4.398, "eta_s": 1301, "world_size": 1, "timestamp": "2026-05-05T07:28:33.641004"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89730, "epoch": 0, "train_loss": 3.5233797878026962, "train_ppl": 33.8988058398455, "lr": 0.00016315845500971008, "grad_norm": 0.9847, "tokens_per_sec": 135454, "dt_s": 4.838, "eta_s": 1325, "world_size": 1, "timestamp": "2026-05-05T07:28:38.479250"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89740, "epoch": 0, "train_loss": 3.5782285034656525, "train_ppl": 35.81004724972586, "lr": 0.0001627959433215852, "grad_norm": 1.0002, "tokens_per_sec": 148157, "dt_s": 4.423, "eta_s": 1320, "world_size": 1, "timestamp": "2026-05-05T07:28:42.902637"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89750, "epoch": 0, "train_loss": 3.5537041425704956, "train_ppl": 34.94251010514782, "lr": 0.0001624334316334604, "grad_norm": 0.9943, "tokens_per_sec": 152356, "dt_s": 4.302, "eta_s": 1314, "world_size": 1, "timestamp": "2026-05-05T07:28:47.204155"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89760, "epoch": 0, "train_loss": 3.586163431406021, "train_ppl": 36.09532773740659, "lr": 0.0001620709199453355, "grad_norm": 1.0768, "tokens_per_sec": 149785, "dt_s": 4.375, "eta_s": 1307, "world_size": 1, "timestamp": "2026-05-05T07:28:51.579505"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89770, "epoch": 0, "train_loss": 3.6409930735826492, "train_ppl": 38.12968351137691, "lr": 0.00016170840825721063, "grad_norm": 1.1814, "tokens_per_sec": 147843, "dt_s": 4.433, "eta_s": 1304, "world_size": 1, "timestamp": "2026-05-05T07:28:56.012333"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89780, "epoch": 0, "train_loss": 3.570699706673622, "train_ppl": 35.541453045183445, "lr": 0.00016134589656908576, "grad_norm": 1.0617, "tokens_per_sec": 150016, "dt_s": 4.369, "eta_s": 1273, "world_size": 1, "timestamp": "2026-05-05T07:29:00.380901"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89790, "epoch": 0, "train_loss": 3.4923769384622574, "train_ppl": 32.8639705774034, "lr": 0.00016098338488096088, "grad_norm": 1.0689, "tokens_per_sec": 148674, "dt_s": 4.408, "eta_s": 1267, "world_size": 1, "timestamp": "2026-05-05T07:29:04.788934"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89800, "epoch": 0, "train_loss": 3.615103766322136, "train_ppl": 37.15520118419517, "lr": 0.000160620873192836, "grad_norm": 1.0468, "tokens_per_sec": 148232, "dt_s": 4.421, "eta_s": 1270, "world_size": 1, "timestamp": "2026-05-05T07:29:09.210119"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89810, "epoch": 0, "train_loss": 3.6064825356006622, "train_ppl": 36.836254454446276, "lr": 0.00016025836150471119, "grad_norm": 1.0505, "tokens_per_sec": 149197, "dt_s": 4.393, "eta_s": 1267, "world_size": 1, "timestamp": "2026-05-05T07:29:13.602684"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89820, "epoch": 0, "train_loss": 3.556087449193001, "train_ppl": 35.025888139132014, "lr": 0.0001598958498165863, "grad_norm": 1.0268, "tokens_per_sec": 148298, "dt_s": 4.419, "eta_s": 1261, "world_size": 1, "timestamp": "2026-05-05T07:29:18.021909"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89830, "epoch": 0, "train_loss": 3.500357359647751, "train_ppl": 33.12728819971088, "lr": 0.00015953333812846143, "grad_norm": 1.1471, "tokens_per_sec": 149509, "dt_s": 4.383, "eta_s": 1258, "world_size": 1, "timestamp": "2026-05-05T07:29:22.405321"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89840, "epoch": 0, "train_loss": 3.5707259327173233, "train_ppl": 35.54238516910711, "lr": 0.0001591708264403366, "grad_norm": 1.0807, "tokens_per_sec": 149839, "dt_s": 4.374, "eta_s": 1251, "world_size": 1, "timestamp": "2026-05-05T07:29:26.779079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89850, "epoch": 0, "train_loss": 3.6078263968229294, "train_ppl": 36.885790545742246, "lr": 0.00015880831475221168, "grad_norm": 1.1054, "tokens_per_sec": 148785, "dt_s": 4.405, "eta_s": 1246, "world_size": 1, "timestamp": "2026-05-05T07:29:31.183840"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89860, "epoch": 0, "train_loss": 3.6233271658420563, "train_ppl": 37.46200299577466, "lr": 0.0001584458030640868, "grad_norm": 1.0235, "tokens_per_sec": 150501, "dt_s": 4.355, "eta_s": 1240, "world_size": 1, "timestamp": "2026-05-05T07:29:35.538358"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89870, "epoch": 0, "train_loss": 3.5507689714431763, "train_ppl": 34.84009823008566, "lr": 0.00015808329137596198, "grad_norm": 0.9942, "tokens_per_sec": 150833, "dt_s": 4.345, "eta_s": 1231, "world_size": 1, "timestamp": "2026-05-05T07:29:39.883309"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89880, "epoch": 0, "train_loss": 3.51978862285614, "train_ppl": 33.7772879625499, "lr": 0.0001577207796878371, "grad_norm": 1.0009, "tokens_per_sec": 146334, "dt_s": 4.479, "eta_s": 1232, "world_size": 1, "timestamp": "2026-05-05T07:29:44.361825"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89890, "epoch": 0, "train_loss": 3.5827623903751373, "train_ppl": 35.97277456905732, "lr": 0.00015735826799971223, "grad_norm": 1.0326, "tokens_per_sec": 150788, "dt_s": 4.346, "eta_s": 1226, "world_size": 1, "timestamp": "2026-05-05T07:29:48.708041"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89900, "epoch": 0, "train_loss": 3.5889885425567627, "train_ppl": 36.19744522900052, "lr": 0.0001569957563115874, "grad_norm": 0.9505, "tokens_per_sec": 149275, "dt_s": 4.39, "eta_s": 1221, "world_size": 1, "timestamp": "2026-05-05T07:29:53.098367"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89910, "epoch": 0, "train_loss": 3.549344480037689, "train_ppl": 34.790504141153555, "lr": 0.00015663324462346248, "grad_norm": 1.0548, "tokens_per_sec": 150204, "dt_s": 4.363, "eta_s": 1217, "world_size": 1, "timestamp": "2026-05-05T07:29:57.461460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89920, "epoch": 0, "train_loss": 3.6097965985536575, "train_ppl": 36.958534630845065, "lr": 0.00015627073293533766, "grad_norm": 1.0907, "tokens_per_sec": 150537, "dt_s": 4.353, "eta_s": 1213, "world_size": 1, "timestamp": "2026-05-05T07:30:01.814936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89930, "epoch": 0, "train_loss": 3.664743795990944, "train_ppl": 39.046131106158704, "lr": 0.00015590822124721278, "grad_norm": 0.9974, "tokens_per_sec": 149567, "dt_s": 4.382, "eta_s": 1203, "world_size": 1, "timestamp": "2026-05-05T07:30:06.196668"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89940, "epoch": 0, "train_loss": 3.546211749315262, "train_ppl": 34.68168539888596, "lr": 0.0001555457095590879, "grad_norm": 1.0693, "tokens_per_sec": 149679, "dt_s": 4.378, "eta_s": 1201, "world_size": 1, "timestamp": "2026-05-05T07:30:10.575107"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89950, "epoch": 0, "train_loss": 3.5863390117883682, "train_ppl": 36.10166592526611, "lr": 0.00015518319787096309, "grad_norm": 1.0911, "tokens_per_sec": 151163, "dt_s": 4.335, "eta_s": 1193, "world_size": 1, "timestamp": "2026-05-05T07:30:14.910563"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89960, "epoch": 0, "train_loss": 3.615548014640808, "train_ppl": 37.17171098680567, "lr": 0.0001548206861828382, "grad_norm": 1.1069, "tokens_per_sec": 148575, "dt_s": 4.411, "eta_s": 1192, "world_size": 1, "timestamp": "2026-05-05T07:30:19.321538"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89970, "epoch": 0, "train_loss": 3.581034556031227, "train_ppl": 35.91067323956595, "lr": 0.00015445817449471333, "grad_norm": 1.463, "tokens_per_sec": 150928, "dt_s": 4.342, "eta_s": 1187, "world_size": 1, "timestamp": "2026-05-05T07:30:23.663750"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89980, "epoch": 0, "train_loss": 3.5598945021629333, "train_ppl": 35.159487699536136, "lr": 0.00015409566280658846, "grad_norm": 0.9908, "tokens_per_sec": 150496, "dt_s": 4.355, "eta_s": 1181, "world_size": 1, "timestamp": "2026-05-05T07:30:28.018430"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 89990, "epoch": 0, "train_loss": 3.6268311142921448, "train_ppl": 37.593498164717964, "lr": 0.00015373315111846358, "grad_norm": 1.1159, "tokens_per_sec": 147557, "dt_s": 4.441, "eta_s": 1180, "world_size": 1, "timestamp": "2026-05-05T07:30:32.459829"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90000, "epoch": 0, "train_loss": 3.6323302537202835, "train_ppl": 37.80079951983656, "lr": 0.0001533706394303387, "grad_norm": 1.061, "tokens_per_sec": 150961, "dt_s": 4.341, "eta_s": 1176, "world_size": 1, "timestamp": "2026-05-05T07:30:36.801063"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90010, "epoch": 0, "train_loss": 3.751315340399742, "train_ppl": 42.577048496486036, "lr": 0.00015300812774221388, "grad_norm": 1.1305, "tokens_per_sec": 126237, "dt_s": 5.192, "eta_s": 1172, "world_size": 1, "timestamp": "2026-05-05T07:30:41.992582"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90020, "epoch": 0, "train_loss": 3.617035001516342, "train_ppl": 37.227025949303446, "lr": 0.000152645616054089, "grad_norm": 1.1192, "tokens_per_sec": 149065, "dt_s": 4.396, "eta_s": 1170, "world_size": 1, "timestamp": "2026-05-05T07:30:46.389043"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90030, "epoch": 0, "train_loss": 3.5926287174224854, "train_ppl": 36.329450374467584, "lr": 0.00015228310436596413, "grad_norm": 0.9987, "tokens_per_sec": 133498, "dt_s": 4.909, "eta_s": 1195, "world_size": 1, "timestamp": "2026-05-05T07:30:51.298188"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90040, "epoch": 0, "train_loss": 3.5476667284965515, "train_ppl": 34.732183256870904, "lr": 0.0001519205926778393, "grad_norm": 0.9647, "tokens_per_sec": 147225, "dt_s": 4.451, "eta_s": 1192, "world_size": 1, "timestamp": "2026-05-05T07:30:55.749593"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90050, "epoch": 0, "train_loss": 3.6338351368904114, "train_ppl": 37.85772813156245, "lr": 0.00015155808098971438, "grad_norm": 1.1059, "tokens_per_sec": 150701, "dt_s": 4.349, "eta_s": 1187, "world_size": 1, "timestamp": "2026-05-05T07:31:00.098350"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90060, "epoch": 0, "train_loss": 3.5706745386123657, "train_ppl": 35.5405585469725, "lr": 0.0001511955693015895, "grad_norm": 1.0508, "tokens_per_sec": 150153, "dt_s": 4.365, "eta_s": 1180, "world_size": 1, "timestamp": "2026-05-05T07:31:04.462948"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90070, "epoch": 0, "train_loss": 3.615706145763397, "train_ppl": 37.17758945596479, "lr": 0.00015083305761346468, "grad_norm": 1.029, "tokens_per_sec": 146327, "dt_s": 4.479, "eta_s": 1180, "world_size": 1, "timestamp": "2026-05-05T07:31:08.941677"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90080, "epoch": 0, "train_loss": 3.5779143273830414, "train_ppl": 35.79879835652196, "lr": 0.0001504705459253398, "grad_norm": 1.0539, "tokens_per_sec": 150141, "dt_s": 4.365, "eta_s": 1147, "world_size": 1, "timestamp": "2026-05-05T07:31:13.306657"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90090, "epoch": 0, "train_loss": 3.5996004790067673, "train_ppl": 36.583615601164176, "lr": 0.00015010803423721493, "grad_norm": 1.0334, "tokens_per_sec": 150091, "dt_s": 4.366, "eta_s": 1138, "world_size": 1, "timestamp": "2026-05-05T07:31:17.673059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90100, "epoch": 0, "train_loss": 3.5054072737693787, "train_ppl": 33.2950012720952, "lr": 0.0001497455225490901, "grad_norm": 0.9736, "tokens_per_sec": 150272, "dt_s": 4.361, "eta_s": 1134, "world_size": 1, "timestamp": "2026-05-05T07:31:22.034231"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90110, "epoch": 0, "train_loss": 3.6298577189445496, "train_ppl": 37.707451179538566, "lr": 0.00014938301086096518, "grad_norm": 1.1076, "tokens_per_sec": 151601, "dt_s": 4.323, "eta_s": 1128, "world_size": 1, "timestamp": "2026-05-05T07:31:26.357128"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90120, "epoch": 0, "train_loss": 3.5959693789482117, "train_ppl": 36.4510177161972, "lr": 0.0001490204991728403, "grad_norm": 1.0901, "tokens_per_sec": 150542, "dt_s": 4.353, "eta_s": 1117, "world_size": 1, "timestamp": "2026-05-05T07:31:30.710458"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90130, "epoch": 0, "train_loss": 3.589285284280777, "train_ppl": 36.20818811515514, "lr": 0.00014865798748471548, "grad_norm": 1.0261, "tokens_per_sec": 149731, "dt_s": 4.377, "eta_s": 1113, "world_size": 1, "timestamp": "2026-05-05T07:31:35.087402"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90140, "epoch": 0, "train_loss": 3.5899219512939453, "train_ppl": 36.231248014094206, "lr": 0.0001482954757965906, "grad_norm": 1.0531, "tokens_per_sec": 151392, "dt_s": 4.329, "eta_s": 1107, "world_size": 1, "timestamp": "2026-05-05T07:31:39.416290"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90150, "epoch": 0, "train_loss": 3.5490482598543167, "train_ppl": 34.78020001785723, "lr": 0.00014793296410846578, "grad_norm": 1.0367, "tokens_per_sec": 149093, "dt_s": 4.396, "eta_s": 1104, "world_size": 1, "timestamp": "2026-05-05T07:31:43.811913"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90160, "epoch": 0, "train_loss": 3.7520704567432404, "train_ppl": 42.609211263450874, "lr": 0.0001475704524203409, "grad_norm": 1.126, "tokens_per_sec": 150273, "dt_s": 4.361, "eta_s": 1102, "world_size": 1, "timestamp": "2026-05-05T07:31:48.173053"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90170, "epoch": 0, "train_loss": 3.9361886084079742, "train_ppl": 51.222997873274764, "lr": 0.00014720794073221603, "grad_norm": 1.5829, "tokens_per_sec": 151400, "dt_s": 4.329, "eta_s": 1096, "world_size": 1, "timestamp": "2026-05-05T07:31:52.501705"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90180, "epoch": 0, "train_loss": 3.6295772790908813, "train_ppl": 37.69687799008872, "lr": 0.00014684542904409116, "grad_norm": 1.0423, "tokens_per_sec": 146826, "dt_s": 4.464, "eta_s": 1096, "world_size": 1, "timestamp": "2026-05-05T07:31:56.965281"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90190, "epoch": 0, "train_loss": 3.638451874256134, "train_ppl": 38.03291139601181, "lr": 0.00014648291735596628, "grad_norm": 1.1141, "tokens_per_sec": 150189, "dt_s": 4.364, "eta_s": 1094, "world_size": 1, "timestamp": "2026-05-05T07:32:01.328824"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90200, "epoch": 0, "train_loss": 3.5228473842144012, "train_ppl": 33.880762797493894, "lr": 0.0001461204056678414, "grad_norm": 1.0568, "tokens_per_sec": 150856, "dt_s": 4.344, "eta_s": 1087, "world_size": 1, "timestamp": "2026-05-05T07:32:05.673087"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90210, "epoch": 0, "train_loss": 3.59906966984272, "train_ppl": 36.56420183570724, "lr": 0.00014575789397971658, "grad_norm": 1.0466, "tokens_per_sec": 149473, "dt_s": 4.384, "eta_s": 1084, "world_size": 1, "timestamp": "2026-05-05T07:32:10.057529"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90220, "epoch": 0, "train_loss": 3.5627634823322296, "train_ppl": 35.26050441068982, "lr": 0.0001453953822915917, "grad_norm": 1.0148, "tokens_per_sec": 151293, "dt_s": 4.332, "eta_s": 1079, "world_size": 1, "timestamp": "2026-05-05T07:32:14.389272"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90230, "epoch": 0, "train_loss": 3.492096722126007, "train_ppl": 32.85476284611217, "lr": 0.00014503287060346683, "grad_norm": 1.1205, "tokens_per_sec": 149519, "dt_s": 4.383, "eta_s": 1071, "world_size": 1, "timestamp": "2026-05-05T07:32:18.772402"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90240, "epoch": 0, "train_loss": 3.582916706800461, "train_ppl": 35.97832618737972, "lr": 0.000144670358915342, "grad_norm": 1.055, "tokens_per_sec": 147848, "dt_s": 4.433, "eta_s": 1070, "world_size": 1, "timestamp": "2026-05-05T07:32:23.205021"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90250, "epoch": 0, "train_loss": 3.6190695017576218, "train_ppl": 37.302841439744995, "lr": 0.00014430784722721708, "grad_norm": 1.0374, "tokens_per_sec": 150342, "dt_s": 4.359, "eta_s": 1066, "world_size": 1, "timestamp": "2026-05-05T07:32:27.564153"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90260, "epoch": 0, "train_loss": 3.5827567726373672, "train_ppl": 35.97257248401056, "lr": 0.0001439453355390922, "grad_norm": 1.0444, "tokens_per_sec": 147837, "dt_s": 4.433, "eta_s": 1064, "world_size": 1, "timestamp": "2026-05-05T07:32:31.997133"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90270, "epoch": 0, "train_loss": 3.6394329518079758, "train_ppl": 38.070242941189754, "lr": 0.00014358282385096738, "grad_norm": 1.0419, "tokens_per_sec": 151209, "dt_s": 4.334, "eta_s": 1060, "world_size": 1, "timestamp": "2026-05-05T07:32:36.331322"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90280, "epoch": 0, "train_loss": 3.722940817475319, "train_ppl": 41.385923770604236, "lr": 0.0001432203121628425, "grad_norm": 1.1121, "tokens_per_sec": 151137, "dt_s": 4.336, "eta_s": 1053, "world_size": 1, "timestamp": "2026-05-05T07:32:40.667468"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90290, "epoch": 0, "train_loss": 3.606761083006859, "train_ppl": 36.84651652674902, "lr": 0.00014285780047471763, "grad_norm": 1.1121, "tokens_per_sec": 148606, "dt_s": 4.41, "eta_s": 1048, "world_size": 1, "timestamp": "2026-05-05T07:32:45.077522"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90300, "epoch": 0, "train_loss": 3.5489886105060577, "train_ppl": 34.778125463467376, "lr": 0.0001424952887865928, "grad_norm": 1.1075, "tokens_per_sec": 150735, "dt_s": 4.348, "eta_s": 1043, "world_size": 1, "timestamp": "2026-05-05T07:32:49.425280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90310, "epoch": 0, "train_loss": 3.520531788468361, "train_ppl": 33.80239941126458, "lr": 0.00014213277709846788, "grad_norm": 1.0741, "tokens_per_sec": 151661, "dt_s": 4.321, "eta_s": 1033, "world_size": 1, "timestamp": "2026-05-05T07:32:53.746490"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90320, "epoch": 0, "train_loss": 3.6445930153131485, "train_ppl": 38.26719551932003, "lr": 0.000141770265410343, "grad_norm": 1.0374, "tokens_per_sec": 147165, "dt_s": 4.453, "eta_s": 1035, "world_size": 1, "timestamp": "2026-05-05T07:32:58.199717"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90330, "epoch": 0, "train_loss": 3.5949631482362747, "train_ppl": 36.41435802983412, "lr": 0.00014140775372221818, "grad_norm": 1.0982, "tokens_per_sec": 135256, "dt_s": 4.845, "eta_s": 1054, "world_size": 1, "timestamp": "2026-05-05T07:33:03.045056"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90340, "epoch": 0, "train_loss": 3.59428071975708, "train_ppl": 36.389516312174855, "lr": 0.0001410452420340933, "grad_norm": 1.0814, "tokens_per_sec": 150804, "dt_s": 4.346, "eta_s": 1047, "world_size": 1, "timestamp": "2026-05-05T07:33:07.390809"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90350, "epoch": 0, "train_loss": 3.4855191707611084, "train_ppl": 32.63936811742863, "lr": 0.00014068273034596848, "grad_norm": 1.121, "tokens_per_sec": 148965, "dt_s": 4.399, "eta_s": 1045, "world_size": 1, "timestamp": "2026-05-05T07:33:11.790258"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90360, "epoch": 0, "train_loss": 3.601299360394478, "train_ppl": 36.64581964849218, "lr": 0.0001403202186578436, "grad_norm": 1.0593, "tokens_per_sec": 151491, "dt_s": 4.326, "eta_s": 1040, "world_size": 1, "timestamp": "2026-05-05T07:33:16.116330"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90370, "epoch": 0, "train_loss": 3.501312419772148, "train_ppl": 33.158941864870485, "lr": 0.00013995770696971873, "grad_norm": 1.0619, "tokens_per_sec": 148812, "dt_s": 4.404, "eta_s": 1034, "world_size": 1, "timestamp": "2026-05-05T07:33:20.520286"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90380, "epoch": 0, "train_loss": 3.5795495361089706, "train_ppl": 35.857384751409846, "lr": 0.00013959519528159385, "grad_norm": 1.2138, "tokens_per_sec": 151186, "dt_s": 4.335, "eta_s": 1006, "world_size": 1, "timestamp": "2026-05-05T07:33:24.855079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90390, "epoch": 0, "train_loss": 3.5723612159490585, "train_ppl": 35.600554584361895, "lr": 0.00013923268359346898, "grad_norm": 1.0065, "tokens_per_sec": 149975, "dt_s": 4.37, "eta_s": 1002, "world_size": 1, "timestamp": "2026-05-05T07:33:29.224869"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90400, "epoch": 0, "train_loss": 3.5156824439764023, "train_ppl": 33.638876741196015, "lr": 0.0001388701719053441, "grad_norm": 1.1885, "tokens_per_sec": 150037, "dt_s": 4.368, "eta_s": 997, "world_size": 1, "timestamp": "2026-05-05T07:33:33.592873"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90410, "epoch": 0, "train_loss": 3.5196080058813095, "train_ppl": 33.7711877618966, "lr": 0.00013850766021721928, "grad_norm": 1.1757, "tokens_per_sec": 151926, "dt_s": 4.314, "eta_s": 992, "world_size": 1, "timestamp": "2026-05-05T07:33:37.906557"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90420, "epoch": 0, "train_loss": 3.50411294400692, "train_ppl": 33.25193443836241, "lr": 0.0001381451485290944, "grad_norm": 1.0717, "tokens_per_sec": 150966, "dt_s": 4.341, "eta_s": 984, "world_size": 1, "timestamp": "2026-05-05T07:33:42.247667"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90430, "epoch": 0, "train_loss": 3.5670038014650345, "train_ppl": 35.410337648042464, "lr": 0.00013778263684096953, "grad_norm": 1.1384, "tokens_per_sec": 151002, "dt_s": 4.34, "eta_s": 980, "world_size": 1, "timestamp": "2026-05-05T07:33:46.587737"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90440, "epoch": 0, "train_loss": 3.5345918238162994, "train_ppl": 34.2810191625226, "lr": 0.0001374201251528447, "grad_norm": 0.955, "tokens_per_sec": 150645, "dt_s": 4.35, "eta_s": 975, "world_size": 1, "timestamp": "2026-05-05T07:33:50.938105"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90450, "epoch": 0, "train_loss": 3.5927781015634537, "train_ppl": 36.334877823580946, "lr": 0.00013705761346471978, "grad_norm": 1.0704, "tokens_per_sec": 151685, "dt_s": 4.321, "eta_s": 969, "world_size": 1, "timestamp": "2026-05-05T07:33:55.258639"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90460, "epoch": 0, "train_loss": 3.5790267139673233, "train_ppl": 35.838642616548704, "lr": 0.0001366951017765949, "grad_norm": 1.0275, "tokens_per_sec": 149182, "dt_s": 4.393, "eta_s": 968, "world_size": 1, "timestamp": "2026-05-05T07:33:59.651673"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90470, "epoch": 0, "train_loss": 3.5756651759147644, "train_ppl": 35.718371916163356, "lr": 0.00013633259008847008, "grad_norm": 1.0372, "tokens_per_sec": 149848, "dt_s": 4.374, "eta_s": 965, "world_size": 1, "timestamp": "2026-05-05T07:34:04.025185"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90480, "epoch": 0, "train_loss": 3.5031219720840454, "train_ppl": 33.218999026682134, "lr": 0.0001359700784003452, "grad_norm": 1.0791, "tokens_per_sec": 149878, "dt_s": 4.373, "eta_s": 962, "world_size": 1, "timestamp": "2026-05-05T07:34:08.397830"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90490, "epoch": 0, "train_loss": 3.6047309041023254, "train_ppl": 36.7717873886179, "lr": 0.00013560756671222033, "grad_norm": 1.0379, "tokens_per_sec": 151317, "dt_s": 4.331, "eta_s": 957, "world_size": 1, "timestamp": "2026-05-05T07:34:12.728835"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90500, "epoch": 0, "train_loss": 3.5030953884124756, "train_ppl": 33.21811595545982, "lr": 0.0001352450550240955, "grad_norm": 1.0004, "tokens_per_sec": 151287, "dt_s": 4.332, "eta_s": 953, "world_size": 1, "timestamp": "2026-05-05T07:34:17.060730"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90510, "epoch": 0, "train_loss": 3.648097962141037, "train_ppl": 38.40155532931565, "lr": 0.00013488254333597058, "grad_norm": 1.087, "tokens_per_sec": 126103, "dt_s": 5.197, "eta_s": 950, "world_size": 1, "timestamp": "2026-05-05T07:34:22.257750"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90520, "epoch": 0, "train_loss": 3.637826442718506, "train_ppl": 38.00913185077084, "lr": 0.0001345200316478457, "grad_norm": 1.054, "tokens_per_sec": 149838, "dt_s": 4.374, "eta_s": 946, "world_size": 1, "timestamp": "2026-05-05T07:34:26.631528"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90530, "epoch": 0, "train_loss": 3.475411996245384, "train_ppl": 32.31113786220965, "lr": 0.00013415751995972088, "grad_norm": 1.1433, "tokens_per_sec": 147242, "dt_s": 4.451, "eta_s": 945, "world_size": 1, "timestamp": "2026-05-05T07:34:31.082440"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90540, "epoch": 0, "train_loss": 3.6332855075597763, "train_ppl": 37.83692613100864, "lr": 0.000133795008271596, "grad_norm": 1.0013, "tokens_per_sec": 148194, "dt_s": 4.422, "eta_s": 944, "world_size": 1, "timestamp": "2026-05-05T07:34:35.504765"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90550, "epoch": 0, "train_loss": 3.680217683315277, "train_ppl": 39.655025370483784, "lr": 0.00013343249658347113, "grad_norm": 1.1892, "tokens_per_sec": 151582, "dt_s": 4.323, "eta_s": 940, "world_size": 1, "timestamp": "2026-05-05T07:34:39.828231"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90560, "epoch": 0, "train_loss": 3.660261571407318, "train_ppl": 38.871509217032305, "lr": 0.0001330699848953463, "grad_norm": 1.2077, "tokens_per_sec": 149753, "dt_s": 4.376, "eta_s": 933, "world_size": 1, "timestamp": "2026-05-05T07:34:44.204498"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90570, "epoch": 0, "train_loss": 3.686855360865593, "train_ppl": 39.91911815387969, "lr": 0.00013270747320722143, "grad_norm": 1.0874, "tokens_per_sec": 149861, "dt_s": 4.373, "eta_s": 929, "world_size": 1, "timestamp": "2026-05-05T07:34:48.577623"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90580, "epoch": 0, "train_loss": 3.658103719353676, "train_ppl": 38.78772068518014, "lr": 0.00013234496151909655, "grad_norm": 0.9992, "tokens_per_sec": 151778, "dt_s": 4.318, "eta_s": 919, "world_size": 1, "timestamp": "2026-05-05T07:34:52.895493"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90590, "epoch": 0, "train_loss": 3.6753890216350555, "train_ppl": 39.46400622361609, "lr": 0.00013198244983097168, "grad_norm": 1.1323, "tokens_per_sec": 147967, "dt_s": 4.429, "eta_s": 915, "world_size": 1, "timestamp": "2026-05-05T07:34:57.324588"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90600, "epoch": 0, "train_loss": 3.5151913315057755, "train_ppl": 33.622360325371396, "lr": 0.0001316199381428468, "grad_norm": 1.267, "tokens_per_sec": 151191, "dt_s": 4.335, "eta_s": 911, "world_size": 1, "timestamp": "2026-05-05T07:35:01.659254"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90610, "epoch": 0, "train_loss": 3.482604220509529, "train_ppl": 32.54436451577955, "lr": 0.00013125742645472198, "grad_norm": 1.1273, "tokens_per_sec": 150841, "dt_s": 4.345, "eta_s": 905, "world_size": 1, "timestamp": "2026-05-05T07:35:06.003969"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90620, "epoch": 0, "train_loss": 3.5988646149635315, "train_ppl": 36.55670493638135, "lr": 0.0001308949147665971, "grad_norm": 1.0515, "tokens_per_sec": 132673, "dt_s": 4.94, "eta_s": 924, "world_size": 1, "timestamp": "2026-05-05T07:35:10.943643"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90630, "epoch": 0, "train_loss": 3.543317124247551, "train_ppl": 34.58144007924253, "lr": 0.00013053240307847223, "grad_norm": 1.1312, "tokens_per_sec": 151161, "dt_s": 4.336, "eta_s": 920, "world_size": 1, "timestamp": "2026-05-05T07:35:15.279142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90640, "epoch": 0, "train_loss": 3.6315988451242447, "train_ppl": 37.77316179859614, "lr": 0.0001301698913903474, "grad_norm": 1.1921, "tokens_per_sec": 150217, "dt_s": 4.363, "eta_s": 913, "world_size": 1, "timestamp": "2026-05-05T07:35:19.641920"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90650, "epoch": 0, "train_loss": 3.4571114480495453, "train_ppl": 31.725204133554247, "lr": 0.00012980737970222248, "grad_norm": 1.0416, "tokens_per_sec": 149045, "dt_s": 4.397, "eta_s": 911, "world_size": 1, "timestamp": "2026-05-05T07:35:24.038956"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90660, "epoch": 0, "train_loss": 3.7465039044618607, "train_ppl": 42.37268379331175, "lr": 0.0001294448680140976, "grad_norm": 1.1499, "tokens_per_sec": 150758, "dt_s": 4.347, "eta_s": 907, "world_size": 1, "timestamp": "2026-05-05T07:35:28.386059"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90670, "epoch": 0, "train_loss": 3.6131834983825684, "train_ppl": 37.083921702328034, "lr": 0.00012908235632597278, "grad_norm": 1.1408, "tokens_per_sec": 149249, "dt_s": 4.391, "eta_s": 880, "world_size": 1, "timestamp": "2026-05-05T07:35:32.777123"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90680, "epoch": 0, "train_loss": 3.587557941675186, "train_ppl": 36.145698155475145, "lr": 0.0001287198446378479, "grad_norm": 1.0545, "tokens_per_sec": 149175, "dt_s": 4.393, "eta_s": 878, "world_size": 1, "timestamp": "2026-05-05T07:35:37.170371"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90690, "epoch": 0, "train_loss": 3.6164742410182953, "train_ppl": 37.20615635566005, "lr": 0.00012835733294972303, "grad_norm": 1.0743, "tokens_per_sec": 147591, "dt_s": 4.44, "eta_s": 877, "world_size": 1, "timestamp": "2026-05-05T07:35:41.610740"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90700, "epoch": 0, "train_loss": 3.4907289892435074, "train_ppl": 32.80985702320654, "lr": 0.0001279948212615982, "grad_norm": 1.2108, "tokens_per_sec": 149558, "dt_s": 4.382, "eta_s": 872, "world_size": 1, "timestamp": "2026-05-05T07:35:45.992702"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90710, "epoch": 0, "train_loss": 3.632140353322029, "train_ppl": 37.79362181449953, "lr": 0.00012763230957347327, "grad_norm": 1.0751, "tokens_per_sec": 150509, "dt_s": 4.354, "eta_s": 868, "world_size": 1, "timestamp": "2026-05-05T07:35:50.346998"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90720, "epoch": 0, "train_loss": 3.5302266478538513, "train_ppl": 34.131702615306665, "lr": 0.0001272697978853484, "grad_norm": 1.0744, "tokens_per_sec": 150977, "dt_s": 4.341, "eta_s": 861, "world_size": 1, "timestamp": "2026-05-05T07:35:54.687790"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90730, "epoch": 0, "train_loss": 3.572582170367241, "train_ppl": 35.60842155327582, "lr": 0.00012690728619722358, "grad_norm": 1.2395, "tokens_per_sec": 147829, "dt_s": 4.433, "eta_s": 859, "world_size": 1, "timestamp": "2026-05-05T07:35:59.121034"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90740, "epoch": 0, "train_loss": 3.5578057765960693, "train_ppl": 35.08612582174694, "lr": 0.0001265447745090987, "grad_norm": 1.0846, "tokens_per_sec": 150272, "dt_s": 4.361, "eta_s": 851, "world_size": 1, "timestamp": "2026-05-05T07:36:03.482196"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90750, "epoch": 0, "train_loss": 3.6556744426488876, "train_ppl": 38.69360893690913, "lr": 0.00012618226282097383, "grad_norm": 1.1356, "tokens_per_sec": 150260, "dt_s": 4.362, "eta_s": 846, "world_size": 1, "timestamp": "2026-05-05T07:36:07.843691"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90760, "epoch": 0, "train_loss": 3.5299468338489532, "train_ppl": 34.12215342296507, "lr": 0.000125819751132849, "grad_norm": 1.0312, "tokens_per_sec": 146579, "dt_s": 4.471, "eta_s": 846, "world_size": 1, "timestamp": "2026-05-05T07:36:12.314725"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90770, "epoch": 0, "train_loss": 3.616757392883301, "train_ppl": 37.21669283986402, "lr": 0.00012545723944472413, "grad_norm": 1.1761, "tokens_per_sec": 148770, "dt_s": 4.405, "eta_s": 844, "world_size": 1, "timestamp": "2026-05-05T07:36:16.719912"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90780, "epoch": 0, "train_loss": 3.5440659672021866, "train_ppl": 34.607345845471706, "lr": 0.00012509472775659925, "grad_norm": 1.0246, "tokens_per_sec": 147545, "dt_s": 4.442, "eta_s": 840, "world_size": 1, "timestamp": "2026-05-05T07:36:21.161676"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90790, "epoch": 0, "train_loss": 3.6231469810009003, "train_ppl": 37.45525351881046, "lr": 0.00012473221606847438, "grad_norm": 1.1427, "tokens_per_sec": 148275, "dt_s": 4.42, "eta_s": 838, "world_size": 1, "timestamp": "2026-05-05T07:36:25.581550"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90800, "epoch": 0, "train_loss": 3.593617156147957, "train_ppl": 36.36537756307996, "lr": 0.0001243697043803495, "grad_norm": 1.0454, "tokens_per_sec": 150419, "dt_s": 4.357, "eta_s": 833, "world_size": 1, "timestamp": "2026-05-05T07:36:29.938449"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90810, "epoch": 0, "train_loss": 3.5969644486904144, "train_ppl": 36.48730707322529, "lr": 0.00012400719269222468, "grad_norm": 1.1322, "tokens_per_sec": 148565, "dt_s": 4.411, "eta_s": 827, "world_size": 1, "timestamp": "2026-05-05T07:36:34.349717"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90820, "epoch": 0, "train_loss": 3.5154592990875244, "train_ppl": 33.63137123522242, "lr": 0.0001236446810040998, "grad_norm": 1.0774, "tokens_per_sec": 151032, "dt_s": 4.339, "eta_s": 820, "world_size": 1, "timestamp": "2026-05-05T07:36:38.688943"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90830, "epoch": 0, "train_loss": 3.5681252628564835, "train_ppl": 35.450071250259, "lr": 0.00012328216931597493, "grad_norm": 1.0974, "tokens_per_sec": 150685, "dt_s": 4.349, "eta_s": 812, "world_size": 1, "timestamp": "2026-05-05T07:36:43.038124"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90840, "epoch": 0, "train_loss": 3.6094783544540405, "train_ppl": 36.946774666637175, "lr": 0.0001229196576278501, "grad_norm": 1.0765, "tokens_per_sec": 147569, "dt_s": 4.441, "eta_s": 808, "world_size": 1, "timestamp": "2026-05-05T07:36:47.479199"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90850, "epoch": 0, "train_loss": 3.5510496497154236, "train_ppl": 34.84987846114677, "lr": 0.00012255714593972517, "grad_norm": 1.0271, "tokens_per_sec": 150220, "dt_s": 4.363, "eta_s": 804, "world_size": 1, "timestamp": "2026-05-05T07:36:51.841853"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90860, "epoch": 0, "train_loss": 3.6457734555006027, "train_ppl": 38.31239432676809, "lr": 0.0001221946342516003, "grad_norm": 1.0536, "tokens_per_sec": 150541, "dt_s": 4.353, "eta_s": 798, "world_size": 1, "timestamp": "2026-05-05T07:36:56.195221"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90870, "epoch": 0, "train_loss": 3.7270089834928513, "train_ppl": 41.55463151216688, "lr": 0.00012183212256347548, "grad_norm": 1.0639, "tokens_per_sec": 148354, "dt_s": 4.418, "eta_s": 796, "world_size": 1, "timestamp": "2026-05-05T07:37:00.612757"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90880, "epoch": 0, "train_loss": 3.698679730296135, "train_ppl": 40.39393824610145, "lr": 0.0001214696108753506, "grad_norm": 1.1227, "tokens_per_sec": 151523, "dt_s": 4.325, "eta_s": 791, "world_size": 1, "timestamp": "2026-05-05T07:37:04.937936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90890, "epoch": 0, "train_loss": 3.6672622859477997, "train_ppl": 39.14459232996204, "lr": 0.00012110709918722573, "grad_norm": 1.0668, "tokens_per_sec": 150282, "dt_s": 4.361, "eta_s": 784, "world_size": 1, "timestamp": "2026-05-05T07:37:09.298784"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90900, "epoch": 0, "train_loss": 3.5444335639476776, "train_ppl": 34.62006973166088, "lr": 0.0001207445874991009, "grad_norm": 1.0859, "tokens_per_sec": 150263, "dt_s": 4.361, "eta_s": 779, "world_size": 1, "timestamp": "2026-05-05T07:37:13.660203"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90910, "epoch": 0, "train_loss": 3.51791113615036, "train_ppl": 33.71393104793622, "lr": 0.00012038207581097597, "grad_norm": 1.1529, "tokens_per_sec": 151431, "dt_s": 4.328, "eta_s": 774, "world_size": 1, "timestamp": "2026-05-05T07:37:17.987953"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90920, "epoch": 0, "train_loss": 3.5572262704372406, "train_ppl": 35.065799086046916, "lr": 0.0001200195641228511, "grad_norm": 1.3139, "tokens_per_sec": 133798, "dt_s": 4.898, "eta_s": 787, "world_size": 1, "timestamp": "2026-05-05T07:37:22.886078"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90930, "epoch": 0, "train_loss": 3.5636378079652786, "train_ppl": 35.29134705482271, "lr": 0.00011965705243472628, "grad_norm": 1.2345, "tokens_per_sec": 150354, "dt_s": 4.359, "eta_s": 783, "world_size": 1, "timestamp": "2026-05-05T07:37:27.244872"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90940, "epoch": 0, "train_loss": 3.5912161469459534, "train_ppl": 36.27816869346137, "lr": 0.0001192945407466014, "grad_norm": 1.0711, "tokens_per_sec": 149309, "dt_s": 4.389, "eta_s": 780, "world_size": 1, "timestamp": "2026-05-05T07:37:31.634132"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90950, "epoch": 0, "train_loss": 3.681532844901085, "train_ppl": 39.707212446231814, "lr": 0.00011893202905847652, "grad_norm": 1.0583, "tokens_per_sec": 147094, "dt_s": 4.455, "eta_s": 779, "world_size": 1, "timestamp": "2026-05-05T07:37:36.089527"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90960, "epoch": 0, "train_loss": 3.6359453797340393, "train_ppl": 37.93770148334713, "lr": 0.0001185695173703517, "grad_norm": 1.0344, "tokens_per_sec": 150154, "dt_s": 4.365, "eta_s": 775, "world_size": 1, "timestamp": "2026-05-05T07:37:40.454104"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90970, "epoch": 0, "train_loss": 3.588058516383171, "train_ppl": 36.16379630713481, "lr": 0.00011820700568222683, "grad_norm": 1.176, "tokens_per_sec": 149225, "dt_s": 4.392, "eta_s": 754, "world_size": 1, "timestamp": "2026-05-05T07:37:44.845879"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90980, "epoch": 0, "train_loss": 3.5777525901794434, "train_ppl": 35.793008827187364, "lr": 0.0001178444939941019, "grad_norm": 1.0387, "tokens_per_sec": 148256, "dt_s": 4.42, "eta_s": 751, "world_size": 1, "timestamp": "2026-05-05T07:37:49.266330"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 90990, "epoch": 0, "train_loss": 3.5903583467006683, "train_ppl": 36.247062614765966, "lr": 0.00011748198230597707, "grad_norm": 1.1237, "tokens_per_sec": 149520, "dt_s": 4.383, "eta_s": 747, "world_size": 1, "timestamp": "2026-05-05T07:37:53.649427"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91000, "epoch": 0, "train_loss": 3.6128960996866226, "train_ppl": 37.0732653629726, "lr": 0.0001171194706178522, "grad_norm": 1.0933, "tokens_per_sec": 148459, "dt_s": 4.414, "eta_s": 741, "world_size": 1, "timestamp": "2026-05-05T07:37:58.063834"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91010, "epoch": 0, "train_loss": 3.7454509139060974, "train_ppl": 42.32808924039708, "lr": 0.00011675695892972738, "grad_norm": 1.25, "tokens_per_sec": 127470, "dt_s": 5.141, "eta_s": 737, "world_size": 1, "timestamp": "2026-05-05T07:38:03.205145"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91020, "epoch": 0, "train_loss": 3.5734689235687256, "train_ppl": 35.64001143923424, "lr": 0.0001163944472416025, "grad_norm": 1.0592, "tokens_per_sec": 149747, "dt_s": 4.376, "eta_s": 732, "world_size": 1, "timestamp": "2026-05-05T07:38:07.581589"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91030, "epoch": 0, "train_loss": 3.5125438570976257, "train_ppl": 33.53346371454337, "lr": 0.00011603193555347762, "grad_norm": 1.175, "tokens_per_sec": 144580, "dt_s": 4.533, "eta_s": 731, "world_size": 1, "timestamp": "2026-05-05T07:38:12.114447"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91040, "epoch": 0, "train_loss": 3.569445952773094, "train_ppl": 35.4969207319117, "lr": 0.0001156694238653528, "grad_norm": 1.091, "tokens_per_sec": 147894, "dt_s": 4.431, "eta_s": 728, "world_size": 1, "timestamp": "2026-05-05T07:38:16.545726"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91050, "epoch": 0, "train_loss": 3.6723297238349915, "train_ppl": 39.34345856580679, "lr": 0.00011530691217722787, "grad_norm": 1.0715, "tokens_per_sec": 146990, "dt_s": 4.459, "eta_s": 725, "world_size": 1, "timestamp": "2026-05-05T07:38:21.004257"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91060, "epoch": 0, "train_loss": 3.5898988246917725, "train_ppl": 36.23041011812404, "lr": 0.000114944400489103, "grad_norm": 1.1298, "tokens_per_sec": 147616, "dt_s": 4.44, "eta_s": 723, "world_size": 1, "timestamp": "2026-05-05T07:38:25.443879"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91070, "epoch": 0, "train_loss": 3.678834229707718, "train_ppl": 39.60020241382972, "lr": 0.00011458188880097818, "grad_norm": 1.0908, "tokens_per_sec": 147970, "dt_s": 4.429, "eta_s": 720, "world_size": 1, "timestamp": "2026-05-05T07:38:29.872896"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91080, "epoch": 0, "train_loss": 3.5076365023851395, "train_ppl": 33.36930623229323, "lr": 0.0001142193771128533, "grad_norm": 1.0256, "tokens_per_sec": 149495, "dt_s": 4.384, "eta_s": 711, "world_size": 1, "timestamp": "2026-05-05T07:38:34.256700"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91090, "epoch": 0, "train_loss": 3.5800571590662003, "train_ppl": 35.87559140376227, "lr": 0.00011385686542472842, "grad_norm": 1.0787, "tokens_per_sec": 148510, "dt_s": 4.413, "eta_s": 706, "world_size": 1, "timestamp": "2026-05-05T07:38:38.669615"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91100, "epoch": 0, "train_loss": 3.628240466117859, "train_ppl": 37.64651798300425, "lr": 0.0001134943537366036, "grad_norm": 1.0614, "tokens_per_sec": 150670, "dt_s": 4.35, "eta_s": 698, "world_size": 1, "timestamp": "2026-05-05T07:38:43.019270"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91110, "epoch": 0, "train_loss": 3.608889788389206, "train_ppl": 36.92503544697434, "lr": 0.00011313184204847867, "grad_norm": 1.149, "tokens_per_sec": 148787, "dt_s": 4.405, "eta_s": 693, "world_size": 1, "timestamp": "2026-05-05T07:38:47.423951"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91120, "epoch": 0, "train_loss": 3.5313400328159332, "train_ppl": 34.16972550285656, "lr": 0.0001127693303603538, "grad_norm": 1.0818, "tokens_per_sec": 150136, "dt_s": 4.365, "eta_s": 686, "world_size": 1, "timestamp": "2026-05-05T07:38:51.789080"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91130, "epoch": 0, "train_loss": 3.647003322839737, "train_ppl": 38.35954247627629, "lr": 0.00011240681867222897, "grad_norm": 1.2506, "tokens_per_sec": 149740, "dt_s": 4.377, "eta_s": 682, "world_size": 1, "timestamp": "2026-05-05T07:38:56.165745"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91140, "epoch": 0, "train_loss": 3.532741367816925, "train_ppl": 34.2176423010799, "lr": 0.0001120443069841041, "grad_norm": 1.0977, "tokens_per_sec": 148885, "dt_s": 4.402, "eta_s": 677, "world_size": 1, "timestamp": "2026-05-05T07:39:00.567542"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91150, "epoch": 0, "train_loss": 3.631213054060936, "train_ppl": 37.75859206095932, "lr": 0.00011168179529597922, "grad_norm": 1.256, "tokens_per_sec": 152282, "dt_s": 4.304, "eta_s": 671, "world_size": 1, "timestamp": "2026-05-05T07:39:04.871120"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91160, "epoch": 0, "train_loss": 3.499547600746155, "train_ppl": 33.100473941212876, "lr": 0.0001113192836078544, "grad_norm": 1.216, "tokens_per_sec": 150713, "dt_s": 4.348, "eta_s": 665, "world_size": 1, "timestamp": "2026-05-05T07:39:09.219504"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91170, "epoch": 0, "train_loss": 3.5636038035154343, "train_ppl": 35.290147012385354, "lr": 0.00011095677191972952, "grad_norm": 1.0483, "tokens_per_sec": 148350, "dt_s": 4.418, "eta_s": 662, "world_size": 1, "timestamp": "2026-05-05T07:39:13.637207"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91180, "epoch": 0, "train_loss": 3.5777387470006943, "train_ppl": 35.79251334159776, "lr": 0.0001105942602316046, "grad_norm": 1.2739, "tokens_per_sec": 151080, "dt_s": 4.338, "eta_s": 657, "world_size": 1, "timestamp": "2026-05-05T07:39:17.975007"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91190, "epoch": 0, "train_loss": 3.6707920879125595, "train_ppl": 39.28300913711998, "lr": 0.00011023174854347977, "grad_norm": 1.1466, "tokens_per_sec": 150472, "dt_s": 4.355, "eta_s": 651, "world_size": 1, "timestamp": "2026-05-05T07:39:22.330375"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91200, "epoch": 0, "train_loss": 3.5590496361255646, "train_ppl": 35.12979518735128, "lr": 0.0001098692368553549, "grad_norm": 1.0994, "tokens_per_sec": 150024, "dt_s": 4.368, "eta_s": 649, "world_size": 1, "timestamp": "2026-05-05T07:39:26.698745"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91210, "epoch": 0, "train_loss": 3.572874590754509, "train_ppl": 35.61883570427726, "lr": 0.00010950672516723008, "grad_norm": 1.0826, "tokens_per_sec": 133935, "dt_s": 4.893, "eta_s": 660, "world_size": 1, "timestamp": "2026-05-05T07:39:31.591844"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91220, "epoch": 0, "train_loss": 3.5426340252161026, "train_ppl": 34.55782559745139, "lr": 0.0001091442134791052, "grad_norm": 1.091, "tokens_per_sec": 149010, "dt_s": 4.398, "eta_s": 655, "world_size": 1, "timestamp": "2026-05-05T07:39:35.989942"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91230, "epoch": 0, "train_loss": 3.6008739173412323, "train_ppl": 36.63023225510147, "lr": 0.00010878170179098032, "grad_norm": 1.1377, "tokens_per_sec": 150996, "dt_s": 4.34, "eta_s": 651, "world_size": 1, "timestamp": "2026-05-05T07:39:40.330165"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91240, "epoch": 0, "train_loss": 3.661089688539505, "train_ppl": 38.90371271206149, "lr": 0.0001084191901028555, "grad_norm": 1.1811, "tokens_per_sec": 151785, "dt_s": 4.318, "eta_s": 645, "world_size": 1, "timestamp": "2026-05-05T07:39:44.647895"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91250, "epoch": 0, "train_loss": 3.5739991068840027, "train_ppl": 35.65891218864386, "lr": 0.00010805667841473057, "grad_norm": 1.1744, "tokens_per_sec": 148215, "dt_s": 4.422, "eta_s": 642, "world_size": 1, "timestamp": "2026-05-05T07:39:49.069555"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91260, "epoch": 0, "train_loss": 3.537141874432564, "train_ppl": 34.368549051956265, "lr": 0.0001076941667266057, "grad_norm": 1.2442, "tokens_per_sec": 151308, "dt_s": 4.331, "eta_s": 622, "world_size": 1, "timestamp": "2026-05-05T07:39:53.400837"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91270, "epoch": 0, "train_loss": 3.696974352002144, "train_ppl": 40.325110006381934, "lr": 0.00010733165503848087, "grad_norm": 1.1218, "tokens_per_sec": 150949, "dt_s": 4.342, "eta_s": 616, "world_size": 1, "timestamp": "2026-05-05T07:39:57.742463"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91280, "epoch": 0, "train_loss": 3.543546512722969, "train_ppl": 34.589373572951295, "lr": 0.000106969143350356, "grad_norm": 1.2385, "tokens_per_sec": 148481, "dt_s": 4.414, "eta_s": 614, "world_size": 1, "timestamp": "2026-05-05T07:40:02.156214"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91290, "epoch": 0, "train_loss": 3.622788891196251, "train_ppl": 37.4418435755201, "lr": 0.00010660663166223112, "grad_norm": 1.0779, "tokens_per_sec": 149276, "dt_s": 4.39, "eta_s": 611, "world_size": 1, "timestamp": "2026-05-05T07:40:06.546460"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91300, "epoch": 0, "train_loss": 3.5663156509399414, "train_ppl": 35.38597838797632, "lr": 0.0001062441199741063, "grad_norm": 1.1406, "tokens_per_sec": 149402, "dt_s": 4.387, "eta_s": 606, "world_size": 1, "timestamp": "2026-05-05T07:40:10.933023"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91310, "epoch": 0, "train_loss": 3.4919531643390656, "train_ppl": 32.85004662760054, "lr": 0.00010588160828598137, "grad_norm": 1.1764, "tokens_per_sec": 149803, "dt_s": 4.375, "eta_s": 603, "world_size": 1, "timestamp": "2026-05-05T07:40:15.307837"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91320, "epoch": 0, "train_loss": 3.567485421895981, "train_ppl": 35.42739609763966, "lr": 0.0001055190965978565, "grad_norm": 1.042, "tokens_per_sec": 151621, "dt_s": 4.322, "eta_s": 598, "world_size": 1, "timestamp": "2026-05-05T07:40:19.630201"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91330, "epoch": 0, "train_loss": 3.6859322041273117, "train_ppl": 39.8822835556394, "lr": 0.00010515658490973167, "grad_norm": 1.0693, "tokens_per_sec": 148911, "dt_s": 4.401, "eta_s": 593, "world_size": 1, "timestamp": "2026-05-05T07:40:24.031239"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91340, "epoch": 0, "train_loss": 3.5528946816921234, "train_ppl": 34.914236954773926, "lr": 0.0001047940732216068, "grad_norm": 1.0918, "tokens_per_sec": 150249, "dt_s": 4.362, "eta_s": 588, "world_size": 1, "timestamp": "2026-05-05T07:40:28.393040"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91350, "epoch": 0, "train_loss": 3.579228848218918, "train_ppl": 35.84588756595371, "lr": 0.00010443156153348192, "grad_norm": 1.0202, "tokens_per_sec": 151112, "dt_s": 4.337, "eta_s": 582, "world_size": 1, "timestamp": "2026-05-05T07:40:32.729971"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91360, "epoch": 0, "train_loss": 3.591759204864502, "train_ppl": 36.29787519064116, "lr": 0.0001040690498453571, "grad_norm": 1.2158, "tokens_per_sec": 146690, "dt_s": 4.468, "eta_s": 580, "world_size": 1, "timestamp": "2026-05-05T07:40:37.197620"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91370, "epoch": 0, "train_loss": 3.5357405096292496, "train_ppl": 34.320419908094415, "lr": 0.00010370653815723222, "grad_norm": 1.1434, "tokens_per_sec": 150613, "dt_s": 4.351, "eta_s": 577, "world_size": 1, "timestamp": "2026-05-05T07:40:41.548909"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91380, "epoch": 0, "train_loss": 3.3950876593589783, "train_ppl": 29.817267122903523, "lr": 0.0001033440264691073, "grad_norm": 1.0632, "tokens_per_sec": 151008, "dt_s": 4.34, "eta_s": 571, "world_size": 1, "timestamp": "2026-05-05T07:40:45.888788"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91390, "epoch": 0, "train_loss": 3.411737948656082, "train_ppl": 30.317889434040886, "lr": 0.00010298151478098247, "grad_norm": 1.0581, "tokens_per_sec": 147498, "dt_s": 4.443, "eta_s": 569, "world_size": 1, "timestamp": "2026-05-05T07:40:50.331979"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91400, "epoch": 0, "train_loss": 3.687270849943161, "train_ppl": 39.935707557578134, "lr": 0.0001026190030928576, "grad_norm": 1.1037, "tokens_per_sec": 149235, "dt_s": 4.391, "eta_s": 566, "world_size": 1, "timestamp": "2026-05-05T07:40:54.723444"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91410, "epoch": 0, "train_loss": 3.5524054020643234, "train_ppl": 34.89715830836936, "lr": 0.00010225649140473272, "grad_norm": 1.0681, "tokens_per_sec": 148102, "dt_s": 4.425, "eta_s": 560, "world_size": 1, "timestamp": "2026-05-05T07:40:59.148503"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91420, "epoch": 0, "train_loss": 3.5526604503393173, "train_ppl": 34.90605990351809, "lr": 0.0001018939797166079, "grad_norm": 1.1855, "tokens_per_sec": 148638, "dt_s": 4.409, "eta_s": 557, "world_size": 1, "timestamp": "2026-05-05T07:41:03.557590"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91430, "epoch": 0, "train_loss": 3.601450726389885, "train_ppl": 36.651366999290325, "lr": 0.00010153146802848302, "grad_norm": 1.1251, "tokens_per_sec": 150056, "dt_s": 4.367, "eta_s": 553, "world_size": 1, "timestamp": "2026-05-05T07:41:07.925022"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91440, "epoch": 0, "train_loss": 3.5369561314582825, "train_ppl": 34.362165928262165, "lr": 0.0001011689563403582, "grad_norm": 1.1357, "tokens_per_sec": 146111, "dt_s": 4.485, "eta_s": 550, "world_size": 1, "timestamp": "2026-05-05T07:41:12.410381"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91450, "epoch": 0, "train_loss": 3.5494435876607895, "train_ppl": 34.79395231619284, "lr": 0.00010080644465223327, "grad_norm": 1.1265, "tokens_per_sec": 151571, "dt_s": 4.324, "eta_s": 544, "world_size": 1, "timestamp": "2026-05-05T07:41:16.734152"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91460, "epoch": 0, "train_loss": 3.523434817790985, "train_ppl": 33.90067134206269, "lr": 0.0001004439329641084, "grad_norm": 1.1644, "tokens_per_sec": 150508, "dt_s": 4.354, "eta_s": 538, "world_size": 1, "timestamp": "2026-05-05T07:41:21.088457"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91470, "epoch": 0, "train_loss": 3.4467390179634094, "train_ppl": 31.39783740102024, "lr": 0.00010008142127598357, "grad_norm": 1.2984, "tokens_per_sec": 149200, "dt_s": 4.392, "eta_s": 533, "world_size": 1, "timestamp": "2026-05-05T07:41:25.480941"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91480, "epoch": 0, "train_loss": 3.753943234682083, "train_ppl": 42.689083622547905, "lr": 9.97189095878587e-05, "grad_norm": 1.1511, "tokens_per_sec": 151862, "dt_s": 4.316, "eta_s": 527, "world_size": 1, "timestamp": "2026-05-05T07:41:29.796456"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91490, "epoch": 0, "train_loss": 3.515737012028694, "train_ppl": 33.64071239926476, "lr": 9.935639789973382e-05, "grad_norm": 1.0775, "tokens_per_sec": 151334, "dt_s": 4.331, "eta_s": 519, "world_size": 1, "timestamp": "2026-05-05T07:41:34.127045"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91500, "epoch": 0, "train_loss": 3.5991025120019913, "train_ppl": 36.565402702766974, "lr": 9.8993886211609e-05, "grad_norm": 1.1067, "tokens_per_sec": 148283, "dt_s": 4.42, "eta_s": 517, "world_size": 1, "timestamp": "2026-05-05T07:41:38.546679"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91510, "epoch": 0, "train_loss": 3.509243369102478, "train_ppl": 33.42296936306136, "lr": 9.863137452348407e-05, "grad_norm": 1.1509, "tokens_per_sec": 115245, "dt_s": 5.687, "eta_s": 526, "world_size": 1, "timestamp": "2026-05-05T07:41:44.233356"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91520, "epoch": 0, "train_loss": 3.5877091735601425, "train_ppl": 36.151164950906725, "lr": 9.826886283535919e-05, "grad_norm": 1.1349, "tokens_per_sec": 147743, "dt_s": 4.436, "eta_s": 523, "world_size": 1, "timestamp": "2026-05-05T07:41:48.669154"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91530, "epoch": 0, "train_loss": 3.6089030355215073, "train_ppl": 36.925524601044074, "lr": 9.790635114723437e-05, "grad_norm": 1.1215, "tokens_per_sec": 148602, "dt_s": 4.41, "eta_s": 520, "world_size": 1, "timestamp": "2026-05-05T07:41:53.079332"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91540, "epoch": 0, "train_loss": 3.589962884783745, "train_ppl": 36.23273111586927, "lr": 9.75438394591095e-05, "grad_norm": 1.1414, "tokens_per_sec": 150304, "dt_s": 4.36, "eta_s": 516, "world_size": 1, "timestamp": "2026-05-05T07:41:57.439549"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91550, "epoch": 0, "train_loss": 3.565370336174965, "train_ppl": 35.352543305961234, "lr": 9.718132777098462e-05, "grad_norm": 1.291, "tokens_per_sec": 145001, "dt_s": 4.52, "eta_s": 514, "world_size": 1, "timestamp": "2026-05-05T07:42:01.959257"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91560, "epoch": 0, "train_loss": 3.6423061788082123, "train_ppl": 38.17978468490225, "lr": 9.68188160828598e-05, "grad_norm": 1.155, "tokens_per_sec": 150581, "dt_s": 4.352, "eta_s": 497, "world_size": 1, "timestamp": "2026-05-05T07:42:06.311448"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91570, "epoch": 0, "train_loss": 3.5420253425836563, "train_ppl": 34.536797249639406, "lr": 9.645630439473492e-05, "grad_norm": 1.0998, "tokens_per_sec": 149180, "dt_s": 4.393, "eta_s": 492, "world_size": 1, "timestamp": "2026-05-05T07:42:10.704541"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91580, "epoch": 0, "train_loss": 3.5108030289411545, "train_ppl": 33.47513849854367, "lr": 9.609379270660999e-05, "grad_norm": 1.0804, "tokens_per_sec": 149429, "dt_s": 4.386, "eta_s": 487, "world_size": 1, "timestamp": "2026-05-05T07:42:15.090322"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91590, "epoch": 0, "train_loss": 3.5955651253461838, "train_ppl": 36.43628523901696, "lr": 9.573128101848517e-05, "grad_norm": 1.2431, "tokens_per_sec": 150594, "dt_s": 4.352, "eta_s": 482, "world_size": 1, "timestamp": "2026-05-05T07:42:19.442142"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91600, "epoch": 0, "train_loss": 3.5669140815734863, "train_ppl": 35.40716077890531, "lr": 9.53687693303603e-05, "grad_norm": 1.1253, "tokens_per_sec": 150583, "dt_s": 4.352, "eta_s": 474, "world_size": 1, "timestamp": "2026-05-05T07:42:23.794306"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91610, "epoch": 0, "train_loss": 3.611881762742996, "train_ppl": 37.035679645807406, "lr": 9.500625764223542e-05, "grad_norm": 1.1435, "tokens_per_sec": 149479, "dt_s": 4.384, "eta_s": 470, "world_size": 1, "timestamp": "2026-05-05T07:42:28.178599"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91620, "epoch": 0, "train_loss": 3.494889035820961, "train_ppl": 32.946631854204554, "lr": 9.46437459541106e-05, "grad_norm": 1.1627, "tokens_per_sec": 150940, "dt_s": 4.342, "eta_s": 465, "world_size": 1, "timestamp": "2026-05-05T07:42:32.520463"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91630, "epoch": 0, "train_loss": 3.5721342265605927, "train_ppl": 35.59247455332309, "lr": 9.428123426598572e-05, "grad_norm": 1.1445, "tokens_per_sec": 147685, "dt_s": 4.438, "eta_s": 462, "world_size": 1, "timestamp": "2026-05-05T07:42:36.958011"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91640, "epoch": 0, "train_loss": 3.727396696805954, "train_ppl": 41.57074591970853, "lr": 9.39187225778609e-05, "grad_norm": 1.1334, "tokens_per_sec": 149600, "dt_s": 4.381, "eta_s": 458, "world_size": 1, "timestamp": "2026-05-05T07:42:41.338753"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91650, "epoch": 0, "train_loss": 3.6290923207998276, "train_ppl": 37.67860100870562, "lr": 9.355621088973597e-05, "grad_norm": 1.0804, "tokens_per_sec": 150548, "dt_s": 4.353, "eta_s": 454, "world_size": 1, "timestamp": "2026-05-05T07:42:45.691914"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91660, "epoch": 0, "train_loss": 3.539849430322647, "train_ppl": 34.46172990863955, "lr": 9.319369920161109e-05, "grad_norm": 1.1981, "tokens_per_sec": 147168, "dt_s": 4.453, "eta_s": 451, "world_size": 1, "timestamp": "2026-05-05T07:42:50.145039"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91670, "epoch": 0, "train_loss": 3.6243732273578644, "train_ppl": 37.501211058861315, "lr": 9.283118751348627e-05, "grad_norm": 1.1871, "tokens_per_sec": 150167, "dt_s": 4.364, "eta_s": 447, "world_size": 1, "timestamp": "2026-05-05T07:42:54.509276"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91680, "epoch": 0, "train_loss": 3.574697881937027, "train_ppl": 35.68383845481149, "lr": 9.24686758253614e-05, "grad_norm": 1.0598, "tokens_per_sec": 151338, "dt_s": 4.33, "eta_s": 440, "world_size": 1, "timestamp": "2026-05-05T07:42:58.839687"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91690, "epoch": 0, "train_loss": 3.4865881204605103, "train_ppl": 32.67427661455615, "lr": 9.210616413723652e-05, "grad_norm": 1.0602, "tokens_per_sec": 148305, "dt_s": 4.419, "eta_s": 437, "world_size": 1, "timestamp": "2026-05-05T07:43:03.258707"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91700, "epoch": 0, "train_loss": 3.4993231147527695, "train_ppl": 33.093044182408256, "lr": 9.17436524491117e-05, "grad_norm": 1.0625, "tokens_per_sec": 150718, "dt_s": 4.348, "eta_s": 432, "world_size": 1, "timestamp": "2026-05-05T07:43:07.606935"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91710, "epoch": 0, "train_loss": 3.4959630519151688, "train_ppl": 32.98203607601466, "lr": 9.138114076098677e-05, "grad_norm": 1.0742, "tokens_per_sec": 149926, "dt_s": 4.371, "eta_s": 426, "world_size": 1, "timestamp": "2026-05-05T07:43:11.978158"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91720, "epoch": 0, "train_loss": 3.588925376534462, "train_ppl": 36.19515885257937, "lr": 9.101862907286189e-05, "grad_norm": 1.1285, "tokens_per_sec": 150107, "dt_s": 4.366, "eta_s": 422, "world_size": 1, "timestamp": "2026-05-05T07:43:16.344113"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91730, "epoch": 0, "train_loss": 3.6741385757923126, "train_ppl": 39.41468946149191, "lr": 9.065611738473707e-05, "grad_norm": 1.1433, "tokens_per_sec": 150999, "dt_s": 4.34, "eta_s": 418, "world_size": 1, "timestamp": "2026-05-05T07:43:20.684324"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91740, "epoch": 0, "train_loss": 3.569750040769577, "train_ppl": 35.507716560776174, "lr": 9.02936056966122e-05, "grad_norm": 1.1156, "tokens_per_sec": 148465, "dt_s": 4.414, "eta_s": 413, "world_size": 1, "timestamp": "2026-05-05T07:43:25.098534"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91750, "epoch": 0, "train_loss": 3.5548543632030487, "train_ppl": 34.98272482469034, "lr": 8.993109400848732e-05, "grad_norm": 1.1806, "tokens_per_sec": 150655, "dt_s": 4.35, "eta_s": 409, "world_size": 1, "timestamp": "2026-05-05T07:43:29.448593"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91760, "epoch": 0, "train_loss": 3.5157707184553146, "train_ppl": 33.641846326578914, "lr": 8.95685823203625e-05, "grad_norm": 1.0444, "tokens_per_sec": 150340, "dt_s": 4.359, "eta_s": 404, "world_size": 1, "timestamp": "2026-05-05T07:43:33.807816"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91770, "epoch": 0, "train_loss": 3.5658835768699646, "train_ppl": 35.37069232686725, "lr": 8.920607063223762e-05, "grad_norm": 1.0861, "tokens_per_sec": 148148, "dt_s": 4.424, "eta_s": 401, "world_size": 1, "timestamp": "2026-05-05T07:43:38.231482"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91780, "epoch": 0, "train_loss": 3.534416362643242, "train_ppl": 34.27500470235428, "lr": 8.884355894411269e-05, "grad_norm": 1.1145, "tokens_per_sec": 150016, "dt_s": 4.369, "eta_s": 397, "world_size": 1, "timestamp": "2026-05-05T07:43:42.600147"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91790, "epoch": 0, "train_loss": 3.5210666209459305, "train_ppl": 33.82048286766834, "lr": 8.848104725598787e-05, "grad_norm": 1.0609, "tokens_per_sec": 149973, "dt_s": 4.37, "eta_s": 392, "world_size": 1, "timestamp": "2026-05-05T07:43:46.969935"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91800, "epoch": 0, "train_loss": 3.589224860072136, "train_ppl": 36.206000330140206, "lr": 8.811853556786299e-05, "grad_norm": 1.1248, "tokens_per_sec": 133310, "dt_s": 4.916, "eta_s": 398, "world_size": 1, "timestamp": "2026-05-05T07:43:51.885981"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91810, "epoch": 0, "train_loss": 3.5223558396101, "train_ppl": 33.864112983749465, "lr": 8.775602387973812e-05, "grad_norm": 1.1475, "tokens_per_sec": 150217, "dt_s": 4.363, "eta_s": 393, "world_size": 1, "timestamp": "2026-05-05T07:43:56.248763"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91820, "epoch": 0, "train_loss": 3.5706221163272858, "train_ppl": 35.53869547851403, "lr": 8.73935121916133e-05, "grad_norm": 1.1849, "tokens_per_sec": 150314, "dt_s": 4.36, "eta_s": 387, "world_size": 1, "timestamp": "2026-05-05T07:44:00.608708"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91830, "epoch": 0, "train_loss": 3.611796408891678, "train_ppl": 37.03251864281725, "lr": 8.703100050348842e-05, "grad_norm": 1.0614, "tokens_per_sec": 151146, "dt_s": 4.336, "eta_s": 382, "world_size": 1, "timestamp": "2026-05-05T07:44:04.944642"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91840, "epoch": 0, "train_loss": 3.624358206987381, "train_ppl": 37.50064778100796, "lr": 8.666848881536349e-05, "grad_norm": 1.1809, "tokens_per_sec": 152593, "dt_s": 4.295, "eta_s": 377, "world_size": 1, "timestamp": "2026-05-05T07:44:09.239475"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91850, "epoch": 0, "train_loss": 3.5830732583999634, "train_ppl": 35.98395909280045, "lr": 8.630597712723867e-05, "grad_norm": 1.1153, "tokens_per_sec": 148280, "dt_s": 4.42, "eta_s": 364, "world_size": 1, "timestamp": "2026-05-05T07:44:13.659231"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91860, "epoch": 0, "train_loss": 3.588227406144142, "train_ppl": 36.16990451784153, "lr": 8.594346543911379e-05, "grad_norm": 1.2047, "tokens_per_sec": 150635, "dt_s": 4.351, "eta_s": 359, "world_size": 1, "timestamp": "2026-05-05T07:44:18.009854"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91870, "epoch": 0, "train_loss": 3.591407299041748, "train_ppl": 36.2851040042672, "lr": 8.558095375098897e-05, "grad_norm": 1.1416, "tokens_per_sec": 151598, "dt_s": 4.323, "eta_s": 354, "world_size": 1, "timestamp": "2026-05-05T07:44:22.332861"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91880, "epoch": 0, "train_loss": 3.6406984478235245, "train_ppl": 38.118451179175636, "lr": 8.52184420628641e-05, "grad_norm": 1.0897, "tokens_per_sec": 149487, "dt_s": 4.384, "eta_s": 351, "world_size": 1, "timestamp": "2026-05-05T07:44:26.716924"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91890, "epoch": 0, "train_loss": 3.5884869992733, "train_ppl": 36.17929519536161, "lr": 8.485593037473922e-05, "grad_norm": 1.1515, "tokens_per_sec": 150128, "dt_s": 4.365, "eta_s": 348, "world_size": 1, "timestamp": "2026-05-05T07:44:31.082280"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91900, "epoch": 0, "train_loss": 3.6238666474819183, "train_ppl": 37.48221851104271, "lr": 8.44934186866144e-05, "grad_norm": 1.101, "tokens_per_sec": 151657, "dt_s": 4.321, "eta_s": 342, "world_size": 1, "timestamp": "2026-05-05T07:44:35.403573"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91910, "epoch": 0, "train_loss": 3.5586469173431396, "train_ppl": 35.11565060734178, "lr": 8.413090699848947e-05, "grad_norm": 1.0982, "tokens_per_sec": 150075, "dt_s": 4.367, "eta_s": 338, "world_size": 1, "timestamp": "2026-05-05T07:44:39.770476"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91920, "epoch": 0, "train_loss": 3.5321299880743027, "train_ppl": 34.19672872145761, "lr": 8.376839531036459e-05, "grad_norm": 1.1524, "tokens_per_sec": 150892, "dt_s": 4.343, "eta_s": 334, "world_size": 1, "timestamp": "2026-05-05T07:44:44.113708"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91930, "epoch": 0, "train_loss": 3.533224895596504, "train_ppl": 34.23419148235618, "lr": 8.340588362223977e-05, "grad_norm": 1.2143, "tokens_per_sec": 150166, "dt_s": 4.364, "eta_s": 329, "world_size": 1, "timestamp": "2026-05-05T07:44:48.477973"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91940, "epoch": 0, "train_loss": 3.562728375196457, "train_ppl": 35.25926653710329, "lr": 8.304337193411489e-05, "grad_norm": 1.0774, "tokens_per_sec": 149265, "dt_s": 4.391, "eta_s": 325, "world_size": 1, "timestamp": "2026-05-05T07:44:52.868536"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91950, "epoch": 0, "train_loss": 3.624935269355774, "train_ppl": 37.52229423871013, "lr": 8.268086024599002e-05, "grad_norm": 1.0672, "tokens_per_sec": 151665, "dt_s": 4.321, "eta_s": 321, "world_size": 1, "timestamp": "2026-05-05T07:44:57.189647"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91960, "epoch": 0, "train_loss": 3.5793312191963196, "train_ppl": 35.849557332335245, "lr": 8.23183485578652e-05, "grad_norm": 1.1928, "tokens_per_sec": 147140, "dt_s": 4.454, "eta_s": 318, "world_size": 1, "timestamp": "2026-05-05T07:45:01.643680"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91970, "epoch": 0, "train_loss": 3.5173837542533875, "train_ppl": 33.696155618657556, "lr": 8.195583686974032e-05, "grad_norm": 1.1339, "tokens_per_sec": 150453, "dt_s": 4.356, "eta_s": 313, "world_size": 1, "timestamp": "2026-05-05T07:45:05.999529"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91980, "epoch": 0, "train_loss": 3.584598168730736, "train_ppl": 36.038873262714425, "lr": 8.159332518161539e-05, "grad_norm": 1.1545, "tokens_per_sec": 150174, "dt_s": 4.364, "eta_s": 309, "world_size": 1, "timestamp": "2026-05-05T07:45:10.363543"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 91990, "epoch": 0, "train_loss": 3.5361168384552, "train_ppl": 34.33333810201648, "lr": 8.123081349349057e-05, "grad_norm": 1.2189, "tokens_per_sec": 149517, "dt_s": 4.383, "eta_s": 304, "world_size": 1, "timestamp": "2026-05-05T07:45:14.746755"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92000, "epoch": 0, "train_loss": 3.514901578426361, "train_ppl": 33.61261955420731, "lr": 8.086830180536569e-05, "grad_norm": 1.1201, "tokens_per_sec": 150885, "dt_s": 4.343, "eta_s": 300, "world_size": 1, "timestamp": "2026-05-05T07:45:19.090212"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92010, "epoch": 0, "train_loss": 3.48607237637043, "train_ppl": 32.65742939429512, "lr": 8.050579011724082e-05, "grad_norm": 1.2209, "tokens_per_sec": 127603, "dt_s": 5.136, "eta_s": 295, "world_size": 1, "timestamp": "2026-05-05T07:45:24.226099"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92020, "epoch": 0, "train_loss": 3.5899788588285446, "train_ppl": 36.23330990376212, "lr": 8.0143278429116e-05, "grad_norm": 1.2647, "tokens_per_sec": 147335, "dt_s": 4.448, "eta_s": 292, "world_size": 1, "timestamp": "2026-05-05T07:45:28.674231"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92030, "epoch": 0, "train_loss": 3.6594563871622086, "train_ppl": 38.84022308746672, "lr": 7.978076674099112e-05, "grad_norm": 1.1497, "tokens_per_sec": 149357, "dt_s": 4.388, "eta_s": 288, "world_size": 1, "timestamp": "2026-05-05T07:45:33.062103"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92040, "epoch": 0, "train_loss": 3.618513509631157, "train_ppl": 37.28210711820362, "lr": 7.941825505286619e-05, "grad_norm": 1.1625, "tokens_per_sec": 147189, "dt_s": 4.453, "eta_s": 284, "world_size": 1, "timestamp": "2026-05-05T07:45:37.514607"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92050, "epoch": 0, "train_loss": 3.615387439727783, "train_ppl": 37.165742621744656, "lr": 7.905574336474137e-05, "grad_norm": 1.1193, "tokens_per_sec": 150128, "dt_s": 4.365, "eta_s": 280, "world_size": 1, "timestamp": "2026-05-05T07:45:41.879945"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92060, "epoch": 0, "train_loss": 3.5592833161354065, "train_ppl": 35.13800527746601, "lr": 7.869323167661649e-05, "grad_norm": 1.0965, "tokens_per_sec": 150762, "dt_s": 4.347, "eta_s": 275, "world_size": 1, "timestamp": "2026-05-05T07:45:46.226950"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92070, "epoch": 0, "train_loss": 3.5926752537488937, "train_ppl": 36.331141052967126, "lr": 7.833071998849167e-05, "grad_norm": 1.1416, "tokens_per_sec": 147391, "dt_s": 4.446, "eta_s": 271, "world_size": 1, "timestamp": "2026-05-05T07:45:50.673367"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92080, "epoch": 0, "train_loss": 3.571214586496353, "train_ppl": 35.559757334077084, "lr": 7.796820830036679e-05, "grad_norm": 1.177, "tokens_per_sec": 149158, "dt_s": 4.394, "eta_s": 267, "world_size": 1, "timestamp": "2026-05-05T07:45:55.067079"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92090, "epoch": 0, "train_loss": 3.4790532141923904, "train_ppl": 32.42900421559512, "lr": 7.760569661224192e-05, "grad_norm": 1.1902, "tokens_per_sec": 149548, "dt_s": 4.382, "eta_s": 261, "world_size": 1, "timestamp": "2026-05-05T07:45:59.449372"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92100, "epoch": 0, "train_loss": 3.5210740119218826, "train_ppl": 33.82073283496766, "lr": 7.72431849241171e-05, "grad_norm": 1.0958, "tokens_per_sec": 131274, "dt_s": 4.992, "eta_s": 264, "world_size": 1, "timestamp": "2026-05-05T07:46:04.441662"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92110, "epoch": 0, "train_loss": 3.538332626223564, "train_ppl": 34.40949783837594, "lr": 7.688067323599216e-05, "grad_norm": 1.1342, "tokens_per_sec": 148545, "dt_s": 4.412, "eta_s": 261, "world_size": 1, "timestamp": "2026-05-05T07:46:08.853511"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92120, "epoch": 0, "train_loss": 3.528283655643463, "train_ppl": 34.065449368612335, "lr": 7.651816154786729e-05, "grad_norm": 1.162, "tokens_per_sec": 147548, "dt_s": 4.442, "eta_s": 256, "world_size": 1, "timestamp": "2026-05-05T07:46:13.295193"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92130, "epoch": 0, "train_loss": 3.6675285398960114, "train_ppl": 39.155016119847055, "lr": 7.615564985974247e-05, "grad_norm": 1.2002, "tokens_per_sec": 147500, "dt_s": 4.443, "eta_s": 252, "world_size": 1, "timestamp": "2026-05-05T07:46:17.738279"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92140, "epoch": 0, "train_loss": 3.6574714481830597, "train_ppl": 38.76320407900538, "lr": 7.579313817161759e-05, "grad_norm": 1.1667, "tokens_per_sec": 149087, "dt_s": 4.396, "eta_s": 248, "world_size": 1, "timestamp": "2026-05-05T07:46:22.134110"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92150, "epoch": 0, "train_loss": 3.5692098289728165, "train_ppl": 35.48854005356807, "lr": 7.543062648349272e-05, "grad_norm": 1.1504, "tokens_per_sec": 145496, "dt_s": 4.504, "eta_s": 238, "world_size": 1, "timestamp": "2026-05-05T07:46:26.638410"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92160, "epoch": 0, "train_loss": 3.5878549218177795, "train_ppl": 36.156434304200076, "lr": 7.50681147953679e-05, "grad_norm": 1.1567, "tokens_per_sec": 149205, "dt_s": 4.392, "eta_s": 233, "world_size": 1, "timestamp": "2026-05-05T07:46:31.030763"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92170, "epoch": 0, "train_loss": 3.672042265534401, "train_ppl": 39.3321505874321, "lr": 7.470560310724302e-05, "grad_norm": 1.3099, "tokens_per_sec": 150464, "dt_s": 4.356, "eta_s": 228, "world_size": 1, "timestamp": "2026-05-05T07:46:35.386374"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92180, "epoch": 0, "train_loss": 3.5589792132377625, "train_ppl": 35.127321332835294, "lr": 7.434309141911809e-05, "grad_norm": 1.0845, "tokens_per_sec": 147326, "dt_s": 4.448, "eta_s": 224, "world_size": 1, "timestamp": "2026-05-05T07:46:39.834724"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92190, "epoch": 0, "train_loss": 3.572799652814865, "train_ppl": 35.616166602126825, "lr": 7.398057973099327e-05, "grad_norm": 1.1337, "tokens_per_sec": 148550, "dt_s": 4.412, "eta_s": 219, "world_size": 1, "timestamp": "2026-05-05T07:46:44.246428"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92200, "epoch": 0, "train_loss": 3.541797637939453, "train_ppl": 34.528933955797996, "lr": 7.361806804286839e-05, "grad_norm": 1.0893, "tokens_per_sec": 149111, "dt_s": 4.395, "eta_s": 214, "world_size": 1, "timestamp": "2026-05-05T07:46:48.641547"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92210, "epoch": 0, "train_loss": 3.61541984975338, "train_ppl": 37.166947183934184, "lr": 7.325555635474351e-05, "grad_norm": 1.1672, "tokens_per_sec": 146792, "dt_s": 4.465, "eta_s": 210, "world_size": 1, "timestamp": "2026-05-05T07:46:53.106121"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92220, "epoch": 0, "train_loss": 3.526055544614792, "train_ppl": 33.98963226101032, "lr": 7.289304466661869e-05, "grad_norm": 1.0725, "tokens_per_sec": 149788, "dt_s": 4.375, "eta_s": 206, "world_size": 1, "timestamp": "2026-05-05T07:46:57.481377"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92230, "epoch": 0, "train_loss": 3.450054258108139, "train_ppl": 31.50210150680875, "lr": 7.253053297849382e-05, "grad_norm": 1.1663, "tokens_per_sec": 149226, "dt_s": 4.392, "eta_s": 201, "world_size": 1, "timestamp": "2026-05-05T07:47:01.873107"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92240, "epoch": 0, "train_loss": 3.63226380944252, "train_ppl": 37.79828795645402, "lr": 7.216802129036889e-05, "grad_norm": 1.1823, "tokens_per_sec": 149562, "dt_s": 4.382, "eta_s": 196, "world_size": 1, "timestamp": "2026-05-05T07:47:06.255001"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92250, "epoch": 0, "train_loss": 3.252645805478096, "train_ppl": 25.858666489378628, "lr": 7.180550960224406e-05, "grad_norm": 1.344, "tokens_per_sec": 149645, "dt_s": 4.379, "eta_s": 192, "world_size": 1, "timestamp": "2026-05-05T07:47:10.634445"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92260, "epoch": 0, "train_loss": 3.4815097600221634, "train_ppl": 32.508765479153595, "lr": 7.144299791411919e-05, "grad_norm": 1.1136, "tokens_per_sec": 146907, "dt_s": 4.461, "eta_s": 187, "world_size": 1, "timestamp": "2026-05-05T07:47:15.095473"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92270, "epoch": 0, "train_loss": 3.6265597492456436, "train_ppl": 37.58329798738875, "lr": 7.108048622599431e-05, "grad_norm": 1.1037, "tokens_per_sec": 150378, "dt_s": 4.358, "eta_s": 183, "world_size": 1, "timestamp": "2026-05-05T07:47:19.453552"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92280, "epoch": 0, "train_loss": 3.6408365964889526, "train_ppl": 38.123717556097326, "lr": 7.071797453786944e-05, "grad_norm": 1.1019, "tokens_per_sec": 151570, "dt_s": 4.324, "eta_s": 178, "world_size": 1, "timestamp": "2026-05-05T07:47:23.777382"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92290, "epoch": 0, "train_loss": 3.5040623992681503, "train_ppl": 33.25025377049746, "lr": 7.035546284974456e-05, "grad_norm": 1.1043, "tokens_per_sec": 148317, "dt_s": 4.419, "eta_s": 174, "world_size": 1, "timestamp": "2026-05-05T07:47:28.196024"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92300, "epoch": 0, "train_loss": 3.53856298327446, "train_ppl": 34.41742522185008, "lr": 6.99929511616198e-05, "grad_norm": 1.1097, "tokens_per_sec": 150491, "dt_s": 4.355, "eta_s": 169, "world_size": 1, "timestamp": "2026-05-05T07:47:32.550820"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92310, "epoch": 0, "train_loss": 3.5937843918800354, "train_ppl": 36.37145966217701, "lr": 6.963043947349492e-05, "grad_norm": 1.1033, "tokens_per_sec": 151290, "dt_s": 4.332, "eta_s": 164, "world_size": 1, "timestamp": "2026-05-05T07:47:36.882651"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92320, "epoch": 0, "train_loss": 3.5886353254318237, "train_ppl": 36.1846619292397, "lr": 6.926792778537004e-05, "grad_norm": 1.1358, "tokens_per_sec": 147618, "dt_s": 4.44, "eta_s": 160, "world_size": 1, "timestamp": "2026-05-05T07:47:41.322238"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92330, "epoch": 0, "train_loss": 3.4532195031642914, "train_ppl": 31.601971350745355, "lr": 6.890541609724517e-05, "grad_norm": 1.0699, "tokens_per_sec": 149558, "dt_s": 4.382, "eta_s": 156, "world_size": 1, "timestamp": "2026-05-05T07:47:45.704221"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92340, "epoch": 0, "train_loss": 3.4911115914583206, "train_ppl": 32.82241254890413, "lr": 6.854290440912029e-05, "grad_norm": 1.1121, "tokens_per_sec": 149325, "dt_s": 4.389, "eta_s": 151, "world_size": 1, "timestamp": "2026-05-05T07:47:50.093020"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92350, "epoch": 0, "train_loss": 3.5416851490736008, "train_ppl": 34.52505005363019, "lr": 6.818039272099541e-05, "grad_norm": 1.1469, "tokens_per_sec": 150384, "dt_s": 4.358, "eta_s": 147, "world_size": 1, "timestamp": "2026-05-05T07:47:54.450936"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92360, "epoch": 0, "train_loss": 3.4926075786352158, "train_ppl": 32.871551203426314, "lr": 6.781788103287054e-05, "grad_norm": 1.0511, "tokens_per_sec": 150445, "dt_s": 4.356, "eta_s": 143, "world_size": 1, "timestamp": "2026-05-05T07:47:58.807091"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92370, "epoch": 0, "train_loss": 3.542065605521202, "train_ppl": 34.538187830544345, "lr": 6.745536934474566e-05, "grad_norm": 1.0981, "tokens_per_sec": 148896, "dt_s": 4.401, "eta_s": 138, "world_size": 1, "timestamp": "2026-05-05T07:48:03.208542"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92380, "epoch": 0, "train_loss": 3.7030939012765884, "train_ppl": 40.57263811181193, "lr": 6.709285765662079e-05, "grad_norm": 1.1913, "tokens_per_sec": 151000, "dt_s": 4.34, "eta_s": 134, "world_size": 1, "timestamp": "2026-05-05T07:48:07.548692"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92390, "epoch": 0, "train_loss": 3.6551431715488434, "train_ppl": 38.673057600373504, "lr": 6.673034596849602e-05, "grad_norm": 1.1627, "tokens_per_sec": 134015, "dt_s": 4.89, "eta_s": 132, "world_size": 1, "timestamp": "2026-05-05T07:48:12.438895"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92400, "epoch": 0, "train_loss": 3.6005605161190033, "train_ppl": 36.618754094270955, "lr": 6.636783428037114e-05, "grad_norm": 1.1036, "tokens_per_sec": 148977, "dt_s": 4.399, "eta_s": 128, "world_size": 1, "timestamp": "2026-05-05T07:48:16.837959"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92410, "epoch": 0, "train_loss": 3.678428277373314, "train_ppl": 39.58412988177864, "lr": 6.600532259224616e-05, "grad_norm": 1.1539, "tokens_per_sec": 150039, "dt_s": 4.368, "eta_s": 124, "world_size": 1, "timestamp": "2026-05-05T07:48:21.205891"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92420, "epoch": 0, "train_loss": 3.597458854317665, "train_ppl": 36.50535106332308, "lr": 6.564281090412139e-05, "grad_norm": 1.2021, "tokens_per_sec": 151384, "dt_s": 4.329, "eta_s": 119, "world_size": 1, "timestamp": "2026-05-05T07:48:25.535023"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92430, "epoch": 0, "train_loss": 3.6027510464191437, "train_ppl": 36.69905650498936, "lr": 6.528029921599651e-05, "grad_norm": 1.1788, "tokens_per_sec": 148204, "dt_s": 4.422, "eta_s": 115, "world_size": 1, "timestamp": "2026-05-05T07:48:29.957046"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92440, "epoch": 0, "train_loss": 3.712192624807358, "train_ppl": 40.943481872923435, "lr": 6.491778752787164e-05, "grad_norm": 1.2784, "tokens_per_sec": 149270, "dt_s": 4.39, "eta_s": 108, "world_size": 1, "timestamp": "2026-05-05T07:48:34.347474"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92450, "epoch": 0, "train_loss": 3.498536631464958, "train_ppl": 33.0670272884832, "lr": 6.455527583974676e-05, "grad_norm": 1.2296, "tokens_per_sec": 148468, "dt_s": 4.414, "eta_s": 103, "world_size": 1, "timestamp": "2026-05-05T07:48:38.761641"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92460, "epoch": 0, "train_loss": 3.5393144339323044, "train_ppl": 34.443297938491845, "lr": 6.419276415162189e-05, "grad_norm": 1.1465, "tokens_per_sec": 148801, "dt_s": 4.404, "eta_s": 99, "world_size": 1, "timestamp": "2026-05-05T07:48:43.165889"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92470, "epoch": 0, "train_loss": 3.5040895342826843, "train_ppl": 33.25115602885812, "lr": 6.383025246349701e-05, "grad_norm": 1.1298, "tokens_per_sec": 150272, "dt_s": 4.361, "eta_s": 95, "world_size": 1, "timestamp": "2026-05-05T07:48:47.527032"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92480, "epoch": 0, "train_loss": 3.5844563245773315, "train_ppl": 36.03376172177651, "lr": 6.346774077537214e-05, "grad_norm": 1.1351, "tokens_per_sec": 146159, "dt_s": 4.484, "eta_s": 91, "world_size": 1, "timestamp": "2026-05-05T07:48:52.010933"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92490, "epoch": 0, "train_loss": 3.567954331636429, "train_ppl": 35.44401224418058, "lr": 6.310522908724726e-05, "grad_norm": 1.1264, "tokens_per_sec": 151794, "dt_s": 4.317, "eta_s": 86, "world_size": 1, "timestamp": "2026-05-05T07:48:56.328363"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92500, "epoch": 0, "train_loss": 3.6501351445913315, "train_ppl": 38.47986604341958, "lr": 6.274271739912249e-05, "grad_norm": 1.1445, "tokens_per_sec": 151665, "dt_s": 4.321, "eta_s": 81, "world_size": 1, "timestamp": "2026-05-05T07:49:00.649455"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92510, "epoch": 0, "train_loss": 3.624020665884018, "train_ppl": 37.48799190703813, "lr": 6.238020571099762e-05, "grad_norm": 1.1586, "tokens_per_sec": 124164, "dt_s": 5.278, "eta_s": 77, "world_size": 1, "timestamp": "2026-05-05T07:49:05.927647"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92520, "epoch": 0, "train_loss": 3.602555423974991, "train_ppl": 36.691878048014296, "lr": 6.201769402287274e-05, "grad_norm": 1.178, "tokens_per_sec": 147218, "dt_s": 4.452, "eta_s": 73, "world_size": 1, "timestamp": "2026-05-05T07:49:10.379276"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92530, "epoch": 0, "train_loss": 3.5467105209827423, "train_ppl": 34.69898795559723, "lr": 6.165518233474786e-05, "grad_norm": 1.1956, "tokens_per_sec": 147524, "dt_s": 4.442, "eta_s": 69, "world_size": 1, "timestamp": "2026-05-05T07:49:14.821635"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92540, "epoch": 0, "train_loss": 3.500152975320816, "train_ppl": 33.1205181930742, "lr": 6.129267064662299e-05, "grad_norm": 1.1197, "tokens_per_sec": 149437, "dt_s": 4.386, "eta_s": 65, "world_size": 1, "timestamp": "2026-05-05T07:49:19.207150"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92550, "epoch": 0, "train_loss": 3.545844152569771, "train_ppl": 34.66893886714052, "lr": 6.093015895849811e-05, "grad_norm": 1.1343, "tokens_per_sec": 150396, "dt_s": 4.358, "eta_s": 60, "world_size": 1, "timestamp": "2026-05-05T07:49:23.564720"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92560, "epoch": 0, "train_loss": 3.5803413093090057, "train_ppl": 35.885786910229655, "lr": 6.0567647270373236e-05, "grad_norm": 1.0871, "tokens_per_sec": 147448, "dt_s": 4.445, "eta_s": 56, "world_size": 1, "timestamp": "2026-05-05T07:49:28.009423"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92570, "epoch": 0, "train_loss": 3.5700555443763733, "train_ppl": 35.5185659534345, "lr": 6.020513558224836e-05, "grad_norm": 1.19, "tokens_per_sec": 150958, "dt_s": 4.341, "eta_s": 51, "world_size": 1, "timestamp": "2026-05-05T07:49:32.350753"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92580, "epoch": 0, "train_loss": 3.6429590731859207, "train_ppl": 38.204720190905206, "lr": 5.9842623894123485e-05, "grad_norm": 1.1237, "tokens_per_sec": 150503, "dt_s": 4.354, "eta_s": 46, "world_size": 1, "timestamp": "2026-05-05T07:49:36.705247"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92590, "epoch": 0, "train_loss": 3.545429974794388, "train_ppl": 34.65458273636602, "lr": 5.948011220599872e-05, "grad_norm": 1.2662, "tokens_per_sec": 146511, "dt_s": 4.473, "eta_s": 42, "world_size": 1, "timestamp": "2026-05-05T07:49:41.178359"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92600, "epoch": 0, "train_loss": 3.5657153874635696, "train_ppl": 35.36474385137032, "lr": 5.911760051787384e-05, "grad_norm": 1.2144, "tokens_per_sec": 149463, "dt_s": 4.385, "eta_s": 38, "world_size": 1, "timestamp": "2026-05-05T07:49:45.563116"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92610, "epoch": 0, "train_loss": 3.468594938516617, "train_ppl": 32.091620053033324, "lr": 5.875508882974886e-05, "grad_norm": 1.1633, "tokens_per_sec": 148736, "dt_s": 4.406, "eta_s": 33, "world_size": 1, "timestamp": "2026-05-05T07:49:49.969320"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92620, "epoch": 0, "train_loss": 3.6450369358062744, "train_ppl": 38.2841868827537, "lr": 5.839257714162409e-05, "grad_norm": 1.1215, "tokens_per_sec": 146844, "dt_s": 4.463, "eta_s": 29, "world_size": 1, "timestamp": "2026-05-05T07:49:54.432282"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92630, "epoch": 0, "train_loss": 3.5026592016220093, "train_ppl": 33.20362981163784, "lr": 5.8030065453499213e-05, "grad_norm": 1.1083, "tokens_per_sec": 148232, "dt_s": 4.421, "eta_s": 25, "world_size": 1, "timestamp": "2026-05-05T07:49:58.853441"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92640, "epoch": 0, "train_loss": 3.5872756391763687, "train_ppl": 36.135495574739124, "lr": 5.766755376537434e-05, "grad_norm": 1.1785, "tokens_per_sec": 147602, "dt_s": 4.44, "eta_s": 20, "world_size": 1, "timestamp": "2026-05-05T07:50:03.293471"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92650, "epoch": 0, "train_loss": 3.5151236802339554, "train_ppl": 33.62008580687169, "lr": 5.730504207724946e-05, "grad_norm": 1.1296, "tokens_per_sec": 149626, "dt_s": 4.38, "eta_s": 16, "world_size": 1, "timestamp": "2026-05-05T07:50:07.673466"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92660, "epoch": 0, "train_loss": 3.658666878938675, "train_ppl": 38.809570513756995, "lr": 5.6942530389124586e-05, "grad_norm": 1.1513, "tokens_per_sec": 150972, "dt_s": 4.341, "eta_s": 11, "world_size": 1, "timestamp": "2026-05-05T07:50:12.014406"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92670, "epoch": 0, "train_loss": 3.5102588683366776, "train_ppl": 33.456927602218904, "lr": 5.658001870099971e-05, "grad_norm": 1.1199, "tokens_per_sec": 147812, "dt_s": 4.434, "eta_s": 7, "world_size": 1, "timestamp": "2026-05-05T07:50:16.448163"} {"run_name": "final_c6_18l448_factorized_aggressive", "stage": "pretraining", "event": "train_step", "step": 92680, "epoch": 0, "train_loss": 3.5001932084560394, "train_ppl": 33.12185076216787, "lr": 5.6217507012874834e-05, "grad_norm": 1.2193, "tokens_per_sec": 150174, "dt_s": 4.364, "eta_s": 3, "world_size": 1, "timestamp": "2026-05-05T07:50:20.812150"}