{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 35.0, "eval_steps": 1, "global_step": 770, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.045454545454545456, "grad_norm": 5.237588882446289, "learning_rate": 0.0, "loss": 2.0682, "step": 1 }, { "epoch": 0.045454545454545456, "eval_loss": 2.063732147216797, "eval_runtime": 0.2778, "eval_samples_per_second": 316.813, "eval_steps_per_second": 39.602, "step": 1 }, { "epoch": 0.09090909090909091, "grad_norm": 5.7836594581604, "learning_rate": 3e-06, "loss": 2.0543, "step": 2 }, { "epoch": 0.09090909090909091, "eval_loss": 2.058272123336792, "eval_runtime": 0.2138, "eval_samples_per_second": 411.689, "eval_steps_per_second": 51.461, "step": 2 }, { "epoch": 0.13636363636363635, "grad_norm": 4.997707366943359, "learning_rate": 6e-06, "loss": 2.106, "step": 3 }, { "epoch": 0.13636363636363635, "eval_loss": 2.044473171234131, "eval_runtime": 0.2229, "eval_samples_per_second": 394.85, "eval_steps_per_second": 49.356, "step": 3 }, { "epoch": 0.18181818181818182, "grad_norm": 4.480862140655518, "learning_rate": 9e-06, "loss": 2.0133, "step": 4 }, { "epoch": 0.18181818181818182, "eval_loss": 2.026616096496582, "eval_runtime": 0.2098, "eval_samples_per_second": 419.399, "eval_steps_per_second": 52.425, "step": 4 }, { "epoch": 0.22727272727272727, "grad_norm": 4.413949489593506, "learning_rate": 1.2e-05, "loss": 2.0339, "step": 5 }, { "epoch": 0.22727272727272727, "eval_loss": 2.0050275325775146, "eval_runtime": 0.2083, "eval_samples_per_second": 422.489, "eval_steps_per_second": 52.811, "step": 5 }, { "epoch": 0.2727272727272727, "grad_norm": 3.8636281490325928, "learning_rate": 1.5e-05, "loss": 1.9456, "step": 6 }, { "epoch": 0.2727272727272727, "eval_loss": 1.978696346282959, "eval_runtime": 0.2234, "eval_samples_per_second": 393.959, "eval_steps_per_second": 49.245, "step": 6 }, { "epoch": 0.3181818181818182, "grad_norm": 5.352145671844482, "learning_rate": 1.8e-05, "loss": 2.0702, "step": 7 }, { "epoch": 0.3181818181818182, "eval_loss": 1.9451583623886108, "eval_runtime": 0.2365, "eval_samples_per_second": 372.165, "eval_steps_per_second": 46.521, "step": 7 }, { "epoch": 0.36363636363636365, "grad_norm": 6.098653316497803, "learning_rate": 2.1e-05, "loss": 1.9057, "step": 8 }, { "epoch": 0.36363636363636365, "eval_loss": 1.908401608467102, "eval_runtime": 0.2109, "eval_samples_per_second": 417.279, "eval_steps_per_second": 52.16, "step": 8 }, { "epoch": 0.4090909090909091, "grad_norm": 4.3218302726745605, "learning_rate": 2.4e-05, "loss": 2.0159, "step": 9 }, { "epoch": 0.4090909090909091, "eval_loss": 1.860684871673584, "eval_runtime": 0.2261, "eval_samples_per_second": 389.203, "eval_steps_per_second": 48.65, "step": 9 }, { "epoch": 0.45454545454545453, "grad_norm": 4.778627395629883, "learning_rate": 2.7000000000000002e-05, "loss": 1.8808, "step": 10 }, { "epoch": 0.45454545454545453, "eval_loss": 1.793589472770691, "eval_runtime": 0.2922, "eval_samples_per_second": 301.187, "eval_steps_per_second": 37.648, "step": 10 }, { "epoch": 0.5, "grad_norm": 5.957038879394531, "learning_rate": 3e-05, "loss": 1.896, "step": 11 }, { "epoch": 0.5, "eval_loss": 1.7104023694992065, "eval_runtime": 0.3181, "eval_samples_per_second": 276.671, "eval_steps_per_second": 34.584, "step": 11 }, { "epoch": 0.5454545454545454, "grad_norm": 6.62753963470459, "learning_rate": 2.9960526315789475e-05, "loss": 1.7627, "step": 12 }, { "epoch": 0.5454545454545454, "eval_loss": 1.6353049278259277, "eval_runtime": 0.4101, "eval_samples_per_second": 214.57, "eval_steps_per_second": 26.821, "step": 12 }, { "epoch": 0.5909090909090909, "grad_norm": 5.637991905212402, "learning_rate": 2.992105263157895e-05, "loss": 1.6927, "step": 13 }, { "epoch": 0.5909090909090909, "eval_loss": 1.5653632879257202, "eval_runtime": 0.3772, "eval_samples_per_second": 233.322, "eval_steps_per_second": 29.165, "step": 13 }, { "epoch": 0.6363636363636364, "grad_norm": 7.619434356689453, "learning_rate": 2.9881578947368423e-05, "loss": 1.5805, "step": 14 }, { "epoch": 0.6363636363636364, "eval_loss": 1.4975870847702026, "eval_runtime": 0.2484, "eval_samples_per_second": 354.217, "eval_steps_per_second": 44.277, "step": 14 }, { "epoch": 0.6818181818181818, "grad_norm": 8.660569190979004, "learning_rate": 2.9842105263157894e-05, "loss": 1.5803, "step": 15 }, { "epoch": 0.6818181818181818, "eval_loss": 1.4246007204055786, "eval_runtime": 0.3233, "eval_samples_per_second": 272.164, "eval_steps_per_second": 34.02, "step": 15 }, { "epoch": 0.7272727272727273, "grad_norm": 6.809484481811523, "learning_rate": 2.980263157894737e-05, "loss": 1.4897, "step": 16 }, { "epoch": 0.7272727272727273, "eval_loss": 1.3582329750061035, "eval_runtime": 0.2729, "eval_samples_per_second": 322.483, "eval_steps_per_second": 40.31, "step": 16 }, { "epoch": 0.7727272727272727, "grad_norm": 7.0124711990356445, "learning_rate": 2.9763157894736842e-05, "loss": 1.3831, "step": 17 }, { "epoch": 0.7727272727272727, "eval_loss": 1.2863445281982422, "eval_runtime": 0.2734, "eval_samples_per_second": 321.916, "eval_steps_per_second": 40.24, "step": 17 }, { "epoch": 0.8181818181818182, "grad_norm": 6.749629974365234, "learning_rate": 2.9723684210526316e-05, "loss": 1.2616, "step": 18 }, { "epoch": 0.8181818181818182, "eval_loss": 1.1985043287277222, "eval_runtime": 0.2953, "eval_samples_per_second": 297.968, "eval_steps_per_second": 37.246, "step": 18 }, { "epoch": 0.8636363636363636, "grad_norm": 8.935945510864258, "learning_rate": 2.968421052631579e-05, "loss": 1.2058, "step": 19 }, { "epoch": 0.8636363636363636, "eval_loss": 1.1089844703674316, "eval_runtime": 0.3886, "eval_samples_per_second": 226.48, "eval_steps_per_second": 28.31, "step": 19 }, { "epoch": 0.9090909090909091, "grad_norm": 5.048995018005371, "learning_rate": 2.9644736842105265e-05, "loss": 1.1399, "step": 20 }, { "epoch": 0.9090909090909091, "eval_loss": 1.0176739692687988, "eval_runtime": 0.2913, "eval_samples_per_second": 302.091, "eval_steps_per_second": 37.761, "step": 20 }, { "epoch": 0.9545454545454546, "grad_norm": 6.563332557678223, "learning_rate": 2.9605263157894735e-05, "loss": 0.9906, "step": 21 }, { "epoch": 0.9545454545454546, "eval_loss": 0.930864155292511, "eval_runtime": 0.2425, "eval_samples_per_second": 362.831, "eval_steps_per_second": 45.354, "step": 21 }, { "epoch": 1.0, "grad_norm": 12.079025268554688, "learning_rate": 2.9565789473684213e-05, "loss": 1.0795, "step": 22 }, { "epoch": 1.0, "eval_loss": 0.8574727773666382, "eval_runtime": 0.2662, "eval_samples_per_second": 330.588, "eval_steps_per_second": 41.323, "step": 22 }, { "epoch": 1.0454545454545454, "grad_norm": 5.452284336090088, "learning_rate": 2.9526315789473684e-05, "loss": 0.8862, "step": 23 }, { "epoch": 1.0454545454545454, "eval_loss": 0.7834421396255493, "eval_runtime": 0.2559, "eval_samples_per_second": 343.941, "eval_steps_per_second": 42.993, "step": 23 }, { "epoch": 1.0909090909090908, "grad_norm": 6.780595302581787, "learning_rate": 2.9486842105263158e-05, "loss": 0.7825, "step": 24 }, { "epoch": 1.0909090909090908, "eval_loss": 0.7133036255836487, "eval_runtime": 0.4064, "eval_samples_per_second": 216.526, "eval_steps_per_second": 27.066, "step": 24 }, { "epoch": 1.1363636363636362, "grad_norm": 6.756824016571045, "learning_rate": 2.9447368421052635e-05, "loss": 0.9249, "step": 25 }, { "epoch": 1.1363636363636362, "eval_loss": 0.652009904384613, "eval_runtime": 0.4122, "eval_samples_per_second": 213.486, "eval_steps_per_second": 26.686, "step": 25 }, { "epoch": 1.1818181818181819, "grad_norm": 4.798681259155273, "learning_rate": 2.9407894736842106e-05, "loss": 0.5773, "step": 26 }, { "epoch": 1.1818181818181819, "eval_loss": 0.6013602614402771, "eval_runtime": 0.4182, "eval_samples_per_second": 210.411, "eval_steps_per_second": 26.301, "step": 26 }, { "epoch": 1.2272727272727273, "grad_norm": 4.608880996704102, "learning_rate": 2.936842105263158e-05, "loss": 0.6573, "step": 27 }, { "epoch": 1.2272727272727273, "eval_loss": 0.5579346418380737, "eval_runtime": 0.5426, "eval_samples_per_second": 162.18, "eval_steps_per_second": 20.272, "step": 27 }, { "epoch": 1.2727272727272727, "grad_norm": 4.582436561584473, "learning_rate": 2.9328947368421055e-05, "loss": 0.5408, "step": 28 }, { "epoch": 1.2727272727272727, "eval_loss": 0.5213125348091125, "eval_runtime": 0.274, "eval_samples_per_second": 321.214, "eval_steps_per_second": 40.152, "step": 28 }, { "epoch": 1.3181818181818181, "grad_norm": 6.145488262176514, "learning_rate": 2.928947368421053e-05, "loss": 0.6888, "step": 29 }, { "epoch": 1.3181818181818181, "eval_loss": 0.47387245297431946, "eval_runtime": 0.2153, "eval_samples_per_second": 408.668, "eval_steps_per_second": 51.083, "step": 29 }, { "epoch": 1.3636363636363638, "grad_norm": 4.611596584320068, "learning_rate": 2.925e-05, "loss": 0.584, "step": 30 }, { "epoch": 1.3636363636363638, "eval_loss": 0.41591426730155945, "eval_runtime": 0.2262, "eval_samples_per_second": 388.952, "eval_steps_per_second": 48.619, "step": 30 }, { "epoch": 1.4090909090909092, "grad_norm": 4.470975875854492, "learning_rate": 2.9210526315789474e-05, "loss": 0.4962, "step": 31 }, { "epoch": 1.4090909090909092, "eval_loss": 0.3586600720882416, "eval_runtime": 0.2233, "eval_samples_per_second": 394.029, "eval_steps_per_second": 49.254, "step": 31 }, { "epoch": 1.4545454545454546, "grad_norm": 3.111593008041382, "learning_rate": 2.9171052631578948e-05, "loss": 0.3594, "step": 32 }, { "epoch": 1.4545454545454546, "eval_loss": 0.3188125491142273, "eval_runtime": 0.3382, "eval_samples_per_second": 260.203, "eval_steps_per_second": 32.525, "step": 32 }, { "epoch": 1.5, "grad_norm": 3.246596336364746, "learning_rate": 2.9131578947368422e-05, "loss": 0.3643, "step": 33 }, { "epoch": 1.5, "eval_loss": 0.2900885343551636, "eval_runtime": 0.2904, "eval_samples_per_second": 302.998, "eval_steps_per_second": 37.875, "step": 33 }, { "epoch": 1.5454545454545454, "grad_norm": 4.4003376960754395, "learning_rate": 2.9092105263157893e-05, "loss": 0.3334, "step": 34 }, { "epoch": 1.5454545454545454, "eval_loss": 0.260213166475296, "eval_runtime": 0.3641, "eval_samples_per_second": 241.707, "eval_steps_per_second": 30.213, "step": 34 }, { "epoch": 1.5909090909090908, "grad_norm": 5.7509236335754395, "learning_rate": 2.905263157894737e-05, "loss": 0.3754, "step": 35 }, { "epoch": 1.5909090909090908, "eval_loss": 0.2297886312007904, "eval_runtime": 0.3003, "eval_samples_per_second": 293.032, "eval_steps_per_second": 36.629, "step": 35 }, { "epoch": 1.6363636363636362, "grad_norm": 3.7421319484710693, "learning_rate": 2.901315789473684e-05, "loss": 0.3108, "step": 36 }, { "epoch": 1.6363636363636362, "eval_loss": 0.21363353729248047, "eval_runtime": 0.4783, "eval_samples_per_second": 183.979, "eval_steps_per_second": 22.997, "step": 36 }, { "epoch": 1.6818181818181817, "grad_norm": 3.7049357891082764, "learning_rate": 2.8973684210526315e-05, "loss": 0.2933, "step": 37 }, { "epoch": 1.6818181818181817, "eval_loss": 0.20323915779590607, "eval_runtime": 0.25, "eval_samples_per_second": 351.979, "eval_steps_per_second": 43.997, "step": 37 }, { "epoch": 1.7272727272727273, "grad_norm": 2.6143414974212646, "learning_rate": 2.893421052631579e-05, "loss": 0.2208, "step": 38 }, { "epoch": 1.7272727272727273, "eval_loss": 0.19065451622009277, "eval_runtime": 0.284, "eval_samples_per_second": 309.864, "eval_steps_per_second": 38.733, "step": 38 }, { "epoch": 1.7727272727272727, "grad_norm": 3.0895273685455322, "learning_rate": 2.8894736842105263e-05, "loss": 0.2448, "step": 39 }, { "epoch": 1.7727272727272727, "eval_loss": 0.17271381616592407, "eval_runtime": 0.3543, "eval_samples_per_second": 248.403, "eval_steps_per_second": 31.05, "step": 39 }, { "epoch": 1.8181818181818183, "grad_norm": 1.7658973932266235, "learning_rate": 2.8855263157894738e-05, "loss": 0.1742, "step": 40 }, { "epoch": 1.8181818181818183, "eval_loss": 0.152969092130661, "eval_runtime": 0.2714, "eval_samples_per_second": 324.231, "eval_steps_per_second": 40.529, "step": 40 }, { "epoch": 1.8636363636363638, "grad_norm": 1.7428200244903564, "learning_rate": 2.8815789473684212e-05, "loss": 0.1717, "step": 41 }, { "epoch": 1.8636363636363638, "eval_loss": 0.13160385191440582, "eval_runtime": 0.2485, "eval_samples_per_second": 354.091, "eval_steps_per_second": 44.261, "step": 41 }, { "epoch": 1.9090909090909092, "grad_norm": 1.9848284721374512, "learning_rate": 2.8776315789473686e-05, "loss": 0.1487, "step": 42 }, { "epoch": 1.9090909090909092, "eval_loss": 0.11496426910161972, "eval_runtime": 0.2812, "eval_samples_per_second": 312.902, "eval_steps_per_second": 39.113, "step": 42 }, { "epoch": 1.9545454545454546, "grad_norm": 1.8623422384262085, "learning_rate": 2.8736842105263157e-05, "loss": 0.1671, "step": 43 }, { "epoch": 1.9545454545454546, "eval_loss": 0.10060829669237137, "eval_runtime": 0.531, "eval_samples_per_second": 165.721, "eval_steps_per_second": 20.715, "step": 43 }, { "epoch": 2.0, "grad_norm": 1.254258632659912, "learning_rate": 2.8697368421052634e-05, "loss": 0.1296, "step": 44 }, { "epoch": 2.0, "eval_loss": 0.09032303839921951, "eval_runtime": 0.4212, "eval_samples_per_second": 208.91, "eval_steps_per_second": 26.114, "step": 44 }, { "epoch": 2.0454545454545454, "grad_norm": 1.7023710012435913, "learning_rate": 2.8657894736842105e-05, "loss": 0.1269, "step": 45 }, { "epoch": 2.0454545454545454, "eval_loss": 0.08172891288995743, "eval_runtime": 0.3434, "eval_samples_per_second": 256.286, "eval_steps_per_second": 32.036, "step": 45 }, { "epoch": 2.090909090909091, "grad_norm": 1.1132336854934692, "learning_rate": 2.861842105263158e-05, "loss": 0.1087, "step": 46 }, { "epoch": 2.090909090909091, "eval_loss": 0.07363786548376083, "eval_runtime": 0.2227, "eval_samples_per_second": 395.148, "eval_steps_per_second": 49.393, "step": 46 }, { "epoch": 2.1363636363636362, "grad_norm": 1.2574397325515747, "learning_rate": 2.8578947368421053e-05, "loss": 0.1007, "step": 47 }, { "epoch": 2.1363636363636362, "eval_loss": 0.0676058903336525, "eval_runtime": 0.2162, "eval_samples_per_second": 406.971, "eval_steps_per_second": 50.871, "step": 47 }, { "epoch": 2.1818181818181817, "grad_norm": 1.1193581819534302, "learning_rate": 2.8539473684210527e-05, "loss": 0.0932, "step": 48 }, { "epoch": 2.1818181818181817, "eval_loss": 0.060314346104860306, "eval_runtime": 0.2456, "eval_samples_per_second": 358.319, "eval_steps_per_second": 44.79, "step": 48 }, { "epoch": 2.227272727272727, "grad_norm": 1.1668117046356201, "learning_rate": 2.8499999999999998e-05, "loss": 0.0885, "step": 49 }, { "epoch": 2.227272727272727, "eval_loss": 0.05352572351694107, "eval_runtime": 0.2143, "eval_samples_per_second": 410.66, "eval_steps_per_second": 51.333, "step": 49 }, { "epoch": 2.2727272727272725, "grad_norm": 0.9329622387886047, "learning_rate": 2.8460526315789476e-05, "loss": 0.0768, "step": 50 }, { "epoch": 2.2727272727272725, "eval_loss": 0.049994777888059616, "eval_runtime": 0.2184, "eval_samples_per_second": 402.932, "eval_steps_per_second": 50.367, "step": 50 }, { "epoch": 2.3181818181818183, "grad_norm": 1.4205875396728516, "learning_rate": 2.8421052631578946e-05, "loss": 0.0871, "step": 51 }, { "epoch": 2.3181818181818183, "eval_loss": 0.046269264072179794, "eval_runtime": 0.2199, "eval_samples_per_second": 400.111, "eval_steps_per_second": 50.014, "step": 51 }, { "epoch": 2.3636363636363638, "grad_norm": 0.6296802163124084, "learning_rate": 2.838157894736842e-05, "loss": 0.0597, "step": 52 }, { "epoch": 2.3636363636363638, "eval_loss": 0.04288846254348755, "eval_runtime": 0.2154, "eval_samples_per_second": 408.528, "eval_steps_per_second": 51.066, "step": 52 }, { "epoch": 2.409090909090909, "grad_norm": 0.8016664981842041, "learning_rate": 2.8342105263157898e-05, "loss": 0.0573, "step": 53 }, { "epoch": 2.409090909090909, "eval_loss": 0.03866353631019592, "eval_runtime": 0.2104, "eval_samples_per_second": 418.258, "eval_steps_per_second": 52.282, "step": 53 }, { "epoch": 2.4545454545454546, "grad_norm": 0.5186643600463867, "learning_rate": 2.830263157894737e-05, "loss": 0.0533, "step": 54 }, { "epoch": 2.4545454545454546, "eval_loss": 0.03540382906794548, "eval_runtime": 0.2148, "eval_samples_per_second": 409.705, "eval_steps_per_second": 51.213, "step": 54 }, { "epoch": 2.5, "grad_norm": 0.616000771522522, "learning_rate": 2.8263157894736843e-05, "loss": 0.0543, "step": 55 }, { "epoch": 2.5, "eval_loss": 0.03242316469550133, "eval_runtime": 0.2116, "eval_samples_per_second": 415.828, "eval_steps_per_second": 51.979, "step": 55 }, { "epoch": 2.5454545454545454, "grad_norm": 0.6781826615333557, "learning_rate": 2.8223684210526317e-05, "loss": 0.0527, "step": 56 }, { "epoch": 2.5454545454545454, "eval_loss": 0.029892653226852417, "eval_runtime": 0.2231, "eval_samples_per_second": 394.465, "eval_steps_per_second": 49.308, "step": 56 }, { "epoch": 2.590909090909091, "grad_norm": 0.40553542971611023, "learning_rate": 2.818421052631579e-05, "loss": 0.043, "step": 57 }, { "epoch": 2.590909090909091, "eval_loss": 0.02773384563624859, "eval_runtime": 0.212, "eval_samples_per_second": 415.108, "eval_steps_per_second": 51.889, "step": 57 }, { "epoch": 2.6363636363636362, "grad_norm": 0.46068763732910156, "learning_rate": 2.8144736842105262e-05, "loss": 0.0408, "step": 58 }, { "epoch": 2.6363636363636362, "eval_loss": 0.025741351768374443, "eval_runtime": 0.2177, "eval_samples_per_second": 404.269, "eval_steps_per_second": 50.534, "step": 58 }, { "epoch": 2.6818181818181817, "grad_norm": 0.42782941460609436, "learning_rate": 2.810526315789474e-05, "loss": 0.0404, "step": 59 }, { "epoch": 2.6818181818181817, "eval_loss": 0.023805884644389153, "eval_runtime": 0.2164, "eval_samples_per_second": 406.611, "eval_steps_per_second": 50.826, "step": 59 }, { "epoch": 2.7272727272727275, "grad_norm": 0.3100360035896301, "learning_rate": 2.806578947368421e-05, "loss": 0.0348, "step": 60 }, { "epoch": 2.7272727272727275, "eval_loss": 0.022079171612858772, "eval_runtime": 0.2121, "eval_samples_per_second": 414.803, "eval_steps_per_second": 51.85, "step": 60 }, { "epoch": 2.7727272727272725, "grad_norm": 0.3292113244533539, "learning_rate": 2.8026315789473685e-05, "loss": 0.0331, "step": 61 }, { "epoch": 2.7727272727272725, "eval_loss": 0.020567093044519424, "eval_runtime": 0.2183, "eval_samples_per_second": 403.093, "eval_steps_per_second": 50.387, "step": 61 }, { "epoch": 2.8181818181818183, "grad_norm": 0.4177182912826538, "learning_rate": 2.798684210526316e-05, "loss": 0.0323, "step": 62 }, { "epoch": 2.8181818181818183, "eval_loss": 0.019224492833018303, "eval_runtime": 0.2119, "eval_samples_per_second": 415.211, "eval_steps_per_second": 51.901, "step": 62 }, { "epoch": 2.8636363636363638, "grad_norm": 0.23254263401031494, "learning_rate": 2.7947368421052633e-05, "loss": 0.0252, "step": 63 }, { "epoch": 2.8636363636363638, "eval_loss": 0.01814187504351139, "eval_runtime": 0.2203, "eval_samples_per_second": 399.469, "eval_steps_per_second": 49.934, "step": 63 }, { "epoch": 2.909090909090909, "grad_norm": 0.38803598284721375, "learning_rate": 2.7907894736842104e-05, "loss": 0.031, "step": 64 }, { "epoch": 2.909090909090909, "eval_loss": 0.01718403585255146, "eval_runtime": 0.2179, "eval_samples_per_second": 403.933, "eval_steps_per_second": 50.492, "step": 64 }, { "epoch": 2.9545454545454546, "grad_norm": 0.33151182532310486, "learning_rate": 2.786842105263158e-05, "loss": 0.03, "step": 65 }, { "epoch": 2.9545454545454546, "eval_loss": 0.016221443191170692, "eval_runtime": 0.2114, "eval_samples_per_second": 416.237, "eval_steps_per_second": 52.03, "step": 65 }, { "epoch": 3.0, "grad_norm": 0.25049498677253723, "learning_rate": 2.7828947368421055e-05, "loss": 0.0244, "step": 66 }, { "epoch": 3.0, "eval_loss": 0.015314313583076, "eval_runtime": 0.2173, "eval_samples_per_second": 404.944, "eval_steps_per_second": 50.618, "step": 66 }, { "epoch": 3.0454545454545454, "grad_norm": 0.2723033130168915, "learning_rate": 2.7789473684210526e-05, "loss": 0.0235, "step": 67 }, { "epoch": 3.0454545454545454, "eval_loss": 0.014571275562047958, "eval_runtime": 0.2218, "eval_samples_per_second": 396.808, "eval_steps_per_second": 49.601, "step": 67 }, { "epoch": 3.090909090909091, "grad_norm": 0.20975647866725922, "learning_rate": 2.7750000000000004e-05, "loss": 0.0222, "step": 68 }, { "epoch": 3.090909090909091, "eval_loss": 0.013959475792944431, "eval_runtime": 0.2232, "eval_samples_per_second": 394.228, "eval_steps_per_second": 49.279, "step": 68 }, { "epoch": 3.1363636363636362, "grad_norm": 0.2025345116853714, "learning_rate": 2.7710526315789474e-05, "loss": 0.0228, "step": 69 }, { "epoch": 3.1363636363636362, "eval_loss": 0.013426948338747025, "eval_runtime": 0.2201, "eval_samples_per_second": 399.844, "eval_steps_per_second": 49.981, "step": 69 }, { "epoch": 3.1818181818181817, "grad_norm": 0.2033005654811859, "learning_rate": 2.767105263157895e-05, "loss": 0.0209, "step": 70 }, { "epoch": 3.1818181818181817, "eval_loss": 0.012989457696676254, "eval_runtime": 0.2125, "eval_samples_per_second": 414.107, "eval_steps_per_second": 51.763, "step": 70 }, { "epoch": 3.227272727272727, "grad_norm": 0.18534056842327118, "learning_rate": 2.7631578947368423e-05, "loss": 0.0199, "step": 71 }, { "epoch": 3.227272727272727, "eval_loss": 0.012577124871313572, "eval_runtime": 0.2145, "eval_samples_per_second": 410.253, "eval_steps_per_second": 51.282, "step": 71 }, { "epoch": 3.2727272727272725, "grad_norm": 0.16536517441272736, "learning_rate": 2.7592105263157897e-05, "loss": 0.017, "step": 72 }, { "epoch": 3.2727272727272725, "eval_loss": 0.012171071022748947, "eval_runtime": 0.225, "eval_samples_per_second": 391.108, "eval_steps_per_second": 48.888, "step": 72 }, { "epoch": 3.3181818181818183, "grad_norm": 0.14233346283435822, "learning_rate": 2.7552631578947368e-05, "loss": 0.0173, "step": 73 }, { "epoch": 3.3181818181818183, "eval_loss": 0.011801562272012234, "eval_runtime": 0.2206, "eval_samples_per_second": 398.895, "eval_steps_per_second": 49.862, "step": 73 }, { "epoch": 3.3636363636363638, "grad_norm": 0.18418766558170319, "learning_rate": 2.7513157894736842e-05, "loss": 0.0193, "step": 74 }, { "epoch": 3.3636363636363638, "eval_loss": 0.01138223335146904, "eval_runtime": 0.2269, "eval_samples_per_second": 387.873, "eval_steps_per_second": 48.484, "step": 74 }, { "epoch": 3.409090909090909, "grad_norm": 0.1584126502275467, "learning_rate": 2.7473684210526316e-05, "loss": 0.0174, "step": 75 }, { "epoch": 3.409090909090909, "eval_loss": 0.010961382649838924, "eval_runtime": 0.2368, "eval_samples_per_second": 371.65, "eval_steps_per_second": 46.456, "step": 75 }, { "epoch": 3.4545454545454546, "grad_norm": 0.15311338007450104, "learning_rate": 2.743421052631579e-05, "loss": 0.0152, "step": 76 }, { "epoch": 3.4545454545454546, "eval_loss": 0.01055182795971632, "eval_runtime": 0.2222, "eval_samples_per_second": 396.112, "eval_steps_per_second": 49.514, "step": 76 }, { "epoch": 3.5, "grad_norm": 0.1895849108695984, "learning_rate": 2.739473684210526e-05, "loss": 0.0185, "step": 77 }, { "epoch": 3.5, "eval_loss": 0.01013518963009119, "eval_runtime": 0.2228, "eval_samples_per_second": 394.993, "eval_steps_per_second": 49.374, "step": 77 }, { "epoch": 3.5454545454545454, "grad_norm": 0.1422702521085739, "learning_rate": 2.735526315789474e-05, "loss": 0.0163, "step": 78 }, { "epoch": 3.5454545454545454, "eval_loss": 0.009774941019713879, "eval_runtime": 0.2328, "eval_samples_per_second": 378.047, "eval_steps_per_second": 47.256, "step": 78 }, { "epoch": 3.590909090909091, "grad_norm": 0.15089201927185059, "learning_rate": 2.7315789473684213e-05, "loss": 0.0162, "step": 79 }, { "epoch": 3.590909090909091, "eval_loss": 0.009458563290536404, "eval_runtime": 0.2335, "eval_samples_per_second": 376.887, "eval_steps_per_second": 47.111, "step": 79 }, { "epoch": 3.6363636363636362, "grad_norm": 0.16338452696800232, "learning_rate": 2.7276315789473683e-05, "loss": 0.015, "step": 80 }, { "epoch": 3.6363636363636362, "eval_loss": 0.00917022954672575, "eval_runtime": 0.2355, "eval_samples_per_second": 373.621, "eval_steps_per_second": 46.703, "step": 80 }, { "epoch": 3.6818181818181817, "grad_norm": 0.14390893280506134, "learning_rate": 2.723684210526316e-05, "loss": 0.0148, "step": 81 }, { "epoch": 3.6818181818181817, "eval_loss": 0.00891400221735239, "eval_runtime": 0.2182, "eval_samples_per_second": 403.39, "eval_steps_per_second": 50.424, "step": 81 }, { "epoch": 3.7272727272727275, "grad_norm": 0.23557034134864807, "learning_rate": 2.719736842105263e-05, "loss": 0.0173, "step": 82 }, { "epoch": 3.7272727272727275, "eval_loss": 0.008688293397426605, "eval_runtime": 0.2236, "eval_samples_per_second": 393.639, "eval_steps_per_second": 49.205, "step": 82 }, { "epoch": 3.7727272727272725, "grad_norm": 0.12254065275192261, "learning_rate": 2.7157894736842106e-05, "loss": 0.0133, "step": 83 }, { "epoch": 3.7727272727272725, "eval_loss": 0.008477870374917984, "eval_runtime": 0.2215, "eval_samples_per_second": 397.361, "eval_steps_per_second": 49.67, "step": 83 }, { "epoch": 3.8181818181818183, "grad_norm": 0.10980476438999176, "learning_rate": 2.711842105263158e-05, "loss": 0.0128, "step": 84 }, { "epoch": 3.8181818181818183, "eval_loss": 0.00827844813466072, "eval_runtime": 0.2234, "eval_samples_per_second": 393.942, "eval_steps_per_second": 49.243, "step": 84 }, { "epoch": 3.8636363636363638, "grad_norm": 0.13196319341659546, "learning_rate": 2.7078947368421054e-05, "loss": 0.013, "step": 85 }, { "epoch": 3.8636363636363638, "eval_loss": 0.008079243823885918, "eval_runtime": 0.2221, "eval_samples_per_second": 396.214, "eval_steps_per_second": 49.527, "step": 85 }, { "epoch": 3.909090909090909, "grad_norm": 0.10154274851083755, "learning_rate": 2.7039473684210525e-05, "loss": 0.0122, "step": 86 }, { "epoch": 3.909090909090909, "eval_loss": 0.007896007038652897, "eval_runtime": 0.224, "eval_samples_per_second": 392.924, "eval_steps_per_second": 49.115, "step": 86 }, { "epoch": 3.9545454545454546, "grad_norm": 0.1324293613433838, "learning_rate": 2.7000000000000002e-05, "loss": 0.0126, "step": 87 }, { "epoch": 3.9545454545454546, "eval_loss": 0.007718592882156372, "eval_runtime": 0.2196, "eval_samples_per_second": 400.741, "eval_steps_per_second": 50.093, "step": 87 }, { "epoch": 4.0, "grad_norm": 0.10327129811048508, "learning_rate": 2.6960526315789473e-05, "loss": 0.012, "step": 88 }, { "epoch": 4.0, "eval_loss": 0.007555495481938124, "eval_runtime": 0.2221, "eval_samples_per_second": 396.243, "eval_steps_per_second": 49.53, "step": 88 }, { "epoch": 4.045454545454546, "grad_norm": 0.09408023953437805, "learning_rate": 2.6921052631578947e-05, "loss": 0.0115, "step": 89 }, { "epoch": 4.045454545454546, "eval_loss": 0.0074074105359613895, "eval_runtime": 0.2205, "eval_samples_per_second": 399.137, "eval_steps_per_second": 49.892, "step": 89 }, { "epoch": 4.090909090909091, "grad_norm": 0.09438669681549072, "learning_rate": 2.688157894736842e-05, "loss": 0.0117, "step": 90 }, { "epoch": 4.090909090909091, "eval_loss": 0.007270295638591051, "eval_runtime": 0.2207, "eval_samples_per_second": 398.716, "eval_steps_per_second": 49.839, "step": 90 }, { "epoch": 4.136363636363637, "grad_norm": 0.10392805188894272, "learning_rate": 2.6842105263157896e-05, "loss": 0.0121, "step": 91 }, { "epoch": 4.136363636363637, "eval_loss": 0.007134940009564161, "eval_runtime": 0.2226, "eval_samples_per_second": 395.399, "eval_steps_per_second": 49.425, "step": 91 }, { "epoch": 4.181818181818182, "grad_norm": 0.09916353225708008, "learning_rate": 2.6802631578947366e-05, "loss": 0.0111, "step": 92 }, { "epoch": 4.181818181818182, "eval_loss": 0.007011328358203173, "eval_runtime": 0.2218, "eval_samples_per_second": 396.679, "eval_steps_per_second": 49.585, "step": 92 }, { "epoch": 4.2272727272727275, "grad_norm": 0.11726672202348709, "learning_rate": 2.6763157894736844e-05, "loss": 0.0128, "step": 93 }, { "epoch": 4.2272727272727275, "eval_loss": 0.006890672724694014, "eval_runtime": 0.2242, "eval_samples_per_second": 392.462, "eval_steps_per_second": 49.058, "step": 93 }, { "epoch": 4.2727272727272725, "grad_norm": 0.10044334828853607, "learning_rate": 2.6723684210526318e-05, "loss": 0.0115, "step": 94 }, { "epoch": 4.2727272727272725, "eval_loss": 0.006776686292141676, "eval_runtime": 0.2201, "eval_samples_per_second": 399.833, "eval_steps_per_second": 49.979, "step": 94 }, { "epoch": 4.318181818181818, "grad_norm": 0.09276948869228363, "learning_rate": 2.668421052631579e-05, "loss": 0.011, "step": 95 }, { "epoch": 4.318181818181818, "eval_loss": 0.00667022867128253, "eval_runtime": 0.2225, "eval_samples_per_second": 395.502, "eval_steps_per_second": 49.438, "step": 95 }, { "epoch": 4.363636363636363, "grad_norm": 0.09718704223632812, "learning_rate": 2.6644736842105266e-05, "loss": 0.0113, "step": 96 }, { "epoch": 4.363636363636363, "eval_loss": 0.006560015957802534, "eval_runtime": 0.2172, "eval_samples_per_second": 405.161, "eval_steps_per_second": 50.645, "step": 96 }, { "epoch": 4.409090909090909, "grad_norm": 0.11359906196594238, "learning_rate": 2.6605263157894737e-05, "loss": 0.0105, "step": 97 }, { "epoch": 4.409090909090909, "eval_loss": 0.0064485338516533375, "eval_runtime": 0.221, "eval_samples_per_second": 398.174, "eval_steps_per_second": 49.772, "step": 97 }, { "epoch": 4.454545454545454, "grad_norm": 0.0942469909787178, "learning_rate": 2.656578947368421e-05, "loss": 0.0104, "step": 98 }, { "epoch": 4.454545454545454, "eval_loss": 0.00633326917886734, "eval_runtime": 0.2241, "eval_samples_per_second": 392.749, "eval_steps_per_second": 49.094, "step": 98 }, { "epoch": 4.5, "grad_norm": 0.08770338445901871, "learning_rate": 2.6526315789473685e-05, "loss": 0.0097, "step": 99 }, { "epoch": 4.5, "eval_loss": 0.006226606201380491, "eval_runtime": 0.221, "eval_samples_per_second": 398.22, "eval_steps_per_second": 49.777, "step": 99 }, { "epoch": 4.545454545454545, "grad_norm": 0.0902254730463028, "learning_rate": 2.648684210526316e-05, "loss": 0.0102, "step": 100 }, { "epoch": 4.545454545454545, "eval_loss": 0.0061218636110424995, "eval_runtime": 0.2218, "eval_samples_per_second": 396.725, "eval_steps_per_second": 49.591, "step": 100 }, { "epoch": 4.590909090909091, "grad_norm": 0.07302330434322357, "learning_rate": 2.644736842105263e-05, "loss": 0.0086, "step": 101 }, { "epoch": 4.590909090909091, "eval_loss": 0.006022432819008827, "eval_runtime": 0.2242, "eval_samples_per_second": 392.497, "eval_steps_per_second": 49.062, "step": 101 }, { "epoch": 4.636363636363637, "grad_norm": 0.09044598042964935, "learning_rate": 2.6407894736842108e-05, "loss": 0.0098, "step": 102 }, { "epoch": 4.636363636363637, "eval_loss": 0.005927449557930231, "eval_runtime": 0.219, "eval_samples_per_second": 401.867, "eval_steps_per_second": 50.233, "step": 102 }, { "epoch": 4.681818181818182, "grad_norm": 0.07847205549478531, "learning_rate": 2.636842105263158e-05, "loss": 0.0093, "step": 103 }, { "epoch": 4.681818181818182, "eval_loss": 0.005836833734065294, "eval_runtime": 0.2477, "eval_samples_per_second": 355.291, "eval_steps_per_second": 44.411, "step": 103 }, { "epoch": 4.7272727272727275, "grad_norm": 0.09054490178823471, "learning_rate": 2.6328947368421053e-05, "loss": 0.0093, "step": 104 }, { "epoch": 4.7272727272727275, "eval_loss": 0.005744776222854853, "eval_runtime": 0.2237, "eval_samples_per_second": 393.373, "eval_steps_per_second": 49.172, "step": 104 }, { "epoch": 4.7727272727272725, "grad_norm": 0.08056215196847916, "learning_rate": 2.6289473684210527e-05, "loss": 0.0095, "step": 105 }, { "epoch": 4.7727272727272725, "eval_loss": 0.005655229557305574, "eval_runtime": 0.2221, "eval_samples_per_second": 396.284, "eval_steps_per_second": 49.535, "step": 105 }, { "epoch": 4.818181818181818, "grad_norm": 0.07413677871227264, "learning_rate": 2.625e-05, "loss": 0.0095, "step": 106 }, { "epoch": 4.818181818181818, "eval_loss": 0.005573854316025972, "eval_runtime": 0.2219, "eval_samples_per_second": 396.499, "eval_steps_per_second": 49.562, "step": 106 }, { "epoch": 4.863636363636363, "grad_norm": 0.09156908839941025, "learning_rate": 2.6210526315789475e-05, "loss": 0.0094, "step": 107 }, { "epoch": 4.863636363636363, "eval_loss": 0.005500171799212694, "eval_runtime": 0.218, "eval_samples_per_second": 403.697, "eval_steps_per_second": 50.462, "step": 107 }, { "epoch": 4.909090909090909, "grad_norm": 0.07806240767240524, "learning_rate": 2.617105263157895e-05, "loss": 0.009, "step": 108 }, { "epoch": 4.909090909090909, "eval_loss": 0.005432323087006807, "eval_runtime": 0.2208, "eval_samples_per_second": 398.497, "eval_steps_per_second": 49.812, "step": 108 }, { "epoch": 4.954545454545455, "grad_norm": 0.07705673575401306, "learning_rate": 2.6131578947368424e-05, "loss": 0.0091, "step": 109 }, { "epoch": 4.954545454545455, "eval_loss": 0.005366500001400709, "eval_runtime": 0.2187, "eval_samples_per_second": 402.388, "eval_steps_per_second": 50.299, "step": 109 }, { "epoch": 5.0, "grad_norm": 0.0743311420083046, "learning_rate": 2.6092105263157894e-05, "loss": 0.0087, "step": 110 }, { "epoch": 5.0, "eval_loss": 0.005299717653542757, "eval_runtime": 0.215, "eval_samples_per_second": 409.298, "eval_steps_per_second": 51.162, "step": 110 }, { "epoch": 5.045454545454546, "grad_norm": 0.0689927488565445, "learning_rate": 2.605263157894737e-05, "loss": 0.0081, "step": 111 }, { "epoch": 5.045454545454546, "eval_loss": 0.005235993303358555, "eval_runtime": 0.2192, "eval_samples_per_second": 401.457, "eval_steps_per_second": 50.182, "step": 111 }, { "epoch": 5.090909090909091, "grad_norm": 0.06892900913953781, "learning_rate": 2.6013157894736843e-05, "loss": 0.0082, "step": 112 }, { "epoch": 5.090909090909091, "eval_loss": 0.005173509940505028, "eval_runtime": 0.219, "eval_samples_per_second": 401.777, "eval_steps_per_second": 50.222, "step": 112 }, { "epoch": 5.136363636363637, "grad_norm": 0.06960764527320862, "learning_rate": 2.5973684210526317e-05, "loss": 0.0081, "step": 113 }, { "epoch": 5.136363636363637, "eval_loss": 0.005112760700285435, "eval_runtime": 0.2203, "eval_samples_per_second": 399.491, "eval_steps_per_second": 49.936, "step": 113 }, { "epoch": 5.181818181818182, "grad_norm": 0.07173731923103333, "learning_rate": 2.5934210526315788e-05, "loss": 0.008, "step": 114 }, { "epoch": 5.181818181818182, "eval_loss": 0.00505533954128623, "eval_runtime": 0.2227, "eval_samples_per_second": 395.105, "eval_steps_per_second": 49.388, "step": 114 }, { "epoch": 5.2272727272727275, "grad_norm": 0.06811046600341797, "learning_rate": 2.5894736842105265e-05, "loss": 0.0074, "step": 115 }, { "epoch": 5.2272727272727275, "eval_loss": 0.0049970815889537334, "eval_runtime": 0.2171, "eval_samples_per_second": 405.344, "eval_steps_per_second": 50.668, "step": 115 }, { "epoch": 5.2727272727272725, "grad_norm": 0.0676768496632576, "learning_rate": 2.5855263157894736e-05, "loss": 0.0076, "step": 116 }, { "epoch": 5.2727272727272725, "eval_loss": 0.004939272068440914, "eval_runtime": 0.2319, "eval_samples_per_second": 379.44, "eval_steps_per_second": 47.43, "step": 116 }, { "epoch": 5.318181818181818, "grad_norm": 0.06927932053804398, "learning_rate": 2.581578947368421e-05, "loss": 0.0078, "step": 117 }, { "epoch": 5.318181818181818, "eval_loss": 0.004879767540842295, "eval_runtime": 0.2354, "eval_samples_per_second": 373.859, "eval_steps_per_second": 46.732, "step": 117 }, { "epoch": 5.363636363636363, "grad_norm": 0.0733099952340126, "learning_rate": 2.5776315789473684e-05, "loss": 0.009, "step": 118 }, { "epoch": 5.363636363636363, "eval_loss": 0.00482180854305625, "eval_runtime": 0.22, "eval_samples_per_second": 399.97, "eval_steps_per_second": 49.996, "step": 118 }, { "epoch": 5.409090909090909, "grad_norm": 0.07873851805925369, "learning_rate": 2.5736842105263158e-05, "loss": 0.0085, "step": 119 }, { "epoch": 5.409090909090909, "eval_loss": 0.004767131991684437, "eval_runtime": 0.2272, "eval_samples_per_second": 387.355, "eval_steps_per_second": 48.419, "step": 119 }, { "epoch": 5.454545454545454, "grad_norm": 0.06912100315093994, "learning_rate": 2.5697368421052632e-05, "loss": 0.0075, "step": 120 }, { "epoch": 5.454545454545454, "eval_loss": 0.004715087823569775, "eval_runtime": 0.2216, "eval_samples_per_second": 397.159, "eval_steps_per_second": 49.645, "step": 120 }, { "epoch": 5.5, "grad_norm": 0.059973061084747314, "learning_rate": 2.5657894736842107e-05, "loss": 0.0078, "step": 121 }, { "epoch": 5.5, "eval_loss": 0.004667165223509073, "eval_runtime": 0.2226, "eval_samples_per_second": 395.251, "eval_steps_per_second": 49.406, "step": 121 }, { "epoch": 5.545454545454545, "grad_norm": 0.06346078962087631, "learning_rate": 2.561842105263158e-05, "loss": 0.0073, "step": 122 }, { "epoch": 5.545454545454545, "eval_loss": 0.004621806554496288, "eval_runtime": 0.2263, "eval_samples_per_second": 388.791, "eval_steps_per_second": 48.599, "step": 122 }, { "epoch": 5.590909090909091, "grad_norm": 0.07588130235671997, "learning_rate": 2.557894736842105e-05, "loss": 0.0079, "step": 123 }, { "epoch": 5.590909090909091, "eval_loss": 0.004576975479722023, "eval_runtime": 0.2216, "eval_samples_per_second": 397.081, "eval_steps_per_second": 49.635, "step": 123 }, { "epoch": 5.636363636363637, "grad_norm": 0.0569930225610733, "learning_rate": 2.553947368421053e-05, "loss": 0.0068, "step": 124 }, { "epoch": 5.636363636363637, "eval_loss": 0.004534974228590727, "eval_runtime": 0.2207, "eval_samples_per_second": 398.807, "eval_steps_per_second": 49.851, "step": 124 }, { "epoch": 5.681818181818182, "grad_norm": 0.07023297250270844, "learning_rate": 2.55e-05, "loss": 0.0078, "step": 125 }, { "epoch": 5.681818181818182, "eval_loss": 0.004494456574320793, "eval_runtime": 0.2276, "eval_samples_per_second": 386.655, "eval_steps_per_second": 48.332, "step": 125 }, { "epoch": 5.7272727272727275, "grad_norm": 0.0586245059967041, "learning_rate": 2.5460526315789474e-05, "loss": 0.0072, "step": 126 }, { "epoch": 5.7272727272727275, "eval_loss": 0.0044531743042171, "eval_runtime": 0.2354, "eval_samples_per_second": 373.803, "eval_steps_per_second": 46.725, "step": 126 }, { "epoch": 5.7727272727272725, "grad_norm": 0.0652911588549614, "learning_rate": 2.5421052631578948e-05, "loss": 0.0073, "step": 127 }, { "epoch": 5.7727272727272725, "eval_loss": 0.004411415662616491, "eval_runtime": 0.236, "eval_samples_per_second": 372.941, "eval_steps_per_second": 46.618, "step": 127 }, { "epoch": 5.818181818181818, "grad_norm": 0.05701863393187523, "learning_rate": 2.5381578947368422e-05, "loss": 0.0067, "step": 128 }, { "epoch": 5.818181818181818, "eval_loss": 0.004371690563857555, "eval_runtime": 0.2358, "eval_samples_per_second": 373.191, "eval_steps_per_second": 46.649, "step": 128 }, { "epoch": 5.863636363636363, "grad_norm": 0.05990603566169739, "learning_rate": 2.5342105263157893e-05, "loss": 0.0071, "step": 129 }, { "epoch": 5.863636363636363, "eval_loss": 0.004331877455115318, "eval_runtime": 0.2301, "eval_samples_per_second": 382.487, "eval_steps_per_second": 47.811, "step": 129 }, { "epoch": 5.909090909090909, "grad_norm": 0.06283283233642578, "learning_rate": 2.530263157894737e-05, "loss": 0.0071, "step": 130 }, { "epoch": 5.909090909090909, "eval_loss": 0.0042935688979923725, "eval_runtime": 0.2387, "eval_samples_per_second": 368.63, "eval_steps_per_second": 46.079, "step": 130 }, { "epoch": 5.954545454545455, "grad_norm": 0.060048509389162064, "learning_rate": 2.526315789473684e-05, "loss": 0.0067, "step": 131 }, { "epoch": 5.954545454545455, "eval_loss": 0.0042540752328932285, "eval_runtime": 0.2471, "eval_samples_per_second": 356.096, "eval_steps_per_second": 44.512, "step": 131 }, { "epoch": 6.0, "grad_norm": 0.060563940554857254, "learning_rate": 2.5223684210526315e-05, "loss": 0.0064, "step": 132 }, { "epoch": 6.0, "eval_loss": 0.004213025793433189, "eval_runtime": 0.2399, "eval_samples_per_second": 366.883, "eval_steps_per_second": 45.86, "step": 132 }, { "epoch": 6.045454545454546, "grad_norm": 0.060382332652807236, "learning_rate": 2.518421052631579e-05, "loss": 0.0071, "step": 133 }, { "epoch": 6.045454545454546, "eval_loss": 0.004174065310508013, "eval_runtime": 0.2268, "eval_samples_per_second": 388.075, "eval_steps_per_second": 48.509, "step": 133 }, { "epoch": 6.090909090909091, "grad_norm": 0.06080484017729759, "learning_rate": 2.5144736842105264e-05, "loss": 0.0073, "step": 134 }, { "epoch": 6.090909090909091, "eval_loss": 0.0041358619928359985, "eval_runtime": 0.2229, "eval_samples_per_second": 394.875, "eval_steps_per_second": 49.359, "step": 134 }, { "epoch": 6.136363636363637, "grad_norm": 0.057626206427812576, "learning_rate": 2.5105263157894738e-05, "loss": 0.0066, "step": 135 }, { "epoch": 6.136363636363637, "eval_loss": 0.004101672675460577, "eval_runtime": 0.2283, "eval_samples_per_second": 385.395, "eval_steps_per_second": 48.174, "step": 135 }, { "epoch": 6.181818181818182, "grad_norm": 0.06599877029657364, "learning_rate": 2.5065789473684212e-05, "loss": 0.0075, "step": 136 }, { "epoch": 6.181818181818182, "eval_loss": 0.004067064728587866, "eval_runtime": 0.221, "eval_samples_per_second": 398.26, "eval_steps_per_second": 49.783, "step": 136 }, { "epoch": 6.2272727272727275, "grad_norm": 0.05654873698949814, "learning_rate": 2.5026315789473686e-05, "loss": 0.0066, "step": 137 }, { "epoch": 6.2272727272727275, "eval_loss": 0.0040321690030395985, "eval_runtime": 0.2329, "eval_samples_per_second": 377.882, "eval_steps_per_second": 47.235, "step": 137 }, { "epoch": 6.2727272727272725, "grad_norm": 0.05717283487319946, "learning_rate": 2.4986842105263157e-05, "loss": 0.0067, "step": 138 }, { "epoch": 6.2727272727272725, "eval_loss": 0.003995668143033981, "eval_runtime": 0.2203, "eval_samples_per_second": 399.464, "eval_steps_per_second": 49.933, "step": 138 }, { "epoch": 6.318181818181818, "grad_norm": 0.06036869063973427, "learning_rate": 2.4947368421052635e-05, "loss": 0.0064, "step": 139 }, { "epoch": 6.318181818181818, "eval_loss": 0.003956829197704792, "eval_runtime": 0.2294, "eval_samples_per_second": 383.681, "eval_steps_per_second": 47.96, "step": 139 }, { "epoch": 6.363636363636363, "grad_norm": 0.05111813545227051, "learning_rate": 2.4907894736842105e-05, "loss": 0.0063, "step": 140 }, { "epoch": 6.363636363636363, "eval_loss": 0.003918844275176525, "eval_runtime": 0.2186, "eval_samples_per_second": 402.513, "eval_steps_per_second": 50.314, "step": 140 }, { "epoch": 6.409090909090909, "grad_norm": 0.0621768981218338, "learning_rate": 2.486842105263158e-05, "loss": 0.0064, "step": 141 }, { "epoch": 6.409090909090909, "eval_loss": 0.00388046121224761, "eval_runtime": 0.2309, "eval_samples_per_second": 381.086, "eval_steps_per_second": 47.636, "step": 141 }, { "epoch": 6.454545454545454, "grad_norm": 0.06089349836111069, "learning_rate": 2.4828947368421054e-05, "loss": 0.0066, "step": 142 }, { "epoch": 6.454545454545454, "eval_loss": 0.003839746816083789, "eval_runtime": 0.2206, "eval_samples_per_second": 398.928, "eval_steps_per_second": 49.866, "step": 142 }, { "epoch": 6.5, "grad_norm": 0.05007468909025192, "learning_rate": 2.4789473684210528e-05, "loss": 0.0061, "step": 143 }, { "epoch": 6.5, "eval_loss": 0.003801233833655715, "eval_runtime": 0.2295, "eval_samples_per_second": 383.433, "eval_steps_per_second": 47.929, "step": 143 }, { "epoch": 6.545454545454545, "grad_norm": 0.053182121366262436, "learning_rate": 2.475e-05, "loss": 0.0059, "step": 144 }, { "epoch": 6.545454545454545, "eval_loss": 0.003766607493162155, "eval_runtime": 0.2261, "eval_samples_per_second": 389.124, "eval_steps_per_second": 48.64, "step": 144 }, { "epoch": 6.590909090909091, "grad_norm": 0.051414087414741516, "learning_rate": 2.4710526315789476e-05, "loss": 0.0061, "step": 145 }, { "epoch": 6.590909090909091, "eval_loss": 0.0037348391488194466, "eval_runtime": 0.2309, "eval_samples_per_second": 381.083, "eval_steps_per_second": 47.635, "step": 145 }, { "epoch": 6.636363636363637, "grad_norm": 0.051980625838041306, "learning_rate": 2.4671052631578947e-05, "loss": 0.0061, "step": 146 }, { "epoch": 6.636363636363637, "eval_loss": 0.0037048642989248037, "eval_runtime": 0.2327, "eval_samples_per_second": 378.163, "eval_steps_per_second": 47.27, "step": 146 }, { "epoch": 6.681818181818182, "grad_norm": 0.054644446820020676, "learning_rate": 2.463157894736842e-05, "loss": 0.006, "step": 147 }, { "epoch": 6.681818181818182, "eval_loss": 0.003674545791000128, "eval_runtime": 0.2332, "eval_samples_per_second": 377.322, "eval_steps_per_second": 47.165, "step": 147 }, { "epoch": 6.7272727272727275, "grad_norm": 0.04687352105975151, "learning_rate": 2.45921052631579e-05, "loss": 0.0057, "step": 148 }, { "epoch": 6.7272727272727275, "eval_loss": 0.0036456272937357426, "eval_runtime": 0.2302, "eval_samples_per_second": 382.325, "eval_steps_per_second": 47.791, "step": 148 }, { "epoch": 6.7727272727272725, "grad_norm": 0.0500478520989418, "learning_rate": 2.455263157894737e-05, "loss": 0.0054, "step": 149 }, { "epoch": 6.7727272727272725, "eval_loss": 0.003618737915530801, "eval_runtime": 0.2281, "eval_samples_per_second": 385.776, "eval_steps_per_second": 48.222, "step": 149 }, { "epoch": 6.818181818181818, "grad_norm": 0.05092916265130043, "learning_rate": 2.4513157894736843e-05, "loss": 0.0054, "step": 150 }, { "epoch": 6.818181818181818, "eval_loss": 0.0035921267699450254, "eval_runtime": 0.2298, "eval_samples_per_second": 382.977, "eval_steps_per_second": 47.872, "step": 150 }, { "epoch": 6.863636363636363, "grad_norm": 0.05389472842216492, "learning_rate": 2.4473684210526318e-05, "loss": 0.0057, "step": 151 }, { "epoch": 6.863636363636363, "eval_loss": 0.003567308420315385, "eval_runtime": 0.2896, "eval_samples_per_second": 303.912, "eval_steps_per_second": 37.989, "step": 151 }, { "epoch": 6.909090909090909, "grad_norm": 0.051427211612463, "learning_rate": 2.4434210526315792e-05, "loss": 0.0058, "step": 152 }, { "epoch": 6.909090909090909, "eval_loss": 0.003539604600518942, "eval_runtime": 0.2314, "eval_samples_per_second": 380.243, "eval_steps_per_second": 47.53, "step": 152 }, { "epoch": 6.954545454545455, "grad_norm": 0.05391733720898628, "learning_rate": 2.4394736842105262e-05, "loss": 0.0058, "step": 153 }, { "epoch": 6.954545454545455, "eval_loss": 0.0035100304521620274, "eval_runtime": 0.2452, "eval_samples_per_second": 358.914, "eval_steps_per_second": 44.864, "step": 153 }, { "epoch": 7.0, "grad_norm": 0.05612335354089737, "learning_rate": 2.4355263157894737e-05, "loss": 0.0056, "step": 154 }, { "epoch": 7.0, "eval_loss": 0.0034810365177690983, "eval_runtime": 0.2328, "eval_samples_per_second": 378.038, "eval_steps_per_second": 47.255, "step": 154 }, { "epoch": 7.045454545454546, "grad_norm": 0.05799683555960655, "learning_rate": 2.431578947368421e-05, "loss": 0.0062, "step": 155 }, { "epoch": 7.045454545454546, "eval_loss": 0.003452845150604844, "eval_runtime": 0.2326, "eval_samples_per_second": 378.28, "eval_steps_per_second": 47.285, "step": 155 }, { "epoch": 7.090909090909091, "grad_norm": 0.05095871537923813, "learning_rate": 2.4276315789473685e-05, "loss": 0.0051, "step": 156 }, { "epoch": 7.090909090909091, "eval_loss": 0.003425983479246497, "eval_runtime": 0.2387, "eval_samples_per_second": 368.611, "eval_steps_per_second": 46.076, "step": 156 }, { "epoch": 7.136363636363637, "grad_norm": 0.05834353715181351, "learning_rate": 2.4236842105263156e-05, "loss": 0.0061, "step": 157 }, { "epoch": 7.136363636363637, "eval_loss": 0.003400736255571246, "eval_runtime": 0.233, "eval_samples_per_second": 377.737, "eval_steps_per_second": 47.217, "step": 157 }, { "epoch": 7.181818181818182, "grad_norm": 0.05226532742381096, "learning_rate": 2.4197368421052633e-05, "loss": 0.006, "step": 158 }, { "epoch": 7.181818181818182, "eval_loss": 0.003375994274392724, "eval_runtime": 0.2371, "eval_samples_per_second": 371.159, "eval_steps_per_second": 46.395, "step": 158 }, { "epoch": 7.2272727272727275, "grad_norm": 0.044102054089307785, "learning_rate": 2.4157894736842104e-05, "loss": 0.0051, "step": 159 }, { "epoch": 7.2272727272727275, "eval_loss": 0.003351524705067277, "eval_runtime": 0.2391, "eval_samples_per_second": 368.077, "eval_steps_per_second": 46.01, "step": 159 }, { "epoch": 7.2727272727272725, "grad_norm": 0.050387196242809296, "learning_rate": 2.4118421052631578e-05, "loss": 0.0055, "step": 160 }, { "epoch": 7.2727272727272725, "eval_loss": 0.003328080987557769, "eval_runtime": 0.2367, "eval_samples_per_second": 371.775, "eval_steps_per_second": 46.472, "step": 160 }, { "epoch": 7.318181818181818, "grad_norm": 0.05944162234663963, "learning_rate": 2.4078947368421056e-05, "loss": 0.0062, "step": 161 }, { "epoch": 7.318181818181818, "eval_loss": 0.003306704806163907, "eval_runtime": 0.2302, "eval_samples_per_second": 382.244, "eval_steps_per_second": 47.781, "step": 161 }, { "epoch": 7.363636363636363, "grad_norm": 0.058280494064092636, "learning_rate": 2.4039473684210526e-05, "loss": 0.0055, "step": 162 }, { "epoch": 7.363636363636363, "eval_loss": 0.0032875537872314453, "eval_runtime": 0.2313, "eval_samples_per_second": 380.46, "eval_steps_per_second": 47.557, "step": 162 }, { "epoch": 7.409090909090909, "grad_norm": 0.04580385982990265, "learning_rate": 2.4e-05, "loss": 0.0051, "step": 163 }, { "epoch": 7.409090909090909, "eval_loss": 0.003268357366323471, "eval_runtime": 0.2329, "eval_samples_per_second": 377.841, "eval_steps_per_second": 47.23, "step": 163 }, { "epoch": 7.454545454545454, "grad_norm": 0.047211576253175735, "learning_rate": 2.3960526315789475e-05, "loss": 0.0049, "step": 164 }, { "epoch": 7.454545454545454, "eval_loss": 0.003249667352065444, "eval_runtime": 0.2288, "eval_samples_per_second": 384.62, "eval_steps_per_second": 48.077, "step": 164 }, { "epoch": 7.5, "grad_norm": 0.04698212072253227, "learning_rate": 2.392105263157895e-05, "loss": 0.0051, "step": 165 }, { "epoch": 7.5, "eval_loss": 0.003230377798900008, "eval_runtime": 0.2336, "eval_samples_per_second": 376.761, "eval_steps_per_second": 47.095, "step": 165 }, { "epoch": 7.545454545454545, "grad_norm": 0.049539972096681595, "learning_rate": 2.388157894736842e-05, "loss": 0.0053, "step": 166 }, { "epoch": 7.545454545454545, "eval_loss": 0.003210590686649084, "eval_runtime": 0.2308, "eval_samples_per_second": 381.225, "eval_steps_per_second": 47.653, "step": 166 }, { "epoch": 7.590909090909091, "grad_norm": 0.06876406818628311, "learning_rate": 2.3842105263157897e-05, "loss": 0.0054, "step": 167 }, { "epoch": 7.590909090909091, "eval_loss": 0.0031811357475817204, "eval_runtime": 0.2314, "eval_samples_per_second": 380.236, "eval_steps_per_second": 47.53, "step": 167 }, { "epoch": 7.636363636363637, "grad_norm": 0.03961968049407005, "learning_rate": 2.3802631578947368e-05, "loss": 0.0048, "step": 168 }, { "epoch": 7.636363636363637, "eval_loss": 0.003153204219415784, "eval_runtime": 0.2327, "eval_samples_per_second": 378.242, "eval_steps_per_second": 47.28, "step": 168 }, { "epoch": 7.681818181818182, "grad_norm": 0.046262938529253006, "learning_rate": 2.3763157894736842e-05, "loss": 0.0054, "step": 169 }, { "epoch": 7.681818181818182, "eval_loss": 0.0031256629154086113, "eval_runtime": 0.2285, "eval_samples_per_second": 385.163, "eval_steps_per_second": 48.145, "step": 169 }, { "epoch": 7.7272727272727275, "grad_norm": 0.04695883020758629, "learning_rate": 2.3723684210526316e-05, "loss": 0.0053, "step": 170 }, { "epoch": 7.7272727272727275, "eval_loss": 0.00310018053278327, "eval_runtime": 0.2345, "eval_samples_per_second": 375.19, "eval_steps_per_second": 46.899, "step": 170 }, { "epoch": 7.7727272727272725, "grad_norm": 0.047219086438417435, "learning_rate": 2.368421052631579e-05, "loss": 0.0052, "step": 171 }, { "epoch": 7.7727272727272725, "eval_loss": 0.003074278589338064, "eval_runtime": 0.2331, "eval_samples_per_second": 377.522, "eval_steps_per_second": 47.19, "step": 171 }, { "epoch": 7.818181818181818, "grad_norm": 0.05439964681863785, "learning_rate": 2.364473684210526e-05, "loss": 0.0055, "step": 172 }, { "epoch": 7.818181818181818, "eval_loss": 0.003049066523090005, "eval_runtime": 0.2239, "eval_samples_per_second": 393.01, "eval_steps_per_second": 49.126, "step": 172 }, { "epoch": 7.863636363636363, "grad_norm": 0.041486483067274094, "learning_rate": 2.360526315789474e-05, "loss": 0.0047, "step": 173 }, { "epoch": 7.863636363636363, "eval_loss": 0.0030262693762779236, "eval_runtime": 0.2278, "eval_samples_per_second": 386.243, "eval_steps_per_second": 48.28, "step": 173 }, { "epoch": 7.909090909090909, "grad_norm": 0.040691111236810684, "learning_rate": 2.3565789473684213e-05, "loss": 0.0046, "step": 174 }, { "epoch": 7.909090909090909, "eval_loss": 0.0030057693365961313, "eval_runtime": 0.2276, "eval_samples_per_second": 386.567, "eval_steps_per_second": 48.321, "step": 174 }, { "epoch": 7.954545454545455, "grad_norm": 0.048391714692115784, "learning_rate": 2.3526315789473684e-05, "loss": 0.0055, "step": 175 }, { "epoch": 7.954545454545455, "eval_loss": 0.0029874229803681374, "eval_runtime": 0.2269, "eval_samples_per_second": 387.906, "eval_steps_per_second": 48.488, "step": 175 }, { "epoch": 8.0, "grad_norm": 0.04458646848797798, "learning_rate": 2.348684210526316e-05, "loss": 0.005, "step": 176 }, { "epoch": 8.0, "eval_loss": 0.0029713741969317198, "eval_runtime": 0.2305, "eval_samples_per_second": 381.854, "eval_steps_per_second": 47.732, "step": 176 }, { "epoch": 8.045454545454545, "grad_norm": 0.044490914791822433, "learning_rate": 2.3447368421052632e-05, "loss": 0.005, "step": 177 }, { "epoch": 8.045454545454545, "eval_loss": 0.002958006225526333, "eval_runtime": 0.2331, "eval_samples_per_second": 377.519, "eval_steps_per_second": 47.19, "step": 177 }, { "epoch": 8.090909090909092, "grad_norm": 0.04664753004908562, "learning_rate": 2.3407894736842106e-05, "loss": 0.0053, "step": 178 }, { "epoch": 8.090909090909092, "eval_loss": 0.0029434128664433956, "eval_runtime": 0.2369, "eval_samples_per_second": 371.478, "eval_steps_per_second": 46.435, "step": 178 }, { "epoch": 8.136363636363637, "grad_norm": 0.05114319175481796, "learning_rate": 2.336842105263158e-05, "loss": 0.0052, "step": 179 }, { "epoch": 8.136363636363637, "eval_loss": 0.002928072353824973, "eval_runtime": 0.2273, "eval_samples_per_second": 387.111, "eval_steps_per_second": 48.389, "step": 179 }, { "epoch": 8.181818181818182, "grad_norm": 0.03715480864048004, "learning_rate": 2.3328947368421054e-05, "loss": 0.0044, "step": 180 }, { "epoch": 8.181818181818182, "eval_loss": 0.002913246164098382, "eval_runtime": 0.2291, "eval_samples_per_second": 384.095, "eval_steps_per_second": 48.012, "step": 180 }, { "epoch": 8.227272727272727, "grad_norm": 0.03329971432685852, "learning_rate": 2.3289473684210525e-05, "loss": 0.0043, "step": 181 }, { "epoch": 8.227272727272727, "eval_loss": 0.0028981559444218874, "eval_runtime": 0.2387, "eval_samples_per_second": 368.641, "eval_steps_per_second": 46.08, "step": 181 }, { "epoch": 8.272727272727273, "grad_norm": 0.036768488585948944, "learning_rate": 2.3250000000000003e-05, "loss": 0.0043, "step": 182 }, { "epoch": 8.272727272727273, "eval_loss": 0.002883592387661338, "eval_runtime": 0.2382, "eval_samples_per_second": 369.423, "eval_steps_per_second": 46.178, "step": 182 }, { "epoch": 8.318181818181818, "grad_norm": 0.03704945370554924, "learning_rate": 2.3210526315789473e-05, "loss": 0.0042, "step": 183 }, { "epoch": 8.318181818181818, "eval_loss": 0.0028684174176305532, "eval_runtime": 0.2436, "eval_samples_per_second": 361.18, "eval_steps_per_second": 45.148, "step": 183 }, { "epoch": 8.363636363636363, "grad_norm": 0.038721974939107895, "learning_rate": 2.3171052631578948e-05, "loss": 0.0045, "step": 184 }, { "epoch": 8.363636363636363, "eval_loss": 0.002850217279046774, "eval_runtime": 0.2397, "eval_samples_per_second": 367.125, "eval_steps_per_second": 45.891, "step": 184 }, { "epoch": 8.409090909090908, "grad_norm": 0.0400218740105629, "learning_rate": 2.3131578947368422e-05, "loss": 0.0046, "step": 185 }, { "epoch": 8.409090909090908, "eval_loss": 0.0028304813895374537, "eval_runtime": 0.2401, "eval_samples_per_second": 366.549, "eval_steps_per_second": 45.819, "step": 185 }, { "epoch": 8.454545454545455, "grad_norm": 0.04041934013366699, "learning_rate": 2.3092105263157896e-05, "loss": 0.0047, "step": 186 }, { "epoch": 8.454545454545455, "eval_loss": 0.0028114623855799437, "eval_runtime": 0.2373, "eval_samples_per_second": 370.773, "eval_steps_per_second": 46.347, "step": 186 }, { "epoch": 8.5, "grad_norm": 0.03471284359693527, "learning_rate": 2.3052631578947367e-05, "loss": 0.0042, "step": 187 }, { "epoch": 8.5, "eval_loss": 0.002793875988572836, "eval_runtime": 0.2482, "eval_samples_per_second": 354.499, "eval_steps_per_second": 44.312, "step": 187 }, { "epoch": 8.545454545454545, "grad_norm": 0.044632624834775925, "learning_rate": 2.3013157894736844e-05, "loss": 0.0048, "step": 188 }, { "epoch": 8.545454545454545, "eval_loss": 0.0027756269555538893, "eval_runtime": 0.2261, "eval_samples_per_second": 389.244, "eval_steps_per_second": 48.656, "step": 188 }, { "epoch": 8.590909090909092, "grad_norm": 0.039824243634939194, "learning_rate": 2.297368421052632e-05, "loss": 0.0044, "step": 189 }, { "epoch": 8.590909090909092, "eval_loss": 0.00275724777020514, "eval_runtime": 0.2454, "eval_samples_per_second": 358.66, "eval_steps_per_second": 44.832, "step": 189 }, { "epoch": 8.636363636363637, "grad_norm": 0.03765185549855232, "learning_rate": 2.293421052631579e-05, "loss": 0.0046, "step": 190 }, { "epoch": 8.636363636363637, "eval_loss": 0.002737644361332059, "eval_runtime": 0.2301, "eval_samples_per_second": 382.383, "eval_steps_per_second": 47.798, "step": 190 }, { "epoch": 8.681818181818182, "grad_norm": 0.04460470378398895, "learning_rate": 2.2894736842105263e-05, "loss": 0.0049, "step": 191 }, { "epoch": 8.681818181818182, "eval_loss": 0.002716499613597989, "eval_runtime": 0.2404, "eval_samples_per_second": 366.123, "eval_steps_per_second": 45.765, "step": 191 }, { "epoch": 8.727272727272727, "grad_norm": 0.04597329720854759, "learning_rate": 2.2855263157894737e-05, "loss": 0.0046, "step": 192 }, { "epoch": 8.727272727272727, "eval_loss": 0.002695793053135276, "eval_runtime": 0.2287, "eval_samples_per_second": 384.748, "eval_steps_per_second": 48.093, "step": 192 }, { "epoch": 8.772727272727273, "grad_norm": 0.04175286740064621, "learning_rate": 2.281578947368421e-05, "loss": 0.0048, "step": 193 }, { "epoch": 8.772727272727273, "eval_loss": 0.0026768320240080357, "eval_runtime": 0.2297, "eval_samples_per_second": 383.191, "eval_steps_per_second": 47.899, "step": 193 }, { "epoch": 8.818181818181818, "grad_norm": 0.03605563938617706, "learning_rate": 2.2776315789473682e-05, "loss": 0.0042, "step": 194 }, { "epoch": 8.818181818181818, "eval_loss": 0.0026587164029479027, "eval_runtime": 0.2319, "eval_samples_per_second": 379.432, "eval_steps_per_second": 47.429, "step": 194 }, { "epoch": 8.863636363636363, "grad_norm": 0.03600858151912689, "learning_rate": 2.273684210526316e-05, "loss": 0.004, "step": 195 }, { "epoch": 8.863636363636363, "eval_loss": 0.0026421842630952597, "eval_runtime": 0.2375, "eval_samples_per_second": 370.592, "eval_steps_per_second": 46.324, "step": 195 }, { "epoch": 8.909090909090908, "grad_norm": 0.04040640592575073, "learning_rate": 2.269736842105263e-05, "loss": 0.0046, "step": 196 }, { "epoch": 8.909090909090908, "eval_loss": 0.002626256085932255, "eval_runtime": 0.5446, "eval_samples_per_second": 161.597, "eval_steps_per_second": 20.2, "step": 196 }, { "epoch": 8.954545454545455, "grad_norm": 0.04418746754527092, "learning_rate": 2.2657894736842105e-05, "loss": 0.0042, "step": 197 }, { "epoch": 8.954545454545455, "eval_loss": 0.002609600778669119, "eval_runtime": 0.233, "eval_samples_per_second": 377.684, "eval_steps_per_second": 47.211, "step": 197 }, { "epoch": 9.0, "grad_norm": 0.04399528354406357, "learning_rate": 2.261842105263158e-05, "loss": 0.0044, "step": 198 }, { "epoch": 9.0, "eval_loss": 0.0025943187065422535, "eval_runtime": 0.3847, "eval_samples_per_second": 228.728, "eval_steps_per_second": 28.591, "step": 198 }, { "epoch": 9.045454545454545, "grad_norm": 0.04438379034399986, "learning_rate": 2.2578947368421053e-05, "loss": 0.0045, "step": 199 }, { "epoch": 9.045454545454545, "eval_loss": 0.0025796808768063784, "eval_runtime": 0.4906, "eval_samples_per_second": 179.357, "eval_steps_per_second": 22.42, "step": 199 }, { "epoch": 9.090909090909092, "grad_norm": 0.03908194229006767, "learning_rate": 2.2539473684210524e-05, "loss": 0.0045, "step": 200 }, { "epoch": 9.090909090909092, "eval_loss": 0.002565391594544053, "eval_runtime": 0.2293, "eval_samples_per_second": 383.783, "eval_steps_per_second": 47.973, "step": 200 }, { "epoch": 9.136363636363637, "grad_norm": 0.03590917959809303, "learning_rate": 2.25e-05, "loss": 0.0044, "step": 201 }, { "epoch": 9.136363636363637, "eval_loss": 0.0025517421308904886, "eval_runtime": 0.2317, "eval_samples_per_second": 379.855, "eval_steps_per_second": 47.482, "step": 201 }, { "epoch": 9.181818181818182, "grad_norm": 0.0374373197555542, "learning_rate": 2.2460526315789476e-05, "loss": 0.0039, "step": 202 }, { "epoch": 9.181818181818182, "eval_loss": 0.0025385154876857996, "eval_runtime": 0.232, "eval_samples_per_second": 379.247, "eval_steps_per_second": 47.406, "step": 202 }, { "epoch": 9.227272727272727, "grad_norm": 0.03761666640639305, "learning_rate": 2.2421052631578946e-05, "loss": 0.004, "step": 203 }, { "epoch": 9.227272727272727, "eval_loss": 0.0025266585871577263, "eval_runtime": 0.2386, "eval_samples_per_second": 368.745, "eval_steps_per_second": 46.093, "step": 203 }, { "epoch": 9.272727272727273, "grad_norm": 0.033979009836912155, "learning_rate": 2.2381578947368424e-05, "loss": 0.004, "step": 204 }, { "epoch": 9.272727272727273, "eval_loss": 0.0025138070341199636, "eval_runtime": 0.2314, "eval_samples_per_second": 380.28, "eval_steps_per_second": 47.535, "step": 204 }, { "epoch": 9.318181818181818, "grad_norm": 0.054837603121995926, "learning_rate": 2.2342105263157895e-05, "loss": 0.0042, "step": 205 }, { "epoch": 9.318181818181818, "eval_loss": 0.002499848371371627, "eval_runtime": 0.227, "eval_samples_per_second": 387.733, "eval_steps_per_second": 48.467, "step": 205 }, { "epoch": 9.363636363636363, "grad_norm": 0.03884384036064148, "learning_rate": 2.230263157894737e-05, "loss": 0.0043, "step": 206 }, { "epoch": 9.363636363636363, "eval_loss": 0.0024857125245034695, "eval_runtime": 0.2294, "eval_samples_per_second": 383.548, "eval_steps_per_second": 47.944, "step": 206 }, { "epoch": 9.409090909090908, "grad_norm": 0.03517827019095421, "learning_rate": 2.2263157894736843e-05, "loss": 0.004, "step": 207 }, { "epoch": 9.409090909090908, "eval_loss": 0.00247101578861475, "eval_runtime": 0.2336, "eval_samples_per_second": 376.726, "eval_steps_per_second": 47.091, "step": 207 }, { "epoch": 9.454545454545455, "grad_norm": 0.04209022969007492, "learning_rate": 2.2223684210526317e-05, "loss": 0.0041, "step": 208 }, { "epoch": 9.454545454545455, "eval_loss": 0.0024564675986766815, "eval_runtime": 0.242, "eval_samples_per_second": 363.631, "eval_steps_per_second": 45.454, "step": 208 }, { "epoch": 9.5, "grad_norm": 0.04031739383935928, "learning_rate": 2.2184210526315788e-05, "loss": 0.0042, "step": 209 }, { "epoch": 9.5, "eval_loss": 0.002442182507365942, "eval_runtime": 0.2384, "eval_samples_per_second": 369.056, "eval_steps_per_second": 46.132, "step": 209 }, { "epoch": 9.545454545454545, "grad_norm": 0.03341998904943466, "learning_rate": 2.2144736842105265e-05, "loss": 0.0038, "step": 210 }, { "epoch": 9.545454545454545, "eval_loss": 0.0024283959064632654, "eval_runtime": 0.2386, "eval_samples_per_second": 368.766, "eval_steps_per_second": 46.096, "step": 210 }, { "epoch": 9.590909090909092, "grad_norm": 0.033409975469112396, "learning_rate": 2.2105263157894736e-05, "loss": 0.0037, "step": 211 }, { "epoch": 9.590909090909092, "eval_loss": 0.002414784161373973, "eval_runtime": 0.2392, "eval_samples_per_second": 367.843, "eval_steps_per_second": 45.98, "step": 211 }, { "epoch": 9.636363636363637, "grad_norm": 0.038544610142707825, "learning_rate": 2.206578947368421e-05, "loss": 0.0042, "step": 212 }, { "epoch": 9.636363636363637, "eval_loss": 0.0024007910396903753, "eval_runtime": 0.2355, "eval_samples_per_second": 373.655, "eval_steps_per_second": 46.707, "step": 212 }, { "epoch": 9.681818181818182, "grad_norm": 0.031284794211387634, "learning_rate": 2.2026315789473684e-05, "loss": 0.0039, "step": 213 }, { "epoch": 9.681818181818182, "eval_loss": 0.00238687708042562, "eval_runtime": 0.2461, "eval_samples_per_second": 357.651, "eval_steps_per_second": 44.706, "step": 213 }, { "epoch": 9.727272727272727, "grad_norm": 0.03589053079485893, "learning_rate": 2.198684210526316e-05, "loss": 0.004, "step": 214 }, { "epoch": 9.727272727272727, "eval_loss": 0.002372899791225791, "eval_runtime": 0.2388, "eval_samples_per_second": 368.519, "eval_steps_per_second": 46.065, "step": 214 }, { "epoch": 9.772727272727273, "grad_norm": 0.03422442823648453, "learning_rate": 2.1947368421052633e-05, "loss": 0.0037, "step": 215 }, { "epoch": 9.772727272727273, "eval_loss": 0.0023599357809871435, "eval_runtime": 0.2324, "eval_samples_per_second": 378.632, "eval_steps_per_second": 47.329, "step": 215 }, { "epoch": 9.818181818181818, "grad_norm": 0.03365776687860489, "learning_rate": 2.1907894736842107e-05, "loss": 0.0035, "step": 216 }, { "epoch": 9.818181818181818, "eval_loss": 0.0023472688626497984, "eval_runtime": 0.231, "eval_samples_per_second": 380.916, "eval_steps_per_second": 47.614, "step": 216 }, { "epoch": 9.863636363636363, "grad_norm": 0.030327491462230682, "learning_rate": 2.186842105263158e-05, "loss": 0.0037, "step": 217 }, { "epoch": 9.863636363636363, "eval_loss": 0.0023344962392002344, "eval_runtime": 0.229, "eval_samples_per_second": 384.224, "eval_steps_per_second": 48.028, "step": 217 }, { "epoch": 9.909090909090908, "grad_norm": 0.039349548518657684, "learning_rate": 2.1828947368421052e-05, "loss": 0.004, "step": 218 }, { "epoch": 9.909090909090908, "eval_loss": 0.0023220828734338284, "eval_runtime": 0.228, "eval_samples_per_second": 385.959, "eval_steps_per_second": 48.245, "step": 218 }, { "epoch": 9.954545454545455, "grad_norm": 0.03199224919080734, "learning_rate": 2.178947368421053e-05, "loss": 0.0034, "step": 219 }, { "epoch": 9.954545454545455, "eval_loss": 0.0023102990817278624, "eval_runtime": 0.2311, "eval_samples_per_second": 380.788, "eval_steps_per_second": 47.598, "step": 219 }, { "epoch": 10.0, "grad_norm": 0.03278977796435356, "learning_rate": 2.175e-05, "loss": 0.0036, "step": 220 }, { "epoch": 10.0, "eval_loss": 0.002298795385286212, "eval_runtime": 0.3275, "eval_samples_per_second": 268.678, "eval_steps_per_second": 33.585, "step": 220 }, { "epoch": 10.045454545454545, "grad_norm": 0.0341983363032341, "learning_rate": 2.1710526315789474e-05, "loss": 0.0039, "step": 221 }, { "epoch": 10.045454545454545, "eval_loss": 0.0022870004177093506, "eval_runtime": 0.3861, "eval_samples_per_second": 227.931, "eval_steps_per_second": 28.491, "step": 221 }, { "epoch": 10.090909090909092, "grad_norm": 0.03134067356586456, "learning_rate": 2.167105263157895e-05, "loss": 0.0038, "step": 222 }, { "epoch": 10.090909090909092, "eval_loss": 0.002274780999869108, "eval_runtime": 0.2973, "eval_samples_per_second": 296.022, "eval_steps_per_second": 37.003, "step": 222 }, { "epoch": 10.136363636363637, "grad_norm": 0.03246266394853592, "learning_rate": 2.1631578947368423e-05, "loss": 0.0035, "step": 223 }, { "epoch": 10.136363636363637, "eval_loss": 0.002262603724375367, "eval_runtime": 0.2788, "eval_samples_per_second": 315.607, "eval_steps_per_second": 39.451, "step": 223 }, { "epoch": 10.181818181818182, "grad_norm": 0.035311244428157806, "learning_rate": 2.1592105263157893e-05, "loss": 0.0036, "step": 224 }, { "epoch": 10.181818181818182, "eval_loss": 0.002250505844131112, "eval_runtime": 0.3259, "eval_samples_per_second": 270.042, "eval_steps_per_second": 33.755, "step": 224 }, { "epoch": 10.227272727272727, "grad_norm": 0.03288138657808304, "learning_rate": 2.155263157894737e-05, "loss": 0.0039, "step": 225 }, { "epoch": 10.227272727272727, "eval_loss": 0.0022388615179806948, "eval_runtime": 0.3627, "eval_samples_per_second": 242.648, "eval_steps_per_second": 30.331, "step": 225 }, { "epoch": 10.272727272727273, "grad_norm": 0.032804686576128006, "learning_rate": 2.151315789473684e-05, "loss": 0.0038, "step": 226 }, { "epoch": 10.272727272727273, "eval_loss": 0.0022277701646089554, "eval_runtime": 0.4861, "eval_samples_per_second": 181.023, "eval_steps_per_second": 22.628, "step": 226 }, { "epoch": 10.318181818181818, "grad_norm": 0.036528490483760834, "learning_rate": 2.1473684210526316e-05, "loss": 0.004, "step": 227 }, { "epoch": 10.318181818181818, "eval_loss": 0.0022167994175106287, "eval_runtime": 0.3048, "eval_samples_per_second": 288.714, "eval_steps_per_second": 36.089, "step": 227 }, { "epoch": 10.363636363636363, "grad_norm": 0.029931485652923584, "learning_rate": 2.143421052631579e-05, "loss": 0.0036, "step": 228 }, { "epoch": 10.363636363636363, "eval_loss": 0.002205794909968972, "eval_runtime": 0.2918, "eval_samples_per_second": 301.612, "eval_steps_per_second": 37.701, "step": 228 }, { "epoch": 10.409090909090908, "grad_norm": 0.03588961437344551, "learning_rate": 2.1394736842105264e-05, "loss": 0.0039, "step": 229 }, { "epoch": 10.409090909090908, "eval_loss": 0.0021950947120785713, "eval_runtime": 0.2407, "eval_samples_per_second": 365.554, "eval_steps_per_second": 45.694, "step": 229 }, { "epoch": 10.454545454545455, "grad_norm": 0.033503517508506775, "learning_rate": 2.1355263157894738e-05, "loss": 0.0036, "step": 230 }, { "epoch": 10.454545454545455, "eval_loss": 0.0021843963768333197, "eval_runtime": 0.2737, "eval_samples_per_second": 321.531, "eval_steps_per_second": 40.191, "step": 230 }, { "epoch": 10.5, "grad_norm": 0.032428622245788574, "learning_rate": 2.1315789473684212e-05, "loss": 0.0035, "step": 231 }, { "epoch": 10.5, "eval_loss": 0.002173727611079812, "eval_runtime": 0.4053, "eval_samples_per_second": 217.137, "eval_steps_per_second": 27.142, "step": 231 }, { "epoch": 10.545454545454545, "grad_norm": 0.0326942577958107, "learning_rate": 2.1276315789473687e-05, "loss": 0.0035, "step": 232 }, { "epoch": 10.545454545454545, "eval_loss": 0.0021637016907334328, "eval_runtime": 0.7117, "eval_samples_per_second": 123.656, "eval_steps_per_second": 15.457, "step": 232 }, { "epoch": 10.590909090909092, "grad_norm": 0.03240852802991867, "learning_rate": 2.1236842105263157e-05, "loss": 0.0034, "step": 233 }, { "epoch": 10.590909090909092, "eval_loss": 0.002153951907530427, "eval_runtime": 0.2454, "eval_samples_per_second": 358.581, "eval_steps_per_second": 44.823, "step": 233 }, { "epoch": 10.636363636363637, "grad_norm": 0.029470907524228096, "learning_rate": 2.119736842105263e-05, "loss": 0.0035, "step": 234 }, { "epoch": 10.636363636363637, "eval_loss": 0.002144550671800971, "eval_runtime": 0.2443, "eval_samples_per_second": 360.165, "eval_steps_per_second": 45.021, "step": 234 }, { "epoch": 10.681818181818182, "grad_norm": 0.02820722572505474, "learning_rate": 2.1157894736842106e-05, "loss": 0.0034, "step": 235 }, { "epoch": 10.681818181818182, "eval_loss": 0.002135734772309661, "eval_runtime": 0.2643, "eval_samples_per_second": 333.008, "eval_steps_per_second": 41.626, "step": 235 }, { "epoch": 10.727272727272727, "grad_norm": 0.02772766724228859, "learning_rate": 2.111842105263158e-05, "loss": 0.0033, "step": 236 }, { "epoch": 10.727272727272727, "eval_loss": 0.0021269202698022127, "eval_runtime": 0.2751, "eval_samples_per_second": 319.882, "eval_steps_per_second": 39.985, "step": 236 }, { "epoch": 10.772727272727273, "grad_norm": 0.03653711825609207, "learning_rate": 2.107894736842105e-05, "loss": 0.0038, "step": 237 }, { "epoch": 10.772727272727273, "eval_loss": 0.0021178810857236385, "eval_runtime": 0.227, "eval_samples_per_second": 387.716, "eval_steps_per_second": 48.465, "step": 237 }, { "epoch": 10.818181818181818, "grad_norm": 0.03011268563568592, "learning_rate": 2.1039473684210528e-05, "loss": 0.0035, "step": 238 }, { "epoch": 10.818181818181818, "eval_loss": 0.002109181135892868, "eval_runtime": 0.2398, "eval_samples_per_second": 366.897, "eval_steps_per_second": 45.862, "step": 238 }, { "epoch": 10.863636363636363, "grad_norm": 0.025909798219799995, "learning_rate": 2.1e-05, "loss": 0.003, "step": 239 }, { "epoch": 10.863636363636363, "eval_loss": 0.0021006783936172724, "eval_runtime": 0.2342, "eval_samples_per_second": 375.674, "eval_steps_per_second": 46.959, "step": 239 }, { "epoch": 10.909090909090908, "grad_norm": 0.02720109187066555, "learning_rate": 2.0960526315789473e-05, "loss": 0.0033, "step": 240 }, { "epoch": 10.909090909090908, "eval_loss": 0.002092132344841957, "eval_runtime": 0.2362, "eval_samples_per_second": 372.632, "eval_steps_per_second": 46.579, "step": 240 }, { "epoch": 10.954545454545455, "grad_norm": 0.03358568996191025, "learning_rate": 2.0921052631578947e-05, "loss": 0.0034, "step": 241 }, { "epoch": 10.954545454545455, "eval_loss": 0.0020830982830375433, "eval_runtime": 0.2268, "eval_samples_per_second": 387.964, "eval_steps_per_second": 48.496, "step": 241 }, { "epoch": 11.0, "grad_norm": 0.030720144510269165, "learning_rate": 2.088157894736842e-05, "loss": 0.0036, "step": 242 }, { "epoch": 11.0, "eval_loss": 0.002074107527732849, "eval_runtime": 0.2253, "eval_samples_per_second": 390.639, "eval_steps_per_second": 48.83, "step": 242 }, { "epoch": 11.045454545454545, "grad_norm": 0.029408905655145645, "learning_rate": 2.0842105263157895e-05, "loss": 0.0035, "step": 243 }, { "epoch": 11.045454545454545, "eval_loss": 0.0020653316751122475, "eval_runtime": 0.234, "eval_samples_per_second": 376.079, "eval_steps_per_second": 47.01, "step": 243 }, { "epoch": 11.090909090909092, "grad_norm": 0.02971459925174713, "learning_rate": 2.080263157894737e-05, "loss": 0.0034, "step": 244 }, { "epoch": 11.090909090909092, "eval_loss": 0.0020563837606459856, "eval_runtime": 0.2306, "eval_samples_per_second": 381.673, "eval_steps_per_second": 47.709, "step": 244 }, { "epoch": 11.136363636363637, "grad_norm": 0.028164513409137726, "learning_rate": 2.0763157894736844e-05, "loss": 0.0034, "step": 245 }, { "epoch": 11.136363636363637, "eval_loss": 0.0020477415528148413, "eval_runtime": 0.2363, "eval_samples_per_second": 372.455, "eval_steps_per_second": 46.557, "step": 245 }, { "epoch": 11.181818181818182, "grad_norm": 0.027845608070492744, "learning_rate": 2.0723684210526315e-05, "loss": 0.0034, "step": 246 }, { "epoch": 11.181818181818182, "eval_loss": 0.002039202954620123, "eval_runtime": 0.2314, "eval_samples_per_second": 380.293, "eval_steps_per_second": 47.537, "step": 246 }, { "epoch": 11.227272727272727, "grad_norm": 0.03046409972012043, "learning_rate": 2.0684210526315792e-05, "loss": 0.0035, "step": 247 }, { "epoch": 11.227272727272727, "eval_loss": 0.0020310634281486273, "eval_runtime": 0.2258, "eval_samples_per_second": 389.786, "eval_steps_per_second": 48.723, "step": 247 }, { "epoch": 11.272727272727273, "grad_norm": 0.025676798075437546, "learning_rate": 2.0644736842105263e-05, "loss": 0.0031, "step": 248 }, { "epoch": 11.272727272727273, "eval_loss": 0.0020227304194122553, "eval_runtime": 0.2266, "eval_samples_per_second": 388.395, "eval_steps_per_second": 48.549, "step": 248 }, { "epoch": 11.318181818181818, "grad_norm": 0.029285188764333725, "learning_rate": 2.0605263157894737e-05, "loss": 0.0036, "step": 249 }, { "epoch": 11.318181818181818, "eval_loss": 0.0020139189437031746, "eval_runtime": 0.2399, "eval_samples_per_second": 366.874, "eval_steps_per_second": 45.859, "step": 249 }, { "epoch": 11.363636363636363, "grad_norm": 0.03067379631102085, "learning_rate": 2.056578947368421e-05, "loss": 0.0033, "step": 250 }, { "epoch": 11.363636363636363, "eval_loss": 0.0020049491431564093, "eval_runtime": 0.2296, "eval_samples_per_second": 383.216, "eval_steps_per_second": 47.902, "step": 250 }, { "epoch": 11.409090909090908, "grad_norm": 0.030429691076278687, "learning_rate": 2.0526315789473685e-05, "loss": 0.0034, "step": 251 }, { "epoch": 11.409090909090908, "eval_loss": 0.0019955127499997616, "eval_runtime": 0.3825, "eval_samples_per_second": 230.047, "eval_steps_per_second": 28.756, "step": 251 }, { "epoch": 11.454545454545455, "grad_norm": 0.03006516583263874, "learning_rate": 2.0486842105263156e-05, "loss": 0.0032, "step": 252 }, { "epoch": 11.454545454545455, "eval_loss": 0.001985815353691578, "eval_runtime": 0.5232, "eval_samples_per_second": 168.209, "eval_steps_per_second": 21.026, "step": 252 }, { "epoch": 11.5, "grad_norm": 0.03021743707358837, "learning_rate": 2.0447368421052634e-05, "loss": 0.0035, "step": 253 }, { "epoch": 11.5, "eval_loss": 0.001975873252376914, "eval_runtime": 0.5816, "eval_samples_per_second": 151.301, "eval_steps_per_second": 18.913, "step": 253 }, { "epoch": 11.545454545454545, "grad_norm": 0.026514986529946327, "learning_rate": 2.0407894736842104e-05, "loss": 0.0032, "step": 254 }, { "epoch": 11.545454545454545, "eval_loss": 0.0019660864491015673, "eval_runtime": 0.2403, "eval_samples_per_second": 366.164, "eval_steps_per_second": 45.77, "step": 254 }, { "epoch": 11.590909090909092, "grad_norm": 0.028690319508314133, "learning_rate": 2.036842105263158e-05, "loss": 0.0033, "step": 255 }, { "epoch": 11.590909090909092, "eval_loss": 0.0019563750829547644, "eval_runtime": 0.2248, "eval_samples_per_second": 391.417, "eval_steps_per_second": 48.927, "step": 255 }, { "epoch": 11.636363636363637, "grad_norm": 0.03033028170466423, "learning_rate": 2.0328947368421056e-05, "loss": 0.0034, "step": 256 }, { "epoch": 11.636363636363637, "eval_loss": 0.0019468939863145351, "eval_runtime": 0.2311, "eval_samples_per_second": 380.835, "eval_steps_per_second": 47.604, "step": 256 }, { "epoch": 11.681818181818182, "grad_norm": 0.03320786729454994, "learning_rate": 2.0289473684210527e-05, "loss": 0.0035, "step": 257 }, { "epoch": 11.681818181818182, "eval_loss": 0.0019374135881662369, "eval_runtime": 0.2307, "eval_samples_per_second": 381.512, "eval_steps_per_second": 47.689, "step": 257 }, { "epoch": 11.727272727272727, "grad_norm": 0.027468524873256683, "learning_rate": 2.025e-05, "loss": 0.0031, "step": 258 }, { "epoch": 11.727272727272727, "eval_loss": 0.0019284605514258146, "eval_runtime": 0.2303, "eval_samples_per_second": 382.049, "eval_steps_per_second": 47.756, "step": 258 }, { "epoch": 11.772727272727273, "grad_norm": 0.02426382340490818, "learning_rate": 2.0210526315789475e-05, "loss": 0.0029, "step": 259 }, { "epoch": 11.772727272727273, "eval_loss": 0.0019197481451556087, "eval_runtime": 0.23, "eval_samples_per_second": 382.529, "eval_steps_per_second": 47.816, "step": 259 }, { "epoch": 11.818181818181818, "grad_norm": 0.028253108263015747, "learning_rate": 2.017105263157895e-05, "loss": 0.003, "step": 260 }, { "epoch": 11.818181818181818, "eval_loss": 0.0019117832416668534, "eval_runtime": 0.2345, "eval_samples_per_second": 375.238, "eval_steps_per_second": 46.905, "step": 260 }, { "epoch": 11.863636363636363, "grad_norm": 0.03305625915527344, "learning_rate": 2.013157894736842e-05, "loss": 0.0034, "step": 261 }, { "epoch": 11.863636363636363, "eval_loss": 0.0019041887717321515, "eval_runtime": 0.2239, "eval_samples_per_second": 393.025, "eval_steps_per_second": 49.128, "step": 261 }, { "epoch": 11.909090909090908, "grad_norm": 0.027725212275981903, "learning_rate": 2.0092105263157898e-05, "loss": 0.0033, "step": 262 }, { "epoch": 11.909090909090908, "eval_loss": 0.0018966187490150332, "eval_runtime": 0.2303, "eval_samples_per_second": 382.148, "eval_steps_per_second": 47.769, "step": 262 }, { "epoch": 11.954545454545455, "grad_norm": 0.02550244890153408, "learning_rate": 2.0052631578947368e-05, "loss": 0.0032, "step": 263 }, { "epoch": 11.954545454545455, "eval_loss": 0.0018891972722485662, "eval_runtime": 0.2274, "eval_samples_per_second": 386.939, "eval_steps_per_second": 48.367, "step": 263 }, { "epoch": 12.0, "grad_norm": 0.02780972793698311, "learning_rate": 2.0013157894736842e-05, "loss": 0.0034, "step": 264 }, { "epoch": 12.0, "eval_loss": 0.001881771837361157, "eval_runtime": 0.2332, "eval_samples_per_second": 377.388, "eval_steps_per_second": 47.174, "step": 264 }, { "epoch": 12.045454545454545, "grad_norm": 0.03385490924119949, "learning_rate": 1.9973684210526317e-05, "loss": 0.0034, "step": 265 }, { "epoch": 12.045454545454545, "eval_loss": 0.001874623354524374, "eval_runtime": 0.2413, "eval_samples_per_second": 364.627, "eval_steps_per_second": 45.578, "step": 265 }, { "epoch": 12.090909090909092, "grad_norm": 0.029128815978765488, "learning_rate": 1.993421052631579e-05, "loss": 0.003, "step": 266 }, { "epoch": 12.090909090909092, "eval_loss": 0.0018677938496693969, "eval_runtime": 0.235, "eval_samples_per_second": 374.427, "eval_steps_per_second": 46.803, "step": 266 }, { "epoch": 12.136363636363637, "grad_norm": 0.025781184434890747, "learning_rate": 1.989473684210526e-05, "loss": 0.0031, "step": 267 }, { "epoch": 12.136363636363637, "eval_loss": 0.001861188909970224, "eval_runtime": 0.2382, "eval_samples_per_second": 369.363, "eval_steps_per_second": 46.17, "step": 267 }, { "epoch": 12.181818181818182, "grad_norm": 0.0294223353266716, "learning_rate": 1.985526315789474e-05, "loss": 0.0033, "step": 268 }, { "epoch": 12.181818181818182, "eval_loss": 0.001854045782238245, "eval_runtime": 0.2289, "eval_samples_per_second": 384.52, "eval_steps_per_second": 48.065, "step": 268 }, { "epoch": 12.227272727272727, "grad_norm": 0.028326552361249924, "learning_rate": 1.9815789473684213e-05, "loss": 0.003, "step": 269 }, { "epoch": 12.227272727272727, "eval_loss": 0.0018470593495294452, "eval_runtime": 0.2289, "eval_samples_per_second": 384.399, "eval_steps_per_second": 48.05, "step": 269 }, { "epoch": 12.272727272727273, "grad_norm": 0.030360590666532516, "learning_rate": 1.9776315789473684e-05, "loss": 0.0031, "step": 270 }, { "epoch": 12.272727272727273, "eval_loss": 0.0018398199463263154, "eval_runtime": 0.2311, "eval_samples_per_second": 380.8, "eval_steps_per_second": 47.6, "step": 270 }, { "epoch": 12.318181818181818, "grad_norm": 0.02833518758416176, "learning_rate": 1.9736842105263158e-05, "loss": 0.0034, "step": 271 }, { "epoch": 12.318181818181818, "eval_loss": 0.0018325141863897443, "eval_runtime": 0.233, "eval_samples_per_second": 377.758, "eval_steps_per_second": 47.22, "step": 271 }, { "epoch": 12.363636363636363, "grad_norm": 0.029960816726088524, "learning_rate": 1.9697368421052632e-05, "loss": 0.0032, "step": 272 }, { "epoch": 12.363636363636363, "eval_loss": 0.0018252148292958736, "eval_runtime": 0.231, "eval_samples_per_second": 381.016, "eval_steps_per_second": 47.627, "step": 272 }, { "epoch": 12.409090909090908, "grad_norm": 0.027226990088820457, "learning_rate": 1.9657894736842106e-05, "loss": 0.0029, "step": 273 }, { "epoch": 12.409090909090908, "eval_loss": 0.0018177549354732037, "eval_runtime": 0.233, "eval_samples_per_second": 377.605, "eval_steps_per_second": 47.201, "step": 273 }, { "epoch": 12.454545454545455, "grad_norm": 0.02402249164879322, "learning_rate": 1.9618421052631577e-05, "loss": 0.0029, "step": 274 }, { "epoch": 12.454545454545455, "eval_loss": 0.0018104868941009045, "eval_runtime": 0.2464, "eval_samples_per_second": 357.208, "eval_steps_per_second": 44.651, "step": 274 }, { "epoch": 12.5, "grad_norm": 0.025068577378988266, "learning_rate": 1.9578947368421055e-05, "loss": 0.003, "step": 275 }, { "epoch": 12.5, "eval_loss": 0.0018031727522611618, "eval_runtime": 0.2561, "eval_samples_per_second": 343.628, "eval_steps_per_second": 42.953, "step": 275 }, { "epoch": 12.545454545454545, "grad_norm": 0.03290198743343353, "learning_rate": 1.9539473684210525e-05, "loss": 0.0032, "step": 276 }, { "epoch": 12.545454545454545, "eval_loss": 0.0017959319520741701, "eval_runtime": 0.2473, "eval_samples_per_second": 355.844, "eval_steps_per_second": 44.48, "step": 276 }, { "epoch": 12.590909090909092, "grad_norm": 0.025103066116571426, "learning_rate": 1.95e-05, "loss": 0.0028, "step": 277 }, { "epoch": 12.590909090909092, "eval_loss": 0.0017883635591715574, "eval_runtime": 0.2312, "eval_samples_per_second": 380.663, "eval_steps_per_second": 47.583, "step": 277 }, { "epoch": 12.636363636363637, "grad_norm": 0.02768297679722309, "learning_rate": 1.9460526315789474e-05, "loss": 0.003, "step": 278 }, { "epoch": 12.636363636363637, "eval_loss": 0.0017810885328799486, "eval_runtime": 0.2411, "eval_samples_per_second": 365.033, "eval_steps_per_second": 45.629, "step": 278 }, { "epoch": 12.681818181818182, "grad_norm": 0.026979558169841766, "learning_rate": 1.9421052631578948e-05, "loss": 0.0033, "step": 279 }, { "epoch": 12.681818181818182, "eval_loss": 0.0017738312017172575, "eval_runtime": 0.2981, "eval_samples_per_second": 295.202, "eval_steps_per_second": 36.9, "step": 279 }, { "epoch": 12.727272727272727, "grad_norm": 0.025757014751434326, "learning_rate": 1.938157894736842e-05, "loss": 0.0031, "step": 280 }, { "epoch": 12.727272727272727, "eval_loss": 0.0017666955245658755, "eval_runtime": 0.2467, "eval_samples_per_second": 356.773, "eval_steps_per_second": 44.597, "step": 280 }, { "epoch": 12.772727272727273, "grad_norm": 0.026617391034960747, "learning_rate": 1.9342105263157896e-05, "loss": 0.003, "step": 281 }, { "epoch": 12.772727272727273, "eval_loss": 0.0017593905795365572, "eval_runtime": 0.2388, "eval_samples_per_second": 368.469, "eval_steps_per_second": 46.059, "step": 281 }, { "epoch": 12.818181818181818, "grad_norm": 0.027713097631931305, "learning_rate": 1.9302631578947367e-05, "loss": 0.0028, "step": 282 }, { "epoch": 12.818181818181818, "eval_loss": 0.0017523803981021047, "eval_runtime": 0.2531, "eval_samples_per_second": 347.71, "eval_steps_per_second": 43.464, "step": 282 }, { "epoch": 12.863636363636363, "grad_norm": 0.021941719576716423, "learning_rate": 1.926315789473684e-05, "loss": 0.0028, "step": 283 }, { "epoch": 12.863636363636363, "eval_loss": 0.0017456583445891738, "eval_runtime": 0.2275, "eval_samples_per_second": 386.831, "eval_steps_per_second": 48.354, "step": 283 }, { "epoch": 12.909090909090908, "grad_norm": 0.029443973675370216, "learning_rate": 1.922368421052632e-05, "loss": 0.0029, "step": 284 }, { "epoch": 12.909090909090908, "eval_loss": 0.0017391174333170056, "eval_runtime": 0.2259, "eval_samples_per_second": 389.61, "eval_steps_per_second": 48.701, "step": 284 }, { "epoch": 12.954545454545455, "grad_norm": 0.023187711834907532, "learning_rate": 1.918421052631579e-05, "loss": 0.0027, "step": 285 }, { "epoch": 12.954545454545455, "eval_loss": 0.0017328561516478658, "eval_runtime": 0.2229, "eval_samples_per_second": 394.794, "eval_steps_per_second": 49.349, "step": 285 }, { "epoch": 13.0, "grad_norm": 0.02683272212743759, "learning_rate": 1.9144736842105264e-05, "loss": 0.0028, "step": 286 }, { "epoch": 13.0, "eval_loss": 0.0017264141933992505, "eval_runtime": 0.2281, "eval_samples_per_second": 385.759, "eval_steps_per_second": 48.22, "step": 286 }, { "epoch": 13.045454545454545, "grad_norm": 0.026485104113817215, "learning_rate": 1.9105263157894738e-05, "loss": 0.0029, "step": 287 }, { "epoch": 13.045454545454545, "eval_loss": 0.0017197772394865751, "eval_runtime": 0.2245, "eval_samples_per_second": 392.011, "eval_steps_per_second": 49.001, "step": 287 }, { "epoch": 13.090909090909092, "grad_norm": 0.025229312479496002, "learning_rate": 1.9065789473684212e-05, "loss": 0.0027, "step": 288 }, { "epoch": 13.090909090909092, "eval_loss": 0.0017132211942225695, "eval_runtime": 0.2288, "eval_samples_per_second": 384.654, "eval_steps_per_second": 48.082, "step": 288 }, { "epoch": 13.136363636363637, "grad_norm": 0.026387052610516548, "learning_rate": 1.9026315789473683e-05, "loss": 0.003, "step": 289 }, { "epoch": 13.136363636363637, "eval_loss": 0.001706792158074677, "eval_runtime": 0.2251, "eval_samples_per_second": 390.981, "eval_steps_per_second": 48.873, "step": 289 }, { "epoch": 13.181818181818182, "grad_norm": 0.0232387688010931, "learning_rate": 1.898684210526316e-05, "loss": 0.0028, "step": 290 }, { "epoch": 13.181818181818182, "eval_loss": 0.0017004094552248716, "eval_runtime": 0.2307, "eval_samples_per_second": 381.375, "eval_steps_per_second": 47.672, "step": 290 }, { "epoch": 13.227272727272727, "grad_norm": 0.030720511451363564, "learning_rate": 1.894736842105263e-05, "loss": 0.003, "step": 291 }, { "epoch": 13.227272727272727, "eval_loss": 0.0016942427027970552, "eval_runtime": 0.2316, "eval_samples_per_second": 379.934, "eval_steps_per_second": 47.492, "step": 291 }, { "epoch": 13.272727272727273, "grad_norm": 0.023519422858953476, "learning_rate": 1.8907894736842105e-05, "loss": 0.0025, "step": 292 }, { "epoch": 13.272727272727273, "eval_loss": 0.0016882912022992969, "eval_runtime": 0.2298, "eval_samples_per_second": 383.008, "eval_steps_per_second": 47.876, "step": 292 }, { "epoch": 13.318181818181818, "grad_norm": 0.02608366496860981, "learning_rate": 1.886842105263158e-05, "loss": 0.003, "step": 293 }, { "epoch": 13.318181818181818, "eval_loss": 0.001682400587014854, "eval_runtime": 0.2333, "eval_samples_per_second": 377.26, "eval_steps_per_second": 47.157, "step": 293 }, { "epoch": 13.363636363636363, "grad_norm": 0.02541464753448963, "learning_rate": 1.8828947368421053e-05, "loss": 0.0028, "step": 294 }, { "epoch": 13.363636363636363, "eval_loss": 0.0016764701576903462, "eval_runtime": 0.2276, "eval_samples_per_second": 386.598, "eval_steps_per_second": 48.325, "step": 294 }, { "epoch": 13.409090909090908, "grad_norm": 0.026540333405137062, "learning_rate": 1.8789473684210524e-05, "loss": 0.0028, "step": 295 }, { "epoch": 13.409090909090908, "eval_loss": 0.0016703385626897216, "eval_runtime": 0.2313, "eval_samples_per_second": 380.436, "eval_steps_per_second": 47.554, "step": 295 }, { "epoch": 13.454545454545455, "grad_norm": 0.021979449316859245, "learning_rate": 1.8750000000000002e-05, "loss": 0.0027, "step": 296 }, { "epoch": 13.454545454545455, "eval_loss": 0.0016644434072077274, "eval_runtime": 0.2267, "eval_samples_per_second": 388.248, "eval_steps_per_second": 48.531, "step": 296 }, { "epoch": 13.5, "grad_norm": 0.027137625962495804, "learning_rate": 1.8710526315789476e-05, "loss": 0.0027, "step": 297 }, { "epoch": 13.5, "eval_loss": 0.001658798661082983, "eval_runtime": 0.2286, "eval_samples_per_second": 384.972, "eval_steps_per_second": 48.121, "step": 297 }, { "epoch": 13.545454545454545, "grad_norm": 0.02321833185851574, "learning_rate": 1.8671052631578947e-05, "loss": 0.0027, "step": 298 }, { "epoch": 13.545454545454545, "eval_loss": 0.001653428073041141, "eval_runtime": 0.227, "eval_samples_per_second": 387.714, "eval_steps_per_second": 48.464, "step": 298 }, { "epoch": 13.590909090909092, "grad_norm": 0.028996985405683517, "learning_rate": 1.8631578947368424e-05, "loss": 0.0029, "step": 299 }, { "epoch": 13.590909090909092, "eval_loss": 0.0016476112650707364, "eval_runtime": 0.2299, "eval_samples_per_second": 382.812, "eval_steps_per_second": 47.852, "step": 299 }, { "epoch": 13.636363636363637, "grad_norm": 0.028486257418990135, "learning_rate": 1.8592105263157895e-05, "loss": 0.0027, "step": 300 }, { "epoch": 13.636363636363637, "eval_loss": 0.001642104354687035, "eval_runtime": 0.2398, "eval_samples_per_second": 367.028, "eval_steps_per_second": 45.878, "step": 300 }, { "epoch": 13.681818181818182, "grad_norm": 0.022658037021756172, "learning_rate": 1.855263157894737e-05, "loss": 0.0025, "step": 301 }, { "epoch": 13.681818181818182, "eval_loss": 0.0016368039650842547, "eval_runtime": 0.2377, "eval_samples_per_second": 370.172, "eval_steps_per_second": 46.271, "step": 301 }, { "epoch": 13.727272727272727, "grad_norm": 0.024452779442071915, "learning_rate": 1.8513157894736843e-05, "loss": 0.0028, "step": 302 }, { "epoch": 13.727272727272727, "eval_loss": 0.0016317162662744522, "eval_runtime": 0.2252, "eval_samples_per_second": 390.707, "eval_steps_per_second": 48.838, "step": 302 }, { "epoch": 13.772727272727273, "grad_norm": 0.02014131471514702, "learning_rate": 1.8473684210526317e-05, "loss": 0.0024, "step": 303 }, { "epoch": 13.772727272727273, "eval_loss": 0.001626785146072507, "eval_runtime": 0.2343, "eval_samples_per_second": 375.607, "eval_steps_per_second": 46.951, "step": 303 }, { "epoch": 13.818181818181818, "grad_norm": 0.02657116763293743, "learning_rate": 1.8434210526315788e-05, "loss": 0.0025, "step": 304 }, { "epoch": 13.818181818181818, "eval_loss": 0.001621657982468605, "eval_runtime": 0.2287, "eval_samples_per_second": 384.722, "eval_steps_per_second": 48.09, "step": 304 }, { "epoch": 13.863636363636363, "grad_norm": 0.02328609488904476, "learning_rate": 1.8394736842105266e-05, "loss": 0.0025, "step": 305 }, { "epoch": 13.863636363636363, "eval_loss": 0.001616165740415454, "eval_runtime": 0.2335, "eval_samples_per_second": 376.921, "eval_steps_per_second": 47.115, "step": 305 }, { "epoch": 13.909090909090908, "grad_norm": 0.02286568656563759, "learning_rate": 1.8355263157894736e-05, "loss": 0.0027, "step": 306 }, { "epoch": 13.909090909090908, "eval_loss": 0.001610812614671886, "eval_runtime": 0.2295, "eval_samples_per_second": 383.365, "eval_steps_per_second": 47.921, "step": 306 }, { "epoch": 13.954545454545455, "grad_norm": 0.025216739624738693, "learning_rate": 1.831578947368421e-05, "loss": 0.0026, "step": 307 }, { "epoch": 13.954545454545455, "eval_loss": 0.001605312223546207, "eval_runtime": 0.2306, "eval_samples_per_second": 381.68, "eval_steps_per_second": 47.71, "step": 307 }, { "epoch": 14.0, "grad_norm": 0.02698989026248455, "learning_rate": 1.8276315789473685e-05, "loss": 0.003, "step": 308 }, { "epoch": 14.0, "eval_loss": 0.001599607290700078, "eval_runtime": 0.234, "eval_samples_per_second": 376.05, "eval_steps_per_second": 47.006, "step": 308 }, { "epoch": 14.045454545454545, "grad_norm": 0.02121439203619957, "learning_rate": 1.823684210526316e-05, "loss": 0.0026, "step": 309 }, { "epoch": 14.045454545454545, "eval_loss": 0.0015940162120386958, "eval_runtime": 0.2335, "eval_samples_per_second": 376.91, "eval_steps_per_second": 47.114, "step": 309 }, { "epoch": 14.090909090909092, "grad_norm": 0.02412167377769947, "learning_rate": 1.8197368421052633e-05, "loss": 0.0028, "step": 310 }, { "epoch": 14.090909090909092, "eval_loss": 0.001588392653502524, "eval_runtime": 0.2321, "eval_samples_per_second": 379.14, "eval_steps_per_second": 47.392, "step": 310 }, { "epoch": 14.136363636363637, "grad_norm": 0.02534678392112255, "learning_rate": 1.8157894736842107e-05, "loss": 0.0027, "step": 311 }, { "epoch": 14.136363636363637, "eval_loss": 0.0015829313779249787, "eval_runtime": 0.2274, "eval_samples_per_second": 386.989, "eval_steps_per_second": 48.374, "step": 311 }, { "epoch": 14.181818181818182, "grad_norm": 0.021638307720422745, "learning_rate": 1.811842105263158e-05, "loss": 0.0025, "step": 312 }, { "epoch": 14.181818181818182, "eval_loss": 0.0015773712657392025, "eval_runtime": 0.2294, "eval_samples_per_second": 383.682, "eval_steps_per_second": 47.96, "step": 312 }, { "epoch": 14.227272727272727, "grad_norm": 0.024357490241527557, "learning_rate": 1.8078947368421052e-05, "loss": 0.0027, "step": 313 }, { "epoch": 14.227272727272727, "eval_loss": 0.0015717636561021209, "eval_runtime": 0.2294, "eval_samples_per_second": 383.662, "eval_steps_per_second": 47.958, "step": 313 }, { "epoch": 14.272727272727273, "grad_norm": 0.022512707859277725, "learning_rate": 1.8039473684210526e-05, "loss": 0.0026, "step": 314 }, { "epoch": 14.272727272727273, "eval_loss": 0.001566153485327959, "eval_runtime": 0.2263, "eval_samples_per_second": 388.817, "eval_steps_per_second": 48.602, "step": 314 }, { "epoch": 14.318181818181818, "grad_norm": 0.022913463413715363, "learning_rate": 1.8e-05, "loss": 0.0026, "step": 315 }, { "epoch": 14.318181818181818, "eval_loss": 0.001560671953484416, "eval_runtime": 0.2319, "eval_samples_per_second": 379.401, "eval_steps_per_second": 47.425, "step": 315 }, { "epoch": 14.363636363636363, "grad_norm": 0.024906402453780174, "learning_rate": 1.7960526315789475e-05, "loss": 0.0026, "step": 316 }, { "epoch": 14.363636363636363, "eval_loss": 0.0015550776151940227, "eval_runtime": 0.2309, "eval_samples_per_second": 381.176, "eval_steps_per_second": 47.647, "step": 316 }, { "epoch": 14.409090909090908, "grad_norm": 0.020846841856837273, "learning_rate": 1.7921052631578945e-05, "loss": 0.0024, "step": 317 }, { "epoch": 14.409090909090908, "eval_loss": 0.0015492510283365846, "eval_runtime": 0.23, "eval_samples_per_second": 382.625, "eval_steps_per_second": 47.828, "step": 317 }, { "epoch": 14.454545454545455, "grad_norm": 0.020949576050043106, "learning_rate": 1.7881578947368423e-05, "loss": 0.0024, "step": 318 }, { "epoch": 14.454545454545455, "eval_loss": 0.001543792081065476, "eval_runtime": 0.2687, "eval_samples_per_second": 327.535, "eval_steps_per_second": 40.942, "step": 318 }, { "epoch": 14.5, "grad_norm": 0.027320073917508125, "learning_rate": 1.7842105263157894e-05, "loss": 0.0029, "step": 319 }, { "epoch": 14.5, "eval_loss": 0.0015383724821731448, "eval_runtime": 0.2378, "eval_samples_per_second": 369.998, "eval_steps_per_second": 46.25, "step": 319 }, { "epoch": 14.545454545454545, "grad_norm": 0.023768380284309387, "learning_rate": 1.7802631578947368e-05, "loss": 0.0024, "step": 320 }, { "epoch": 14.545454545454545, "eval_loss": 0.0015328243607655168, "eval_runtime": 0.2636, "eval_samples_per_second": 333.891, "eval_steps_per_second": 41.736, "step": 320 }, { "epoch": 14.590909090909092, "grad_norm": 0.023090893402695656, "learning_rate": 1.7763157894736842e-05, "loss": 0.0028, "step": 321 }, { "epoch": 14.590909090909092, "eval_loss": 0.0015273126773536205, "eval_runtime": 0.2297, "eval_samples_per_second": 383.091, "eval_steps_per_second": 47.886, "step": 321 }, { "epoch": 14.636363636363637, "grad_norm": 0.021861301735043526, "learning_rate": 1.7723684210526316e-05, "loss": 0.0023, "step": 322 }, { "epoch": 14.636363636363637, "eval_loss": 0.0015220079803839326, "eval_runtime": 0.2395, "eval_samples_per_second": 367.485, "eval_steps_per_second": 45.936, "step": 322 }, { "epoch": 14.681818181818182, "grad_norm": 0.02089674212038517, "learning_rate": 1.7684210526315787e-05, "loss": 0.0025, "step": 323 }, { "epoch": 14.681818181818182, "eval_loss": 0.0015169020043686032, "eval_runtime": 0.2277, "eval_samples_per_second": 386.55, "eval_steps_per_second": 48.319, "step": 323 }, { "epoch": 14.727272727272727, "grad_norm": 0.026943515986204147, "learning_rate": 1.7644736842105264e-05, "loss": 0.0027, "step": 324 }, { "epoch": 14.727272727272727, "eval_loss": 0.0015122044133022428, "eval_runtime": 0.2504, "eval_samples_per_second": 351.497, "eval_steps_per_second": 43.937, "step": 324 }, { "epoch": 14.772727272727273, "grad_norm": 0.021125871688127518, "learning_rate": 1.760526315789474e-05, "loss": 0.0024, "step": 325 }, { "epoch": 14.772727272727273, "eval_loss": 0.0015074351103976369, "eval_runtime": 0.2277, "eval_samples_per_second": 386.421, "eval_steps_per_second": 48.303, "step": 325 }, { "epoch": 14.818181818181818, "grad_norm": 0.023058133199810982, "learning_rate": 1.756578947368421e-05, "loss": 0.0025, "step": 326 }, { "epoch": 14.818181818181818, "eval_loss": 0.001502548111602664, "eval_runtime": 0.2371, "eval_samples_per_second": 371.118, "eval_steps_per_second": 46.39, "step": 326 }, { "epoch": 14.863636363636363, "grad_norm": 0.020260730758309364, "learning_rate": 1.7526315789473687e-05, "loss": 0.0023, "step": 327 }, { "epoch": 14.863636363636363, "eval_loss": 0.0014978590188547969, "eval_runtime": 0.231, "eval_samples_per_second": 380.935, "eval_steps_per_second": 47.617, "step": 327 }, { "epoch": 14.909090909090908, "grad_norm": 0.021094167605042458, "learning_rate": 1.7486842105263158e-05, "loss": 0.0024, "step": 328 }, { "epoch": 14.909090909090908, "eval_loss": 0.0014932234771549702, "eval_runtime": 0.2309, "eval_samples_per_second": 381.042, "eval_steps_per_second": 47.63, "step": 328 }, { "epoch": 14.954545454545455, "grad_norm": 0.023162171244621277, "learning_rate": 1.7447368421052632e-05, "loss": 0.0027, "step": 329 }, { "epoch": 14.954545454545455, "eval_loss": 0.0014887653524056077, "eval_runtime": 0.2298, "eval_samples_per_second": 382.875, "eval_steps_per_second": 47.859, "step": 329 }, { "epoch": 15.0, "grad_norm": 0.021899493411183357, "learning_rate": 1.7407894736842106e-05, "loss": 0.0026, "step": 330 }, { "epoch": 15.0, "eval_loss": 0.0014844763791188598, "eval_runtime": 0.2287, "eval_samples_per_second": 384.811, "eval_steps_per_second": 48.101, "step": 330 }, { "epoch": 15.045454545454545, "grad_norm": 0.02722894586622715, "learning_rate": 1.736842105263158e-05, "loss": 0.0029, "step": 331 }, { "epoch": 15.045454545454545, "eval_loss": 0.001479836879298091, "eval_runtime": 0.2296, "eval_samples_per_second": 383.331, "eval_steps_per_second": 47.916, "step": 331 }, { "epoch": 15.090909090909092, "grad_norm": 0.0198600422590971, "learning_rate": 1.732894736842105e-05, "loss": 0.0023, "step": 332 }, { "epoch": 15.090909090909092, "eval_loss": 0.001475546509027481, "eval_runtime": 0.2293, "eval_samples_per_second": 383.783, "eval_steps_per_second": 47.973, "step": 332 }, { "epoch": 15.136363636363637, "grad_norm": 0.018213720992207527, "learning_rate": 1.728947368421053e-05, "loss": 0.0021, "step": 333 }, { "epoch": 15.136363636363637, "eval_loss": 0.0014714114367961884, "eval_runtime": 0.2229, "eval_samples_per_second": 394.882, "eval_steps_per_second": 49.36, "step": 333 }, { "epoch": 15.181818181818182, "grad_norm": 0.02195083722472191, "learning_rate": 1.725e-05, "loss": 0.0026, "step": 334 }, { "epoch": 15.181818181818182, "eval_loss": 0.0014672887045890093, "eval_runtime": 0.2307, "eval_samples_per_second": 381.499, "eval_steps_per_second": 47.687, "step": 334 }, { "epoch": 15.227272727272727, "grad_norm": 0.020630402490496635, "learning_rate": 1.7210526315789473e-05, "loss": 0.0023, "step": 335 }, { "epoch": 15.227272727272727, "eval_loss": 0.0014632240636274219, "eval_runtime": 0.2345, "eval_samples_per_second": 375.326, "eval_steps_per_second": 46.916, "step": 335 }, { "epoch": 15.272727272727273, "grad_norm": 0.01985459215939045, "learning_rate": 1.7171052631578947e-05, "loss": 0.0024, "step": 336 }, { "epoch": 15.272727272727273, "eval_loss": 0.0014591444050893188, "eval_runtime": 0.2344, "eval_samples_per_second": 375.401, "eval_steps_per_second": 46.925, "step": 336 }, { "epoch": 15.318181818181818, "grad_norm": 0.02400742471218109, "learning_rate": 1.713157894736842e-05, "loss": 0.0024, "step": 337 }, { "epoch": 15.318181818181818, "eval_loss": 0.001454763114452362, "eval_runtime": 0.2401, "eval_samples_per_second": 366.585, "eval_steps_per_second": 45.823, "step": 337 }, { "epoch": 15.363636363636363, "grad_norm": 0.02545950934290886, "learning_rate": 1.7092105263157896e-05, "loss": 0.0026, "step": 338 }, { "epoch": 15.363636363636363, "eval_loss": 0.0014504102291539311, "eval_runtime": 0.2315, "eval_samples_per_second": 380.122, "eval_steps_per_second": 47.515, "step": 338 }, { "epoch": 15.409090909090908, "grad_norm": 0.02126440778374672, "learning_rate": 1.705263157894737e-05, "loss": 0.0024, "step": 339 }, { "epoch": 15.409090909090908, "eval_loss": 0.0014461844693869352, "eval_runtime": 0.2351, "eval_samples_per_second": 374.294, "eval_steps_per_second": 46.787, "step": 339 }, { "epoch": 15.454545454545455, "grad_norm": 0.025197012349963188, "learning_rate": 1.7013157894736844e-05, "loss": 0.0025, "step": 340 }, { "epoch": 15.454545454545455, "eval_loss": 0.0014418490463867784, "eval_runtime": 0.2274, "eval_samples_per_second": 387.064, "eval_steps_per_second": 48.383, "step": 340 }, { "epoch": 15.5, "grad_norm": 0.022640075534582138, "learning_rate": 1.6973684210526315e-05, "loss": 0.0024, "step": 341 }, { "epoch": 15.5, "eval_loss": 0.0014375299215316772, "eval_runtime": 0.2405, "eval_samples_per_second": 365.83, "eval_steps_per_second": 45.729, "step": 341 }, { "epoch": 15.545454545454545, "grad_norm": 0.021050602197647095, "learning_rate": 1.6934210526315792e-05, "loss": 0.0024, "step": 342 }, { "epoch": 15.545454545454545, "eval_loss": 0.0014335111482068896, "eval_runtime": 0.226, "eval_samples_per_second": 389.393, "eval_steps_per_second": 48.674, "step": 342 }, { "epoch": 15.590909090909092, "grad_norm": 0.0219247005879879, "learning_rate": 1.6894736842105263e-05, "loss": 0.0025, "step": 343 }, { "epoch": 15.590909090909092, "eval_loss": 0.0014295299770310521, "eval_runtime": 0.2342, "eval_samples_per_second": 375.717, "eval_steps_per_second": 46.965, "step": 343 }, { "epoch": 15.636363636363637, "grad_norm": 0.020925231277942657, "learning_rate": 1.6855263157894737e-05, "loss": 0.0024, "step": 344 }, { "epoch": 15.636363636363637, "eval_loss": 0.0014257035218179226, "eval_runtime": 0.2368, "eval_samples_per_second": 371.622, "eval_steps_per_second": 46.453, "step": 344 }, { "epoch": 15.681818181818182, "grad_norm": 0.019099295139312744, "learning_rate": 1.681578947368421e-05, "loss": 0.0023, "step": 345 }, { "epoch": 15.681818181818182, "eval_loss": 0.0014218251453712583, "eval_runtime": 0.2291, "eval_samples_per_second": 384.074, "eval_steps_per_second": 48.009, "step": 345 }, { "epoch": 15.727272727272727, "grad_norm": 0.021133864298462868, "learning_rate": 1.6776315789473686e-05, "loss": 0.0023, "step": 346 }, { "epoch": 15.727272727272727, "eval_loss": 0.0014178574783727527, "eval_runtime": 0.2372, "eval_samples_per_second": 370.96, "eval_steps_per_second": 46.37, "step": 346 }, { "epoch": 15.772727272727273, "grad_norm": 0.0220933947712183, "learning_rate": 1.6736842105263156e-05, "loss": 0.0024, "step": 347 }, { "epoch": 15.772727272727273, "eval_loss": 0.0014137736288830638, "eval_runtime": 0.2311, "eval_samples_per_second": 380.859, "eval_steps_per_second": 47.607, "step": 347 }, { "epoch": 15.818181818181818, "grad_norm": 0.02274385653436184, "learning_rate": 1.6697368421052634e-05, "loss": 0.0023, "step": 348 }, { "epoch": 15.818181818181818, "eval_loss": 0.0014094788348302245, "eval_runtime": 0.2489, "eval_samples_per_second": 353.537, "eval_steps_per_second": 44.192, "step": 348 }, { "epoch": 15.863636363636363, "grad_norm": 0.023772120475769043, "learning_rate": 1.6657894736842105e-05, "loss": 0.0025, "step": 349 }, { "epoch": 15.863636363636363, "eval_loss": 0.0014053123304620385, "eval_runtime": 0.2394, "eval_samples_per_second": 367.516, "eval_steps_per_second": 45.94, "step": 349 }, { "epoch": 15.909090909090908, "grad_norm": 0.023701833561062813, "learning_rate": 1.661842105263158e-05, "loss": 0.0026, "step": 350 }, { "epoch": 15.909090909090908, "eval_loss": 0.0014007468707859516, "eval_runtime": 0.2428, "eval_samples_per_second": 362.454, "eval_steps_per_second": 45.307, "step": 350 }, { "epoch": 15.954545454545455, "grad_norm": 0.020177854225039482, "learning_rate": 1.6578947368421053e-05, "loss": 0.0023, "step": 351 }, { "epoch": 15.954545454545455, "eval_loss": 0.001396444975398481, "eval_runtime": 0.2227, "eval_samples_per_second": 395.086, "eval_steps_per_second": 49.386, "step": 351 }, { "epoch": 16.0, "grad_norm": 0.018302910029888153, "learning_rate": 1.6539473684210527e-05, "loss": 0.0022, "step": 352 }, { "epoch": 16.0, "eval_loss": 0.0013921987265348434, "eval_runtime": 0.2255, "eval_samples_per_second": 390.23, "eval_steps_per_second": 48.779, "step": 352 }, { "epoch": 16.045454545454547, "grad_norm": 0.02006903663277626, "learning_rate": 1.65e-05, "loss": 0.0024, "step": 353 }, { "epoch": 16.045454545454547, "eval_loss": 0.0013879131292924285, "eval_runtime": 0.2332, "eval_samples_per_second": 377.362, "eval_steps_per_second": 47.17, "step": 353 }, { "epoch": 16.09090909090909, "grad_norm": 0.02006879448890686, "learning_rate": 1.6460526315789472e-05, "loss": 0.0024, "step": 354 }, { "epoch": 16.09090909090909, "eval_loss": 0.0013836818980053067, "eval_runtime": 0.2294, "eval_samples_per_second": 383.546, "eval_steps_per_second": 47.943, "step": 354 }, { "epoch": 16.136363636363637, "grad_norm": 0.01927405595779419, "learning_rate": 1.642105263157895e-05, "loss": 0.0021, "step": 355 }, { "epoch": 16.136363636363637, "eval_loss": 0.001379486988298595, "eval_runtime": 0.2304, "eval_samples_per_second": 381.9, "eval_steps_per_second": 47.738, "step": 355 }, { "epoch": 16.181818181818183, "grad_norm": 0.019441615790128708, "learning_rate": 1.638157894736842e-05, "loss": 0.0024, "step": 356 }, { "epoch": 16.181818181818183, "eval_loss": 0.0013752405066043139, "eval_runtime": 0.2339, "eval_samples_per_second": 376.279, "eval_steps_per_second": 47.035, "step": 356 }, { "epoch": 16.227272727272727, "grad_norm": 0.019047444686293602, "learning_rate": 1.6342105263157894e-05, "loss": 0.0022, "step": 357 }, { "epoch": 16.227272727272727, "eval_loss": 0.0013710103230550885, "eval_runtime": 0.2296, "eval_samples_per_second": 383.255, "eval_steps_per_second": 47.907, "step": 357 }, { "epoch": 16.272727272727273, "grad_norm": 0.02004443109035492, "learning_rate": 1.630263157894737e-05, "loss": 0.002, "step": 358 }, { "epoch": 16.272727272727273, "eval_loss": 0.0013666612794622779, "eval_runtime": 0.2306, "eval_samples_per_second": 381.651, "eval_steps_per_second": 47.706, "step": 358 }, { "epoch": 16.318181818181817, "grad_norm": 0.018162380903959274, "learning_rate": 1.6263157894736843e-05, "loss": 0.0022, "step": 359 }, { "epoch": 16.318181818181817, "eval_loss": 0.0013625015271827579, "eval_runtime": 0.2336, "eval_samples_per_second": 376.757, "eval_steps_per_second": 47.095, "step": 359 }, { "epoch": 16.363636363636363, "grad_norm": 0.01866663061082363, "learning_rate": 1.6223684210526314e-05, "loss": 0.0023, "step": 360 }, { "epoch": 16.363636363636363, "eval_loss": 0.001358471461571753, "eval_runtime": 0.234, "eval_samples_per_second": 376.031, "eval_steps_per_second": 47.004, "step": 360 }, { "epoch": 16.40909090909091, "grad_norm": 0.023692943155765533, "learning_rate": 1.618421052631579e-05, "loss": 0.0021, "step": 361 }, { "epoch": 16.40909090909091, "eval_loss": 0.001354728126898408, "eval_runtime": 0.236, "eval_samples_per_second": 372.916, "eval_steps_per_second": 46.614, "step": 361 }, { "epoch": 16.454545454545453, "grad_norm": 0.021557440981268883, "learning_rate": 1.6144736842105262e-05, "loss": 0.0025, "step": 362 }, { "epoch": 16.454545454545453, "eval_loss": 0.0013508024858310819, "eval_runtime": 0.2359, "eval_samples_per_second": 373.118, "eval_steps_per_second": 46.64, "step": 362 }, { "epoch": 16.5, "grad_norm": 0.02110958844423294, "learning_rate": 1.6105263157894736e-05, "loss": 0.0023, "step": 363 }, { "epoch": 16.5, "eval_loss": 0.0013467645039781928, "eval_runtime": 0.2299, "eval_samples_per_second": 382.703, "eval_steps_per_second": 47.838, "step": 363 }, { "epoch": 16.545454545454547, "grad_norm": 0.019328676164150238, "learning_rate": 1.6065789473684214e-05, "loss": 0.0024, "step": 364 }, { "epoch": 16.545454545454547, "eval_loss": 0.0013428251259028912, "eval_runtime": 0.2289, "eval_samples_per_second": 384.389, "eval_steps_per_second": 48.049, "step": 364 }, { "epoch": 16.59090909090909, "grad_norm": 0.022835319861769676, "learning_rate": 1.6026315789473684e-05, "loss": 0.0023, "step": 365 }, { "epoch": 16.59090909090909, "eval_loss": 0.0013391603715717793, "eval_runtime": 0.2311, "eval_samples_per_second": 380.86, "eval_steps_per_second": 47.607, "step": 365 }, { "epoch": 16.636363636363637, "grad_norm": 0.01819239743053913, "learning_rate": 1.598684210526316e-05, "loss": 0.0022, "step": 366 }, { "epoch": 16.636363636363637, "eval_loss": 0.0013354604598134756, "eval_runtime": 0.2268, "eval_samples_per_second": 388.088, "eval_steps_per_second": 48.511, "step": 366 }, { "epoch": 16.681818181818183, "grad_norm": 0.019428908824920654, "learning_rate": 1.5947368421052633e-05, "loss": 0.0021, "step": 367 }, { "epoch": 16.681818181818183, "eval_loss": 0.001331814331933856, "eval_runtime": 0.2357, "eval_samples_per_second": 373.331, "eval_steps_per_second": 46.666, "step": 367 }, { "epoch": 16.727272727272727, "grad_norm": 0.018047934398055077, "learning_rate": 1.5907894736842107e-05, "loss": 0.0022, "step": 368 }, { "epoch": 16.727272727272727, "eval_loss": 0.0013281968422234058, "eval_runtime": 0.2353, "eval_samples_per_second": 374.058, "eval_steps_per_second": 46.757, "step": 368 }, { "epoch": 16.772727272727273, "grad_norm": 0.022303372621536255, "learning_rate": 1.5868421052631578e-05, "loss": 0.0022, "step": 369 }, { "epoch": 16.772727272727273, "eval_loss": 0.0013246661983430386, "eval_runtime": 0.2364, "eval_samples_per_second": 372.233, "eval_steps_per_second": 46.529, "step": 369 }, { "epoch": 16.818181818181817, "grad_norm": 0.017466159537434578, "learning_rate": 1.5828947368421055e-05, "loss": 0.0021, "step": 370 }, { "epoch": 16.818181818181817, "eval_loss": 0.001321229967288673, "eval_runtime": 0.2328, "eval_samples_per_second": 378.075, "eval_steps_per_second": 47.259, "step": 370 }, { "epoch": 16.863636363636363, "grad_norm": 0.018749618902802467, "learning_rate": 1.5789473684210526e-05, "loss": 0.0021, "step": 371 }, { "epoch": 16.863636363636363, "eval_loss": 0.0013179152738302946, "eval_runtime": 0.2372, "eval_samples_per_second": 371.017, "eval_steps_per_second": 46.377, "step": 371 }, { "epoch": 16.90909090909091, "grad_norm": 0.01943541131913662, "learning_rate": 1.575e-05, "loss": 0.0021, "step": 372 }, { "epoch": 16.90909090909091, "eval_loss": 0.0013147753197699785, "eval_runtime": 0.2325, "eval_samples_per_second": 378.47, "eval_steps_per_second": 47.309, "step": 372 }, { "epoch": 16.954545454545453, "grad_norm": 0.018470529466867447, "learning_rate": 1.5710526315789474e-05, "loss": 0.0021, "step": 373 }, { "epoch": 16.954545454545453, "eval_loss": 0.0013116110349074006, "eval_runtime": 0.2449, "eval_samples_per_second": 359.303, "eval_steps_per_second": 44.913, "step": 373 }, { "epoch": 17.0, "grad_norm": 0.02088373526930809, "learning_rate": 1.5671052631578948e-05, "loss": 0.0022, "step": 374 }, { "epoch": 17.0, "eval_loss": 0.0013083978556096554, "eval_runtime": 0.2373, "eval_samples_per_second": 370.786, "eval_steps_per_second": 46.348, "step": 374 }, { "epoch": 17.045454545454547, "grad_norm": 0.02049199491739273, "learning_rate": 1.563157894736842e-05, "loss": 0.0021, "step": 375 }, { "epoch": 17.045454545454547, "eval_loss": 0.0013052173890173435, "eval_runtime": 0.2375, "eval_samples_per_second": 370.52, "eval_steps_per_second": 46.315, "step": 375 }, { "epoch": 17.09090909090909, "grad_norm": 0.022884204983711243, "learning_rate": 1.5592105263157897e-05, "loss": 0.0023, "step": 376 }, { "epoch": 17.09090909090909, "eval_loss": 0.0013022100320085883, "eval_runtime": 0.2451, "eval_samples_per_second": 359.032, "eval_steps_per_second": 44.879, "step": 376 }, { "epoch": 17.136363636363637, "grad_norm": 0.018668444827198982, "learning_rate": 1.5552631578947367e-05, "loss": 0.002, "step": 377 }, { "epoch": 17.136363636363637, "eval_loss": 0.0012990765972062945, "eval_runtime": 0.2377, "eval_samples_per_second": 370.243, "eval_steps_per_second": 46.28, "step": 377 }, { "epoch": 17.181818181818183, "grad_norm": 0.018272867426276207, "learning_rate": 1.551315789473684e-05, "loss": 0.002, "step": 378 }, { "epoch": 17.181818181818183, "eval_loss": 0.0012959121959283948, "eval_runtime": 0.2445, "eval_samples_per_second": 359.966, "eval_steps_per_second": 44.996, "step": 378 }, { "epoch": 17.227272727272727, "grad_norm": 0.018142884597182274, "learning_rate": 1.547368421052632e-05, "loss": 0.0023, "step": 379 }, { "epoch": 17.227272727272727, "eval_loss": 0.0012926937779411674, "eval_runtime": 0.2463, "eval_samples_per_second": 357.295, "eval_steps_per_second": 44.662, "step": 379 }, { "epoch": 17.272727272727273, "grad_norm": 0.019035378471016884, "learning_rate": 1.543421052631579e-05, "loss": 0.002, "step": 380 }, { "epoch": 17.272727272727273, "eval_loss": 0.0012895982945337892, "eval_runtime": 0.2335, "eval_samples_per_second": 376.923, "eval_steps_per_second": 47.115, "step": 380 }, { "epoch": 17.318181818181817, "grad_norm": 0.02087828330695629, "learning_rate": 1.5394736842105264e-05, "loss": 0.0023, "step": 381 }, { "epoch": 17.318181818181817, "eval_loss": 0.0012864163145422935, "eval_runtime": 0.2398, "eval_samples_per_second": 367.034, "eval_steps_per_second": 45.879, "step": 381 }, { "epoch": 17.363636363636363, "grad_norm": 0.019186902791261673, "learning_rate": 1.5355263157894738e-05, "loss": 0.0021, "step": 382 }, { "epoch": 17.363636363636363, "eval_loss": 0.001283234334550798, "eval_runtime": 0.2265, "eval_samples_per_second": 388.504, "eval_steps_per_second": 48.563, "step": 382 }, { "epoch": 17.40909090909091, "grad_norm": 0.01789664290845394, "learning_rate": 1.5315789473684212e-05, "loss": 0.002, "step": 383 }, { "epoch": 17.40909090909091, "eval_loss": 0.0012801456032320857, "eval_runtime": 0.229, "eval_samples_per_second": 384.262, "eval_steps_per_second": 48.033, "step": 383 }, { "epoch": 17.454545454545453, "grad_norm": 0.017828669399023056, "learning_rate": 1.5276315789473683e-05, "loss": 0.0021, "step": 384 }, { "epoch": 17.454545454545453, "eval_loss": 0.0012770771281793714, "eval_runtime": 0.2259, "eval_samples_per_second": 389.598, "eval_steps_per_second": 48.7, "step": 384 }, { "epoch": 17.5, "grad_norm": 0.0225471593439579, "learning_rate": 1.5236842105263159e-05, "loss": 0.0022, "step": 385 }, { "epoch": 17.5, "eval_loss": 0.0012742335675284266, "eval_runtime": 0.2398, "eval_samples_per_second": 366.97, "eval_steps_per_second": 45.871, "step": 385 }, { "epoch": 17.545454545454547, "grad_norm": 0.02024303376674652, "learning_rate": 1.5197368421052631e-05, "loss": 0.0021, "step": 386 }, { "epoch": 17.545454545454547, "eval_loss": 0.0012715155025944114, "eval_runtime": 0.2322, "eval_samples_per_second": 378.914, "eval_steps_per_second": 47.364, "step": 386 }, { "epoch": 17.59090909090909, "grad_norm": 0.021520059555768967, "learning_rate": 1.5157894736842105e-05, "loss": 0.0021, "step": 387 }, { "epoch": 17.59090909090909, "eval_loss": 0.0012686135014519095, "eval_runtime": 0.2273, "eval_samples_per_second": 387.222, "eval_steps_per_second": 48.403, "step": 387 }, { "epoch": 17.636363636363637, "grad_norm": 0.02026878483593464, "learning_rate": 1.5118421052631578e-05, "loss": 0.0024, "step": 388 }, { "epoch": 17.636363636363637, "eval_loss": 0.0012655220925807953, "eval_runtime": 0.2345, "eval_samples_per_second": 375.273, "eval_steps_per_second": 46.909, "step": 388 }, { "epoch": 17.681818181818183, "grad_norm": 0.017312707379460335, "learning_rate": 1.5078947368421054e-05, "loss": 0.0019, "step": 389 }, { "epoch": 17.681818181818183, "eval_loss": 0.0012624793453142047, "eval_runtime": 0.2341, "eval_samples_per_second": 375.953, "eval_steps_per_second": 46.994, "step": 389 }, { "epoch": 17.727272727272727, "grad_norm": 0.014796672388911247, "learning_rate": 1.5039473684210525e-05, "loss": 0.0018, "step": 390 }, { "epoch": 17.727272727272727, "eval_loss": 0.0012595909647643566, "eval_runtime": 0.2412, "eval_samples_per_second": 364.883, "eval_steps_per_second": 45.61, "step": 390 }, { "epoch": 17.772727272727273, "grad_norm": 0.024672966450452805, "learning_rate": 1.5e-05, "loss": 0.0024, "step": 391 }, { "epoch": 17.772727272727273, "eval_loss": 0.001256533432751894, "eval_runtime": 0.2394, "eval_samples_per_second": 367.656, "eval_steps_per_second": 45.957, "step": 391 }, { "epoch": 17.818181818181817, "grad_norm": 0.01785973645746708, "learning_rate": 1.4960526315789475e-05, "loss": 0.0021, "step": 392 }, { "epoch": 17.818181818181817, "eval_loss": 0.001253555528819561, "eval_runtime": 0.2448, "eval_samples_per_second": 359.499, "eval_steps_per_second": 44.937, "step": 392 }, { "epoch": 17.863636363636363, "grad_norm": 0.018725674599409103, "learning_rate": 1.4921052631578947e-05, "loss": 0.0022, "step": 393 }, { "epoch": 17.863636363636363, "eval_loss": 0.001250546658411622, "eval_runtime": 0.2295, "eval_samples_per_second": 383.446, "eval_steps_per_second": 47.931, "step": 393 }, { "epoch": 17.90909090909091, "grad_norm": 0.01906488463282585, "learning_rate": 1.4881578947368421e-05, "loss": 0.0019, "step": 394 }, { "epoch": 17.90909090909091, "eval_loss": 0.0012476051924750209, "eval_runtime": 0.2392, "eval_samples_per_second": 367.955, "eval_steps_per_second": 45.994, "step": 394 }, { "epoch": 17.954545454545453, "grad_norm": 0.01702312007546425, "learning_rate": 1.4842105263157895e-05, "loss": 0.0021, "step": 395 }, { "epoch": 17.954545454545453, "eval_loss": 0.0012446870096027851, "eval_runtime": 0.2408, "eval_samples_per_second": 365.513, "eval_steps_per_second": 45.689, "step": 395 }, { "epoch": 18.0, "grad_norm": 0.018446706235408783, "learning_rate": 1.4802631578947368e-05, "loss": 0.0021, "step": 396 }, { "epoch": 18.0, "eval_loss": 0.0012417498510330915, "eval_runtime": 0.2401, "eval_samples_per_second": 366.532, "eval_steps_per_second": 45.816, "step": 396 }, { "epoch": 18.045454545454547, "grad_norm": 0.017580052837729454, "learning_rate": 1.4763157894736842e-05, "loss": 0.002, "step": 397 }, { "epoch": 18.045454545454547, "eval_loss": 0.0012387962779030204, "eval_runtime": 0.2359, "eval_samples_per_second": 373.019, "eval_steps_per_second": 46.627, "step": 397 }, { "epoch": 18.09090909090909, "grad_norm": 0.018549149855971336, "learning_rate": 1.4723684210526318e-05, "loss": 0.002, "step": 398 }, { "epoch": 18.09090909090909, "eval_loss": 0.0012358062667772174, "eval_runtime": 0.2409, "eval_samples_per_second": 365.331, "eval_steps_per_second": 45.666, "step": 398 }, { "epoch": 18.136363636363637, "grad_norm": 0.021288642659783363, "learning_rate": 1.468421052631579e-05, "loss": 0.0021, "step": 399 }, { "epoch": 18.136363636363637, "eval_loss": 0.00123285548761487, "eval_runtime": 0.239, "eval_samples_per_second": 368.2, "eval_steps_per_second": 46.025, "step": 399 }, { "epoch": 18.181818181818183, "grad_norm": 0.018042676150798798, "learning_rate": 1.4644736842105264e-05, "loss": 0.0021, "step": 400 }, { "epoch": 18.181818181818183, "eval_loss": 0.0012299600057303905, "eval_runtime": 0.2368, "eval_samples_per_second": 371.628, "eval_steps_per_second": 46.454, "step": 400 }, { "epoch": 18.227272727272727, "grad_norm": 0.017950624227523804, "learning_rate": 1.4605263157894737e-05, "loss": 0.002, "step": 401 }, { "epoch": 18.227272727272727, "eval_loss": 0.0012270959559828043, "eval_runtime": 0.2217, "eval_samples_per_second": 396.934, "eval_steps_per_second": 49.617, "step": 401 }, { "epoch": 18.272727272727273, "grad_norm": 0.016649143770337105, "learning_rate": 1.4565789473684211e-05, "loss": 0.002, "step": 402 }, { "epoch": 18.272727272727273, "eval_loss": 0.0012242384254932404, "eval_runtime": 0.2287, "eval_samples_per_second": 384.84, "eval_steps_per_second": 48.105, "step": 402 }, { "epoch": 18.318181818181817, "grad_norm": 0.016468649730086327, "learning_rate": 1.4526315789473685e-05, "loss": 0.0018, "step": 403 }, { "epoch": 18.318181818181817, "eval_loss": 0.001221520360559225, "eval_runtime": 0.2271, "eval_samples_per_second": 387.51, "eval_steps_per_second": 48.439, "step": 403 }, { "epoch": 18.363636363636363, "grad_norm": 0.01778615266084671, "learning_rate": 1.4486842105263158e-05, "loss": 0.002, "step": 404 }, { "epoch": 18.363636363636363, "eval_loss": 0.0012188454857096076, "eval_runtime": 0.2323, "eval_samples_per_second": 378.869, "eval_steps_per_second": 47.359, "step": 404 }, { "epoch": 18.40909090909091, "grad_norm": 0.019096923992037773, "learning_rate": 1.4447368421052632e-05, "loss": 0.0021, "step": 405 }, { "epoch": 18.40909090909091, "eval_loss": 0.0012163707287982106, "eval_runtime": 0.2287, "eval_samples_per_second": 384.807, "eval_steps_per_second": 48.101, "step": 405 }, { "epoch": 18.454545454545453, "grad_norm": 0.020378055050969124, "learning_rate": 1.4407894736842106e-05, "loss": 0.0019, "step": 406 }, { "epoch": 18.454545454545453, "eval_loss": 0.0012139691971242428, "eval_runtime": 0.2285, "eval_samples_per_second": 385.172, "eval_steps_per_second": 48.146, "step": 406 }, { "epoch": 18.5, "grad_norm": 0.01801607571542263, "learning_rate": 1.4368421052631578e-05, "loss": 0.0019, "step": 407 }, { "epoch": 18.5, "eval_loss": 0.0012113729026168585, "eval_runtime": 0.2323, "eval_samples_per_second": 378.867, "eval_steps_per_second": 47.358, "step": 407 }, { "epoch": 18.545454545454547, "grad_norm": 0.016806334257125854, "learning_rate": 1.4328947368421052e-05, "loss": 0.0019, "step": 408 }, { "epoch": 18.545454545454547, "eval_loss": 0.0012086898786947131, "eval_runtime": 0.2266, "eval_samples_per_second": 388.422, "eval_steps_per_second": 48.553, "step": 408 }, { "epoch": 18.59090909090909, "grad_norm": 0.01768423058092594, "learning_rate": 1.4289473684210527e-05, "loss": 0.0019, "step": 409 }, { "epoch": 18.59090909090909, "eval_loss": 0.001205993234179914, "eval_runtime": 0.233, "eval_samples_per_second": 377.712, "eval_steps_per_second": 47.214, "step": 409 }, { "epoch": 18.636363636363637, "grad_norm": 0.016840273514389992, "learning_rate": 1.4249999999999999e-05, "loss": 0.0019, "step": 410 }, { "epoch": 18.636363636363637, "eval_loss": 0.00120334152597934, "eval_runtime": 0.2278, "eval_samples_per_second": 386.255, "eval_steps_per_second": 48.282, "step": 410 }, { "epoch": 18.681818181818183, "grad_norm": 0.019254090264439583, "learning_rate": 1.4210526315789473e-05, "loss": 0.0021, "step": 411 }, { "epoch": 18.681818181818183, "eval_loss": 0.001200651633553207, "eval_runtime": 0.2414, "eval_samples_per_second": 364.529, "eval_steps_per_second": 45.566, "step": 411 }, { "epoch": 18.727272727272727, "grad_norm": 0.018222426995635033, "learning_rate": 1.4171052631578949e-05, "loss": 0.0021, "step": 412 }, { "epoch": 18.727272727272727, "eval_loss": 0.0011977426474913955, "eval_runtime": 0.2297, "eval_samples_per_second": 383.168, "eval_steps_per_second": 47.896, "step": 412 }, { "epoch": 18.772727272727273, "grad_norm": 0.017460381612181664, "learning_rate": 1.4131578947368422e-05, "loss": 0.0019, "step": 413 }, { "epoch": 18.772727272727273, "eval_loss": 0.0011948675382882357, "eval_runtime": 0.2295, "eval_samples_per_second": 383.384, "eval_steps_per_second": 47.923, "step": 413 }, { "epoch": 18.818181818181817, "grad_norm": 0.014636803418397903, "learning_rate": 1.4092105263157896e-05, "loss": 0.0018, "step": 414 }, { "epoch": 18.818181818181817, "eval_loss": 0.0011919679818674922, "eval_runtime": 0.2375, "eval_samples_per_second": 370.502, "eval_steps_per_second": 46.313, "step": 414 }, { "epoch": 18.863636363636363, "grad_norm": 0.01725298911333084, "learning_rate": 1.405263157894737e-05, "loss": 0.0019, "step": 415 }, { "epoch": 18.863636363636363, "eval_loss": 0.0011888709850609303, "eval_runtime": 0.2319, "eval_samples_per_second": 379.492, "eval_steps_per_second": 47.437, "step": 415 }, { "epoch": 18.90909090909091, "grad_norm": 0.017635343596339226, "learning_rate": 1.4013157894736842e-05, "loss": 0.0019, "step": 416 }, { "epoch": 18.90909090909091, "eval_loss": 0.0011859294027090073, "eval_runtime": 0.232, "eval_samples_per_second": 379.329, "eval_steps_per_second": 47.416, "step": 416 }, { "epoch": 18.954545454545453, "grad_norm": 0.017270755022764206, "learning_rate": 1.3973684210526316e-05, "loss": 0.002, "step": 417 }, { "epoch": 18.954545454545453, "eval_loss": 0.0011831001611426473, "eval_runtime": 0.2293, "eval_samples_per_second": 383.786, "eval_steps_per_second": 47.973, "step": 417 }, { "epoch": 19.0, "grad_norm": 0.017159774899482727, "learning_rate": 1.393421052631579e-05, "loss": 0.0018, "step": 418 }, { "epoch": 19.0, "eval_loss": 0.001180406310595572, "eval_runtime": 0.2475, "eval_samples_per_second": 355.577, "eval_steps_per_second": 44.447, "step": 418 }, { "epoch": 19.045454545454547, "grad_norm": 0.015916157513856888, "learning_rate": 1.3894736842105263e-05, "loss": 0.0018, "step": 419 }, { "epoch": 19.045454545454547, "eval_loss": 0.0011776703177019954, "eval_runtime": 0.2406, "eval_samples_per_second": 365.71, "eval_steps_per_second": 45.714, "step": 419 }, { "epoch": 19.09090909090909, "grad_norm": 0.016425369307398796, "learning_rate": 1.3855263157894737e-05, "loss": 0.002, "step": 420 }, { "epoch": 19.09090909090909, "eval_loss": 0.0011750170961022377, "eval_runtime": 0.2379, "eval_samples_per_second": 369.975, "eval_steps_per_second": 46.247, "step": 420 }, { "epoch": 19.136363636363637, "grad_norm": 0.017857089638710022, "learning_rate": 1.3815789473684211e-05, "loss": 0.0019, "step": 421 }, { "epoch": 19.136363636363637, "eval_loss": 0.0011724097421392798, "eval_runtime": 0.2504, "eval_samples_per_second": 351.397, "eval_steps_per_second": 43.925, "step": 421 }, { "epoch": 19.181818181818183, "grad_norm": 0.01837003231048584, "learning_rate": 1.3776315789473684e-05, "loss": 0.0022, "step": 422 }, { "epoch": 19.181818181818183, "eval_loss": 0.0011697578011080623, "eval_runtime": 0.2585, "eval_samples_per_second": 340.422, "eval_steps_per_second": 42.553, "step": 422 }, { "epoch": 19.227272727272727, "grad_norm": 0.019487086683511734, "learning_rate": 1.3736842105263158e-05, "loss": 0.0021, "step": 423 }, { "epoch": 19.227272727272727, "eval_loss": 0.0011671868851408362, "eval_runtime": 0.2398, "eval_samples_per_second": 366.896, "eval_steps_per_second": 45.862, "step": 423 }, { "epoch": 19.272727272727273, "grad_norm": 0.016021518036723137, "learning_rate": 1.369736842105263e-05, "loss": 0.0019, "step": 424 }, { "epoch": 19.272727272727273, "eval_loss": 0.001164758112281561, "eval_runtime": 0.2642, "eval_samples_per_second": 333.083, "eval_steps_per_second": 41.635, "step": 424 }, { "epoch": 19.318181818181817, "grad_norm": 0.018122289329767227, "learning_rate": 1.3657894736842106e-05, "loss": 0.0019, "step": 425 }, { "epoch": 19.318181818181817, "eval_loss": 0.001162288710474968, "eval_runtime": 0.2578, "eval_samples_per_second": 341.316, "eval_steps_per_second": 42.665, "step": 425 }, { "epoch": 19.363636363636363, "grad_norm": 0.015892351046204567, "learning_rate": 1.361842105263158e-05, "loss": 0.0018, "step": 426 }, { "epoch": 19.363636363636363, "eval_loss": 0.001159931649453938, "eval_runtime": 0.2409, "eval_samples_per_second": 365.291, "eval_steps_per_second": 45.661, "step": 426 }, { "epoch": 19.40909090909091, "grad_norm": 0.015699921175837517, "learning_rate": 1.3578947368421053e-05, "loss": 0.0019, "step": 427 }, { "epoch": 19.40909090909091, "eval_loss": 0.0011575055541470647, "eval_runtime": 0.2388, "eval_samples_per_second": 368.523, "eval_steps_per_second": 46.065, "step": 427 }, { "epoch": 19.454545454545453, "grad_norm": 0.01474451832473278, "learning_rate": 1.3539473684210527e-05, "loss": 0.0017, "step": 428 }, { "epoch": 19.454545454545453, "eval_loss": 0.001155222998932004, "eval_runtime": 0.2408, "eval_samples_per_second": 365.449, "eval_steps_per_second": 45.681, "step": 428 }, { "epoch": 19.5, "grad_norm": 0.016437875106930733, "learning_rate": 1.3500000000000001e-05, "loss": 0.0018, "step": 429 }, { "epoch": 19.5, "eval_loss": 0.0011530268238857388, "eval_runtime": 0.2325, "eval_samples_per_second": 378.535, "eval_steps_per_second": 47.317, "step": 429 }, { "epoch": 19.545454545454547, "grad_norm": 0.01538484264165163, "learning_rate": 1.3460526315789474e-05, "loss": 0.0018, "step": 430 }, { "epoch": 19.545454545454547, "eval_loss": 0.0011508835013955832, "eval_runtime": 0.2309, "eval_samples_per_second": 381.166, "eval_steps_per_second": 47.646, "step": 430 }, { "epoch": 19.59090909090909, "grad_norm": 0.017129214480519295, "learning_rate": 1.3421052631578948e-05, "loss": 0.0019, "step": 431 }, { "epoch": 19.59090909090909, "eval_loss": 0.0011487645097076893, "eval_runtime": 0.2362, "eval_samples_per_second": 372.58, "eval_steps_per_second": 46.573, "step": 431 }, { "epoch": 19.636363636363637, "grad_norm": 0.016592320054769516, "learning_rate": 1.3381578947368422e-05, "loss": 0.0019, "step": 432 }, { "epoch": 19.636363636363637, "eval_loss": 0.0011467835865914822, "eval_runtime": 0.2418, "eval_samples_per_second": 364.003, "eval_steps_per_second": 45.5, "step": 432 }, { "epoch": 19.681818181818183, "grad_norm": 0.018111824989318848, "learning_rate": 1.3342105263157894e-05, "loss": 0.0019, "step": 433 }, { "epoch": 19.681818181818183, "eval_loss": 0.0011448581935837865, "eval_runtime": 0.2437, "eval_samples_per_second": 361.142, "eval_steps_per_second": 45.143, "step": 433 }, { "epoch": 19.727272727272727, "grad_norm": 0.01678645797073841, "learning_rate": 1.3302631578947369e-05, "loss": 0.0018, "step": 434 }, { "epoch": 19.727272727272727, "eval_loss": 0.0011427812278270721, "eval_runtime": 0.229, "eval_samples_per_second": 384.254, "eval_steps_per_second": 48.032, "step": 434 }, { "epoch": 19.772727272727273, "grad_norm": 0.01921844109892845, "learning_rate": 1.3263157894736843e-05, "loss": 0.0021, "step": 435 }, { "epoch": 19.772727272727273, "eval_loss": 0.0011407433776184916, "eval_runtime": 0.24, "eval_samples_per_second": 366.62, "eval_steps_per_second": 45.828, "step": 435 }, { "epoch": 19.818181818181817, "grad_norm": 0.01700635813176632, "learning_rate": 1.3223684210526315e-05, "loss": 0.0019, "step": 436 }, { "epoch": 19.818181818181817, "eval_loss": 0.0011388043640181422, "eval_runtime": 0.241, "eval_samples_per_second": 365.22, "eval_steps_per_second": 45.652, "step": 436 }, { "epoch": 19.863636363636363, "grad_norm": 0.02139265649020672, "learning_rate": 1.318421052631579e-05, "loss": 0.0021, "step": 437 }, { "epoch": 19.863636363636363, "eval_loss": 0.0011367396218702197, "eval_runtime": 0.2327, "eval_samples_per_second": 378.128, "eval_steps_per_second": 47.266, "step": 437 }, { "epoch": 19.90909090909091, "grad_norm": 0.016315054148435593, "learning_rate": 1.3144736842105263e-05, "loss": 0.0018, "step": 438 }, { "epoch": 19.90909090909091, "eval_loss": 0.001134704565629363, "eval_runtime": 0.243, "eval_samples_per_second": 362.095, "eval_steps_per_second": 45.262, "step": 438 }, { "epoch": 19.954545454545453, "grad_norm": 0.015357021242380142, "learning_rate": 1.3105263157894738e-05, "loss": 0.0019, "step": 439 }, { "epoch": 19.954545454545453, "eval_loss": 0.0011326519306749105, "eval_runtime": 0.238, "eval_samples_per_second": 369.798, "eval_steps_per_second": 46.225, "step": 439 }, { "epoch": 20.0, "grad_norm": 0.01644103042781353, "learning_rate": 1.3065789473684212e-05, "loss": 0.0019, "step": 440 }, { "epoch": 20.0, "eval_loss": 0.0011306345695629716, "eval_runtime": 0.2373, "eval_samples_per_second": 370.835, "eval_steps_per_second": 46.354, "step": 440 }, { "epoch": 20.045454545454547, "grad_norm": 0.0168069489300251, "learning_rate": 1.3026315789473684e-05, "loss": 0.002, "step": 441 }, { "epoch": 20.045454545454547, "eval_loss": 0.0011284599313512444, "eval_runtime": 0.2305, "eval_samples_per_second": 381.741, "eval_steps_per_second": 47.718, "step": 441 }, { "epoch": 20.09090909090909, "grad_norm": 0.015401924960315228, "learning_rate": 1.2986842105263158e-05, "loss": 0.0019, "step": 442 }, { "epoch": 20.09090909090909, "eval_loss": 0.0011262963525950909, "eval_runtime": 0.2364, "eval_samples_per_second": 372.316, "eval_steps_per_second": 46.54, "step": 442 }, { "epoch": 20.136363636363637, "grad_norm": 0.019058704376220703, "learning_rate": 1.2947368421052633e-05, "loss": 0.0019, "step": 443 }, { "epoch": 20.136363636363637, "eval_loss": 0.0011239717714488506, "eval_runtime": 0.2383, "eval_samples_per_second": 369.214, "eval_steps_per_second": 46.152, "step": 443 }, { "epoch": 20.181818181818183, "grad_norm": 0.018643731251358986, "learning_rate": 1.2907894736842105e-05, "loss": 0.0019, "step": 444 }, { "epoch": 20.181818181818183, "eval_loss": 0.0011216469574719667, "eval_runtime": 0.2413, "eval_samples_per_second": 364.671, "eval_steps_per_second": 45.584, "step": 444 }, { "epoch": 20.227272727272727, "grad_norm": 0.018360739573836327, "learning_rate": 1.2868421052631579e-05, "loss": 0.002, "step": 445 }, { "epoch": 20.227272727272727, "eval_loss": 0.0011192425154149532, "eval_runtime": 0.2331, "eval_samples_per_second": 377.473, "eval_steps_per_second": 47.184, "step": 445 }, { "epoch": 20.272727272727273, "grad_norm": 0.016574162989854813, "learning_rate": 1.2828947368421053e-05, "loss": 0.0019, "step": 446 }, { "epoch": 20.272727272727273, "eval_loss": 0.001116919214837253, "eval_runtime": 0.2433, "eval_samples_per_second": 361.621, "eval_steps_per_second": 45.203, "step": 446 }, { "epoch": 20.318181818181817, "grad_norm": 0.01646783947944641, "learning_rate": 1.2789473684210526e-05, "loss": 0.0019, "step": 447 }, { "epoch": 20.318181818181817, "eval_loss": 0.0011146310716867447, "eval_runtime": 0.2514, "eval_samples_per_second": 349.985, "eval_steps_per_second": 43.748, "step": 447 }, { "epoch": 20.363636363636363, "grad_norm": 0.017044425010681152, "learning_rate": 1.275e-05, "loss": 0.0018, "step": 448 }, { "epoch": 20.363636363636363, "eval_loss": 0.0011123091680929065, "eval_runtime": 0.253, "eval_samples_per_second": 347.827, "eval_steps_per_second": 43.478, "step": 448 }, { "epoch": 20.40909090909091, "grad_norm": 0.017729461193084717, "learning_rate": 1.2710526315789474e-05, "loss": 0.0019, "step": 449 }, { "epoch": 20.40909090909091, "eval_loss": 0.001110163051635027, "eval_runtime": 0.2651, "eval_samples_per_second": 331.944, "eval_steps_per_second": 41.493, "step": 449 }, { "epoch": 20.454545454545453, "grad_norm": 0.014911322854459286, "learning_rate": 1.2671052631578947e-05, "loss": 0.0017, "step": 450 }, { "epoch": 20.454545454545453, "eval_loss": 0.0011080644326284528, "eval_runtime": 0.2496, "eval_samples_per_second": 352.625, "eval_steps_per_second": 44.078, "step": 450 }, { "epoch": 20.5, "grad_norm": 0.016675200313329697, "learning_rate": 1.263157894736842e-05, "loss": 0.0019, "step": 451 }, { "epoch": 20.5, "eval_loss": 0.0011060454417020082, "eval_runtime": 0.26, "eval_samples_per_second": 338.446, "eval_steps_per_second": 42.306, "step": 451 }, { "epoch": 20.545454545454547, "grad_norm": 0.016018547117710114, "learning_rate": 1.2592105263157895e-05, "loss": 0.0018, "step": 452 }, { "epoch": 20.545454545454547, "eval_loss": 0.0011039102682843804, "eval_runtime": 0.2399, "eval_samples_per_second": 366.846, "eval_steps_per_second": 45.856, "step": 452 }, { "epoch": 20.59090909090909, "grad_norm": 0.016912776976823807, "learning_rate": 1.2552631578947369e-05, "loss": 0.0019, "step": 453 }, { "epoch": 20.59090909090909, "eval_loss": 0.0011017858050763607, "eval_runtime": 0.2273, "eval_samples_per_second": 387.134, "eval_steps_per_second": 48.392, "step": 453 }, { "epoch": 20.636363636363637, "grad_norm": 0.015879783779382706, "learning_rate": 1.2513157894736843e-05, "loss": 0.0018, "step": 454 }, { "epoch": 20.636363636363637, "eval_loss": 0.0010996219934895635, "eval_runtime": 0.2449, "eval_samples_per_second": 359.378, "eval_steps_per_second": 44.922, "step": 454 }, { "epoch": 20.681818181818183, "grad_norm": 0.017021868377923965, "learning_rate": 1.2473684210526317e-05, "loss": 0.0019, "step": 455 }, { "epoch": 20.681818181818183, "eval_loss": 0.0010973933385685086, "eval_runtime": 0.229, "eval_samples_per_second": 384.317, "eval_steps_per_second": 48.04, "step": 455 }, { "epoch": 20.727272727272727, "grad_norm": 0.015419513918459415, "learning_rate": 1.243421052631579e-05, "loss": 0.0019, "step": 456 }, { "epoch": 20.727272727272727, "eval_loss": 0.001095130923204124, "eval_runtime": 0.2362, "eval_samples_per_second": 372.489, "eval_steps_per_second": 46.561, "step": 456 }, { "epoch": 20.772727272727273, "grad_norm": 0.01693497784435749, "learning_rate": 1.2394736842105264e-05, "loss": 0.0018, "step": 457 }, { "epoch": 20.772727272727273, "eval_loss": 0.0010928618721663952, "eval_runtime": 0.2233, "eval_samples_per_second": 394.174, "eval_steps_per_second": 49.272, "step": 457 }, { "epoch": 20.818181818181817, "grad_norm": 0.017432473599910736, "learning_rate": 1.2355263157894738e-05, "loss": 0.0018, "step": 458 }, { "epoch": 20.818181818181817, "eval_loss": 0.0010908265830948949, "eval_runtime": 0.2275, "eval_samples_per_second": 386.81, "eval_steps_per_second": 48.351, "step": 458 }, { "epoch": 20.863636363636363, "grad_norm": 0.014237020164728165, "learning_rate": 1.231578947368421e-05, "loss": 0.0016, "step": 459 }, { "epoch": 20.863636363636363, "eval_loss": 0.0010887522948905826, "eval_runtime": 0.236, "eval_samples_per_second": 372.82, "eval_steps_per_second": 46.603, "step": 459 }, { "epoch": 20.90909090909091, "grad_norm": 0.016278453171253204, "learning_rate": 1.2276315789473685e-05, "loss": 0.0017, "step": 460 }, { "epoch": 20.90909090909091, "eval_loss": 0.0010867157252505422, "eval_runtime": 0.2288, "eval_samples_per_second": 384.554, "eval_steps_per_second": 48.069, "step": 460 }, { "epoch": 20.954545454545453, "grad_norm": 0.01595933921635151, "learning_rate": 1.2236842105263159e-05, "loss": 0.0019, "step": 461 }, { "epoch": 20.954545454545453, "eval_loss": 0.0010847292141988873, "eval_runtime": 0.2252, "eval_samples_per_second": 390.754, "eval_steps_per_second": 48.844, "step": 461 }, { "epoch": 21.0, "grad_norm": 0.017483873292803764, "learning_rate": 1.2197368421052631e-05, "loss": 0.0018, "step": 462 }, { "epoch": 21.0, "eval_loss": 0.0010827549267560244, "eval_runtime": 0.2236, "eval_samples_per_second": 393.554, "eval_steps_per_second": 49.194, "step": 462 }, { "epoch": 21.045454545454547, "grad_norm": 0.01537961047142744, "learning_rate": 1.2157894736842105e-05, "loss": 0.0018, "step": 463 }, { "epoch": 21.045454545454547, "eval_loss": 0.0010808442020788789, "eval_runtime": 0.2361, "eval_samples_per_second": 372.729, "eval_steps_per_second": 46.591, "step": 463 }, { "epoch": 21.09090909090909, "grad_norm": 0.015306917950510979, "learning_rate": 1.2118421052631578e-05, "loss": 0.0017, "step": 464 }, { "epoch": 21.09090909090909, "eval_loss": 0.0010790039086714387, "eval_runtime": 0.2298, "eval_samples_per_second": 382.888, "eval_steps_per_second": 47.861, "step": 464 }, { "epoch": 21.136363636363637, "grad_norm": 0.013436819426715374, "learning_rate": 1.2078947368421052e-05, "loss": 0.0016, "step": 465 }, { "epoch": 21.136363636363637, "eval_loss": 0.0010772122768685222, "eval_runtime": 0.2421, "eval_samples_per_second": 363.528, "eval_steps_per_second": 45.441, "step": 465 }, { "epoch": 21.181818181818183, "grad_norm": 0.016245294362306595, "learning_rate": 1.2039473684210528e-05, "loss": 0.0018, "step": 466 }, { "epoch": 21.181818181818183, "eval_loss": 0.0010752826929092407, "eval_runtime": 0.2313, "eval_samples_per_second": 380.386, "eval_steps_per_second": 47.548, "step": 466 }, { "epoch": 21.227272727272727, "grad_norm": 0.015921350568532944, "learning_rate": 1.2e-05, "loss": 0.0017, "step": 467 }, { "epoch": 21.227272727272727, "eval_loss": 0.0010733003728091717, "eval_runtime": 0.2302, "eval_samples_per_second": 382.349, "eval_steps_per_second": 47.794, "step": 467 }, { "epoch": 21.272727272727273, "grad_norm": 0.016333753243088722, "learning_rate": 1.1960526315789474e-05, "loss": 0.0018, "step": 468 }, { "epoch": 21.272727272727273, "eval_loss": 0.0010712259681895375, "eval_runtime": 0.2299, "eval_samples_per_second": 382.824, "eval_steps_per_second": 47.853, "step": 468 }, { "epoch": 21.318181818181817, "grad_norm": 0.015542343258857727, "learning_rate": 1.1921052631578949e-05, "loss": 0.0017, "step": 469 }, { "epoch": 21.318181818181817, "eval_loss": 0.0010691812494769692, "eval_runtime": 0.2401, "eval_samples_per_second": 366.569, "eval_steps_per_second": 45.821, "step": 469 }, { "epoch": 21.363636363636363, "grad_norm": 0.017036397010087967, "learning_rate": 1.1881578947368421e-05, "loss": 0.0019, "step": 470 }, { "epoch": 21.363636363636363, "eval_loss": 0.0010671325726434588, "eval_runtime": 0.2367, "eval_samples_per_second": 371.749, "eval_steps_per_second": 46.469, "step": 470 }, { "epoch": 21.40909090909091, "grad_norm": 0.01621134579181671, "learning_rate": 1.1842105263157895e-05, "loss": 0.0018, "step": 471 }, { "epoch": 21.40909090909091, "eval_loss": 0.0010652164928615093, "eval_runtime": 0.2376, "eval_samples_per_second": 370.382, "eval_steps_per_second": 46.298, "step": 471 }, { "epoch": 21.454545454545453, "grad_norm": 0.013604752719402313, "learning_rate": 1.180263157894737e-05, "loss": 0.0017, "step": 472 }, { "epoch": 21.454545454545453, "eval_loss": 0.0010633313795551658, "eval_runtime": 0.2408, "eval_samples_per_second": 365.399, "eval_steps_per_second": 45.675, "step": 472 }, { "epoch": 21.5, "grad_norm": 0.014795001596212387, "learning_rate": 1.1763157894736842e-05, "loss": 0.0016, "step": 473 }, { "epoch": 21.5, "eval_loss": 0.001061469316482544, "eval_runtime": 0.2486, "eval_samples_per_second": 354.0, "eval_steps_per_second": 44.25, "step": 473 }, { "epoch": 21.545454545454547, "grad_norm": 0.015267064794898033, "learning_rate": 1.1723684210526316e-05, "loss": 0.0018, "step": 474 }, { "epoch": 21.545454545454547, "eval_loss": 0.0010596156353130937, "eval_runtime": 0.2421, "eval_samples_per_second": 363.419, "eval_steps_per_second": 45.427, "step": 474 }, { "epoch": 21.59090909090909, "grad_norm": 0.017209574580192566, "learning_rate": 1.168421052631579e-05, "loss": 0.0018, "step": 475 }, { "epoch": 21.59090909090909, "eval_loss": 0.0010576344793662429, "eval_runtime": 0.2464, "eval_samples_per_second": 357.122, "eval_steps_per_second": 44.64, "step": 475 }, { "epoch": 21.636363636363637, "grad_norm": 0.0154210040345788, "learning_rate": 1.1644736842105263e-05, "loss": 0.0018, "step": 476 }, { "epoch": 21.636363636363637, "eval_loss": 0.0010555870831012726, "eval_runtime": 0.2538, "eval_samples_per_second": 346.671, "eval_steps_per_second": 43.334, "step": 476 }, { "epoch": 21.681818181818183, "grad_norm": 0.017148546874523163, "learning_rate": 1.1605263157894737e-05, "loss": 0.0018, "step": 477 }, { "epoch": 21.681818181818183, "eval_loss": 0.0010535044129937887, "eval_runtime": 0.2437, "eval_samples_per_second": 361.038, "eval_steps_per_second": 45.13, "step": 477 }, { "epoch": 21.727272727272727, "grad_norm": 0.01518462784588337, "learning_rate": 1.1565789473684211e-05, "loss": 0.0017, "step": 478 }, { "epoch": 21.727272727272727, "eval_loss": 0.0010514232562854886, "eval_runtime": 0.2402, "eval_samples_per_second": 366.378, "eval_steps_per_second": 45.797, "step": 478 }, { "epoch": 21.772727272727273, "grad_norm": 0.01500785257667303, "learning_rate": 1.1526315789473683e-05, "loss": 0.0016, "step": 479 }, { "epoch": 21.772727272727273, "eval_loss": 0.0010493744630366564, "eval_runtime": 0.2449, "eval_samples_per_second": 359.362, "eval_steps_per_second": 44.92, "step": 479 }, { "epoch": 21.818181818181817, "grad_norm": 0.015978703275322914, "learning_rate": 1.148684210526316e-05, "loss": 0.0018, "step": 480 }, { "epoch": 21.818181818181817, "eval_loss": 0.0010474204318597913, "eval_runtime": 0.2586, "eval_samples_per_second": 340.345, "eval_steps_per_second": 42.543, "step": 480 }, { "epoch": 21.863636363636363, "grad_norm": 0.01765250600874424, "learning_rate": 1.1447368421052632e-05, "loss": 0.0017, "step": 481 }, { "epoch": 21.863636363636363, "eval_loss": 0.0010454690782353282, "eval_runtime": 0.2292, "eval_samples_per_second": 383.999, "eval_steps_per_second": 48.0, "step": 481 }, { "epoch": 21.90909090909091, "grad_norm": 0.016576098278164864, "learning_rate": 1.1407894736842106e-05, "loss": 0.0017, "step": 482 }, { "epoch": 21.90909090909091, "eval_loss": 0.0010435826843604445, "eval_runtime": 0.2414, "eval_samples_per_second": 364.501, "eval_steps_per_second": 45.563, "step": 482 }, { "epoch": 21.954545454545453, "grad_norm": 0.014276851899921894, "learning_rate": 1.136842105263158e-05, "loss": 0.0017, "step": 483 }, { "epoch": 21.954545454545453, "eval_loss": 0.0010416691657155752, "eval_runtime": 0.2241, "eval_samples_per_second": 392.673, "eval_steps_per_second": 49.084, "step": 483 }, { "epoch": 22.0, "grad_norm": 0.01667684316635132, "learning_rate": 1.1328947368421052e-05, "loss": 0.0017, "step": 484 }, { "epoch": 22.0, "eval_loss": 0.0010398232843726873, "eval_runtime": 0.24, "eval_samples_per_second": 366.592, "eval_steps_per_second": 45.824, "step": 484 }, { "epoch": 22.045454545454547, "grad_norm": 0.016187671571969986, "learning_rate": 1.1289473684210527e-05, "loss": 0.0018, "step": 485 }, { "epoch": 22.045454545454547, "eval_loss": 0.0010379315353929996, "eval_runtime": 0.2306, "eval_samples_per_second": 381.551, "eval_steps_per_second": 47.694, "step": 485 }, { "epoch": 22.09090909090909, "grad_norm": 0.014743163250386715, "learning_rate": 1.125e-05, "loss": 0.0018, "step": 486 }, { "epoch": 22.09090909090909, "eval_loss": 0.0010359951993450522, "eval_runtime": 0.227, "eval_samples_per_second": 387.598, "eval_steps_per_second": 48.45, "step": 486 }, { "epoch": 22.136363636363637, "grad_norm": 0.01694609597325325, "learning_rate": 1.1210526315789473e-05, "loss": 0.0017, "step": 487 }, { "epoch": 22.136363636363637, "eval_loss": 0.0010341384913772345, "eval_runtime": 0.2407, "eval_samples_per_second": 365.633, "eval_steps_per_second": 45.704, "step": 487 }, { "epoch": 22.181818181818183, "grad_norm": 0.014260073192417622, "learning_rate": 1.1171052631578947e-05, "loss": 0.0017, "step": 488 }, { "epoch": 22.181818181818183, "eval_loss": 0.0010322789894416928, "eval_runtime": 0.2279, "eval_samples_per_second": 386.189, "eval_steps_per_second": 48.274, "step": 488 }, { "epoch": 22.227272727272727, "grad_norm": 0.017539717257022858, "learning_rate": 1.1131578947368421e-05, "loss": 0.0016, "step": 489 }, { "epoch": 22.227272727272727, "eval_loss": 0.001030544051900506, "eval_runtime": 0.239, "eval_samples_per_second": 368.276, "eval_steps_per_second": 46.034, "step": 489 }, { "epoch": 22.272727272727273, "grad_norm": 0.013456945307552814, "learning_rate": 1.1092105263157894e-05, "loss": 0.0016, "step": 490 }, { "epoch": 22.272727272727273, "eval_loss": 0.0010288661578670144, "eval_runtime": 0.2301, "eval_samples_per_second": 382.513, "eval_steps_per_second": 47.814, "step": 490 }, { "epoch": 22.318181818181817, "grad_norm": 0.016474781557917595, "learning_rate": 1.1052631578947368e-05, "loss": 0.0017, "step": 491 }, { "epoch": 22.318181818181817, "eval_loss": 0.0010273018851876259, "eval_runtime": 0.235, "eval_samples_per_second": 374.491, "eval_steps_per_second": 46.811, "step": 491 }, { "epoch": 22.363636363636363, "grad_norm": 0.01373574323952198, "learning_rate": 1.1013157894736842e-05, "loss": 0.0014, "step": 492 }, { "epoch": 22.363636363636363, "eval_loss": 0.00102571165189147, "eval_runtime": 0.2263, "eval_samples_per_second": 388.813, "eval_steps_per_second": 48.602, "step": 492 }, { "epoch": 22.40909090909091, "grad_norm": 0.015442097559571266, "learning_rate": 1.0973684210526316e-05, "loss": 0.0016, "step": 493 }, { "epoch": 22.40909090909091, "eval_loss": 0.0010241527343168855, "eval_runtime": 0.2352, "eval_samples_per_second": 374.081, "eval_steps_per_second": 46.76, "step": 493 }, { "epoch": 22.454545454545453, "grad_norm": 0.015592455863952637, "learning_rate": 1.093421052631579e-05, "loss": 0.0017, "step": 494 }, { "epoch": 22.454545454545453, "eval_loss": 0.0010226276936009526, "eval_runtime": 0.2373, "eval_samples_per_second": 370.902, "eval_steps_per_second": 46.363, "step": 494 }, { "epoch": 22.5, "grad_norm": 0.013556539081037045, "learning_rate": 1.0894736842105265e-05, "loss": 0.0016, "step": 495 }, { "epoch": 22.5, "eval_loss": 0.001021133502945304, "eval_runtime": 0.2433, "eval_samples_per_second": 361.732, "eval_steps_per_second": 45.217, "step": 495 }, { "epoch": 22.545454545454547, "grad_norm": 0.012894881889224052, "learning_rate": 1.0855263157894737e-05, "loss": 0.0016, "step": 496 }, { "epoch": 22.545454545454547, "eval_loss": 0.0010197004303336143, "eval_runtime": 0.2415, "eval_samples_per_second": 364.331, "eval_steps_per_second": 45.541, "step": 496 }, { "epoch": 22.59090909090909, "grad_norm": 0.014628540724515915, "learning_rate": 1.0815789473684211e-05, "loss": 0.0017, "step": 497 }, { "epoch": 22.59090909090909, "eval_loss": 0.0010182132245972753, "eval_runtime": 0.2417, "eval_samples_per_second": 364.047, "eval_steps_per_second": 45.506, "step": 497 }, { "epoch": 22.636363636363637, "grad_norm": 0.014721691608428955, "learning_rate": 1.0776315789473685e-05, "loss": 0.0017, "step": 498 }, { "epoch": 22.636363636363637, "eval_loss": 0.0010166773572564125, "eval_runtime": 0.2388, "eval_samples_per_second": 368.522, "eval_steps_per_second": 46.065, "step": 498 }, { "epoch": 22.681818181818183, "grad_norm": 0.01576976478099823, "learning_rate": 1.0736842105263158e-05, "loss": 0.0018, "step": 499 }, { "epoch": 22.681818181818183, "eval_loss": 0.001015029032714665, "eval_runtime": 0.2308, "eval_samples_per_second": 381.26, "eval_steps_per_second": 47.657, "step": 499 }, { "epoch": 22.727272727272727, "grad_norm": 0.015886450186371803, "learning_rate": 1.0697368421052632e-05, "loss": 0.0017, "step": 500 }, { "epoch": 22.727272727272727, "eval_loss": 0.0010134456679224968, "eval_runtime": 0.236, "eval_samples_per_second": 372.817, "eval_steps_per_second": 46.602, "step": 500 }, { "epoch": 22.772727272727273, "grad_norm": 0.01687587983906269, "learning_rate": 1.0657894736842106e-05, "loss": 0.0017, "step": 501 }, { "epoch": 22.772727272727273, "eval_loss": 0.0010118514765053988, "eval_runtime": 0.2468, "eval_samples_per_second": 356.526, "eval_steps_per_second": 44.566, "step": 501 }, { "epoch": 22.818181818181817, "grad_norm": 0.013874330557882786, "learning_rate": 1.0618421052631579e-05, "loss": 0.0016, "step": 502 }, { "epoch": 22.818181818181817, "eval_loss": 0.0010103358654305339, "eval_runtime": 0.2231, "eval_samples_per_second": 394.376, "eval_steps_per_second": 49.297, "step": 502 }, { "epoch": 22.863636363636363, "grad_norm": 0.014864981174468994, "learning_rate": 1.0578947368421053e-05, "loss": 0.0017, "step": 503 }, { "epoch": 22.863636363636363, "eval_loss": 0.001008835039101541, "eval_runtime": 0.2399, "eval_samples_per_second": 366.77, "eval_steps_per_second": 45.846, "step": 503 }, { "epoch": 22.90909090909091, "grad_norm": 0.013614412397146225, "learning_rate": 1.0539473684210525e-05, "loss": 0.0016, "step": 504 }, { "epoch": 22.90909090909091, "eval_loss": 0.001007361221127212, "eval_runtime": 0.2267, "eval_samples_per_second": 388.143, "eval_steps_per_second": 48.518, "step": 504 }, { "epoch": 22.954545454545453, "grad_norm": 0.019395658746361732, "learning_rate": 1.05e-05, "loss": 0.0019, "step": 505 }, { "epoch": 22.954545454545453, "eval_loss": 0.0010058052139356732, "eval_runtime": 0.2757, "eval_samples_per_second": 319.24, "eval_steps_per_second": 39.905, "step": 505 }, { "epoch": 23.0, "grad_norm": 0.017713138833642006, "learning_rate": 1.0460526315789474e-05, "loss": 0.0018, "step": 506 }, { "epoch": 23.0, "eval_loss": 0.0010041649220511317, "eval_runtime": 0.2305, "eval_samples_per_second": 381.809, "eval_steps_per_second": 47.726, "step": 506 }, { "epoch": 23.045454545454547, "grad_norm": 0.014331554993987083, "learning_rate": 1.0421052631578948e-05, "loss": 0.0017, "step": 507 }, { "epoch": 23.045454545454547, "eval_loss": 0.0010025816736742854, "eval_runtime": 0.2325, "eval_samples_per_second": 378.56, "eval_steps_per_second": 47.32, "step": 507 }, { "epoch": 23.09090909090909, "grad_norm": 0.014041769318282604, "learning_rate": 1.0381578947368422e-05, "loss": 0.0017, "step": 508 }, { "epoch": 23.09090909090909, "eval_loss": 0.001001022639684379, "eval_runtime": 0.2296, "eval_samples_per_second": 383.301, "eval_steps_per_second": 47.913, "step": 508 }, { "epoch": 23.136363636363637, "grad_norm": 0.014782671816647053, "learning_rate": 1.0342105263157896e-05, "loss": 0.0017, "step": 509 }, { "epoch": 23.136363636363637, "eval_loss": 0.0009995178552344441, "eval_runtime": 0.2324, "eval_samples_per_second": 378.719, "eval_steps_per_second": 47.34, "step": 509 }, { "epoch": 23.181818181818183, "grad_norm": 0.014820964075624943, "learning_rate": 1.0302631578947368e-05, "loss": 0.0017, "step": 510 }, { "epoch": 23.181818181818183, "eval_loss": 0.0009979914175346494, "eval_runtime": 0.2306, "eval_samples_per_second": 381.609, "eval_steps_per_second": 47.701, "step": 510 }, { "epoch": 23.227272727272727, "grad_norm": 0.014552117325365543, "learning_rate": 1.0263157894736843e-05, "loss": 0.0017, "step": 511 }, { "epoch": 23.227272727272727, "eval_loss": 0.0009964742930606008, "eval_runtime": 0.2277, "eval_samples_per_second": 386.477, "eval_steps_per_second": 48.31, "step": 511 }, { "epoch": 23.272727272727273, "grad_norm": 0.016575666144490242, "learning_rate": 1.0223684210526317e-05, "loss": 0.0018, "step": 512 }, { "epoch": 23.272727272727273, "eval_loss": 0.0009949287632480264, "eval_runtime": 0.2408, "eval_samples_per_second": 365.446, "eval_steps_per_second": 45.681, "step": 512 }, { "epoch": 23.318181818181817, "grad_norm": 0.013247662223875523, "learning_rate": 1.018421052631579e-05, "loss": 0.0016, "step": 513 }, { "epoch": 23.318181818181817, "eval_loss": 0.0009934090776368976, "eval_runtime": 0.2312, "eval_samples_per_second": 380.548, "eval_steps_per_second": 47.569, "step": 513 }, { "epoch": 23.363636363636363, "grad_norm": 0.014102768152952194, "learning_rate": 1.0144736842105263e-05, "loss": 0.0017, "step": 514 }, { "epoch": 23.363636363636363, "eval_loss": 0.0009918283903971314, "eval_runtime": 0.2315, "eval_samples_per_second": 380.202, "eval_steps_per_second": 47.525, "step": 514 }, { "epoch": 23.40909090909091, "grad_norm": 0.015047273598611355, "learning_rate": 1.0105263157894738e-05, "loss": 0.0017, "step": 515 }, { "epoch": 23.40909090909091, "eval_loss": 0.0009903222089633346, "eval_runtime": 0.2308, "eval_samples_per_second": 381.309, "eval_steps_per_second": 47.664, "step": 515 }, { "epoch": 23.454545454545453, "grad_norm": 0.016119079664349556, "learning_rate": 1.006578947368421e-05, "loss": 0.0018, "step": 516 }, { "epoch": 23.454545454545453, "eval_loss": 0.0009887360502034426, "eval_runtime": 0.2356, "eval_samples_per_second": 373.467, "eval_steps_per_second": 46.683, "step": 516 }, { "epoch": 23.5, "grad_norm": 0.013055874034762383, "learning_rate": 1.0026315789473684e-05, "loss": 0.0015, "step": 517 }, { "epoch": 23.5, "eval_loss": 0.0009872028604149818, "eval_runtime": 0.2353, "eval_samples_per_second": 373.918, "eval_steps_per_second": 46.74, "step": 517 }, { "epoch": 23.545454545454547, "grad_norm": 0.014796939678490162, "learning_rate": 9.986842105263158e-06, "loss": 0.0017, "step": 518 }, { "epoch": 23.545454545454547, "eval_loss": 0.0009856532560661435, "eval_runtime": 0.236, "eval_samples_per_second": 372.816, "eval_steps_per_second": 46.602, "step": 518 }, { "epoch": 23.59090909090909, "grad_norm": 0.01749352179467678, "learning_rate": 9.94736842105263e-06, "loss": 0.0018, "step": 519 }, { "epoch": 23.59090909090909, "eval_loss": 0.000984109123237431, "eval_runtime": 0.2375, "eval_samples_per_second": 370.519, "eval_steps_per_second": 46.315, "step": 519 }, { "epoch": 23.636363636363637, "grad_norm": 0.014436857774853706, "learning_rate": 9.907894736842107e-06, "loss": 0.0017, "step": 520 }, { "epoch": 23.636363636363637, "eval_loss": 0.0009825569577515125, "eval_runtime": 0.2329, "eval_samples_per_second": 377.839, "eval_steps_per_second": 47.23, "step": 520 }, { "epoch": 23.681818181818183, "grad_norm": 0.0134369982406497, "learning_rate": 9.868421052631579e-06, "loss": 0.0015, "step": 521 }, { "epoch": 23.681818181818183, "eval_loss": 0.0009810830233618617, "eval_runtime": 0.2463, "eval_samples_per_second": 357.352, "eval_steps_per_second": 44.669, "step": 521 }, { "epoch": 23.727272727272727, "grad_norm": 0.015284021385014057, "learning_rate": 9.828947368421053e-06, "loss": 0.0017, "step": 522 }, { "epoch": 23.727272727272727, "eval_loss": 0.0009796229423955083, "eval_runtime": 0.2303, "eval_samples_per_second": 382.111, "eval_steps_per_second": 47.764, "step": 522 }, { "epoch": 23.772727272727273, "grad_norm": 0.01389851700514555, "learning_rate": 9.789473684210527e-06, "loss": 0.0016, "step": 523 }, { "epoch": 23.772727272727273, "eval_loss": 0.0009782682172954082, "eval_runtime": 0.2358, "eval_samples_per_second": 373.188, "eval_steps_per_second": 46.649, "step": 523 }, { "epoch": 23.818181818181817, "grad_norm": 0.013064984232187271, "learning_rate": 9.75e-06, "loss": 0.0016, "step": 524 }, { "epoch": 23.818181818181817, "eval_loss": 0.000976921641267836, "eval_runtime": 0.4347, "eval_samples_per_second": 202.42, "eval_steps_per_second": 25.303, "step": 524 }, { "epoch": 23.863636363636363, "grad_norm": 0.01853189431130886, "learning_rate": 9.710526315789474e-06, "loss": 0.0018, "step": 525 }, { "epoch": 23.863636363636363, "eval_loss": 0.0009755737846717238, "eval_runtime": 0.3094, "eval_samples_per_second": 284.419, "eval_steps_per_second": 35.552, "step": 525 }, { "epoch": 23.90909090909091, "grad_norm": 0.015431704930961132, "learning_rate": 9.671052631578948e-06, "loss": 0.0016, "step": 526 }, { "epoch": 23.90909090909091, "eval_loss": 0.0009742649854160845, "eval_runtime": 0.3285, "eval_samples_per_second": 267.881, "eval_steps_per_second": 33.485, "step": 526 }, { "epoch": 23.954545454545453, "grad_norm": 0.015396500937640667, "learning_rate": 9.63157894736842e-06, "loss": 0.0017, "step": 527 }, { "epoch": 23.954545454545453, "eval_loss": 0.0009728847653605044, "eval_runtime": 0.379, "eval_samples_per_second": 232.167, "eval_steps_per_second": 29.021, "step": 527 }, { "epoch": 24.0, "grad_norm": 0.018940720707178116, "learning_rate": 9.592105263157895e-06, "loss": 0.0018, "step": 528 }, { "epoch": 24.0, "eval_loss": 0.0009714543703012168, "eval_runtime": 0.4121, "eval_samples_per_second": 213.562, "eval_steps_per_second": 26.695, "step": 528 }, { "epoch": 24.045454545454547, "grad_norm": 0.013447549194097519, "learning_rate": 9.552631578947369e-06, "loss": 0.0016, "step": 529 }, { "epoch": 24.045454545454547, "eval_loss": 0.0009699968504719436, "eval_runtime": 0.4871, "eval_samples_per_second": 180.65, "eval_steps_per_second": 22.581, "step": 529 }, { "epoch": 24.09090909090909, "grad_norm": 0.01361093670129776, "learning_rate": 9.513157894736841e-06, "loss": 0.0016, "step": 530 }, { "epoch": 24.09090909090909, "eval_loss": 0.0009685555123724043, "eval_runtime": 0.4944, "eval_samples_per_second": 178.009, "eval_steps_per_second": 22.251, "step": 530 }, { "epoch": 24.136363636363637, "grad_norm": 0.014719787985086441, "learning_rate": 9.473684210526315e-06, "loss": 0.0016, "step": 531 }, { "epoch": 24.136363636363637, "eval_loss": 0.0009670979925431311, "eval_runtime": 0.3472, "eval_samples_per_second": 253.483, "eval_steps_per_second": 31.685, "step": 531 }, { "epoch": 24.181818181818183, "grad_norm": 0.01682870462536812, "learning_rate": 9.43421052631579e-06, "loss": 0.0018, "step": 532 }, { "epoch": 24.181818181818183, "eval_loss": 0.0009655930334702134, "eval_runtime": 0.2295, "eval_samples_per_second": 383.473, "eval_steps_per_second": 47.934, "step": 532 }, { "epoch": 24.227272727272727, "grad_norm": 0.015661459416151047, "learning_rate": 9.394736842105262e-06, "loss": 0.0016, "step": 533 }, { "epoch": 24.227272727272727, "eval_loss": 0.0009641083306632936, "eval_runtime": 0.247, "eval_samples_per_second": 356.243, "eval_steps_per_second": 44.53, "step": 533 }, { "epoch": 24.272727272727273, "grad_norm": 0.015652479603886604, "learning_rate": 9.355263157894738e-06, "loss": 0.0016, "step": 534 }, { "epoch": 24.272727272727273, "eval_loss": 0.0009626846294850111, "eval_runtime": 0.2337, "eval_samples_per_second": 376.608, "eval_steps_per_second": 47.076, "step": 534 }, { "epoch": 24.318181818181817, "grad_norm": 0.013394070789217949, "learning_rate": 9.315789473684212e-06, "loss": 0.0016, "step": 535 }, { "epoch": 24.318181818181817, "eval_loss": 0.0009613109868951142, "eval_runtime": 0.2315, "eval_samples_per_second": 380.202, "eval_steps_per_second": 47.525, "step": 535 }, { "epoch": 24.363636363636363, "grad_norm": 0.015152989886701107, "learning_rate": 9.276315789473685e-06, "loss": 0.0016, "step": 536 }, { "epoch": 24.363636363636363, "eval_loss": 0.0009599780314601958, "eval_runtime": 0.2373, "eval_samples_per_second": 370.835, "eval_steps_per_second": 46.354, "step": 536 }, { "epoch": 24.40909090909091, "grad_norm": 0.014209273271262646, "learning_rate": 9.236842105263159e-06, "loss": 0.0016, "step": 537 }, { "epoch": 24.40909090909091, "eval_loss": 0.0009586341911926866, "eval_runtime": 0.2342, "eval_samples_per_second": 375.816, "eval_steps_per_second": 46.977, "step": 537 }, { "epoch": 24.454545454545453, "grad_norm": 0.014566083438694477, "learning_rate": 9.197368421052633e-06, "loss": 0.0015, "step": 538 }, { "epoch": 24.454545454545453, "eval_loss": 0.000957344425842166, "eval_runtime": 0.2373, "eval_samples_per_second": 370.82, "eval_steps_per_second": 46.352, "step": 538 }, { "epoch": 24.5, "grad_norm": 0.016195589676499367, "learning_rate": 9.157894736842105e-06, "loss": 0.0017, "step": 539 }, { "epoch": 24.5, "eval_loss": 0.0009560330072417855, "eval_runtime": 0.2313, "eval_samples_per_second": 380.382, "eval_steps_per_second": 47.548, "step": 539 }, { "epoch": 24.545454545454547, "grad_norm": 0.01577996276319027, "learning_rate": 9.11842105263158e-06, "loss": 0.0017, "step": 540 }, { "epoch": 24.545454545454547, "eval_loss": 0.0009547690278850496, "eval_runtime": 0.2288, "eval_samples_per_second": 384.628, "eval_steps_per_second": 48.079, "step": 540 }, { "epoch": 24.59090909090909, "grad_norm": 0.013901899568736553, "learning_rate": 9.078947368421054e-06, "loss": 0.0015, "step": 541 }, { "epoch": 24.59090909090909, "eval_loss": 0.0009535103454254568, "eval_runtime": 0.2351, "eval_samples_per_second": 374.379, "eval_steps_per_second": 46.797, "step": 541 }, { "epoch": 24.636363636363637, "grad_norm": 0.014091338962316513, "learning_rate": 9.039473684210526e-06, "loss": 0.0016, "step": 542 }, { "epoch": 24.636363636363637, "eval_loss": 0.0009522747131995857, "eval_runtime": 0.2274, "eval_samples_per_second": 387.015, "eval_steps_per_second": 48.377, "step": 542 }, { "epoch": 24.681818181818183, "grad_norm": 0.014544407837092876, "learning_rate": 9e-06, "loss": 0.0017, "step": 543 }, { "epoch": 24.681818181818183, "eval_loss": 0.0009510606760159135, "eval_runtime": 0.2442, "eval_samples_per_second": 360.336, "eval_steps_per_second": 45.042, "step": 543 }, { "epoch": 24.727272727272727, "grad_norm": 0.01616845279932022, "learning_rate": 8.960526315789473e-06, "loss": 0.0017, "step": 544 }, { "epoch": 24.727272727272727, "eval_loss": 0.0009498685249127448, "eval_runtime": 0.2388, "eval_samples_per_second": 368.514, "eval_steps_per_second": 46.064, "step": 544 }, { "epoch": 24.772727272727273, "grad_norm": 0.01609298586845398, "learning_rate": 8.921052631578947e-06, "loss": 0.0017, "step": 545 }, { "epoch": 24.772727272727273, "eval_loss": 0.0009486477356404066, "eval_runtime": 0.2287, "eval_samples_per_second": 384.803, "eval_steps_per_second": 48.1, "step": 545 }, { "epoch": 24.818181818181817, "grad_norm": 0.013633071444928646, "learning_rate": 8.881578947368421e-06, "loss": 0.0016, "step": 546 }, { "epoch": 24.818181818181817, "eval_loss": 0.0009474134421907365, "eval_runtime": 0.2393, "eval_samples_per_second": 367.684, "eval_steps_per_second": 45.96, "step": 546 }, { "epoch": 24.863636363636363, "grad_norm": 0.013738269917666912, "learning_rate": 8.842105263157893e-06, "loss": 0.0016, "step": 547 }, { "epoch": 24.863636363636363, "eval_loss": 0.0009461792069487274, "eval_runtime": 0.2312, "eval_samples_per_second": 380.637, "eval_steps_per_second": 47.58, "step": 547 }, { "epoch": 24.90909090909091, "grad_norm": 0.013620936311781406, "learning_rate": 8.80263157894737e-06, "loss": 0.0015, "step": 548 }, { "epoch": 24.90909090909091, "eval_loss": 0.0009449638891965151, "eval_runtime": 0.2459, "eval_samples_per_second": 357.891, "eval_steps_per_second": 44.736, "step": 548 }, { "epoch": 24.954545454545453, "grad_norm": 0.015967663377523422, "learning_rate": 8.763157894736843e-06, "loss": 0.0017, "step": 549 }, { "epoch": 24.954545454545453, "eval_loss": 0.0009437742992304265, "eval_runtime": 0.239, "eval_samples_per_second": 368.142, "eval_steps_per_second": 46.018, "step": 549 }, { "epoch": 25.0, "grad_norm": 0.012870087288320065, "learning_rate": 8.723684210526316e-06, "loss": 0.0015, "step": 550 }, { "epoch": 25.0, "eval_loss": 0.0009425426251254976, "eval_runtime": 0.2335, "eval_samples_per_second": 376.798, "eval_steps_per_second": 47.1, "step": 550 }, { "epoch": 25.045454545454547, "grad_norm": 0.012893461622297764, "learning_rate": 8.68421052631579e-06, "loss": 0.0015, "step": 551 }, { "epoch": 25.045454545454547, "eval_loss": 0.0009413667139597237, "eval_runtime": 0.236, "eval_samples_per_second": 372.836, "eval_steps_per_second": 46.605, "step": 551 }, { "epoch": 25.09090909090909, "grad_norm": 0.014959870837628841, "learning_rate": 8.644736842105264e-06, "loss": 0.0016, "step": 552 }, { "epoch": 25.09090909090909, "eval_loss": 0.0009402299183420837, "eval_runtime": 0.2482, "eval_samples_per_second": 354.624, "eval_steps_per_second": 44.328, "step": 552 }, { "epoch": 25.136363636363637, "grad_norm": 0.01649138703942299, "learning_rate": 8.605263157894737e-06, "loss": 0.0017, "step": 553 }, { "epoch": 25.136363636363637, "eval_loss": 0.0009390347986482084, "eval_runtime": 0.2599, "eval_samples_per_second": 338.554, "eval_steps_per_second": 42.319, "step": 553 }, { "epoch": 25.181818181818183, "grad_norm": 0.01470938976854086, "learning_rate": 8.56578947368421e-06, "loss": 0.0016, "step": 554 }, { "epoch": 25.181818181818183, "eval_loss": 0.0009378465474583209, "eval_runtime": 0.2574, "eval_samples_per_second": 341.926, "eval_steps_per_second": 42.741, "step": 554 }, { "epoch": 25.227272727272727, "grad_norm": 0.011589915491640568, "learning_rate": 8.526315789473685e-06, "loss": 0.0014, "step": 555 }, { "epoch": 25.227272727272727, "eval_loss": 0.000936675991397351, "eval_runtime": 0.2348, "eval_samples_per_second": 374.714, "eval_steps_per_second": 46.839, "step": 555 }, { "epoch": 25.272727272727273, "grad_norm": 0.012033880688250065, "learning_rate": 8.486842105263157e-06, "loss": 0.0014, "step": 556 }, { "epoch": 25.272727272727273, "eval_loss": 0.0009355823858641088, "eval_runtime": 0.2479, "eval_samples_per_second": 354.912, "eval_steps_per_second": 44.364, "step": 556 }, { "epoch": 25.318181818181817, "grad_norm": 0.012967276386916637, "learning_rate": 8.447368421052632e-06, "loss": 0.0016, "step": 557 }, { "epoch": 25.318181818181817, "eval_loss": 0.0009344658465124667, "eval_runtime": 0.2455, "eval_samples_per_second": 358.387, "eval_steps_per_second": 44.798, "step": 557 }, { "epoch": 25.363636363636363, "grad_norm": 0.01223038136959076, "learning_rate": 8.407894736842106e-06, "loss": 0.0015, "step": 558 }, { "epoch": 25.363636363636363, "eval_loss": 0.0009333452326245606, "eval_runtime": 0.2906, "eval_samples_per_second": 302.832, "eval_steps_per_second": 37.854, "step": 558 }, { "epoch": 25.40909090909091, "grad_norm": 0.015218369662761688, "learning_rate": 8.368421052631578e-06, "loss": 0.0016, "step": 559 }, { "epoch": 25.40909090909091, "eval_loss": 0.0009322408004663885, "eval_runtime": 0.2272, "eval_samples_per_second": 387.247, "eval_steps_per_second": 48.406, "step": 559 }, { "epoch": 25.454545454545453, "grad_norm": 0.015988919883966446, "learning_rate": 8.328947368421052e-06, "loss": 0.0016, "step": 560 }, { "epoch": 25.454545454545453, "eval_loss": 0.0009310647728852928, "eval_runtime": 0.2299, "eval_samples_per_second": 382.796, "eval_steps_per_second": 47.85, "step": 560 }, { "epoch": 25.5, "grad_norm": 0.012890150770545006, "learning_rate": 8.289473684210526e-06, "loss": 0.0015, "step": 561 }, { "epoch": 25.5, "eval_loss": 0.0009298656368628144, "eval_runtime": 0.2335, "eval_samples_per_second": 376.874, "eval_steps_per_second": 47.109, "step": 561 }, { "epoch": 25.545454545454547, "grad_norm": 0.013084178790450096, "learning_rate": 8.25e-06, "loss": 0.0016, "step": 562 }, { "epoch": 25.545454545454547, "eval_loss": 0.0009286908898502588, "eval_runtime": 0.2286, "eval_samples_per_second": 384.978, "eval_steps_per_second": 48.122, "step": 562 }, { "epoch": 25.59090909090909, "grad_norm": 0.01568671688437462, "learning_rate": 8.210526315789475e-06, "loss": 0.0018, "step": 563 }, { "epoch": 25.59090909090909, "eval_loss": 0.0009274999029003084, "eval_runtime": 0.2258, "eval_samples_per_second": 389.702, "eval_steps_per_second": 48.713, "step": 563 }, { "epoch": 25.636363636363637, "grad_norm": 0.012654740363359451, "learning_rate": 8.171052631578947e-06, "loss": 0.0014, "step": 564 }, { "epoch": 25.636363636363637, "eval_loss": 0.0009263442480005324, "eval_runtime": 0.2297, "eval_samples_per_second": 383.078, "eval_steps_per_second": 47.885, "step": 564 }, { "epoch": 25.681818181818183, "grad_norm": 0.014308282174170017, "learning_rate": 8.131578947368421e-06, "loss": 0.0016, "step": 565 }, { "epoch": 25.681818181818183, "eval_loss": 0.0009251585579477251, "eval_runtime": 0.2407, "eval_samples_per_second": 365.643, "eval_steps_per_second": 45.705, "step": 565 }, { "epoch": 25.727272727272727, "grad_norm": 0.013645520433783531, "learning_rate": 8.092105263157896e-06, "loss": 0.0016, "step": 566 }, { "epoch": 25.727272727272727, "eval_loss": 0.000924033869523555, "eval_runtime": 0.2295, "eval_samples_per_second": 383.49, "eval_steps_per_second": 47.936, "step": 566 }, { "epoch": 25.772727272727273, "grad_norm": 0.013325618579983711, "learning_rate": 8.052631578947368e-06, "loss": 0.0016, "step": 567 }, { "epoch": 25.772727272727273, "eval_loss": 0.0009229186689481139, "eval_runtime": 0.2286, "eval_samples_per_second": 384.951, "eval_steps_per_second": 48.119, "step": 567 }, { "epoch": 25.818181818181817, "grad_norm": 0.013046055100858212, "learning_rate": 8.013157894736842e-06, "loss": 0.0015, "step": 568 }, { "epoch": 25.818181818181817, "eval_loss": 0.0009218386840075254, "eval_runtime": 0.2278, "eval_samples_per_second": 386.339, "eval_steps_per_second": 48.292, "step": 568 }, { "epoch": 25.863636363636363, "grad_norm": 0.014013804495334625, "learning_rate": 7.973684210526316e-06, "loss": 0.0015, "step": 569 }, { "epoch": 25.863636363636363, "eval_loss": 0.0009208493283949792, "eval_runtime": 0.239, "eval_samples_per_second": 368.218, "eval_steps_per_second": 46.027, "step": 569 }, { "epoch": 25.90909090909091, "grad_norm": 0.014438400976359844, "learning_rate": 7.934210526315789e-06, "loss": 0.0016, "step": 570 }, { "epoch": 25.90909090909091, "eval_loss": 0.0009198287734761834, "eval_runtime": 0.2403, "eval_samples_per_second": 366.205, "eval_steps_per_second": 45.776, "step": 570 }, { "epoch": 25.954545454545453, "grad_norm": 0.013837904669344425, "learning_rate": 7.894736842105263e-06, "loss": 0.0016, "step": 571 }, { "epoch": 25.954545454545453, "eval_loss": 0.0009188164258375764, "eval_runtime": 0.2295, "eval_samples_per_second": 383.499, "eval_steps_per_second": 47.937, "step": 571 }, { "epoch": 26.0, "grad_norm": 0.014442033134400845, "learning_rate": 7.855263157894737e-06, "loss": 0.0015, "step": 572 }, { "epoch": 26.0, "eval_loss": 0.0009178462787531316, "eval_runtime": 0.2369, "eval_samples_per_second": 371.428, "eval_steps_per_second": 46.428, "step": 572 }, { "epoch": 26.045454545454547, "grad_norm": 0.01597905345261097, "learning_rate": 7.81578947368421e-06, "loss": 0.0016, "step": 573 }, { "epoch": 26.045454545454547, "eval_loss": 0.000916794640943408, "eval_runtime": 0.2272, "eval_samples_per_second": 387.243, "eval_steps_per_second": 48.405, "step": 573 }, { "epoch": 26.09090909090909, "grad_norm": 0.014845073223114014, "learning_rate": 7.776315789473684e-06, "loss": 0.0016, "step": 574 }, { "epoch": 26.09090909090909, "eval_loss": 0.0009157375898212194, "eval_runtime": 0.2356, "eval_samples_per_second": 373.503, "eval_steps_per_second": 46.688, "step": 574 }, { "epoch": 26.136363636363637, "grad_norm": 0.016282513737678528, "learning_rate": 7.73684210526316e-06, "loss": 0.0016, "step": 575 }, { "epoch": 26.136363636363637, "eval_loss": 0.0009147171513177454, "eval_runtime": 0.232, "eval_samples_per_second": 379.38, "eval_steps_per_second": 47.422, "step": 575 }, { "epoch": 26.181818181818183, "grad_norm": 0.01518057007342577, "learning_rate": 7.697368421052632e-06, "loss": 0.0016, "step": 576 }, { "epoch": 26.181818181818183, "eval_loss": 0.0009137062006630003, "eval_runtime": 0.2426, "eval_samples_per_second": 362.715, "eval_steps_per_second": 45.339, "step": 576 }, { "epoch": 26.227272727272727, "grad_norm": 0.014094051904976368, "learning_rate": 7.657894736842106e-06, "loss": 0.0016, "step": 577 }, { "epoch": 26.227272727272727, "eval_loss": 0.0009126991499215364, "eval_runtime": 0.2293, "eval_samples_per_second": 383.817, "eval_steps_per_second": 47.977, "step": 577 }, { "epoch": 26.272727272727273, "grad_norm": 0.013502271845936775, "learning_rate": 7.6184210526315794e-06, "loss": 0.0015, "step": 578 }, { "epoch": 26.272727272727273, "eval_loss": 0.0009116692817769945, "eval_runtime": 0.2603, "eval_samples_per_second": 338.068, "eval_steps_per_second": 42.258, "step": 578 }, { "epoch": 26.318181818181817, "grad_norm": 0.01577981747686863, "learning_rate": 7.578947368421053e-06, "loss": 0.0016, "step": 579 }, { "epoch": 26.318181818181817, "eval_loss": 0.0009106568759307265, "eval_runtime": 0.2284, "eval_samples_per_second": 385.212, "eval_steps_per_second": 48.151, "step": 579 }, { "epoch": 26.363636363636363, "grad_norm": 0.013350007124245167, "learning_rate": 7.539473684210527e-06, "loss": 0.0016, "step": 580 }, { "epoch": 26.363636363636363, "eval_loss": 0.0009096513967961073, "eval_runtime": 0.251, "eval_samples_per_second": 350.661, "eval_steps_per_second": 43.833, "step": 580 }, { "epoch": 26.40909090909091, "grad_norm": 0.013078941963613033, "learning_rate": 7.5e-06, "loss": 0.0014, "step": 581 }, { "epoch": 26.40909090909091, "eval_loss": 0.000908670190256089, "eval_runtime": 0.2388, "eval_samples_per_second": 368.458, "eval_steps_per_second": 46.057, "step": 581 }, { "epoch": 26.454545454545453, "grad_norm": 0.013791137374937534, "learning_rate": 7.4605263157894735e-06, "loss": 0.0015, "step": 582 }, { "epoch": 26.454545454545453, "eval_loss": 0.000907672569155693, "eval_runtime": 0.242, "eval_samples_per_second": 363.581, "eval_steps_per_second": 45.448, "step": 582 }, { "epoch": 26.5, "grad_norm": 0.015615719370543957, "learning_rate": 7.421052631578948e-06, "loss": 0.0017, "step": 583 }, { "epoch": 26.5, "eval_loss": 0.0009066305938176811, "eval_runtime": 0.2567, "eval_samples_per_second": 342.844, "eval_steps_per_second": 42.856, "step": 583 }, { "epoch": 26.545454545454547, "grad_norm": 0.015224572271108627, "learning_rate": 7.381578947368421e-06, "loss": 0.0016, "step": 584 }, { "epoch": 26.545454545454547, "eval_loss": 0.000905528839211911, "eval_runtime": 0.2456, "eval_samples_per_second": 358.266, "eval_steps_per_second": 44.783, "step": 584 }, { "epoch": 26.59090909090909, "grad_norm": 0.015507878735661507, "learning_rate": 7.342105263157895e-06, "loss": 0.0016, "step": 585 }, { "epoch": 26.59090909090909, "eval_loss": 0.0009044149774126709, "eval_runtime": 0.2492, "eval_samples_per_second": 353.074, "eval_steps_per_second": 44.134, "step": 585 }, { "epoch": 26.636363636363637, "grad_norm": 0.012780736200511456, "learning_rate": 7.302631578947368e-06, "loss": 0.0015, "step": 586 }, { "epoch": 26.636363636363637, "eval_loss": 0.0009033335372805595, "eval_runtime": 0.2468, "eval_samples_per_second": 356.58, "eval_steps_per_second": 44.572, "step": 586 }, { "epoch": 26.681818181818183, "grad_norm": 0.014048571698367596, "learning_rate": 7.2631578947368426e-06, "loss": 0.0015, "step": 587 }, { "epoch": 26.681818181818183, "eval_loss": 0.0009022265439853072, "eval_runtime": 0.2552, "eval_samples_per_second": 344.851, "eval_steps_per_second": 43.106, "step": 587 }, { "epoch": 26.727272727272727, "grad_norm": 0.015583625994622707, "learning_rate": 7.223684210526316e-06, "loss": 0.0017, "step": 588 }, { "epoch": 26.727272727272727, "eval_loss": 0.0009011449874378741, "eval_runtime": 0.2308, "eval_samples_per_second": 381.278, "eval_steps_per_second": 47.66, "step": 588 }, { "epoch": 26.772727272727273, "grad_norm": 0.01401633583009243, "learning_rate": 7.184210526315789e-06, "loss": 0.0015, "step": 589 }, { "epoch": 26.772727272727273, "eval_loss": 0.0009001016733236611, "eval_runtime": 0.2374, "eval_samples_per_second": 370.679, "eval_steps_per_second": 46.335, "step": 589 }, { "epoch": 26.818181818181817, "grad_norm": 0.01262589916586876, "learning_rate": 7.144736842105263e-06, "loss": 0.0015, "step": 590 }, { "epoch": 26.818181818181817, "eval_loss": 0.0008990716305561364, "eval_runtime": 0.2399, "eval_samples_per_second": 366.822, "eval_steps_per_second": 45.853, "step": 590 }, { "epoch": 26.863636363636363, "grad_norm": 0.015306267887353897, "learning_rate": 7.105263157894737e-06, "loss": 0.0016, "step": 591 }, { "epoch": 26.863636363636363, "eval_loss": 0.0008980457205325365, "eval_runtime": 0.2286, "eval_samples_per_second": 385.033, "eval_steps_per_second": 48.129, "step": 591 }, { "epoch": 26.90909090909091, "grad_norm": 0.014178605750203133, "learning_rate": 7.065789473684211e-06, "loss": 0.0016, "step": 592 }, { "epoch": 26.90909090909091, "eval_loss": 0.0008970522903837264, "eval_runtime": 0.2364, "eval_samples_per_second": 372.229, "eval_steps_per_second": 46.529, "step": 592 }, { "epoch": 26.954545454545453, "grad_norm": 0.013244709931313992, "learning_rate": 7.026315789473685e-06, "loss": 0.0016, "step": 593 }, { "epoch": 26.954545454545453, "eval_loss": 0.0008960642153397202, "eval_runtime": 0.2462, "eval_samples_per_second": 357.44, "eval_steps_per_second": 44.68, "step": 593 }, { "epoch": 27.0, "grad_norm": 0.012383348308503628, "learning_rate": 6.986842105263158e-06, "loss": 0.0014, "step": 594 }, { "epoch": 27.0, "eval_loss": 0.0008951277122832835, "eval_runtime": 0.2326, "eval_samples_per_second": 378.306, "eval_steps_per_second": 47.288, "step": 594 }, { "epoch": 27.045454545454547, "grad_norm": 0.011418252252042294, "learning_rate": 6.9473684210526315e-06, "loss": 0.0014, "step": 595 }, { "epoch": 27.045454545454547, "eval_loss": 0.0008942168205976486, "eval_runtime": 0.2431, "eval_samples_per_second": 362.037, "eval_steps_per_second": 45.255, "step": 595 }, { "epoch": 27.09090909090909, "grad_norm": 0.013398653827607632, "learning_rate": 6.907894736842106e-06, "loss": 0.0014, "step": 596 }, { "epoch": 27.09090909090909, "eval_loss": 0.0008933371282182634, "eval_runtime": 0.2375, "eval_samples_per_second": 370.507, "eval_steps_per_second": 46.313, "step": 596 }, { "epoch": 27.136363636363637, "grad_norm": 0.013324232771992683, "learning_rate": 6.868421052631579e-06, "loss": 0.0014, "step": 597 }, { "epoch": 27.136363636363637, "eval_loss": 0.0008924913126975298, "eval_runtime": 0.2409, "eval_samples_per_second": 365.308, "eval_steps_per_second": 45.663, "step": 597 }, { "epoch": 27.181818181818183, "grad_norm": 0.014774598181247711, "learning_rate": 6.828947368421053e-06, "loss": 0.0016, "step": 598 }, { "epoch": 27.181818181818183, "eval_loss": 0.0008916006772778928, "eval_runtime": 0.2374, "eval_samples_per_second": 370.613, "eval_steps_per_second": 46.327, "step": 598 }, { "epoch": 27.227272727272727, "grad_norm": 0.015260329470038414, "learning_rate": 6.7894736842105264e-06, "loss": 0.0016, "step": 599 }, { "epoch": 27.227272727272727, "eval_loss": 0.0008907453739084303, "eval_runtime": 0.2427, "eval_samples_per_second": 362.645, "eval_steps_per_second": 45.331, "step": 599 }, { "epoch": 27.272727272727273, "grad_norm": 0.01440617348998785, "learning_rate": 6.750000000000001e-06, "loss": 0.0016, "step": 600 }, { "epoch": 27.272727272727273, "eval_loss": 0.0008899224339984357, "eval_runtime": 0.2506, "eval_samples_per_second": 351.14, "eval_steps_per_second": 43.892, "step": 600 }, { "epoch": 27.318181818181817, "grad_norm": 0.0139328483492136, "learning_rate": 6.710526315789474e-06, "loss": 0.0015, "step": 601 }, { "epoch": 27.318181818181817, "eval_loss": 0.0008891185279935598, "eval_runtime": 0.223, "eval_samples_per_second": 394.603, "eval_steps_per_second": 49.325, "step": 601 }, { "epoch": 27.363636363636363, "grad_norm": 0.014009720645844936, "learning_rate": 6.671052631578947e-06, "loss": 0.0015, "step": 602 }, { "epoch": 27.363636363636363, "eval_loss": 0.0008883295231498778, "eval_runtime": 0.2262, "eval_samples_per_second": 389.114, "eval_steps_per_second": 48.639, "step": 602 }, { "epoch": 27.40909090909091, "grad_norm": 0.014640220440924168, "learning_rate": 6.631578947368421e-06, "loss": 0.0016, "step": 603 }, { "epoch": 27.40909090909091, "eval_loss": 0.0008875647909007967, "eval_runtime": 0.2259, "eval_samples_per_second": 389.586, "eval_steps_per_second": 48.698, "step": 603 }, { "epoch": 27.454545454545453, "grad_norm": 0.012875789776444435, "learning_rate": 6.592105263157895e-06, "loss": 0.0014, "step": 604 }, { "epoch": 27.454545454545453, "eval_loss": 0.0008868000004440546, "eval_runtime": 0.2267, "eval_samples_per_second": 388.239, "eval_steps_per_second": 48.53, "step": 604 }, { "epoch": 27.5, "grad_norm": 0.012748241424560547, "learning_rate": 6.552631578947369e-06, "loss": 0.0014, "step": 605 }, { "epoch": 27.5, "eval_loss": 0.0008860474918037653, "eval_runtime": 0.2273, "eval_samples_per_second": 387.108, "eval_steps_per_second": 48.388, "step": 605 }, { "epoch": 27.545454545454547, "grad_norm": 0.015082623809576035, "learning_rate": 6.513157894736842e-06, "loss": 0.0016, "step": 606 }, { "epoch": 27.545454545454547, "eval_loss": 0.0008852502796798944, "eval_runtime": 0.2413, "eval_samples_per_second": 364.656, "eval_steps_per_second": 45.582, "step": 606 }, { "epoch": 27.59090909090909, "grad_norm": 0.012016087770462036, "learning_rate": 6.473684210526316e-06, "loss": 0.0014, "step": 607 }, { "epoch": 27.59090909090909, "eval_loss": 0.0008844301337376237, "eval_runtime": 0.2344, "eval_samples_per_second": 375.37, "eval_steps_per_second": 46.921, "step": 607 }, { "epoch": 27.636363636363637, "grad_norm": 0.013424508273601532, "learning_rate": 6.4342105263157896e-06, "loss": 0.0014, "step": 608 }, { "epoch": 27.636363636363637, "eval_loss": 0.0008835734915919602, "eval_runtime": 0.2456, "eval_samples_per_second": 358.327, "eval_steps_per_second": 44.791, "step": 608 }, { "epoch": 27.681818181818183, "grad_norm": 0.014258569106459618, "learning_rate": 6.394736842105263e-06, "loss": 0.0016, "step": 609 }, { "epoch": 27.681818181818183, "eval_loss": 0.0008827546262182295, "eval_runtime": 0.2293, "eval_samples_per_second": 383.729, "eval_steps_per_second": 47.966, "step": 609 }, { "epoch": 27.727272727272727, "grad_norm": 0.012304065749049187, "learning_rate": 6.355263157894737e-06, "loss": 0.0014, "step": 610 }, { "epoch": 27.727272727272727, "eval_loss": 0.0008819656213745475, "eval_runtime": 0.2293, "eval_samples_per_second": 383.825, "eval_steps_per_second": 47.978, "step": 610 }, { "epoch": 27.772727272727273, "grad_norm": 0.01459804829210043, "learning_rate": 6.31578947368421e-06, "loss": 0.0016, "step": 611 }, { "epoch": 27.772727272727273, "eval_loss": 0.000881133193615824, "eval_runtime": 0.2354, "eval_samples_per_second": 373.888, "eval_steps_per_second": 46.736, "step": 611 }, { "epoch": 27.818181818181817, "grad_norm": 0.013015978038311005, "learning_rate": 6.2763157894736845e-06, "loss": 0.0014, "step": 612 }, { "epoch": 27.818181818181817, "eval_loss": 0.0008803331875242293, "eval_runtime": 0.267, "eval_samples_per_second": 329.599, "eval_steps_per_second": 41.2, "step": 612 }, { "epoch": 27.863636363636363, "grad_norm": 0.013901845552027225, "learning_rate": 6.236842105263159e-06, "loss": 0.0016, "step": 613 }, { "epoch": 27.863636363636363, "eval_loss": 0.0008795224712230265, "eval_runtime": 0.2596, "eval_samples_per_second": 339.008, "eval_steps_per_second": 42.376, "step": 613 }, { "epoch": 27.90909090909091, "grad_norm": 0.012065750546753407, "learning_rate": 6.197368421052632e-06, "loss": 0.0014, "step": 614 }, { "epoch": 27.90909090909091, "eval_loss": 0.0008787267142906785, "eval_runtime": 0.2638, "eval_samples_per_second": 333.543, "eval_steps_per_second": 41.693, "step": 614 }, { "epoch": 27.954545454545453, "grad_norm": 0.013637811876833439, "learning_rate": 6.157894736842105e-06, "loss": 0.0016, "step": 615 }, { "epoch": 27.954545454545453, "eval_loss": 0.0008779308409430087, "eval_runtime": 0.2586, "eval_samples_per_second": 340.291, "eval_steps_per_second": 42.536, "step": 615 }, { "epoch": 28.0, "grad_norm": 0.012989726848900318, "learning_rate": 6.118421052631579e-06, "loss": 0.0015, "step": 616 }, { "epoch": 28.0, "eval_loss": 0.0008771241991780698, "eval_runtime": 0.2429, "eval_samples_per_second": 362.255, "eval_steps_per_second": 45.282, "step": 616 }, { "epoch": 28.045454545454547, "grad_norm": 0.011249346658587456, "learning_rate": 6.078947368421053e-06, "loss": 0.0013, "step": 617 }, { "epoch": 28.045454545454547, "eval_loss": 0.0008763446821831167, "eval_runtime": 0.2419, "eval_samples_per_second": 363.721, "eval_steps_per_second": 45.465, "step": 617 }, { "epoch": 28.09090909090909, "grad_norm": 0.013492336496710777, "learning_rate": 6.039473684210526e-06, "loss": 0.0016, "step": 618 }, { "epoch": 28.09090909090909, "eval_loss": 0.000875540659762919, "eval_runtime": 0.2616, "eval_samples_per_second": 336.357, "eval_steps_per_second": 42.045, "step": 618 }, { "epoch": 28.136363636363637, "grad_norm": 0.013201452791690826, "learning_rate": 6e-06, "loss": 0.0014, "step": 619 }, { "epoch": 28.136363636363637, "eval_loss": 0.0008747638785280287, "eval_runtime": 0.2332, "eval_samples_per_second": 377.308, "eval_steps_per_second": 47.163, "step": 619 }, { "epoch": 28.181818181818183, "grad_norm": 0.012346605770289898, "learning_rate": 5.960526315789474e-06, "loss": 0.0015, "step": 620 }, { "epoch": 28.181818181818183, "eval_loss": 0.0008740072953514755, "eval_runtime": 0.2297, "eval_samples_per_second": 383.134, "eval_steps_per_second": 47.892, "step": 620 }, { "epoch": 28.227272727272727, "grad_norm": 0.013474266044795513, "learning_rate": 5.921052631578948e-06, "loss": 0.0015, "step": 621 }, { "epoch": 28.227272727272727, "eval_loss": 0.0008732505375519395, "eval_runtime": 0.2261, "eval_samples_per_second": 389.249, "eval_steps_per_second": 48.656, "step": 621 }, { "epoch": 28.272727272727273, "grad_norm": 0.011779211461544037, "learning_rate": 5.881578947368421e-06, "loss": 0.0013, "step": 622 }, { "epoch": 28.272727272727273, "eval_loss": 0.0008725319639779627, "eval_runtime": 0.2358, "eval_samples_per_second": 373.257, "eval_steps_per_second": 46.657, "step": 622 }, { "epoch": 28.318181818181817, "grad_norm": 0.01458238996565342, "learning_rate": 5.842105263157895e-06, "loss": 0.0015, "step": 623 }, { "epoch": 28.318181818181817, "eval_loss": 0.0008718472090549767, "eval_runtime": 0.2469, "eval_samples_per_second": 356.442, "eval_steps_per_second": 44.555, "step": 623 }, { "epoch": 28.363636363636363, "grad_norm": 0.013492444530129433, "learning_rate": 5.802631578947368e-06, "loss": 0.0015, "step": 624 }, { "epoch": 28.363636363636363, "eval_loss": 0.0008711445843800902, "eval_runtime": 0.2339, "eval_samples_per_second": 376.299, "eval_steps_per_second": 47.037, "step": 624 }, { "epoch": 28.40909090909091, "grad_norm": 0.016801927238702774, "learning_rate": 5.763157894736842e-06, "loss": 0.0016, "step": 625 }, { "epoch": 28.40909090909091, "eval_loss": 0.0008704178035259247, "eval_runtime": 0.2467, "eval_samples_per_second": 356.761, "eval_steps_per_second": 44.595, "step": 625 }, { "epoch": 28.454545454545453, "grad_norm": 0.01472269557416439, "learning_rate": 5.723684210526316e-06, "loss": 0.0015, "step": 626 }, { "epoch": 28.454545454545453, "eval_loss": 0.0008697099983692169, "eval_runtime": 0.2361, "eval_samples_per_second": 372.695, "eval_steps_per_second": 46.587, "step": 626 }, { "epoch": 28.5, "grad_norm": 0.012456816621124744, "learning_rate": 5.68421052631579e-06, "loss": 0.0014, "step": 627 }, { "epoch": 28.5, "eval_loss": 0.0008690251270309091, "eval_runtime": 0.227, "eval_samples_per_second": 387.675, "eval_steps_per_second": 48.459, "step": 627 }, { "epoch": 28.545454545454547, "grad_norm": 0.010930378921329975, "learning_rate": 5.644736842105263e-06, "loss": 0.0013, "step": 628 }, { "epoch": 28.545454545454547, "eval_loss": 0.0008683226769790053, "eval_runtime": 0.2396, "eval_samples_per_second": 367.217, "eval_steps_per_second": 45.902, "step": 628 }, { "epoch": 28.59090909090909, "grad_norm": 0.013773776590824127, "learning_rate": 5.605263157894737e-06, "loss": 0.0016, "step": 629 }, { "epoch": 28.59090909090909, "eval_loss": 0.0008676418801769614, "eval_runtime": 0.2255, "eval_samples_per_second": 390.204, "eval_steps_per_second": 48.776, "step": 629 }, { "epoch": 28.636363636363637, "grad_norm": 0.01485821045935154, "learning_rate": 5.565789473684211e-06, "loss": 0.0015, "step": 630 }, { "epoch": 28.636363636363637, "eval_loss": 0.0008669787785038352, "eval_runtime": 0.238, "eval_samples_per_second": 369.806, "eval_steps_per_second": 46.226, "step": 630 }, { "epoch": 28.681818181818183, "grad_norm": 0.012882347218692303, "learning_rate": 5.526315789473684e-06, "loss": 0.0015, "step": 631 }, { "epoch": 28.681818181818183, "eval_loss": 0.0008663006592541933, "eval_runtime": 0.2392, "eval_samples_per_second": 367.945, "eval_steps_per_second": 45.993, "step": 631 }, { "epoch": 28.727272727272727, "grad_norm": 0.013756033033132553, "learning_rate": 5.486842105263158e-06, "loss": 0.0015, "step": 632 }, { "epoch": 28.727272727272727, "eval_loss": 0.0008656617719680071, "eval_runtime": 0.2392, "eval_samples_per_second": 367.897, "eval_steps_per_second": 45.987, "step": 632 }, { "epoch": 28.772727272727273, "grad_norm": 0.011964356526732445, "learning_rate": 5.447368421052632e-06, "loss": 0.0014, "step": 633 }, { "epoch": 28.772727272727273, "eval_loss": 0.0008649809169583023, "eval_runtime": 0.2416, "eval_samples_per_second": 364.235, "eval_steps_per_second": 45.529, "step": 633 }, { "epoch": 28.818181818181817, "grad_norm": 0.014426548965275288, "learning_rate": 5.407894736842106e-06, "loss": 0.0015, "step": 634 }, { "epoch": 28.818181818181817, "eval_loss": 0.0008642975008115172, "eval_runtime": 0.2426, "eval_samples_per_second": 362.673, "eval_steps_per_second": 45.334, "step": 634 }, { "epoch": 28.863636363636363, "grad_norm": 0.013472221791744232, "learning_rate": 5.368421052631579e-06, "loss": 0.0014, "step": 635 }, { "epoch": 28.863636363636363, "eval_loss": 0.0008636031416244805, "eval_runtime": 0.2517, "eval_samples_per_second": 349.684, "eval_steps_per_second": 43.711, "step": 635 }, { "epoch": 28.90909090909091, "grad_norm": 0.012157904915511608, "learning_rate": 5.328947368421053e-06, "loss": 0.0014, "step": 636 }, { "epoch": 28.90909090909091, "eval_loss": 0.000862881715875119, "eval_runtime": 0.2369, "eval_samples_per_second": 371.509, "eval_steps_per_second": 46.439, "step": 636 }, { "epoch": 28.954545454545453, "grad_norm": 0.012409983202815056, "learning_rate": 5.289473684210526e-06, "loss": 0.0014, "step": 637 }, { "epoch": 28.954545454545453, "eval_loss": 0.0008621684974059463, "eval_runtime": 0.2465, "eval_samples_per_second": 357.054, "eval_steps_per_second": 44.632, "step": 637 }, { "epoch": 29.0, "grad_norm": 0.013315846212208271, "learning_rate": 5.25e-06, "loss": 0.0015, "step": 638 }, { "epoch": 29.0, "eval_loss": 0.0008614835678599775, "eval_runtime": 0.2407, "eval_samples_per_second": 365.586, "eval_steps_per_second": 45.698, "step": 638 }, { "epoch": 29.045454545454547, "grad_norm": 0.015236815437674522, "learning_rate": 5.210526315789474e-06, "loss": 0.0016, "step": 639 }, { "epoch": 29.045454545454547, "eval_loss": 0.0008607918862253428, "eval_runtime": 0.2362, "eval_samples_per_second": 372.636, "eval_steps_per_second": 46.579, "step": 639 }, { "epoch": 29.09090909090909, "grad_norm": 0.01497814990580082, "learning_rate": 5.171052631578948e-06, "loss": 0.0015, "step": 640 }, { "epoch": 29.09090909090909, "eval_loss": 0.0008601464214734733, "eval_runtime": 0.2513, "eval_samples_per_second": 350.225, "eval_steps_per_second": 43.778, "step": 640 }, { "epoch": 29.136363636363637, "grad_norm": 0.010525020770728588, "learning_rate": 5.131578947368421e-06, "loss": 0.0013, "step": 641 }, { "epoch": 29.136363636363637, "eval_loss": 0.0008594872197136283, "eval_runtime": 0.2472, "eval_samples_per_second": 355.947, "eval_steps_per_second": 44.493, "step": 641 }, { "epoch": 29.181818181818183, "grad_norm": 0.012257490307092667, "learning_rate": 5.092105263157895e-06, "loss": 0.0014, "step": 642 }, { "epoch": 29.181818181818183, "eval_loss": 0.0008588552009314299, "eval_runtime": 0.2514, "eval_samples_per_second": 350.01, "eval_steps_per_second": 43.751, "step": 642 }, { "epoch": 29.227272727272727, "grad_norm": 0.016379721462726593, "learning_rate": 5.052631578947369e-06, "loss": 0.0016, "step": 643 }, { "epoch": 29.227272727272727, "eval_loss": 0.0008582230657339096, "eval_runtime": 0.2421, "eval_samples_per_second": 363.525, "eval_steps_per_second": 45.441, "step": 643 }, { "epoch": 29.272727272727273, "grad_norm": 0.013389473780989647, "learning_rate": 5.013157894736842e-06, "loss": 0.0014, "step": 644 }, { "epoch": 29.272727272727273, "eval_loss": 0.0008576181135140359, "eval_runtime": 0.2837, "eval_samples_per_second": 310.222, "eval_steps_per_second": 38.778, "step": 644 }, { "epoch": 29.318181818181817, "grad_norm": 0.011728441342711449, "learning_rate": 4.973684210526315e-06, "loss": 0.0014, "step": 645 }, { "epoch": 29.318181818181817, "eval_loss": 0.0008570144418627024, "eval_runtime": 0.3144, "eval_samples_per_second": 279.869, "eval_steps_per_second": 34.984, "step": 645 }, { "epoch": 29.363636363636363, "grad_norm": 0.014150052331387997, "learning_rate": 4.9342105263157895e-06, "loss": 0.0015, "step": 646 }, { "epoch": 29.363636363636363, "eval_loss": 0.0008564500021748245, "eval_runtime": 0.2427, "eval_samples_per_second": 362.611, "eval_steps_per_second": 45.326, "step": 646 }, { "epoch": 29.40909090909091, "grad_norm": 0.012562847696244717, "learning_rate": 4.894736842105264e-06, "loss": 0.0015, "step": 647 }, { "epoch": 29.40909090909091, "eval_loss": 0.0008558626868762076, "eval_runtime": 0.2385, "eval_samples_per_second": 368.954, "eval_steps_per_second": 46.119, "step": 647 }, { "epoch": 29.454545454545453, "grad_norm": 0.01115860603749752, "learning_rate": 4.855263157894737e-06, "loss": 0.0012, "step": 648 }, { "epoch": 29.454545454545453, "eval_loss": 0.000855276535730809, "eval_runtime": 0.2434, "eval_samples_per_second": 361.501, "eval_steps_per_second": 45.188, "step": 648 }, { "epoch": 29.5, "grad_norm": 0.014787169173359871, "learning_rate": 4.81578947368421e-06, "loss": 0.0015, "step": 649 }, { "epoch": 29.5, "eval_loss": 0.0008546687895432115, "eval_runtime": 0.2404, "eval_samples_per_second": 366.019, "eval_steps_per_second": 45.752, "step": 649 }, { "epoch": 29.545454545454547, "grad_norm": 0.014013570733368397, "learning_rate": 4.7763157894736844e-06, "loss": 0.0014, "step": 650 }, { "epoch": 29.545454545454547, "eval_loss": 0.0008540409035049379, "eval_runtime": 0.2415, "eval_samples_per_second": 364.376, "eval_steps_per_second": 45.547, "step": 650 }, { "epoch": 29.59090909090909, "grad_norm": 0.013314800336956978, "learning_rate": 4.736842105263158e-06, "loss": 0.0015, "step": 651 }, { "epoch": 29.59090909090909, "eval_loss": 0.0008533978252671659, "eval_runtime": 0.2334, "eval_samples_per_second": 377.055, "eval_steps_per_second": 47.132, "step": 651 }, { "epoch": 29.636363636363637, "grad_norm": 0.011727740988135338, "learning_rate": 4.697368421052631e-06, "loss": 0.0014, "step": 652 }, { "epoch": 29.636363636363637, "eval_loss": 0.0008527915342710912, "eval_runtime": 0.2324, "eval_samples_per_second": 378.693, "eval_steps_per_second": 47.337, "step": 652 }, { "epoch": 29.681818181818183, "grad_norm": 0.014551502652466297, "learning_rate": 4.657894736842106e-06, "loss": 0.0016, "step": 653 }, { "epoch": 29.681818181818183, "eval_loss": 0.0008521459531039, "eval_runtime": 0.2274, "eval_samples_per_second": 387.021, "eval_steps_per_second": 48.378, "step": 653 }, { "epoch": 29.727272727272727, "grad_norm": 0.01226063258945942, "learning_rate": 4.618421052631579e-06, "loss": 0.0013, "step": 654 }, { "epoch": 29.727272727272727, "eval_loss": 0.0008515057852491736, "eval_runtime": 0.2277, "eval_samples_per_second": 386.523, "eval_steps_per_second": 48.315, "step": 654 }, { "epoch": 29.772727272727273, "grad_norm": 0.013769338838756084, "learning_rate": 4.578947368421053e-06, "loss": 0.0015, "step": 655 }, { "epoch": 29.772727272727273, "eval_loss": 0.0008508588653057814, "eval_runtime": 0.2246, "eval_samples_per_second": 391.814, "eval_steps_per_second": 48.977, "step": 655 }, { "epoch": 29.818181818181817, "grad_norm": 0.012221275828778744, "learning_rate": 4.539473684210527e-06, "loss": 0.0015, "step": 656 }, { "epoch": 29.818181818181817, "eval_loss": 0.0008502537966705859, "eval_runtime": 0.2288, "eval_samples_per_second": 384.596, "eval_steps_per_second": 48.075, "step": 656 }, { "epoch": 29.863636363636363, "grad_norm": 0.011863375082612038, "learning_rate": 4.5e-06, "loss": 0.0013, "step": 657 }, { "epoch": 29.863636363636363, "eval_loss": 0.0008496582740917802, "eval_runtime": 0.2473, "eval_samples_per_second": 355.889, "eval_steps_per_second": 44.486, "step": 657 }, { "epoch": 29.90909090909091, "grad_norm": 0.01440768875181675, "learning_rate": 4.460526315789473e-06, "loss": 0.0015, "step": 658 }, { "epoch": 29.90909090909091, "eval_loss": 0.0008490938926115632, "eval_runtime": 0.2279, "eval_samples_per_second": 386.137, "eval_steps_per_second": 48.267, "step": 658 }, { "epoch": 29.954545454545453, "grad_norm": 0.013953134417533875, "learning_rate": 4.421052631578947e-06, "loss": 0.0014, "step": 659 }, { "epoch": 29.954545454545453, "eval_loss": 0.0008485484286211431, "eval_runtime": 0.2309, "eval_samples_per_second": 381.17, "eval_steps_per_second": 47.646, "step": 659 }, { "epoch": 30.0, "grad_norm": 0.012044006027281284, "learning_rate": 4.381578947368422e-06, "loss": 0.0014, "step": 660 }, { "epoch": 30.0, "eval_loss": 0.0008479988318867981, "eval_runtime": 0.2301, "eval_samples_per_second": 382.482, "eval_steps_per_second": 47.81, "step": 660 }, { "epoch": 30.045454545454547, "grad_norm": 0.014352229423820972, "learning_rate": 4.342105263157895e-06, "loss": 0.0015, "step": 661 }, { "epoch": 30.045454545454547, "eval_loss": 0.0008474763599224389, "eval_runtime": 0.228, "eval_samples_per_second": 385.949, "eval_steps_per_second": 48.244, "step": 661 }, { "epoch": 30.09090909090909, "grad_norm": 0.012857983820140362, "learning_rate": 4.302631578947368e-06, "loss": 0.0015, "step": 662 }, { "epoch": 30.09090909090909, "eval_loss": 0.0008469296153634787, "eval_runtime": 0.2254, "eval_samples_per_second": 390.463, "eval_steps_per_second": 48.808, "step": 662 }, { "epoch": 30.136363636363637, "grad_norm": 0.013745253905653954, "learning_rate": 4.2631578947368425e-06, "loss": 0.0014, "step": 663 }, { "epoch": 30.136363636363637, "eval_loss": 0.0008464112761430442, "eval_runtime": 0.2615, "eval_samples_per_second": 336.504, "eval_steps_per_second": 42.063, "step": 663 }, { "epoch": 30.181818181818183, "grad_norm": 0.011542108841240406, "learning_rate": 4.223684210526316e-06, "loss": 0.0014, "step": 664 }, { "epoch": 30.181818181818183, "eval_loss": 0.0008458928787149489, "eval_runtime": 0.2363, "eval_samples_per_second": 372.361, "eval_steps_per_second": 46.545, "step": 664 }, { "epoch": 30.227272727272727, "grad_norm": 0.013680350966751575, "learning_rate": 4.184210526315789e-06, "loss": 0.0015, "step": 665 }, { "epoch": 30.227272727272727, "eval_loss": 0.0008453825721517205, "eval_runtime": 0.2422, "eval_samples_per_second": 363.317, "eval_steps_per_second": 45.415, "step": 665 }, { "epoch": 30.272727272727273, "grad_norm": 0.01278683077543974, "learning_rate": 4.144736842105263e-06, "loss": 0.0013, "step": 666 }, { "epoch": 30.272727272727273, "eval_loss": 0.0008448913577012718, "eval_runtime": 0.2274, "eval_samples_per_second": 386.997, "eval_steps_per_second": 48.375, "step": 666 }, { "epoch": 30.318181818181817, "grad_norm": 0.013793477788567543, "learning_rate": 4.105263157894737e-06, "loss": 0.0016, "step": 667 }, { "epoch": 30.318181818181817, "eval_loss": 0.0008444040431641042, "eval_runtime": 0.2383, "eval_samples_per_second": 369.303, "eval_steps_per_second": 46.163, "step": 667 }, { "epoch": 30.363636363636363, "grad_norm": 0.013766897842288017, "learning_rate": 4.065789473684211e-06, "loss": 0.0014, "step": 668 }, { "epoch": 30.363636363636363, "eval_loss": 0.0008439045632258058, "eval_runtime": 0.2472, "eval_samples_per_second": 355.963, "eval_steps_per_second": 44.495, "step": 668 }, { "epoch": 30.40909090909091, "grad_norm": 0.01388518325984478, "learning_rate": 4.026315789473684e-06, "loss": 0.0014, "step": 669 }, { "epoch": 30.40909090909091, "eval_loss": 0.0008434146293438971, "eval_runtime": 0.2555, "eval_samples_per_second": 344.476, "eval_steps_per_second": 43.059, "step": 669 }, { "epoch": 30.454545454545453, "grad_norm": 0.013302307575941086, "learning_rate": 3.986842105263158e-06, "loss": 0.0014, "step": 670 }, { "epoch": 30.454545454545453, "eval_loss": 0.0008429314475506544, "eval_runtime": 0.234, "eval_samples_per_second": 375.99, "eval_steps_per_second": 46.999, "step": 670 }, { "epoch": 30.5, "grad_norm": 0.015602638944983482, "learning_rate": 3.9473684210526315e-06, "loss": 0.0015, "step": 671 }, { "epoch": 30.5, "eval_loss": 0.0008424482657574117, "eval_runtime": 0.2312, "eval_samples_per_second": 380.69, "eval_steps_per_second": 47.586, "step": 671 }, { "epoch": 30.545454545454547, "grad_norm": 0.012195833958685398, "learning_rate": 3.907894736842105e-06, "loss": 0.0014, "step": 672 }, { "epoch": 30.545454545454547, "eval_loss": 0.0008419921505264938, "eval_runtime": 0.2348, "eval_samples_per_second": 374.848, "eval_steps_per_second": 46.856, "step": 672 }, { "epoch": 30.59090909090909, "grad_norm": 0.012124909088015556, "learning_rate": 3.86842105263158e-06, "loss": 0.0014, "step": 673 }, { "epoch": 30.59090909090909, "eval_loss": 0.0008415495394729078, "eval_runtime": 0.2337, "eval_samples_per_second": 376.514, "eval_steps_per_second": 47.064, "step": 673 }, { "epoch": 30.636363636363637, "grad_norm": 0.012487749569118023, "learning_rate": 3.828947368421053e-06, "loss": 0.0014, "step": 674 }, { "epoch": 30.636363636363637, "eval_loss": 0.0008411163580603898, "eval_runtime": 0.2252, "eval_samples_per_second": 390.686, "eval_steps_per_second": 48.836, "step": 674 }, { "epoch": 30.681818181818183, "grad_norm": 0.013694563880562782, "learning_rate": 3.7894736842105264e-06, "loss": 0.0015, "step": 675 }, { "epoch": 30.681818181818183, "eval_loss": 0.0008406452834606171, "eval_runtime": 0.2277, "eval_samples_per_second": 386.401, "eval_steps_per_second": 48.3, "step": 675 }, { "epoch": 30.727272727272727, "grad_norm": 0.012177863158285618, "learning_rate": 3.75e-06, "loss": 0.0015, "step": 676 }, { "epoch": 30.727272727272727, "eval_loss": 0.0008401837549172342, "eval_runtime": 0.2284, "eval_samples_per_second": 385.297, "eval_steps_per_second": 48.162, "step": 676 }, { "epoch": 30.772727272727273, "grad_norm": 0.011734875850379467, "learning_rate": 3.710526315789474e-06, "loss": 0.0013, "step": 677 }, { "epoch": 30.772727272727273, "eval_loss": 0.0008397437632083893, "eval_runtime": 0.2349, "eval_samples_per_second": 374.661, "eval_steps_per_second": 46.833, "step": 677 }, { "epoch": 30.818181818181817, "grad_norm": 0.012181814759969711, "learning_rate": 3.6710526315789476e-06, "loss": 0.0015, "step": 678 }, { "epoch": 30.818181818181817, "eval_loss": 0.000839310756418854, "eval_runtime": 0.2267, "eval_samples_per_second": 388.249, "eval_steps_per_second": 48.531, "step": 678 }, { "epoch": 30.863636363636363, "grad_norm": 0.014351209625601768, "learning_rate": 3.6315789473684213e-06, "loss": 0.0015, "step": 679 }, { "epoch": 30.863636363636363, "eval_loss": 0.0008388804271817207, "eval_runtime": 0.2382, "eval_samples_per_second": 369.474, "eval_steps_per_second": 46.184, "step": 679 }, { "epoch": 30.90909090909091, "grad_norm": 0.01179533638060093, "learning_rate": 3.5921052631578946e-06, "loss": 0.0014, "step": 680 }, { "epoch": 30.90909090909091, "eval_loss": 0.0008384499233216047, "eval_runtime": 0.227, "eval_samples_per_second": 387.694, "eval_steps_per_second": 48.462, "step": 680 }, { "epoch": 30.954545454545453, "grad_norm": 0.01200299896299839, "learning_rate": 3.5526315789473683e-06, "loss": 0.0014, "step": 681 }, { "epoch": 30.954545454545453, "eval_loss": 0.0008380439248867333, "eval_runtime": 0.2357, "eval_samples_per_second": 373.384, "eval_steps_per_second": 46.673, "step": 681 }, { "epoch": 31.0, "grad_norm": 0.012165653519332409, "learning_rate": 3.5131578947368425e-06, "loss": 0.0014, "step": 682 }, { "epoch": 31.0, "eval_loss": 0.0008376243058592081, "eval_runtime": 0.2268, "eval_samples_per_second": 387.994, "eval_steps_per_second": 48.499, "step": 682 }, { "epoch": 31.045454545454547, "grad_norm": 0.013023504056036472, "learning_rate": 3.4736842105263158e-06, "loss": 0.0014, "step": 683 }, { "epoch": 31.045454545454547, "eval_loss": 0.0008372208685614169, "eval_runtime": 0.2408, "eval_samples_per_second": 365.51, "eval_steps_per_second": 45.689, "step": 683 }, { "epoch": 31.09090909090909, "grad_norm": 0.012478847056627274, "learning_rate": 3.4342105263157895e-06, "loss": 0.0015, "step": 684 }, { "epoch": 31.09090909090909, "eval_loss": 0.0008367864647880197, "eval_runtime": 0.2261, "eval_samples_per_second": 389.221, "eval_steps_per_second": 48.653, "step": 684 }, { "epoch": 31.136363636363637, "grad_norm": 0.011943116784095764, "learning_rate": 3.3947368421052632e-06, "loss": 0.0014, "step": 685 }, { "epoch": 31.136363636363637, "eval_loss": 0.0008363695815205574, "eval_runtime": 0.2405, "eval_samples_per_second": 365.829, "eval_steps_per_second": 45.729, "step": 685 }, { "epoch": 31.181818181818183, "grad_norm": 0.012198768556118011, "learning_rate": 3.355263157894737e-06, "loss": 0.0014, "step": 686 }, { "epoch": 31.181818181818183, "eval_loss": 0.000835962186101824, "eval_runtime": 0.2414, "eval_samples_per_second": 364.526, "eval_steps_per_second": 45.566, "step": 686 }, { "epoch": 31.227272727272727, "grad_norm": 0.012970656156539917, "learning_rate": 3.3157894736842107e-06, "loss": 0.0014, "step": 687 }, { "epoch": 31.227272727272727, "eval_loss": 0.0008355574682354927, "eval_runtime": 0.2355, "eval_samples_per_second": 373.686, "eval_steps_per_second": 46.711, "step": 687 }, { "epoch": 31.272727272727273, "grad_norm": 0.01133756898343563, "learning_rate": 3.2763157894736844e-06, "loss": 0.0012, "step": 688 }, { "epoch": 31.272727272727273, "eval_loss": 0.0008351581636816263, "eval_runtime": 0.239, "eval_samples_per_second": 368.146, "eval_steps_per_second": 46.018, "step": 688 }, { "epoch": 31.318181818181817, "grad_norm": 0.014246292412281036, "learning_rate": 3.236842105263158e-06, "loss": 0.0014, "step": 689 }, { "epoch": 31.318181818181817, "eval_loss": 0.0008347549010068178, "eval_runtime": 0.2413, "eval_samples_per_second": 364.723, "eval_steps_per_second": 45.59, "step": 689 }, { "epoch": 31.363636363636363, "grad_norm": 0.01505040843039751, "learning_rate": 3.1973684210526314e-06, "loss": 0.0016, "step": 690 }, { "epoch": 31.363636363636363, "eval_loss": 0.0008343501249328256, "eval_runtime": 0.2321, "eval_samples_per_second": 379.09, "eval_steps_per_second": 47.386, "step": 690 }, { "epoch": 31.40909090909091, "grad_norm": 0.011749452911317348, "learning_rate": 3.157894736842105e-06, "loss": 0.0013, "step": 691 }, { "epoch": 31.40909090909091, "eval_loss": 0.0008339481428265572, "eval_runtime": 0.2656, "eval_samples_per_second": 331.332, "eval_steps_per_second": 41.416, "step": 691 }, { "epoch": 31.454545454545453, "grad_norm": 0.012921934016048908, "learning_rate": 3.1184210526315793e-06, "loss": 0.0015, "step": 692 }, { "epoch": 31.454545454545453, "eval_loss": 0.0008335394668392837, "eval_runtime": 0.2542, "eval_samples_per_second": 346.242, "eval_steps_per_second": 43.28, "step": 692 }, { "epoch": 31.5, "grad_norm": 0.01331315003335476, "learning_rate": 3.0789473684210526e-06, "loss": 0.0014, "step": 693 }, { "epoch": 31.5, "eval_loss": 0.0008331468561664224, "eval_runtime": 0.2417, "eval_samples_per_second": 364.055, "eval_steps_per_second": 45.507, "step": 693 }, { "epoch": 31.545454545454547, "grad_norm": 0.012770496308803558, "learning_rate": 3.0394736842105263e-06, "loss": 0.0015, "step": 694 }, { "epoch": 31.545454545454547, "eval_loss": 0.0008327368414029479, "eval_runtime": 0.2689, "eval_samples_per_second": 327.265, "eval_steps_per_second": 40.908, "step": 694 }, { "epoch": 31.59090909090909, "grad_norm": 0.012804139405488968, "learning_rate": 3e-06, "loss": 0.0014, "step": 695 }, { "epoch": 31.59090909090909, "eval_loss": 0.0008323252550326288, "eval_runtime": 0.2468, "eval_samples_per_second": 356.61, "eval_steps_per_second": 44.576, "step": 695 }, { "epoch": 31.636363636363637, "grad_norm": 0.014062759466469288, "learning_rate": 2.960526315789474e-06, "loss": 0.0015, "step": 696 }, { "epoch": 31.636363636363637, "eval_loss": 0.0008318935870192945, "eval_runtime": 0.2529, "eval_samples_per_second": 347.95, "eval_steps_per_second": 43.494, "step": 696 }, { "epoch": 31.681818181818183, "grad_norm": 0.013049440458416939, "learning_rate": 2.9210526315789475e-06, "loss": 0.0014, "step": 697 }, { "epoch": 31.681818181818183, "eval_loss": 0.0008314928272739053, "eval_runtime": 0.2521, "eval_samples_per_second": 349.0, "eval_steps_per_second": 43.625, "step": 697 }, { "epoch": 31.727272727272727, "grad_norm": 0.01172225084155798, "learning_rate": 2.881578947368421e-06, "loss": 0.0013, "step": 698 }, { "epoch": 31.727272727272727, "eval_loss": 0.0008310881094075739, "eval_runtime": 0.2672, "eval_samples_per_second": 329.329, "eval_steps_per_second": 41.166, "step": 698 }, { "epoch": 31.772727272727273, "grad_norm": 0.01266531739383936, "learning_rate": 2.842105263157895e-06, "loss": 0.0014, "step": 699 }, { "epoch": 31.772727272727273, "eval_loss": 0.0008307105163112283, "eval_runtime": 0.3176, "eval_samples_per_second": 277.082, "eval_steps_per_second": 34.635, "step": 699 }, { "epoch": 31.818181818181817, "grad_norm": 0.014071842655539513, "learning_rate": 2.8026315789473683e-06, "loss": 0.0015, "step": 700 }, { "epoch": 31.818181818181817, "eval_loss": 0.0008303424110636115, "eval_runtime": 0.2648, "eval_samples_per_second": 332.279, "eval_steps_per_second": 41.535, "step": 700 }, { "epoch": 31.863636363636363, "grad_norm": 0.01333391759544611, "learning_rate": 2.763157894736842e-06, "loss": 0.0015, "step": 701 }, { "epoch": 31.863636363636363, "eval_loss": 0.0008299809414893389, "eval_runtime": 0.2429, "eval_samples_per_second": 362.239, "eval_steps_per_second": 45.28, "step": 701 }, { "epoch": 31.90909090909091, "grad_norm": 0.010583317838609219, "learning_rate": 2.723684210526316e-06, "loss": 0.0012, "step": 702 }, { "epoch": 31.90909090909091, "eval_loss": 0.0008296439773403108, "eval_runtime": 0.2463, "eval_samples_per_second": 357.358, "eval_steps_per_second": 44.67, "step": 702 }, { "epoch": 31.954545454545453, "grad_norm": 0.01122986525297165, "learning_rate": 2.6842105263157895e-06, "loss": 0.0013, "step": 703 }, { "epoch": 31.954545454545453, "eval_loss": 0.0008293138234876096, "eval_runtime": 0.2433, "eval_samples_per_second": 361.652, "eval_steps_per_second": 45.206, "step": 703 }, { "epoch": 32.0, "grad_norm": 0.011437175795435905, "learning_rate": 2.644736842105263e-06, "loss": 0.0013, "step": 704 }, { "epoch": 32.0, "eval_loss": 0.0008289901888929307, "eval_runtime": 0.2357, "eval_samples_per_second": 373.319, "eval_steps_per_second": 46.665, "step": 704 }, { "epoch": 32.04545454545455, "grad_norm": 0.012699670158326626, "learning_rate": 2.605263157894737e-06, "loss": 0.0014, "step": 705 }, { "epoch": 32.04545454545455, "eval_loss": 0.0008286829688586295, "eval_runtime": 0.2319, "eval_samples_per_second": 379.476, "eval_steps_per_second": 47.435, "step": 705 }, { "epoch": 32.09090909090909, "grad_norm": 0.013239861465990543, "learning_rate": 2.5657894736842107e-06, "loss": 0.0014, "step": 706 }, { "epoch": 32.09090909090909, "eval_loss": 0.0008283716160804033, "eval_runtime": 0.2319, "eval_samples_per_second": 379.415, "eval_steps_per_second": 47.427, "step": 706 }, { "epoch": 32.13636363636363, "grad_norm": 0.012133197858929634, "learning_rate": 2.5263157894736844e-06, "loss": 0.0013, "step": 707 }, { "epoch": 32.13636363636363, "eval_loss": 0.0008280739421024919, "eval_runtime": 0.2242, "eval_samples_per_second": 392.558, "eval_steps_per_second": 49.07, "step": 707 }, { "epoch": 32.18181818181818, "grad_norm": 0.011126801371574402, "learning_rate": 2.4868421052631577e-06, "loss": 0.0013, "step": 708 }, { "epoch": 32.18181818181818, "eval_loss": 0.0008277747547253966, "eval_runtime": 0.2381, "eval_samples_per_second": 369.534, "eval_steps_per_second": 46.192, "step": 708 }, { "epoch": 32.22727272727273, "grad_norm": 0.012151258997619152, "learning_rate": 2.447368421052632e-06, "loss": 0.0014, "step": 709 }, { "epoch": 32.22727272727273, "eval_loss": 0.0008274810388684273, "eval_runtime": 0.265, "eval_samples_per_second": 332.045, "eval_steps_per_second": 41.506, "step": 709 }, { "epoch": 32.27272727272727, "grad_norm": 0.013219231739640236, "learning_rate": 2.407894736842105e-06, "loss": 0.0014, "step": 710 }, { "epoch": 32.27272727272727, "eval_loss": 0.0008271847036667168, "eval_runtime": 0.2428, "eval_samples_per_second": 362.463, "eval_steps_per_second": 45.308, "step": 710 }, { "epoch": 32.31818181818182, "grad_norm": 0.010275053791701794, "learning_rate": 2.368421052631579e-06, "loss": 0.0012, "step": 711 }, { "epoch": 32.31818181818182, "eval_loss": 0.0008268963429145515, "eval_runtime": 0.2418, "eval_samples_per_second": 363.953, "eval_steps_per_second": 45.494, "step": 711 }, { "epoch": 32.36363636363637, "grad_norm": 0.013079304248094559, "learning_rate": 2.328947368421053e-06, "loss": 0.0014, "step": 712 }, { "epoch": 32.36363636363637, "eval_loss": 0.00082661077613011, "eval_runtime": 0.232, "eval_samples_per_second": 379.238, "eval_steps_per_second": 47.405, "step": 712 }, { "epoch": 32.40909090909091, "grad_norm": 0.019619744271039963, "learning_rate": 2.2894736842105263e-06, "loss": 0.0014, "step": 713 }, { "epoch": 32.40909090909091, "eval_loss": 0.0008263156050816178, "eval_runtime": 0.2626, "eval_samples_per_second": 335.09, "eval_steps_per_second": 41.886, "step": 713 }, { "epoch": 32.45454545454545, "grad_norm": 0.014103109948337078, "learning_rate": 2.25e-06, "loss": 0.0015, "step": 714 }, { "epoch": 32.45454545454545, "eval_loss": 0.0008260206668637693, "eval_runtime": 0.256, "eval_samples_per_second": 343.813, "eval_steps_per_second": 42.977, "step": 714 }, { "epoch": 32.5, "grad_norm": 0.013360358774662018, "learning_rate": 2.2105263157894734e-06, "loss": 0.0015, "step": 715 }, { "epoch": 32.5, "eval_loss": 0.0008257552981376648, "eval_runtime": 0.2719, "eval_samples_per_second": 323.628, "eval_steps_per_second": 40.454, "step": 715 }, { "epoch": 32.54545454545455, "grad_norm": 0.012335807085037231, "learning_rate": 2.1710526315789475e-06, "loss": 0.0014, "step": 716 }, { "epoch": 32.54545454545455, "eval_loss": 0.0008254764834418893, "eval_runtime": 0.257, "eval_samples_per_second": 342.453, "eval_steps_per_second": 42.807, "step": 716 }, { "epoch": 32.59090909090909, "grad_norm": 0.012738436460494995, "learning_rate": 2.1315789473684212e-06, "loss": 0.0014, "step": 717 }, { "epoch": 32.59090909090909, "eval_loss": 0.0008252071565948427, "eval_runtime": 0.2774, "eval_samples_per_second": 317.284, "eval_steps_per_second": 39.66, "step": 717 }, { "epoch": 32.63636363636363, "grad_norm": 0.011913586407899857, "learning_rate": 2.0921052631578945e-06, "loss": 0.0013, "step": 718 }, { "epoch": 32.63636363636363, "eval_loss": 0.0008249431848526001, "eval_runtime": 0.2458, "eval_samples_per_second": 358.083, "eval_steps_per_second": 44.76, "step": 718 }, { "epoch": 32.68181818181818, "grad_norm": 0.010375920683145523, "learning_rate": 2.0526315789473687e-06, "loss": 0.0013, "step": 719 }, { "epoch": 32.68181818181818, "eval_loss": 0.0008246820070780814, "eval_runtime": 0.2548, "eval_samples_per_second": 345.32, "eval_steps_per_second": 43.165, "step": 719 }, { "epoch": 32.72727272727273, "grad_norm": 0.016080064699053764, "learning_rate": 2.013157894736842e-06, "loss": 0.0016, "step": 720 }, { "epoch": 32.72727272727273, "eval_loss": 0.0008244179771281779, "eval_runtime": 0.2695, "eval_samples_per_second": 326.571, "eval_steps_per_second": 40.821, "step": 720 }, { "epoch": 32.77272727272727, "grad_norm": 0.01252568420022726, "learning_rate": 1.9736842105263157e-06, "loss": 0.0013, "step": 721 }, { "epoch": 32.77272727272727, "eval_loss": 0.0008241839241236448, "eval_runtime": 0.2948, "eval_samples_per_second": 298.515, "eval_steps_per_second": 37.314, "step": 721 }, { "epoch": 32.81818181818182, "grad_norm": 0.012378372251987457, "learning_rate": 1.93421052631579e-06, "loss": 0.0014, "step": 722 }, { "epoch": 32.81818181818182, "eval_loss": 0.0008239619201049209, "eval_runtime": 0.2733, "eval_samples_per_second": 321.958, "eval_steps_per_second": 40.245, "step": 722 }, { "epoch": 32.86363636363637, "grad_norm": 0.013344389386475086, "learning_rate": 1.8947368421052632e-06, "loss": 0.0015, "step": 723 }, { "epoch": 32.86363636363637, "eval_loss": 0.000823718321043998, "eval_runtime": 0.2569, "eval_samples_per_second": 342.55, "eval_steps_per_second": 42.819, "step": 723 }, { "epoch": 32.90909090909091, "grad_norm": 0.012948358431458473, "learning_rate": 1.855263157894737e-06, "loss": 0.0015, "step": 724 }, { "epoch": 32.90909090909091, "eval_loss": 0.0008235003333538771, "eval_runtime": 0.2559, "eval_samples_per_second": 343.901, "eval_steps_per_second": 42.988, "step": 724 }, { "epoch": 32.95454545454545, "grad_norm": 0.011233711615204811, "learning_rate": 1.8157894736842106e-06, "loss": 0.0012, "step": 725 }, { "epoch": 32.95454545454545, "eval_loss": 0.000823282403871417, "eval_runtime": 0.2642, "eval_samples_per_second": 333.059, "eval_steps_per_second": 41.632, "step": 725 }, { "epoch": 33.0, "grad_norm": 0.01327808853238821, "learning_rate": 1.7763157894736842e-06, "loss": 0.0015, "step": 726 }, { "epoch": 33.0, "eval_loss": 0.0008230686071328819, "eval_runtime": 0.2841, "eval_samples_per_second": 309.721, "eval_steps_per_second": 38.715, "step": 726 }, { "epoch": 33.04545454545455, "grad_norm": 0.011662392877042294, "learning_rate": 1.7368421052631579e-06, "loss": 0.0014, "step": 727 }, { "epoch": 33.04545454545455, "eval_loss": 0.0008228750666603446, "eval_runtime": 0.249, "eval_samples_per_second": 353.382, "eval_steps_per_second": 44.173, "step": 727 }, { "epoch": 33.09090909090909, "grad_norm": 0.011290736496448517, "learning_rate": 1.6973684210526316e-06, "loss": 0.0013, "step": 728 }, { "epoch": 33.09090909090909, "eval_loss": 0.0008226787904277444, "eval_runtime": 0.2459, "eval_samples_per_second": 357.906, "eval_steps_per_second": 44.738, "step": 728 }, { "epoch": 33.13636363636363, "grad_norm": 0.011928938329219818, "learning_rate": 1.6578947368421053e-06, "loss": 0.0014, "step": 729 }, { "epoch": 33.13636363636363, "eval_loss": 0.0008224839111790061, "eval_runtime": 0.2693, "eval_samples_per_second": 326.807, "eval_steps_per_second": 40.851, "step": 729 }, { "epoch": 33.18181818181818, "grad_norm": 0.013969271443784237, "learning_rate": 1.618421052631579e-06, "loss": 0.0014, "step": 730 }, { "epoch": 33.18181818181818, "eval_loss": 0.0008223024778999388, "eval_runtime": 0.2985, "eval_samples_per_second": 294.848, "eval_steps_per_second": 36.856, "step": 730 }, { "epoch": 33.22727272727273, "grad_norm": 0.01247771643102169, "learning_rate": 1.5789473684210526e-06, "loss": 0.0014, "step": 731 }, { "epoch": 33.22727272727273, "eval_loss": 0.0008221129537560046, "eval_runtime": 0.2564, "eval_samples_per_second": 343.194, "eval_steps_per_second": 42.899, "step": 731 }, { "epoch": 33.27272727272727, "grad_norm": 0.012111688032746315, "learning_rate": 1.5394736842105263e-06, "loss": 0.0013, "step": 732 }, { "epoch": 33.27272727272727, "eval_loss": 0.0008219464216381311, "eval_runtime": 0.2484, "eval_samples_per_second": 354.308, "eval_steps_per_second": 44.289, "step": 732 }, { "epoch": 33.31818181818182, "grad_norm": 0.01268478948622942, "learning_rate": 1.5e-06, "loss": 0.0014, "step": 733 }, { "epoch": 33.31818181818182, "eval_loss": 0.0008217745926231146, "eval_runtime": 0.2938, "eval_samples_per_second": 299.518, "eval_steps_per_second": 37.44, "step": 733 }, { "epoch": 33.36363636363637, "grad_norm": 0.01151086576282978, "learning_rate": 1.4605263157894738e-06, "loss": 0.0012, "step": 734 }, { "epoch": 33.36363636363637, "eval_loss": 0.0008215824491344392, "eval_runtime": 0.2627, "eval_samples_per_second": 335.021, "eval_steps_per_second": 41.878, "step": 734 }, { "epoch": 33.40909090909091, "grad_norm": 0.012743664905428886, "learning_rate": 1.4210526315789475e-06, "loss": 0.0014, "step": 735 }, { "epoch": 33.40909090909091, "eval_loss": 0.0008214117842726409, "eval_runtime": 0.257, "eval_samples_per_second": 342.469, "eval_steps_per_second": 42.809, "step": 735 }, { "epoch": 33.45454545454545, "grad_norm": 0.014465508982539177, "learning_rate": 1.381578947368421e-06, "loss": 0.0015, "step": 736 }, { "epoch": 33.45454545454545, "eval_loss": 0.0008212332031689584, "eval_runtime": 0.2383, "eval_samples_per_second": 369.294, "eval_steps_per_second": 46.162, "step": 736 }, { "epoch": 33.5, "grad_norm": 0.011136289685964584, "learning_rate": 1.3421052631578947e-06, "loss": 0.0013, "step": 737 }, { "epoch": 33.5, "eval_loss": 0.0008210748201236129, "eval_runtime": 0.2515, "eval_samples_per_second": 349.848, "eval_steps_per_second": 43.731, "step": 737 }, { "epoch": 33.54545454545455, "grad_norm": 0.013279801234602928, "learning_rate": 1.3026315789473685e-06, "loss": 0.0014, "step": 738 }, { "epoch": 33.54545454545455, "eval_loss": 0.0008209014777094126, "eval_runtime": 0.2624, "eval_samples_per_second": 335.412, "eval_steps_per_second": 41.926, "step": 738 }, { "epoch": 33.59090909090909, "grad_norm": 0.011146324686706066, "learning_rate": 1.2631578947368422e-06, "loss": 0.0012, "step": 739 }, { "epoch": 33.59090909090909, "eval_loss": 0.0008207445498555899, "eval_runtime": 0.2477, "eval_samples_per_second": 355.277, "eval_steps_per_second": 44.41, "step": 739 }, { "epoch": 33.63636363636363, "grad_norm": 0.011300037615001202, "learning_rate": 1.223684210526316e-06, "loss": 0.0013, "step": 740 }, { "epoch": 33.63636363636363, "eval_loss": 0.0008206011261790991, "eval_runtime": 0.2524, "eval_samples_per_second": 348.598, "eval_steps_per_second": 43.575, "step": 740 }, { "epoch": 33.68181818181818, "grad_norm": 0.013210857287049294, "learning_rate": 1.1842105263157894e-06, "loss": 0.0013, "step": 741 }, { "epoch": 33.68181818181818, "eval_loss": 0.0008204494952224195, "eval_runtime": 0.2769, "eval_samples_per_second": 317.777, "eval_steps_per_second": 39.722, "step": 741 }, { "epoch": 33.72727272727273, "grad_norm": 0.011201176792383194, "learning_rate": 1.1447368421052632e-06, "loss": 0.0013, "step": 742 }, { "epoch": 33.72727272727273, "eval_loss": 0.0008203128236345947, "eval_runtime": 0.259, "eval_samples_per_second": 339.749, "eval_steps_per_second": 42.469, "step": 742 }, { "epoch": 33.77272727272727, "grad_norm": 0.012550720945000648, "learning_rate": 1.1052631578947367e-06, "loss": 0.0013, "step": 743 }, { "epoch": 33.77272727272727, "eval_loss": 0.0008201680611819029, "eval_runtime": 0.2549, "eval_samples_per_second": 345.3, "eval_steps_per_second": 43.163, "step": 743 }, { "epoch": 33.81818181818182, "grad_norm": 0.011524029076099396, "learning_rate": 1.0657894736842106e-06, "loss": 0.0014, "step": 744 }, { "epoch": 33.81818181818182, "eval_loss": 0.0008200569427572191, "eval_runtime": 0.2576, "eval_samples_per_second": 341.575, "eval_steps_per_second": 42.697, "step": 744 }, { "epoch": 33.86363636363637, "grad_norm": 0.014999749138951302, "learning_rate": 1.0263157894736843e-06, "loss": 0.0015, "step": 745 }, { "epoch": 33.86363636363637, "eval_loss": 0.0008199459407478571, "eval_runtime": 0.2573, "eval_samples_per_second": 342.038, "eval_steps_per_second": 42.755, "step": 745 }, { "epoch": 33.90909090909091, "grad_norm": 0.013432620093226433, "learning_rate": 9.868421052631579e-07, "loss": 0.0014, "step": 746 }, { "epoch": 33.90909090909091, "eval_loss": 0.0008198119467124343, "eval_runtime": 0.2794, "eval_samples_per_second": 314.936, "eval_steps_per_second": 39.367, "step": 746 }, { "epoch": 33.95454545454545, "grad_norm": 0.011333504691720009, "learning_rate": 9.473684210526316e-07, "loss": 0.0014, "step": 747 }, { "epoch": 33.95454545454545, "eval_loss": 0.0008196914568543434, "eval_runtime": 0.2549, "eval_samples_per_second": 345.205, "eval_steps_per_second": 43.151, "step": 747 }, { "epoch": 34.0, "grad_norm": 0.0102554215118289, "learning_rate": 9.078947368421053e-07, "loss": 0.0012, "step": 748 }, { "epoch": 34.0, "eval_loss": 0.0008195764967240393, "eval_runtime": 0.2656, "eval_samples_per_second": 331.349, "eval_steps_per_second": 41.419, "step": 748 }, { "epoch": 34.04545454545455, "grad_norm": 0.011500447988510132, "learning_rate": 8.684210526315789e-07, "loss": 0.0013, "step": 749 }, { "epoch": 34.04545454545455, "eval_loss": 0.0008194709080271423, "eval_runtime": 0.2631, "eval_samples_per_second": 334.415, "eval_steps_per_second": 41.802, "step": 749 }, { "epoch": 34.09090909090909, "grad_norm": 0.011614636518061161, "learning_rate": 8.289473684210527e-07, "loss": 0.0014, "step": 750 }, { "epoch": 34.09090909090909, "eval_loss": 0.0008193707326427102, "eval_runtime": 0.2898, "eval_samples_per_second": 303.667, "eval_steps_per_second": 37.958, "step": 750 }, { "epoch": 34.13636363636363, "grad_norm": 0.010696956887841225, "learning_rate": 7.894736842105263e-07, "loss": 0.0013, "step": 751 }, { "epoch": 34.13636363636363, "eval_loss": 0.0008192665409296751, "eval_runtime": 0.2328, "eval_samples_per_second": 378.006, "eval_steps_per_second": 47.251, "step": 751 }, { "epoch": 34.18181818181818, "grad_norm": 0.011633389629423618, "learning_rate": 7.5e-07, "loss": 0.0014, "step": 752 }, { "epoch": 34.18181818181818, "eval_loss": 0.0008191689848899841, "eval_runtime": 0.2379, "eval_samples_per_second": 369.833, "eval_steps_per_second": 46.229, "step": 752 }, { "epoch": 34.22727272727273, "grad_norm": 0.013071279041469097, "learning_rate": 7.105263157894737e-07, "loss": 0.0013, "step": 753 }, { "epoch": 34.22727272727273, "eval_loss": 0.0008190743392333388, "eval_runtime": 0.2373, "eval_samples_per_second": 370.812, "eval_steps_per_second": 46.351, "step": 753 }, { "epoch": 34.27272727272727, "grad_norm": 0.011386328376829624, "learning_rate": 6.710526315789474e-07, "loss": 0.0013, "step": 754 }, { "epoch": 34.27272727272727, "eval_loss": 0.0008190052467398345, "eval_runtime": 0.2237, "eval_samples_per_second": 393.447, "eval_steps_per_second": 49.181, "step": 754 }, { "epoch": 34.31818181818182, "grad_norm": 0.011327208951115608, "learning_rate": 6.315789473684211e-07, "loss": 0.0013, "step": 755 }, { "epoch": 34.31818181818182, "eval_loss": 0.0008189321961253881, "eval_runtime": 0.2293, "eval_samples_per_second": 383.737, "eval_steps_per_second": 47.967, "step": 755 }, { "epoch": 34.36363636363637, "grad_norm": 0.011524545960128307, "learning_rate": 5.921052631578947e-07, "loss": 0.0014, "step": 756 }, { "epoch": 34.36363636363637, "eval_loss": 0.0008188713109120727, "eval_runtime": 0.2355, "eval_samples_per_second": 373.643, "eval_steps_per_second": 46.705, "step": 756 }, { "epoch": 34.40909090909091, "grad_norm": 0.012313243001699448, "learning_rate": 5.526315789473683e-07, "loss": 0.0014, "step": 757 }, { "epoch": 34.40909090909091, "eval_loss": 0.0008188103674910963, "eval_runtime": 0.4518, "eval_samples_per_second": 194.767, "eval_steps_per_second": 24.346, "step": 757 }, { "epoch": 34.45454545454545, "grad_norm": 0.012687238864600658, "learning_rate": 5.131578947368422e-07, "loss": 0.0014, "step": 758 }, { "epoch": 34.45454545454545, "eval_loss": 0.0008187480852939188, "eval_runtime": 0.2941, "eval_samples_per_second": 299.267, "eval_steps_per_second": 37.408, "step": 758 }, { "epoch": 34.5, "grad_norm": 0.012826275080442429, "learning_rate": 4.736842105263158e-07, "loss": 0.0013, "step": 759 }, { "epoch": 34.5, "eval_loss": 0.0008186926716007292, "eval_runtime": 0.3991, "eval_samples_per_second": 220.5, "eval_steps_per_second": 27.562, "step": 759 }, { "epoch": 34.54545454545455, "grad_norm": 0.012961960397660732, "learning_rate": 4.3421052631578947e-07, "loss": 0.0015, "step": 760 }, { "epoch": 34.54545454545455, "eval_loss": 0.0008186465711332858, "eval_runtime": 0.2484, "eval_samples_per_second": 354.263, "eval_steps_per_second": 44.283, "step": 760 }, { "epoch": 34.59090909090909, "grad_norm": 0.013269671238958836, "learning_rate": 3.9473684210526315e-07, "loss": 0.0014, "step": 761 }, { "epoch": 34.59090909090909, "eval_loss": 0.0008186018676497042, "eval_runtime": 0.291, "eval_samples_per_second": 302.416, "eval_steps_per_second": 37.802, "step": 761 }, { "epoch": 34.63636363636363, "grad_norm": 0.012951558455824852, "learning_rate": 3.5526315789473687e-07, "loss": 0.0013, "step": 762 }, { "epoch": 34.63636363636363, "eval_loss": 0.0008185504120774567, "eval_runtime": 0.2594, "eval_samples_per_second": 339.213, "eval_steps_per_second": 42.402, "step": 762 }, { "epoch": 34.68181818181818, "grad_norm": 0.01040305569767952, "learning_rate": 3.1578947368421055e-07, "loss": 0.0013, "step": 763 }, { "epoch": 34.68181818181818, "eval_loss": 0.0008185274782590568, "eval_runtime": 0.3222, "eval_samples_per_second": 273.081, "eval_steps_per_second": 34.135, "step": 763 }, { "epoch": 34.72727272727273, "grad_norm": 0.013104148209095001, "learning_rate": 2.7631578947368417e-07, "loss": 0.0014, "step": 764 }, { "epoch": 34.72727272727273, "eval_loss": 0.0008184895268641412, "eval_runtime": 0.3124, "eval_samples_per_second": 281.65, "eval_steps_per_second": 35.206, "step": 764 }, { "epoch": 34.77272727272727, "grad_norm": 0.012136269360780716, "learning_rate": 2.368421052631579e-07, "loss": 0.0014, "step": 765 }, { "epoch": 34.77272727272727, "eval_loss": 0.0008184570469893515, "eval_runtime": 0.2394, "eval_samples_per_second": 367.55, "eval_steps_per_second": 45.944, "step": 765 }, { "epoch": 34.81818181818182, "grad_norm": 0.011621113866567612, "learning_rate": 1.9736842105263157e-07, "loss": 0.0014, "step": 766 }, { "epoch": 34.81818181818182, "eval_loss": 0.0008184341131709516, "eval_runtime": 0.2627, "eval_samples_per_second": 334.967, "eval_steps_per_second": 41.871, "step": 766 }, { "epoch": 34.86363636363637, "grad_norm": 0.0140585508197546, "learning_rate": 1.5789473684210527e-07, "loss": 0.0014, "step": 767 }, { "epoch": 34.86363636363637, "eval_loss": 0.0008184110629372299, "eval_runtime": 0.2624, "eval_samples_per_second": 335.359, "eval_steps_per_second": 41.92, "step": 767 }, { "epoch": 34.90909090909091, "grad_norm": 0.0137332146987319, "learning_rate": 1.1842105263157895e-07, "loss": 0.0013, "step": 768 }, { "epoch": 34.90909090909091, "eval_loss": 0.000818394822999835, "eval_runtime": 0.3455, "eval_samples_per_second": 254.714, "eval_steps_per_second": 31.839, "step": 768 }, { "epoch": 34.95454545454545, "grad_norm": 0.013574851676821709, "learning_rate": 7.894736842105264e-08, "loss": 0.0015, "step": 769 }, { "epoch": 34.95454545454545, "eval_loss": 0.000818385393358767, "eval_runtime": 0.4011, "eval_samples_per_second": 219.413, "eval_steps_per_second": 27.427, "step": 769 }, { "epoch": 35.0, "grad_norm": 0.01393211167305708, "learning_rate": 3.947368421052632e-08, "loss": 0.0014, "step": 770 }, { "epoch": 35.0, "eval_loss": 0.0008183813188225031, "eval_runtime": 0.2776, "eval_samples_per_second": 316.984, "eval_steps_per_second": 39.623, "step": 770 } ], "logging_steps": 1, "max_steps": 770, "num_input_tokens_seen": 0, "num_train_epochs": 35, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 335566894333440.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }