diff --git "a/checkpoint-770/trainer_state.json" "b/checkpoint-770/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-770/trainer_state.json" @@ -0,0 +1,11584 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 35.0, + "eval_steps": 1, + "global_step": 770, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.045454545454545456, + "grad_norm": 5.237588882446289, + "learning_rate": 0.0, + "loss": 2.0682, + "step": 1 + }, + { + "epoch": 0.045454545454545456, + "eval_loss": 2.063732147216797, + "eval_runtime": 0.2778, + "eval_samples_per_second": 316.813, + "eval_steps_per_second": 39.602, + "step": 1 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 5.7836594581604, + "learning_rate": 3e-06, + "loss": 2.0543, + "step": 2 + }, + { + "epoch": 0.09090909090909091, + "eval_loss": 2.058272123336792, + "eval_runtime": 0.2138, + "eval_samples_per_second": 411.689, + "eval_steps_per_second": 51.461, + "step": 2 + }, + { + "epoch": 0.13636363636363635, + "grad_norm": 4.997707366943359, + "learning_rate": 6e-06, + "loss": 2.106, + "step": 3 + }, + { + "epoch": 0.13636363636363635, + "eval_loss": 2.044473171234131, + "eval_runtime": 0.2229, + "eval_samples_per_second": 394.85, + "eval_steps_per_second": 49.356, + "step": 3 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 4.480862140655518, + "learning_rate": 9e-06, + "loss": 2.0133, + "step": 4 + }, + { + "epoch": 0.18181818181818182, + "eval_loss": 2.026616096496582, + "eval_runtime": 0.2098, + "eval_samples_per_second": 419.399, + "eval_steps_per_second": 52.425, + "step": 4 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 4.413949489593506, + "learning_rate": 1.2e-05, + "loss": 2.0339, + "step": 5 + }, + { + "epoch": 0.22727272727272727, + "eval_loss": 2.0050275325775146, + "eval_runtime": 0.2083, + "eval_samples_per_second": 422.489, + "eval_steps_per_second": 52.811, + "step": 5 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 3.8636281490325928, + "learning_rate": 1.5e-05, + "loss": 1.9456, + "step": 6 + }, + { + "epoch": 0.2727272727272727, + "eval_loss": 1.978696346282959, + "eval_runtime": 0.2234, + "eval_samples_per_second": 393.959, + "eval_steps_per_second": 49.245, + "step": 6 + }, + { + "epoch": 0.3181818181818182, + "grad_norm": 5.352145671844482, + "learning_rate": 1.8e-05, + "loss": 2.0702, + "step": 7 + }, + { + "epoch": 0.3181818181818182, + "eval_loss": 1.9451583623886108, + "eval_runtime": 0.2365, + "eval_samples_per_second": 372.165, + "eval_steps_per_second": 46.521, + "step": 7 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 6.098653316497803, + "learning_rate": 2.1e-05, + "loss": 1.9057, + "step": 8 + }, + { + "epoch": 0.36363636363636365, + "eval_loss": 1.908401608467102, + "eval_runtime": 0.2109, + "eval_samples_per_second": 417.279, + "eval_steps_per_second": 52.16, + "step": 8 + }, + { + "epoch": 0.4090909090909091, + "grad_norm": 4.3218302726745605, + "learning_rate": 2.4e-05, + "loss": 2.0159, + "step": 9 + }, + { + "epoch": 0.4090909090909091, + "eval_loss": 1.860684871673584, + "eval_runtime": 0.2261, + "eval_samples_per_second": 389.203, + "eval_steps_per_second": 48.65, + "step": 9 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 4.778627395629883, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.8808, + "step": 10 + }, + { + "epoch": 0.45454545454545453, + "eval_loss": 1.793589472770691, + "eval_runtime": 0.2922, + "eval_samples_per_second": 301.187, + "eval_steps_per_second": 37.648, + "step": 10 + }, + { + "epoch": 0.5, + "grad_norm": 5.957038879394531, + "learning_rate": 3e-05, + "loss": 1.896, + "step": 11 + }, + { + "epoch": 0.5, + "eval_loss": 1.7104023694992065, + "eval_runtime": 0.3181, + "eval_samples_per_second": 276.671, + "eval_steps_per_second": 34.584, + "step": 11 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 6.62753963470459, + "learning_rate": 2.9960526315789475e-05, + "loss": 1.7627, + "step": 12 + }, + { + "epoch": 0.5454545454545454, + "eval_loss": 1.6353049278259277, + "eval_runtime": 0.4101, + "eval_samples_per_second": 214.57, + "eval_steps_per_second": 26.821, + "step": 12 + }, + { + "epoch": 0.5909090909090909, + "grad_norm": 5.637991905212402, + "learning_rate": 2.992105263157895e-05, + "loss": 1.6927, + "step": 13 + }, + { + "epoch": 0.5909090909090909, + "eval_loss": 1.5653632879257202, + "eval_runtime": 0.3772, + "eval_samples_per_second": 233.322, + "eval_steps_per_second": 29.165, + "step": 13 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 7.619434356689453, + "learning_rate": 2.9881578947368423e-05, + "loss": 1.5805, + "step": 14 + }, + { + "epoch": 0.6363636363636364, + "eval_loss": 1.4975870847702026, + "eval_runtime": 0.2484, + "eval_samples_per_second": 354.217, + "eval_steps_per_second": 44.277, + "step": 14 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 8.660569190979004, + "learning_rate": 2.9842105263157894e-05, + "loss": 1.5803, + "step": 15 + }, + { + "epoch": 0.6818181818181818, + "eval_loss": 1.4246007204055786, + "eval_runtime": 0.3233, + "eval_samples_per_second": 272.164, + "eval_steps_per_second": 34.02, + "step": 15 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 6.809484481811523, + "learning_rate": 2.980263157894737e-05, + "loss": 1.4897, + "step": 16 + }, + { + "epoch": 0.7272727272727273, + "eval_loss": 1.3582329750061035, + "eval_runtime": 0.2729, + "eval_samples_per_second": 322.483, + "eval_steps_per_second": 40.31, + "step": 16 + }, + { + "epoch": 0.7727272727272727, + "grad_norm": 7.0124711990356445, + "learning_rate": 2.9763157894736842e-05, + "loss": 1.3831, + "step": 17 + }, + { + "epoch": 0.7727272727272727, + "eval_loss": 1.2863445281982422, + "eval_runtime": 0.2734, + "eval_samples_per_second": 321.916, + "eval_steps_per_second": 40.24, + "step": 17 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 6.749629974365234, + "learning_rate": 2.9723684210526316e-05, + "loss": 1.2616, + "step": 18 + }, + { + "epoch": 0.8181818181818182, + "eval_loss": 1.1985043287277222, + "eval_runtime": 0.2953, + "eval_samples_per_second": 297.968, + "eval_steps_per_second": 37.246, + "step": 18 + }, + { + "epoch": 0.8636363636363636, + "grad_norm": 8.935945510864258, + "learning_rate": 2.968421052631579e-05, + "loss": 1.2058, + "step": 19 + }, + { + "epoch": 0.8636363636363636, + "eval_loss": 1.1089844703674316, + "eval_runtime": 0.3886, + "eval_samples_per_second": 226.48, + "eval_steps_per_second": 28.31, + "step": 19 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 5.048995018005371, + "learning_rate": 2.9644736842105265e-05, + "loss": 1.1399, + "step": 20 + }, + { + "epoch": 0.9090909090909091, + "eval_loss": 1.0176739692687988, + "eval_runtime": 0.2913, + "eval_samples_per_second": 302.091, + "eval_steps_per_second": 37.761, + "step": 20 + }, + { + "epoch": 0.9545454545454546, + "grad_norm": 6.563332557678223, + "learning_rate": 2.9605263157894735e-05, + "loss": 0.9906, + "step": 21 + }, + { + "epoch": 0.9545454545454546, + "eval_loss": 0.930864155292511, + "eval_runtime": 0.2425, + "eval_samples_per_second": 362.831, + "eval_steps_per_second": 45.354, + "step": 21 + }, + { + "epoch": 1.0, + "grad_norm": 12.079025268554688, + "learning_rate": 2.9565789473684213e-05, + "loss": 1.0795, + "step": 22 + }, + { + "epoch": 1.0, + "eval_loss": 0.8574727773666382, + "eval_runtime": 0.2662, + "eval_samples_per_second": 330.588, + "eval_steps_per_second": 41.323, + "step": 22 + }, + { + "epoch": 1.0454545454545454, + "grad_norm": 5.452284336090088, + "learning_rate": 2.9526315789473684e-05, + "loss": 0.8862, + "step": 23 + }, + { + "epoch": 1.0454545454545454, + "eval_loss": 0.7834421396255493, + "eval_runtime": 0.2559, + "eval_samples_per_second": 343.941, + "eval_steps_per_second": 42.993, + "step": 23 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 6.780595302581787, + "learning_rate": 2.9486842105263158e-05, + "loss": 0.7825, + "step": 24 + }, + { + "epoch": 1.0909090909090908, + "eval_loss": 0.7133036255836487, + "eval_runtime": 0.4064, + "eval_samples_per_second": 216.526, + "eval_steps_per_second": 27.066, + "step": 24 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 6.756824016571045, + "learning_rate": 2.9447368421052635e-05, + "loss": 0.9249, + "step": 25 + }, + { + "epoch": 1.1363636363636362, + "eval_loss": 0.652009904384613, + "eval_runtime": 0.4122, + "eval_samples_per_second": 213.486, + "eval_steps_per_second": 26.686, + "step": 25 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 4.798681259155273, + "learning_rate": 2.9407894736842106e-05, + "loss": 0.5773, + "step": 26 + }, + { + "epoch": 1.1818181818181819, + "eval_loss": 0.6013602614402771, + "eval_runtime": 0.4182, + "eval_samples_per_second": 210.411, + "eval_steps_per_second": 26.301, + "step": 26 + }, + { + "epoch": 1.2272727272727273, + "grad_norm": 4.608880996704102, + "learning_rate": 2.936842105263158e-05, + "loss": 0.6573, + "step": 27 + }, + { + "epoch": 1.2272727272727273, + "eval_loss": 0.5579346418380737, + "eval_runtime": 0.5426, + "eval_samples_per_second": 162.18, + "eval_steps_per_second": 20.272, + "step": 27 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 4.582436561584473, + "learning_rate": 2.9328947368421055e-05, + "loss": 0.5408, + "step": 28 + }, + { + "epoch": 1.2727272727272727, + "eval_loss": 0.5213125348091125, + "eval_runtime": 0.274, + "eval_samples_per_second": 321.214, + "eval_steps_per_second": 40.152, + "step": 28 + }, + { + "epoch": 1.3181818181818181, + "grad_norm": 6.145488262176514, + "learning_rate": 2.928947368421053e-05, + "loss": 0.6888, + "step": 29 + }, + { + "epoch": 1.3181818181818181, + "eval_loss": 0.47387245297431946, + "eval_runtime": 0.2153, + "eval_samples_per_second": 408.668, + "eval_steps_per_second": 51.083, + "step": 29 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 4.611596584320068, + "learning_rate": 2.925e-05, + "loss": 0.584, + "step": 30 + }, + { + "epoch": 1.3636363636363638, + "eval_loss": 0.41591426730155945, + "eval_runtime": 0.2262, + "eval_samples_per_second": 388.952, + "eval_steps_per_second": 48.619, + "step": 30 + }, + { + "epoch": 1.4090909090909092, + "grad_norm": 4.470975875854492, + "learning_rate": 2.9210526315789474e-05, + "loss": 0.4962, + "step": 31 + }, + { + "epoch": 1.4090909090909092, + "eval_loss": 0.3586600720882416, + "eval_runtime": 0.2233, + "eval_samples_per_second": 394.029, + "eval_steps_per_second": 49.254, + "step": 31 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 3.111593008041382, + "learning_rate": 2.9171052631578948e-05, + "loss": 0.3594, + "step": 32 + }, + { + "epoch": 1.4545454545454546, + "eval_loss": 0.3188125491142273, + "eval_runtime": 0.3382, + "eval_samples_per_second": 260.203, + "eval_steps_per_second": 32.525, + "step": 32 + }, + { + "epoch": 1.5, + "grad_norm": 3.246596336364746, + "learning_rate": 2.9131578947368422e-05, + "loss": 0.3643, + "step": 33 + }, + { + "epoch": 1.5, + "eval_loss": 0.2900885343551636, + "eval_runtime": 0.2904, + "eval_samples_per_second": 302.998, + "eval_steps_per_second": 37.875, + "step": 33 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 4.4003376960754395, + "learning_rate": 2.9092105263157893e-05, + "loss": 0.3334, + "step": 34 + }, + { + "epoch": 1.5454545454545454, + "eval_loss": 0.260213166475296, + "eval_runtime": 0.3641, + "eval_samples_per_second": 241.707, + "eval_steps_per_second": 30.213, + "step": 34 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 5.7509236335754395, + "learning_rate": 2.905263157894737e-05, + "loss": 0.3754, + "step": 35 + }, + { + "epoch": 1.5909090909090908, + "eval_loss": 0.2297886312007904, + "eval_runtime": 0.3003, + "eval_samples_per_second": 293.032, + "eval_steps_per_second": 36.629, + "step": 35 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 3.7421319484710693, + "learning_rate": 2.901315789473684e-05, + "loss": 0.3108, + "step": 36 + }, + { + "epoch": 1.6363636363636362, + "eval_loss": 0.21363353729248047, + "eval_runtime": 0.4783, + "eval_samples_per_second": 183.979, + "eval_steps_per_second": 22.997, + "step": 36 + }, + { + "epoch": 1.6818181818181817, + "grad_norm": 3.7049357891082764, + "learning_rate": 2.8973684210526315e-05, + "loss": 0.2933, + "step": 37 + }, + { + "epoch": 1.6818181818181817, + "eval_loss": 0.20323915779590607, + "eval_runtime": 0.25, + "eval_samples_per_second": 351.979, + "eval_steps_per_second": 43.997, + "step": 37 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 2.6143414974212646, + "learning_rate": 2.893421052631579e-05, + "loss": 0.2208, + "step": 38 + }, + { + "epoch": 1.7272727272727273, + "eval_loss": 0.19065451622009277, + "eval_runtime": 0.284, + "eval_samples_per_second": 309.864, + "eval_steps_per_second": 38.733, + "step": 38 + }, + { + "epoch": 1.7727272727272727, + "grad_norm": 3.0895273685455322, + "learning_rate": 2.8894736842105263e-05, + "loss": 0.2448, + "step": 39 + }, + { + "epoch": 1.7727272727272727, + "eval_loss": 0.17271381616592407, + "eval_runtime": 0.3543, + "eval_samples_per_second": 248.403, + "eval_steps_per_second": 31.05, + "step": 39 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 1.7658973932266235, + "learning_rate": 2.8855263157894738e-05, + "loss": 0.1742, + "step": 40 + }, + { + "epoch": 1.8181818181818183, + "eval_loss": 0.152969092130661, + "eval_runtime": 0.2714, + "eval_samples_per_second": 324.231, + "eval_steps_per_second": 40.529, + "step": 40 + }, + { + "epoch": 1.8636363636363638, + "grad_norm": 1.7428200244903564, + "learning_rate": 2.8815789473684212e-05, + "loss": 0.1717, + "step": 41 + }, + { + "epoch": 1.8636363636363638, + "eval_loss": 0.13160385191440582, + "eval_runtime": 0.2485, + "eval_samples_per_second": 354.091, + "eval_steps_per_second": 44.261, + "step": 41 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 1.9848284721374512, + "learning_rate": 2.8776315789473686e-05, + "loss": 0.1487, + "step": 42 + }, + { + "epoch": 1.9090909090909092, + "eval_loss": 0.11496426910161972, + "eval_runtime": 0.2812, + "eval_samples_per_second": 312.902, + "eval_steps_per_second": 39.113, + "step": 42 + }, + { + "epoch": 1.9545454545454546, + "grad_norm": 1.8623422384262085, + "learning_rate": 2.8736842105263157e-05, + "loss": 0.1671, + "step": 43 + }, + { + "epoch": 1.9545454545454546, + "eval_loss": 0.10060829669237137, + "eval_runtime": 0.531, + "eval_samples_per_second": 165.721, + "eval_steps_per_second": 20.715, + "step": 43 + }, + { + "epoch": 2.0, + "grad_norm": 1.254258632659912, + "learning_rate": 2.8697368421052634e-05, + "loss": 0.1296, + "step": 44 + }, + { + "epoch": 2.0, + "eval_loss": 0.09032303839921951, + "eval_runtime": 0.4212, + "eval_samples_per_second": 208.91, + "eval_steps_per_second": 26.114, + "step": 44 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 1.7023710012435913, + "learning_rate": 2.8657894736842105e-05, + "loss": 0.1269, + "step": 45 + }, + { + "epoch": 2.0454545454545454, + "eval_loss": 0.08172891288995743, + "eval_runtime": 0.3434, + "eval_samples_per_second": 256.286, + "eval_steps_per_second": 32.036, + "step": 45 + }, + { + "epoch": 2.090909090909091, + "grad_norm": 1.1132336854934692, + "learning_rate": 2.861842105263158e-05, + "loss": 0.1087, + "step": 46 + }, + { + "epoch": 2.090909090909091, + "eval_loss": 0.07363786548376083, + "eval_runtime": 0.2227, + "eval_samples_per_second": 395.148, + "eval_steps_per_second": 49.393, + "step": 46 + }, + { + "epoch": 2.1363636363636362, + "grad_norm": 1.2574397325515747, + "learning_rate": 2.8578947368421053e-05, + "loss": 0.1007, + "step": 47 + }, + { + "epoch": 2.1363636363636362, + "eval_loss": 0.0676058903336525, + "eval_runtime": 0.2162, + "eval_samples_per_second": 406.971, + "eval_steps_per_second": 50.871, + "step": 47 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 1.1193581819534302, + "learning_rate": 2.8539473684210527e-05, + "loss": 0.0932, + "step": 48 + }, + { + "epoch": 2.1818181818181817, + "eval_loss": 0.060314346104860306, + "eval_runtime": 0.2456, + "eval_samples_per_second": 358.319, + "eval_steps_per_second": 44.79, + "step": 48 + }, + { + "epoch": 2.227272727272727, + "grad_norm": 1.1668117046356201, + "learning_rate": 2.8499999999999998e-05, + "loss": 0.0885, + "step": 49 + }, + { + "epoch": 2.227272727272727, + "eval_loss": 0.05352572351694107, + "eval_runtime": 0.2143, + "eval_samples_per_second": 410.66, + "eval_steps_per_second": 51.333, + "step": 49 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.9329622387886047, + "learning_rate": 2.8460526315789476e-05, + "loss": 0.0768, + "step": 50 + }, + { + "epoch": 2.2727272727272725, + "eval_loss": 0.049994777888059616, + "eval_runtime": 0.2184, + "eval_samples_per_second": 402.932, + "eval_steps_per_second": 50.367, + "step": 50 + }, + { + "epoch": 2.3181818181818183, + "grad_norm": 1.4205875396728516, + "learning_rate": 2.8421052631578946e-05, + "loss": 0.0871, + "step": 51 + }, + { + "epoch": 2.3181818181818183, + "eval_loss": 0.046269264072179794, + "eval_runtime": 0.2199, + "eval_samples_per_second": 400.111, + "eval_steps_per_second": 50.014, + "step": 51 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 0.6296802163124084, + "learning_rate": 2.838157894736842e-05, + "loss": 0.0597, + "step": 52 + }, + { + "epoch": 2.3636363636363638, + "eval_loss": 0.04288846254348755, + "eval_runtime": 0.2154, + "eval_samples_per_second": 408.528, + "eval_steps_per_second": 51.066, + "step": 52 + }, + { + "epoch": 2.409090909090909, + "grad_norm": 0.8016664981842041, + "learning_rate": 2.8342105263157898e-05, + "loss": 0.0573, + "step": 53 + }, + { + "epoch": 2.409090909090909, + "eval_loss": 0.03866353631019592, + "eval_runtime": 0.2104, + "eval_samples_per_second": 418.258, + "eval_steps_per_second": 52.282, + "step": 53 + }, + { + "epoch": 2.4545454545454546, + "grad_norm": 0.5186643600463867, + "learning_rate": 2.830263157894737e-05, + "loss": 0.0533, + "step": 54 + }, + { + "epoch": 2.4545454545454546, + "eval_loss": 0.03540382906794548, + "eval_runtime": 0.2148, + "eval_samples_per_second": 409.705, + "eval_steps_per_second": 51.213, + "step": 54 + }, + { + "epoch": 2.5, + "grad_norm": 0.616000771522522, + "learning_rate": 2.8263157894736843e-05, + "loss": 0.0543, + "step": 55 + }, + { + "epoch": 2.5, + "eval_loss": 0.03242316469550133, + "eval_runtime": 0.2116, + "eval_samples_per_second": 415.828, + "eval_steps_per_second": 51.979, + "step": 55 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 0.6781826615333557, + "learning_rate": 2.8223684210526317e-05, + "loss": 0.0527, + "step": 56 + }, + { + "epoch": 2.5454545454545454, + "eval_loss": 0.029892653226852417, + "eval_runtime": 0.2231, + "eval_samples_per_second": 394.465, + "eval_steps_per_second": 49.308, + "step": 56 + }, + { + "epoch": 2.590909090909091, + "grad_norm": 0.40553542971611023, + "learning_rate": 2.818421052631579e-05, + "loss": 0.043, + "step": 57 + }, + { + "epoch": 2.590909090909091, + "eval_loss": 0.02773384563624859, + "eval_runtime": 0.212, + "eval_samples_per_second": 415.108, + "eval_steps_per_second": 51.889, + "step": 57 + }, + { + "epoch": 2.6363636363636362, + "grad_norm": 0.46068763732910156, + "learning_rate": 2.8144736842105262e-05, + "loss": 0.0408, + "step": 58 + }, + { + "epoch": 2.6363636363636362, + "eval_loss": 0.025741351768374443, + "eval_runtime": 0.2177, + "eval_samples_per_second": 404.269, + "eval_steps_per_second": 50.534, + "step": 58 + }, + { + "epoch": 2.6818181818181817, + "grad_norm": 0.42782941460609436, + "learning_rate": 2.810526315789474e-05, + "loss": 0.0404, + "step": 59 + }, + { + "epoch": 2.6818181818181817, + "eval_loss": 0.023805884644389153, + "eval_runtime": 0.2164, + "eval_samples_per_second": 406.611, + "eval_steps_per_second": 50.826, + "step": 59 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.3100360035896301, + "learning_rate": 2.806578947368421e-05, + "loss": 0.0348, + "step": 60 + }, + { + "epoch": 2.7272727272727275, + "eval_loss": 0.022079171612858772, + "eval_runtime": 0.2121, + "eval_samples_per_second": 414.803, + "eval_steps_per_second": 51.85, + "step": 60 + }, + { + "epoch": 2.7727272727272725, + "grad_norm": 0.3292113244533539, + "learning_rate": 2.8026315789473685e-05, + "loss": 0.0331, + "step": 61 + }, + { + "epoch": 2.7727272727272725, + "eval_loss": 0.020567093044519424, + "eval_runtime": 0.2183, + "eval_samples_per_second": 403.093, + "eval_steps_per_second": 50.387, + "step": 61 + }, + { + "epoch": 2.8181818181818183, + "grad_norm": 0.4177182912826538, + "learning_rate": 2.798684210526316e-05, + "loss": 0.0323, + "step": 62 + }, + { + "epoch": 2.8181818181818183, + "eval_loss": 0.019224492833018303, + "eval_runtime": 0.2119, + "eval_samples_per_second": 415.211, + "eval_steps_per_second": 51.901, + "step": 62 + }, + { + "epoch": 2.8636363636363638, + "grad_norm": 0.23254263401031494, + "learning_rate": 2.7947368421052633e-05, + "loss": 0.0252, + "step": 63 + }, + { + "epoch": 2.8636363636363638, + "eval_loss": 0.01814187504351139, + "eval_runtime": 0.2203, + "eval_samples_per_second": 399.469, + "eval_steps_per_second": 49.934, + "step": 63 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.38803598284721375, + "learning_rate": 2.7907894736842104e-05, + "loss": 0.031, + "step": 64 + }, + { + "epoch": 2.909090909090909, + "eval_loss": 0.01718403585255146, + "eval_runtime": 0.2179, + "eval_samples_per_second": 403.933, + "eval_steps_per_second": 50.492, + "step": 64 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 0.33151182532310486, + "learning_rate": 2.786842105263158e-05, + "loss": 0.03, + "step": 65 + }, + { + "epoch": 2.9545454545454546, + "eval_loss": 0.016221443191170692, + "eval_runtime": 0.2114, + "eval_samples_per_second": 416.237, + "eval_steps_per_second": 52.03, + "step": 65 + }, + { + "epoch": 3.0, + "grad_norm": 0.25049498677253723, + "learning_rate": 2.7828947368421055e-05, + "loss": 0.0244, + "step": 66 + }, + { + "epoch": 3.0, + "eval_loss": 0.015314313583076, + "eval_runtime": 0.2173, + "eval_samples_per_second": 404.944, + "eval_steps_per_second": 50.618, + "step": 66 + }, + { + "epoch": 3.0454545454545454, + "grad_norm": 0.2723033130168915, + "learning_rate": 2.7789473684210526e-05, + "loss": 0.0235, + "step": 67 + }, + { + "epoch": 3.0454545454545454, + "eval_loss": 0.014571275562047958, + "eval_runtime": 0.2218, + "eval_samples_per_second": 396.808, + "eval_steps_per_second": 49.601, + "step": 67 + }, + { + "epoch": 3.090909090909091, + "grad_norm": 0.20975647866725922, + "learning_rate": 2.7750000000000004e-05, + "loss": 0.0222, + "step": 68 + }, + { + "epoch": 3.090909090909091, + "eval_loss": 0.013959475792944431, + "eval_runtime": 0.2232, + "eval_samples_per_second": 394.228, + "eval_steps_per_second": 49.279, + "step": 68 + }, + { + "epoch": 3.1363636363636362, + "grad_norm": 0.2025345116853714, + "learning_rate": 2.7710526315789474e-05, + "loss": 0.0228, + "step": 69 + }, + { + "epoch": 3.1363636363636362, + "eval_loss": 0.013426948338747025, + "eval_runtime": 0.2201, + "eval_samples_per_second": 399.844, + "eval_steps_per_second": 49.981, + "step": 69 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 0.2033005654811859, + "learning_rate": 2.767105263157895e-05, + "loss": 0.0209, + "step": 70 + }, + { + "epoch": 3.1818181818181817, + "eval_loss": 0.012989457696676254, + "eval_runtime": 0.2125, + "eval_samples_per_second": 414.107, + "eval_steps_per_second": 51.763, + "step": 70 + }, + { + "epoch": 3.227272727272727, + "grad_norm": 0.18534056842327118, + "learning_rate": 2.7631578947368423e-05, + "loss": 0.0199, + "step": 71 + }, + { + "epoch": 3.227272727272727, + "eval_loss": 0.012577124871313572, + "eval_runtime": 0.2145, + "eval_samples_per_second": 410.253, + "eval_steps_per_second": 51.282, + "step": 71 + }, + { + "epoch": 3.2727272727272725, + "grad_norm": 0.16536517441272736, + "learning_rate": 2.7592105263157897e-05, + "loss": 0.017, + "step": 72 + }, + { + "epoch": 3.2727272727272725, + "eval_loss": 0.012171071022748947, + "eval_runtime": 0.225, + "eval_samples_per_second": 391.108, + "eval_steps_per_second": 48.888, + "step": 72 + }, + { + "epoch": 3.3181818181818183, + "grad_norm": 0.14233346283435822, + "learning_rate": 2.7552631578947368e-05, + "loss": 0.0173, + "step": 73 + }, + { + "epoch": 3.3181818181818183, + "eval_loss": 0.011801562272012234, + "eval_runtime": 0.2206, + "eval_samples_per_second": 398.895, + "eval_steps_per_second": 49.862, + "step": 73 + }, + { + "epoch": 3.3636363636363638, + "grad_norm": 0.18418766558170319, + "learning_rate": 2.7513157894736842e-05, + "loss": 0.0193, + "step": 74 + }, + { + "epoch": 3.3636363636363638, + "eval_loss": 0.01138223335146904, + "eval_runtime": 0.2269, + "eval_samples_per_second": 387.873, + "eval_steps_per_second": 48.484, + "step": 74 + }, + { + "epoch": 3.409090909090909, + "grad_norm": 0.1584126502275467, + "learning_rate": 2.7473684210526316e-05, + "loss": 0.0174, + "step": 75 + }, + { + "epoch": 3.409090909090909, + "eval_loss": 0.010961382649838924, + "eval_runtime": 0.2368, + "eval_samples_per_second": 371.65, + "eval_steps_per_second": 46.456, + "step": 75 + }, + { + "epoch": 3.4545454545454546, + "grad_norm": 0.15311338007450104, + "learning_rate": 2.743421052631579e-05, + "loss": 0.0152, + "step": 76 + }, + { + "epoch": 3.4545454545454546, + "eval_loss": 0.01055182795971632, + "eval_runtime": 0.2222, + "eval_samples_per_second": 396.112, + "eval_steps_per_second": 49.514, + "step": 76 + }, + { + "epoch": 3.5, + "grad_norm": 0.1895849108695984, + "learning_rate": 2.739473684210526e-05, + "loss": 0.0185, + "step": 77 + }, + { + "epoch": 3.5, + "eval_loss": 0.01013518963009119, + "eval_runtime": 0.2228, + "eval_samples_per_second": 394.993, + "eval_steps_per_second": 49.374, + "step": 77 + }, + { + "epoch": 3.5454545454545454, + "grad_norm": 0.1422702521085739, + "learning_rate": 2.735526315789474e-05, + "loss": 0.0163, + "step": 78 + }, + { + "epoch": 3.5454545454545454, + "eval_loss": 0.009774941019713879, + "eval_runtime": 0.2328, + "eval_samples_per_second": 378.047, + "eval_steps_per_second": 47.256, + "step": 78 + }, + { + "epoch": 3.590909090909091, + "grad_norm": 0.15089201927185059, + "learning_rate": 2.7315789473684213e-05, + "loss": 0.0162, + "step": 79 + }, + { + "epoch": 3.590909090909091, + "eval_loss": 0.009458563290536404, + "eval_runtime": 0.2335, + "eval_samples_per_second": 376.887, + "eval_steps_per_second": 47.111, + "step": 79 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.16338452696800232, + "learning_rate": 2.7276315789473683e-05, + "loss": 0.015, + "step": 80 + }, + { + "epoch": 3.6363636363636362, + "eval_loss": 0.00917022954672575, + "eval_runtime": 0.2355, + "eval_samples_per_second": 373.621, + "eval_steps_per_second": 46.703, + "step": 80 + }, + { + "epoch": 3.6818181818181817, + "grad_norm": 0.14390893280506134, + "learning_rate": 2.723684210526316e-05, + "loss": 0.0148, + "step": 81 + }, + { + "epoch": 3.6818181818181817, + "eval_loss": 0.00891400221735239, + "eval_runtime": 0.2182, + "eval_samples_per_second": 403.39, + "eval_steps_per_second": 50.424, + "step": 81 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 0.23557034134864807, + "learning_rate": 2.719736842105263e-05, + "loss": 0.0173, + "step": 82 + }, + { + "epoch": 3.7272727272727275, + "eval_loss": 0.008688293397426605, + "eval_runtime": 0.2236, + "eval_samples_per_second": 393.639, + "eval_steps_per_second": 49.205, + "step": 82 + }, + { + "epoch": 3.7727272727272725, + "grad_norm": 0.12254065275192261, + "learning_rate": 2.7157894736842106e-05, + "loss": 0.0133, + "step": 83 + }, + { + "epoch": 3.7727272727272725, + "eval_loss": 0.008477870374917984, + "eval_runtime": 0.2215, + "eval_samples_per_second": 397.361, + "eval_steps_per_second": 49.67, + "step": 83 + }, + { + "epoch": 3.8181818181818183, + "grad_norm": 0.10980476438999176, + "learning_rate": 2.711842105263158e-05, + "loss": 0.0128, + "step": 84 + }, + { + "epoch": 3.8181818181818183, + "eval_loss": 0.00827844813466072, + "eval_runtime": 0.2234, + "eval_samples_per_second": 393.942, + "eval_steps_per_second": 49.243, + "step": 84 + }, + { + "epoch": 3.8636363636363638, + "grad_norm": 0.13196319341659546, + "learning_rate": 2.7078947368421054e-05, + "loss": 0.013, + "step": 85 + }, + { + "epoch": 3.8636363636363638, + "eval_loss": 0.008079243823885918, + "eval_runtime": 0.2221, + "eval_samples_per_second": 396.214, + "eval_steps_per_second": 49.527, + "step": 85 + }, + { + "epoch": 3.909090909090909, + "grad_norm": 0.10154274851083755, + "learning_rate": 2.7039473684210525e-05, + "loss": 0.0122, + "step": 86 + }, + { + "epoch": 3.909090909090909, + "eval_loss": 0.007896007038652897, + "eval_runtime": 0.224, + "eval_samples_per_second": 392.924, + "eval_steps_per_second": 49.115, + "step": 86 + }, + { + "epoch": 3.9545454545454546, + "grad_norm": 0.1324293613433838, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.0126, + "step": 87 + }, + { + "epoch": 3.9545454545454546, + "eval_loss": 0.007718592882156372, + "eval_runtime": 0.2196, + "eval_samples_per_second": 400.741, + "eval_steps_per_second": 50.093, + "step": 87 + }, + { + "epoch": 4.0, + "grad_norm": 0.10327129811048508, + "learning_rate": 2.6960526315789473e-05, + "loss": 0.012, + "step": 88 + }, + { + "epoch": 4.0, + "eval_loss": 0.007555495481938124, + "eval_runtime": 0.2221, + "eval_samples_per_second": 396.243, + "eval_steps_per_second": 49.53, + "step": 88 + }, + { + "epoch": 4.045454545454546, + "grad_norm": 0.09408023953437805, + "learning_rate": 2.6921052631578947e-05, + "loss": 0.0115, + "step": 89 + }, + { + "epoch": 4.045454545454546, + "eval_loss": 0.0074074105359613895, + "eval_runtime": 0.2205, + "eval_samples_per_second": 399.137, + "eval_steps_per_second": 49.892, + "step": 89 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 0.09438669681549072, + "learning_rate": 2.688157894736842e-05, + "loss": 0.0117, + "step": 90 + }, + { + "epoch": 4.090909090909091, + "eval_loss": 0.007270295638591051, + "eval_runtime": 0.2207, + "eval_samples_per_second": 398.716, + "eval_steps_per_second": 49.839, + "step": 90 + }, + { + "epoch": 4.136363636363637, + "grad_norm": 0.10392805188894272, + "learning_rate": 2.6842105263157896e-05, + "loss": 0.0121, + "step": 91 + }, + { + "epoch": 4.136363636363637, + "eval_loss": 0.007134940009564161, + "eval_runtime": 0.2226, + "eval_samples_per_second": 395.399, + "eval_steps_per_second": 49.425, + "step": 91 + }, + { + "epoch": 4.181818181818182, + "grad_norm": 0.09916353225708008, + "learning_rate": 2.6802631578947366e-05, + "loss": 0.0111, + "step": 92 + }, + { + "epoch": 4.181818181818182, + "eval_loss": 0.007011328358203173, + "eval_runtime": 0.2218, + "eval_samples_per_second": 396.679, + "eval_steps_per_second": 49.585, + "step": 92 + }, + { + "epoch": 4.2272727272727275, + "grad_norm": 0.11726672202348709, + "learning_rate": 2.6763157894736844e-05, + "loss": 0.0128, + "step": 93 + }, + { + "epoch": 4.2272727272727275, + "eval_loss": 0.006890672724694014, + "eval_runtime": 0.2242, + "eval_samples_per_second": 392.462, + "eval_steps_per_second": 49.058, + "step": 93 + }, + { + "epoch": 4.2727272727272725, + "grad_norm": 0.10044334828853607, + "learning_rate": 2.6723684210526318e-05, + "loss": 0.0115, + "step": 94 + }, + { + "epoch": 4.2727272727272725, + "eval_loss": 0.006776686292141676, + "eval_runtime": 0.2201, + "eval_samples_per_second": 399.833, + "eval_steps_per_second": 49.979, + "step": 94 + }, + { + "epoch": 4.318181818181818, + "grad_norm": 0.09276948869228363, + "learning_rate": 2.668421052631579e-05, + "loss": 0.011, + "step": 95 + }, + { + "epoch": 4.318181818181818, + "eval_loss": 0.00667022867128253, + "eval_runtime": 0.2225, + "eval_samples_per_second": 395.502, + "eval_steps_per_second": 49.438, + "step": 95 + }, + { + "epoch": 4.363636363636363, + "grad_norm": 0.09718704223632812, + "learning_rate": 2.6644736842105266e-05, + "loss": 0.0113, + "step": 96 + }, + { + "epoch": 4.363636363636363, + "eval_loss": 0.006560015957802534, + "eval_runtime": 0.2172, + "eval_samples_per_second": 405.161, + "eval_steps_per_second": 50.645, + "step": 96 + }, + { + "epoch": 4.409090909090909, + "grad_norm": 0.11359906196594238, + "learning_rate": 2.6605263157894737e-05, + "loss": 0.0105, + "step": 97 + }, + { + "epoch": 4.409090909090909, + "eval_loss": 0.0064485338516533375, + "eval_runtime": 0.221, + "eval_samples_per_second": 398.174, + "eval_steps_per_second": 49.772, + "step": 97 + }, + { + "epoch": 4.454545454545454, + "grad_norm": 0.0942469909787178, + "learning_rate": 2.656578947368421e-05, + "loss": 0.0104, + "step": 98 + }, + { + "epoch": 4.454545454545454, + "eval_loss": 0.00633326917886734, + "eval_runtime": 0.2241, + "eval_samples_per_second": 392.749, + "eval_steps_per_second": 49.094, + "step": 98 + }, + { + "epoch": 4.5, + "grad_norm": 0.08770338445901871, + "learning_rate": 2.6526315789473685e-05, + "loss": 0.0097, + "step": 99 + }, + { + "epoch": 4.5, + "eval_loss": 0.006226606201380491, + "eval_runtime": 0.221, + "eval_samples_per_second": 398.22, + "eval_steps_per_second": 49.777, + "step": 99 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.0902254730463028, + "learning_rate": 2.648684210526316e-05, + "loss": 0.0102, + "step": 100 + }, + { + "epoch": 4.545454545454545, + "eval_loss": 0.0061218636110424995, + "eval_runtime": 0.2218, + "eval_samples_per_second": 396.725, + "eval_steps_per_second": 49.591, + "step": 100 + }, + { + "epoch": 4.590909090909091, + "grad_norm": 0.07302330434322357, + "learning_rate": 2.644736842105263e-05, + "loss": 0.0086, + "step": 101 + }, + { + "epoch": 4.590909090909091, + "eval_loss": 0.006022432819008827, + "eval_runtime": 0.2242, + "eval_samples_per_second": 392.497, + "eval_steps_per_second": 49.062, + "step": 101 + }, + { + "epoch": 4.636363636363637, + "grad_norm": 0.09044598042964935, + "learning_rate": 2.6407894736842108e-05, + "loss": 0.0098, + "step": 102 + }, + { + "epoch": 4.636363636363637, + "eval_loss": 0.005927449557930231, + "eval_runtime": 0.219, + "eval_samples_per_second": 401.867, + "eval_steps_per_second": 50.233, + "step": 102 + }, + { + "epoch": 4.681818181818182, + "grad_norm": 0.07847205549478531, + "learning_rate": 2.636842105263158e-05, + "loss": 0.0093, + "step": 103 + }, + { + "epoch": 4.681818181818182, + "eval_loss": 0.005836833734065294, + "eval_runtime": 0.2477, + "eval_samples_per_second": 355.291, + "eval_steps_per_second": 44.411, + "step": 103 + }, + { + "epoch": 4.7272727272727275, + "grad_norm": 0.09054490178823471, + "learning_rate": 2.6328947368421053e-05, + "loss": 0.0093, + "step": 104 + }, + { + "epoch": 4.7272727272727275, + "eval_loss": 0.005744776222854853, + "eval_runtime": 0.2237, + "eval_samples_per_second": 393.373, + "eval_steps_per_second": 49.172, + "step": 104 + }, + { + "epoch": 4.7727272727272725, + "grad_norm": 0.08056215196847916, + "learning_rate": 2.6289473684210527e-05, + "loss": 0.0095, + "step": 105 + }, + { + "epoch": 4.7727272727272725, + "eval_loss": 0.005655229557305574, + "eval_runtime": 0.2221, + "eval_samples_per_second": 396.284, + "eval_steps_per_second": 49.535, + "step": 105 + }, + { + "epoch": 4.818181818181818, + "grad_norm": 0.07413677871227264, + "learning_rate": 2.625e-05, + "loss": 0.0095, + "step": 106 + }, + { + "epoch": 4.818181818181818, + "eval_loss": 0.005573854316025972, + "eval_runtime": 0.2219, + "eval_samples_per_second": 396.499, + "eval_steps_per_second": 49.562, + "step": 106 + }, + { + "epoch": 4.863636363636363, + "grad_norm": 0.09156908839941025, + "learning_rate": 2.6210526315789475e-05, + "loss": 0.0094, + "step": 107 + }, + { + "epoch": 4.863636363636363, + "eval_loss": 0.005500171799212694, + "eval_runtime": 0.218, + "eval_samples_per_second": 403.697, + "eval_steps_per_second": 50.462, + "step": 107 + }, + { + "epoch": 4.909090909090909, + "grad_norm": 0.07806240767240524, + "learning_rate": 2.617105263157895e-05, + "loss": 0.009, + "step": 108 + }, + { + "epoch": 4.909090909090909, + "eval_loss": 0.005432323087006807, + "eval_runtime": 0.2208, + "eval_samples_per_second": 398.497, + "eval_steps_per_second": 49.812, + "step": 108 + }, + { + "epoch": 4.954545454545455, + "grad_norm": 0.07705673575401306, + "learning_rate": 2.6131578947368424e-05, + "loss": 0.0091, + "step": 109 + }, + { + "epoch": 4.954545454545455, + "eval_loss": 0.005366500001400709, + "eval_runtime": 0.2187, + "eval_samples_per_second": 402.388, + "eval_steps_per_second": 50.299, + "step": 109 + }, + { + "epoch": 5.0, + "grad_norm": 0.0743311420083046, + "learning_rate": 2.6092105263157894e-05, + "loss": 0.0087, + "step": 110 + }, + { + "epoch": 5.0, + "eval_loss": 0.005299717653542757, + "eval_runtime": 0.215, + "eval_samples_per_second": 409.298, + "eval_steps_per_second": 51.162, + "step": 110 + }, + { + "epoch": 5.045454545454546, + "grad_norm": 0.0689927488565445, + "learning_rate": 2.605263157894737e-05, + "loss": 0.0081, + "step": 111 + }, + { + "epoch": 5.045454545454546, + "eval_loss": 0.005235993303358555, + "eval_runtime": 0.2192, + "eval_samples_per_second": 401.457, + "eval_steps_per_second": 50.182, + "step": 111 + }, + { + "epoch": 5.090909090909091, + "grad_norm": 0.06892900913953781, + "learning_rate": 2.6013157894736843e-05, + "loss": 0.0082, + "step": 112 + }, + { + "epoch": 5.090909090909091, + "eval_loss": 0.005173509940505028, + "eval_runtime": 0.219, + "eval_samples_per_second": 401.777, + "eval_steps_per_second": 50.222, + "step": 112 + }, + { + "epoch": 5.136363636363637, + "grad_norm": 0.06960764527320862, + "learning_rate": 2.5973684210526317e-05, + "loss": 0.0081, + "step": 113 + }, + { + "epoch": 5.136363636363637, + "eval_loss": 0.005112760700285435, + "eval_runtime": 0.2203, + "eval_samples_per_second": 399.491, + "eval_steps_per_second": 49.936, + "step": 113 + }, + { + "epoch": 5.181818181818182, + "grad_norm": 0.07173731923103333, + "learning_rate": 2.5934210526315788e-05, + "loss": 0.008, + "step": 114 + }, + { + "epoch": 5.181818181818182, + "eval_loss": 0.00505533954128623, + "eval_runtime": 0.2227, + "eval_samples_per_second": 395.105, + "eval_steps_per_second": 49.388, + "step": 114 + }, + { + "epoch": 5.2272727272727275, + "grad_norm": 0.06811046600341797, + "learning_rate": 2.5894736842105265e-05, + "loss": 0.0074, + "step": 115 + }, + { + "epoch": 5.2272727272727275, + "eval_loss": 0.0049970815889537334, + "eval_runtime": 0.2171, + "eval_samples_per_second": 405.344, + "eval_steps_per_second": 50.668, + "step": 115 + }, + { + "epoch": 5.2727272727272725, + "grad_norm": 0.0676768496632576, + "learning_rate": 2.5855263157894736e-05, + "loss": 0.0076, + "step": 116 + }, + { + "epoch": 5.2727272727272725, + "eval_loss": 0.004939272068440914, + "eval_runtime": 0.2319, + "eval_samples_per_second": 379.44, + "eval_steps_per_second": 47.43, + "step": 116 + }, + { + "epoch": 5.318181818181818, + "grad_norm": 0.06927932053804398, + "learning_rate": 2.581578947368421e-05, + "loss": 0.0078, + "step": 117 + }, + { + "epoch": 5.318181818181818, + "eval_loss": 0.004879767540842295, + "eval_runtime": 0.2354, + "eval_samples_per_second": 373.859, + "eval_steps_per_second": 46.732, + "step": 117 + }, + { + "epoch": 5.363636363636363, + "grad_norm": 0.0733099952340126, + "learning_rate": 2.5776315789473684e-05, + "loss": 0.009, + "step": 118 + }, + { + "epoch": 5.363636363636363, + "eval_loss": 0.00482180854305625, + "eval_runtime": 0.22, + "eval_samples_per_second": 399.97, + "eval_steps_per_second": 49.996, + "step": 118 + }, + { + "epoch": 5.409090909090909, + "grad_norm": 0.07873851805925369, + "learning_rate": 2.5736842105263158e-05, + "loss": 0.0085, + "step": 119 + }, + { + "epoch": 5.409090909090909, + "eval_loss": 0.004767131991684437, + "eval_runtime": 0.2272, + "eval_samples_per_second": 387.355, + "eval_steps_per_second": 48.419, + "step": 119 + }, + { + "epoch": 5.454545454545454, + "grad_norm": 0.06912100315093994, + "learning_rate": 2.5697368421052632e-05, + "loss": 0.0075, + "step": 120 + }, + { + "epoch": 5.454545454545454, + "eval_loss": 0.004715087823569775, + "eval_runtime": 0.2216, + "eval_samples_per_second": 397.159, + "eval_steps_per_second": 49.645, + "step": 120 + }, + { + "epoch": 5.5, + "grad_norm": 0.059973061084747314, + "learning_rate": 2.5657894736842107e-05, + "loss": 0.0078, + "step": 121 + }, + { + "epoch": 5.5, + "eval_loss": 0.004667165223509073, + "eval_runtime": 0.2226, + "eval_samples_per_second": 395.251, + "eval_steps_per_second": 49.406, + "step": 121 + }, + { + "epoch": 5.545454545454545, + "grad_norm": 0.06346078962087631, + "learning_rate": 2.561842105263158e-05, + "loss": 0.0073, + "step": 122 + }, + { + "epoch": 5.545454545454545, + "eval_loss": 0.004621806554496288, + "eval_runtime": 0.2263, + "eval_samples_per_second": 388.791, + "eval_steps_per_second": 48.599, + "step": 122 + }, + { + "epoch": 5.590909090909091, + "grad_norm": 0.07588130235671997, + "learning_rate": 2.557894736842105e-05, + "loss": 0.0079, + "step": 123 + }, + { + "epoch": 5.590909090909091, + "eval_loss": 0.004576975479722023, + "eval_runtime": 0.2216, + "eval_samples_per_second": 397.081, + "eval_steps_per_second": 49.635, + "step": 123 + }, + { + "epoch": 5.636363636363637, + "grad_norm": 0.0569930225610733, + "learning_rate": 2.553947368421053e-05, + "loss": 0.0068, + "step": 124 + }, + { + "epoch": 5.636363636363637, + "eval_loss": 0.004534974228590727, + "eval_runtime": 0.2207, + "eval_samples_per_second": 398.807, + "eval_steps_per_second": 49.851, + "step": 124 + }, + { + "epoch": 5.681818181818182, + "grad_norm": 0.07023297250270844, + "learning_rate": 2.55e-05, + "loss": 0.0078, + "step": 125 + }, + { + "epoch": 5.681818181818182, + "eval_loss": 0.004494456574320793, + "eval_runtime": 0.2276, + "eval_samples_per_second": 386.655, + "eval_steps_per_second": 48.332, + "step": 125 + }, + { + "epoch": 5.7272727272727275, + "grad_norm": 0.0586245059967041, + "learning_rate": 2.5460526315789474e-05, + "loss": 0.0072, + "step": 126 + }, + { + "epoch": 5.7272727272727275, + "eval_loss": 0.0044531743042171, + "eval_runtime": 0.2354, + "eval_samples_per_second": 373.803, + "eval_steps_per_second": 46.725, + "step": 126 + }, + { + "epoch": 5.7727272727272725, + "grad_norm": 0.0652911588549614, + "learning_rate": 2.5421052631578948e-05, + "loss": 0.0073, + "step": 127 + }, + { + "epoch": 5.7727272727272725, + "eval_loss": 0.004411415662616491, + "eval_runtime": 0.236, + "eval_samples_per_second": 372.941, + "eval_steps_per_second": 46.618, + "step": 127 + }, + { + "epoch": 5.818181818181818, + "grad_norm": 0.05701863393187523, + "learning_rate": 2.5381578947368422e-05, + "loss": 0.0067, + "step": 128 + }, + { + "epoch": 5.818181818181818, + "eval_loss": 0.004371690563857555, + "eval_runtime": 0.2358, + "eval_samples_per_second": 373.191, + "eval_steps_per_second": 46.649, + "step": 128 + }, + { + "epoch": 5.863636363636363, + "grad_norm": 0.05990603566169739, + "learning_rate": 2.5342105263157893e-05, + "loss": 0.0071, + "step": 129 + }, + { + "epoch": 5.863636363636363, + "eval_loss": 0.004331877455115318, + "eval_runtime": 0.2301, + "eval_samples_per_second": 382.487, + "eval_steps_per_second": 47.811, + "step": 129 + }, + { + "epoch": 5.909090909090909, + "grad_norm": 0.06283283233642578, + "learning_rate": 2.530263157894737e-05, + "loss": 0.0071, + "step": 130 + }, + { + "epoch": 5.909090909090909, + "eval_loss": 0.0042935688979923725, + "eval_runtime": 0.2387, + "eval_samples_per_second": 368.63, + "eval_steps_per_second": 46.079, + "step": 130 + }, + { + "epoch": 5.954545454545455, + "grad_norm": 0.060048509389162064, + "learning_rate": 2.526315789473684e-05, + "loss": 0.0067, + "step": 131 + }, + { + "epoch": 5.954545454545455, + "eval_loss": 0.0042540752328932285, + "eval_runtime": 0.2471, + "eval_samples_per_second": 356.096, + "eval_steps_per_second": 44.512, + "step": 131 + }, + { + "epoch": 6.0, + "grad_norm": 0.060563940554857254, + "learning_rate": 2.5223684210526315e-05, + "loss": 0.0064, + "step": 132 + }, + { + "epoch": 6.0, + "eval_loss": 0.004213025793433189, + "eval_runtime": 0.2399, + "eval_samples_per_second": 366.883, + "eval_steps_per_second": 45.86, + "step": 132 + }, + { + "epoch": 6.045454545454546, + "grad_norm": 0.060382332652807236, + "learning_rate": 2.518421052631579e-05, + "loss": 0.0071, + "step": 133 + }, + { + "epoch": 6.045454545454546, + "eval_loss": 0.004174065310508013, + "eval_runtime": 0.2268, + "eval_samples_per_second": 388.075, + "eval_steps_per_second": 48.509, + "step": 133 + }, + { + "epoch": 6.090909090909091, + "grad_norm": 0.06080484017729759, + "learning_rate": 2.5144736842105264e-05, + "loss": 0.0073, + "step": 134 + }, + { + "epoch": 6.090909090909091, + "eval_loss": 0.0041358619928359985, + "eval_runtime": 0.2229, + "eval_samples_per_second": 394.875, + "eval_steps_per_second": 49.359, + "step": 134 + }, + { + "epoch": 6.136363636363637, + "grad_norm": 0.057626206427812576, + "learning_rate": 2.5105263157894738e-05, + "loss": 0.0066, + "step": 135 + }, + { + "epoch": 6.136363636363637, + "eval_loss": 0.004101672675460577, + "eval_runtime": 0.2283, + "eval_samples_per_second": 385.395, + "eval_steps_per_second": 48.174, + "step": 135 + }, + { + "epoch": 6.181818181818182, + "grad_norm": 0.06599877029657364, + "learning_rate": 2.5065789473684212e-05, + "loss": 0.0075, + "step": 136 + }, + { + "epoch": 6.181818181818182, + "eval_loss": 0.004067064728587866, + "eval_runtime": 0.221, + "eval_samples_per_second": 398.26, + "eval_steps_per_second": 49.783, + "step": 136 + }, + { + "epoch": 6.2272727272727275, + "grad_norm": 0.05654873698949814, + "learning_rate": 2.5026315789473686e-05, + "loss": 0.0066, + "step": 137 + }, + { + "epoch": 6.2272727272727275, + "eval_loss": 0.0040321690030395985, + "eval_runtime": 0.2329, + "eval_samples_per_second": 377.882, + "eval_steps_per_second": 47.235, + "step": 137 + }, + { + "epoch": 6.2727272727272725, + "grad_norm": 0.05717283487319946, + "learning_rate": 2.4986842105263157e-05, + "loss": 0.0067, + "step": 138 + }, + { + "epoch": 6.2727272727272725, + "eval_loss": 0.003995668143033981, + "eval_runtime": 0.2203, + "eval_samples_per_second": 399.464, + "eval_steps_per_second": 49.933, + "step": 138 + }, + { + "epoch": 6.318181818181818, + "grad_norm": 0.06036869063973427, + "learning_rate": 2.4947368421052635e-05, + "loss": 0.0064, + "step": 139 + }, + { + "epoch": 6.318181818181818, + "eval_loss": 0.003956829197704792, + "eval_runtime": 0.2294, + "eval_samples_per_second": 383.681, + "eval_steps_per_second": 47.96, + "step": 139 + }, + { + "epoch": 6.363636363636363, + "grad_norm": 0.05111813545227051, + "learning_rate": 2.4907894736842105e-05, + "loss": 0.0063, + "step": 140 + }, + { + "epoch": 6.363636363636363, + "eval_loss": 0.003918844275176525, + "eval_runtime": 0.2186, + "eval_samples_per_second": 402.513, + "eval_steps_per_second": 50.314, + "step": 140 + }, + { + "epoch": 6.409090909090909, + "grad_norm": 0.0621768981218338, + "learning_rate": 2.486842105263158e-05, + "loss": 0.0064, + "step": 141 + }, + { + "epoch": 6.409090909090909, + "eval_loss": 0.00388046121224761, + "eval_runtime": 0.2309, + "eval_samples_per_second": 381.086, + "eval_steps_per_second": 47.636, + "step": 141 + }, + { + "epoch": 6.454545454545454, + "grad_norm": 0.06089349836111069, + "learning_rate": 2.4828947368421054e-05, + "loss": 0.0066, + "step": 142 + }, + { + "epoch": 6.454545454545454, + "eval_loss": 0.003839746816083789, + "eval_runtime": 0.2206, + "eval_samples_per_second": 398.928, + "eval_steps_per_second": 49.866, + "step": 142 + }, + { + "epoch": 6.5, + "grad_norm": 0.05007468909025192, + "learning_rate": 2.4789473684210528e-05, + "loss": 0.0061, + "step": 143 + }, + { + "epoch": 6.5, + "eval_loss": 0.003801233833655715, + "eval_runtime": 0.2295, + "eval_samples_per_second": 383.433, + "eval_steps_per_second": 47.929, + "step": 143 + }, + { + "epoch": 6.545454545454545, + "grad_norm": 0.053182121366262436, + "learning_rate": 2.475e-05, + "loss": 0.0059, + "step": 144 + }, + { + "epoch": 6.545454545454545, + "eval_loss": 0.003766607493162155, + "eval_runtime": 0.2261, + "eval_samples_per_second": 389.124, + "eval_steps_per_second": 48.64, + "step": 144 + }, + { + "epoch": 6.590909090909091, + "grad_norm": 0.051414087414741516, + "learning_rate": 2.4710526315789476e-05, + "loss": 0.0061, + "step": 145 + }, + { + "epoch": 6.590909090909091, + "eval_loss": 0.0037348391488194466, + "eval_runtime": 0.2309, + "eval_samples_per_second": 381.083, + "eval_steps_per_second": 47.635, + "step": 145 + }, + { + "epoch": 6.636363636363637, + "grad_norm": 0.051980625838041306, + "learning_rate": 2.4671052631578947e-05, + "loss": 0.0061, + "step": 146 + }, + { + "epoch": 6.636363636363637, + "eval_loss": 0.0037048642989248037, + "eval_runtime": 0.2327, + "eval_samples_per_second": 378.163, + "eval_steps_per_second": 47.27, + "step": 146 + }, + { + "epoch": 6.681818181818182, + "grad_norm": 0.054644446820020676, + "learning_rate": 2.463157894736842e-05, + "loss": 0.006, + "step": 147 + }, + { + "epoch": 6.681818181818182, + "eval_loss": 0.003674545791000128, + "eval_runtime": 0.2332, + "eval_samples_per_second": 377.322, + "eval_steps_per_second": 47.165, + "step": 147 + }, + { + "epoch": 6.7272727272727275, + "grad_norm": 0.04687352105975151, + "learning_rate": 2.45921052631579e-05, + "loss": 0.0057, + "step": 148 + }, + { + "epoch": 6.7272727272727275, + "eval_loss": 0.0036456272937357426, + "eval_runtime": 0.2302, + "eval_samples_per_second": 382.325, + "eval_steps_per_second": 47.791, + "step": 148 + }, + { + "epoch": 6.7727272727272725, + "grad_norm": 0.0500478520989418, + "learning_rate": 2.455263157894737e-05, + "loss": 0.0054, + "step": 149 + }, + { + "epoch": 6.7727272727272725, + "eval_loss": 0.003618737915530801, + "eval_runtime": 0.2281, + "eval_samples_per_second": 385.776, + "eval_steps_per_second": 48.222, + "step": 149 + }, + { + "epoch": 6.818181818181818, + "grad_norm": 0.05092916265130043, + "learning_rate": 2.4513157894736843e-05, + "loss": 0.0054, + "step": 150 + }, + { + "epoch": 6.818181818181818, + "eval_loss": 0.0035921267699450254, + "eval_runtime": 0.2298, + "eval_samples_per_second": 382.977, + "eval_steps_per_second": 47.872, + "step": 150 + }, + { + "epoch": 6.863636363636363, + "grad_norm": 0.05389472842216492, + "learning_rate": 2.4473684210526318e-05, + "loss": 0.0057, + "step": 151 + }, + { + "epoch": 6.863636363636363, + "eval_loss": 0.003567308420315385, + "eval_runtime": 0.2896, + "eval_samples_per_second": 303.912, + "eval_steps_per_second": 37.989, + "step": 151 + }, + { + "epoch": 6.909090909090909, + "grad_norm": 0.051427211612463, + "learning_rate": 2.4434210526315792e-05, + "loss": 0.0058, + "step": 152 + }, + { + "epoch": 6.909090909090909, + "eval_loss": 0.003539604600518942, + "eval_runtime": 0.2314, + "eval_samples_per_second": 380.243, + "eval_steps_per_second": 47.53, + "step": 152 + }, + { + "epoch": 6.954545454545455, + "grad_norm": 0.05391733720898628, + "learning_rate": 2.4394736842105262e-05, + "loss": 0.0058, + "step": 153 + }, + { + "epoch": 6.954545454545455, + "eval_loss": 0.0035100304521620274, + "eval_runtime": 0.2452, + "eval_samples_per_second": 358.914, + "eval_steps_per_second": 44.864, + "step": 153 + }, + { + "epoch": 7.0, + "grad_norm": 0.05612335354089737, + "learning_rate": 2.4355263157894737e-05, + "loss": 0.0056, + "step": 154 + }, + { + "epoch": 7.0, + "eval_loss": 0.0034810365177690983, + "eval_runtime": 0.2328, + "eval_samples_per_second": 378.038, + "eval_steps_per_second": 47.255, + "step": 154 + }, + { + "epoch": 7.045454545454546, + "grad_norm": 0.05799683555960655, + "learning_rate": 2.431578947368421e-05, + "loss": 0.0062, + "step": 155 + }, + { + "epoch": 7.045454545454546, + "eval_loss": 0.003452845150604844, + "eval_runtime": 0.2326, + "eval_samples_per_second": 378.28, + "eval_steps_per_second": 47.285, + "step": 155 + }, + { + "epoch": 7.090909090909091, + "grad_norm": 0.05095871537923813, + "learning_rate": 2.4276315789473685e-05, + "loss": 0.0051, + "step": 156 + }, + { + "epoch": 7.090909090909091, + "eval_loss": 0.003425983479246497, + "eval_runtime": 0.2387, + "eval_samples_per_second": 368.611, + "eval_steps_per_second": 46.076, + "step": 156 + }, + { + "epoch": 7.136363636363637, + "grad_norm": 0.05834353715181351, + "learning_rate": 2.4236842105263156e-05, + "loss": 0.0061, + "step": 157 + }, + { + "epoch": 7.136363636363637, + "eval_loss": 0.003400736255571246, + "eval_runtime": 0.233, + "eval_samples_per_second": 377.737, + "eval_steps_per_second": 47.217, + "step": 157 + }, + { + "epoch": 7.181818181818182, + "grad_norm": 0.05226532742381096, + "learning_rate": 2.4197368421052633e-05, + "loss": 0.006, + "step": 158 + }, + { + "epoch": 7.181818181818182, + "eval_loss": 0.003375994274392724, + "eval_runtime": 0.2371, + "eval_samples_per_second": 371.159, + "eval_steps_per_second": 46.395, + "step": 158 + }, + { + "epoch": 7.2272727272727275, + "grad_norm": 0.044102054089307785, + "learning_rate": 2.4157894736842104e-05, + "loss": 0.0051, + "step": 159 + }, + { + "epoch": 7.2272727272727275, + "eval_loss": 0.003351524705067277, + "eval_runtime": 0.2391, + "eval_samples_per_second": 368.077, + "eval_steps_per_second": 46.01, + "step": 159 + }, + { + "epoch": 7.2727272727272725, + "grad_norm": 0.050387196242809296, + "learning_rate": 2.4118421052631578e-05, + "loss": 0.0055, + "step": 160 + }, + { + "epoch": 7.2727272727272725, + "eval_loss": 0.003328080987557769, + "eval_runtime": 0.2367, + "eval_samples_per_second": 371.775, + "eval_steps_per_second": 46.472, + "step": 160 + }, + { + "epoch": 7.318181818181818, + "grad_norm": 0.05944162234663963, + "learning_rate": 2.4078947368421056e-05, + "loss": 0.0062, + "step": 161 + }, + { + "epoch": 7.318181818181818, + "eval_loss": 0.003306704806163907, + "eval_runtime": 0.2302, + "eval_samples_per_second": 382.244, + "eval_steps_per_second": 47.781, + "step": 161 + }, + { + "epoch": 7.363636363636363, + "grad_norm": 0.058280494064092636, + "learning_rate": 2.4039473684210526e-05, + "loss": 0.0055, + "step": 162 + }, + { + "epoch": 7.363636363636363, + "eval_loss": 0.0032875537872314453, + "eval_runtime": 0.2313, + "eval_samples_per_second": 380.46, + "eval_steps_per_second": 47.557, + "step": 162 + }, + { + "epoch": 7.409090909090909, + "grad_norm": 0.04580385982990265, + "learning_rate": 2.4e-05, + "loss": 0.0051, + "step": 163 + }, + { + "epoch": 7.409090909090909, + "eval_loss": 0.003268357366323471, + "eval_runtime": 0.2329, + "eval_samples_per_second": 377.841, + "eval_steps_per_second": 47.23, + "step": 163 + }, + { + "epoch": 7.454545454545454, + "grad_norm": 0.047211576253175735, + "learning_rate": 2.3960526315789475e-05, + "loss": 0.0049, + "step": 164 + }, + { + "epoch": 7.454545454545454, + "eval_loss": 0.003249667352065444, + "eval_runtime": 0.2288, + "eval_samples_per_second": 384.62, + "eval_steps_per_second": 48.077, + "step": 164 + }, + { + "epoch": 7.5, + "grad_norm": 0.04698212072253227, + "learning_rate": 2.392105263157895e-05, + "loss": 0.0051, + "step": 165 + }, + { + "epoch": 7.5, + "eval_loss": 0.003230377798900008, + "eval_runtime": 0.2336, + "eval_samples_per_second": 376.761, + "eval_steps_per_second": 47.095, + "step": 165 + }, + { + "epoch": 7.545454545454545, + "grad_norm": 0.049539972096681595, + "learning_rate": 2.388157894736842e-05, + "loss": 0.0053, + "step": 166 + }, + { + "epoch": 7.545454545454545, + "eval_loss": 0.003210590686649084, + "eval_runtime": 0.2308, + "eval_samples_per_second": 381.225, + "eval_steps_per_second": 47.653, + "step": 166 + }, + { + "epoch": 7.590909090909091, + "grad_norm": 0.06876406818628311, + "learning_rate": 2.3842105263157897e-05, + "loss": 0.0054, + "step": 167 + }, + { + "epoch": 7.590909090909091, + "eval_loss": 0.0031811357475817204, + "eval_runtime": 0.2314, + "eval_samples_per_second": 380.236, + "eval_steps_per_second": 47.53, + "step": 167 + }, + { + "epoch": 7.636363636363637, + "grad_norm": 0.03961968049407005, + "learning_rate": 2.3802631578947368e-05, + "loss": 0.0048, + "step": 168 + }, + { + "epoch": 7.636363636363637, + "eval_loss": 0.003153204219415784, + "eval_runtime": 0.2327, + "eval_samples_per_second": 378.242, + "eval_steps_per_second": 47.28, + "step": 168 + }, + { + "epoch": 7.681818181818182, + "grad_norm": 0.046262938529253006, + "learning_rate": 2.3763157894736842e-05, + "loss": 0.0054, + "step": 169 + }, + { + "epoch": 7.681818181818182, + "eval_loss": 0.0031256629154086113, + "eval_runtime": 0.2285, + "eval_samples_per_second": 385.163, + "eval_steps_per_second": 48.145, + "step": 169 + }, + { + "epoch": 7.7272727272727275, + "grad_norm": 0.04695883020758629, + "learning_rate": 2.3723684210526316e-05, + "loss": 0.0053, + "step": 170 + }, + { + "epoch": 7.7272727272727275, + "eval_loss": 0.00310018053278327, + "eval_runtime": 0.2345, + "eval_samples_per_second": 375.19, + "eval_steps_per_second": 46.899, + "step": 170 + }, + { + "epoch": 7.7727272727272725, + "grad_norm": 0.047219086438417435, + "learning_rate": 2.368421052631579e-05, + "loss": 0.0052, + "step": 171 + }, + { + "epoch": 7.7727272727272725, + "eval_loss": 0.003074278589338064, + "eval_runtime": 0.2331, + "eval_samples_per_second": 377.522, + "eval_steps_per_second": 47.19, + "step": 171 + }, + { + "epoch": 7.818181818181818, + "grad_norm": 0.05439964681863785, + "learning_rate": 2.364473684210526e-05, + "loss": 0.0055, + "step": 172 + }, + { + "epoch": 7.818181818181818, + "eval_loss": 0.003049066523090005, + "eval_runtime": 0.2239, + "eval_samples_per_second": 393.01, + "eval_steps_per_second": 49.126, + "step": 172 + }, + { + "epoch": 7.863636363636363, + "grad_norm": 0.041486483067274094, + "learning_rate": 2.360526315789474e-05, + "loss": 0.0047, + "step": 173 + }, + { + "epoch": 7.863636363636363, + "eval_loss": 0.0030262693762779236, + "eval_runtime": 0.2278, + "eval_samples_per_second": 386.243, + "eval_steps_per_second": 48.28, + "step": 173 + }, + { + "epoch": 7.909090909090909, + "grad_norm": 0.040691111236810684, + "learning_rate": 2.3565789473684213e-05, + "loss": 0.0046, + "step": 174 + }, + { + "epoch": 7.909090909090909, + "eval_loss": 0.0030057693365961313, + "eval_runtime": 0.2276, + "eval_samples_per_second": 386.567, + "eval_steps_per_second": 48.321, + "step": 174 + }, + { + "epoch": 7.954545454545455, + "grad_norm": 0.048391714692115784, + "learning_rate": 2.3526315789473684e-05, + "loss": 0.0055, + "step": 175 + }, + { + "epoch": 7.954545454545455, + "eval_loss": 0.0029874229803681374, + "eval_runtime": 0.2269, + "eval_samples_per_second": 387.906, + "eval_steps_per_second": 48.488, + "step": 175 + }, + { + "epoch": 8.0, + "grad_norm": 0.04458646848797798, + "learning_rate": 2.348684210526316e-05, + "loss": 0.005, + "step": 176 + }, + { + "epoch": 8.0, + "eval_loss": 0.0029713741969317198, + "eval_runtime": 0.2305, + "eval_samples_per_second": 381.854, + "eval_steps_per_second": 47.732, + "step": 176 + }, + { + "epoch": 8.045454545454545, + "grad_norm": 0.044490914791822433, + "learning_rate": 2.3447368421052632e-05, + "loss": 0.005, + "step": 177 + }, + { + "epoch": 8.045454545454545, + "eval_loss": 0.002958006225526333, + "eval_runtime": 0.2331, + "eval_samples_per_second": 377.519, + "eval_steps_per_second": 47.19, + "step": 177 + }, + { + "epoch": 8.090909090909092, + "grad_norm": 0.04664753004908562, + "learning_rate": 2.3407894736842106e-05, + "loss": 0.0053, + "step": 178 + }, + { + "epoch": 8.090909090909092, + "eval_loss": 0.0029434128664433956, + "eval_runtime": 0.2369, + "eval_samples_per_second": 371.478, + "eval_steps_per_second": 46.435, + "step": 178 + }, + { + "epoch": 8.136363636363637, + "grad_norm": 0.05114319175481796, + "learning_rate": 2.336842105263158e-05, + "loss": 0.0052, + "step": 179 + }, + { + "epoch": 8.136363636363637, + "eval_loss": 0.002928072353824973, + "eval_runtime": 0.2273, + "eval_samples_per_second": 387.111, + "eval_steps_per_second": 48.389, + "step": 179 + }, + { + "epoch": 8.181818181818182, + "grad_norm": 0.03715480864048004, + "learning_rate": 2.3328947368421054e-05, + "loss": 0.0044, + "step": 180 + }, + { + "epoch": 8.181818181818182, + "eval_loss": 0.002913246164098382, + "eval_runtime": 0.2291, + "eval_samples_per_second": 384.095, + "eval_steps_per_second": 48.012, + "step": 180 + }, + { + "epoch": 8.227272727272727, + "grad_norm": 0.03329971432685852, + "learning_rate": 2.3289473684210525e-05, + "loss": 0.0043, + "step": 181 + }, + { + "epoch": 8.227272727272727, + "eval_loss": 0.0028981559444218874, + "eval_runtime": 0.2387, + "eval_samples_per_second": 368.641, + "eval_steps_per_second": 46.08, + "step": 181 + }, + { + "epoch": 8.272727272727273, + "grad_norm": 0.036768488585948944, + "learning_rate": 2.3250000000000003e-05, + "loss": 0.0043, + "step": 182 + }, + { + "epoch": 8.272727272727273, + "eval_loss": 0.002883592387661338, + "eval_runtime": 0.2382, + "eval_samples_per_second": 369.423, + "eval_steps_per_second": 46.178, + "step": 182 + }, + { + "epoch": 8.318181818181818, + "grad_norm": 0.03704945370554924, + "learning_rate": 2.3210526315789473e-05, + "loss": 0.0042, + "step": 183 + }, + { + "epoch": 8.318181818181818, + "eval_loss": 0.0028684174176305532, + "eval_runtime": 0.2436, + "eval_samples_per_second": 361.18, + "eval_steps_per_second": 45.148, + "step": 183 + }, + { + "epoch": 8.363636363636363, + "grad_norm": 0.038721974939107895, + "learning_rate": 2.3171052631578948e-05, + "loss": 0.0045, + "step": 184 + }, + { + "epoch": 8.363636363636363, + "eval_loss": 0.002850217279046774, + "eval_runtime": 0.2397, + "eval_samples_per_second": 367.125, + "eval_steps_per_second": 45.891, + "step": 184 + }, + { + "epoch": 8.409090909090908, + "grad_norm": 0.0400218740105629, + "learning_rate": 2.3131578947368422e-05, + "loss": 0.0046, + "step": 185 + }, + { + "epoch": 8.409090909090908, + "eval_loss": 0.0028304813895374537, + "eval_runtime": 0.2401, + "eval_samples_per_second": 366.549, + "eval_steps_per_second": 45.819, + "step": 185 + }, + { + "epoch": 8.454545454545455, + "grad_norm": 0.04041934013366699, + "learning_rate": 2.3092105263157896e-05, + "loss": 0.0047, + "step": 186 + }, + { + "epoch": 8.454545454545455, + "eval_loss": 0.0028114623855799437, + "eval_runtime": 0.2373, + "eval_samples_per_second": 370.773, + "eval_steps_per_second": 46.347, + "step": 186 + }, + { + "epoch": 8.5, + "grad_norm": 0.03471284359693527, + "learning_rate": 2.3052631578947367e-05, + "loss": 0.0042, + "step": 187 + }, + { + "epoch": 8.5, + "eval_loss": 0.002793875988572836, + "eval_runtime": 0.2482, + "eval_samples_per_second": 354.499, + "eval_steps_per_second": 44.312, + "step": 187 + }, + { + "epoch": 8.545454545454545, + "grad_norm": 0.044632624834775925, + "learning_rate": 2.3013157894736844e-05, + "loss": 0.0048, + "step": 188 + }, + { + "epoch": 8.545454545454545, + "eval_loss": 0.0027756269555538893, + "eval_runtime": 0.2261, + "eval_samples_per_second": 389.244, + "eval_steps_per_second": 48.656, + "step": 188 + }, + { + "epoch": 8.590909090909092, + "grad_norm": 0.039824243634939194, + "learning_rate": 2.297368421052632e-05, + "loss": 0.0044, + "step": 189 + }, + { + "epoch": 8.590909090909092, + "eval_loss": 0.00275724777020514, + "eval_runtime": 0.2454, + "eval_samples_per_second": 358.66, + "eval_steps_per_second": 44.832, + "step": 189 + }, + { + "epoch": 8.636363636363637, + "grad_norm": 0.03765185549855232, + "learning_rate": 2.293421052631579e-05, + "loss": 0.0046, + "step": 190 + }, + { + "epoch": 8.636363636363637, + "eval_loss": 0.002737644361332059, + "eval_runtime": 0.2301, + "eval_samples_per_second": 382.383, + "eval_steps_per_second": 47.798, + "step": 190 + }, + { + "epoch": 8.681818181818182, + "grad_norm": 0.04460470378398895, + "learning_rate": 2.2894736842105263e-05, + "loss": 0.0049, + "step": 191 + }, + { + "epoch": 8.681818181818182, + "eval_loss": 0.002716499613597989, + "eval_runtime": 0.2404, + "eval_samples_per_second": 366.123, + "eval_steps_per_second": 45.765, + "step": 191 + }, + { + "epoch": 8.727272727272727, + "grad_norm": 0.04597329720854759, + "learning_rate": 2.2855263157894737e-05, + "loss": 0.0046, + "step": 192 + }, + { + "epoch": 8.727272727272727, + "eval_loss": 0.002695793053135276, + "eval_runtime": 0.2287, + "eval_samples_per_second": 384.748, + "eval_steps_per_second": 48.093, + "step": 192 + }, + { + "epoch": 8.772727272727273, + "grad_norm": 0.04175286740064621, + "learning_rate": 2.281578947368421e-05, + "loss": 0.0048, + "step": 193 + }, + { + "epoch": 8.772727272727273, + "eval_loss": 0.0026768320240080357, + "eval_runtime": 0.2297, + "eval_samples_per_second": 383.191, + "eval_steps_per_second": 47.899, + "step": 193 + }, + { + "epoch": 8.818181818181818, + "grad_norm": 0.03605563938617706, + "learning_rate": 2.2776315789473682e-05, + "loss": 0.0042, + "step": 194 + }, + { + "epoch": 8.818181818181818, + "eval_loss": 0.0026587164029479027, + "eval_runtime": 0.2319, + "eval_samples_per_second": 379.432, + "eval_steps_per_second": 47.429, + "step": 194 + }, + { + "epoch": 8.863636363636363, + "grad_norm": 0.03600858151912689, + "learning_rate": 2.273684210526316e-05, + "loss": 0.004, + "step": 195 + }, + { + "epoch": 8.863636363636363, + "eval_loss": 0.0026421842630952597, + "eval_runtime": 0.2375, + "eval_samples_per_second": 370.592, + "eval_steps_per_second": 46.324, + "step": 195 + }, + { + "epoch": 8.909090909090908, + "grad_norm": 0.04040640592575073, + "learning_rate": 2.269736842105263e-05, + "loss": 0.0046, + "step": 196 + }, + { + "epoch": 8.909090909090908, + "eval_loss": 0.002626256085932255, + "eval_runtime": 0.5446, + "eval_samples_per_second": 161.597, + "eval_steps_per_second": 20.2, + "step": 196 + }, + { + "epoch": 8.954545454545455, + "grad_norm": 0.04418746754527092, + "learning_rate": 2.2657894736842105e-05, + "loss": 0.0042, + "step": 197 + }, + { + "epoch": 8.954545454545455, + "eval_loss": 0.002609600778669119, + "eval_runtime": 0.233, + "eval_samples_per_second": 377.684, + "eval_steps_per_second": 47.211, + "step": 197 + }, + { + "epoch": 9.0, + "grad_norm": 0.04399528354406357, + "learning_rate": 2.261842105263158e-05, + "loss": 0.0044, + "step": 198 + }, + { + "epoch": 9.0, + "eval_loss": 0.0025943187065422535, + "eval_runtime": 0.3847, + "eval_samples_per_second": 228.728, + "eval_steps_per_second": 28.591, + "step": 198 + }, + { + "epoch": 9.045454545454545, + "grad_norm": 0.04438379034399986, + "learning_rate": 2.2578947368421053e-05, + "loss": 0.0045, + "step": 199 + }, + { + "epoch": 9.045454545454545, + "eval_loss": 0.0025796808768063784, + "eval_runtime": 0.4906, + "eval_samples_per_second": 179.357, + "eval_steps_per_second": 22.42, + "step": 199 + }, + { + "epoch": 9.090909090909092, + "grad_norm": 0.03908194229006767, + "learning_rate": 2.2539473684210524e-05, + "loss": 0.0045, + "step": 200 + }, + { + "epoch": 9.090909090909092, + "eval_loss": 0.002565391594544053, + "eval_runtime": 0.2293, + "eval_samples_per_second": 383.783, + "eval_steps_per_second": 47.973, + "step": 200 + }, + { + "epoch": 9.136363636363637, + "grad_norm": 0.03590917959809303, + "learning_rate": 2.25e-05, + "loss": 0.0044, + "step": 201 + }, + { + "epoch": 9.136363636363637, + "eval_loss": 0.0025517421308904886, + "eval_runtime": 0.2317, + "eval_samples_per_second": 379.855, + "eval_steps_per_second": 47.482, + "step": 201 + }, + { + "epoch": 9.181818181818182, + "grad_norm": 0.0374373197555542, + "learning_rate": 2.2460526315789476e-05, + "loss": 0.0039, + "step": 202 + }, + { + "epoch": 9.181818181818182, + "eval_loss": 0.0025385154876857996, + "eval_runtime": 0.232, + "eval_samples_per_second": 379.247, + "eval_steps_per_second": 47.406, + "step": 202 + }, + { + "epoch": 9.227272727272727, + "grad_norm": 0.03761666640639305, + "learning_rate": 2.2421052631578946e-05, + "loss": 0.004, + "step": 203 + }, + { + "epoch": 9.227272727272727, + "eval_loss": 0.0025266585871577263, + "eval_runtime": 0.2386, + "eval_samples_per_second": 368.745, + "eval_steps_per_second": 46.093, + "step": 203 + }, + { + "epoch": 9.272727272727273, + "grad_norm": 0.033979009836912155, + "learning_rate": 2.2381578947368424e-05, + "loss": 0.004, + "step": 204 + }, + { + "epoch": 9.272727272727273, + "eval_loss": 0.0025138070341199636, + "eval_runtime": 0.2314, + "eval_samples_per_second": 380.28, + "eval_steps_per_second": 47.535, + "step": 204 + }, + { + "epoch": 9.318181818181818, + "grad_norm": 0.054837603121995926, + "learning_rate": 2.2342105263157895e-05, + "loss": 0.0042, + "step": 205 + }, + { + "epoch": 9.318181818181818, + "eval_loss": 0.002499848371371627, + "eval_runtime": 0.227, + "eval_samples_per_second": 387.733, + "eval_steps_per_second": 48.467, + "step": 205 + }, + { + "epoch": 9.363636363636363, + "grad_norm": 0.03884384036064148, + "learning_rate": 2.230263157894737e-05, + "loss": 0.0043, + "step": 206 + }, + { + "epoch": 9.363636363636363, + "eval_loss": 0.0024857125245034695, + "eval_runtime": 0.2294, + "eval_samples_per_second": 383.548, + "eval_steps_per_second": 47.944, + "step": 206 + }, + { + "epoch": 9.409090909090908, + "grad_norm": 0.03517827019095421, + "learning_rate": 2.2263157894736843e-05, + "loss": 0.004, + "step": 207 + }, + { + "epoch": 9.409090909090908, + "eval_loss": 0.00247101578861475, + "eval_runtime": 0.2336, + "eval_samples_per_second": 376.726, + "eval_steps_per_second": 47.091, + "step": 207 + }, + { + "epoch": 9.454545454545455, + "grad_norm": 0.04209022969007492, + "learning_rate": 2.2223684210526317e-05, + "loss": 0.0041, + "step": 208 + }, + { + "epoch": 9.454545454545455, + "eval_loss": 0.0024564675986766815, + "eval_runtime": 0.242, + "eval_samples_per_second": 363.631, + "eval_steps_per_second": 45.454, + "step": 208 + }, + { + "epoch": 9.5, + "grad_norm": 0.04031739383935928, + "learning_rate": 2.2184210526315788e-05, + "loss": 0.0042, + "step": 209 + }, + { + "epoch": 9.5, + "eval_loss": 0.002442182507365942, + "eval_runtime": 0.2384, + "eval_samples_per_second": 369.056, + "eval_steps_per_second": 46.132, + "step": 209 + }, + { + "epoch": 9.545454545454545, + "grad_norm": 0.03341998904943466, + "learning_rate": 2.2144736842105265e-05, + "loss": 0.0038, + "step": 210 + }, + { + "epoch": 9.545454545454545, + "eval_loss": 0.0024283959064632654, + "eval_runtime": 0.2386, + "eval_samples_per_second": 368.766, + "eval_steps_per_second": 46.096, + "step": 210 + }, + { + "epoch": 9.590909090909092, + "grad_norm": 0.033409975469112396, + "learning_rate": 2.2105263157894736e-05, + "loss": 0.0037, + "step": 211 + }, + { + "epoch": 9.590909090909092, + "eval_loss": 0.002414784161373973, + "eval_runtime": 0.2392, + "eval_samples_per_second": 367.843, + "eval_steps_per_second": 45.98, + "step": 211 + }, + { + "epoch": 9.636363636363637, + "grad_norm": 0.038544610142707825, + "learning_rate": 2.206578947368421e-05, + "loss": 0.0042, + "step": 212 + }, + { + "epoch": 9.636363636363637, + "eval_loss": 0.0024007910396903753, + "eval_runtime": 0.2355, + "eval_samples_per_second": 373.655, + "eval_steps_per_second": 46.707, + "step": 212 + }, + { + "epoch": 9.681818181818182, + "grad_norm": 0.031284794211387634, + "learning_rate": 2.2026315789473684e-05, + "loss": 0.0039, + "step": 213 + }, + { + "epoch": 9.681818181818182, + "eval_loss": 0.00238687708042562, + "eval_runtime": 0.2461, + "eval_samples_per_second": 357.651, + "eval_steps_per_second": 44.706, + "step": 213 + }, + { + "epoch": 9.727272727272727, + "grad_norm": 0.03589053079485893, + "learning_rate": 2.198684210526316e-05, + "loss": 0.004, + "step": 214 + }, + { + "epoch": 9.727272727272727, + "eval_loss": 0.002372899791225791, + "eval_runtime": 0.2388, + "eval_samples_per_second": 368.519, + "eval_steps_per_second": 46.065, + "step": 214 + }, + { + "epoch": 9.772727272727273, + "grad_norm": 0.03422442823648453, + "learning_rate": 2.1947368421052633e-05, + "loss": 0.0037, + "step": 215 + }, + { + "epoch": 9.772727272727273, + "eval_loss": 0.0023599357809871435, + "eval_runtime": 0.2324, + "eval_samples_per_second": 378.632, + "eval_steps_per_second": 47.329, + "step": 215 + }, + { + "epoch": 9.818181818181818, + "grad_norm": 0.03365776687860489, + "learning_rate": 2.1907894736842107e-05, + "loss": 0.0035, + "step": 216 + }, + { + "epoch": 9.818181818181818, + "eval_loss": 0.0023472688626497984, + "eval_runtime": 0.231, + "eval_samples_per_second": 380.916, + "eval_steps_per_second": 47.614, + "step": 216 + }, + { + "epoch": 9.863636363636363, + "grad_norm": 0.030327491462230682, + "learning_rate": 2.186842105263158e-05, + "loss": 0.0037, + "step": 217 + }, + { + "epoch": 9.863636363636363, + "eval_loss": 0.0023344962392002344, + "eval_runtime": 0.229, + "eval_samples_per_second": 384.224, + "eval_steps_per_second": 48.028, + "step": 217 + }, + { + "epoch": 9.909090909090908, + "grad_norm": 0.039349548518657684, + "learning_rate": 2.1828947368421052e-05, + "loss": 0.004, + "step": 218 + }, + { + "epoch": 9.909090909090908, + "eval_loss": 0.0023220828734338284, + "eval_runtime": 0.228, + "eval_samples_per_second": 385.959, + "eval_steps_per_second": 48.245, + "step": 218 + }, + { + "epoch": 9.954545454545455, + "grad_norm": 0.03199224919080734, + "learning_rate": 2.178947368421053e-05, + "loss": 0.0034, + "step": 219 + }, + { + "epoch": 9.954545454545455, + "eval_loss": 0.0023102990817278624, + "eval_runtime": 0.2311, + "eval_samples_per_second": 380.788, + "eval_steps_per_second": 47.598, + "step": 219 + }, + { + "epoch": 10.0, + "grad_norm": 0.03278977796435356, + "learning_rate": 2.175e-05, + "loss": 0.0036, + "step": 220 + }, + { + "epoch": 10.0, + "eval_loss": 0.002298795385286212, + "eval_runtime": 0.3275, + "eval_samples_per_second": 268.678, + "eval_steps_per_second": 33.585, + "step": 220 + }, + { + "epoch": 10.045454545454545, + "grad_norm": 0.0341983363032341, + "learning_rate": 2.1710526315789474e-05, + "loss": 0.0039, + "step": 221 + }, + { + "epoch": 10.045454545454545, + "eval_loss": 0.0022870004177093506, + "eval_runtime": 0.3861, + "eval_samples_per_second": 227.931, + "eval_steps_per_second": 28.491, + "step": 221 + }, + { + "epoch": 10.090909090909092, + "grad_norm": 0.03134067356586456, + "learning_rate": 2.167105263157895e-05, + "loss": 0.0038, + "step": 222 + }, + { + "epoch": 10.090909090909092, + "eval_loss": 0.002274780999869108, + "eval_runtime": 0.2973, + "eval_samples_per_second": 296.022, + "eval_steps_per_second": 37.003, + "step": 222 + }, + { + "epoch": 10.136363636363637, + "grad_norm": 0.03246266394853592, + "learning_rate": 2.1631578947368423e-05, + "loss": 0.0035, + "step": 223 + }, + { + "epoch": 10.136363636363637, + "eval_loss": 0.002262603724375367, + "eval_runtime": 0.2788, + "eval_samples_per_second": 315.607, + "eval_steps_per_second": 39.451, + "step": 223 + }, + { + "epoch": 10.181818181818182, + "grad_norm": 0.035311244428157806, + "learning_rate": 2.1592105263157893e-05, + "loss": 0.0036, + "step": 224 + }, + { + "epoch": 10.181818181818182, + "eval_loss": 0.002250505844131112, + "eval_runtime": 0.3259, + "eval_samples_per_second": 270.042, + "eval_steps_per_second": 33.755, + "step": 224 + }, + { + "epoch": 10.227272727272727, + "grad_norm": 0.03288138657808304, + "learning_rate": 2.155263157894737e-05, + "loss": 0.0039, + "step": 225 + }, + { + "epoch": 10.227272727272727, + "eval_loss": 0.0022388615179806948, + "eval_runtime": 0.3627, + "eval_samples_per_second": 242.648, + "eval_steps_per_second": 30.331, + "step": 225 + }, + { + "epoch": 10.272727272727273, + "grad_norm": 0.032804686576128006, + "learning_rate": 2.151315789473684e-05, + "loss": 0.0038, + "step": 226 + }, + { + "epoch": 10.272727272727273, + "eval_loss": 0.0022277701646089554, + "eval_runtime": 0.4861, + "eval_samples_per_second": 181.023, + "eval_steps_per_second": 22.628, + "step": 226 + }, + { + "epoch": 10.318181818181818, + "grad_norm": 0.036528490483760834, + "learning_rate": 2.1473684210526316e-05, + "loss": 0.004, + "step": 227 + }, + { + "epoch": 10.318181818181818, + "eval_loss": 0.0022167994175106287, + "eval_runtime": 0.3048, + "eval_samples_per_second": 288.714, + "eval_steps_per_second": 36.089, + "step": 227 + }, + { + "epoch": 10.363636363636363, + "grad_norm": 0.029931485652923584, + "learning_rate": 2.143421052631579e-05, + "loss": 0.0036, + "step": 228 + }, + { + "epoch": 10.363636363636363, + "eval_loss": 0.002205794909968972, + "eval_runtime": 0.2918, + "eval_samples_per_second": 301.612, + "eval_steps_per_second": 37.701, + "step": 228 + }, + { + "epoch": 10.409090909090908, + "grad_norm": 0.03588961437344551, + "learning_rate": 2.1394736842105264e-05, + "loss": 0.0039, + "step": 229 + }, + { + "epoch": 10.409090909090908, + "eval_loss": 0.0021950947120785713, + "eval_runtime": 0.2407, + "eval_samples_per_second": 365.554, + "eval_steps_per_second": 45.694, + "step": 229 + }, + { + "epoch": 10.454545454545455, + "grad_norm": 0.033503517508506775, + "learning_rate": 2.1355263157894738e-05, + "loss": 0.0036, + "step": 230 + }, + { + "epoch": 10.454545454545455, + "eval_loss": 0.0021843963768333197, + "eval_runtime": 0.2737, + "eval_samples_per_second": 321.531, + "eval_steps_per_second": 40.191, + "step": 230 + }, + { + "epoch": 10.5, + "grad_norm": 0.032428622245788574, + "learning_rate": 2.1315789473684212e-05, + "loss": 0.0035, + "step": 231 + }, + { + "epoch": 10.5, + "eval_loss": 0.002173727611079812, + "eval_runtime": 0.4053, + "eval_samples_per_second": 217.137, + "eval_steps_per_second": 27.142, + "step": 231 + }, + { + "epoch": 10.545454545454545, + "grad_norm": 0.0326942577958107, + "learning_rate": 2.1276315789473687e-05, + "loss": 0.0035, + "step": 232 + }, + { + "epoch": 10.545454545454545, + "eval_loss": 0.0021637016907334328, + "eval_runtime": 0.7117, + "eval_samples_per_second": 123.656, + "eval_steps_per_second": 15.457, + "step": 232 + }, + { + "epoch": 10.590909090909092, + "grad_norm": 0.03240852802991867, + "learning_rate": 2.1236842105263157e-05, + "loss": 0.0034, + "step": 233 + }, + { + "epoch": 10.590909090909092, + "eval_loss": 0.002153951907530427, + "eval_runtime": 0.2454, + "eval_samples_per_second": 358.581, + "eval_steps_per_second": 44.823, + "step": 233 + }, + { + "epoch": 10.636363636363637, + "grad_norm": 0.029470907524228096, + "learning_rate": 2.119736842105263e-05, + "loss": 0.0035, + "step": 234 + }, + { + "epoch": 10.636363636363637, + "eval_loss": 0.002144550671800971, + "eval_runtime": 0.2443, + "eval_samples_per_second": 360.165, + "eval_steps_per_second": 45.021, + "step": 234 + }, + { + "epoch": 10.681818181818182, + "grad_norm": 0.02820722572505474, + "learning_rate": 2.1157894736842106e-05, + "loss": 0.0034, + "step": 235 + }, + { + "epoch": 10.681818181818182, + "eval_loss": 0.002135734772309661, + "eval_runtime": 0.2643, + "eval_samples_per_second": 333.008, + "eval_steps_per_second": 41.626, + "step": 235 + }, + { + "epoch": 10.727272727272727, + "grad_norm": 0.02772766724228859, + "learning_rate": 2.111842105263158e-05, + "loss": 0.0033, + "step": 236 + }, + { + "epoch": 10.727272727272727, + "eval_loss": 0.0021269202698022127, + "eval_runtime": 0.2751, + "eval_samples_per_second": 319.882, + "eval_steps_per_second": 39.985, + "step": 236 + }, + { + "epoch": 10.772727272727273, + "grad_norm": 0.03653711825609207, + "learning_rate": 2.107894736842105e-05, + "loss": 0.0038, + "step": 237 + }, + { + "epoch": 10.772727272727273, + "eval_loss": 0.0021178810857236385, + "eval_runtime": 0.227, + "eval_samples_per_second": 387.716, + "eval_steps_per_second": 48.465, + "step": 237 + }, + { + "epoch": 10.818181818181818, + "grad_norm": 0.03011268563568592, + "learning_rate": 2.1039473684210528e-05, + "loss": 0.0035, + "step": 238 + }, + { + "epoch": 10.818181818181818, + "eval_loss": 0.002109181135892868, + "eval_runtime": 0.2398, + "eval_samples_per_second": 366.897, + "eval_steps_per_second": 45.862, + "step": 238 + }, + { + "epoch": 10.863636363636363, + "grad_norm": 0.025909798219799995, + "learning_rate": 2.1e-05, + "loss": 0.003, + "step": 239 + }, + { + "epoch": 10.863636363636363, + "eval_loss": 0.0021006783936172724, + "eval_runtime": 0.2342, + "eval_samples_per_second": 375.674, + "eval_steps_per_second": 46.959, + "step": 239 + }, + { + "epoch": 10.909090909090908, + "grad_norm": 0.02720109187066555, + "learning_rate": 2.0960526315789473e-05, + "loss": 0.0033, + "step": 240 + }, + { + "epoch": 10.909090909090908, + "eval_loss": 0.002092132344841957, + "eval_runtime": 0.2362, + "eval_samples_per_second": 372.632, + "eval_steps_per_second": 46.579, + "step": 240 + }, + { + "epoch": 10.954545454545455, + "grad_norm": 0.03358568996191025, + "learning_rate": 2.0921052631578947e-05, + "loss": 0.0034, + "step": 241 + }, + { + "epoch": 10.954545454545455, + "eval_loss": 0.0020830982830375433, + "eval_runtime": 0.2268, + "eval_samples_per_second": 387.964, + "eval_steps_per_second": 48.496, + "step": 241 + }, + { + "epoch": 11.0, + "grad_norm": 0.030720144510269165, + "learning_rate": 2.088157894736842e-05, + "loss": 0.0036, + "step": 242 + }, + { + "epoch": 11.0, + "eval_loss": 0.002074107527732849, + "eval_runtime": 0.2253, + "eval_samples_per_second": 390.639, + "eval_steps_per_second": 48.83, + "step": 242 + }, + { + "epoch": 11.045454545454545, + "grad_norm": 0.029408905655145645, + "learning_rate": 2.0842105263157895e-05, + "loss": 0.0035, + "step": 243 + }, + { + "epoch": 11.045454545454545, + "eval_loss": 0.0020653316751122475, + "eval_runtime": 0.234, + "eval_samples_per_second": 376.079, + "eval_steps_per_second": 47.01, + "step": 243 + }, + { + "epoch": 11.090909090909092, + "grad_norm": 0.02971459925174713, + "learning_rate": 2.080263157894737e-05, + "loss": 0.0034, + "step": 244 + }, + { + "epoch": 11.090909090909092, + "eval_loss": 0.0020563837606459856, + "eval_runtime": 0.2306, + "eval_samples_per_second": 381.673, + "eval_steps_per_second": 47.709, + "step": 244 + }, + { + "epoch": 11.136363636363637, + "grad_norm": 0.028164513409137726, + "learning_rate": 2.0763157894736844e-05, + "loss": 0.0034, + "step": 245 + }, + { + "epoch": 11.136363636363637, + "eval_loss": 0.0020477415528148413, + "eval_runtime": 0.2363, + "eval_samples_per_second": 372.455, + "eval_steps_per_second": 46.557, + "step": 245 + }, + { + "epoch": 11.181818181818182, + "grad_norm": 0.027845608070492744, + "learning_rate": 2.0723684210526315e-05, + "loss": 0.0034, + "step": 246 + }, + { + "epoch": 11.181818181818182, + "eval_loss": 0.002039202954620123, + "eval_runtime": 0.2314, + "eval_samples_per_second": 380.293, + "eval_steps_per_second": 47.537, + "step": 246 + }, + { + "epoch": 11.227272727272727, + "grad_norm": 0.03046409972012043, + "learning_rate": 2.0684210526315792e-05, + "loss": 0.0035, + "step": 247 + }, + { + "epoch": 11.227272727272727, + "eval_loss": 0.0020310634281486273, + "eval_runtime": 0.2258, + "eval_samples_per_second": 389.786, + "eval_steps_per_second": 48.723, + "step": 247 + }, + { + "epoch": 11.272727272727273, + "grad_norm": 0.025676798075437546, + "learning_rate": 2.0644736842105263e-05, + "loss": 0.0031, + "step": 248 + }, + { + "epoch": 11.272727272727273, + "eval_loss": 0.0020227304194122553, + "eval_runtime": 0.2266, + "eval_samples_per_second": 388.395, + "eval_steps_per_second": 48.549, + "step": 248 + }, + { + "epoch": 11.318181818181818, + "grad_norm": 0.029285188764333725, + "learning_rate": 2.0605263157894737e-05, + "loss": 0.0036, + "step": 249 + }, + { + "epoch": 11.318181818181818, + "eval_loss": 0.0020139189437031746, + "eval_runtime": 0.2399, + "eval_samples_per_second": 366.874, + "eval_steps_per_second": 45.859, + "step": 249 + }, + { + "epoch": 11.363636363636363, + "grad_norm": 0.03067379631102085, + "learning_rate": 2.056578947368421e-05, + "loss": 0.0033, + "step": 250 + }, + { + "epoch": 11.363636363636363, + "eval_loss": 0.0020049491431564093, + "eval_runtime": 0.2296, + "eval_samples_per_second": 383.216, + "eval_steps_per_second": 47.902, + "step": 250 + }, + { + "epoch": 11.409090909090908, + "grad_norm": 0.030429691076278687, + "learning_rate": 2.0526315789473685e-05, + "loss": 0.0034, + "step": 251 + }, + { + "epoch": 11.409090909090908, + "eval_loss": 0.0019955127499997616, + "eval_runtime": 0.3825, + "eval_samples_per_second": 230.047, + "eval_steps_per_second": 28.756, + "step": 251 + }, + { + "epoch": 11.454545454545455, + "grad_norm": 0.03006516583263874, + "learning_rate": 2.0486842105263156e-05, + "loss": 0.0032, + "step": 252 + }, + { + "epoch": 11.454545454545455, + "eval_loss": 0.001985815353691578, + "eval_runtime": 0.5232, + "eval_samples_per_second": 168.209, + "eval_steps_per_second": 21.026, + "step": 252 + }, + { + "epoch": 11.5, + "grad_norm": 0.03021743707358837, + "learning_rate": 2.0447368421052634e-05, + "loss": 0.0035, + "step": 253 + }, + { + "epoch": 11.5, + "eval_loss": 0.001975873252376914, + "eval_runtime": 0.5816, + "eval_samples_per_second": 151.301, + "eval_steps_per_second": 18.913, + "step": 253 + }, + { + "epoch": 11.545454545454545, + "grad_norm": 0.026514986529946327, + "learning_rate": 2.0407894736842104e-05, + "loss": 0.0032, + "step": 254 + }, + { + "epoch": 11.545454545454545, + "eval_loss": 0.0019660864491015673, + "eval_runtime": 0.2403, + "eval_samples_per_second": 366.164, + "eval_steps_per_second": 45.77, + "step": 254 + }, + { + "epoch": 11.590909090909092, + "grad_norm": 0.028690319508314133, + "learning_rate": 2.036842105263158e-05, + "loss": 0.0033, + "step": 255 + }, + { + "epoch": 11.590909090909092, + "eval_loss": 0.0019563750829547644, + "eval_runtime": 0.2248, + "eval_samples_per_second": 391.417, + "eval_steps_per_second": 48.927, + "step": 255 + }, + { + "epoch": 11.636363636363637, + "grad_norm": 0.03033028170466423, + "learning_rate": 2.0328947368421056e-05, + "loss": 0.0034, + "step": 256 + }, + { + "epoch": 11.636363636363637, + "eval_loss": 0.0019468939863145351, + "eval_runtime": 0.2311, + "eval_samples_per_second": 380.835, + "eval_steps_per_second": 47.604, + "step": 256 + }, + { + "epoch": 11.681818181818182, + "grad_norm": 0.03320786729454994, + "learning_rate": 2.0289473684210527e-05, + "loss": 0.0035, + "step": 257 + }, + { + "epoch": 11.681818181818182, + "eval_loss": 0.0019374135881662369, + "eval_runtime": 0.2307, + "eval_samples_per_second": 381.512, + "eval_steps_per_second": 47.689, + "step": 257 + }, + { + "epoch": 11.727272727272727, + "grad_norm": 0.027468524873256683, + "learning_rate": 2.025e-05, + "loss": 0.0031, + "step": 258 + }, + { + "epoch": 11.727272727272727, + "eval_loss": 0.0019284605514258146, + "eval_runtime": 0.2303, + "eval_samples_per_second": 382.049, + "eval_steps_per_second": 47.756, + "step": 258 + }, + { + "epoch": 11.772727272727273, + "grad_norm": 0.02426382340490818, + "learning_rate": 2.0210526315789475e-05, + "loss": 0.0029, + "step": 259 + }, + { + "epoch": 11.772727272727273, + "eval_loss": 0.0019197481451556087, + "eval_runtime": 0.23, + "eval_samples_per_second": 382.529, + "eval_steps_per_second": 47.816, + "step": 259 + }, + { + "epoch": 11.818181818181818, + "grad_norm": 0.028253108263015747, + "learning_rate": 2.017105263157895e-05, + "loss": 0.003, + "step": 260 + }, + { + "epoch": 11.818181818181818, + "eval_loss": 0.0019117832416668534, + "eval_runtime": 0.2345, + "eval_samples_per_second": 375.238, + "eval_steps_per_second": 46.905, + "step": 260 + }, + { + "epoch": 11.863636363636363, + "grad_norm": 0.03305625915527344, + "learning_rate": 2.013157894736842e-05, + "loss": 0.0034, + "step": 261 + }, + { + "epoch": 11.863636363636363, + "eval_loss": 0.0019041887717321515, + "eval_runtime": 0.2239, + "eval_samples_per_second": 393.025, + "eval_steps_per_second": 49.128, + "step": 261 + }, + { + "epoch": 11.909090909090908, + "grad_norm": 0.027725212275981903, + "learning_rate": 2.0092105263157898e-05, + "loss": 0.0033, + "step": 262 + }, + { + "epoch": 11.909090909090908, + "eval_loss": 0.0018966187490150332, + "eval_runtime": 0.2303, + "eval_samples_per_second": 382.148, + "eval_steps_per_second": 47.769, + "step": 262 + }, + { + "epoch": 11.954545454545455, + "grad_norm": 0.02550244890153408, + "learning_rate": 2.0052631578947368e-05, + "loss": 0.0032, + "step": 263 + }, + { + "epoch": 11.954545454545455, + "eval_loss": 0.0018891972722485662, + "eval_runtime": 0.2274, + "eval_samples_per_second": 386.939, + "eval_steps_per_second": 48.367, + "step": 263 + }, + { + "epoch": 12.0, + "grad_norm": 0.02780972793698311, + "learning_rate": 2.0013157894736842e-05, + "loss": 0.0034, + "step": 264 + }, + { + "epoch": 12.0, + "eval_loss": 0.001881771837361157, + "eval_runtime": 0.2332, + "eval_samples_per_second": 377.388, + "eval_steps_per_second": 47.174, + "step": 264 + }, + { + "epoch": 12.045454545454545, + "grad_norm": 0.03385490924119949, + "learning_rate": 1.9973684210526317e-05, + "loss": 0.0034, + "step": 265 + }, + { + "epoch": 12.045454545454545, + "eval_loss": 0.001874623354524374, + "eval_runtime": 0.2413, + "eval_samples_per_second": 364.627, + "eval_steps_per_second": 45.578, + "step": 265 + }, + { + "epoch": 12.090909090909092, + "grad_norm": 0.029128815978765488, + "learning_rate": 1.993421052631579e-05, + "loss": 0.003, + "step": 266 + }, + { + "epoch": 12.090909090909092, + "eval_loss": 0.0018677938496693969, + "eval_runtime": 0.235, + "eval_samples_per_second": 374.427, + "eval_steps_per_second": 46.803, + "step": 266 + }, + { + "epoch": 12.136363636363637, + "grad_norm": 0.025781184434890747, + "learning_rate": 1.989473684210526e-05, + "loss": 0.0031, + "step": 267 + }, + { + "epoch": 12.136363636363637, + "eval_loss": 0.001861188909970224, + "eval_runtime": 0.2382, + "eval_samples_per_second": 369.363, + "eval_steps_per_second": 46.17, + "step": 267 + }, + { + "epoch": 12.181818181818182, + "grad_norm": 0.0294223353266716, + "learning_rate": 1.985526315789474e-05, + "loss": 0.0033, + "step": 268 + }, + { + "epoch": 12.181818181818182, + "eval_loss": 0.001854045782238245, + "eval_runtime": 0.2289, + "eval_samples_per_second": 384.52, + "eval_steps_per_second": 48.065, + "step": 268 + }, + { + "epoch": 12.227272727272727, + "grad_norm": 0.028326552361249924, + "learning_rate": 1.9815789473684213e-05, + "loss": 0.003, + "step": 269 + }, + { + "epoch": 12.227272727272727, + "eval_loss": 0.0018470593495294452, + "eval_runtime": 0.2289, + "eval_samples_per_second": 384.399, + "eval_steps_per_second": 48.05, + "step": 269 + }, + { + "epoch": 12.272727272727273, + "grad_norm": 0.030360590666532516, + "learning_rate": 1.9776315789473684e-05, + "loss": 0.0031, + "step": 270 + }, + { + "epoch": 12.272727272727273, + "eval_loss": 0.0018398199463263154, + "eval_runtime": 0.2311, + "eval_samples_per_second": 380.8, + "eval_steps_per_second": 47.6, + "step": 270 + }, + { + "epoch": 12.318181818181818, + "grad_norm": 0.02833518758416176, + "learning_rate": 1.9736842105263158e-05, + "loss": 0.0034, + "step": 271 + }, + { + "epoch": 12.318181818181818, + "eval_loss": 0.0018325141863897443, + "eval_runtime": 0.233, + "eval_samples_per_second": 377.758, + "eval_steps_per_second": 47.22, + "step": 271 + }, + { + "epoch": 12.363636363636363, + "grad_norm": 0.029960816726088524, + "learning_rate": 1.9697368421052632e-05, + "loss": 0.0032, + "step": 272 + }, + { + "epoch": 12.363636363636363, + "eval_loss": 0.0018252148292958736, + "eval_runtime": 0.231, + "eval_samples_per_second": 381.016, + "eval_steps_per_second": 47.627, + "step": 272 + }, + { + "epoch": 12.409090909090908, + "grad_norm": 0.027226990088820457, + "learning_rate": 1.9657894736842106e-05, + "loss": 0.0029, + "step": 273 + }, + { + "epoch": 12.409090909090908, + "eval_loss": 0.0018177549354732037, + "eval_runtime": 0.233, + "eval_samples_per_second": 377.605, + "eval_steps_per_second": 47.201, + "step": 273 + }, + { + "epoch": 12.454545454545455, + "grad_norm": 0.02402249164879322, + "learning_rate": 1.9618421052631577e-05, + "loss": 0.0029, + "step": 274 + }, + { + "epoch": 12.454545454545455, + "eval_loss": 0.0018104868941009045, + "eval_runtime": 0.2464, + "eval_samples_per_second": 357.208, + "eval_steps_per_second": 44.651, + "step": 274 + }, + { + "epoch": 12.5, + "grad_norm": 0.025068577378988266, + "learning_rate": 1.9578947368421055e-05, + "loss": 0.003, + "step": 275 + }, + { + "epoch": 12.5, + "eval_loss": 0.0018031727522611618, + "eval_runtime": 0.2561, + "eval_samples_per_second": 343.628, + "eval_steps_per_second": 42.953, + "step": 275 + }, + { + "epoch": 12.545454545454545, + "grad_norm": 0.03290198743343353, + "learning_rate": 1.9539473684210525e-05, + "loss": 0.0032, + "step": 276 + }, + { + "epoch": 12.545454545454545, + "eval_loss": 0.0017959319520741701, + "eval_runtime": 0.2473, + "eval_samples_per_second": 355.844, + "eval_steps_per_second": 44.48, + "step": 276 + }, + { + "epoch": 12.590909090909092, + "grad_norm": 0.025103066116571426, + "learning_rate": 1.95e-05, + "loss": 0.0028, + "step": 277 + }, + { + "epoch": 12.590909090909092, + "eval_loss": 0.0017883635591715574, + "eval_runtime": 0.2312, + "eval_samples_per_second": 380.663, + "eval_steps_per_second": 47.583, + "step": 277 + }, + { + "epoch": 12.636363636363637, + "grad_norm": 0.02768297679722309, + "learning_rate": 1.9460526315789474e-05, + "loss": 0.003, + "step": 278 + }, + { + "epoch": 12.636363636363637, + "eval_loss": 0.0017810885328799486, + "eval_runtime": 0.2411, + "eval_samples_per_second": 365.033, + "eval_steps_per_second": 45.629, + "step": 278 + }, + { + "epoch": 12.681818181818182, + "grad_norm": 0.026979558169841766, + "learning_rate": 1.9421052631578948e-05, + "loss": 0.0033, + "step": 279 + }, + { + "epoch": 12.681818181818182, + "eval_loss": 0.0017738312017172575, + "eval_runtime": 0.2981, + "eval_samples_per_second": 295.202, + "eval_steps_per_second": 36.9, + "step": 279 + }, + { + "epoch": 12.727272727272727, + "grad_norm": 0.025757014751434326, + "learning_rate": 1.938157894736842e-05, + "loss": 0.0031, + "step": 280 + }, + { + "epoch": 12.727272727272727, + "eval_loss": 0.0017666955245658755, + "eval_runtime": 0.2467, + "eval_samples_per_second": 356.773, + "eval_steps_per_second": 44.597, + "step": 280 + }, + { + "epoch": 12.772727272727273, + "grad_norm": 0.026617391034960747, + "learning_rate": 1.9342105263157896e-05, + "loss": 0.003, + "step": 281 + }, + { + "epoch": 12.772727272727273, + "eval_loss": 0.0017593905795365572, + "eval_runtime": 0.2388, + "eval_samples_per_second": 368.469, + "eval_steps_per_second": 46.059, + "step": 281 + }, + { + "epoch": 12.818181818181818, + "grad_norm": 0.027713097631931305, + "learning_rate": 1.9302631578947367e-05, + "loss": 0.0028, + "step": 282 + }, + { + "epoch": 12.818181818181818, + "eval_loss": 0.0017523803981021047, + "eval_runtime": 0.2531, + "eval_samples_per_second": 347.71, + "eval_steps_per_second": 43.464, + "step": 282 + }, + { + "epoch": 12.863636363636363, + "grad_norm": 0.021941719576716423, + "learning_rate": 1.926315789473684e-05, + "loss": 0.0028, + "step": 283 + }, + { + "epoch": 12.863636363636363, + "eval_loss": 0.0017456583445891738, + "eval_runtime": 0.2275, + "eval_samples_per_second": 386.831, + "eval_steps_per_second": 48.354, + "step": 283 + }, + { + "epoch": 12.909090909090908, + "grad_norm": 0.029443973675370216, + "learning_rate": 1.922368421052632e-05, + "loss": 0.0029, + "step": 284 + }, + { + "epoch": 12.909090909090908, + "eval_loss": 0.0017391174333170056, + "eval_runtime": 0.2259, + "eval_samples_per_second": 389.61, + "eval_steps_per_second": 48.701, + "step": 284 + }, + { + "epoch": 12.954545454545455, + "grad_norm": 0.023187711834907532, + "learning_rate": 1.918421052631579e-05, + "loss": 0.0027, + "step": 285 + }, + { + "epoch": 12.954545454545455, + "eval_loss": 0.0017328561516478658, + "eval_runtime": 0.2229, + "eval_samples_per_second": 394.794, + "eval_steps_per_second": 49.349, + "step": 285 + }, + { + "epoch": 13.0, + "grad_norm": 0.02683272212743759, + "learning_rate": 1.9144736842105264e-05, + "loss": 0.0028, + "step": 286 + }, + { + "epoch": 13.0, + "eval_loss": 0.0017264141933992505, + "eval_runtime": 0.2281, + "eval_samples_per_second": 385.759, + "eval_steps_per_second": 48.22, + "step": 286 + }, + { + "epoch": 13.045454545454545, + "grad_norm": 0.026485104113817215, + "learning_rate": 1.9105263157894738e-05, + "loss": 0.0029, + "step": 287 + }, + { + "epoch": 13.045454545454545, + "eval_loss": 0.0017197772394865751, + "eval_runtime": 0.2245, + "eval_samples_per_second": 392.011, + "eval_steps_per_second": 49.001, + "step": 287 + }, + { + "epoch": 13.090909090909092, + "grad_norm": 0.025229312479496002, + "learning_rate": 1.9065789473684212e-05, + "loss": 0.0027, + "step": 288 + }, + { + "epoch": 13.090909090909092, + "eval_loss": 0.0017132211942225695, + "eval_runtime": 0.2288, + "eval_samples_per_second": 384.654, + "eval_steps_per_second": 48.082, + "step": 288 + }, + { + "epoch": 13.136363636363637, + "grad_norm": 0.026387052610516548, + "learning_rate": 1.9026315789473683e-05, + "loss": 0.003, + "step": 289 + }, + { + "epoch": 13.136363636363637, + "eval_loss": 0.001706792158074677, + "eval_runtime": 0.2251, + "eval_samples_per_second": 390.981, + "eval_steps_per_second": 48.873, + "step": 289 + }, + { + "epoch": 13.181818181818182, + "grad_norm": 0.0232387688010931, + "learning_rate": 1.898684210526316e-05, + "loss": 0.0028, + "step": 290 + }, + { + "epoch": 13.181818181818182, + "eval_loss": 0.0017004094552248716, + "eval_runtime": 0.2307, + "eval_samples_per_second": 381.375, + "eval_steps_per_second": 47.672, + "step": 290 + }, + { + "epoch": 13.227272727272727, + "grad_norm": 0.030720511451363564, + "learning_rate": 1.894736842105263e-05, + "loss": 0.003, + "step": 291 + }, + { + "epoch": 13.227272727272727, + "eval_loss": 0.0016942427027970552, + "eval_runtime": 0.2316, + "eval_samples_per_second": 379.934, + "eval_steps_per_second": 47.492, + "step": 291 + }, + { + "epoch": 13.272727272727273, + "grad_norm": 0.023519422858953476, + "learning_rate": 1.8907894736842105e-05, + "loss": 0.0025, + "step": 292 + }, + { + "epoch": 13.272727272727273, + "eval_loss": 0.0016882912022992969, + "eval_runtime": 0.2298, + "eval_samples_per_second": 383.008, + "eval_steps_per_second": 47.876, + "step": 292 + }, + { + "epoch": 13.318181818181818, + "grad_norm": 0.02608366496860981, + "learning_rate": 1.886842105263158e-05, + "loss": 0.003, + "step": 293 + }, + { + "epoch": 13.318181818181818, + "eval_loss": 0.001682400587014854, + "eval_runtime": 0.2333, + "eval_samples_per_second": 377.26, + "eval_steps_per_second": 47.157, + "step": 293 + }, + { + "epoch": 13.363636363636363, + "grad_norm": 0.02541464753448963, + "learning_rate": 1.8828947368421053e-05, + "loss": 0.0028, + "step": 294 + }, + { + "epoch": 13.363636363636363, + "eval_loss": 0.0016764701576903462, + "eval_runtime": 0.2276, + "eval_samples_per_second": 386.598, + "eval_steps_per_second": 48.325, + "step": 294 + }, + { + "epoch": 13.409090909090908, + "grad_norm": 0.026540333405137062, + "learning_rate": 1.8789473684210524e-05, + "loss": 0.0028, + "step": 295 + }, + { + "epoch": 13.409090909090908, + "eval_loss": 0.0016703385626897216, + "eval_runtime": 0.2313, + "eval_samples_per_second": 380.436, + "eval_steps_per_second": 47.554, + "step": 295 + }, + { + "epoch": 13.454545454545455, + "grad_norm": 0.021979449316859245, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.0027, + "step": 296 + }, + { + "epoch": 13.454545454545455, + "eval_loss": 0.0016644434072077274, + "eval_runtime": 0.2267, + "eval_samples_per_second": 388.248, + "eval_steps_per_second": 48.531, + "step": 296 + }, + { + "epoch": 13.5, + "grad_norm": 0.027137625962495804, + "learning_rate": 1.8710526315789476e-05, + "loss": 0.0027, + "step": 297 + }, + { + "epoch": 13.5, + "eval_loss": 0.001658798661082983, + "eval_runtime": 0.2286, + "eval_samples_per_second": 384.972, + "eval_steps_per_second": 48.121, + "step": 297 + }, + { + "epoch": 13.545454545454545, + "grad_norm": 0.02321833185851574, + "learning_rate": 1.8671052631578947e-05, + "loss": 0.0027, + "step": 298 + }, + { + "epoch": 13.545454545454545, + "eval_loss": 0.001653428073041141, + "eval_runtime": 0.227, + "eval_samples_per_second": 387.714, + "eval_steps_per_second": 48.464, + "step": 298 + }, + { + "epoch": 13.590909090909092, + "grad_norm": 0.028996985405683517, + "learning_rate": 1.8631578947368424e-05, + "loss": 0.0029, + "step": 299 + }, + { + "epoch": 13.590909090909092, + "eval_loss": 0.0016476112650707364, + "eval_runtime": 0.2299, + "eval_samples_per_second": 382.812, + "eval_steps_per_second": 47.852, + "step": 299 + }, + { + "epoch": 13.636363636363637, + "grad_norm": 0.028486257418990135, + "learning_rate": 1.8592105263157895e-05, + "loss": 0.0027, + "step": 300 + }, + { + "epoch": 13.636363636363637, + "eval_loss": 0.001642104354687035, + "eval_runtime": 0.2398, + "eval_samples_per_second": 367.028, + "eval_steps_per_second": 45.878, + "step": 300 + }, + { + "epoch": 13.681818181818182, + "grad_norm": 0.022658037021756172, + "learning_rate": 1.855263157894737e-05, + "loss": 0.0025, + "step": 301 + }, + { + "epoch": 13.681818181818182, + "eval_loss": 0.0016368039650842547, + "eval_runtime": 0.2377, + "eval_samples_per_second": 370.172, + "eval_steps_per_second": 46.271, + "step": 301 + }, + { + "epoch": 13.727272727272727, + "grad_norm": 0.024452779442071915, + "learning_rate": 1.8513157894736843e-05, + "loss": 0.0028, + "step": 302 + }, + { + "epoch": 13.727272727272727, + "eval_loss": 0.0016317162662744522, + "eval_runtime": 0.2252, + "eval_samples_per_second": 390.707, + "eval_steps_per_second": 48.838, + "step": 302 + }, + { + "epoch": 13.772727272727273, + "grad_norm": 0.02014131471514702, + "learning_rate": 1.8473684210526317e-05, + "loss": 0.0024, + "step": 303 + }, + { + "epoch": 13.772727272727273, + "eval_loss": 0.001626785146072507, + "eval_runtime": 0.2343, + "eval_samples_per_second": 375.607, + "eval_steps_per_second": 46.951, + "step": 303 + }, + { + "epoch": 13.818181818181818, + "grad_norm": 0.02657116763293743, + "learning_rate": 1.8434210526315788e-05, + "loss": 0.0025, + "step": 304 + }, + { + "epoch": 13.818181818181818, + "eval_loss": 0.001621657982468605, + "eval_runtime": 0.2287, + "eval_samples_per_second": 384.722, + "eval_steps_per_second": 48.09, + "step": 304 + }, + { + "epoch": 13.863636363636363, + "grad_norm": 0.02328609488904476, + "learning_rate": 1.8394736842105266e-05, + "loss": 0.0025, + "step": 305 + }, + { + "epoch": 13.863636363636363, + "eval_loss": 0.001616165740415454, + "eval_runtime": 0.2335, + "eval_samples_per_second": 376.921, + "eval_steps_per_second": 47.115, + "step": 305 + }, + { + "epoch": 13.909090909090908, + "grad_norm": 0.02286568656563759, + "learning_rate": 1.8355263157894736e-05, + "loss": 0.0027, + "step": 306 + }, + { + "epoch": 13.909090909090908, + "eval_loss": 0.001610812614671886, + "eval_runtime": 0.2295, + "eval_samples_per_second": 383.365, + "eval_steps_per_second": 47.921, + "step": 306 + }, + { + "epoch": 13.954545454545455, + "grad_norm": 0.025216739624738693, + "learning_rate": 1.831578947368421e-05, + "loss": 0.0026, + "step": 307 + }, + { + "epoch": 13.954545454545455, + "eval_loss": 0.001605312223546207, + "eval_runtime": 0.2306, + "eval_samples_per_second": 381.68, + "eval_steps_per_second": 47.71, + "step": 307 + }, + { + "epoch": 14.0, + "grad_norm": 0.02698989026248455, + "learning_rate": 1.8276315789473685e-05, + "loss": 0.003, + "step": 308 + }, + { + "epoch": 14.0, + "eval_loss": 0.001599607290700078, + "eval_runtime": 0.234, + "eval_samples_per_second": 376.05, + "eval_steps_per_second": 47.006, + "step": 308 + }, + { + "epoch": 14.045454545454545, + "grad_norm": 0.02121439203619957, + "learning_rate": 1.823684210526316e-05, + "loss": 0.0026, + "step": 309 + }, + { + "epoch": 14.045454545454545, + "eval_loss": 0.0015940162120386958, + "eval_runtime": 0.2335, + "eval_samples_per_second": 376.91, + "eval_steps_per_second": 47.114, + "step": 309 + }, + { + "epoch": 14.090909090909092, + "grad_norm": 0.02412167377769947, + "learning_rate": 1.8197368421052633e-05, + "loss": 0.0028, + "step": 310 + }, + { + "epoch": 14.090909090909092, + "eval_loss": 0.001588392653502524, + "eval_runtime": 0.2321, + "eval_samples_per_second": 379.14, + "eval_steps_per_second": 47.392, + "step": 310 + }, + { + "epoch": 14.136363636363637, + "grad_norm": 0.02534678392112255, + "learning_rate": 1.8157894736842107e-05, + "loss": 0.0027, + "step": 311 + }, + { + "epoch": 14.136363636363637, + "eval_loss": 0.0015829313779249787, + "eval_runtime": 0.2274, + "eval_samples_per_second": 386.989, + "eval_steps_per_second": 48.374, + "step": 311 + }, + { + "epoch": 14.181818181818182, + "grad_norm": 0.021638307720422745, + "learning_rate": 1.811842105263158e-05, + "loss": 0.0025, + "step": 312 + }, + { + "epoch": 14.181818181818182, + "eval_loss": 0.0015773712657392025, + "eval_runtime": 0.2294, + "eval_samples_per_second": 383.682, + "eval_steps_per_second": 47.96, + "step": 312 + }, + { + "epoch": 14.227272727272727, + "grad_norm": 0.024357490241527557, + "learning_rate": 1.8078947368421052e-05, + "loss": 0.0027, + "step": 313 + }, + { + "epoch": 14.227272727272727, + "eval_loss": 0.0015717636561021209, + "eval_runtime": 0.2294, + "eval_samples_per_second": 383.662, + "eval_steps_per_second": 47.958, + "step": 313 + }, + { + "epoch": 14.272727272727273, + "grad_norm": 0.022512707859277725, + "learning_rate": 1.8039473684210526e-05, + "loss": 0.0026, + "step": 314 + }, + { + "epoch": 14.272727272727273, + "eval_loss": 0.001566153485327959, + "eval_runtime": 0.2263, + "eval_samples_per_second": 388.817, + "eval_steps_per_second": 48.602, + "step": 314 + }, + { + "epoch": 14.318181818181818, + "grad_norm": 0.022913463413715363, + "learning_rate": 1.8e-05, + "loss": 0.0026, + "step": 315 + }, + { + "epoch": 14.318181818181818, + "eval_loss": 0.001560671953484416, + "eval_runtime": 0.2319, + "eval_samples_per_second": 379.401, + "eval_steps_per_second": 47.425, + "step": 315 + }, + { + "epoch": 14.363636363636363, + "grad_norm": 0.024906402453780174, + "learning_rate": 1.7960526315789475e-05, + "loss": 0.0026, + "step": 316 + }, + { + "epoch": 14.363636363636363, + "eval_loss": 0.0015550776151940227, + "eval_runtime": 0.2309, + "eval_samples_per_second": 381.176, + "eval_steps_per_second": 47.647, + "step": 316 + }, + { + "epoch": 14.409090909090908, + "grad_norm": 0.020846841856837273, + "learning_rate": 1.7921052631578945e-05, + "loss": 0.0024, + "step": 317 + }, + { + "epoch": 14.409090909090908, + "eval_loss": 0.0015492510283365846, + "eval_runtime": 0.23, + "eval_samples_per_second": 382.625, + "eval_steps_per_second": 47.828, + "step": 317 + }, + { + "epoch": 14.454545454545455, + "grad_norm": 0.020949576050043106, + "learning_rate": 1.7881578947368423e-05, + "loss": 0.0024, + "step": 318 + }, + { + "epoch": 14.454545454545455, + "eval_loss": 0.001543792081065476, + "eval_runtime": 0.2687, + "eval_samples_per_second": 327.535, + "eval_steps_per_second": 40.942, + "step": 318 + }, + { + "epoch": 14.5, + "grad_norm": 0.027320073917508125, + "learning_rate": 1.7842105263157894e-05, + "loss": 0.0029, + "step": 319 + }, + { + "epoch": 14.5, + "eval_loss": 0.0015383724821731448, + "eval_runtime": 0.2378, + "eval_samples_per_second": 369.998, + "eval_steps_per_second": 46.25, + "step": 319 + }, + { + "epoch": 14.545454545454545, + "grad_norm": 0.023768380284309387, + "learning_rate": 1.7802631578947368e-05, + "loss": 0.0024, + "step": 320 + }, + { + "epoch": 14.545454545454545, + "eval_loss": 0.0015328243607655168, + "eval_runtime": 0.2636, + "eval_samples_per_second": 333.891, + "eval_steps_per_second": 41.736, + "step": 320 + }, + { + "epoch": 14.590909090909092, + "grad_norm": 0.023090893402695656, + "learning_rate": 1.7763157894736842e-05, + "loss": 0.0028, + "step": 321 + }, + { + "epoch": 14.590909090909092, + "eval_loss": 0.0015273126773536205, + "eval_runtime": 0.2297, + "eval_samples_per_second": 383.091, + "eval_steps_per_second": 47.886, + "step": 321 + }, + { + "epoch": 14.636363636363637, + "grad_norm": 0.021861301735043526, + "learning_rate": 1.7723684210526316e-05, + "loss": 0.0023, + "step": 322 + }, + { + "epoch": 14.636363636363637, + "eval_loss": 0.0015220079803839326, + "eval_runtime": 0.2395, + "eval_samples_per_second": 367.485, + "eval_steps_per_second": 45.936, + "step": 322 + }, + { + "epoch": 14.681818181818182, + "grad_norm": 0.02089674212038517, + "learning_rate": 1.7684210526315787e-05, + "loss": 0.0025, + "step": 323 + }, + { + "epoch": 14.681818181818182, + "eval_loss": 0.0015169020043686032, + "eval_runtime": 0.2277, + "eval_samples_per_second": 386.55, + "eval_steps_per_second": 48.319, + "step": 323 + }, + { + "epoch": 14.727272727272727, + "grad_norm": 0.026943515986204147, + "learning_rate": 1.7644736842105264e-05, + "loss": 0.0027, + "step": 324 + }, + { + "epoch": 14.727272727272727, + "eval_loss": 0.0015122044133022428, + "eval_runtime": 0.2504, + "eval_samples_per_second": 351.497, + "eval_steps_per_second": 43.937, + "step": 324 + }, + { + "epoch": 14.772727272727273, + "grad_norm": 0.021125871688127518, + "learning_rate": 1.760526315789474e-05, + "loss": 0.0024, + "step": 325 + }, + { + "epoch": 14.772727272727273, + "eval_loss": 0.0015074351103976369, + "eval_runtime": 0.2277, + "eval_samples_per_second": 386.421, + "eval_steps_per_second": 48.303, + "step": 325 + }, + { + "epoch": 14.818181818181818, + "grad_norm": 0.023058133199810982, + "learning_rate": 1.756578947368421e-05, + "loss": 0.0025, + "step": 326 + }, + { + "epoch": 14.818181818181818, + "eval_loss": 0.001502548111602664, + "eval_runtime": 0.2371, + "eval_samples_per_second": 371.118, + "eval_steps_per_second": 46.39, + "step": 326 + }, + { + "epoch": 14.863636363636363, + "grad_norm": 0.020260730758309364, + "learning_rate": 1.7526315789473687e-05, + "loss": 0.0023, + "step": 327 + }, + { + "epoch": 14.863636363636363, + "eval_loss": 0.0014978590188547969, + "eval_runtime": 0.231, + "eval_samples_per_second": 380.935, + "eval_steps_per_second": 47.617, + "step": 327 + }, + { + "epoch": 14.909090909090908, + "grad_norm": 0.021094167605042458, + "learning_rate": 1.7486842105263158e-05, + "loss": 0.0024, + "step": 328 + }, + { + "epoch": 14.909090909090908, + "eval_loss": 0.0014932234771549702, + "eval_runtime": 0.2309, + "eval_samples_per_second": 381.042, + "eval_steps_per_second": 47.63, + "step": 328 + }, + { + "epoch": 14.954545454545455, + "grad_norm": 0.023162171244621277, + "learning_rate": 1.7447368421052632e-05, + "loss": 0.0027, + "step": 329 + }, + { + "epoch": 14.954545454545455, + "eval_loss": 0.0014887653524056077, + "eval_runtime": 0.2298, + "eval_samples_per_second": 382.875, + "eval_steps_per_second": 47.859, + "step": 329 + }, + { + "epoch": 15.0, + "grad_norm": 0.021899493411183357, + "learning_rate": 1.7407894736842106e-05, + "loss": 0.0026, + "step": 330 + }, + { + "epoch": 15.0, + "eval_loss": 0.0014844763791188598, + "eval_runtime": 0.2287, + "eval_samples_per_second": 384.811, + "eval_steps_per_second": 48.101, + "step": 330 + }, + { + "epoch": 15.045454545454545, + "grad_norm": 0.02722894586622715, + "learning_rate": 1.736842105263158e-05, + "loss": 0.0029, + "step": 331 + }, + { + "epoch": 15.045454545454545, + "eval_loss": 0.001479836879298091, + "eval_runtime": 0.2296, + "eval_samples_per_second": 383.331, + "eval_steps_per_second": 47.916, + "step": 331 + }, + { + "epoch": 15.090909090909092, + "grad_norm": 0.0198600422590971, + "learning_rate": 1.732894736842105e-05, + "loss": 0.0023, + "step": 332 + }, + { + "epoch": 15.090909090909092, + "eval_loss": 0.001475546509027481, + "eval_runtime": 0.2293, + "eval_samples_per_second": 383.783, + "eval_steps_per_second": 47.973, + "step": 332 + }, + { + "epoch": 15.136363636363637, + "grad_norm": 0.018213720992207527, + "learning_rate": 1.728947368421053e-05, + "loss": 0.0021, + "step": 333 + }, + { + "epoch": 15.136363636363637, + "eval_loss": 0.0014714114367961884, + "eval_runtime": 0.2229, + "eval_samples_per_second": 394.882, + "eval_steps_per_second": 49.36, + "step": 333 + }, + { + "epoch": 15.181818181818182, + "grad_norm": 0.02195083722472191, + "learning_rate": 1.725e-05, + "loss": 0.0026, + "step": 334 + }, + { + "epoch": 15.181818181818182, + "eval_loss": 0.0014672887045890093, + "eval_runtime": 0.2307, + "eval_samples_per_second": 381.499, + "eval_steps_per_second": 47.687, + "step": 334 + }, + { + "epoch": 15.227272727272727, + "grad_norm": 0.020630402490496635, + "learning_rate": 1.7210526315789473e-05, + "loss": 0.0023, + "step": 335 + }, + { + "epoch": 15.227272727272727, + "eval_loss": 0.0014632240636274219, + "eval_runtime": 0.2345, + "eval_samples_per_second": 375.326, + "eval_steps_per_second": 46.916, + "step": 335 + }, + { + "epoch": 15.272727272727273, + "grad_norm": 0.01985459215939045, + "learning_rate": 1.7171052631578947e-05, + "loss": 0.0024, + "step": 336 + }, + { + "epoch": 15.272727272727273, + "eval_loss": 0.0014591444050893188, + "eval_runtime": 0.2344, + "eval_samples_per_second": 375.401, + "eval_steps_per_second": 46.925, + "step": 336 + }, + { + "epoch": 15.318181818181818, + "grad_norm": 0.02400742471218109, + "learning_rate": 1.713157894736842e-05, + "loss": 0.0024, + "step": 337 + }, + { + "epoch": 15.318181818181818, + "eval_loss": 0.001454763114452362, + "eval_runtime": 0.2401, + "eval_samples_per_second": 366.585, + "eval_steps_per_second": 45.823, + "step": 337 + }, + { + "epoch": 15.363636363636363, + "grad_norm": 0.02545950934290886, + "learning_rate": 1.7092105263157896e-05, + "loss": 0.0026, + "step": 338 + }, + { + "epoch": 15.363636363636363, + "eval_loss": 0.0014504102291539311, + "eval_runtime": 0.2315, + "eval_samples_per_second": 380.122, + "eval_steps_per_second": 47.515, + "step": 338 + }, + { + "epoch": 15.409090909090908, + "grad_norm": 0.02126440778374672, + "learning_rate": 1.705263157894737e-05, + "loss": 0.0024, + "step": 339 + }, + { + "epoch": 15.409090909090908, + "eval_loss": 0.0014461844693869352, + "eval_runtime": 0.2351, + "eval_samples_per_second": 374.294, + "eval_steps_per_second": 46.787, + "step": 339 + }, + { + "epoch": 15.454545454545455, + "grad_norm": 0.025197012349963188, + "learning_rate": 1.7013157894736844e-05, + "loss": 0.0025, + "step": 340 + }, + { + "epoch": 15.454545454545455, + "eval_loss": 0.0014418490463867784, + "eval_runtime": 0.2274, + "eval_samples_per_second": 387.064, + "eval_steps_per_second": 48.383, + "step": 340 + }, + { + "epoch": 15.5, + "grad_norm": 0.022640075534582138, + "learning_rate": 1.6973684210526315e-05, + "loss": 0.0024, + "step": 341 + }, + { + "epoch": 15.5, + "eval_loss": 0.0014375299215316772, + "eval_runtime": 0.2405, + "eval_samples_per_second": 365.83, + "eval_steps_per_second": 45.729, + "step": 341 + }, + { + "epoch": 15.545454545454545, + "grad_norm": 0.021050602197647095, + "learning_rate": 1.6934210526315792e-05, + "loss": 0.0024, + "step": 342 + }, + { + "epoch": 15.545454545454545, + "eval_loss": 0.0014335111482068896, + "eval_runtime": 0.226, + "eval_samples_per_second": 389.393, + "eval_steps_per_second": 48.674, + "step": 342 + }, + { + "epoch": 15.590909090909092, + "grad_norm": 0.0219247005879879, + "learning_rate": 1.6894736842105263e-05, + "loss": 0.0025, + "step": 343 + }, + { + "epoch": 15.590909090909092, + "eval_loss": 0.0014295299770310521, + "eval_runtime": 0.2342, + "eval_samples_per_second": 375.717, + "eval_steps_per_second": 46.965, + "step": 343 + }, + { + "epoch": 15.636363636363637, + "grad_norm": 0.020925231277942657, + "learning_rate": 1.6855263157894737e-05, + "loss": 0.0024, + "step": 344 + }, + { + "epoch": 15.636363636363637, + "eval_loss": 0.0014257035218179226, + "eval_runtime": 0.2368, + "eval_samples_per_second": 371.622, + "eval_steps_per_second": 46.453, + "step": 344 + }, + { + "epoch": 15.681818181818182, + "grad_norm": 0.019099295139312744, + "learning_rate": 1.681578947368421e-05, + "loss": 0.0023, + "step": 345 + }, + { + "epoch": 15.681818181818182, + "eval_loss": 0.0014218251453712583, + "eval_runtime": 0.2291, + "eval_samples_per_second": 384.074, + "eval_steps_per_second": 48.009, + "step": 345 + }, + { + "epoch": 15.727272727272727, + "grad_norm": 0.021133864298462868, + "learning_rate": 1.6776315789473686e-05, + "loss": 0.0023, + "step": 346 + }, + { + "epoch": 15.727272727272727, + "eval_loss": 0.0014178574783727527, + "eval_runtime": 0.2372, + "eval_samples_per_second": 370.96, + "eval_steps_per_second": 46.37, + "step": 346 + }, + { + "epoch": 15.772727272727273, + "grad_norm": 0.0220933947712183, + "learning_rate": 1.6736842105263156e-05, + "loss": 0.0024, + "step": 347 + }, + { + "epoch": 15.772727272727273, + "eval_loss": 0.0014137736288830638, + "eval_runtime": 0.2311, + "eval_samples_per_second": 380.859, + "eval_steps_per_second": 47.607, + "step": 347 + }, + { + "epoch": 15.818181818181818, + "grad_norm": 0.02274385653436184, + "learning_rate": 1.6697368421052634e-05, + "loss": 0.0023, + "step": 348 + }, + { + "epoch": 15.818181818181818, + "eval_loss": 0.0014094788348302245, + "eval_runtime": 0.2489, + "eval_samples_per_second": 353.537, + "eval_steps_per_second": 44.192, + "step": 348 + }, + { + "epoch": 15.863636363636363, + "grad_norm": 0.023772120475769043, + "learning_rate": 1.6657894736842105e-05, + "loss": 0.0025, + "step": 349 + }, + { + "epoch": 15.863636363636363, + "eval_loss": 0.0014053123304620385, + "eval_runtime": 0.2394, + "eval_samples_per_second": 367.516, + "eval_steps_per_second": 45.94, + "step": 349 + }, + { + "epoch": 15.909090909090908, + "grad_norm": 0.023701833561062813, + "learning_rate": 1.661842105263158e-05, + "loss": 0.0026, + "step": 350 + }, + { + "epoch": 15.909090909090908, + "eval_loss": 0.0014007468707859516, + "eval_runtime": 0.2428, + "eval_samples_per_second": 362.454, + "eval_steps_per_second": 45.307, + "step": 350 + }, + { + "epoch": 15.954545454545455, + "grad_norm": 0.020177854225039482, + "learning_rate": 1.6578947368421053e-05, + "loss": 0.0023, + "step": 351 + }, + { + "epoch": 15.954545454545455, + "eval_loss": 0.001396444975398481, + "eval_runtime": 0.2227, + "eval_samples_per_second": 395.086, + "eval_steps_per_second": 49.386, + "step": 351 + }, + { + "epoch": 16.0, + "grad_norm": 0.018302910029888153, + "learning_rate": 1.6539473684210527e-05, + "loss": 0.0022, + "step": 352 + }, + { + "epoch": 16.0, + "eval_loss": 0.0013921987265348434, + "eval_runtime": 0.2255, + "eval_samples_per_second": 390.23, + "eval_steps_per_second": 48.779, + "step": 352 + }, + { + "epoch": 16.045454545454547, + "grad_norm": 0.02006903663277626, + "learning_rate": 1.65e-05, + "loss": 0.0024, + "step": 353 + }, + { + "epoch": 16.045454545454547, + "eval_loss": 0.0013879131292924285, + "eval_runtime": 0.2332, + "eval_samples_per_second": 377.362, + "eval_steps_per_second": 47.17, + "step": 353 + }, + { + "epoch": 16.09090909090909, + "grad_norm": 0.02006879448890686, + "learning_rate": 1.6460526315789472e-05, + "loss": 0.0024, + "step": 354 + }, + { + "epoch": 16.09090909090909, + "eval_loss": 0.0013836818980053067, + "eval_runtime": 0.2294, + "eval_samples_per_second": 383.546, + "eval_steps_per_second": 47.943, + "step": 354 + }, + { + "epoch": 16.136363636363637, + "grad_norm": 0.01927405595779419, + "learning_rate": 1.642105263157895e-05, + "loss": 0.0021, + "step": 355 + }, + { + "epoch": 16.136363636363637, + "eval_loss": 0.001379486988298595, + "eval_runtime": 0.2304, + "eval_samples_per_second": 381.9, + "eval_steps_per_second": 47.738, + "step": 355 + }, + { + "epoch": 16.181818181818183, + "grad_norm": 0.019441615790128708, + "learning_rate": 1.638157894736842e-05, + "loss": 0.0024, + "step": 356 + }, + { + "epoch": 16.181818181818183, + "eval_loss": 0.0013752405066043139, + "eval_runtime": 0.2339, + "eval_samples_per_second": 376.279, + "eval_steps_per_second": 47.035, + "step": 356 + }, + { + "epoch": 16.227272727272727, + "grad_norm": 0.019047444686293602, + "learning_rate": 1.6342105263157894e-05, + "loss": 0.0022, + "step": 357 + }, + { + "epoch": 16.227272727272727, + "eval_loss": 0.0013710103230550885, + "eval_runtime": 0.2296, + "eval_samples_per_second": 383.255, + "eval_steps_per_second": 47.907, + "step": 357 + }, + { + "epoch": 16.272727272727273, + "grad_norm": 0.02004443109035492, + "learning_rate": 1.630263157894737e-05, + "loss": 0.002, + "step": 358 + }, + { + "epoch": 16.272727272727273, + "eval_loss": 0.0013666612794622779, + "eval_runtime": 0.2306, + "eval_samples_per_second": 381.651, + "eval_steps_per_second": 47.706, + "step": 358 + }, + { + "epoch": 16.318181818181817, + "grad_norm": 0.018162380903959274, + "learning_rate": 1.6263157894736843e-05, + "loss": 0.0022, + "step": 359 + }, + { + "epoch": 16.318181818181817, + "eval_loss": 0.0013625015271827579, + "eval_runtime": 0.2336, + "eval_samples_per_second": 376.757, + "eval_steps_per_second": 47.095, + "step": 359 + }, + { + "epoch": 16.363636363636363, + "grad_norm": 0.01866663061082363, + "learning_rate": 1.6223684210526314e-05, + "loss": 0.0023, + "step": 360 + }, + { + "epoch": 16.363636363636363, + "eval_loss": 0.001358471461571753, + "eval_runtime": 0.234, + "eval_samples_per_second": 376.031, + "eval_steps_per_second": 47.004, + "step": 360 + }, + { + "epoch": 16.40909090909091, + "grad_norm": 0.023692943155765533, + "learning_rate": 1.618421052631579e-05, + "loss": 0.0021, + "step": 361 + }, + { + "epoch": 16.40909090909091, + "eval_loss": 0.001354728126898408, + "eval_runtime": 0.236, + "eval_samples_per_second": 372.916, + "eval_steps_per_second": 46.614, + "step": 361 + }, + { + "epoch": 16.454545454545453, + "grad_norm": 0.021557440981268883, + "learning_rate": 1.6144736842105262e-05, + "loss": 0.0025, + "step": 362 + }, + { + "epoch": 16.454545454545453, + "eval_loss": 0.0013508024858310819, + "eval_runtime": 0.2359, + "eval_samples_per_second": 373.118, + "eval_steps_per_second": 46.64, + "step": 362 + }, + { + "epoch": 16.5, + "grad_norm": 0.02110958844423294, + "learning_rate": 1.6105263157894736e-05, + "loss": 0.0023, + "step": 363 + }, + { + "epoch": 16.5, + "eval_loss": 0.0013467645039781928, + "eval_runtime": 0.2299, + "eval_samples_per_second": 382.703, + "eval_steps_per_second": 47.838, + "step": 363 + }, + { + "epoch": 16.545454545454547, + "grad_norm": 0.019328676164150238, + "learning_rate": 1.6065789473684214e-05, + "loss": 0.0024, + "step": 364 + }, + { + "epoch": 16.545454545454547, + "eval_loss": 0.0013428251259028912, + "eval_runtime": 0.2289, + "eval_samples_per_second": 384.389, + "eval_steps_per_second": 48.049, + "step": 364 + }, + { + "epoch": 16.59090909090909, + "grad_norm": 0.022835319861769676, + "learning_rate": 1.6026315789473684e-05, + "loss": 0.0023, + "step": 365 + }, + { + "epoch": 16.59090909090909, + "eval_loss": 0.0013391603715717793, + "eval_runtime": 0.2311, + "eval_samples_per_second": 380.86, + "eval_steps_per_second": 47.607, + "step": 365 + }, + { + "epoch": 16.636363636363637, + "grad_norm": 0.01819239743053913, + "learning_rate": 1.598684210526316e-05, + "loss": 0.0022, + "step": 366 + }, + { + "epoch": 16.636363636363637, + "eval_loss": 0.0013354604598134756, + "eval_runtime": 0.2268, + "eval_samples_per_second": 388.088, + "eval_steps_per_second": 48.511, + "step": 366 + }, + { + "epoch": 16.681818181818183, + "grad_norm": 0.019428908824920654, + "learning_rate": 1.5947368421052633e-05, + "loss": 0.0021, + "step": 367 + }, + { + "epoch": 16.681818181818183, + "eval_loss": 0.001331814331933856, + "eval_runtime": 0.2357, + "eval_samples_per_second": 373.331, + "eval_steps_per_second": 46.666, + "step": 367 + }, + { + "epoch": 16.727272727272727, + "grad_norm": 0.018047934398055077, + "learning_rate": 1.5907894736842107e-05, + "loss": 0.0022, + "step": 368 + }, + { + "epoch": 16.727272727272727, + "eval_loss": 0.0013281968422234058, + "eval_runtime": 0.2353, + "eval_samples_per_second": 374.058, + "eval_steps_per_second": 46.757, + "step": 368 + }, + { + "epoch": 16.772727272727273, + "grad_norm": 0.022303372621536255, + "learning_rate": 1.5868421052631578e-05, + "loss": 0.0022, + "step": 369 + }, + { + "epoch": 16.772727272727273, + "eval_loss": 0.0013246661983430386, + "eval_runtime": 0.2364, + "eval_samples_per_second": 372.233, + "eval_steps_per_second": 46.529, + "step": 369 + }, + { + "epoch": 16.818181818181817, + "grad_norm": 0.017466159537434578, + "learning_rate": 1.5828947368421055e-05, + "loss": 0.0021, + "step": 370 + }, + { + "epoch": 16.818181818181817, + "eval_loss": 0.001321229967288673, + "eval_runtime": 0.2328, + "eval_samples_per_second": 378.075, + "eval_steps_per_second": 47.259, + "step": 370 + }, + { + "epoch": 16.863636363636363, + "grad_norm": 0.018749618902802467, + "learning_rate": 1.5789473684210526e-05, + "loss": 0.0021, + "step": 371 + }, + { + "epoch": 16.863636363636363, + "eval_loss": 0.0013179152738302946, + "eval_runtime": 0.2372, + "eval_samples_per_second": 371.017, + "eval_steps_per_second": 46.377, + "step": 371 + }, + { + "epoch": 16.90909090909091, + "grad_norm": 0.01943541131913662, + "learning_rate": 1.575e-05, + "loss": 0.0021, + "step": 372 + }, + { + "epoch": 16.90909090909091, + "eval_loss": 0.0013147753197699785, + "eval_runtime": 0.2325, + "eval_samples_per_second": 378.47, + "eval_steps_per_second": 47.309, + "step": 372 + }, + { + "epoch": 16.954545454545453, + "grad_norm": 0.018470529466867447, + "learning_rate": 1.5710526315789474e-05, + "loss": 0.0021, + "step": 373 + }, + { + "epoch": 16.954545454545453, + "eval_loss": 0.0013116110349074006, + "eval_runtime": 0.2449, + "eval_samples_per_second": 359.303, + "eval_steps_per_second": 44.913, + "step": 373 + }, + { + "epoch": 17.0, + "grad_norm": 0.02088373526930809, + "learning_rate": 1.5671052631578948e-05, + "loss": 0.0022, + "step": 374 + }, + { + "epoch": 17.0, + "eval_loss": 0.0013083978556096554, + "eval_runtime": 0.2373, + "eval_samples_per_second": 370.786, + "eval_steps_per_second": 46.348, + "step": 374 + }, + { + "epoch": 17.045454545454547, + "grad_norm": 0.02049199491739273, + "learning_rate": 1.563157894736842e-05, + "loss": 0.0021, + "step": 375 + }, + { + "epoch": 17.045454545454547, + "eval_loss": 0.0013052173890173435, + "eval_runtime": 0.2375, + "eval_samples_per_second": 370.52, + "eval_steps_per_second": 46.315, + "step": 375 + }, + { + "epoch": 17.09090909090909, + "grad_norm": 0.022884204983711243, + "learning_rate": 1.5592105263157897e-05, + "loss": 0.0023, + "step": 376 + }, + { + "epoch": 17.09090909090909, + "eval_loss": 0.0013022100320085883, + "eval_runtime": 0.2451, + "eval_samples_per_second": 359.032, + "eval_steps_per_second": 44.879, + "step": 376 + }, + { + "epoch": 17.136363636363637, + "grad_norm": 0.018668444827198982, + "learning_rate": 1.5552631578947367e-05, + "loss": 0.002, + "step": 377 + }, + { + "epoch": 17.136363636363637, + "eval_loss": 0.0012990765972062945, + "eval_runtime": 0.2377, + "eval_samples_per_second": 370.243, + "eval_steps_per_second": 46.28, + "step": 377 + }, + { + "epoch": 17.181818181818183, + "grad_norm": 0.018272867426276207, + "learning_rate": 1.551315789473684e-05, + "loss": 0.002, + "step": 378 + }, + { + "epoch": 17.181818181818183, + "eval_loss": 0.0012959121959283948, + "eval_runtime": 0.2445, + "eval_samples_per_second": 359.966, + "eval_steps_per_second": 44.996, + "step": 378 + }, + { + "epoch": 17.227272727272727, + "grad_norm": 0.018142884597182274, + "learning_rate": 1.547368421052632e-05, + "loss": 0.0023, + "step": 379 + }, + { + "epoch": 17.227272727272727, + "eval_loss": 0.0012926937779411674, + "eval_runtime": 0.2463, + "eval_samples_per_second": 357.295, + "eval_steps_per_second": 44.662, + "step": 379 + }, + { + "epoch": 17.272727272727273, + "grad_norm": 0.019035378471016884, + "learning_rate": 1.543421052631579e-05, + "loss": 0.002, + "step": 380 + }, + { + "epoch": 17.272727272727273, + "eval_loss": 0.0012895982945337892, + "eval_runtime": 0.2335, + "eval_samples_per_second": 376.923, + "eval_steps_per_second": 47.115, + "step": 380 + }, + { + "epoch": 17.318181818181817, + "grad_norm": 0.02087828330695629, + "learning_rate": 1.5394736842105264e-05, + "loss": 0.0023, + "step": 381 + }, + { + "epoch": 17.318181818181817, + "eval_loss": 0.0012864163145422935, + "eval_runtime": 0.2398, + "eval_samples_per_second": 367.034, + "eval_steps_per_second": 45.879, + "step": 381 + }, + { + "epoch": 17.363636363636363, + "grad_norm": 0.019186902791261673, + "learning_rate": 1.5355263157894738e-05, + "loss": 0.0021, + "step": 382 + }, + { + "epoch": 17.363636363636363, + "eval_loss": 0.001283234334550798, + "eval_runtime": 0.2265, + "eval_samples_per_second": 388.504, + "eval_steps_per_second": 48.563, + "step": 382 + }, + { + "epoch": 17.40909090909091, + "grad_norm": 0.01789664290845394, + "learning_rate": 1.5315789473684212e-05, + "loss": 0.002, + "step": 383 + }, + { + "epoch": 17.40909090909091, + "eval_loss": 0.0012801456032320857, + "eval_runtime": 0.229, + "eval_samples_per_second": 384.262, + "eval_steps_per_second": 48.033, + "step": 383 + }, + { + "epoch": 17.454545454545453, + "grad_norm": 0.017828669399023056, + "learning_rate": 1.5276315789473683e-05, + "loss": 0.0021, + "step": 384 + }, + { + "epoch": 17.454545454545453, + "eval_loss": 0.0012770771281793714, + "eval_runtime": 0.2259, + "eval_samples_per_second": 389.598, + "eval_steps_per_second": 48.7, + "step": 384 + }, + { + "epoch": 17.5, + "grad_norm": 0.0225471593439579, + "learning_rate": 1.5236842105263159e-05, + "loss": 0.0022, + "step": 385 + }, + { + "epoch": 17.5, + "eval_loss": 0.0012742335675284266, + "eval_runtime": 0.2398, + "eval_samples_per_second": 366.97, + "eval_steps_per_second": 45.871, + "step": 385 + }, + { + "epoch": 17.545454545454547, + "grad_norm": 0.02024303376674652, + "learning_rate": 1.5197368421052631e-05, + "loss": 0.0021, + "step": 386 + }, + { + "epoch": 17.545454545454547, + "eval_loss": 0.0012715155025944114, + "eval_runtime": 0.2322, + "eval_samples_per_second": 378.914, + "eval_steps_per_second": 47.364, + "step": 386 + }, + { + "epoch": 17.59090909090909, + "grad_norm": 0.021520059555768967, + "learning_rate": 1.5157894736842105e-05, + "loss": 0.0021, + "step": 387 + }, + { + "epoch": 17.59090909090909, + "eval_loss": 0.0012686135014519095, + "eval_runtime": 0.2273, + "eval_samples_per_second": 387.222, + "eval_steps_per_second": 48.403, + "step": 387 + }, + { + "epoch": 17.636363636363637, + "grad_norm": 0.02026878483593464, + "learning_rate": 1.5118421052631578e-05, + "loss": 0.0024, + "step": 388 + }, + { + "epoch": 17.636363636363637, + "eval_loss": 0.0012655220925807953, + "eval_runtime": 0.2345, + "eval_samples_per_second": 375.273, + "eval_steps_per_second": 46.909, + "step": 388 + }, + { + "epoch": 17.681818181818183, + "grad_norm": 0.017312707379460335, + "learning_rate": 1.5078947368421054e-05, + "loss": 0.0019, + "step": 389 + }, + { + "epoch": 17.681818181818183, + "eval_loss": 0.0012624793453142047, + "eval_runtime": 0.2341, + "eval_samples_per_second": 375.953, + "eval_steps_per_second": 46.994, + "step": 389 + }, + { + "epoch": 17.727272727272727, + "grad_norm": 0.014796672388911247, + "learning_rate": 1.5039473684210525e-05, + "loss": 0.0018, + "step": 390 + }, + { + "epoch": 17.727272727272727, + "eval_loss": 0.0012595909647643566, + "eval_runtime": 0.2412, + "eval_samples_per_second": 364.883, + "eval_steps_per_second": 45.61, + "step": 390 + }, + { + "epoch": 17.772727272727273, + "grad_norm": 0.024672966450452805, + "learning_rate": 1.5e-05, + "loss": 0.0024, + "step": 391 + }, + { + "epoch": 17.772727272727273, + "eval_loss": 0.001256533432751894, + "eval_runtime": 0.2394, + "eval_samples_per_second": 367.656, + "eval_steps_per_second": 45.957, + "step": 391 + }, + { + "epoch": 17.818181818181817, + "grad_norm": 0.01785973645746708, + "learning_rate": 1.4960526315789475e-05, + "loss": 0.0021, + "step": 392 + }, + { + "epoch": 17.818181818181817, + "eval_loss": 0.001253555528819561, + "eval_runtime": 0.2448, + "eval_samples_per_second": 359.499, + "eval_steps_per_second": 44.937, + "step": 392 + }, + { + "epoch": 17.863636363636363, + "grad_norm": 0.018725674599409103, + "learning_rate": 1.4921052631578947e-05, + "loss": 0.0022, + "step": 393 + }, + { + "epoch": 17.863636363636363, + "eval_loss": 0.001250546658411622, + "eval_runtime": 0.2295, + "eval_samples_per_second": 383.446, + "eval_steps_per_second": 47.931, + "step": 393 + }, + { + "epoch": 17.90909090909091, + "grad_norm": 0.01906488463282585, + "learning_rate": 1.4881578947368421e-05, + "loss": 0.0019, + "step": 394 + }, + { + "epoch": 17.90909090909091, + "eval_loss": 0.0012476051924750209, + "eval_runtime": 0.2392, + "eval_samples_per_second": 367.955, + "eval_steps_per_second": 45.994, + "step": 394 + }, + { + "epoch": 17.954545454545453, + "grad_norm": 0.01702312007546425, + "learning_rate": 1.4842105263157895e-05, + "loss": 0.0021, + "step": 395 + }, + { + "epoch": 17.954545454545453, + "eval_loss": 0.0012446870096027851, + "eval_runtime": 0.2408, + "eval_samples_per_second": 365.513, + "eval_steps_per_second": 45.689, + "step": 395 + }, + { + "epoch": 18.0, + "grad_norm": 0.018446706235408783, + "learning_rate": 1.4802631578947368e-05, + "loss": 0.0021, + "step": 396 + }, + { + "epoch": 18.0, + "eval_loss": 0.0012417498510330915, + "eval_runtime": 0.2401, + "eval_samples_per_second": 366.532, + "eval_steps_per_second": 45.816, + "step": 396 + }, + { + "epoch": 18.045454545454547, + "grad_norm": 0.017580052837729454, + "learning_rate": 1.4763157894736842e-05, + "loss": 0.002, + "step": 397 + }, + { + "epoch": 18.045454545454547, + "eval_loss": 0.0012387962779030204, + "eval_runtime": 0.2359, + "eval_samples_per_second": 373.019, + "eval_steps_per_second": 46.627, + "step": 397 + }, + { + "epoch": 18.09090909090909, + "grad_norm": 0.018549149855971336, + "learning_rate": 1.4723684210526318e-05, + "loss": 0.002, + "step": 398 + }, + { + "epoch": 18.09090909090909, + "eval_loss": 0.0012358062667772174, + "eval_runtime": 0.2409, + "eval_samples_per_second": 365.331, + "eval_steps_per_second": 45.666, + "step": 398 + }, + { + "epoch": 18.136363636363637, + "grad_norm": 0.021288642659783363, + "learning_rate": 1.468421052631579e-05, + "loss": 0.0021, + "step": 399 + }, + { + "epoch": 18.136363636363637, + "eval_loss": 0.00123285548761487, + "eval_runtime": 0.239, + "eval_samples_per_second": 368.2, + "eval_steps_per_second": 46.025, + "step": 399 + }, + { + "epoch": 18.181818181818183, + "grad_norm": 0.018042676150798798, + "learning_rate": 1.4644736842105264e-05, + "loss": 0.0021, + "step": 400 + }, + { + "epoch": 18.181818181818183, + "eval_loss": 0.0012299600057303905, + "eval_runtime": 0.2368, + "eval_samples_per_second": 371.628, + "eval_steps_per_second": 46.454, + "step": 400 + }, + { + "epoch": 18.227272727272727, + "grad_norm": 0.017950624227523804, + "learning_rate": 1.4605263157894737e-05, + "loss": 0.002, + "step": 401 + }, + { + "epoch": 18.227272727272727, + "eval_loss": 0.0012270959559828043, + "eval_runtime": 0.2217, + "eval_samples_per_second": 396.934, + "eval_steps_per_second": 49.617, + "step": 401 + }, + { + "epoch": 18.272727272727273, + "grad_norm": 0.016649143770337105, + "learning_rate": 1.4565789473684211e-05, + "loss": 0.002, + "step": 402 + }, + { + "epoch": 18.272727272727273, + "eval_loss": 0.0012242384254932404, + "eval_runtime": 0.2287, + "eval_samples_per_second": 384.84, + "eval_steps_per_second": 48.105, + "step": 402 + }, + { + "epoch": 18.318181818181817, + "grad_norm": 0.016468649730086327, + "learning_rate": 1.4526315789473685e-05, + "loss": 0.0018, + "step": 403 + }, + { + "epoch": 18.318181818181817, + "eval_loss": 0.001221520360559225, + "eval_runtime": 0.2271, + "eval_samples_per_second": 387.51, + "eval_steps_per_second": 48.439, + "step": 403 + }, + { + "epoch": 18.363636363636363, + "grad_norm": 0.01778615266084671, + "learning_rate": 1.4486842105263158e-05, + "loss": 0.002, + "step": 404 + }, + { + "epoch": 18.363636363636363, + "eval_loss": 0.0012188454857096076, + "eval_runtime": 0.2323, + "eval_samples_per_second": 378.869, + "eval_steps_per_second": 47.359, + "step": 404 + }, + { + "epoch": 18.40909090909091, + "grad_norm": 0.019096923992037773, + "learning_rate": 1.4447368421052632e-05, + "loss": 0.0021, + "step": 405 + }, + { + "epoch": 18.40909090909091, + "eval_loss": 0.0012163707287982106, + "eval_runtime": 0.2287, + "eval_samples_per_second": 384.807, + "eval_steps_per_second": 48.101, + "step": 405 + }, + { + "epoch": 18.454545454545453, + "grad_norm": 0.020378055050969124, + "learning_rate": 1.4407894736842106e-05, + "loss": 0.0019, + "step": 406 + }, + { + "epoch": 18.454545454545453, + "eval_loss": 0.0012139691971242428, + "eval_runtime": 0.2285, + "eval_samples_per_second": 385.172, + "eval_steps_per_second": 48.146, + "step": 406 + }, + { + "epoch": 18.5, + "grad_norm": 0.01801607571542263, + "learning_rate": 1.4368421052631578e-05, + "loss": 0.0019, + "step": 407 + }, + { + "epoch": 18.5, + "eval_loss": 0.0012113729026168585, + "eval_runtime": 0.2323, + "eval_samples_per_second": 378.867, + "eval_steps_per_second": 47.358, + "step": 407 + }, + { + "epoch": 18.545454545454547, + "grad_norm": 0.016806334257125854, + "learning_rate": 1.4328947368421052e-05, + "loss": 0.0019, + "step": 408 + }, + { + "epoch": 18.545454545454547, + "eval_loss": 0.0012086898786947131, + "eval_runtime": 0.2266, + "eval_samples_per_second": 388.422, + "eval_steps_per_second": 48.553, + "step": 408 + }, + { + "epoch": 18.59090909090909, + "grad_norm": 0.01768423058092594, + "learning_rate": 1.4289473684210527e-05, + "loss": 0.0019, + "step": 409 + }, + { + "epoch": 18.59090909090909, + "eval_loss": 0.001205993234179914, + "eval_runtime": 0.233, + "eval_samples_per_second": 377.712, + "eval_steps_per_second": 47.214, + "step": 409 + }, + { + "epoch": 18.636363636363637, + "grad_norm": 0.016840273514389992, + "learning_rate": 1.4249999999999999e-05, + "loss": 0.0019, + "step": 410 + }, + { + "epoch": 18.636363636363637, + "eval_loss": 0.00120334152597934, + "eval_runtime": 0.2278, + "eval_samples_per_second": 386.255, + "eval_steps_per_second": 48.282, + "step": 410 + }, + { + "epoch": 18.681818181818183, + "grad_norm": 0.019254090264439583, + "learning_rate": 1.4210526315789473e-05, + "loss": 0.0021, + "step": 411 + }, + { + "epoch": 18.681818181818183, + "eval_loss": 0.001200651633553207, + "eval_runtime": 0.2414, + "eval_samples_per_second": 364.529, + "eval_steps_per_second": 45.566, + "step": 411 + }, + { + "epoch": 18.727272727272727, + "grad_norm": 0.018222426995635033, + "learning_rate": 1.4171052631578949e-05, + "loss": 0.0021, + "step": 412 + }, + { + "epoch": 18.727272727272727, + "eval_loss": 0.0011977426474913955, + "eval_runtime": 0.2297, + "eval_samples_per_second": 383.168, + "eval_steps_per_second": 47.896, + "step": 412 + }, + { + "epoch": 18.772727272727273, + "grad_norm": 0.017460381612181664, + "learning_rate": 1.4131578947368422e-05, + "loss": 0.0019, + "step": 413 + }, + { + "epoch": 18.772727272727273, + "eval_loss": 0.0011948675382882357, + "eval_runtime": 0.2295, + "eval_samples_per_second": 383.384, + "eval_steps_per_second": 47.923, + "step": 413 + }, + { + "epoch": 18.818181818181817, + "grad_norm": 0.014636803418397903, + "learning_rate": 1.4092105263157896e-05, + "loss": 0.0018, + "step": 414 + }, + { + "epoch": 18.818181818181817, + "eval_loss": 0.0011919679818674922, + "eval_runtime": 0.2375, + "eval_samples_per_second": 370.502, + "eval_steps_per_second": 46.313, + "step": 414 + }, + { + "epoch": 18.863636363636363, + "grad_norm": 0.01725298911333084, + "learning_rate": 1.405263157894737e-05, + "loss": 0.0019, + "step": 415 + }, + { + "epoch": 18.863636363636363, + "eval_loss": 0.0011888709850609303, + "eval_runtime": 0.2319, + "eval_samples_per_second": 379.492, + "eval_steps_per_second": 47.437, + "step": 415 + }, + { + "epoch": 18.90909090909091, + "grad_norm": 0.017635343596339226, + "learning_rate": 1.4013157894736842e-05, + "loss": 0.0019, + "step": 416 + }, + { + "epoch": 18.90909090909091, + "eval_loss": 0.0011859294027090073, + "eval_runtime": 0.232, + "eval_samples_per_second": 379.329, + "eval_steps_per_second": 47.416, + "step": 416 + }, + { + "epoch": 18.954545454545453, + "grad_norm": 0.017270755022764206, + "learning_rate": 1.3973684210526316e-05, + "loss": 0.002, + "step": 417 + }, + { + "epoch": 18.954545454545453, + "eval_loss": 0.0011831001611426473, + "eval_runtime": 0.2293, + "eval_samples_per_second": 383.786, + "eval_steps_per_second": 47.973, + "step": 417 + }, + { + "epoch": 19.0, + "grad_norm": 0.017159774899482727, + "learning_rate": 1.393421052631579e-05, + "loss": 0.0018, + "step": 418 + }, + { + "epoch": 19.0, + "eval_loss": 0.001180406310595572, + "eval_runtime": 0.2475, + "eval_samples_per_second": 355.577, + "eval_steps_per_second": 44.447, + "step": 418 + }, + { + "epoch": 19.045454545454547, + "grad_norm": 0.015916157513856888, + "learning_rate": 1.3894736842105263e-05, + "loss": 0.0018, + "step": 419 + }, + { + "epoch": 19.045454545454547, + "eval_loss": 0.0011776703177019954, + "eval_runtime": 0.2406, + "eval_samples_per_second": 365.71, + "eval_steps_per_second": 45.714, + "step": 419 + }, + { + "epoch": 19.09090909090909, + "grad_norm": 0.016425369307398796, + "learning_rate": 1.3855263157894737e-05, + "loss": 0.002, + "step": 420 + }, + { + "epoch": 19.09090909090909, + "eval_loss": 0.0011750170961022377, + "eval_runtime": 0.2379, + "eval_samples_per_second": 369.975, + "eval_steps_per_second": 46.247, + "step": 420 + }, + { + "epoch": 19.136363636363637, + "grad_norm": 0.017857089638710022, + "learning_rate": 1.3815789473684211e-05, + "loss": 0.0019, + "step": 421 + }, + { + "epoch": 19.136363636363637, + "eval_loss": 0.0011724097421392798, + "eval_runtime": 0.2504, + "eval_samples_per_second": 351.397, + "eval_steps_per_second": 43.925, + "step": 421 + }, + { + "epoch": 19.181818181818183, + "grad_norm": 0.01837003231048584, + "learning_rate": 1.3776315789473684e-05, + "loss": 0.0022, + "step": 422 + }, + { + "epoch": 19.181818181818183, + "eval_loss": 0.0011697578011080623, + "eval_runtime": 0.2585, + "eval_samples_per_second": 340.422, + "eval_steps_per_second": 42.553, + "step": 422 + }, + { + "epoch": 19.227272727272727, + "grad_norm": 0.019487086683511734, + "learning_rate": 1.3736842105263158e-05, + "loss": 0.0021, + "step": 423 + }, + { + "epoch": 19.227272727272727, + "eval_loss": 0.0011671868851408362, + "eval_runtime": 0.2398, + "eval_samples_per_second": 366.896, + "eval_steps_per_second": 45.862, + "step": 423 + }, + { + "epoch": 19.272727272727273, + "grad_norm": 0.016021518036723137, + "learning_rate": 1.369736842105263e-05, + "loss": 0.0019, + "step": 424 + }, + { + "epoch": 19.272727272727273, + "eval_loss": 0.001164758112281561, + "eval_runtime": 0.2642, + "eval_samples_per_second": 333.083, + "eval_steps_per_second": 41.635, + "step": 424 + }, + { + "epoch": 19.318181818181817, + "grad_norm": 0.018122289329767227, + "learning_rate": 1.3657894736842106e-05, + "loss": 0.0019, + "step": 425 + }, + { + "epoch": 19.318181818181817, + "eval_loss": 0.001162288710474968, + "eval_runtime": 0.2578, + "eval_samples_per_second": 341.316, + "eval_steps_per_second": 42.665, + "step": 425 + }, + { + "epoch": 19.363636363636363, + "grad_norm": 0.015892351046204567, + "learning_rate": 1.361842105263158e-05, + "loss": 0.0018, + "step": 426 + }, + { + "epoch": 19.363636363636363, + "eval_loss": 0.001159931649453938, + "eval_runtime": 0.2409, + "eval_samples_per_second": 365.291, + "eval_steps_per_second": 45.661, + "step": 426 + }, + { + "epoch": 19.40909090909091, + "grad_norm": 0.015699921175837517, + "learning_rate": 1.3578947368421053e-05, + "loss": 0.0019, + "step": 427 + }, + { + "epoch": 19.40909090909091, + "eval_loss": 0.0011575055541470647, + "eval_runtime": 0.2388, + "eval_samples_per_second": 368.523, + "eval_steps_per_second": 46.065, + "step": 427 + }, + { + "epoch": 19.454545454545453, + "grad_norm": 0.01474451832473278, + "learning_rate": 1.3539473684210527e-05, + "loss": 0.0017, + "step": 428 + }, + { + "epoch": 19.454545454545453, + "eval_loss": 0.001155222998932004, + "eval_runtime": 0.2408, + "eval_samples_per_second": 365.449, + "eval_steps_per_second": 45.681, + "step": 428 + }, + { + "epoch": 19.5, + "grad_norm": 0.016437875106930733, + "learning_rate": 1.3500000000000001e-05, + "loss": 0.0018, + "step": 429 + }, + { + "epoch": 19.5, + "eval_loss": 0.0011530268238857388, + "eval_runtime": 0.2325, + "eval_samples_per_second": 378.535, + "eval_steps_per_second": 47.317, + "step": 429 + }, + { + "epoch": 19.545454545454547, + "grad_norm": 0.01538484264165163, + "learning_rate": 1.3460526315789474e-05, + "loss": 0.0018, + "step": 430 + }, + { + "epoch": 19.545454545454547, + "eval_loss": 0.0011508835013955832, + "eval_runtime": 0.2309, + "eval_samples_per_second": 381.166, + "eval_steps_per_second": 47.646, + "step": 430 + }, + { + "epoch": 19.59090909090909, + "grad_norm": 0.017129214480519295, + "learning_rate": 1.3421052631578948e-05, + "loss": 0.0019, + "step": 431 + }, + { + "epoch": 19.59090909090909, + "eval_loss": 0.0011487645097076893, + "eval_runtime": 0.2362, + "eval_samples_per_second": 372.58, + "eval_steps_per_second": 46.573, + "step": 431 + }, + { + "epoch": 19.636363636363637, + "grad_norm": 0.016592320054769516, + "learning_rate": 1.3381578947368422e-05, + "loss": 0.0019, + "step": 432 + }, + { + "epoch": 19.636363636363637, + "eval_loss": 0.0011467835865914822, + "eval_runtime": 0.2418, + "eval_samples_per_second": 364.003, + "eval_steps_per_second": 45.5, + "step": 432 + }, + { + "epoch": 19.681818181818183, + "grad_norm": 0.018111824989318848, + "learning_rate": 1.3342105263157894e-05, + "loss": 0.0019, + "step": 433 + }, + { + "epoch": 19.681818181818183, + "eval_loss": 0.0011448581935837865, + "eval_runtime": 0.2437, + "eval_samples_per_second": 361.142, + "eval_steps_per_second": 45.143, + "step": 433 + }, + { + "epoch": 19.727272727272727, + "grad_norm": 0.01678645797073841, + "learning_rate": 1.3302631578947369e-05, + "loss": 0.0018, + "step": 434 + }, + { + "epoch": 19.727272727272727, + "eval_loss": 0.0011427812278270721, + "eval_runtime": 0.229, + "eval_samples_per_second": 384.254, + "eval_steps_per_second": 48.032, + "step": 434 + }, + { + "epoch": 19.772727272727273, + "grad_norm": 0.01921844109892845, + "learning_rate": 1.3263157894736843e-05, + "loss": 0.0021, + "step": 435 + }, + { + "epoch": 19.772727272727273, + "eval_loss": 0.0011407433776184916, + "eval_runtime": 0.24, + "eval_samples_per_second": 366.62, + "eval_steps_per_second": 45.828, + "step": 435 + }, + { + "epoch": 19.818181818181817, + "grad_norm": 0.01700635813176632, + "learning_rate": 1.3223684210526315e-05, + "loss": 0.0019, + "step": 436 + }, + { + "epoch": 19.818181818181817, + "eval_loss": 0.0011388043640181422, + "eval_runtime": 0.241, + "eval_samples_per_second": 365.22, + "eval_steps_per_second": 45.652, + "step": 436 + }, + { + "epoch": 19.863636363636363, + "grad_norm": 0.02139265649020672, + "learning_rate": 1.318421052631579e-05, + "loss": 0.0021, + "step": 437 + }, + { + "epoch": 19.863636363636363, + "eval_loss": 0.0011367396218702197, + "eval_runtime": 0.2327, + "eval_samples_per_second": 378.128, + "eval_steps_per_second": 47.266, + "step": 437 + }, + { + "epoch": 19.90909090909091, + "grad_norm": 0.016315054148435593, + "learning_rate": 1.3144736842105263e-05, + "loss": 0.0018, + "step": 438 + }, + { + "epoch": 19.90909090909091, + "eval_loss": 0.001134704565629363, + "eval_runtime": 0.243, + "eval_samples_per_second": 362.095, + "eval_steps_per_second": 45.262, + "step": 438 + }, + { + "epoch": 19.954545454545453, + "grad_norm": 0.015357021242380142, + "learning_rate": 1.3105263157894738e-05, + "loss": 0.0019, + "step": 439 + }, + { + "epoch": 19.954545454545453, + "eval_loss": 0.0011326519306749105, + "eval_runtime": 0.238, + "eval_samples_per_second": 369.798, + "eval_steps_per_second": 46.225, + "step": 439 + }, + { + "epoch": 20.0, + "grad_norm": 0.01644103042781353, + "learning_rate": 1.3065789473684212e-05, + "loss": 0.0019, + "step": 440 + }, + { + "epoch": 20.0, + "eval_loss": 0.0011306345695629716, + "eval_runtime": 0.2373, + "eval_samples_per_second": 370.835, + "eval_steps_per_second": 46.354, + "step": 440 + }, + { + "epoch": 20.045454545454547, + "grad_norm": 0.0168069489300251, + "learning_rate": 1.3026315789473684e-05, + "loss": 0.002, + "step": 441 + }, + { + "epoch": 20.045454545454547, + "eval_loss": 0.0011284599313512444, + "eval_runtime": 0.2305, + "eval_samples_per_second": 381.741, + "eval_steps_per_second": 47.718, + "step": 441 + }, + { + "epoch": 20.09090909090909, + "grad_norm": 0.015401924960315228, + "learning_rate": 1.2986842105263158e-05, + "loss": 0.0019, + "step": 442 + }, + { + "epoch": 20.09090909090909, + "eval_loss": 0.0011262963525950909, + "eval_runtime": 0.2364, + "eval_samples_per_second": 372.316, + "eval_steps_per_second": 46.54, + "step": 442 + }, + { + "epoch": 20.136363636363637, + "grad_norm": 0.019058704376220703, + "learning_rate": 1.2947368421052633e-05, + "loss": 0.0019, + "step": 443 + }, + { + "epoch": 20.136363636363637, + "eval_loss": 0.0011239717714488506, + "eval_runtime": 0.2383, + "eval_samples_per_second": 369.214, + "eval_steps_per_second": 46.152, + "step": 443 + }, + { + "epoch": 20.181818181818183, + "grad_norm": 0.018643731251358986, + "learning_rate": 1.2907894736842105e-05, + "loss": 0.0019, + "step": 444 + }, + { + "epoch": 20.181818181818183, + "eval_loss": 0.0011216469574719667, + "eval_runtime": 0.2413, + "eval_samples_per_second": 364.671, + "eval_steps_per_second": 45.584, + "step": 444 + }, + { + "epoch": 20.227272727272727, + "grad_norm": 0.018360739573836327, + "learning_rate": 1.2868421052631579e-05, + "loss": 0.002, + "step": 445 + }, + { + "epoch": 20.227272727272727, + "eval_loss": 0.0011192425154149532, + "eval_runtime": 0.2331, + "eval_samples_per_second": 377.473, + "eval_steps_per_second": 47.184, + "step": 445 + }, + { + "epoch": 20.272727272727273, + "grad_norm": 0.016574162989854813, + "learning_rate": 1.2828947368421053e-05, + "loss": 0.0019, + "step": 446 + }, + { + "epoch": 20.272727272727273, + "eval_loss": 0.001116919214837253, + "eval_runtime": 0.2433, + "eval_samples_per_second": 361.621, + "eval_steps_per_second": 45.203, + "step": 446 + }, + { + "epoch": 20.318181818181817, + "grad_norm": 0.01646783947944641, + "learning_rate": 1.2789473684210526e-05, + "loss": 0.0019, + "step": 447 + }, + { + "epoch": 20.318181818181817, + "eval_loss": 0.0011146310716867447, + "eval_runtime": 0.2514, + "eval_samples_per_second": 349.985, + "eval_steps_per_second": 43.748, + "step": 447 + }, + { + "epoch": 20.363636363636363, + "grad_norm": 0.017044425010681152, + "learning_rate": 1.275e-05, + "loss": 0.0018, + "step": 448 + }, + { + "epoch": 20.363636363636363, + "eval_loss": 0.0011123091680929065, + "eval_runtime": 0.253, + "eval_samples_per_second": 347.827, + "eval_steps_per_second": 43.478, + "step": 448 + }, + { + "epoch": 20.40909090909091, + "grad_norm": 0.017729461193084717, + "learning_rate": 1.2710526315789474e-05, + "loss": 0.0019, + "step": 449 + }, + { + "epoch": 20.40909090909091, + "eval_loss": 0.001110163051635027, + "eval_runtime": 0.2651, + "eval_samples_per_second": 331.944, + "eval_steps_per_second": 41.493, + "step": 449 + }, + { + "epoch": 20.454545454545453, + "grad_norm": 0.014911322854459286, + "learning_rate": 1.2671052631578947e-05, + "loss": 0.0017, + "step": 450 + }, + { + "epoch": 20.454545454545453, + "eval_loss": 0.0011080644326284528, + "eval_runtime": 0.2496, + "eval_samples_per_second": 352.625, + "eval_steps_per_second": 44.078, + "step": 450 + }, + { + "epoch": 20.5, + "grad_norm": 0.016675200313329697, + "learning_rate": 1.263157894736842e-05, + "loss": 0.0019, + "step": 451 + }, + { + "epoch": 20.5, + "eval_loss": 0.0011060454417020082, + "eval_runtime": 0.26, + "eval_samples_per_second": 338.446, + "eval_steps_per_second": 42.306, + "step": 451 + }, + { + "epoch": 20.545454545454547, + "grad_norm": 0.016018547117710114, + "learning_rate": 1.2592105263157895e-05, + "loss": 0.0018, + "step": 452 + }, + { + "epoch": 20.545454545454547, + "eval_loss": 0.0011039102682843804, + "eval_runtime": 0.2399, + "eval_samples_per_second": 366.846, + "eval_steps_per_second": 45.856, + "step": 452 + }, + { + "epoch": 20.59090909090909, + "grad_norm": 0.016912776976823807, + "learning_rate": 1.2552631578947369e-05, + "loss": 0.0019, + "step": 453 + }, + { + "epoch": 20.59090909090909, + "eval_loss": 0.0011017858050763607, + "eval_runtime": 0.2273, + "eval_samples_per_second": 387.134, + "eval_steps_per_second": 48.392, + "step": 453 + }, + { + "epoch": 20.636363636363637, + "grad_norm": 0.015879783779382706, + "learning_rate": 1.2513157894736843e-05, + "loss": 0.0018, + "step": 454 + }, + { + "epoch": 20.636363636363637, + "eval_loss": 0.0010996219934895635, + "eval_runtime": 0.2449, + "eval_samples_per_second": 359.378, + "eval_steps_per_second": 44.922, + "step": 454 + }, + { + "epoch": 20.681818181818183, + "grad_norm": 0.017021868377923965, + "learning_rate": 1.2473684210526317e-05, + "loss": 0.0019, + "step": 455 + }, + { + "epoch": 20.681818181818183, + "eval_loss": 0.0010973933385685086, + "eval_runtime": 0.229, + "eval_samples_per_second": 384.317, + "eval_steps_per_second": 48.04, + "step": 455 + }, + { + "epoch": 20.727272727272727, + "grad_norm": 0.015419513918459415, + "learning_rate": 1.243421052631579e-05, + "loss": 0.0019, + "step": 456 + }, + { + "epoch": 20.727272727272727, + "eval_loss": 0.001095130923204124, + "eval_runtime": 0.2362, + "eval_samples_per_second": 372.489, + "eval_steps_per_second": 46.561, + "step": 456 + }, + { + "epoch": 20.772727272727273, + "grad_norm": 0.01693497784435749, + "learning_rate": 1.2394736842105264e-05, + "loss": 0.0018, + "step": 457 + }, + { + "epoch": 20.772727272727273, + "eval_loss": 0.0010928618721663952, + "eval_runtime": 0.2233, + "eval_samples_per_second": 394.174, + "eval_steps_per_second": 49.272, + "step": 457 + }, + { + "epoch": 20.818181818181817, + "grad_norm": 0.017432473599910736, + "learning_rate": 1.2355263157894738e-05, + "loss": 0.0018, + "step": 458 + }, + { + "epoch": 20.818181818181817, + "eval_loss": 0.0010908265830948949, + "eval_runtime": 0.2275, + "eval_samples_per_second": 386.81, + "eval_steps_per_second": 48.351, + "step": 458 + }, + { + "epoch": 20.863636363636363, + "grad_norm": 0.014237020164728165, + "learning_rate": 1.231578947368421e-05, + "loss": 0.0016, + "step": 459 + }, + { + "epoch": 20.863636363636363, + "eval_loss": 0.0010887522948905826, + "eval_runtime": 0.236, + "eval_samples_per_second": 372.82, + "eval_steps_per_second": 46.603, + "step": 459 + }, + { + "epoch": 20.90909090909091, + "grad_norm": 0.016278453171253204, + "learning_rate": 1.2276315789473685e-05, + "loss": 0.0017, + "step": 460 + }, + { + "epoch": 20.90909090909091, + "eval_loss": 0.0010867157252505422, + "eval_runtime": 0.2288, + "eval_samples_per_second": 384.554, + "eval_steps_per_second": 48.069, + "step": 460 + }, + { + "epoch": 20.954545454545453, + "grad_norm": 0.01595933921635151, + "learning_rate": 1.2236842105263159e-05, + "loss": 0.0019, + "step": 461 + }, + { + "epoch": 20.954545454545453, + "eval_loss": 0.0010847292141988873, + "eval_runtime": 0.2252, + "eval_samples_per_second": 390.754, + "eval_steps_per_second": 48.844, + "step": 461 + }, + { + "epoch": 21.0, + "grad_norm": 0.017483873292803764, + "learning_rate": 1.2197368421052631e-05, + "loss": 0.0018, + "step": 462 + }, + { + "epoch": 21.0, + "eval_loss": 0.0010827549267560244, + "eval_runtime": 0.2236, + "eval_samples_per_second": 393.554, + "eval_steps_per_second": 49.194, + "step": 462 + }, + { + "epoch": 21.045454545454547, + "grad_norm": 0.01537961047142744, + "learning_rate": 1.2157894736842105e-05, + "loss": 0.0018, + "step": 463 + }, + { + "epoch": 21.045454545454547, + "eval_loss": 0.0010808442020788789, + "eval_runtime": 0.2361, + "eval_samples_per_second": 372.729, + "eval_steps_per_second": 46.591, + "step": 463 + }, + { + "epoch": 21.09090909090909, + "grad_norm": 0.015306917950510979, + "learning_rate": 1.2118421052631578e-05, + "loss": 0.0017, + "step": 464 + }, + { + "epoch": 21.09090909090909, + "eval_loss": 0.0010790039086714387, + "eval_runtime": 0.2298, + "eval_samples_per_second": 382.888, + "eval_steps_per_second": 47.861, + "step": 464 + }, + { + "epoch": 21.136363636363637, + "grad_norm": 0.013436819426715374, + "learning_rate": 1.2078947368421052e-05, + "loss": 0.0016, + "step": 465 + }, + { + "epoch": 21.136363636363637, + "eval_loss": 0.0010772122768685222, + "eval_runtime": 0.2421, + "eval_samples_per_second": 363.528, + "eval_steps_per_second": 45.441, + "step": 465 + }, + { + "epoch": 21.181818181818183, + "grad_norm": 0.016245294362306595, + "learning_rate": 1.2039473684210528e-05, + "loss": 0.0018, + "step": 466 + }, + { + "epoch": 21.181818181818183, + "eval_loss": 0.0010752826929092407, + "eval_runtime": 0.2313, + "eval_samples_per_second": 380.386, + "eval_steps_per_second": 47.548, + "step": 466 + }, + { + "epoch": 21.227272727272727, + "grad_norm": 0.015921350568532944, + "learning_rate": 1.2e-05, + "loss": 0.0017, + "step": 467 + }, + { + "epoch": 21.227272727272727, + "eval_loss": 0.0010733003728091717, + "eval_runtime": 0.2302, + "eval_samples_per_second": 382.349, + "eval_steps_per_second": 47.794, + "step": 467 + }, + { + "epoch": 21.272727272727273, + "grad_norm": 0.016333753243088722, + "learning_rate": 1.1960526315789474e-05, + "loss": 0.0018, + "step": 468 + }, + { + "epoch": 21.272727272727273, + "eval_loss": 0.0010712259681895375, + "eval_runtime": 0.2299, + "eval_samples_per_second": 382.824, + "eval_steps_per_second": 47.853, + "step": 468 + }, + { + "epoch": 21.318181818181817, + "grad_norm": 0.015542343258857727, + "learning_rate": 1.1921052631578949e-05, + "loss": 0.0017, + "step": 469 + }, + { + "epoch": 21.318181818181817, + "eval_loss": 0.0010691812494769692, + "eval_runtime": 0.2401, + "eval_samples_per_second": 366.569, + "eval_steps_per_second": 45.821, + "step": 469 + }, + { + "epoch": 21.363636363636363, + "grad_norm": 0.017036397010087967, + "learning_rate": 1.1881578947368421e-05, + "loss": 0.0019, + "step": 470 + }, + { + "epoch": 21.363636363636363, + "eval_loss": 0.0010671325726434588, + "eval_runtime": 0.2367, + "eval_samples_per_second": 371.749, + "eval_steps_per_second": 46.469, + "step": 470 + }, + { + "epoch": 21.40909090909091, + "grad_norm": 0.01621134579181671, + "learning_rate": 1.1842105263157895e-05, + "loss": 0.0018, + "step": 471 + }, + { + "epoch": 21.40909090909091, + "eval_loss": 0.0010652164928615093, + "eval_runtime": 0.2376, + "eval_samples_per_second": 370.382, + "eval_steps_per_second": 46.298, + "step": 471 + }, + { + "epoch": 21.454545454545453, + "grad_norm": 0.013604752719402313, + "learning_rate": 1.180263157894737e-05, + "loss": 0.0017, + "step": 472 + }, + { + "epoch": 21.454545454545453, + "eval_loss": 0.0010633313795551658, + "eval_runtime": 0.2408, + "eval_samples_per_second": 365.399, + "eval_steps_per_second": 45.675, + "step": 472 + }, + { + "epoch": 21.5, + "grad_norm": 0.014795001596212387, + "learning_rate": 1.1763157894736842e-05, + "loss": 0.0016, + "step": 473 + }, + { + "epoch": 21.5, + "eval_loss": 0.001061469316482544, + "eval_runtime": 0.2486, + "eval_samples_per_second": 354.0, + "eval_steps_per_second": 44.25, + "step": 473 + }, + { + "epoch": 21.545454545454547, + "grad_norm": 0.015267064794898033, + "learning_rate": 1.1723684210526316e-05, + "loss": 0.0018, + "step": 474 + }, + { + "epoch": 21.545454545454547, + "eval_loss": 0.0010596156353130937, + "eval_runtime": 0.2421, + "eval_samples_per_second": 363.419, + "eval_steps_per_second": 45.427, + "step": 474 + }, + { + "epoch": 21.59090909090909, + "grad_norm": 0.017209574580192566, + "learning_rate": 1.168421052631579e-05, + "loss": 0.0018, + "step": 475 + }, + { + "epoch": 21.59090909090909, + "eval_loss": 0.0010576344793662429, + "eval_runtime": 0.2464, + "eval_samples_per_second": 357.122, + "eval_steps_per_second": 44.64, + "step": 475 + }, + { + "epoch": 21.636363636363637, + "grad_norm": 0.0154210040345788, + "learning_rate": 1.1644736842105263e-05, + "loss": 0.0018, + "step": 476 + }, + { + "epoch": 21.636363636363637, + "eval_loss": 0.0010555870831012726, + "eval_runtime": 0.2538, + "eval_samples_per_second": 346.671, + "eval_steps_per_second": 43.334, + "step": 476 + }, + { + "epoch": 21.681818181818183, + "grad_norm": 0.017148546874523163, + "learning_rate": 1.1605263157894737e-05, + "loss": 0.0018, + "step": 477 + }, + { + "epoch": 21.681818181818183, + "eval_loss": 0.0010535044129937887, + "eval_runtime": 0.2437, + "eval_samples_per_second": 361.038, + "eval_steps_per_second": 45.13, + "step": 477 + }, + { + "epoch": 21.727272727272727, + "grad_norm": 0.01518462784588337, + "learning_rate": 1.1565789473684211e-05, + "loss": 0.0017, + "step": 478 + }, + { + "epoch": 21.727272727272727, + "eval_loss": 0.0010514232562854886, + "eval_runtime": 0.2402, + "eval_samples_per_second": 366.378, + "eval_steps_per_second": 45.797, + "step": 478 + }, + { + "epoch": 21.772727272727273, + "grad_norm": 0.01500785257667303, + "learning_rate": 1.1526315789473683e-05, + "loss": 0.0016, + "step": 479 + }, + { + "epoch": 21.772727272727273, + "eval_loss": 0.0010493744630366564, + "eval_runtime": 0.2449, + "eval_samples_per_second": 359.362, + "eval_steps_per_second": 44.92, + "step": 479 + }, + { + "epoch": 21.818181818181817, + "grad_norm": 0.015978703275322914, + "learning_rate": 1.148684210526316e-05, + "loss": 0.0018, + "step": 480 + }, + { + "epoch": 21.818181818181817, + "eval_loss": 0.0010474204318597913, + "eval_runtime": 0.2586, + "eval_samples_per_second": 340.345, + "eval_steps_per_second": 42.543, + "step": 480 + }, + { + "epoch": 21.863636363636363, + "grad_norm": 0.01765250600874424, + "learning_rate": 1.1447368421052632e-05, + "loss": 0.0017, + "step": 481 + }, + { + "epoch": 21.863636363636363, + "eval_loss": 0.0010454690782353282, + "eval_runtime": 0.2292, + "eval_samples_per_second": 383.999, + "eval_steps_per_second": 48.0, + "step": 481 + }, + { + "epoch": 21.90909090909091, + "grad_norm": 0.016576098278164864, + "learning_rate": 1.1407894736842106e-05, + "loss": 0.0017, + "step": 482 + }, + { + "epoch": 21.90909090909091, + "eval_loss": 0.0010435826843604445, + "eval_runtime": 0.2414, + "eval_samples_per_second": 364.501, + "eval_steps_per_second": 45.563, + "step": 482 + }, + { + "epoch": 21.954545454545453, + "grad_norm": 0.014276851899921894, + "learning_rate": 1.136842105263158e-05, + "loss": 0.0017, + "step": 483 + }, + { + "epoch": 21.954545454545453, + "eval_loss": 0.0010416691657155752, + "eval_runtime": 0.2241, + "eval_samples_per_second": 392.673, + "eval_steps_per_second": 49.084, + "step": 483 + }, + { + "epoch": 22.0, + "grad_norm": 0.01667684316635132, + "learning_rate": 1.1328947368421052e-05, + "loss": 0.0017, + "step": 484 + }, + { + "epoch": 22.0, + "eval_loss": 0.0010398232843726873, + "eval_runtime": 0.24, + "eval_samples_per_second": 366.592, + "eval_steps_per_second": 45.824, + "step": 484 + }, + { + "epoch": 22.045454545454547, + "grad_norm": 0.016187671571969986, + "learning_rate": 1.1289473684210527e-05, + "loss": 0.0018, + "step": 485 + }, + { + "epoch": 22.045454545454547, + "eval_loss": 0.0010379315353929996, + "eval_runtime": 0.2306, + "eval_samples_per_second": 381.551, + "eval_steps_per_second": 47.694, + "step": 485 + }, + { + "epoch": 22.09090909090909, + "grad_norm": 0.014743163250386715, + "learning_rate": 1.125e-05, + "loss": 0.0018, + "step": 486 + }, + { + "epoch": 22.09090909090909, + "eval_loss": 0.0010359951993450522, + "eval_runtime": 0.227, + "eval_samples_per_second": 387.598, + "eval_steps_per_second": 48.45, + "step": 486 + }, + { + "epoch": 22.136363636363637, + "grad_norm": 0.01694609597325325, + "learning_rate": 1.1210526315789473e-05, + "loss": 0.0017, + "step": 487 + }, + { + "epoch": 22.136363636363637, + "eval_loss": 0.0010341384913772345, + "eval_runtime": 0.2407, + "eval_samples_per_second": 365.633, + "eval_steps_per_second": 45.704, + "step": 487 + }, + { + "epoch": 22.181818181818183, + "grad_norm": 0.014260073192417622, + "learning_rate": 1.1171052631578947e-05, + "loss": 0.0017, + "step": 488 + }, + { + "epoch": 22.181818181818183, + "eval_loss": 0.0010322789894416928, + "eval_runtime": 0.2279, + "eval_samples_per_second": 386.189, + "eval_steps_per_second": 48.274, + "step": 488 + }, + { + "epoch": 22.227272727272727, + "grad_norm": 0.017539717257022858, + "learning_rate": 1.1131578947368421e-05, + "loss": 0.0016, + "step": 489 + }, + { + "epoch": 22.227272727272727, + "eval_loss": 0.001030544051900506, + "eval_runtime": 0.239, + "eval_samples_per_second": 368.276, + "eval_steps_per_second": 46.034, + "step": 489 + }, + { + "epoch": 22.272727272727273, + "grad_norm": 0.013456945307552814, + "learning_rate": 1.1092105263157894e-05, + "loss": 0.0016, + "step": 490 + }, + { + "epoch": 22.272727272727273, + "eval_loss": 0.0010288661578670144, + "eval_runtime": 0.2301, + "eval_samples_per_second": 382.513, + "eval_steps_per_second": 47.814, + "step": 490 + }, + { + "epoch": 22.318181818181817, + "grad_norm": 0.016474781557917595, + "learning_rate": 1.1052631578947368e-05, + "loss": 0.0017, + "step": 491 + }, + { + "epoch": 22.318181818181817, + "eval_loss": 0.0010273018851876259, + "eval_runtime": 0.235, + "eval_samples_per_second": 374.491, + "eval_steps_per_second": 46.811, + "step": 491 + }, + { + "epoch": 22.363636363636363, + "grad_norm": 0.01373574323952198, + "learning_rate": 1.1013157894736842e-05, + "loss": 0.0014, + "step": 492 + }, + { + "epoch": 22.363636363636363, + "eval_loss": 0.00102571165189147, + "eval_runtime": 0.2263, + "eval_samples_per_second": 388.813, + "eval_steps_per_second": 48.602, + "step": 492 + }, + { + "epoch": 22.40909090909091, + "grad_norm": 0.015442097559571266, + "learning_rate": 1.0973684210526316e-05, + "loss": 0.0016, + "step": 493 + }, + { + "epoch": 22.40909090909091, + "eval_loss": 0.0010241527343168855, + "eval_runtime": 0.2352, + "eval_samples_per_second": 374.081, + "eval_steps_per_second": 46.76, + "step": 493 + }, + { + "epoch": 22.454545454545453, + "grad_norm": 0.015592455863952637, + "learning_rate": 1.093421052631579e-05, + "loss": 0.0017, + "step": 494 + }, + { + "epoch": 22.454545454545453, + "eval_loss": 0.0010226276936009526, + "eval_runtime": 0.2373, + "eval_samples_per_second": 370.902, + "eval_steps_per_second": 46.363, + "step": 494 + }, + { + "epoch": 22.5, + "grad_norm": 0.013556539081037045, + "learning_rate": 1.0894736842105265e-05, + "loss": 0.0016, + "step": 495 + }, + { + "epoch": 22.5, + "eval_loss": 0.001021133502945304, + "eval_runtime": 0.2433, + "eval_samples_per_second": 361.732, + "eval_steps_per_second": 45.217, + "step": 495 + }, + { + "epoch": 22.545454545454547, + "grad_norm": 0.012894881889224052, + "learning_rate": 1.0855263157894737e-05, + "loss": 0.0016, + "step": 496 + }, + { + "epoch": 22.545454545454547, + "eval_loss": 0.0010197004303336143, + "eval_runtime": 0.2415, + "eval_samples_per_second": 364.331, + "eval_steps_per_second": 45.541, + "step": 496 + }, + { + "epoch": 22.59090909090909, + "grad_norm": 0.014628540724515915, + "learning_rate": 1.0815789473684211e-05, + "loss": 0.0017, + "step": 497 + }, + { + "epoch": 22.59090909090909, + "eval_loss": 0.0010182132245972753, + "eval_runtime": 0.2417, + "eval_samples_per_second": 364.047, + "eval_steps_per_second": 45.506, + "step": 497 + }, + { + "epoch": 22.636363636363637, + "grad_norm": 0.014721691608428955, + "learning_rate": 1.0776315789473685e-05, + "loss": 0.0017, + "step": 498 + }, + { + "epoch": 22.636363636363637, + "eval_loss": 0.0010166773572564125, + "eval_runtime": 0.2388, + "eval_samples_per_second": 368.522, + "eval_steps_per_second": 46.065, + "step": 498 + }, + { + "epoch": 22.681818181818183, + "grad_norm": 0.01576976478099823, + "learning_rate": 1.0736842105263158e-05, + "loss": 0.0018, + "step": 499 + }, + { + "epoch": 22.681818181818183, + "eval_loss": 0.001015029032714665, + "eval_runtime": 0.2308, + "eval_samples_per_second": 381.26, + "eval_steps_per_second": 47.657, + "step": 499 + }, + { + "epoch": 22.727272727272727, + "grad_norm": 0.015886450186371803, + "learning_rate": 1.0697368421052632e-05, + "loss": 0.0017, + "step": 500 + }, + { + "epoch": 22.727272727272727, + "eval_loss": 0.0010134456679224968, + "eval_runtime": 0.236, + "eval_samples_per_second": 372.817, + "eval_steps_per_second": 46.602, + "step": 500 + }, + { + "epoch": 22.772727272727273, + "grad_norm": 0.01687587983906269, + "learning_rate": 1.0657894736842106e-05, + "loss": 0.0017, + "step": 501 + }, + { + "epoch": 22.772727272727273, + "eval_loss": 0.0010118514765053988, + "eval_runtime": 0.2468, + "eval_samples_per_second": 356.526, + "eval_steps_per_second": 44.566, + "step": 501 + }, + { + "epoch": 22.818181818181817, + "grad_norm": 0.013874330557882786, + "learning_rate": 1.0618421052631579e-05, + "loss": 0.0016, + "step": 502 + }, + { + "epoch": 22.818181818181817, + "eval_loss": 0.0010103358654305339, + "eval_runtime": 0.2231, + "eval_samples_per_second": 394.376, + "eval_steps_per_second": 49.297, + "step": 502 + }, + { + "epoch": 22.863636363636363, + "grad_norm": 0.014864981174468994, + "learning_rate": 1.0578947368421053e-05, + "loss": 0.0017, + "step": 503 + }, + { + "epoch": 22.863636363636363, + "eval_loss": 0.001008835039101541, + "eval_runtime": 0.2399, + "eval_samples_per_second": 366.77, + "eval_steps_per_second": 45.846, + "step": 503 + }, + { + "epoch": 22.90909090909091, + "grad_norm": 0.013614412397146225, + "learning_rate": 1.0539473684210525e-05, + "loss": 0.0016, + "step": 504 + }, + { + "epoch": 22.90909090909091, + "eval_loss": 0.001007361221127212, + "eval_runtime": 0.2267, + "eval_samples_per_second": 388.143, + "eval_steps_per_second": 48.518, + "step": 504 + }, + { + "epoch": 22.954545454545453, + "grad_norm": 0.019395658746361732, + "learning_rate": 1.05e-05, + "loss": 0.0019, + "step": 505 + }, + { + "epoch": 22.954545454545453, + "eval_loss": 0.0010058052139356732, + "eval_runtime": 0.2757, + "eval_samples_per_second": 319.24, + "eval_steps_per_second": 39.905, + "step": 505 + }, + { + "epoch": 23.0, + "grad_norm": 0.017713138833642006, + "learning_rate": 1.0460526315789474e-05, + "loss": 0.0018, + "step": 506 + }, + { + "epoch": 23.0, + "eval_loss": 0.0010041649220511317, + "eval_runtime": 0.2305, + "eval_samples_per_second": 381.809, + "eval_steps_per_second": 47.726, + "step": 506 + }, + { + "epoch": 23.045454545454547, + "grad_norm": 0.014331554993987083, + "learning_rate": 1.0421052631578948e-05, + "loss": 0.0017, + "step": 507 + }, + { + "epoch": 23.045454545454547, + "eval_loss": 0.0010025816736742854, + "eval_runtime": 0.2325, + "eval_samples_per_second": 378.56, + "eval_steps_per_second": 47.32, + "step": 507 + }, + { + "epoch": 23.09090909090909, + "grad_norm": 0.014041769318282604, + "learning_rate": 1.0381578947368422e-05, + "loss": 0.0017, + "step": 508 + }, + { + "epoch": 23.09090909090909, + "eval_loss": 0.001001022639684379, + "eval_runtime": 0.2296, + "eval_samples_per_second": 383.301, + "eval_steps_per_second": 47.913, + "step": 508 + }, + { + "epoch": 23.136363636363637, + "grad_norm": 0.014782671816647053, + "learning_rate": 1.0342105263157896e-05, + "loss": 0.0017, + "step": 509 + }, + { + "epoch": 23.136363636363637, + "eval_loss": 0.0009995178552344441, + "eval_runtime": 0.2324, + "eval_samples_per_second": 378.719, + "eval_steps_per_second": 47.34, + "step": 509 + }, + { + "epoch": 23.181818181818183, + "grad_norm": 0.014820964075624943, + "learning_rate": 1.0302631578947368e-05, + "loss": 0.0017, + "step": 510 + }, + { + "epoch": 23.181818181818183, + "eval_loss": 0.0009979914175346494, + "eval_runtime": 0.2306, + "eval_samples_per_second": 381.609, + "eval_steps_per_second": 47.701, + "step": 510 + }, + { + "epoch": 23.227272727272727, + "grad_norm": 0.014552117325365543, + "learning_rate": 1.0263157894736843e-05, + "loss": 0.0017, + "step": 511 + }, + { + "epoch": 23.227272727272727, + "eval_loss": 0.0009964742930606008, + "eval_runtime": 0.2277, + "eval_samples_per_second": 386.477, + "eval_steps_per_second": 48.31, + "step": 511 + }, + { + "epoch": 23.272727272727273, + "grad_norm": 0.016575666144490242, + "learning_rate": 1.0223684210526317e-05, + "loss": 0.0018, + "step": 512 + }, + { + "epoch": 23.272727272727273, + "eval_loss": 0.0009949287632480264, + "eval_runtime": 0.2408, + "eval_samples_per_second": 365.446, + "eval_steps_per_second": 45.681, + "step": 512 + }, + { + "epoch": 23.318181818181817, + "grad_norm": 0.013247662223875523, + "learning_rate": 1.018421052631579e-05, + "loss": 0.0016, + "step": 513 + }, + { + "epoch": 23.318181818181817, + "eval_loss": 0.0009934090776368976, + "eval_runtime": 0.2312, + "eval_samples_per_second": 380.548, + "eval_steps_per_second": 47.569, + "step": 513 + }, + { + "epoch": 23.363636363636363, + "grad_norm": 0.014102768152952194, + "learning_rate": 1.0144736842105263e-05, + "loss": 0.0017, + "step": 514 + }, + { + "epoch": 23.363636363636363, + "eval_loss": 0.0009918283903971314, + "eval_runtime": 0.2315, + "eval_samples_per_second": 380.202, + "eval_steps_per_second": 47.525, + "step": 514 + }, + { + "epoch": 23.40909090909091, + "grad_norm": 0.015047273598611355, + "learning_rate": 1.0105263157894738e-05, + "loss": 0.0017, + "step": 515 + }, + { + "epoch": 23.40909090909091, + "eval_loss": 0.0009903222089633346, + "eval_runtime": 0.2308, + "eval_samples_per_second": 381.309, + "eval_steps_per_second": 47.664, + "step": 515 + }, + { + "epoch": 23.454545454545453, + "grad_norm": 0.016119079664349556, + "learning_rate": 1.006578947368421e-05, + "loss": 0.0018, + "step": 516 + }, + { + "epoch": 23.454545454545453, + "eval_loss": 0.0009887360502034426, + "eval_runtime": 0.2356, + "eval_samples_per_second": 373.467, + "eval_steps_per_second": 46.683, + "step": 516 + }, + { + "epoch": 23.5, + "grad_norm": 0.013055874034762383, + "learning_rate": 1.0026315789473684e-05, + "loss": 0.0015, + "step": 517 + }, + { + "epoch": 23.5, + "eval_loss": 0.0009872028604149818, + "eval_runtime": 0.2353, + "eval_samples_per_second": 373.918, + "eval_steps_per_second": 46.74, + "step": 517 + }, + { + "epoch": 23.545454545454547, + "grad_norm": 0.014796939678490162, + "learning_rate": 9.986842105263158e-06, + "loss": 0.0017, + "step": 518 + }, + { + "epoch": 23.545454545454547, + "eval_loss": 0.0009856532560661435, + "eval_runtime": 0.236, + "eval_samples_per_second": 372.816, + "eval_steps_per_second": 46.602, + "step": 518 + }, + { + "epoch": 23.59090909090909, + "grad_norm": 0.01749352179467678, + "learning_rate": 9.94736842105263e-06, + "loss": 0.0018, + "step": 519 + }, + { + "epoch": 23.59090909090909, + "eval_loss": 0.000984109123237431, + "eval_runtime": 0.2375, + "eval_samples_per_second": 370.519, + "eval_steps_per_second": 46.315, + "step": 519 + }, + { + "epoch": 23.636363636363637, + "grad_norm": 0.014436857774853706, + "learning_rate": 9.907894736842107e-06, + "loss": 0.0017, + "step": 520 + }, + { + "epoch": 23.636363636363637, + "eval_loss": 0.0009825569577515125, + "eval_runtime": 0.2329, + "eval_samples_per_second": 377.839, + "eval_steps_per_second": 47.23, + "step": 520 + }, + { + "epoch": 23.681818181818183, + "grad_norm": 0.0134369982406497, + "learning_rate": 9.868421052631579e-06, + "loss": 0.0015, + "step": 521 + }, + { + "epoch": 23.681818181818183, + "eval_loss": 0.0009810830233618617, + "eval_runtime": 0.2463, + "eval_samples_per_second": 357.352, + "eval_steps_per_second": 44.669, + "step": 521 + }, + { + "epoch": 23.727272727272727, + "grad_norm": 0.015284021385014057, + "learning_rate": 9.828947368421053e-06, + "loss": 0.0017, + "step": 522 + }, + { + "epoch": 23.727272727272727, + "eval_loss": 0.0009796229423955083, + "eval_runtime": 0.2303, + "eval_samples_per_second": 382.111, + "eval_steps_per_second": 47.764, + "step": 522 + }, + { + "epoch": 23.772727272727273, + "grad_norm": 0.01389851700514555, + "learning_rate": 9.789473684210527e-06, + "loss": 0.0016, + "step": 523 + }, + { + "epoch": 23.772727272727273, + "eval_loss": 0.0009782682172954082, + "eval_runtime": 0.2358, + "eval_samples_per_second": 373.188, + "eval_steps_per_second": 46.649, + "step": 523 + }, + { + "epoch": 23.818181818181817, + "grad_norm": 0.013064984232187271, + "learning_rate": 9.75e-06, + "loss": 0.0016, + "step": 524 + }, + { + "epoch": 23.818181818181817, + "eval_loss": 0.000976921641267836, + "eval_runtime": 0.4347, + "eval_samples_per_second": 202.42, + "eval_steps_per_second": 25.303, + "step": 524 + }, + { + "epoch": 23.863636363636363, + "grad_norm": 0.01853189431130886, + "learning_rate": 9.710526315789474e-06, + "loss": 0.0018, + "step": 525 + }, + { + "epoch": 23.863636363636363, + "eval_loss": 0.0009755737846717238, + "eval_runtime": 0.3094, + "eval_samples_per_second": 284.419, + "eval_steps_per_second": 35.552, + "step": 525 + }, + { + "epoch": 23.90909090909091, + "grad_norm": 0.015431704930961132, + "learning_rate": 9.671052631578948e-06, + "loss": 0.0016, + "step": 526 + }, + { + "epoch": 23.90909090909091, + "eval_loss": 0.0009742649854160845, + "eval_runtime": 0.3285, + "eval_samples_per_second": 267.881, + "eval_steps_per_second": 33.485, + "step": 526 + }, + { + "epoch": 23.954545454545453, + "grad_norm": 0.015396500937640667, + "learning_rate": 9.63157894736842e-06, + "loss": 0.0017, + "step": 527 + }, + { + "epoch": 23.954545454545453, + "eval_loss": 0.0009728847653605044, + "eval_runtime": 0.379, + "eval_samples_per_second": 232.167, + "eval_steps_per_second": 29.021, + "step": 527 + }, + { + "epoch": 24.0, + "grad_norm": 0.018940720707178116, + "learning_rate": 9.592105263157895e-06, + "loss": 0.0018, + "step": 528 + }, + { + "epoch": 24.0, + "eval_loss": 0.0009714543703012168, + "eval_runtime": 0.4121, + "eval_samples_per_second": 213.562, + "eval_steps_per_second": 26.695, + "step": 528 + }, + { + "epoch": 24.045454545454547, + "grad_norm": 0.013447549194097519, + "learning_rate": 9.552631578947369e-06, + "loss": 0.0016, + "step": 529 + }, + { + "epoch": 24.045454545454547, + "eval_loss": 0.0009699968504719436, + "eval_runtime": 0.4871, + "eval_samples_per_second": 180.65, + "eval_steps_per_second": 22.581, + "step": 529 + }, + { + "epoch": 24.09090909090909, + "grad_norm": 0.01361093670129776, + "learning_rate": 9.513157894736841e-06, + "loss": 0.0016, + "step": 530 + }, + { + "epoch": 24.09090909090909, + "eval_loss": 0.0009685555123724043, + "eval_runtime": 0.4944, + "eval_samples_per_second": 178.009, + "eval_steps_per_second": 22.251, + "step": 530 + }, + { + "epoch": 24.136363636363637, + "grad_norm": 0.014719787985086441, + "learning_rate": 9.473684210526315e-06, + "loss": 0.0016, + "step": 531 + }, + { + "epoch": 24.136363636363637, + "eval_loss": 0.0009670979925431311, + "eval_runtime": 0.3472, + "eval_samples_per_second": 253.483, + "eval_steps_per_second": 31.685, + "step": 531 + }, + { + "epoch": 24.181818181818183, + "grad_norm": 0.01682870462536812, + "learning_rate": 9.43421052631579e-06, + "loss": 0.0018, + "step": 532 + }, + { + "epoch": 24.181818181818183, + "eval_loss": 0.0009655930334702134, + "eval_runtime": 0.2295, + "eval_samples_per_second": 383.473, + "eval_steps_per_second": 47.934, + "step": 532 + }, + { + "epoch": 24.227272727272727, + "grad_norm": 0.015661459416151047, + "learning_rate": 9.394736842105262e-06, + "loss": 0.0016, + "step": 533 + }, + { + "epoch": 24.227272727272727, + "eval_loss": 0.0009641083306632936, + "eval_runtime": 0.247, + "eval_samples_per_second": 356.243, + "eval_steps_per_second": 44.53, + "step": 533 + }, + { + "epoch": 24.272727272727273, + "grad_norm": 0.015652479603886604, + "learning_rate": 9.355263157894738e-06, + "loss": 0.0016, + "step": 534 + }, + { + "epoch": 24.272727272727273, + "eval_loss": 0.0009626846294850111, + "eval_runtime": 0.2337, + "eval_samples_per_second": 376.608, + "eval_steps_per_second": 47.076, + "step": 534 + }, + { + "epoch": 24.318181818181817, + "grad_norm": 0.013394070789217949, + "learning_rate": 9.315789473684212e-06, + "loss": 0.0016, + "step": 535 + }, + { + "epoch": 24.318181818181817, + "eval_loss": 0.0009613109868951142, + "eval_runtime": 0.2315, + "eval_samples_per_second": 380.202, + "eval_steps_per_second": 47.525, + "step": 535 + }, + { + "epoch": 24.363636363636363, + "grad_norm": 0.015152989886701107, + "learning_rate": 9.276315789473685e-06, + "loss": 0.0016, + "step": 536 + }, + { + "epoch": 24.363636363636363, + "eval_loss": 0.0009599780314601958, + "eval_runtime": 0.2373, + "eval_samples_per_second": 370.835, + "eval_steps_per_second": 46.354, + "step": 536 + }, + { + "epoch": 24.40909090909091, + "grad_norm": 0.014209273271262646, + "learning_rate": 9.236842105263159e-06, + "loss": 0.0016, + "step": 537 + }, + { + "epoch": 24.40909090909091, + "eval_loss": 0.0009586341911926866, + "eval_runtime": 0.2342, + "eval_samples_per_second": 375.816, + "eval_steps_per_second": 46.977, + "step": 537 + }, + { + "epoch": 24.454545454545453, + "grad_norm": 0.014566083438694477, + "learning_rate": 9.197368421052633e-06, + "loss": 0.0015, + "step": 538 + }, + { + "epoch": 24.454545454545453, + "eval_loss": 0.000957344425842166, + "eval_runtime": 0.2373, + "eval_samples_per_second": 370.82, + "eval_steps_per_second": 46.352, + "step": 538 + }, + { + "epoch": 24.5, + "grad_norm": 0.016195589676499367, + "learning_rate": 9.157894736842105e-06, + "loss": 0.0017, + "step": 539 + }, + { + "epoch": 24.5, + "eval_loss": 0.0009560330072417855, + "eval_runtime": 0.2313, + "eval_samples_per_second": 380.382, + "eval_steps_per_second": 47.548, + "step": 539 + }, + { + "epoch": 24.545454545454547, + "grad_norm": 0.01577996276319027, + "learning_rate": 9.11842105263158e-06, + "loss": 0.0017, + "step": 540 + }, + { + "epoch": 24.545454545454547, + "eval_loss": 0.0009547690278850496, + "eval_runtime": 0.2288, + "eval_samples_per_second": 384.628, + "eval_steps_per_second": 48.079, + "step": 540 + }, + { + "epoch": 24.59090909090909, + "grad_norm": 0.013901899568736553, + "learning_rate": 9.078947368421054e-06, + "loss": 0.0015, + "step": 541 + }, + { + "epoch": 24.59090909090909, + "eval_loss": 0.0009535103454254568, + "eval_runtime": 0.2351, + "eval_samples_per_second": 374.379, + "eval_steps_per_second": 46.797, + "step": 541 + }, + { + "epoch": 24.636363636363637, + "grad_norm": 0.014091338962316513, + "learning_rate": 9.039473684210526e-06, + "loss": 0.0016, + "step": 542 + }, + { + "epoch": 24.636363636363637, + "eval_loss": 0.0009522747131995857, + "eval_runtime": 0.2274, + "eval_samples_per_second": 387.015, + "eval_steps_per_second": 48.377, + "step": 542 + }, + { + "epoch": 24.681818181818183, + "grad_norm": 0.014544407837092876, + "learning_rate": 9e-06, + "loss": 0.0017, + "step": 543 + }, + { + "epoch": 24.681818181818183, + "eval_loss": 0.0009510606760159135, + "eval_runtime": 0.2442, + "eval_samples_per_second": 360.336, + "eval_steps_per_second": 45.042, + "step": 543 + }, + { + "epoch": 24.727272727272727, + "grad_norm": 0.01616845279932022, + "learning_rate": 8.960526315789473e-06, + "loss": 0.0017, + "step": 544 + }, + { + "epoch": 24.727272727272727, + "eval_loss": 0.0009498685249127448, + "eval_runtime": 0.2388, + "eval_samples_per_second": 368.514, + "eval_steps_per_second": 46.064, + "step": 544 + }, + { + "epoch": 24.772727272727273, + "grad_norm": 0.01609298586845398, + "learning_rate": 8.921052631578947e-06, + "loss": 0.0017, + "step": 545 + }, + { + "epoch": 24.772727272727273, + "eval_loss": 0.0009486477356404066, + "eval_runtime": 0.2287, + "eval_samples_per_second": 384.803, + "eval_steps_per_second": 48.1, + "step": 545 + }, + { + "epoch": 24.818181818181817, + "grad_norm": 0.013633071444928646, + "learning_rate": 8.881578947368421e-06, + "loss": 0.0016, + "step": 546 + }, + { + "epoch": 24.818181818181817, + "eval_loss": 0.0009474134421907365, + "eval_runtime": 0.2393, + "eval_samples_per_second": 367.684, + "eval_steps_per_second": 45.96, + "step": 546 + }, + { + "epoch": 24.863636363636363, + "grad_norm": 0.013738269917666912, + "learning_rate": 8.842105263157893e-06, + "loss": 0.0016, + "step": 547 + }, + { + "epoch": 24.863636363636363, + "eval_loss": 0.0009461792069487274, + "eval_runtime": 0.2312, + "eval_samples_per_second": 380.637, + "eval_steps_per_second": 47.58, + "step": 547 + }, + { + "epoch": 24.90909090909091, + "grad_norm": 0.013620936311781406, + "learning_rate": 8.80263157894737e-06, + "loss": 0.0015, + "step": 548 + }, + { + "epoch": 24.90909090909091, + "eval_loss": 0.0009449638891965151, + "eval_runtime": 0.2459, + "eval_samples_per_second": 357.891, + "eval_steps_per_second": 44.736, + "step": 548 + }, + { + "epoch": 24.954545454545453, + "grad_norm": 0.015967663377523422, + "learning_rate": 8.763157894736843e-06, + "loss": 0.0017, + "step": 549 + }, + { + "epoch": 24.954545454545453, + "eval_loss": 0.0009437742992304265, + "eval_runtime": 0.239, + "eval_samples_per_second": 368.142, + "eval_steps_per_second": 46.018, + "step": 549 + }, + { + "epoch": 25.0, + "grad_norm": 0.012870087288320065, + "learning_rate": 8.723684210526316e-06, + "loss": 0.0015, + "step": 550 + }, + { + "epoch": 25.0, + "eval_loss": 0.0009425426251254976, + "eval_runtime": 0.2335, + "eval_samples_per_second": 376.798, + "eval_steps_per_second": 47.1, + "step": 550 + }, + { + "epoch": 25.045454545454547, + "grad_norm": 0.012893461622297764, + "learning_rate": 8.68421052631579e-06, + "loss": 0.0015, + "step": 551 + }, + { + "epoch": 25.045454545454547, + "eval_loss": 0.0009413667139597237, + "eval_runtime": 0.236, + "eval_samples_per_second": 372.836, + "eval_steps_per_second": 46.605, + "step": 551 + }, + { + "epoch": 25.09090909090909, + "grad_norm": 0.014959870837628841, + "learning_rate": 8.644736842105264e-06, + "loss": 0.0016, + "step": 552 + }, + { + "epoch": 25.09090909090909, + "eval_loss": 0.0009402299183420837, + "eval_runtime": 0.2482, + "eval_samples_per_second": 354.624, + "eval_steps_per_second": 44.328, + "step": 552 + }, + { + "epoch": 25.136363636363637, + "grad_norm": 0.01649138703942299, + "learning_rate": 8.605263157894737e-06, + "loss": 0.0017, + "step": 553 + }, + { + "epoch": 25.136363636363637, + "eval_loss": 0.0009390347986482084, + "eval_runtime": 0.2599, + "eval_samples_per_second": 338.554, + "eval_steps_per_second": 42.319, + "step": 553 + }, + { + "epoch": 25.181818181818183, + "grad_norm": 0.01470938976854086, + "learning_rate": 8.56578947368421e-06, + "loss": 0.0016, + "step": 554 + }, + { + "epoch": 25.181818181818183, + "eval_loss": 0.0009378465474583209, + "eval_runtime": 0.2574, + "eval_samples_per_second": 341.926, + "eval_steps_per_second": 42.741, + "step": 554 + }, + { + "epoch": 25.227272727272727, + "grad_norm": 0.011589915491640568, + "learning_rate": 8.526315789473685e-06, + "loss": 0.0014, + "step": 555 + }, + { + "epoch": 25.227272727272727, + "eval_loss": 0.000936675991397351, + "eval_runtime": 0.2348, + "eval_samples_per_second": 374.714, + "eval_steps_per_second": 46.839, + "step": 555 + }, + { + "epoch": 25.272727272727273, + "grad_norm": 0.012033880688250065, + "learning_rate": 8.486842105263157e-06, + "loss": 0.0014, + "step": 556 + }, + { + "epoch": 25.272727272727273, + "eval_loss": 0.0009355823858641088, + "eval_runtime": 0.2479, + "eval_samples_per_second": 354.912, + "eval_steps_per_second": 44.364, + "step": 556 + }, + { + "epoch": 25.318181818181817, + "grad_norm": 0.012967276386916637, + "learning_rate": 8.447368421052632e-06, + "loss": 0.0016, + "step": 557 + }, + { + "epoch": 25.318181818181817, + "eval_loss": 0.0009344658465124667, + "eval_runtime": 0.2455, + "eval_samples_per_second": 358.387, + "eval_steps_per_second": 44.798, + "step": 557 + }, + { + "epoch": 25.363636363636363, + "grad_norm": 0.01223038136959076, + "learning_rate": 8.407894736842106e-06, + "loss": 0.0015, + "step": 558 + }, + { + "epoch": 25.363636363636363, + "eval_loss": 0.0009333452326245606, + "eval_runtime": 0.2906, + "eval_samples_per_second": 302.832, + "eval_steps_per_second": 37.854, + "step": 558 + }, + { + "epoch": 25.40909090909091, + "grad_norm": 0.015218369662761688, + "learning_rate": 8.368421052631578e-06, + "loss": 0.0016, + "step": 559 + }, + { + "epoch": 25.40909090909091, + "eval_loss": 0.0009322408004663885, + "eval_runtime": 0.2272, + "eval_samples_per_second": 387.247, + "eval_steps_per_second": 48.406, + "step": 559 + }, + { + "epoch": 25.454545454545453, + "grad_norm": 0.015988919883966446, + "learning_rate": 8.328947368421052e-06, + "loss": 0.0016, + "step": 560 + }, + { + "epoch": 25.454545454545453, + "eval_loss": 0.0009310647728852928, + "eval_runtime": 0.2299, + "eval_samples_per_second": 382.796, + "eval_steps_per_second": 47.85, + "step": 560 + }, + { + "epoch": 25.5, + "grad_norm": 0.012890150770545006, + "learning_rate": 8.289473684210526e-06, + "loss": 0.0015, + "step": 561 + }, + { + "epoch": 25.5, + "eval_loss": 0.0009298656368628144, + "eval_runtime": 0.2335, + "eval_samples_per_second": 376.874, + "eval_steps_per_second": 47.109, + "step": 561 + }, + { + "epoch": 25.545454545454547, + "grad_norm": 0.013084178790450096, + "learning_rate": 8.25e-06, + "loss": 0.0016, + "step": 562 + }, + { + "epoch": 25.545454545454547, + "eval_loss": 0.0009286908898502588, + "eval_runtime": 0.2286, + "eval_samples_per_second": 384.978, + "eval_steps_per_second": 48.122, + "step": 562 + }, + { + "epoch": 25.59090909090909, + "grad_norm": 0.01568671688437462, + "learning_rate": 8.210526315789475e-06, + "loss": 0.0018, + "step": 563 + }, + { + "epoch": 25.59090909090909, + "eval_loss": 0.0009274999029003084, + "eval_runtime": 0.2258, + "eval_samples_per_second": 389.702, + "eval_steps_per_second": 48.713, + "step": 563 + }, + { + "epoch": 25.636363636363637, + "grad_norm": 0.012654740363359451, + "learning_rate": 8.171052631578947e-06, + "loss": 0.0014, + "step": 564 + }, + { + "epoch": 25.636363636363637, + "eval_loss": 0.0009263442480005324, + "eval_runtime": 0.2297, + "eval_samples_per_second": 383.078, + "eval_steps_per_second": 47.885, + "step": 564 + }, + { + "epoch": 25.681818181818183, + "grad_norm": 0.014308282174170017, + "learning_rate": 8.131578947368421e-06, + "loss": 0.0016, + "step": 565 + }, + { + "epoch": 25.681818181818183, + "eval_loss": 0.0009251585579477251, + "eval_runtime": 0.2407, + "eval_samples_per_second": 365.643, + "eval_steps_per_second": 45.705, + "step": 565 + }, + { + "epoch": 25.727272727272727, + "grad_norm": 0.013645520433783531, + "learning_rate": 8.092105263157896e-06, + "loss": 0.0016, + "step": 566 + }, + { + "epoch": 25.727272727272727, + "eval_loss": 0.000924033869523555, + "eval_runtime": 0.2295, + "eval_samples_per_second": 383.49, + "eval_steps_per_second": 47.936, + "step": 566 + }, + { + "epoch": 25.772727272727273, + "grad_norm": 0.013325618579983711, + "learning_rate": 8.052631578947368e-06, + "loss": 0.0016, + "step": 567 + }, + { + "epoch": 25.772727272727273, + "eval_loss": 0.0009229186689481139, + "eval_runtime": 0.2286, + "eval_samples_per_second": 384.951, + "eval_steps_per_second": 48.119, + "step": 567 + }, + { + "epoch": 25.818181818181817, + "grad_norm": 0.013046055100858212, + "learning_rate": 8.013157894736842e-06, + "loss": 0.0015, + "step": 568 + }, + { + "epoch": 25.818181818181817, + "eval_loss": 0.0009218386840075254, + "eval_runtime": 0.2278, + "eval_samples_per_second": 386.339, + "eval_steps_per_second": 48.292, + "step": 568 + }, + { + "epoch": 25.863636363636363, + "grad_norm": 0.014013804495334625, + "learning_rate": 7.973684210526316e-06, + "loss": 0.0015, + "step": 569 + }, + { + "epoch": 25.863636363636363, + "eval_loss": 0.0009208493283949792, + "eval_runtime": 0.239, + "eval_samples_per_second": 368.218, + "eval_steps_per_second": 46.027, + "step": 569 + }, + { + "epoch": 25.90909090909091, + "grad_norm": 0.014438400976359844, + "learning_rate": 7.934210526315789e-06, + "loss": 0.0016, + "step": 570 + }, + { + "epoch": 25.90909090909091, + "eval_loss": 0.0009198287734761834, + "eval_runtime": 0.2403, + "eval_samples_per_second": 366.205, + "eval_steps_per_second": 45.776, + "step": 570 + }, + { + "epoch": 25.954545454545453, + "grad_norm": 0.013837904669344425, + "learning_rate": 7.894736842105263e-06, + "loss": 0.0016, + "step": 571 + }, + { + "epoch": 25.954545454545453, + "eval_loss": 0.0009188164258375764, + "eval_runtime": 0.2295, + "eval_samples_per_second": 383.499, + "eval_steps_per_second": 47.937, + "step": 571 + }, + { + "epoch": 26.0, + "grad_norm": 0.014442033134400845, + "learning_rate": 7.855263157894737e-06, + "loss": 0.0015, + "step": 572 + }, + { + "epoch": 26.0, + "eval_loss": 0.0009178462787531316, + "eval_runtime": 0.2369, + "eval_samples_per_second": 371.428, + "eval_steps_per_second": 46.428, + "step": 572 + }, + { + "epoch": 26.045454545454547, + "grad_norm": 0.01597905345261097, + "learning_rate": 7.81578947368421e-06, + "loss": 0.0016, + "step": 573 + }, + { + "epoch": 26.045454545454547, + "eval_loss": 0.000916794640943408, + "eval_runtime": 0.2272, + "eval_samples_per_second": 387.243, + "eval_steps_per_second": 48.405, + "step": 573 + }, + { + "epoch": 26.09090909090909, + "grad_norm": 0.014845073223114014, + "learning_rate": 7.776315789473684e-06, + "loss": 0.0016, + "step": 574 + }, + { + "epoch": 26.09090909090909, + "eval_loss": 0.0009157375898212194, + "eval_runtime": 0.2356, + "eval_samples_per_second": 373.503, + "eval_steps_per_second": 46.688, + "step": 574 + }, + { + "epoch": 26.136363636363637, + "grad_norm": 0.016282513737678528, + "learning_rate": 7.73684210526316e-06, + "loss": 0.0016, + "step": 575 + }, + { + "epoch": 26.136363636363637, + "eval_loss": 0.0009147171513177454, + "eval_runtime": 0.232, + "eval_samples_per_second": 379.38, + "eval_steps_per_second": 47.422, + "step": 575 + }, + { + "epoch": 26.181818181818183, + "grad_norm": 0.01518057007342577, + "learning_rate": 7.697368421052632e-06, + "loss": 0.0016, + "step": 576 + }, + { + "epoch": 26.181818181818183, + "eval_loss": 0.0009137062006630003, + "eval_runtime": 0.2426, + "eval_samples_per_second": 362.715, + "eval_steps_per_second": 45.339, + "step": 576 + }, + { + "epoch": 26.227272727272727, + "grad_norm": 0.014094051904976368, + "learning_rate": 7.657894736842106e-06, + "loss": 0.0016, + "step": 577 + }, + { + "epoch": 26.227272727272727, + "eval_loss": 0.0009126991499215364, + "eval_runtime": 0.2293, + "eval_samples_per_second": 383.817, + "eval_steps_per_second": 47.977, + "step": 577 + }, + { + "epoch": 26.272727272727273, + "grad_norm": 0.013502271845936775, + "learning_rate": 7.6184210526315794e-06, + "loss": 0.0015, + "step": 578 + }, + { + "epoch": 26.272727272727273, + "eval_loss": 0.0009116692817769945, + "eval_runtime": 0.2603, + "eval_samples_per_second": 338.068, + "eval_steps_per_second": 42.258, + "step": 578 + }, + { + "epoch": 26.318181818181817, + "grad_norm": 0.01577981747686863, + "learning_rate": 7.578947368421053e-06, + "loss": 0.0016, + "step": 579 + }, + { + "epoch": 26.318181818181817, + "eval_loss": 0.0009106568759307265, + "eval_runtime": 0.2284, + "eval_samples_per_second": 385.212, + "eval_steps_per_second": 48.151, + "step": 579 + }, + { + "epoch": 26.363636363636363, + "grad_norm": 0.013350007124245167, + "learning_rate": 7.539473684210527e-06, + "loss": 0.0016, + "step": 580 + }, + { + "epoch": 26.363636363636363, + "eval_loss": 0.0009096513967961073, + "eval_runtime": 0.251, + "eval_samples_per_second": 350.661, + "eval_steps_per_second": 43.833, + "step": 580 + }, + { + "epoch": 26.40909090909091, + "grad_norm": 0.013078941963613033, + "learning_rate": 7.5e-06, + "loss": 0.0014, + "step": 581 + }, + { + "epoch": 26.40909090909091, + "eval_loss": 0.000908670190256089, + "eval_runtime": 0.2388, + "eval_samples_per_second": 368.458, + "eval_steps_per_second": 46.057, + "step": 581 + }, + { + "epoch": 26.454545454545453, + "grad_norm": 0.013791137374937534, + "learning_rate": 7.4605263157894735e-06, + "loss": 0.0015, + "step": 582 + }, + { + "epoch": 26.454545454545453, + "eval_loss": 0.000907672569155693, + "eval_runtime": 0.242, + "eval_samples_per_second": 363.581, + "eval_steps_per_second": 45.448, + "step": 582 + }, + { + "epoch": 26.5, + "grad_norm": 0.015615719370543957, + "learning_rate": 7.421052631578948e-06, + "loss": 0.0017, + "step": 583 + }, + { + "epoch": 26.5, + "eval_loss": 0.0009066305938176811, + "eval_runtime": 0.2567, + "eval_samples_per_second": 342.844, + "eval_steps_per_second": 42.856, + "step": 583 + }, + { + "epoch": 26.545454545454547, + "grad_norm": 0.015224572271108627, + "learning_rate": 7.381578947368421e-06, + "loss": 0.0016, + "step": 584 + }, + { + "epoch": 26.545454545454547, + "eval_loss": 0.000905528839211911, + "eval_runtime": 0.2456, + "eval_samples_per_second": 358.266, + "eval_steps_per_second": 44.783, + "step": 584 + }, + { + "epoch": 26.59090909090909, + "grad_norm": 0.015507878735661507, + "learning_rate": 7.342105263157895e-06, + "loss": 0.0016, + "step": 585 + }, + { + "epoch": 26.59090909090909, + "eval_loss": 0.0009044149774126709, + "eval_runtime": 0.2492, + "eval_samples_per_second": 353.074, + "eval_steps_per_second": 44.134, + "step": 585 + }, + { + "epoch": 26.636363636363637, + "grad_norm": 0.012780736200511456, + "learning_rate": 7.302631578947368e-06, + "loss": 0.0015, + "step": 586 + }, + { + "epoch": 26.636363636363637, + "eval_loss": 0.0009033335372805595, + "eval_runtime": 0.2468, + "eval_samples_per_second": 356.58, + "eval_steps_per_second": 44.572, + "step": 586 + }, + { + "epoch": 26.681818181818183, + "grad_norm": 0.014048571698367596, + "learning_rate": 7.2631578947368426e-06, + "loss": 0.0015, + "step": 587 + }, + { + "epoch": 26.681818181818183, + "eval_loss": 0.0009022265439853072, + "eval_runtime": 0.2552, + "eval_samples_per_second": 344.851, + "eval_steps_per_second": 43.106, + "step": 587 + }, + { + "epoch": 26.727272727272727, + "grad_norm": 0.015583625994622707, + "learning_rate": 7.223684210526316e-06, + "loss": 0.0017, + "step": 588 + }, + { + "epoch": 26.727272727272727, + "eval_loss": 0.0009011449874378741, + "eval_runtime": 0.2308, + "eval_samples_per_second": 381.278, + "eval_steps_per_second": 47.66, + "step": 588 + }, + { + "epoch": 26.772727272727273, + "grad_norm": 0.01401633583009243, + "learning_rate": 7.184210526315789e-06, + "loss": 0.0015, + "step": 589 + }, + { + "epoch": 26.772727272727273, + "eval_loss": 0.0009001016733236611, + "eval_runtime": 0.2374, + "eval_samples_per_second": 370.679, + "eval_steps_per_second": 46.335, + "step": 589 + }, + { + "epoch": 26.818181818181817, + "grad_norm": 0.01262589916586876, + "learning_rate": 7.144736842105263e-06, + "loss": 0.0015, + "step": 590 + }, + { + "epoch": 26.818181818181817, + "eval_loss": 0.0008990716305561364, + "eval_runtime": 0.2399, + "eval_samples_per_second": 366.822, + "eval_steps_per_second": 45.853, + "step": 590 + }, + { + "epoch": 26.863636363636363, + "grad_norm": 0.015306267887353897, + "learning_rate": 7.105263157894737e-06, + "loss": 0.0016, + "step": 591 + }, + { + "epoch": 26.863636363636363, + "eval_loss": 0.0008980457205325365, + "eval_runtime": 0.2286, + "eval_samples_per_second": 385.033, + "eval_steps_per_second": 48.129, + "step": 591 + }, + { + "epoch": 26.90909090909091, + "grad_norm": 0.014178605750203133, + "learning_rate": 7.065789473684211e-06, + "loss": 0.0016, + "step": 592 + }, + { + "epoch": 26.90909090909091, + "eval_loss": 0.0008970522903837264, + "eval_runtime": 0.2364, + "eval_samples_per_second": 372.229, + "eval_steps_per_second": 46.529, + "step": 592 + }, + { + "epoch": 26.954545454545453, + "grad_norm": 0.013244709931313992, + "learning_rate": 7.026315789473685e-06, + "loss": 0.0016, + "step": 593 + }, + { + "epoch": 26.954545454545453, + "eval_loss": 0.0008960642153397202, + "eval_runtime": 0.2462, + "eval_samples_per_second": 357.44, + "eval_steps_per_second": 44.68, + "step": 593 + }, + { + "epoch": 27.0, + "grad_norm": 0.012383348308503628, + "learning_rate": 6.986842105263158e-06, + "loss": 0.0014, + "step": 594 + }, + { + "epoch": 27.0, + "eval_loss": 0.0008951277122832835, + "eval_runtime": 0.2326, + "eval_samples_per_second": 378.306, + "eval_steps_per_second": 47.288, + "step": 594 + }, + { + "epoch": 27.045454545454547, + "grad_norm": 0.011418252252042294, + "learning_rate": 6.9473684210526315e-06, + "loss": 0.0014, + "step": 595 + }, + { + "epoch": 27.045454545454547, + "eval_loss": 0.0008942168205976486, + "eval_runtime": 0.2431, + "eval_samples_per_second": 362.037, + "eval_steps_per_second": 45.255, + "step": 595 + }, + { + "epoch": 27.09090909090909, + "grad_norm": 0.013398653827607632, + "learning_rate": 6.907894736842106e-06, + "loss": 0.0014, + "step": 596 + }, + { + "epoch": 27.09090909090909, + "eval_loss": 0.0008933371282182634, + "eval_runtime": 0.2375, + "eval_samples_per_second": 370.507, + "eval_steps_per_second": 46.313, + "step": 596 + }, + { + "epoch": 27.136363636363637, + "grad_norm": 0.013324232771992683, + "learning_rate": 6.868421052631579e-06, + "loss": 0.0014, + "step": 597 + }, + { + "epoch": 27.136363636363637, + "eval_loss": 0.0008924913126975298, + "eval_runtime": 0.2409, + "eval_samples_per_second": 365.308, + "eval_steps_per_second": 45.663, + "step": 597 + }, + { + "epoch": 27.181818181818183, + "grad_norm": 0.014774598181247711, + "learning_rate": 6.828947368421053e-06, + "loss": 0.0016, + "step": 598 + }, + { + "epoch": 27.181818181818183, + "eval_loss": 0.0008916006772778928, + "eval_runtime": 0.2374, + "eval_samples_per_second": 370.613, + "eval_steps_per_second": 46.327, + "step": 598 + }, + { + "epoch": 27.227272727272727, + "grad_norm": 0.015260329470038414, + "learning_rate": 6.7894736842105264e-06, + "loss": 0.0016, + "step": 599 + }, + { + "epoch": 27.227272727272727, + "eval_loss": 0.0008907453739084303, + "eval_runtime": 0.2427, + "eval_samples_per_second": 362.645, + "eval_steps_per_second": 45.331, + "step": 599 + }, + { + "epoch": 27.272727272727273, + "grad_norm": 0.01440617348998785, + "learning_rate": 6.750000000000001e-06, + "loss": 0.0016, + "step": 600 + }, + { + "epoch": 27.272727272727273, + "eval_loss": 0.0008899224339984357, + "eval_runtime": 0.2506, + "eval_samples_per_second": 351.14, + "eval_steps_per_second": 43.892, + "step": 600 + }, + { + "epoch": 27.318181818181817, + "grad_norm": 0.0139328483492136, + "learning_rate": 6.710526315789474e-06, + "loss": 0.0015, + "step": 601 + }, + { + "epoch": 27.318181818181817, + "eval_loss": 0.0008891185279935598, + "eval_runtime": 0.223, + "eval_samples_per_second": 394.603, + "eval_steps_per_second": 49.325, + "step": 601 + }, + { + "epoch": 27.363636363636363, + "grad_norm": 0.014009720645844936, + "learning_rate": 6.671052631578947e-06, + "loss": 0.0015, + "step": 602 + }, + { + "epoch": 27.363636363636363, + "eval_loss": 0.0008883295231498778, + "eval_runtime": 0.2262, + "eval_samples_per_second": 389.114, + "eval_steps_per_second": 48.639, + "step": 602 + }, + { + "epoch": 27.40909090909091, + "grad_norm": 0.014640220440924168, + "learning_rate": 6.631578947368421e-06, + "loss": 0.0016, + "step": 603 + }, + { + "epoch": 27.40909090909091, + "eval_loss": 0.0008875647909007967, + "eval_runtime": 0.2259, + "eval_samples_per_second": 389.586, + "eval_steps_per_second": 48.698, + "step": 603 + }, + { + "epoch": 27.454545454545453, + "grad_norm": 0.012875789776444435, + "learning_rate": 6.592105263157895e-06, + "loss": 0.0014, + "step": 604 + }, + { + "epoch": 27.454545454545453, + "eval_loss": 0.0008868000004440546, + "eval_runtime": 0.2267, + "eval_samples_per_second": 388.239, + "eval_steps_per_second": 48.53, + "step": 604 + }, + { + "epoch": 27.5, + "grad_norm": 0.012748241424560547, + "learning_rate": 6.552631578947369e-06, + "loss": 0.0014, + "step": 605 + }, + { + "epoch": 27.5, + "eval_loss": 0.0008860474918037653, + "eval_runtime": 0.2273, + "eval_samples_per_second": 387.108, + "eval_steps_per_second": 48.388, + "step": 605 + }, + { + "epoch": 27.545454545454547, + "grad_norm": 0.015082623809576035, + "learning_rate": 6.513157894736842e-06, + "loss": 0.0016, + "step": 606 + }, + { + "epoch": 27.545454545454547, + "eval_loss": 0.0008852502796798944, + "eval_runtime": 0.2413, + "eval_samples_per_second": 364.656, + "eval_steps_per_second": 45.582, + "step": 606 + }, + { + "epoch": 27.59090909090909, + "grad_norm": 0.012016087770462036, + "learning_rate": 6.473684210526316e-06, + "loss": 0.0014, + "step": 607 + }, + { + "epoch": 27.59090909090909, + "eval_loss": 0.0008844301337376237, + "eval_runtime": 0.2344, + "eval_samples_per_second": 375.37, + "eval_steps_per_second": 46.921, + "step": 607 + }, + { + "epoch": 27.636363636363637, + "grad_norm": 0.013424508273601532, + "learning_rate": 6.4342105263157896e-06, + "loss": 0.0014, + "step": 608 + }, + { + "epoch": 27.636363636363637, + "eval_loss": 0.0008835734915919602, + "eval_runtime": 0.2456, + "eval_samples_per_second": 358.327, + "eval_steps_per_second": 44.791, + "step": 608 + }, + { + "epoch": 27.681818181818183, + "grad_norm": 0.014258569106459618, + "learning_rate": 6.394736842105263e-06, + "loss": 0.0016, + "step": 609 + }, + { + "epoch": 27.681818181818183, + "eval_loss": 0.0008827546262182295, + "eval_runtime": 0.2293, + "eval_samples_per_second": 383.729, + "eval_steps_per_second": 47.966, + "step": 609 + }, + { + "epoch": 27.727272727272727, + "grad_norm": 0.012304065749049187, + "learning_rate": 6.355263157894737e-06, + "loss": 0.0014, + "step": 610 + }, + { + "epoch": 27.727272727272727, + "eval_loss": 0.0008819656213745475, + "eval_runtime": 0.2293, + "eval_samples_per_second": 383.825, + "eval_steps_per_second": 47.978, + "step": 610 + }, + { + "epoch": 27.772727272727273, + "grad_norm": 0.01459804829210043, + "learning_rate": 6.31578947368421e-06, + "loss": 0.0016, + "step": 611 + }, + { + "epoch": 27.772727272727273, + "eval_loss": 0.000881133193615824, + "eval_runtime": 0.2354, + "eval_samples_per_second": 373.888, + "eval_steps_per_second": 46.736, + "step": 611 + }, + { + "epoch": 27.818181818181817, + "grad_norm": 0.013015978038311005, + "learning_rate": 6.2763157894736845e-06, + "loss": 0.0014, + "step": 612 + }, + { + "epoch": 27.818181818181817, + "eval_loss": 0.0008803331875242293, + "eval_runtime": 0.267, + "eval_samples_per_second": 329.599, + "eval_steps_per_second": 41.2, + "step": 612 + }, + { + "epoch": 27.863636363636363, + "grad_norm": 0.013901845552027225, + "learning_rate": 6.236842105263159e-06, + "loss": 0.0016, + "step": 613 + }, + { + "epoch": 27.863636363636363, + "eval_loss": 0.0008795224712230265, + "eval_runtime": 0.2596, + "eval_samples_per_second": 339.008, + "eval_steps_per_second": 42.376, + "step": 613 + }, + { + "epoch": 27.90909090909091, + "grad_norm": 0.012065750546753407, + "learning_rate": 6.197368421052632e-06, + "loss": 0.0014, + "step": 614 + }, + { + "epoch": 27.90909090909091, + "eval_loss": 0.0008787267142906785, + "eval_runtime": 0.2638, + "eval_samples_per_second": 333.543, + "eval_steps_per_second": 41.693, + "step": 614 + }, + { + "epoch": 27.954545454545453, + "grad_norm": 0.013637811876833439, + "learning_rate": 6.157894736842105e-06, + "loss": 0.0016, + "step": 615 + }, + { + "epoch": 27.954545454545453, + "eval_loss": 0.0008779308409430087, + "eval_runtime": 0.2586, + "eval_samples_per_second": 340.291, + "eval_steps_per_second": 42.536, + "step": 615 + }, + { + "epoch": 28.0, + "grad_norm": 0.012989726848900318, + "learning_rate": 6.118421052631579e-06, + "loss": 0.0015, + "step": 616 + }, + { + "epoch": 28.0, + "eval_loss": 0.0008771241991780698, + "eval_runtime": 0.2429, + "eval_samples_per_second": 362.255, + "eval_steps_per_second": 45.282, + "step": 616 + }, + { + "epoch": 28.045454545454547, + "grad_norm": 0.011249346658587456, + "learning_rate": 6.078947368421053e-06, + "loss": 0.0013, + "step": 617 + }, + { + "epoch": 28.045454545454547, + "eval_loss": 0.0008763446821831167, + "eval_runtime": 0.2419, + "eval_samples_per_second": 363.721, + "eval_steps_per_second": 45.465, + "step": 617 + }, + { + "epoch": 28.09090909090909, + "grad_norm": 0.013492336496710777, + "learning_rate": 6.039473684210526e-06, + "loss": 0.0016, + "step": 618 + }, + { + "epoch": 28.09090909090909, + "eval_loss": 0.000875540659762919, + "eval_runtime": 0.2616, + "eval_samples_per_second": 336.357, + "eval_steps_per_second": 42.045, + "step": 618 + }, + { + "epoch": 28.136363636363637, + "grad_norm": 0.013201452791690826, + "learning_rate": 6e-06, + "loss": 0.0014, + "step": 619 + }, + { + "epoch": 28.136363636363637, + "eval_loss": 0.0008747638785280287, + "eval_runtime": 0.2332, + "eval_samples_per_second": 377.308, + "eval_steps_per_second": 47.163, + "step": 619 + }, + { + "epoch": 28.181818181818183, + "grad_norm": 0.012346605770289898, + "learning_rate": 5.960526315789474e-06, + "loss": 0.0015, + "step": 620 + }, + { + "epoch": 28.181818181818183, + "eval_loss": 0.0008740072953514755, + "eval_runtime": 0.2297, + "eval_samples_per_second": 383.134, + "eval_steps_per_second": 47.892, + "step": 620 + }, + { + "epoch": 28.227272727272727, + "grad_norm": 0.013474266044795513, + "learning_rate": 5.921052631578948e-06, + "loss": 0.0015, + "step": 621 + }, + { + "epoch": 28.227272727272727, + "eval_loss": 0.0008732505375519395, + "eval_runtime": 0.2261, + "eval_samples_per_second": 389.249, + "eval_steps_per_second": 48.656, + "step": 621 + }, + { + "epoch": 28.272727272727273, + "grad_norm": 0.011779211461544037, + "learning_rate": 5.881578947368421e-06, + "loss": 0.0013, + "step": 622 + }, + { + "epoch": 28.272727272727273, + "eval_loss": 0.0008725319639779627, + "eval_runtime": 0.2358, + "eval_samples_per_second": 373.257, + "eval_steps_per_second": 46.657, + "step": 622 + }, + { + "epoch": 28.318181818181817, + "grad_norm": 0.01458238996565342, + "learning_rate": 5.842105263157895e-06, + "loss": 0.0015, + "step": 623 + }, + { + "epoch": 28.318181818181817, + "eval_loss": 0.0008718472090549767, + "eval_runtime": 0.2469, + "eval_samples_per_second": 356.442, + "eval_steps_per_second": 44.555, + "step": 623 + }, + { + "epoch": 28.363636363636363, + "grad_norm": 0.013492444530129433, + "learning_rate": 5.802631578947368e-06, + "loss": 0.0015, + "step": 624 + }, + { + "epoch": 28.363636363636363, + "eval_loss": 0.0008711445843800902, + "eval_runtime": 0.2339, + "eval_samples_per_second": 376.299, + "eval_steps_per_second": 47.037, + "step": 624 + }, + { + "epoch": 28.40909090909091, + "grad_norm": 0.016801927238702774, + "learning_rate": 5.763157894736842e-06, + "loss": 0.0016, + "step": 625 + }, + { + "epoch": 28.40909090909091, + "eval_loss": 0.0008704178035259247, + "eval_runtime": 0.2467, + "eval_samples_per_second": 356.761, + "eval_steps_per_second": 44.595, + "step": 625 + }, + { + "epoch": 28.454545454545453, + "grad_norm": 0.01472269557416439, + "learning_rate": 5.723684210526316e-06, + "loss": 0.0015, + "step": 626 + }, + { + "epoch": 28.454545454545453, + "eval_loss": 0.0008697099983692169, + "eval_runtime": 0.2361, + "eval_samples_per_second": 372.695, + "eval_steps_per_second": 46.587, + "step": 626 + }, + { + "epoch": 28.5, + "grad_norm": 0.012456816621124744, + "learning_rate": 5.68421052631579e-06, + "loss": 0.0014, + "step": 627 + }, + { + "epoch": 28.5, + "eval_loss": 0.0008690251270309091, + "eval_runtime": 0.227, + "eval_samples_per_second": 387.675, + "eval_steps_per_second": 48.459, + "step": 627 + }, + { + "epoch": 28.545454545454547, + "grad_norm": 0.010930378921329975, + "learning_rate": 5.644736842105263e-06, + "loss": 0.0013, + "step": 628 + }, + { + "epoch": 28.545454545454547, + "eval_loss": 0.0008683226769790053, + "eval_runtime": 0.2396, + "eval_samples_per_second": 367.217, + "eval_steps_per_second": 45.902, + "step": 628 + }, + { + "epoch": 28.59090909090909, + "grad_norm": 0.013773776590824127, + "learning_rate": 5.605263157894737e-06, + "loss": 0.0016, + "step": 629 + }, + { + "epoch": 28.59090909090909, + "eval_loss": 0.0008676418801769614, + "eval_runtime": 0.2255, + "eval_samples_per_second": 390.204, + "eval_steps_per_second": 48.776, + "step": 629 + }, + { + "epoch": 28.636363636363637, + "grad_norm": 0.01485821045935154, + "learning_rate": 5.565789473684211e-06, + "loss": 0.0015, + "step": 630 + }, + { + "epoch": 28.636363636363637, + "eval_loss": 0.0008669787785038352, + "eval_runtime": 0.238, + "eval_samples_per_second": 369.806, + "eval_steps_per_second": 46.226, + "step": 630 + }, + { + "epoch": 28.681818181818183, + "grad_norm": 0.012882347218692303, + "learning_rate": 5.526315789473684e-06, + "loss": 0.0015, + "step": 631 + }, + { + "epoch": 28.681818181818183, + "eval_loss": 0.0008663006592541933, + "eval_runtime": 0.2392, + "eval_samples_per_second": 367.945, + "eval_steps_per_second": 45.993, + "step": 631 + }, + { + "epoch": 28.727272727272727, + "grad_norm": 0.013756033033132553, + "learning_rate": 5.486842105263158e-06, + "loss": 0.0015, + "step": 632 + }, + { + "epoch": 28.727272727272727, + "eval_loss": 0.0008656617719680071, + "eval_runtime": 0.2392, + "eval_samples_per_second": 367.897, + "eval_steps_per_second": 45.987, + "step": 632 + }, + { + "epoch": 28.772727272727273, + "grad_norm": 0.011964356526732445, + "learning_rate": 5.447368421052632e-06, + "loss": 0.0014, + "step": 633 + }, + { + "epoch": 28.772727272727273, + "eval_loss": 0.0008649809169583023, + "eval_runtime": 0.2416, + "eval_samples_per_second": 364.235, + "eval_steps_per_second": 45.529, + "step": 633 + }, + { + "epoch": 28.818181818181817, + "grad_norm": 0.014426548965275288, + "learning_rate": 5.407894736842106e-06, + "loss": 0.0015, + "step": 634 + }, + { + "epoch": 28.818181818181817, + "eval_loss": 0.0008642975008115172, + "eval_runtime": 0.2426, + "eval_samples_per_second": 362.673, + "eval_steps_per_second": 45.334, + "step": 634 + }, + { + "epoch": 28.863636363636363, + "grad_norm": 0.013472221791744232, + "learning_rate": 5.368421052631579e-06, + "loss": 0.0014, + "step": 635 + }, + { + "epoch": 28.863636363636363, + "eval_loss": 0.0008636031416244805, + "eval_runtime": 0.2517, + "eval_samples_per_second": 349.684, + "eval_steps_per_second": 43.711, + "step": 635 + }, + { + "epoch": 28.90909090909091, + "grad_norm": 0.012157904915511608, + "learning_rate": 5.328947368421053e-06, + "loss": 0.0014, + "step": 636 + }, + { + "epoch": 28.90909090909091, + "eval_loss": 0.000862881715875119, + "eval_runtime": 0.2369, + "eval_samples_per_second": 371.509, + "eval_steps_per_second": 46.439, + "step": 636 + }, + { + "epoch": 28.954545454545453, + "grad_norm": 0.012409983202815056, + "learning_rate": 5.289473684210526e-06, + "loss": 0.0014, + "step": 637 + }, + { + "epoch": 28.954545454545453, + "eval_loss": 0.0008621684974059463, + "eval_runtime": 0.2465, + "eval_samples_per_second": 357.054, + "eval_steps_per_second": 44.632, + "step": 637 + }, + { + "epoch": 29.0, + "grad_norm": 0.013315846212208271, + "learning_rate": 5.25e-06, + "loss": 0.0015, + "step": 638 + }, + { + "epoch": 29.0, + "eval_loss": 0.0008614835678599775, + "eval_runtime": 0.2407, + "eval_samples_per_second": 365.586, + "eval_steps_per_second": 45.698, + "step": 638 + }, + { + "epoch": 29.045454545454547, + "grad_norm": 0.015236815437674522, + "learning_rate": 5.210526315789474e-06, + "loss": 0.0016, + "step": 639 + }, + { + "epoch": 29.045454545454547, + "eval_loss": 0.0008607918862253428, + "eval_runtime": 0.2362, + "eval_samples_per_second": 372.636, + "eval_steps_per_second": 46.579, + "step": 639 + }, + { + "epoch": 29.09090909090909, + "grad_norm": 0.01497814990580082, + "learning_rate": 5.171052631578948e-06, + "loss": 0.0015, + "step": 640 + }, + { + "epoch": 29.09090909090909, + "eval_loss": 0.0008601464214734733, + "eval_runtime": 0.2513, + "eval_samples_per_second": 350.225, + "eval_steps_per_second": 43.778, + "step": 640 + }, + { + "epoch": 29.136363636363637, + "grad_norm": 0.010525020770728588, + "learning_rate": 5.131578947368421e-06, + "loss": 0.0013, + "step": 641 + }, + { + "epoch": 29.136363636363637, + "eval_loss": 0.0008594872197136283, + "eval_runtime": 0.2472, + "eval_samples_per_second": 355.947, + "eval_steps_per_second": 44.493, + "step": 641 + }, + { + "epoch": 29.181818181818183, + "grad_norm": 0.012257490307092667, + "learning_rate": 5.092105263157895e-06, + "loss": 0.0014, + "step": 642 + }, + { + "epoch": 29.181818181818183, + "eval_loss": 0.0008588552009314299, + "eval_runtime": 0.2514, + "eval_samples_per_second": 350.01, + "eval_steps_per_second": 43.751, + "step": 642 + }, + { + "epoch": 29.227272727272727, + "grad_norm": 0.016379721462726593, + "learning_rate": 5.052631578947369e-06, + "loss": 0.0016, + "step": 643 + }, + { + "epoch": 29.227272727272727, + "eval_loss": 0.0008582230657339096, + "eval_runtime": 0.2421, + "eval_samples_per_second": 363.525, + "eval_steps_per_second": 45.441, + "step": 643 + }, + { + "epoch": 29.272727272727273, + "grad_norm": 0.013389473780989647, + "learning_rate": 5.013157894736842e-06, + "loss": 0.0014, + "step": 644 + }, + { + "epoch": 29.272727272727273, + "eval_loss": 0.0008576181135140359, + "eval_runtime": 0.2837, + "eval_samples_per_second": 310.222, + "eval_steps_per_second": 38.778, + "step": 644 + }, + { + "epoch": 29.318181818181817, + "grad_norm": 0.011728441342711449, + "learning_rate": 4.973684210526315e-06, + "loss": 0.0014, + "step": 645 + }, + { + "epoch": 29.318181818181817, + "eval_loss": 0.0008570144418627024, + "eval_runtime": 0.3144, + "eval_samples_per_second": 279.869, + "eval_steps_per_second": 34.984, + "step": 645 + }, + { + "epoch": 29.363636363636363, + "grad_norm": 0.014150052331387997, + "learning_rate": 4.9342105263157895e-06, + "loss": 0.0015, + "step": 646 + }, + { + "epoch": 29.363636363636363, + "eval_loss": 0.0008564500021748245, + "eval_runtime": 0.2427, + "eval_samples_per_second": 362.611, + "eval_steps_per_second": 45.326, + "step": 646 + }, + { + "epoch": 29.40909090909091, + "grad_norm": 0.012562847696244717, + "learning_rate": 4.894736842105264e-06, + "loss": 0.0015, + "step": 647 + }, + { + "epoch": 29.40909090909091, + "eval_loss": 0.0008558626868762076, + "eval_runtime": 0.2385, + "eval_samples_per_second": 368.954, + "eval_steps_per_second": 46.119, + "step": 647 + }, + { + "epoch": 29.454545454545453, + "grad_norm": 0.01115860603749752, + "learning_rate": 4.855263157894737e-06, + "loss": 0.0012, + "step": 648 + }, + { + "epoch": 29.454545454545453, + "eval_loss": 0.000855276535730809, + "eval_runtime": 0.2434, + "eval_samples_per_second": 361.501, + "eval_steps_per_second": 45.188, + "step": 648 + }, + { + "epoch": 29.5, + "grad_norm": 0.014787169173359871, + "learning_rate": 4.81578947368421e-06, + "loss": 0.0015, + "step": 649 + }, + { + "epoch": 29.5, + "eval_loss": 0.0008546687895432115, + "eval_runtime": 0.2404, + "eval_samples_per_second": 366.019, + "eval_steps_per_second": 45.752, + "step": 649 + }, + { + "epoch": 29.545454545454547, + "grad_norm": 0.014013570733368397, + "learning_rate": 4.7763157894736844e-06, + "loss": 0.0014, + "step": 650 + }, + { + "epoch": 29.545454545454547, + "eval_loss": 0.0008540409035049379, + "eval_runtime": 0.2415, + "eval_samples_per_second": 364.376, + "eval_steps_per_second": 45.547, + "step": 650 + }, + { + "epoch": 29.59090909090909, + "grad_norm": 0.013314800336956978, + "learning_rate": 4.736842105263158e-06, + "loss": 0.0015, + "step": 651 + }, + { + "epoch": 29.59090909090909, + "eval_loss": 0.0008533978252671659, + "eval_runtime": 0.2334, + "eval_samples_per_second": 377.055, + "eval_steps_per_second": 47.132, + "step": 651 + }, + { + "epoch": 29.636363636363637, + "grad_norm": 0.011727740988135338, + "learning_rate": 4.697368421052631e-06, + "loss": 0.0014, + "step": 652 + }, + { + "epoch": 29.636363636363637, + "eval_loss": 0.0008527915342710912, + "eval_runtime": 0.2324, + "eval_samples_per_second": 378.693, + "eval_steps_per_second": 47.337, + "step": 652 + }, + { + "epoch": 29.681818181818183, + "grad_norm": 0.014551502652466297, + "learning_rate": 4.657894736842106e-06, + "loss": 0.0016, + "step": 653 + }, + { + "epoch": 29.681818181818183, + "eval_loss": 0.0008521459531039, + "eval_runtime": 0.2274, + "eval_samples_per_second": 387.021, + "eval_steps_per_second": 48.378, + "step": 653 + }, + { + "epoch": 29.727272727272727, + "grad_norm": 0.01226063258945942, + "learning_rate": 4.618421052631579e-06, + "loss": 0.0013, + "step": 654 + }, + { + "epoch": 29.727272727272727, + "eval_loss": 0.0008515057852491736, + "eval_runtime": 0.2277, + "eval_samples_per_second": 386.523, + "eval_steps_per_second": 48.315, + "step": 654 + }, + { + "epoch": 29.772727272727273, + "grad_norm": 0.013769338838756084, + "learning_rate": 4.578947368421053e-06, + "loss": 0.0015, + "step": 655 + }, + { + "epoch": 29.772727272727273, + "eval_loss": 0.0008508588653057814, + "eval_runtime": 0.2246, + "eval_samples_per_second": 391.814, + "eval_steps_per_second": 48.977, + "step": 655 + }, + { + "epoch": 29.818181818181817, + "grad_norm": 0.012221275828778744, + "learning_rate": 4.539473684210527e-06, + "loss": 0.0015, + "step": 656 + }, + { + "epoch": 29.818181818181817, + "eval_loss": 0.0008502537966705859, + "eval_runtime": 0.2288, + "eval_samples_per_second": 384.596, + "eval_steps_per_second": 48.075, + "step": 656 + }, + { + "epoch": 29.863636363636363, + "grad_norm": 0.011863375082612038, + "learning_rate": 4.5e-06, + "loss": 0.0013, + "step": 657 + }, + { + "epoch": 29.863636363636363, + "eval_loss": 0.0008496582740917802, + "eval_runtime": 0.2473, + "eval_samples_per_second": 355.889, + "eval_steps_per_second": 44.486, + "step": 657 + }, + { + "epoch": 29.90909090909091, + "grad_norm": 0.01440768875181675, + "learning_rate": 4.460526315789473e-06, + "loss": 0.0015, + "step": 658 + }, + { + "epoch": 29.90909090909091, + "eval_loss": 0.0008490938926115632, + "eval_runtime": 0.2279, + "eval_samples_per_second": 386.137, + "eval_steps_per_second": 48.267, + "step": 658 + }, + { + "epoch": 29.954545454545453, + "grad_norm": 0.013953134417533875, + "learning_rate": 4.421052631578947e-06, + "loss": 0.0014, + "step": 659 + }, + { + "epoch": 29.954545454545453, + "eval_loss": 0.0008485484286211431, + "eval_runtime": 0.2309, + "eval_samples_per_second": 381.17, + "eval_steps_per_second": 47.646, + "step": 659 + }, + { + "epoch": 30.0, + "grad_norm": 0.012044006027281284, + "learning_rate": 4.381578947368422e-06, + "loss": 0.0014, + "step": 660 + }, + { + "epoch": 30.0, + "eval_loss": 0.0008479988318867981, + "eval_runtime": 0.2301, + "eval_samples_per_second": 382.482, + "eval_steps_per_second": 47.81, + "step": 660 + }, + { + "epoch": 30.045454545454547, + "grad_norm": 0.014352229423820972, + "learning_rate": 4.342105263157895e-06, + "loss": 0.0015, + "step": 661 + }, + { + "epoch": 30.045454545454547, + "eval_loss": 0.0008474763599224389, + "eval_runtime": 0.228, + "eval_samples_per_second": 385.949, + "eval_steps_per_second": 48.244, + "step": 661 + }, + { + "epoch": 30.09090909090909, + "grad_norm": 0.012857983820140362, + "learning_rate": 4.302631578947368e-06, + "loss": 0.0015, + "step": 662 + }, + { + "epoch": 30.09090909090909, + "eval_loss": 0.0008469296153634787, + "eval_runtime": 0.2254, + "eval_samples_per_second": 390.463, + "eval_steps_per_second": 48.808, + "step": 662 + }, + { + "epoch": 30.136363636363637, + "grad_norm": 0.013745253905653954, + "learning_rate": 4.2631578947368425e-06, + "loss": 0.0014, + "step": 663 + }, + { + "epoch": 30.136363636363637, + "eval_loss": 0.0008464112761430442, + "eval_runtime": 0.2615, + "eval_samples_per_second": 336.504, + "eval_steps_per_second": 42.063, + "step": 663 + }, + { + "epoch": 30.181818181818183, + "grad_norm": 0.011542108841240406, + "learning_rate": 4.223684210526316e-06, + "loss": 0.0014, + "step": 664 + }, + { + "epoch": 30.181818181818183, + "eval_loss": 0.0008458928787149489, + "eval_runtime": 0.2363, + "eval_samples_per_second": 372.361, + "eval_steps_per_second": 46.545, + "step": 664 + }, + { + "epoch": 30.227272727272727, + "grad_norm": 0.013680350966751575, + "learning_rate": 4.184210526315789e-06, + "loss": 0.0015, + "step": 665 + }, + { + "epoch": 30.227272727272727, + "eval_loss": 0.0008453825721517205, + "eval_runtime": 0.2422, + "eval_samples_per_second": 363.317, + "eval_steps_per_second": 45.415, + "step": 665 + }, + { + "epoch": 30.272727272727273, + "grad_norm": 0.01278683077543974, + "learning_rate": 4.144736842105263e-06, + "loss": 0.0013, + "step": 666 + }, + { + "epoch": 30.272727272727273, + "eval_loss": 0.0008448913577012718, + "eval_runtime": 0.2274, + "eval_samples_per_second": 386.997, + "eval_steps_per_second": 48.375, + "step": 666 + }, + { + "epoch": 30.318181818181817, + "grad_norm": 0.013793477788567543, + "learning_rate": 4.105263157894737e-06, + "loss": 0.0016, + "step": 667 + }, + { + "epoch": 30.318181818181817, + "eval_loss": 0.0008444040431641042, + "eval_runtime": 0.2383, + "eval_samples_per_second": 369.303, + "eval_steps_per_second": 46.163, + "step": 667 + }, + { + "epoch": 30.363636363636363, + "grad_norm": 0.013766897842288017, + "learning_rate": 4.065789473684211e-06, + "loss": 0.0014, + "step": 668 + }, + { + "epoch": 30.363636363636363, + "eval_loss": 0.0008439045632258058, + "eval_runtime": 0.2472, + "eval_samples_per_second": 355.963, + "eval_steps_per_second": 44.495, + "step": 668 + }, + { + "epoch": 30.40909090909091, + "grad_norm": 0.01388518325984478, + "learning_rate": 4.026315789473684e-06, + "loss": 0.0014, + "step": 669 + }, + { + "epoch": 30.40909090909091, + "eval_loss": 0.0008434146293438971, + "eval_runtime": 0.2555, + "eval_samples_per_second": 344.476, + "eval_steps_per_second": 43.059, + "step": 669 + }, + { + "epoch": 30.454545454545453, + "grad_norm": 0.013302307575941086, + "learning_rate": 3.986842105263158e-06, + "loss": 0.0014, + "step": 670 + }, + { + "epoch": 30.454545454545453, + "eval_loss": 0.0008429314475506544, + "eval_runtime": 0.234, + "eval_samples_per_second": 375.99, + "eval_steps_per_second": 46.999, + "step": 670 + }, + { + "epoch": 30.5, + "grad_norm": 0.015602638944983482, + "learning_rate": 3.9473684210526315e-06, + "loss": 0.0015, + "step": 671 + }, + { + "epoch": 30.5, + "eval_loss": 0.0008424482657574117, + "eval_runtime": 0.2312, + "eval_samples_per_second": 380.69, + "eval_steps_per_second": 47.586, + "step": 671 + }, + { + "epoch": 30.545454545454547, + "grad_norm": 0.012195833958685398, + "learning_rate": 3.907894736842105e-06, + "loss": 0.0014, + "step": 672 + }, + { + "epoch": 30.545454545454547, + "eval_loss": 0.0008419921505264938, + "eval_runtime": 0.2348, + "eval_samples_per_second": 374.848, + "eval_steps_per_second": 46.856, + "step": 672 + }, + { + "epoch": 30.59090909090909, + "grad_norm": 0.012124909088015556, + "learning_rate": 3.86842105263158e-06, + "loss": 0.0014, + "step": 673 + }, + { + "epoch": 30.59090909090909, + "eval_loss": 0.0008415495394729078, + "eval_runtime": 0.2337, + "eval_samples_per_second": 376.514, + "eval_steps_per_second": 47.064, + "step": 673 + }, + { + "epoch": 30.636363636363637, + "grad_norm": 0.012487749569118023, + "learning_rate": 3.828947368421053e-06, + "loss": 0.0014, + "step": 674 + }, + { + "epoch": 30.636363636363637, + "eval_loss": 0.0008411163580603898, + "eval_runtime": 0.2252, + "eval_samples_per_second": 390.686, + "eval_steps_per_second": 48.836, + "step": 674 + }, + { + "epoch": 30.681818181818183, + "grad_norm": 0.013694563880562782, + "learning_rate": 3.7894736842105264e-06, + "loss": 0.0015, + "step": 675 + }, + { + "epoch": 30.681818181818183, + "eval_loss": 0.0008406452834606171, + "eval_runtime": 0.2277, + "eval_samples_per_second": 386.401, + "eval_steps_per_second": 48.3, + "step": 675 + }, + { + "epoch": 30.727272727272727, + "grad_norm": 0.012177863158285618, + "learning_rate": 3.75e-06, + "loss": 0.0015, + "step": 676 + }, + { + "epoch": 30.727272727272727, + "eval_loss": 0.0008401837549172342, + "eval_runtime": 0.2284, + "eval_samples_per_second": 385.297, + "eval_steps_per_second": 48.162, + "step": 676 + }, + { + "epoch": 30.772727272727273, + "grad_norm": 0.011734875850379467, + "learning_rate": 3.710526315789474e-06, + "loss": 0.0013, + "step": 677 + }, + { + "epoch": 30.772727272727273, + "eval_loss": 0.0008397437632083893, + "eval_runtime": 0.2349, + "eval_samples_per_second": 374.661, + "eval_steps_per_second": 46.833, + "step": 677 + }, + { + "epoch": 30.818181818181817, + "grad_norm": 0.012181814759969711, + "learning_rate": 3.6710526315789476e-06, + "loss": 0.0015, + "step": 678 + }, + { + "epoch": 30.818181818181817, + "eval_loss": 0.000839310756418854, + "eval_runtime": 0.2267, + "eval_samples_per_second": 388.249, + "eval_steps_per_second": 48.531, + "step": 678 + }, + { + "epoch": 30.863636363636363, + "grad_norm": 0.014351209625601768, + "learning_rate": 3.6315789473684213e-06, + "loss": 0.0015, + "step": 679 + }, + { + "epoch": 30.863636363636363, + "eval_loss": 0.0008388804271817207, + "eval_runtime": 0.2382, + "eval_samples_per_second": 369.474, + "eval_steps_per_second": 46.184, + "step": 679 + }, + { + "epoch": 30.90909090909091, + "grad_norm": 0.01179533638060093, + "learning_rate": 3.5921052631578946e-06, + "loss": 0.0014, + "step": 680 + }, + { + "epoch": 30.90909090909091, + "eval_loss": 0.0008384499233216047, + "eval_runtime": 0.227, + "eval_samples_per_second": 387.694, + "eval_steps_per_second": 48.462, + "step": 680 + }, + { + "epoch": 30.954545454545453, + "grad_norm": 0.01200299896299839, + "learning_rate": 3.5526315789473683e-06, + "loss": 0.0014, + "step": 681 + }, + { + "epoch": 30.954545454545453, + "eval_loss": 0.0008380439248867333, + "eval_runtime": 0.2357, + "eval_samples_per_second": 373.384, + "eval_steps_per_second": 46.673, + "step": 681 + }, + { + "epoch": 31.0, + "grad_norm": 0.012165653519332409, + "learning_rate": 3.5131578947368425e-06, + "loss": 0.0014, + "step": 682 + }, + { + "epoch": 31.0, + "eval_loss": 0.0008376243058592081, + "eval_runtime": 0.2268, + "eval_samples_per_second": 387.994, + "eval_steps_per_second": 48.499, + "step": 682 + }, + { + "epoch": 31.045454545454547, + "grad_norm": 0.013023504056036472, + "learning_rate": 3.4736842105263158e-06, + "loss": 0.0014, + "step": 683 + }, + { + "epoch": 31.045454545454547, + "eval_loss": 0.0008372208685614169, + "eval_runtime": 0.2408, + "eval_samples_per_second": 365.51, + "eval_steps_per_second": 45.689, + "step": 683 + }, + { + "epoch": 31.09090909090909, + "grad_norm": 0.012478847056627274, + "learning_rate": 3.4342105263157895e-06, + "loss": 0.0015, + "step": 684 + }, + { + "epoch": 31.09090909090909, + "eval_loss": 0.0008367864647880197, + "eval_runtime": 0.2261, + "eval_samples_per_second": 389.221, + "eval_steps_per_second": 48.653, + "step": 684 + }, + { + "epoch": 31.136363636363637, + "grad_norm": 0.011943116784095764, + "learning_rate": 3.3947368421052632e-06, + "loss": 0.0014, + "step": 685 + }, + { + "epoch": 31.136363636363637, + "eval_loss": 0.0008363695815205574, + "eval_runtime": 0.2405, + "eval_samples_per_second": 365.829, + "eval_steps_per_second": 45.729, + "step": 685 + }, + { + "epoch": 31.181818181818183, + "grad_norm": 0.012198768556118011, + "learning_rate": 3.355263157894737e-06, + "loss": 0.0014, + "step": 686 + }, + { + "epoch": 31.181818181818183, + "eval_loss": 0.000835962186101824, + "eval_runtime": 0.2414, + "eval_samples_per_second": 364.526, + "eval_steps_per_second": 45.566, + "step": 686 + }, + { + "epoch": 31.227272727272727, + "grad_norm": 0.012970656156539917, + "learning_rate": 3.3157894736842107e-06, + "loss": 0.0014, + "step": 687 + }, + { + "epoch": 31.227272727272727, + "eval_loss": 0.0008355574682354927, + "eval_runtime": 0.2355, + "eval_samples_per_second": 373.686, + "eval_steps_per_second": 46.711, + "step": 687 + }, + { + "epoch": 31.272727272727273, + "grad_norm": 0.01133756898343563, + "learning_rate": 3.2763157894736844e-06, + "loss": 0.0012, + "step": 688 + }, + { + "epoch": 31.272727272727273, + "eval_loss": 0.0008351581636816263, + "eval_runtime": 0.239, + "eval_samples_per_second": 368.146, + "eval_steps_per_second": 46.018, + "step": 688 + }, + { + "epoch": 31.318181818181817, + "grad_norm": 0.014246292412281036, + "learning_rate": 3.236842105263158e-06, + "loss": 0.0014, + "step": 689 + }, + { + "epoch": 31.318181818181817, + "eval_loss": 0.0008347549010068178, + "eval_runtime": 0.2413, + "eval_samples_per_second": 364.723, + "eval_steps_per_second": 45.59, + "step": 689 + }, + { + "epoch": 31.363636363636363, + "grad_norm": 0.01505040843039751, + "learning_rate": 3.1973684210526314e-06, + "loss": 0.0016, + "step": 690 + }, + { + "epoch": 31.363636363636363, + "eval_loss": 0.0008343501249328256, + "eval_runtime": 0.2321, + "eval_samples_per_second": 379.09, + "eval_steps_per_second": 47.386, + "step": 690 + }, + { + "epoch": 31.40909090909091, + "grad_norm": 0.011749452911317348, + "learning_rate": 3.157894736842105e-06, + "loss": 0.0013, + "step": 691 + }, + { + "epoch": 31.40909090909091, + "eval_loss": 0.0008339481428265572, + "eval_runtime": 0.2656, + "eval_samples_per_second": 331.332, + "eval_steps_per_second": 41.416, + "step": 691 + }, + { + "epoch": 31.454545454545453, + "grad_norm": 0.012921934016048908, + "learning_rate": 3.1184210526315793e-06, + "loss": 0.0015, + "step": 692 + }, + { + "epoch": 31.454545454545453, + "eval_loss": 0.0008335394668392837, + "eval_runtime": 0.2542, + "eval_samples_per_second": 346.242, + "eval_steps_per_second": 43.28, + "step": 692 + }, + { + "epoch": 31.5, + "grad_norm": 0.01331315003335476, + "learning_rate": 3.0789473684210526e-06, + "loss": 0.0014, + "step": 693 + }, + { + "epoch": 31.5, + "eval_loss": 0.0008331468561664224, + "eval_runtime": 0.2417, + "eval_samples_per_second": 364.055, + "eval_steps_per_second": 45.507, + "step": 693 + }, + { + "epoch": 31.545454545454547, + "grad_norm": 0.012770496308803558, + "learning_rate": 3.0394736842105263e-06, + "loss": 0.0015, + "step": 694 + }, + { + "epoch": 31.545454545454547, + "eval_loss": 0.0008327368414029479, + "eval_runtime": 0.2689, + "eval_samples_per_second": 327.265, + "eval_steps_per_second": 40.908, + "step": 694 + }, + { + "epoch": 31.59090909090909, + "grad_norm": 0.012804139405488968, + "learning_rate": 3e-06, + "loss": 0.0014, + "step": 695 + }, + { + "epoch": 31.59090909090909, + "eval_loss": 0.0008323252550326288, + "eval_runtime": 0.2468, + "eval_samples_per_second": 356.61, + "eval_steps_per_second": 44.576, + "step": 695 + }, + { + "epoch": 31.636363636363637, + "grad_norm": 0.014062759466469288, + "learning_rate": 2.960526315789474e-06, + "loss": 0.0015, + "step": 696 + }, + { + "epoch": 31.636363636363637, + "eval_loss": 0.0008318935870192945, + "eval_runtime": 0.2529, + "eval_samples_per_second": 347.95, + "eval_steps_per_second": 43.494, + "step": 696 + }, + { + "epoch": 31.681818181818183, + "grad_norm": 0.013049440458416939, + "learning_rate": 2.9210526315789475e-06, + "loss": 0.0014, + "step": 697 + }, + { + "epoch": 31.681818181818183, + "eval_loss": 0.0008314928272739053, + "eval_runtime": 0.2521, + "eval_samples_per_second": 349.0, + "eval_steps_per_second": 43.625, + "step": 697 + }, + { + "epoch": 31.727272727272727, + "grad_norm": 0.01172225084155798, + "learning_rate": 2.881578947368421e-06, + "loss": 0.0013, + "step": 698 + }, + { + "epoch": 31.727272727272727, + "eval_loss": 0.0008310881094075739, + "eval_runtime": 0.2672, + "eval_samples_per_second": 329.329, + "eval_steps_per_second": 41.166, + "step": 698 + }, + { + "epoch": 31.772727272727273, + "grad_norm": 0.01266531739383936, + "learning_rate": 2.842105263157895e-06, + "loss": 0.0014, + "step": 699 + }, + { + "epoch": 31.772727272727273, + "eval_loss": 0.0008307105163112283, + "eval_runtime": 0.3176, + "eval_samples_per_second": 277.082, + "eval_steps_per_second": 34.635, + "step": 699 + }, + { + "epoch": 31.818181818181817, + "grad_norm": 0.014071842655539513, + "learning_rate": 2.8026315789473683e-06, + "loss": 0.0015, + "step": 700 + }, + { + "epoch": 31.818181818181817, + "eval_loss": 0.0008303424110636115, + "eval_runtime": 0.2648, + "eval_samples_per_second": 332.279, + "eval_steps_per_second": 41.535, + "step": 700 + }, + { + "epoch": 31.863636363636363, + "grad_norm": 0.01333391759544611, + "learning_rate": 2.763157894736842e-06, + "loss": 0.0015, + "step": 701 + }, + { + "epoch": 31.863636363636363, + "eval_loss": 0.0008299809414893389, + "eval_runtime": 0.2429, + "eval_samples_per_second": 362.239, + "eval_steps_per_second": 45.28, + "step": 701 + }, + { + "epoch": 31.90909090909091, + "grad_norm": 0.010583317838609219, + "learning_rate": 2.723684210526316e-06, + "loss": 0.0012, + "step": 702 + }, + { + "epoch": 31.90909090909091, + "eval_loss": 0.0008296439773403108, + "eval_runtime": 0.2463, + "eval_samples_per_second": 357.358, + "eval_steps_per_second": 44.67, + "step": 702 + }, + { + "epoch": 31.954545454545453, + "grad_norm": 0.01122986525297165, + "learning_rate": 2.6842105263157895e-06, + "loss": 0.0013, + "step": 703 + }, + { + "epoch": 31.954545454545453, + "eval_loss": 0.0008293138234876096, + "eval_runtime": 0.2433, + "eval_samples_per_second": 361.652, + "eval_steps_per_second": 45.206, + "step": 703 + }, + { + "epoch": 32.0, + "grad_norm": 0.011437175795435905, + "learning_rate": 2.644736842105263e-06, + "loss": 0.0013, + "step": 704 + }, + { + "epoch": 32.0, + "eval_loss": 0.0008289901888929307, + "eval_runtime": 0.2357, + "eval_samples_per_second": 373.319, + "eval_steps_per_second": 46.665, + "step": 704 + }, + { + "epoch": 32.04545454545455, + "grad_norm": 0.012699670158326626, + "learning_rate": 2.605263157894737e-06, + "loss": 0.0014, + "step": 705 + }, + { + "epoch": 32.04545454545455, + "eval_loss": 0.0008286829688586295, + "eval_runtime": 0.2319, + "eval_samples_per_second": 379.476, + "eval_steps_per_second": 47.435, + "step": 705 + }, + { + "epoch": 32.09090909090909, + "grad_norm": 0.013239861465990543, + "learning_rate": 2.5657894736842107e-06, + "loss": 0.0014, + "step": 706 + }, + { + "epoch": 32.09090909090909, + "eval_loss": 0.0008283716160804033, + "eval_runtime": 0.2319, + "eval_samples_per_second": 379.415, + "eval_steps_per_second": 47.427, + "step": 706 + }, + { + "epoch": 32.13636363636363, + "grad_norm": 0.012133197858929634, + "learning_rate": 2.5263157894736844e-06, + "loss": 0.0013, + "step": 707 + }, + { + "epoch": 32.13636363636363, + "eval_loss": 0.0008280739421024919, + "eval_runtime": 0.2242, + "eval_samples_per_second": 392.558, + "eval_steps_per_second": 49.07, + "step": 707 + }, + { + "epoch": 32.18181818181818, + "grad_norm": 0.011126801371574402, + "learning_rate": 2.4868421052631577e-06, + "loss": 0.0013, + "step": 708 + }, + { + "epoch": 32.18181818181818, + "eval_loss": 0.0008277747547253966, + "eval_runtime": 0.2381, + "eval_samples_per_second": 369.534, + "eval_steps_per_second": 46.192, + "step": 708 + }, + { + "epoch": 32.22727272727273, + "grad_norm": 0.012151258997619152, + "learning_rate": 2.447368421052632e-06, + "loss": 0.0014, + "step": 709 + }, + { + "epoch": 32.22727272727273, + "eval_loss": 0.0008274810388684273, + "eval_runtime": 0.265, + "eval_samples_per_second": 332.045, + "eval_steps_per_second": 41.506, + "step": 709 + }, + { + "epoch": 32.27272727272727, + "grad_norm": 0.013219231739640236, + "learning_rate": 2.407894736842105e-06, + "loss": 0.0014, + "step": 710 + }, + { + "epoch": 32.27272727272727, + "eval_loss": 0.0008271847036667168, + "eval_runtime": 0.2428, + "eval_samples_per_second": 362.463, + "eval_steps_per_second": 45.308, + "step": 710 + }, + { + "epoch": 32.31818181818182, + "grad_norm": 0.010275053791701794, + "learning_rate": 2.368421052631579e-06, + "loss": 0.0012, + "step": 711 + }, + { + "epoch": 32.31818181818182, + "eval_loss": 0.0008268963429145515, + "eval_runtime": 0.2418, + "eval_samples_per_second": 363.953, + "eval_steps_per_second": 45.494, + "step": 711 + }, + { + "epoch": 32.36363636363637, + "grad_norm": 0.013079304248094559, + "learning_rate": 2.328947368421053e-06, + "loss": 0.0014, + "step": 712 + }, + { + "epoch": 32.36363636363637, + "eval_loss": 0.00082661077613011, + "eval_runtime": 0.232, + "eval_samples_per_second": 379.238, + "eval_steps_per_second": 47.405, + "step": 712 + }, + { + "epoch": 32.40909090909091, + "grad_norm": 0.019619744271039963, + "learning_rate": 2.2894736842105263e-06, + "loss": 0.0014, + "step": 713 + }, + { + "epoch": 32.40909090909091, + "eval_loss": 0.0008263156050816178, + "eval_runtime": 0.2626, + "eval_samples_per_second": 335.09, + "eval_steps_per_second": 41.886, + "step": 713 + }, + { + "epoch": 32.45454545454545, + "grad_norm": 0.014103109948337078, + "learning_rate": 2.25e-06, + "loss": 0.0015, + "step": 714 + }, + { + "epoch": 32.45454545454545, + "eval_loss": 0.0008260206668637693, + "eval_runtime": 0.256, + "eval_samples_per_second": 343.813, + "eval_steps_per_second": 42.977, + "step": 714 + }, + { + "epoch": 32.5, + "grad_norm": 0.013360358774662018, + "learning_rate": 2.2105263157894734e-06, + "loss": 0.0015, + "step": 715 + }, + { + "epoch": 32.5, + "eval_loss": 0.0008257552981376648, + "eval_runtime": 0.2719, + "eval_samples_per_second": 323.628, + "eval_steps_per_second": 40.454, + "step": 715 + }, + { + "epoch": 32.54545454545455, + "grad_norm": 0.012335807085037231, + "learning_rate": 2.1710526315789475e-06, + "loss": 0.0014, + "step": 716 + }, + { + "epoch": 32.54545454545455, + "eval_loss": 0.0008254764834418893, + "eval_runtime": 0.257, + "eval_samples_per_second": 342.453, + "eval_steps_per_second": 42.807, + "step": 716 + }, + { + "epoch": 32.59090909090909, + "grad_norm": 0.012738436460494995, + "learning_rate": 2.1315789473684212e-06, + "loss": 0.0014, + "step": 717 + }, + { + "epoch": 32.59090909090909, + "eval_loss": 0.0008252071565948427, + "eval_runtime": 0.2774, + "eval_samples_per_second": 317.284, + "eval_steps_per_second": 39.66, + "step": 717 + }, + { + "epoch": 32.63636363636363, + "grad_norm": 0.011913586407899857, + "learning_rate": 2.0921052631578945e-06, + "loss": 0.0013, + "step": 718 + }, + { + "epoch": 32.63636363636363, + "eval_loss": 0.0008249431848526001, + "eval_runtime": 0.2458, + "eval_samples_per_second": 358.083, + "eval_steps_per_second": 44.76, + "step": 718 + }, + { + "epoch": 32.68181818181818, + "grad_norm": 0.010375920683145523, + "learning_rate": 2.0526315789473687e-06, + "loss": 0.0013, + "step": 719 + }, + { + "epoch": 32.68181818181818, + "eval_loss": 0.0008246820070780814, + "eval_runtime": 0.2548, + "eval_samples_per_second": 345.32, + "eval_steps_per_second": 43.165, + "step": 719 + }, + { + "epoch": 32.72727272727273, + "grad_norm": 0.016080064699053764, + "learning_rate": 2.013157894736842e-06, + "loss": 0.0016, + "step": 720 + }, + { + "epoch": 32.72727272727273, + "eval_loss": 0.0008244179771281779, + "eval_runtime": 0.2695, + "eval_samples_per_second": 326.571, + "eval_steps_per_second": 40.821, + "step": 720 + }, + { + "epoch": 32.77272727272727, + "grad_norm": 0.01252568420022726, + "learning_rate": 1.9736842105263157e-06, + "loss": 0.0013, + "step": 721 + }, + { + "epoch": 32.77272727272727, + "eval_loss": 0.0008241839241236448, + "eval_runtime": 0.2948, + "eval_samples_per_second": 298.515, + "eval_steps_per_second": 37.314, + "step": 721 + }, + { + "epoch": 32.81818181818182, + "grad_norm": 0.012378372251987457, + "learning_rate": 1.93421052631579e-06, + "loss": 0.0014, + "step": 722 + }, + { + "epoch": 32.81818181818182, + "eval_loss": 0.0008239619201049209, + "eval_runtime": 0.2733, + "eval_samples_per_second": 321.958, + "eval_steps_per_second": 40.245, + "step": 722 + }, + { + "epoch": 32.86363636363637, + "grad_norm": 0.013344389386475086, + "learning_rate": 1.8947368421052632e-06, + "loss": 0.0015, + "step": 723 + }, + { + "epoch": 32.86363636363637, + "eval_loss": 0.000823718321043998, + "eval_runtime": 0.2569, + "eval_samples_per_second": 342.55, + "eval_steps_per_second": 42.819, + "step": 723 + }, + { + "epoch": 32.90909090909091, + "grad_norm": 0.012948358431458473, + "learning_rate": 1.855263157894737e-06, + "loss": 0.0015, + "step": 724 + }, + { + "epoch": 32.90909090909091, + "eval_loss": 0.0008235003333538771, + "eval_runtime": 0.2559, + "eval_samples_per_second": 343.901, + "eval_steps_per_second": 42.988, + "step": 724 + }, + { + "epoch": 32.95454545454545, + "grad_norm": 0.011233711615204811, + "learning_rate": 1.8157894736842106e-06, + "loss": 0.0012, + "step": 725 + }, + { + "epoch": 32.95454545454545, + "eval_loss": 0.000823282403871417, + "eval_runtime": 0.2642, + "eval_samples_per_second": 333.059, + "eval_steps_per_second": 41.632, + "step": 725 + }, + { + "epoch": 33.0, + "grad_norm": 0.01327808853238821, + "learning_rate": 1.7763157894736842e-06, + "loss": 0.0015, + "step": 726 + }, + { + "epoch": 33.0, + "eval_loss": 0.0008230686071328819, + "eval_runtime": 0.2841, + "eval_samples_per_second": 309.721, + "eval_steps_per_second": 38.715, + "step": 726 + }, + { + "epoch": 33.04545454545455, + "grad_norm": 0.011662392877042294, + "learning_rate": 1.7368421052631579e-06, + "loss": 0.0014, + "step": 727 + }, + { + "epoch": 33.04545454545455, + "eval_loss": 0.0008228750666603446, + "eval_runtime": 0.249, + "eval_samples_per_second": 353.382, + "eval_steps_per_second": 44.173, + "step": 727 + }, + { + "epoch": 33.09090909090909, + "grad_norm": 0.011290736496448517, + "learning_rate": 1.6973684210526316e-06, + "loss": 0.0013, + "step": 728 + }, + { + "epoch": 33.09090909090909, + "eval_loss": 0.0008226787904277444, + "eval_runtime": 0.2459, + "eval_samples_per_second": 357.906, + "eval_steps_per_second": 44.738, + "step": 728 + }, + { + "epoch": 33.13636363636363, + "grad_norm": 0.011928938329219818, + "learning_rate": 1.6578947368421053e-06, + "loss": 0.0014, + "step": 729 + }, + { + "epoch": 33.13636363636363, + "eval_loss": 0.0008224839111790061, + "eval_runtime": 0.2693, + "eval_samples_per_second": 326.807, + "eval_steps_per_second": 40.851, + "step": 729 + }, + { + "epoch": 33.18181818181818, + "grad_norm": 0.013969271443784237, + "learning_rate": 1.618421052631579e-06, + "loss": 0.0014, + "step": 730 + }, + { + "epoch": 33.18181818181818, + "eval_loss": 0.0008223024778999388, + "eval_runtime": 0.2985, + "eval_samples_per_second": 294.848, + "eval_steps_per_second": 36.856, + "step": 730 + }, + { + "epoch": 33.22727272727273, + "grad_norm": 0.01247771643102169, + "learning_rate": 1.5789473684210526e-06, + "loss": 0.0014, + "step": 731 + }, + { + "epoch": 33.22727272727273, + "eval_loss": 0.0008221129537560046, + "eval_runtime": 0.2564, + "eval_samples_per_second": 343.194, + "eval_steps_per_second": 42.899, + "step": 731 + }, + { + "epoch": 33.27272727272727, + "grad_norm": 0.012111688032746315, + "learning_rate": 1.5394736842105263e-06, + "loss": 0.0013, + "step": 732 + }, + { + "epoch": 33.27272727272727, + "eval_loss": 0.0008219464216381311, + "eval_runtime": 0.2484, + "eval_samples_per_second": 354.308, + "eval_steps_per_second": 44.289, + "step": 732 + }, + { + "epoch": 33.31818181818182, + "grad_norm": 0.01268478948622942, + "learning_rate": 1.5e-06, + "loss": 0.0014, + "step": 733 + }, + { + "epoch": 33.31818181818182, + "eval_loss": 0.0008217745926231146, + "eval_runtime": 0.2938, + "eval_samples_per_second": 299.518, + "eval_steps_per_second": 37.44, + "step": 733 + }, + { + "epoch": 33.36363636363637, + "grad_norm": 0.01151086576282978, + "learning_rate": 1.4605263157894738e-06, + "loss": 0.0012, + "step": 734 + }, + { + "epoch": 33.36363636363637, + "eval_loss": 0.0008215824491344392, + "eval_runtime": 0.2627, + "eval_samples_per_second": 335.021, + "eval_steps_per_second": 41.878, + "step": 734 + }, + { + "epoch": 33.40909090909091, + "grad_norm": 0.012743664905428886, + "learning_rate": 1.4210526315789475e-06, + "loss": 0.0014, + "step": 735 + }, + { + "epoch": 33.40909090909091, + "eval_loss": 0.0008214117842726409, + "eval_runtime": 0.257, + "eval_samples_per_second": 342.469, + "eval_steps_per_second": 42.809, + "step": 735 + }, + { + "epoch": 33.45454545454545, + "grad_norm": 0.014465508982539177, + "learning_rate": 1.381578947368421e-06, + "loss": 0.0015, + "step": 736 + }, + { + "epoch": 33.45454545454545, + "eval_loss": 0.0008212332031689584, + "eval_runtime": 0.2383, + "eval_samples_per_second": 369.294, + "eval_steps_per_second": 46.162, + "step": 736 + }, + { + "epoch": 33.5, + "grad_norm": 0.011136289685964584, + "learning_rate": 1.3421052631578947e-06, + "loss": 0.0013, + "step": 737 + }, + { + "epoch": 33.5, + "eval_loss": 0.0008210748201236129, + "eval_runtime": 0.2515, + "eval_samples_per_second": 349.848, + "eval_steps_per_second": 43.731, + "step": 737 + }, + { + "epoch": 33.54545454545455, + "grad_norm": 0.013279801234602928, + "learning_rate": 1.3026315789473685e-06, + "loss": 0.0014, + "step": 738 + }, + { + "epoch": 33.54545454545455, + "eval_loss": 0.0008209014777094126, + "eval_runtime": 0.2624, + "eval_samples_per_second": 335.412, + "eval_steps_per_second": 41.926, + "step": 738 + }, + { + "epoch": 33.59090909090909, + "grad_norm": 0.011146324686706066, + "learning_rate": 1.2631578947368422e-06, + "loss": 0.0012, + "step": 739 + }, + { + "epoch": 33.59090909090909, + "eval_loss": 0.0008207445498555899, + "eval_runtime": 0.2477, + "eval_samples_per_second": 355.277, + "eval_steps_per_second": 44.41, + "step": 739 + }, + { + "epoch": 33.63636363636363, + "grad_norm": 0.011300037615001202, + "learning_rate": 1.223684210526316e-06, + "loss": 0.0013, + "step": 740 + }, + { + "epoch": 33.63636363636363, + "eval_loss": 0.0008206011261790991, + "eval_runtime": 0.2524, + "eval_samples_per_second": 348.598, + "eval_steps_per_second": 43.575, + "step": 740 + }, + { + "epoch": 33.68181818181818, + "grad_norm": 0.013210857287049294, + "learning_rate": 1.1842105263157894e-06, + "loss": 0.0013, + "step": 741 + }, + { + "epoch": 33.68181818181818, + "eval_loss": 0.0008204494952224195, + "eval_runtime": 0.2769, + "eval_samples_per_second": 317.777, + "eval_steps_per_second": 39.722, + "step": 741 + }, + { + "epoch": 33.72727272727273, + "grad_norm": 0.011201176792383194, + "learning_rate": 1.1447368421052632e-06, + "loss": 0.0013, + "step": 742 + }, + { + "epoch": 33.72727272727273, + "eval_loss": 0.0008203128236345947, + "eval_runtime": 0.259, + "eval_samples_per_second": 339.749, + "eval_steps_per_second": 42.469, + "step": 742 + }, + { + "epoch": 33.77272727272727, + "grad_norm": 0.012550720945000648, + "learning_rate": 1.1052631578947367e-06, + "loss": 0.0013, + "step": 743 + }, + { + "epoch": 33.77272727272727, + "eval_loss": 0.0008201680611819029, + "eval_runtime": 0.2549, + "eval_samples_per_second": 345.3, + "eval_steps_per_second": 43.163, + "step": 743 + }, + { + "epoch": 33.81818181818182, + "grad_norm": 0.011524029076099396, + "learning_rate": 1.0657894736842106e-06, + "loss": 0.0014, + "step": 744 + }, + { + "epoch": 33.81818181818182, + "eval_loss": 0.0008200569427572191, + "eval_runtime": 0.2576, + "eval_samples_per_second": 341.575, + "eval_steps_per_second": 42.697, + "step": 744 + }, + { + "epoch": 33.86363636363637, + "grad_norm": 0.014999749138951302, + "learning_rate": 1.0263157894736843e-06, + "loss": 0.0015, + "step": 745 + }, + { + "epoch": 33.86363636363637, + "eval_loss": 0.0008199459407478571, + "eval_runtime": 0.2573, + "eval_samples_per_second": 342.038, + "eval_steps_per_second": 42.755, + "step": 745 + }, + { + "epoch": 33.90909090909091, + "grad_norm": 0.013432620093226433, + "learning_rate": 9.868421052631579e-07, + "loss": 0.0014, + "step": 746 + }, + { + "epoch": 33.90909090909091, + "eval_loss": 0.0008198119467124343, + "eval_runtime": 0.2794, + "eval_samples_per_second": 314.936, + "eval_steps_per_second": 39.367, + "step": 746 + }, + { + "epoch": 33.95454545454545, + "grad_norm": 0.011333504691720009, + "learning_rate": 9.473684210526316e-07, + "loss": 0.0014, + "step": 747 + }, + { + "epoch": 33.95454545454545, + "eval_loss": 0.0008196914568543434, + "eval_runtime": 0.2549, + "eval_samples_per_second": 345.205, + "eval_steps_per_second": 43.151, + "step": 747 + }, + { + "epoch": 34.0, + "grad_norm": 0.0102554215118289, + "learning_rate": 9.078947368421053e-07, + "loss": 0.0012, + "step": 748 + }, + { + "epoch": 34.0, + "eval_loss": 0.0008195764967240393, + "eval_runtime": 0.2656, + "eval_samples_per_second": 331.349, + "eval_steps_per_second": 41.419, + "step": 748 + }, + { + "epoch": 34.04545454545455, + "grad_norm": 0.011500447988510132, + "learning_rate": 8.684210526315789e-07, + "loss": 0.0013, + "step": 749 + }, + { + "epoch": 34.04545454545455, + "eval_loss": 0.0008194709080271423, + "eval_runtime": 0.2631, + "eval_samples_per_second": 334.415, + "eval_steps_per_second": 41.802, + "step": 749 + }, + { + "epoch": 34.09090909090909, + "grad_norm": 0.011614636518061161, + "learning_rate": 8.289473684210527e-07, + "loss": 0.0014, + "step": 750 + }, + { + "epoch": 34.09090909090909, + "eval_loss": 0.0008193707326427102, + "eval_runtime": 0.2898, + "eval_samples_per_second": 303.667, + "eval_steps_per_second": 37.958, + "step": 750 + }, + { + "epoch": 34.13636363636363, + "grad_norm": 0.010696956887841225, + "learning_rate": 7.894736842105263e-07, + "loss": 0.0013, + "step": 751 + }, + { + "epoch": 34.13636363636363, + "eval_loss": 0.0008192665409296751, + "eval_runtime": 0.2328, + "eval_samples_per_second": 378.006, + "eval_steps_per_second": 47.251, + "step": 751 + }, + { + "epoch": 34.18181818181818, + "grad_norm": 0.011633389629423618, + "learning_rate": 7.5e-07, + "loss": 0.0014, + "step": 752 + }, + { + "epoch": 34.18181818181818, + "eval_loss": 0.0008191689848899841, + "eval_runtime": 0.2379, + "eval_samples_per_second": 369.833, + "eval_steps_per_second": 46.229, + "step": 752 + }, + { + "epoch": 34.22727272727273, + "grad_norm": 0.013071279041469097, + "learning_rate": 7.105263157894737e-07, + "loss": 0.0013, + "step": 753 + }, + { + "epoch": 34.22727272727273, + "eval_loss": 0.0008190743392333388, + "eval_runtime": 0.2373, + "eval_samples_per_second": 370.812, + "eval_steps_per_second": 46.351, + "step": 753 + }, + { + "epoch": 34.27272727272727, + "grad_norm": 0.011386328376829624, + "learning_rate": 6.710526315789474e-07, + "loss": 0.0013, + "step": 754 + }, + { + "epoch": 34.27272727272727, + "eval_loss": 0.0008190052467398345, + "eval_runtime": 0.2237, + "eval_samples_per_second": 393.447, + "eval_steps_per_second": 49.181, + "step": 754 + }, + { + "epoch": 34.31818181818182, + "grad_norm": 0.011327208951115608, + "learning_rate": 6.315789473684211e-07, + "loss": 0.0013, + "step": 755 + }, + { + "epoch": 34.31818181818182, + "eval_loss": 0.0008189321961253881, + "eval_runtime": 0.2293, + "eval_samples_per_second": 383.737, + "eval_steps_per_second": 47.967, + "step": 755 + }, + { + "epoch": 34.36363636363637, + "grad_norm": 0.011524545960128307, + "learning_rate": 5.921052631578947e-07, + "loss": 0.0014, + "step": 756 + }, + { + "epoch": 34.36363636363637, + "eval_loss": 0.0008188713109120727, + "eval_runtime": 0.2355, + "eval_samples_per_second": 373.643, + "eval_steps_per_second": 46.705, + "step": 756 + }, + { + "epoch": 34.40909090909091, + "grad_norm": 0.012313243001699448, + "learning_rate": 5.526315789473683e-07, + "loss": 0.0014, + "step": 757 + }, + { + "epoch": 34.40909090909091, + "eval_loss": 0.0008188103674910963, + "eval_runtime": 0.4518, + "eval_samples_per_second": 194.767, + "eval_steps_per_second": 24.346, + "step": 757 + }, + { + "epoch": 34.45454545454545, + "grad_norm": 0.012687238864600658, + "learning_rate": 5.131578947368422e-07, + "loss": 0.0014, + "step": 758 + }, + { + "epoch": 34.45454545454545, + "eval_loss": 0.0008187480852939188, + "eval_runtime": 0.2941, + "eval_samples_per_second": 299.267, + "eval_steps_per_second": 37.408, + "step": 758 + }, + { + "epoch": 34.5, + "grad_norm": 0.012826275080442429, + "learning_rate": 4.736842105263158e-07, + "loss": 0.0013, + "step": 759 + }, + { + "epoch": 34.5, + "eval_loss": 0.0008186926716007292, + "eval_runtime": 0.3991, + "eval_samples_per_second": 220.5, + "eval_steps_per_second": 27.562, + "step": 759 + }, + { + "epoch": 34.54545454545455, + "grad_norm": 0.012961960397660732, + "learning_rate": 4.3421052631578947e-07, + "loss": 0.0015, + "step": 760 + }, + { + "epoch": 34.54545454545455, + "eval_loss": 0.0008186465711332858, + "eval_runtime": 0.2484, + "eval_samples_per_second": 354.263, + "eval_steps_per_second": 44.283, + "step": 760 + }, + { + "epoch": 34.59090909090909, + "grad_norm": 0.013269671238958836, + "learning_rate": 3.9473684210526315e-07, + "loss": 0.0014, + "step": 761 + }, + { + "epoch": 34.59090909090909, + "eval_loss": 0.0008186018676497042, + "eval_runtime": 0.291, + "eval_samples_per_second": 302.416, + "eval_steps_per_second": 37.802, + "step": 761 + }, + { + "epoch": 34.63636363636363, + "grad_norm": 0.012951558455824852, + "learning_rate": 3.5526315789473687e-07, + "loss": 0.0013, + "step": 762 + }, + { + "epoch": 34.63636363636363, + "eval_loss": 0.0008185504120774567, + "eval_runtime": 0.2594, + "eval_samples_per_second": 339.213, + "eval_steps_per_second": 42.402, + "step": 762 + }, + { + "epoch": 34.68181818181818, + "grad_norm": 0.01040305569767952, + "learning_rate": 3.1578947368421055e-07, + "loss": 0.0013, + "step": 763 + }, + { + "epoch": 34.68181818181818, + "eval_loss": 0.0008185274782590568, + "eval_runtime": 0.3222, + "eval_samples_per_second": 273.081, + "eval_steps_per_second": 34.135, + "step": 763 + }, + { + "epoch": 34.72727272727273, + "grad_norm": 0.013104148209095001, + "learning_rate": 2.7631578947368417e-07, + "loss": 0.0014, + "step": 764 + }, + { + "epoch": 34.72727272727273, + "eval_loss": 0.0008184895268641412, + "eval_runtime": 0.3124, + "eval_samples_per_second": 281.65, + "eval_steps_per_second": 35.206, + "step": 764 + }, + { + "epoch": 34.77272727272727, + "grad_norm": 0.012136269360780716, + "learning_rate": 2.368421052631579e-07, + "loss": 0.0014, + "step": 765 + }, + { + "epoch": 34.77272727272727, + "eval_loss": 0.0008184570469893515, + "eval_runtime": 0.2394, + "eval_samples_per_second": 367.55, + "eval_steps_per_second": 45.944, + "step": 765 + }, + { + "epoch": 34.81818181818182, + "grad_norm": 0.011621113866567612, + "learning_rate": 1.9736842105263157e-07, + "loss": 0.0014, + "step": 766 + }, + { + "epoch": 34.81818181818182, + "eval_loss": 0.0008184341131709516, + "eval_runtime": 0.2627, + "eval_samples_per_second": 334.967, + "eval_steps_per_second": 41.871, + "step": 766 + }, + { + "epoch": 34.86363636363637, + "grad_norm": 0.0140585508197546, + "learning_rate": 1.5789473684210527e-07, + "loss": 0.0014, + "step": 767 + }, + { + "epoch": 34.86363636363637, + "eval_loss": 0.0008184110629372299, + "eval_runtime": 0.2624, + "eval_samples_per_second": 335.359, + "eval_steps_per_second": 41.92, + "step": 767 + }, + { + "epoch": 34.90909090909091, + "grad_norm": 0.0137332146987319, + "learning_rate": 1.1842105263157895e-07, + "loss": 0.0013, + "step": 768 + }, + { + "epoch": 34.90909090909091, + "eval_loss": 0.000818394822999835, + "eval_runtime": 0.3455, + "eval_samples_per_second": 254.714, + "eval_steps_per_second": 31.839, + "step": 768 + }, + { + "epoch": 34.95454545454545, + "grad_norm": 0.013574851676821709, + "learning_rate": 7.894736842105264e-08, + "loss": 0.0015, + "step": 769 + }, + { + "epoch": 34.95454545454545, + "eval_loss": 0.000818385393358767, + "eval_runtime": 0.4011, + "eval_samples_per_second": 219.413, + "eval_steps_per_second": 27.427, + "step": 769 + }, + { + "epoch": 35.0, + "grad_norm": 0.01393211167305708, + "learning_rate": 3.947368421052632e-08, + "loss": 0.0014, + "step": 770 + }, + { + "epoch": 35.0, + "eval_loss": 0.0008183813188225031, + "eval_runtime": 0.2776, + "eval_samples_per_second": 316.984, + "eval_steps_per_second": 39.623, + "step": 770 + } + ], + "logging_steps": 1, + "max_steps": 770, + "num_input_tokens_seen": 0, + "num_train_epochs": 35, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 335566894333440.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}