diff --git "a/checkpoint-770/trainer_state.json" "b/checkpoint-770/trainer_state.json" deleted file mode 100644--- "a/checkpoint-770/trainer_state.json" +++ /dev/null @@ -1,11584 +0,0 @@ -{ - "best_global_step": null, - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 35.0, - "eval_steps": 1, - "global_step": 770, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.045454545454545456, - "grad_norm": 5.237588882446289, - "learning_rate": 0.0, - "loss": 2.0682, - "step": 1 - }, - { - "epoch": 0.045454545454545456, - "eval_loss": 2.063732147216797, - "eval_runtime": 0.2778, - "eval_samples_per_second": 316.813, - "eval_steps_per_second": 39.602, - "step": 1 - }, - { - "epoch": 0.09090909090909091, - "grad_norm": 5.7836594581604, - "learning_rate": 3e-06, - "loss": 2.0543, - "step": 2 - }, - { - "epoch": 0.09090909090909091, - "eval_loss": 2.058272123336792, - "eval_runtime": 0.2138, - "eval_samples_per_second": 411.689, - "eval_steps_per_second": 51.461, - "step": 2 - }, - { - "epoch": 0.13636363636363635, - "grad_norm": 4.997707366943359, - "learning_rate": 6e-06, - "loss": 2.106, - "step": 3 - }, - { - "epoch": 0.13636363636363635, - "eval_loss": 2.044473171234131, - "eval_runtime": 0.2229, - "eval_samples_per_second": 394.85, - "eval_steps_per_second": 49.356, - "step": 3 - }, - { - "epoch": 0.18181818181818182, - "grad_norm": 4.480862140655518, - "learning_rate": 9e-06, - "loss": 2.0133, - "step": 4 - }, - { - "epoch": 0.18181818181818182, - "eval_loss": 2.026616096496582, - "eval_runtime": 0.2098, - "eval_samples_per_second": 419.399, - "eval_steps_per_second": 52.425, - "step": 4 - }, - { - "epoch": 0.22727272727272727, - "grad_norm": 4.413949489593506, - "learning_rate": 1.2e-05, - "loss": 2.0339, - "step": 5 - }, - { - "epoch": 0.22727272727272727, - "eval_loss": 2.0050275325775146, - "eval_runtime": 0.2083, - "eval_samples_per_second": 422.489, - "eval_steps_per_second": 52.811, - "step": 5 - }, - { - "epoch": 0.2727272727272727, - "grad_norm": 3.8636281490325928, - "learning_rate": 1.5e-05, - "loss": 1.9456, - "step": 6 - }, - { - "epoch": 0.2727272727272727, - "eval_loss": 1.978696346282959, - "eval_runtime": 0.2234, - "eval_samples_per_second": 393.959, - "eval_steps_per_second": 49.245, - "step": 6 - }, - { - "epoch": 0.3181818181818182, - "grad_norm": 5.352145671844482, - "learning_rate": 1.8e-05, - "loss": 2.0702, - "step": 7 - }, - { - "epoch": 0.3181818181818182, - "eval_loss": 1.9451583623886108, - "eval_runtime": 0.2365, - "eval_samples_per_second": 372.165, - "eval_steps_per_second": 46.521, - "step": 7 - }, - { - "epoch": 0.36363636363636365, - "grad_norm": 6.098653316497803, - "learning_rate": 2.1e-05, - "loss": 1.9057, - "step": 8 - }, - { - "epoch": 0.36363636363636365, - "eval_loss": 1.908401608467102, - "eval_runtime": 0.2109, - "eval_samples_per_second": 417.279, - "eval_steps_per_second": 52.16, - "step": 8 - }, - { - "epoch": 0.4090909090909091, - "grad_norm": 4.3218302726745605, - "learning_rate": 2.4e-05, - "loss": 2.0159, - "step": 9 - }, - { - "epoch": 0.4090909090909091, - "eval_loss": 1.860684871673584, - "eval_runtime": 0.2261, - "eval_samples_per_second": 389.203, - "eval_steps_per_second": 48.65, - "step": 9 - }, - { - "epoch": 0.45454545454545453, - "grad_norm": 4.778627395629883, - "learning_rate": 2.7000000000000002e-05, - "loss": 1.8808, - "step": 10 - }, - { - "epoch": 0.45454545454545453, - "eval_loss": 1.793589472770691, - "eval_runtime": 0.2922, - "eval_samples_per_second": 301.187, - "eval_steps_per_second": 37.648, - "step": 10 - }, - { - "epoch": 0.5, - "grad_norm": 5.957038879394531, - "learning_rate": 3e-05, - "loss": 1.896, - "step": 11 - }, - { - "epoch": 0.5, - "eval_loss": 1.7104023694992065, - "eval_runtime": 0.3181, - "eval_samples_per_second": 276.671, - "eval_steps_per_second": 34.584, - "step": 11 - }, - { - "epoch": 0.5454545454545454, - "grad_norm": 6.62753963470459, - "learning_rate": 2.9960526315789475e-05, - "loss": 1.7627, - "step": 12 - }, - { - "epoch": 0.5454545454545454, - "eval_loss": 1.6353049278259277, - "eval_runtime": 0.4101, - "eval_samples_per_second": 214.57, - "eval_steps_per_second": 26.821, - "step": 12 - }, - { - "epoch": 0.5909090909090909, - "grad_norm": 5.637991905212402, - "learning_rate": 2.992105263157895e-05, - "loss": 1.6927, - "step": 13 - }, - { - "epoch": 0.5909090909090909, - "eval_loss": 1.5653632879257202, - "eval_runtime": 0.3772, - "eval_samples_per_second": 233.322, - "eval_steps_per_second": 29.165, - "step": 13 - }, - { - "epoch": 0.6363636363636364, - "grad_norm": 7.619434356689453, - "learning_rate": 2.9881578947368423e-05, - "loss": 1.5805, - "step": 14 - }, - { - "epoch": 0.6363636363636364, - "eval_loss": 1.4975870847702026, - "eval_runtime": 0.2484, - "eval_samples_per_second": 354.217, - "eval_steps_per_second": 44.277, - "step": 14 - }, - { - "epoch": 0.6818181818181818, - "grad_norm": 8.660569190979004, - "learning_rate": 2.9842105263157894e-05, - "loss": 1.5803, - "step": 15 - }, - { - "epoch": 0.6818181818181818, - "eval_loss": 1.4246007204055786, - "eval_runtime": 0.3233, - "eval_samples_per_second": 272.164, - "eval_steps_per_second": 34.02, - "step": 15 - }, - { - "epoch": 0.7272727272727273, - "grad_norm": 6.809484481811523, - "learning_rate": 2.980263157894737e-05, - "loss": 1.4897, - "step": 16 - }, - { - "epoch": 0.7272727272727273, - "eval_loss": 1.3582329750061035, - "eval_runtime": 0.2729, - "eval_samples_per_second": 322.483, - "eval_steps_per_second": 40.31, - "step": 16 - }, - { - "epoch": 0.7727272727272727, - "grad_norm": 7.0124711990356445, - "learning_rate": 2.9763157894736842e-05, - "loss": 1.3831, - "step": 17 - }, - { - "epoch": 0.7727272727272727, - "eval_loss": 1.2863445281982422, - "eval_runtime": 0.2734, - "eval_samples_per_second": 321.916, - "eval_steps_per_second": 40.24, - "step": 17 - }, - { - "epoch": 0.8181818181818182, - "grad_norm": 6.749629974365234, - "learning_rate": 2.9723684210526316e-05, - "loss": 1.2616, - "step": 18 - }, - { - "epoch": 0.8181818181818182, - "eval_loss": 1.1985043287277222, - "eval_runtime": 0.2953, - "eval_samples_per_second": 297.968, - "eval_steps_per_second": 37.246, - "step": 18 - }, - { - "epoch": 0.8636363636363636, - "grad_norm": 8.935945510864258, - "learning_rate": 2.968421052631579e-05, - "loss": 1.2058, - "step": 19 - }, - { - "epoch": 0.8636363636363636, - "eval_loss": 1.1089844703674316, - "eval_runtime": 0.3886, - "eval_samples_per_second": 226.48, - "eval_steps_per_second": 28.31, - "step": 19 - }, - { - "epoch": 0.9090909090909091, - "grad_norm": 5.048995018005371, - "learning_rate": 2.9644736842105265e-05, - "loss": 1.1399, - "step": 20 - }, - { - "epoch": 0.9090909090909091, - "eval_loss": 1.0176739692687988, - "eval_runtime": 0.2913, - "eval_samples_per_second": 302.091, - "eval_steps_per_second": 37.761, - "step": 20 - }, - { - "epoch": 0.9545454545454546, - "grad_norm": 6.563332557678223, - "learning_rate": 2.9605263157894735e-05, - "loss": 0.9906, - "step": 21 - }, - { - "epoch": 0.9545454545454546, - "eval_loss": 0.930864155292511, - "eval_runtime": 0.2425, - "eval_samples_per_second": 362.831, - "eval_steps_per_second": 45.354, - "step": 21 - }, - { - "epoch": 1.0, - "grad_norm": 12.079025268554688, - "learning_rate": 2.9565789473684213e-05, - "loss": 1.0795, - "step": 22 - }, - { - "epoch": 1.0, - "eval_loss": 0.8574727773666382, - "eval_runtime": 0.2662, - "eval_samples_per_second": 330.588, - "eval_steps_per_second": 41.323, - "step": 22 - }, - { - "epoch": 1.0454545454545454, - "grad_norm": 5.452284336090088, - "learning_rate": 2.9526315789473684e-05, - "loss": 0.8862, - "step": 23 - }, - { - "epoch": 1.0454545454545454, - "eval_loss": 0.7834421396255493, - "eval_runtime": 0.2559, - "eval_samples_per_second": 343.941, - "eval_steps_per_second": 42.993, - "step": 23 - }, - { - "epoch": 1.0909090909090908, - "grad_norm": 6.780595302581787, - "learning_rate": 2.9486842105263158e-05, - "loss": 0.7825, - "step": 24 - }, - { - "epoch": 1.0909090909090908, - "eval_loss": 0.7133036255836487, - "eval_runtime": 0.4064, - "eval_samples_per_second": 216.526, - "eval_steps_per_second": 27.066, - "step": 24 - }, - { - "epoch": 1.1363636363636362, - "grad_norm": 6.756824016571045, - "learning_rate": 2.9447368421052635e-05, - "loss": 0.9249, - "step": 25 - }, - { - "epoch": 1.1363636363636362, - "eval_loss": 0.652009904384613, - "eval_runtime": 0.4122, - "eval_samples_per_second": 213.486, - "eval_steps_per_second": 26.686, - "step": 25 - }, - { - "epoch": 1.1818181818181819, - "grad_norm": 4.798681259155273, - "learning_rate": 2.9407894736842106e-05, - "loss": 0.5773, - "step": 26 - }, - { - "epoch": 1.1818181818181819, - "eval_loss": 0.6013602614402771, - "eval_runtime": 0.4182, - "eval_samples_per_second": 210.411, - "eval_steps_per_second": 26.301, - "step": 26 - }, - { - "epoch": 1.2272727272727273, - "grad_norm": 4.608880996704102, - "learning_rate": 2.936842105263158e-05, - "loss": 0.6573, - "step": 27 - }, - { - "epoch": 1.2272727272727273, - "eval_loss": 0.5579346418380737, - "eval_runtime": 0.5426, - "eval_samples_per_second": 162.18, - "eval_steps_per_second": 20.272, - "step": 27 - }, - { - "epoch": 1.2727272727272727, - "grad_norm": 4.582436561584473, - "learning_rate": 2.9328947368421055e-05, - "loss": 0.5408, - "step": 28 - }, - { - "epoch": 1.2727272727272727, - "eval_loss": 0.5213125348091125, - "eval_runtime": 0.274, - "eval_samples_per_second": 321.214, - "eval_steps_per_second": 40.152, - "step": 28 - }, - { - "epoch": 1.3181818181818181, - "grad_norm": 6.145488262176514, - "learning_rate": 2.928947368421053e-05, - "loss": 0.6888, - "step": 29 - }, - { - "epoch": 1.3181818181818181, - "eval_loss": 0.47387245297431946, - "eval_runtime": 0.2153, - "eval_samples_per_second": 408.668, - "eval_steps_per_second": 51.083, - "step": 29 - }, - { - "epoch": 1.3636363636363638, - "grad_norm": 4.611596584320068, - "learning_rate": 2.925e-05, - "loss": 0.584, - "step": 30 - }, - { - "epoch": 1.3636363636363638, - "eval_loss": 0.41591426730155945, - "eval_runtime": 0.2262, - "eval_samples_per_second": 388.952, - "eval_steps_per_second": 48.619, - "step": 30 - }, - { - "epoch": 1.4090909090909092, - "grad_norm": 4.470975875854492, - "learning_rate": 2.9210526315789474e-05, - "loss": 0.4962, - "step": 31 - }, - { - "epoch": 1.4090909090909092, - "eval_loss": 0.3586600720882416, - "eval_runtime": 0.2233, - "eval_samples_per_second": 394.029, - "eval_steps_per_second": 49.254, - "step": 31 - }, - { - "epoch": 1.4545454545454546, - "grad_norm": 3.111593008041382, - "learning_rate": 2.9171052631578948e-05, - "loss": 0.3594, - "step": 32 - }, - { - "epoch": 1.4545454545454546, - "eval_loss": 0.3188125491142273, - "eval_runtime": 0.3382, - "eval_samples_per_second": 260.203, - "eval_steps_per_second": 32.525, - "step": 32 - }, - { - "epoch": 1.5, - "grad_norm": 3.246596336364746, - "learning_rate": 2.9131578947368422e-05, - "loss": 0.3643, - "step": 33 - }, - { - "epoch": 1.5, - "eval_loss": 0.2900885343551636, - "eval_runtime": 0.2904, - "eval_samples_per_second": 302.998, - "eval_steps_per_second": 37.875, - "step": 33 - }, - { - "epoch": 1.5454545454545454, - "grad_norm": 4.4003376960754395, - "learning_rate": 2.9092105263157893e-05, - "loss": 0.3334, - "step": 34 - }, - { - "epoch": 1.5454545454545454, - "eval_loss": 0.260213166475296, - "eval_runtime": 0.3641, - "eval_samples_per_second": 241.707, - "eval_steps_per_second": 30.213, - "step": 34 - }, - { - "epoch": 1.5909090909090908, - "grad_norm": 5.7509236335754395, - "learning_rate": 2.905263157894737e-05, - "loss": 0.3754, - "step": 35 - }, - { - "epoch": 1.5909090909090908, - "eval_loss": 0.2297886312007904, - "eval_runtime": 0.3003, - "eval_samples_per_second": 293.032, - "eval_steps_per_second": 36.629, - "step": 35 - }, - { - "epoch": 1.6363636363636362, - "grad_norm": 3.7421319484710693, - "learning_rate": 2.901315789473684e-05, - "loss": 0.3108, - "step": 36 - }, - { - "epoch": 1.6363636363636362, - "eval_loss": 0.21363353729248047, - "eval_runtime": 0.4783, - "eval_samples_per_second": 183.979, - "eval_steps_per_second": 22.997, - "step": 36 - }, - { - "epoch": 1.6818181818181817, - "grad_norm": 3.7049357891082764, - "learning_rate": 2.8973684210526315e-05, - "loss": 0.2933, - "step": 37 - }, - { - "epoch": 1.6818181818181817, - "eval_loss": 0.20323915779590607, - "eval_runtime": 0.25, - "eval_samples_per_second": 351.979, - "eval_steps_per_second": 43.997, - "step": 37 - }, - { - "epoch": 1.7272727272727273, - "grad_norm": 2.6143414974212646, - "learning_rate": 2.893421052631579e-05, - "loss": 0.2208, - "step": 38 - }, - { - "epoch": 1.7272727272727273, - "eval_loss": 0.19065451622009277, - "eval_runtime": 0.284, - "eval_samples_per_second": 309.864, - "eval_steps_per_second": 38.733, - "step": 38 - }, - { - "epoch": 1.7727272727272727, - "grad_norm": 3.0895273685455322, - "learning_rate": 2.8894736842105263e-05, - "loss": 0.2448, - "step": 39 - }, - { - "epoch": 1.7727272727272727, - "eval_loss": 0.17271381616592407, - "eval_runtime": 0.3543, - "eval_samples_per_second": 248.403, - "eval_steps_per_second": 31.05, - "step": 39 - }, - { - "epoch": 1.8181818181818183, - "grad_norm": 1.7658973932266235, - "learning_rate": 2.8855263157894738e-05, - "loss": 0.1742, - "step": 40 - }, - { - "epoch": 1.8181818181818183, - "eval_loss": 0.152969092130661, - "eval_runtime": 0.2714, - "eval_samples_per_second": 324.231, - "eval_steps_per_second": 40.529, - "step": 40 - }, - { - "epoch": 1.8636363636363638, - "grad_norm": 1.7428200244903564, - "learning_rate": 2.8815789473684212e-05, - "loss": 0.1717, - "step": 41 - }, - { - "epoch": 1.8636363636363638, - "eval_loss": 0.13160385191440582, - "eval_runtime": 0.2485, - "eval_samples_per_second": 354.091, - "eval_steps_per_second": 44.261, - "step": 41 - }, - { - "epoch": 1.9090909090909092, - "grad_norm": 1.9848284721374512, - "learning_rate": 2.8776315789473686e-05, - "loss": 0.1487, - "step": 42 - }, - { - "epoch": 1.9090909090909092, - "eval_loss": 0.11496426910161972, - "eval_runtime": 0.2812, - "eval_samples_per_second": 312.902, - "eval_steps_per_second": 39.113, - "step": 42 - }, - { - "epoch": 1.9545454545454546, - "grad_norm": 1.8623422384262085, - "learning_rate": 2.8736842105263157e-05, - "loss": 0.1671, - "step": 43 - }, - { - "epoch": 1.9545454545454546, - "eval_loss": 0.10060829669237137, - "eval_runtime": 0.531, - "eval_samples_per_second": 165.721, - "eval_steps_per_second": 20.715, - "step": 43 - }, - { - "epoch": 2.0, - "grad_norm": 1.254258632659912, - "learning_rate": 2.8697368421052634e-05, - "loss": 0.1296, - "step": 44 - }, - { - "epoch": 2.0, - "eval_loss": 0.09032303839921951, - "eval_runtime": 0.4212, - "eval_samples_per_second": 208.91, - "eval_steps_per_second": 26.114, - "step": 44 - }, - { - "epoch": 2.0454545454545454, - "grad_norm": 1.7023710012435913, - "learning_rate": 2.8657894736842105e-05, - "loss": 0.1269, - "step": 45 - }, - { - "epoch": 2.0454545454545454, - "eval_loss": 0.08172891288995743, - "eval_runtime": 0.3434, - "eval_samples_per_second": 256.286, - "eval_steps_per_second": 32.036, - "step": 45 - }, - { - "epoch": 2.090909090909091, - "grad_norm": 1.1132336854934692, - "learning_rate": 2.861842105263158e-05, - "loss": 0.1087, - "step": 46 - }, - { - "epoch": 2.090909090909091, - "eval_loss": 0.07363786548376083, - "eval_runtime": 0.2227, - "eval_samples_per_second": 395.148, - "eval_steps_per_second": 49.393, - "step": 46 - }, - { - "epoch": 2.1363636363636362, - "grad_norm": 1.2574397325515747, - "learning_rate": 2.8578947368421053e-05, - "loss": 0.1007, - "step": 47 - }, - { - "epoch": 2.1363636363636362, - "eval_loss": 0.0676058903336525, - "eval_runtime": 0.2162, - "eval_samples_per_second": 406.971, - "eval_steps_per_second": 50.871, - "step": 47 - }, - { - "epoch": 2.1818181818181817, - "grad_norm": 1.1193581819534302, - "learning_rate": 2.8539473684210527e-05, - "loss": 0.0932, - "step": 48 - }, - { - "epoch": 2.1818181818181817, - "eval_loss": 0.060314346104860306, - "eval_runtime": 0.2456, - "eval_samples_per_second": 358.319, - "eval_steps_per_second": 44.79, - "step": 48 - }, - { - "epoch": 2.227272727272727, - "grad_norm": 1.1668117046356201, - "learning_rate": 2.8499999999999998e-05, - "loss": 0.0885, - "step": 49 - }, - { - "epoch": 2.227272727272727, - "eval_loss": 0.05352572351694107, - "eval_runtime": 0.2143, - "eval_samples_per_second": 410.66, - "eval_steps_per_second": 51.333, - "step": 49 - }, - { - "epoch": 2.2727272727272725, - "grad_norm": 0.9329622387886047, - "learning_rate": 2.8460526315789476e-05, - "loss": 0.0768, - "step": 50 - }, - { - "epoch": 2.2727272727272725, - "eval_loss": 0.049994777888059616, - "eval_runtime": 0.2184, - "eval_samples_per_second": 402.932, - "eval_steps_per_second": 50.367, - "step": 50 - }, - { - "epoch": 2.3181818181818183, - "grad_norm": 1.4205875396728516, - "learning_rate": 2.8421052631578946e-05, - "loss": 0.0871, - "step": 51 - }, - { - "epoch": 2.3181818181818183, - "eval_loss": 0.046269264072179794, - "eval_runtime": 0.2199, - "eval_samples_per_second": 400.111, - "eval_steps_per_second": 50.014, - "step": 51 - }, - { - "epoch": 2.3636363636363638, - "grad_norm": 0.6296802163124084, - "learning_rate": 2.838157894736842e-05, - "loss": 0.0597, - "step": 52 - }, - { - "epoch": 2.3636363636363638, - "eval_loss": 0.04288846254348755, - "eval_runtime": 0.2154, - "eval_samples_per_second": 408.528, - "eval_steps_per_second": 51.066, - "step": 52 - }, - { - "epoch": 2.409090909090909, - "grad_norm": 0.8016664981842041, - "learning_rate": 2.8342105263157898e-05, - "loss": 0.0573, - "step": 53 - }, - { - "epoch": 2.409090909090909, - "eval_loss": 0.03866353631019592, - "eval_runtime": 0.2104, - "eval_samples_per_second": 418.258, - "eval_steps_per_second": 52.282, - "step": 53 - }, - { - "epoch": 2.4545454545454546, - "grad_norm": 0.5186643600463867, - "learning_rate": 2.830263157894737e-05, - "loss": 0.0533, - "step": 54 - }, - { - "epoch": 2.4545454545454546, - "eval_loss": 0.03540382906794548, - "eval_runtime": 0.2148, - "eval_samples_per_second": 409.705, - "eval_steps_per_second": 51.213, - "step": 54 - }, - { - "epoch": 2.5, - "grad_norm": 0.616000771522522, - "learning_rate": 2.8263157894736843e-05, - "loss": 0.0543, - "step": 55 - }, - { - "epoch": 2.5, - "eval_loss": 0.03242316469550133, - "eval_runtime": 0.2116, - "eval_samples_per_second": 415.828, - "eval_steps_per_second": 51.979, - "step": 55 - }, - { - "epoch": 2.5454545454545454, - "grad_norm": 0.6781826615333557, - "learning_rate": 2.8223684210526317e-05, - "loss": 0.0527, - "step": 56 - }, - { - "epoch": 2.5454545454545454, - "eval_loss": 0.029892653226852417, - "eval_runtime": 0.2231, - "eval_samples_per_second": 394.465, - "eval_steps_per_second": 49.308, - "step": 56 - }, - { - "epoch": 2.590909090909091, - "grad_norm": 0.40553542971611023, - "learning_rate": 2.818421052631579e-05, - "loss": 0.043, - "step": 57 - }, - { - "epoch": 2.590909090909091, - "eval_loss": 0.02773384563624859, - "eval_runtime": 0.212, - "eval_samples_per_second": 415.108, - "eval_steps_per_second": 51.889, - "step": 57 - }, - { - "epoch": 2.6363636363636362, - "grad_norm": 0.46068763732910156, - "learning_rate": 2.8144736842105262e-05, - "loss": 0.0408, - "step": 58 - }, - { - "epoch": 2.6363636363636362, - "eval_loss": 0.025741351768374443, - "eval_runtime": 0.2177, - "eval_samples_per_second": 404.269, - "eval_steps_per_second": 50.534, - "step": 58 - }, - { - "epoch": 2.6818181818181817, - "grad_norm": 0.42782941460609436, - "learning_rate": 2.810526315789474e-05, - "loss": 0.0404, - "step": 59 - }, - { - "epoch": 2.6818181818181817, - "eval_loss": 0.023805884644389153, - "eval_runtime": 0.2164, - "eval_samples_per_second": 406.611, - "eval_steps_per_second": 50.826, - "step": 59 - }, - { - "epoch": 2.7272727272727275, - "grad_norm": 0.3100360035896301, - "learning_rate": 2.806578947368421e-05, - "loss": 0.0348, - "step": 60 - }, - { - "epoch": 2.7272727272727275, - "eval_loss": 0.022079171612858772, - "eval_runtime": 0.2121, - "eval_samples_per_second": 414.803, - "eval_steps_per_second": 51.85, - "step": 60 - }, - { - "epoch": 2.7727272727272725, - "grad_norm": 0.3292113244533539, - "learning_rate": 2.8026315789473685e-05, - "loss": 0.0331, - "step": 61 - }, - { - "epoch": 2.7727272727272725, - "eval_loss": 0.020567093044519424, - "eval_runtime": 0.2183, - "eval_samples_per_second": 403.093, - "eval_steps_per_second": 50.387, - "step": 61 - }, - { - "epoch": 2.8181818181818183, - "grad_norm": 0.4177182912826538, - "learning_rate": 2.798684210526316e-05, - "loss": 0.0323, - "step": 62 - }, - { - "epoch": 2.8181818181818183, - "eval_loss": 0.019224492833018303, - "eval_runtime": 0.2119, - "eval_samples_per_second": 415.211, - "eval_steps_per_second": 51.901, - "step": 62 - }, - { - "epoch": 2.8636363636363638, - "grad_norm": 0.23254263401031494, - "learning_rate": 2.7947368421052633e-05, - "loss": 0.0252, - "step": 63 - }, - { - "epoch": 2.8636363636363638, - "eval_loss": 0.01814187504351139, - "eval_runtime": 0.2203, - "eval_samples_per_second": 399.469, - "eval_steps_per_second": 49.934, - "step": 63 - }, - { - "epoch": 2.909090909090909, - "grad_norm": 0.38803598284721375, - "learning_rate": 2.7907894736842104e-05, - "loss": 0.031, - "step": 64 - }, - { - "epoch": 2.909090909090909, - "eval_loss": 0.01718403585255146, - "eval_runtime": 0.2179, - "eval_samples_per_second": 403.933, - "eval_steps_per_second": 50.492, - "step": 64 - }, - { - "epoch": 2.9545454545454546, - "grad_norm": 0.33151182532310486, - "learning_rate": 2.786842105263158e-05, - "loss": 0.03, - "step": 65 - }, - { - "epoch": 2.9545454545454546, - "eval_loss": 0.016221443191170692, - "eval_runtime": 0.2114, - "eval_samples_per_second": 416.237, - "eval_steps_per_second": 52.03, - "step": 65 - }, - { - "epoch": 3.0, - "grad_norm": 0.25049498677253723, - "learning_rate": 2.7828947368421055e-05, - "loss": 0.0244, - "step": 66 - }, - { - "epoch": 3.0, - "eval_loss": 0.015314313583076, - "eval_runtime": 0.2173, - "eval_samples_per_second": 404.944, - "eval_steps_per_second": 50.618, - "step": 66 - }, - { - "epoch": 3.0454545454545454, - "grad_norm": 0.2723033130168915, - "learning_rate": 2.7789473684210526e-05, - "loss": 0.0235, - "step": 67 - }, - { - "epoch": 3.0454545454545454, - "eval_loss": 0.014571275562047958, - "eval_runtime": 0.2218, - "eval_samples_per_second": 396.808, - "eval_steps_per_second": 49.601, - "step": 67 - }, - { - "epoch": 3.090909090909091, - "grad_norm": 0.20975647866725922, - "learning_rate": 2.7750000000000004e-05, - "loss": 0.0222, - "step": 68 - }, - { - "epoch": 3.090909090909091, - "eval_loss": 0.013959475792944431, - "eval_runtime": 0.2232, - "eval_samples_per_second": 394.228, - "eval_steps_per_second": 49.279, - "step": 68 - }, - { - "epoch": 3.1363636363636362, - "grad_norm": 0.2025345116853714, - "learning_rate": 2.7710526315789474e-05, - "loss": 0.0228, - "step": 69 - }, - { - "epoch": 3.1363636363636362, - "eval_loss": 0.013426948338747025, - "eval_runtime": 0.2201, - "eval_samples_per_second": 399.844, - "eval_steps_per_second": 49.981, - "step": 69 - }, - { - "epoch": 3.1818181818181817, - "grad_norm": 0.2033005654811859, - "learning_rate": 2.767105263157895e-05, - "loss": 0.0209, - "step": 70 - }, - { - "epoch": 3.1818181818181817, - "eval_loss": 0.012989457696676254, - "eval_runtime": 0.2125, - "eval_samples_per_second": 414.107, - "eval_steps_per_second": 51.763, - "step": 70 - }, - { - "epoch": 3.227272727272727, - "grad_norm": 0.18534056842327118, - "learning_rate": 2.7631578947368423e-05, - "loss": 0.0199, - "step": 71 - }, - { - "epoch": 3.227272727272727, - "eval_loss": 0.012577124871313572, - "eval_runtime": 0.2145, - "eval_samples_per_second": 410.253, - "eval_steps_per_second": 51.282, - "step": 71 - }, - { - "epoch": 3.2727272727272725, - "grad_norm": 0.16536517441272736, - "learning_rate": 2.7592105263157897e-05, - "loss": 0.017, - "step": 72 - }, - { - "epoch": 3.2727272727272725, - "eval_loss": 0.012171071022748947, - "eval_runtime": 0.225, - "eval_samples_per_second": 391.108, - "eval_steps_per_second": 48.888, - "step": 72 - }, - { - "epoch": 3.3181818181818183, - "grad_norm": 0.14233346283435822, - "learning_rate": 2.7552631578947368e-05, - "loss": 0.0173, - "step": 73 - }, - { - "epoch": 3.3181818181818183, - "eval_loss": 0.011801562272012234, - "eval_runtime": 0.2206, - "eval_samples_per_second": 398.895, - "eval_steps_per_second": 49.862, - "step": 73 - }, - { - "epoch": 3.3636363636363638, - "grad_norm": 0.18418766558170319, - "learning_rate": 2.7513157894736842e-05, - "loss": 0.0193, - "step": 74 - }, - { - "epoch": 3.3636363636363638, - "eval_loss": 0.01138223335146904, - "eval_runtime": 0.2269, - "eval_samples_per_second": 387.873, - "eval_steps_per_second": 48.484, - "step": 74 - }, - { - "epoch": 3.409090909090909, - "grad_norm": 0.1584126502275467, - "learning_rate": 2.7473684210526316e-05, - "loss": 0.0174, - "step": 75 - }, - { - "epoch": 3.409090909090909, - "eval_loss": 0.010961382649838924, - "eval_runtime": 0.2368, - "eval_samples_per_second": 371.65, - "eval_steps_per_second": 46.456, - "step": 75 - }, - { - "epoch": 3.4545454545454546, - "grad_norm": 0.15311338007450104, - "learning_rate": 2.743421052631579e-05, - "loss": 0.0152, - "step": 76 - }, - { - "epoch": 3.4545454545454546, - "eval_loss": 0.01055182795971632, - "eval_runtime": 0.2222, - "eval_samples_per_second": 396.112, - "eval_steps_per_second": 49.514, - "step": 76 - }, - { - "epoch": 3.5, - "grad_norm": 0.1895849108695984, - "learning_rate": 2.739473684210526e-05, - "loss": 0.0185, - "step": 77 - }, - { - "epoch": 3.5, - "eval_loss": 0.01013518963009119, - "eval_runtime": 0.2228, - "eval_samples_per_second": 394.993, - "eval_steps_per_second": 49.374, - "step": 77 - }, - { - "epoch": 3.5454545454545454, - "grad_norm": 0.1422702521085739, - "learning_rate": 2.735526315789474e-05, - "loss": 0.0163, - "step": 78 - }, - { - "epoch": 3.5454545454545454, - "eval_loss": 0.009774941019713879, - "eval_runtime": 0.2328, - "eval_samples_per_second": 378.047, - "eval_steps_per_second": 47.256, - "step": 78 - }, - { - "epoch": 3.590909090909091, - "grad_norm": 0.15089201927185059, - "learning_rate": 2.7315789473684213e-05, - "loss": 0.0162, - "step": 79 - }, - { - "epoch": 3.590909090909091, - "eval_loss": 0.009458563290536404, - "eval_runtime": 0.2335, - "eval_samples_per_second": 376.887, - "eval_steps_per_second": 47.111, - "step": 79 - }, - { - "epoch": 3.6363636363636362, - "grad_norm": 0.16338452696800232, - "learning_rate": 2.7276315789473683e-05, - "loss": 0.015, - "step": 80 - }, - { - "epoch": 3.6363636363636362, - "eval_loss": 0.00917022954672575, - "eval_runtime": 0.2355, - "eval_samples_per_second": 373.621, - "eval_steps_per_second": 46.703, - "step": 80 - }, - { - "epoch": 3.6818181818181817, - "grad_norm": 0.14390893280506134, - "learning_rate": 2.723684210526316e-05, - "loss": 0.0148, - "step": 81 - }, - { - "epoch": 3.6818181818181817, - "eval_loss": 0.00891400221735239, - "eval_runtime": 0.2182, - "eval_samples_per_second": 403.39, - "eval_steps_per_second": 50.424, - "step": 81 - }, - { - "epoch": 3.7272727272727275, - "grad_norm": 0.23557034134864807, - "learning_rate": 2.719736842105263e-05, - "loss": 0.0173, - "step": 82 - }, - { - "epoch": 3.7272727272727275, - "eval_loss": 0.008688293397426605, - "eval_runtime": 0.2236, - "eval_samples_per_second": 393.639, - "eval_steps_per_second": 49.205, - "step": 82 - }, - { - "epoch": 3.7727272727272725, - "grad_norm": 0.12254065275192261, - "learning_rate": 2.7157894736842106e-05, - "loss": 0.0133, - "step": 83 - }, - { - "epoch": 3.7727272727272725, - "eval_loss": 0.008477870374917984, - "eval_runtime": 0.2215, - "eval_samples_per_second": 397.361, - "eval_steps_per_second": 49.67, - "step": 83 - }, - { - "epoch": 3.8181818181818183, - "grad_norm": 0.10980476438999176, - "learning_rate": 2.711842105263158e-05, - "loss": 0.0128, - "step": 84 - }, - { - "epoch": 3.8181818181818183, - "eval_loss": 0.00827844813466072, - "eval_runtime": 0.2234, - "eval_samples_per_second": 393.942, - "eval_steps_per_second": 49.243, - "step": 84 - }, - { - "epoch": 3.8636363636363638, - "grad_norm": 0.13196319341659546, - "learning_rate": 2.7078947368421054e-05, - "loss": 0.013, - "step": 85 - }, - { - "epoch": 3.8636363636363638, - "eval_loss": 0.008079243823885918, - "eval_runtime": 0.2221, - "eval_samples_per_second": 396.214, - "eval_steps_per_second": 49.527, - "step": 85 - }, - { - "epoch": 3.909090909090909, - "grad_norm": 0.10154274851083755, - "learning_rate": 2.7039473684210525e-05, - "loss": 0.0122, - "step": 86 - }, - { - "epoch": 3.909090909090909, - "eval_loss": 0.007896007038652897, - "eval_runtime": 0.224, - "eval_samples_per_second": 392.924, - "eval_steps_per_second": 49.115, - "step": 86 - }, - { - "epoch": 3.9545454545454546, - "grad_norm": 0.1324293613433838, - "learning_rate": 2.7000000000000002e-05, - "loss": 0.0126, - "step": 87 - }, - { - "epoch": 3.9545454545454546, - "eval_loss": 0.007718592882156372, - "eval_runtime": 0.2196, - "eval_samples_per_second": 400.741, - "eval_steps_per_second": 50.093, - "step": 87 - }, - { - "epoch": 4.0, - "grad_norm": 0.10327129811048508, - "learning_rate": 2.6960526315789473e-05, - "loss": 0.012, - "step": 88 - }, - { - "epoch": 4.0, - "eval_loss": 0.007555495481938124, - "eval_runtime": 0.2221, - "eval_samples_per_second": 396.243, - "eval_steps_per_second": 49.53, - "step": 88 - }, - { - "epoch": 4.045454545454546, - "grad_norm": 0.09408023953437805, - "learning_rate": 2.6921052631578947e-05, - "loss": 0.0115, - "step": 89 - }, - { - "epoch": 4.045454545454546, - "eval_loss": 0.0074074105359613895, - "eval_runtime": 0.2205, - "eval_samples_per_second": 399.137, - "eval_steps_per_second": 49.892, - "step": 89 - }, - { - "epoch": 4.090909090909091, - "grad_norm": 0.09438669681549072, - "learning_rate": 2.688157894736842e-05, - "loss": 0.0117, - "step": 90 - }, - { - "epoch": 4.090909090909091, - "eval_loss": 0.007270295638591051, - "eval_runtime": 0.2207, - "eval_samples_per_second": 398.716, - "eval_steps_per_second": 49.839, - "step": 90 - }, - { - "epoch": 4.136363636363637, - "grad_norm": 0.10392805188894272, - "learning_rate": 2.6842105263157896e-05, - "loss": 0.0121, - "step": 91 - }, - { - "epoch": 4.136363636363637, - "eval_loss": 0.007134940009564161, - "eval_runtime": 0.2226, - "eval_samples_per_second": 395.399, - "eval_steps_per_second": 49.425, - "step": 91 - }, - { - "epoch": 4.181818181818182, - "grad_norm": 0.09916353225708008, - "learning_rate": 2.6802631578947366e-05, - "loss": 0.0111, - "step": 92 - }, - { - "epoch": 4.181818181818182, - "eval_loss": 0.007011328358203173, - "eval_runtime": 0.2218, - "eval_samples_per_second": 396.679, - "eval_steps_per_second": 49.585, - "step": 92 - }, - { - "epoch": 4.2272727272727275, - "grad_norm": 0.11726672202348709, - "learning_rate": 2.6763157894736844e-05, - "loss": 0.0128, - "step": 93 - }, - { - "epoch": 4.2272727272727275, - "eval_loss": 0.006890672724694014, - "eval_runtime": 0.2242, - "eval_samples_per_second": 392.462, - "eval_steps_per_second": 49.058, - "step": 93 - }, - { - "epoch": 4.2727272727272725, - "grad_norm": 0.10044334828853607, - "learning_rate": 2.6723684210526318e-05, - "loss": 0.0115, - "step": 94 - }, - { - "epoch": 4.2727272727272725, - "eval_loss": 0.006776686292141676, - "eval_runtime": 0.2201, - "eval_samples_per_second": 399.833, - "eval_steps_per_second": 49.979, - "step": 94 - }, - { - "epoch": 4.318181818181818, - "grad_norm": 0.09276948869228363, - "learning_rate": 2.668421052631579e-05, - "loss": 0.011, - "step": 95 - }, - { - "epoch": 4.318181818181818, - "eval_loss": 0.00667022867128253, - "eval_runtime": 0.2225, - "eval_samples_per_second": 395.502, - "eval_steps_per_second": 49.438, - "step": 95 - }, - { - "epoch": 4.363636363636363, - "grad_norm": 0.09718704223632812, - "learning_rate": 2.6644736842105266e-05, - "loss": 0.0113, - "step": 96 - }, - { - "epoch": 4.363636363636363, - "eval_loss": 0.006560015957802534, - "eval_runtime": 0.2172, - "eval_samples_per_second": 405.161, - "eval_steps_per_second": 50.645, - "step": 96 - }, - { - "epoch": 4.409090909090909, - "grad_norm": 0.11359906196594238, - "learning_rate": 2.6605263157894737e-05, - "loss": 0.0105, - "step": 97 - }, - { - "epoch": 4.409090909090909, - "eval_loss": 0.0064485338516533375, - "eval_runtime": 0.221, - "eval_samples_per_second": 398.174, - "eval_steps_per_second": 49.772, - "step": 97 - }, - { - "epoch": 4.454545454545454, - "grad_norm": 0.0942469909787178, - "learning_rate": 2.656578947368421e-05, - "loss": 0.0104, - "step": 98 - }, - { - "epoch": 4.454545454545454, - "eval_loss": 0.00633326917886734, - "eval_runtime": 0.2241, - "eval_samples_per_second": 392.749, - "eval_steps_per_second": 49.094, - "step": 98 - }, - { - "epoch": 4.5, - "grad_norm": 0.08770338445901871, - "learning_rate": 2.6526315789473685e-05, - "loss": 0.0097, - "step": 99 - }, - { - "epoch": 4.5, - "eval_loss": 0.006226606201380491, - "eval_runtime": 0.221, - "eval_samples_per_second": 398.22, - "eval_steps_per_second": 49.777, - "step": 99 - }, - { - "epoch": 4.545454545454545, - "grad_norm": 0.0902254730463028, - "learning_rate": 2.648684210526316e-05, - "loss": 0.0102, - "step": 100 - }, - { - "epoch": 4.545454545454545, - "eval_loss": 0.0061218636110424995, - "eval_runtime": 0.2218, - "eval_samples_per_second": 396.725, - "eval_steps_per_second": 49.591, - "step": 100 - }, - { - "epoch": 4.590909090909091, - "grad_norm": 0.07302330434322357, - "learning_rate": 2.644736842105263e-05, - "loss": 0.0086, - "step": 101 - }, - { - "epoch": 4.590909090909091, - "eval_loss": 0.006022432819008827, - "eval_runtime": 0.2242, - "eval_samples_per_second": 392.497, - "eval_steps_per_second": 49.062, - "step": 101 - }, - { - "epoch": 4.636363636363637, - "grad_norm": 0.09044598042964935, - "learning_rate": 2.6407894736842108e-05, - "loss": 0.0098, - "step": 102 - }, - { - "epoch": 4.636363636363637, - "eval_loss": 0.005927449557930231, - "eval_runtime": 0.219, - "eval_samples_per_second": 401.867, - "eval_steps_per_second": 50.233, - "step": 102 - }, - { - "epoch": 4.681818181818182, - "grad_norm": 0.07847205549478531, - "learning_rate": 2.636842105263158e-05, - "loss": 0.0093, - "step": 103 - }, - { - "epoch": 4.681818181818182, - "eval_loss": 0.005836833734065294, - "eval_runtime": 0.2477, - "eval_samples_per_second": 355.291, - "eval_steps_per_second": 44.411, - "step": 103 - }, - { - "epoch": 4.7272727272727275, - "grad_norm": 0.09054490178823471, - "learning_rate": 2.6328947368421053e-05, - "loss": 0.0093, - "step": 104 - }, - { - "epoch": 4.7272727272727275, - "eval_loss": 0.005744776222854853, - "eval_runtime": 0.2237, - "eval_samples_per_second": 393.373, - "eval_steps_per_second": 49.172, - "step": 104 - }, - { - "epoch": 4.7727272727272725, - "grad_norm": 0.08056215196847916, - "learning_rate": 2.6289473684210527e-05, - "loss": 0.0095, - "step": 105 - }, - { - "epoch": 4.7727272727272725, - "eval_loss": 0.005655229557305574, - "eval_runtime": 0.2221, - "eval_samples_per_second": 396.284, - "eval_steps_per_second": 49.535, - "step": 105 - }, - { - "epoch": 4.818181818181818, - "grad_norm": 0.07413677871227264, - "learning_rate": 2.625e-05, - "loss": 0.0095, - "step": 106 - }, - { - "epoch": 4.818181818181818, - "eval_loss": 0.005573854316025972, - "eval_runtime": 0.2219, - "eval_samples_per_second": 396.499, - "eval_steps_per_second": 49.562, - "step": 106 - }, - { - "epoch": 4.863636363636363, - "grad_norm": 0.09156908839941025, - "learning_rate": 2.6210526315789475e-05, - "loss": 0.0094, - "step": 107 - }, - { - "epoch": 4.863636363636363, - "eval_loss": 0.005500171799212694, - "eval_runtime": 0.218, - "eval_samples_per_second": 403.697, - "eval_steps_per_second": 50.462, - "step": 107 - }, - { - "epoch": 4.909090909090909, - "grad_norm": 0.07806240767240524, - "learning_rate": 2.617105263157895e-05, - "loss": 0.009, - "step": 108 - }, - { - "epoch": 4.909090909090909, - "eval_loss": 0.005432323087006807, - "eval_runtime": 0.2208, - "eval_samples_per_second": 398.497, - "eval_steps_per_second": 49.812, - "step": 108 - }, - { - "epoch": 4.954545454545455, - "grad_norm": 0.07705673575401306, - "learning_rate": 2.6131578947368424e-05, - "loss": 0.0091, - "step": 109 - }, - { - "epoch": 4.954545454545455, - "eval_loss": 0.005366500001400709, - "eval_runtime": 0.2187, - "eval_samples_per_second": 402.388, - "eval_steps_per_second": 50.299, - "step": 109 - }, - { - "epoch": 5.0, - "grad_norm": 0.0743311420083046, - "learning_rate": 2.6092105263157894e-05, - "loss": 0.0087, - "step": 110 - }, - { - "epoch": 5.0, - "eval_loss": 0.005299717653542757, - "eval_runtime": 0.215, - "eval_samples_per_second": 409.298, - "eval_steps_per_second": 51.162, - "step": 110 - }, - { - "epoch": 5.045454545454546, - "grad_norm": 0.0689927488565445, - "learning_rate": 2.605263157894737e-05, - "loss": 0.0081, - "step": 111 - }, - { - "epoch": 5.045454545454546, - "eval_loss": 0.005235993303358555, - "eval_runtime": 0.2192, - "eval_samples_per_second": 401.457, - "eval_steps_per_second": 50.182, - "step": 111 - }, - { - "epoch": 5.090909090909091, - "grad_norm": 0.06892900913953781, - "learning_rate": 2.6013157894736843e-05, - "loss": 0.0082, - "step": 112 - }, - { - "epoch": 5.090909090909091, - "eval_loss": 0.005173509940505028, - "eval_runtime": 0.219, - "eval_samples_per_second": 401.777, - "eval_steps_per_second": 50.222, - "step": 112 - }, - { - "epoch": 5.136363636363637, - "grad_norm": 0.06960764527320862, - "learning_rate": 2.5973684210526317e-05, - "loss": 0.0081, - "step": 113 - }, - { - "epoch": 5.136363636363637, - "eval_loss": 0.005112760700285435, - "eval_runtime": 0.2203, - "eval_samples_per_second": 399.491, - "eval_steps_per_second": 49.936, - "step": 113 - }, - { - "epoch": 5.181818181818182, - "grad_norm": 0.07173731923103333, - "learning_rate": 2.5934210526315788e-05, - "loss": 0.008, - "step": 114 - }, - { - "epoch": 5.181818181818182, - "eval_loss": 0.00505533954128623, - "eval_runtime": 0.2227, - "eval_samples_per_second": 395.105, - "eval_steps_per_second": 49.388, - "step": 114 - }, - { - "epoch": 5.2272727272727275, - "grad_norm": 0.06811046600341797, - "learning_rate": 2.5894736842105265e-05, - "loss": 0.0074, - "step": 115 - }, - { - "epoch": 5.2272727272727275, - "eval_loss": 0.0049970815889537334, - "eval_runtime": 0.2171, - "eval_samples_per_second": 405.344, - "eval_steps_per_second": 50.668, - "step": 115 - }, - { - "epoch": 5.2727272727272725, - "grad_norm": 0.0676768496632576, - "learning_rate": 2.5855263157894736e-05, - "loss": 0.0076, - "step": 116 - }, - { - "epoch": 5.2727272727272725, - "eval_loss": 0.004939272068440914, - "eval_runtime": 0.2319, - "eval_samples_per_second": 379.44, - "eval_steps_per_second": 47.43, - "step": 116 - }, - { - "epoch": 5.318181818181818, - "grad_norm": 0.06927932053804398, - "learning_rate": 2.581578947368421e-05, - "loss": 0.0078, - "step": 117 - }, - { - "epoch": 5.318181818181818, - "eval_loss": 0.004879767540842295, - "eval_runtime": 0.2354, - "eval_samples_per_second": 373.859, - "eval_steps_per_second": 46.732, - "step": 117 - }, - { - "epoch": 5.363636363636363, - "grad_norm": 0.0733099952340126, - "learning_rate": 2.5776315789473684e-05, - "loss": 0.009, - "step": 118 - }, - { - "epoch": 5.363636363636363, - "eval_loss": 0.00482180854305625, - "eval_runtime": 0.22, - "eval_samples_per_second": 399.97, - "eval_steps_per_second": 49.996, - "step": 118 - }, - { - "epoch": 5.409090909090909, - "grad_norm": 0.07873851805925369, - "learning_rate": 2.5736842105263158e-05, - "loss": 0.0085, - "step": 119 - }, - { - "epoch": 5.409090909090909, - "eval_loss": 0.004767131991684437, - "eval_runtime": 0.2272, - "eval_samples_per_second": 387.355, - "eval_steps_per_second": 48.419, - "step": 119 - }, - { - "epoch": 5.454545454545454, - "grad_norm": 0.06912100315093994, - "learning_rate": 2.5697368421052632e-05, - "loss": 0.0075, - "step": 120 - }, - { - "epoch": 5.454545454545454, - "eval_loss": 0.004715087823569775, - "eval_runtime": 0.2216, - "eval_samples_per_second": 397.159, - "eval_steps_per_second": 49.645, - "step": 120 - }, - { - "epoch": 5.5, - "grad_norm": 0.059973061084747314, - "learning_rate": 2.5657894736842107e-05, - "loss": 0.0078, - "step": 121 - }, - { - "epoch": 5.5, - "eval_loss": 0.004667165223509073, - "eval_runtime": 0.2226, - "eval_samples_per_second": 395.251, - "eval_steps_per_second": 49.406, - "step": 121 - }, - { - "epoch": 5.545454545454545, - "grad_norm": 0.06346078962087631, - "learning_rate": 2.561842105263158e-05, - "loss": 0.0073, - "step": 122 - }, - { - "epoch": 5.545454545454545, - "eval_loss": 0.004621806554496288, - "eval_runtime": 0.2263, - "eval_samples_per_second": 388.791, - "eval_steps_per_second": 48.599, - "step": 122 - }, - { - "epoch": 5.590909090909091, - "grad_norm": 0.07588130235671997, - "learning_rate": 2.557894736842105e-05, - "loss": 0.0079, - "step": 123 - }, - { - "epoch": 5.590909090909091, - "eval_loss": 0.004576975479722023, - "eval_runtime": 0.2216, - "eval_samples_per_second": 397.081, - "eval_steps_per_second": 49.635, - "step": 123 - }, - { - "epoch": 5.636363636363637, - "grad_norm": 0.0569930225610733, - "learning_rate": 2.553947368421053e-05, - "loss": 0.0068, - "step": 124 - }, - { - "epoch": 5.636363636363637, - "eval_loss": 0.004534974228590727, - "eval_runtime": 0.2207, - "eval_samples_per_second": 398.807, - "eval_steps_per_second": 49.851, - "step": 124 - }, - { - "epoch": 5.681818181818182, - "grad_norm": 0.07023297250270844, - "learning_rate": 2.55e-05, - "loss": 0.0078, - "step": 125 - }, - { - "epoch": 5.681818181818182, - "eval_loss": 0.004494456574320793, - "eval_runtime": 0.2276, - "eval_samples_per_second": 386.655, - "eval_steps_per_second": 48.332, - "step": 125 - }, - { - "epoch": 5.7272727272727275, - "grad_norm": 0.0586245059967041, - "learning_rate": 2.5460526315789474e-05, - "loss": 0.0072, - "step": 126 - }, - { - "epoch": 5.7272727272727275, - "eval_loss": 0.0044531743042171, - "eval_runtime": 0.2354, - "eval_samples_per_second": 373.803, - "eval_steps_per_second": 46.725, - "step": 126 - }, - { - "epoch": 5.7727272727272725, - "grad_norm": 0.0652911588549614, - "learning_rate": 2.5421052631578948e-05, - "loss": 0.0073, - "step": 127 - }, - { - "epoch": 5.7727272727272725, - "eval_loss": 0.004411415662616491, - "eval_runtime": 0.236, - "eval_samples_per_second": 372.941, - "eval_steps_per_second": 46.618, - "step": 127 - }, - { - "epoch": 5.818181818181818, - "grad_norm": 0.05701863393187523, - "learning_rate": 2.5381578947368422e-05, - "loss": 0.0067, - "step": 128 - }, - { - "epoch": 5.818181818181818, - "eval_loss": 0.004371690563857555, - "eval_runtime": 0.2358, - "eval_samples_per_second": 373.191, - "eval_steps_per_second": 46.649, - "step": 128 - }, - { - "epoch": 5.863636363636363, - "grad_norm": 0.05990603566169739, - "learning_rate": 2.5342105263157893e-05, - "loss": 0.0071, - "step": 129 - }, - { - "epoch": 5.863636363636363, - "eval_loss": 0.004331877455115318, - "eval_runtime": 0.2301, - "eval_samples_per_second": 382.487, - "eval_steps_per_second": 47.811, - "step": 129 - }, - { - "epoch": 5.909090909090909, - "grad_norm": 0.06283283233642578, - "learning_rate": 2.530263157894737e-05, - "loss": 0.0071, - "step": 130 - }, - { - "epoch": 5.909090909090909, - "eval_loss": 0.0042935688979923725, - "eval_runtime": 0.2387, - "eval_samples_per_second": 368.63, - "eval_steps_per_second": 46.079, - "step": 130 - }, - { - "epoch": 5.954545454545455, - "grad_norm": 0.060048509389162064, - "learning_rate": 2.526315789473684e-05, - "loss": 0.0067, - "step": 131 - }, - { - "epoch": 5.954545454545455, - "eval_loss": 0.0042540752328932285, - "eval_runtime": 0.2471, - "eval_samples_per_second": 356.096, - "eval_steps_per_second": 44.512, - "step": 131 - }, - { - "epoch": 6.0, - "grad_norm": 0.060563940554857254, - "learning_rate": 2.5223684210526315e-05, - "loss": 0.0064, - "step": 132 - }, - { - "epoch": 6.0, - "eval_loss": 0.004213025793433189, - "eval_runtime": 0.2399, - "eval_samples_per_second": 366.883, - "eval_steps_per_second": 45.86, - "step": 132 - }, - { - "epoch": 6.045454545454546, - "grad_norm": 0.060382332652807236, - "learning_rate": 2.518421052631579e-05, - "loss": 0.0071, - "step": 133 - }, - { - "epoch": 6.045454545454546, - "eval_loss": 0.004174065310508013, - "eval_runtime": 0.2268, - "eval_samples_per_second": 388.075, - "eval_steps_per_second": 48.509, - "step": 133 - }, - { - "epoch": 6.090909090909091, - "grad_norm": 0.06080484017729759, - "learning_rate": 2.5144736842105264e-05, - "loss": 0.0073, - "step": 134 - }, - { - "epoch": 6.090909090909091, - "eval_loss": 0.0041358619928359985, - "eval_runtime": 0.2229, - "eval_samples_per_second": 394.875, - "eval_steps_per_second": 49.359, - "step": 134 - }, - { - "epoch": 6.136363636363637, - "grad_norm": 0.057626206427812576, - "learning_rate": 2.5105263157894738e-05, - "loss": 0.0066, - "step": 135 - }, - { - "epoch": 6.136363636363637, - "eval_loss": 0.004101672675460577, - "eval_runtime": 0.2283, - "eval_samples_per_second": 385.395, - "eval_steps_per_second": 48.174, - "step": 135 - }, - { - "epoch": 6.181818181818182, - "grad_norm": 0.06599877029657364, - "learning_rate": 2.5065789473684212e-05, - "loss": 0.0075, - "step": 136 - }, - { - "epoch": 6.181818181818182, - "eval_loss": 0.004067064728587866, - "eval_runtime": 0.221, - "eval_samples_per_second": 398.26, - "eval_steps_per_second": 49.783, - "step": 136 - }, - { - "epoch": 6.2272727272727275, - "grad_norm": 0.05654873698949814, - "learning_rate": 2.5026315789473686e-05, - "loss": 0.0066, - "step": 137 - }, - { - "epoch": 6.2272727272727275, - "eval_loss": 0.0040321690030395985, - "eval_runtime": 0.2329, - "eval_samples_per_second": 377.882, - "eval_steps_per_second": 47.235, - "step": 137 - }, - { - "epoch": 6.2727272727272725, - "grad_norm": 0.05717283487319946, - "learning_rate": 2.4986842105263157e-05, - "loss": 0.0067, - "step": 138 - }, - { - "epoch": 6.2727272727272725, - "eval_loss": 0.003995668143033981, - "eval_runtime": 0.2203, - "eval_samples_per_second": 399.464, - "eval_steps_per_second": 49.933, - "step": 138 - }, - { - "epoch": 6.318181818181818, - "grad_norm": 0.06036869063973427, - "learning_rate": 2.4947368421052635e-05, - "loss": 0.0064, - "step": 139 - }, - { - "epoch": 6.318181818181818, - "eval_loss": 0.003956829197704792, - "eval_runtime": 0.2294, - "eval_samples_per_second": 383.681, - "eval_steps_per_second": 47.96, - "step": 139 - }, - { - "epoch": 6.363636363636363, - "grad_norm": 0.05111813545227051, - "learning_rate": 2.4907894736842105e-05, - "loss": 0.0063, - "step": 140 - }, - { - "epoch": 6.363636363636363, - "eval_loss": 0.003918844275176525, - "eval_runtime": 0.2186, - "eval_samples_per_second": 402.513, - "eval_steps_per_second": 50.314, - "step": 140 - }, - { - "epoch": 6.409090909090909, - "grad_norm": 0.0621768981218338, - "learning_rate": 2.486842105263158e-05, - "loss": 0.0064, - "step": 141 - }, - { - "epoch": 6.409090909090909, - "eval_loss": 0.00388046121224761, - "eval_runtime": 0.2309, - "eval_samples_per_second": 381.086, - "eval_steps_per_second": 47.636, - "step": 141 - }, - { - "epoch": 6.454545454545454, - "grad_norm": 0.06089349836111069, - "learning_rate": 2.4828947368421054e-05, - "loss": 0.0066, - "step": 142 - }, - { - "epoch": 6.454545454545454, - "eval_loss": 0.003839746816083789, - "eval_runtime": 0.2206, - "eval_samples_per_second": 398.928, - "eval_steps_per_second": 49.866, - "step": 142 - }, - { - "epoch": 6.5, - "grad_norm": 0.05007468909025192, - "learning_rate": 2.4789473684210528e-05, - "loss": 0.0061, - "step": 143 - }, - { - "epoch": 6.5, - "eval_loss": 0.003801233833655715, - "eval_runtime": 0.2295, - "eval_samples_per_second": 383.433, - "eval_steps_per_second": 47.929, - "step": 143 - }, - { - "epoch": 6.545454545454545, - "grad_norm": 0.053182121366262436, - "learning_rate": 2.475e-05, - "loss": 0.0059, - "step": 144 - }, - { - "epoch": 6.545454545454545, - "eval_loss": 0.003766607493162155, - "eval_runtime": 0.2261, - "eval_samples_per_second": 389.124, - "eval_steps_per_second": 48.64, - "step": 144 - }, - { - "epoch": 6.590909090909091, - "grad_norm": 0.051414087414741516, - "learning_rate": 2.4710526315789476e-05, - "loss": 0.0061, - "step": 145 - }, - { - "epoch": 6.590909090909091, - "eval_loss": 0.0037348391488194466, - "eval_runtime": 0.2309, - "eval_samples_per_second": 381.083, - "eval_steps_per_second": 47.635, - "step": 145 - }, - { - "epoch": 6.636363636363637, - "grad_norm": 0.051980625838041306, - "learning_rate": 2.4671052631578947e-05, - "loss": 0.0061, - "step": 146 - }, - { - "epoch": 6.636363636363637, - "eval_loss": 0.0037048642989248037, - "eval_runtime": 0.2327, - "eval_samples_per_second": 378.163, - "eval_steps_per_second": 47.27, - "step": 146 - }, - { - "epoch": 6.681818181818182, - "grad_norm": 0.054644446820020676, - "learning_rate": 2.463157894736842e-05, - "loss": 0.006, - "step": 147 - }, - { - "epoch": 6.681818181818182, - "eval_loss": 0.003674545791000128, - "eval_runtime": 0.2332, - "eval_samples_per_second": 377.322, - "eval_steps_per_second": 47.165, - "step": 147 - }, - { - "epoch": 6.7272727272727275, - "grad_norm": 0.04687352105975151, - "learning_rate": 2.45921052631579e-05, - "loss": 0.0057, - "step": 148 - }, - { - "epoch": 6.7272727272727275, - "eval_loss": 0.0036456272937357426, - "eval_runtime": 0.2302, - "eval_samples_per_second": 382.325, - "eval_steps_per_second": 47.791, - "step": 148 - }, - { - "epoch": 6.7727272727272725, - "grad_norm": 0.0500478520989418, - "learning_rate": 2.455263157894737e-05, - "loss": 0.0054, - "step": 149 - }, - { - "epoch": 6.7727272727272725, - "eval_loss": 0.003618737915530801, - "eval_runtime": 0.2281, - "eval_samples_per_second": 385.776, - "eval_steps_per_second": 48.222, - "step": 149 - }, - { - "epoch": 6.818181818181818, - "grad_norm": 0.05092916265130043, - "learning_rate": 2.4513157894736843e-05, - "loss": 0.0054, - "step": 150 - }, - { - "epoch": 6.818181818181818, - "eval_loss": 0.0035921267699450254, - "eval_runtime": 0.2298, - "eval_samples_per_second": 382.977, - "eval_steps_per_second": 47.872, - "step": 150 - }, - { - "epoch": 6.863636363636363, - "grad_norm": 0.05389472842216492, - "learning_rate": 2.4473684210526318e-05, - "loss": 0.0057, - "step": 151 - }, - { - "epoch": 6.863636363636363, - "eval_loss": 0.003567308420315385, - "eval_runtime": 0.2896, - "eval_samples_per_second": 303.912, - "eval_steps_per_second": 37.989, - "step": 151 - }, - { - "epoch": 6.909090909090909, - "grad_norm": 0.051427211612463, - "learning_rate": 2.4434210526315792e-05, - "loss": 0.0058, - "step": 152 - }, - { - "epoch": 6.909090909090909, - "eval_loss": 0.003539604600518942, - "eval_runtime": 0.2314, - "eval_samples_per_second": 380.243, - "eval_steps_per_second": 47.53, - "step": 152 - }, - { - "epoch": 6.954545454545455, - "grad_norm": 0.05391733720898628, - "learning_rate": 2.4394736842105262e-05, - "loss": 0.0058, - "step": 153 - }, - { - "epoch": 6.954545454545455, - "eval_loss": 0.0035100304521620274, - "eval_runtime": 0.2452, - "eval_samples_per_second": 358.914, - "eval_steps_per_second": 44.864, - "step": 153 - }, - { - "epoch": 7.0, - "grad_norm": 0.05612335354089737, - "learning_rate": 2.4355263157894737e-05, - "loss": 0.0056, - "step": 154 - }, - { - "epoch": 7.0, - "eval_loss": 0.0034810365177690983, - "eval_runtime": 0.2328, - "eval_samples_per_second": 378.038, - "eval_steps_per_second": 47.255, - "step": 154 - }, - { - "epoch": 7.045454545454546, - "grad_norm": 0.05799683555960655, - "learning_rate": 2.431578947368421e-05, - "loss": 0.0062, - "step": 155 - }, - { - "epoch": 7.045454545454546, - "eval_loss": 0.003452845150604844, - "eval_runtime": 0.2326, - "eval_samples_per_second": 378.28, - "eval_steps_per_second": 47.285, - "step": 155 - }, - { - "epoch": 7.090909090909091, - "grad_norm": 0.05095871537923813, - "learning_rate": 2.4276315789473685e-05, - "loss": 0.0051, - "step": 156 - }, - { - "epoch": 7.090909090909091, - "eval_loss": 0.003425983479246497, - "eval_runtime": 0.2387, - "eval_samples_per_second": 368.611, - "eval_steps_per_second": 46.076, - "step": 156 - }, - { - "epoch": 7.136363636363637, - "grad_norm": 0.05834353715181351, - "learning_rate": 2.4236842105263156e-05, - "loss": 0.0061, - "step": 157 - }, - { - "epoch": 7.136363636363637, - "eval_loss": 0.003400736255571246, - "eval_runtime": 0.233, - "eval_samples_per_second": 377.737, - "eval_steps_per_second": 47.217, - "step": 157 - }, - { - "epoch": 7.181818181818182, - "grad_norm": 0.05226532742381096, - "learning_rate": 2.4197368421052633e-05, - "loss": 0.006, - "step": 158 - }, - { - "epoch": 7.181818181818182, - "eval_loss": 0.003375994274392724, - "eval_runtime": 0.2371, - "eval_samples_per_second": 371.159, - "eval_steps_per_second": 46.395, - "step": 158 - }, - { - "epoch": 7.2272727272727275, - "grad_norm": 0.044102054089307785, - "learning_rate": 2.4157894736842104e-05, - "loss": 0.0051, - "step": 159 - }, - { - "epoch": 7.2272727272727275, - "eval_loss": 0.003351524705067277, - "eval_runtime": 0.2391, - "eval_samples_per_second": 368.077, - "eval_steps_per_second": 46.01, - "step": 159 - }, - { - "epoch": 7.2727272727272725, - "grad_norm": 0.050387196242809296, - "learning_rate": 2.4118421052631578e-05, - "loss": 0.0055, - "step": 160 - }, - { - "epoch": 7.2727272727272725, - "eval_loss": 0.003328080987557769, - "eval_runtime": 0.2367, - "eval_samples_per_second": 371.775, - "eval_steps_per_second": 46.472, - "step": 160 - }, - { - "epoch": 7.318181818181818, - "grad_norm": 0.05944162234663963, - "learning_rate": 2.4078947368421056e-05, - "loss": 0.0062, - "step": 161 - }, - { - "epoch": 7.318181818181818, - "eval_loss": 0.003306704806163907, - "eval_runtime": 0.2302, - "eval_samples_per_second": 382.244, - "eval_steps_per_second": 47.781, - "step": 161 - }, - { - "epoch": 7.363636363636363, - "grad_norm": 0.058280494064092636, - "learning_rate": 2.4039473684210526e-05, - "loss": 0.0055, - "step": 162 - }, - { - "epoch": 7.363636363636363, - "eval_loss": 0.0032875537872314453, - "eval_runtime": 0.2313, - "eval_samples_per_second": 380.46, - "eval_steps_per_second": 47.557, - "step": 162 - }, - { - "epoch": 7.409090909090909, - "grad_norm": 0.04580385982990265, - "learning_rate": 2.4e-05, - "loss": 0.0051, - "step": 163 - }, - { - "epoch": 7.409090909090909, - "eval_loss": 0.003268357366323471, - "eval_runtime": 0.2329, - "eval_samples_per_second": 377.841, - "eval_steps_per_second": 47.23, - "step": 163 - }, - { - "epoch": 7.454545454545454, - "grad_norm": 0.047211576253175735, - "learning_rate": 2.3960526315789475e-05, - "loss": 0.0049, - "step": 164 - }, - { - "epoch": 7.454545454545454, - "eval_loss": 0.003249667352065444, - "eval_runtime": 0.2288, - "eval_samples_per_second": 384.62, - "eval_steps_per_second": 48.077, - "step": 164 - }, - { - "epoch": 7.5, - "grad_norm": 0.04698212072253227, - "learning_rate": 2.392105263157895e-05, - "loss": 0.0051, - "step": 165 - }, - { - "epoch": 7.5, - "eval_loss": 0.003230377798900008, - "eval_runtime": 0.2336, - "eval_samples_per_second": 376.761, - "eval_steps_per_second": 47.095, - "step": 165 - }, - { - "epoch": 7.545454545454545, - "grad_norm": 0.049539972096681595, - "learning_rate": 2.388157894736842e-05, - "loss": 0.0053, - "step": 166 - }, - { - "epoch": 7.545454545454545, - "eval_loss": 0.003210590686649084, - "eval_runtime": 0.2308, - "eval_samples_per_second": 381.225, - "eval_steps_per_second": 47.653, - "step": 166 - }, - { - "epoch": 7.590909090909091, - "grad_norm": 0.06876406818628311, - "learning_rate": 2.3842105263157897e-05, - "loss": 0.0054, - "step": 167 - }, - { - "epoch": 7.590909090909091, - "eval_loss": 0.0031811357475817204, - "eval_runtime": 0.2314, - "eval_samples_per_second": 380.236, - "eval_steps_per_second": 47.53, - "step": 167 - }, - { - "epoch": 7.636363636363637, - "grad_norm": 0.03961968049407005, - "learning_rate": 2.3802631578947368e-05, - "loss": 0.0048, - "step": 168 - }, - { - "epoch": 7.636363636363637, - "eval_loss": 0.003153204219415784, - "eval_runtime": 0.2327, - "eval_samples_per_second": 378.242, - "eval_steps_per_second": 47.28, - "step": 168 - }, - { - "epoch": 7.681818181818182, - "grad_norm": 0.046262938529253006, - "learning_rate": 2.3763157894736842e-05, - "loss": 0.0054, - "step": 169 - }, - { - "epoch": 7.681818181818182, - "eval_loss": 0.0031256629154086113, - "eval_runtime": 0.2285, - "eval_samples_per_second": 385.163, - "eval_steps_per_second": 48.145, - "step": 169 - }, - { - "epoch": 7.7272727272727275, - "grad_norm": 0.04695883020758629, - "learning_rate": 2.3723684210526316e-05, - "loss": 0.0053, - "step": 170 - }, - { - "epoch": 7.7272727272727275, - "eval_loss": 0.00310018053278327, - "eval_runtime": 0.2345, - "eval_samples_per_second": 375.19, - "eval_steps_per_second": 46.899, - "step": 170 - }, - { - "epoch": 7.7727272727272725, - "grad_norm": 0.047219086438417435, - "learning_rate": 2.368421052631579e-05, - "loss": 0.0052, - "step": 171 - }, - { - "epoch": 7.7727272727272725, - "eval_loss": 0.003074278589338064, - "eval_runtime": 0.2331, - "eval_samples_per_second": 377.522, - "eval_steps_per_second": 47.19, - "step": 171 - }, - { - "epoch": 7.818181818181818, - "grad_norm": 0.05439964681863785, - "learning_rate": 2.364473684210526e-05, - "loss": 0.0055, - "step": 172 - }, - { - "epoch": 7.818181818181818, - "eval_loss": 0.003049066523090005, - "eval_runtime": 0.2239, - "eval_samples_per_second": 393.01, - "eval_steps_per_second": 49.126, - "step": 172 - }, - { - "epoch": 7.863636363636363, - "grad_norm": 0.041486483067274094, - "learning_rate": 2.360526315789474e-05, - "loss": 0.0047, - "step": 173 - }, - { - "epoch": 7.863636363636363, - "eval_loss": 0.0030262693762779236, - "eval_runtime": 0.2278, - "eval_samples_per_second": 386.243, - "eval_steps_per_second": 48.28, - "step": 173 - }, - { - "epoch": 7.909090909090909, - "grad_norm": 0.040691111236810684, - "learning_rate": 2.3565789473684213e-05, - "loss": 0.0046, - "step": 174 - }, - { - "epoch": 7.909090909090909, - "eval_loss": 0.0030057693365961313, - "eval_runtime": 0.2276, - "eval_samples_per_second": 386.567, - "eval_steps_per_second": 48.321, - "step": 174 - }, - { - "epoch": 7.954545454545455, - "grad_norm": 0.048391714692115784, - "learning_rate": 2.3526315789473684e-05, - "loss": 0.0055, - "step": 175 - }, - { - "epoch": 7.954545454545455, - "eval_loss": 0.0029874229803681374, - "eval_runtime": 0.2269, - "eval_samples_per_second": 387.906, - "eval_steps_per_second": 48.488, - "step": 175 - }, - { - "epoch": 8.0, - "grad_norm": 0.04458646848797798, - "learning_rate": 2.348684210526316e-05, - "loss": 0.005, - "step": 176 - }, - { - "epoch": 8.0, - "eval_loss": 0.0029713741969317198, - "eval_runtime": 0.2305, - "eval_samples_per_second": 381.854, - "eval_steps_per_second": 47.732, - "step": 176 - }, - { - "epoch": 8.045454545454545, - "grad_norm": 0.044490914791822433, - "learning_rate": 2.3447368421052632e-05, - "loss": 0.005, - "step": 177 - }, - { - "epoch": 8.045454545454545, - "eval_loss": 0.002958006225526333, - "eval_runtime": 0.2331, - "eval_samples_per_second": 377.519, - "eval_steps_per_second": 47.19, - "step": 177 - }, - { - "epoch": 8.090909090909092, - "grad_norm": 0.04664753004908562, - "learning_rate": 2.3407894736842106e-05, - "loss": 0.0053, - "step": 178 - }, - { - "epoch": 8.090909090909092, - "eval_loss": 0.0029434128664433956, - "eval_runtime": 0.2369, - "eval_samples_per_second": 371.478, - "eval_steps_per_second": 46.435, - "step": 178 - }, - { - "epoch": 8.136363636363637, - "grad_norm": 0.05114319175481796, - "learning_rate": 2.336842105263158e-05, - "loss": 0.0052, - "step": 179 - }, - { - "epoch": 8.136363636363637, - "eval_loss": 0.002928072353824973, - "eval_runtime": 0.2273, - "eval_samples_per_second": 387.111, - "eval_steps_per_second": 48.389, - "step": 179 - }, - { - "epoch": 8.181818181818182, - "grad_norm": 0.03715480864048004, - "learning_rate": 2.3328947368421054e-05, - "loss": 0.0044, - "step": 180 - }, - { - "epoch": 8.181818181818182, - "eval_loss": 0.002913246164098382, - "eval_runtime": 0.2291, - "eval_samples_per_second": 384.095, - "eval_steps_per_second": 48.012, - "step": 180 - }, - { - "epoch": 8.227272727272727, - "grad_norm": 0.03329971432685852, - "learning_rate": 2.3289473684210525e-05, - "loss": 0.0043, - "step": 181 - }, - { - "epoch": 8.227272727272727, - "eval_loss": 0.0028981559444218874, - "eval_runtime": 0.2387, - "eval_samples_per_second": 368.641, - "eval_steps_per_second": 46.08, - "step": 181 - }, - { - "epoch": 8.272727272727273, - "grad_norm": 0.036768488585948944, - "learning_rate": 2.3250000000000003e-05, - "loss": 0.0043, - "step": 182 - }, - { - "epoch": 8.272727272727273, - "eval_loss": 0.002883592387661338, - "eval_runtime": 0.2382, - "eval_samples_per_second": 369.423, - "eval_steps_per_second": 46.178, - "step": 182 - }, - { - "epoch": 8.318181818181818, - "grad_norm": 0.03704945370554924, - "learning_rate": 2.3210526315789473e-05, - "loss": 0.0042, - "step": 183 - }, - { - "epoch": 8.318181818181818, - "eval_loss": 0.0028684174176305532, - "eval_runtime": 0.2436, - "eval_samples_per_second": 361.18, - "eval_steps_per_second": 45.148, - "step": 183 - }, - { - "epoch": 8.363636363636363, - "grad_norm": 0.038721974939107895, - "learning_rate": 2.3171052631578948e-05, - "loss": 0.0045, - "step": 184 - }, - { - "epoch": 8.363636363636363, - "eval_loss": 0.002850217279046774, - "eval_runtime": 0.2397, - "eval_samples_per_second": 367.125, - "eval_steps_per_second": 45.891, - "step": 184 - }, - { - "epoch": 8.409090909090908, - "grad_norm": 0.0400218740105629, - "learning_rate": 2.3131578947368422e-05, - "loss": 0.0046, - "step": 185 - }, - { - "epoch": 8.409090909090908, - "eval_loss": 0.0028304813895374537, - "eval_runtime": 0.2401, - "eval_samples_per_second": 366.549, - "eval_steps_per_second": 45.819, - "step": 185 - }, - { - "epoch": 8.454545454545455, - "grad_norm": 0.04041934013366699, - "learning_rate": 2.3092105263157896e-05, - "loss": 0.0047, - "step": 186 - }, - { - "epoch": 8.454545454545455, - "eval_loss": 0.0028114623855799437, - "eval_runtime": 0.2373, - "eval_samples_per_second": 370.773, - "eval_steps_per_second": 46.347, - "step": 186 - }, - { - "epoch": 8.5, - "grad_norm": 0.03471284359693527, - "learning_rate": 2.3052631578947367e-05, - "loss": 0.0042, - "step": 187 - }, - { - "epoch": 8.5, - "eval_loss": 0.002793875988572836, - "eval_runtime": 0.2482, - "eval_samples_per_second": 354.499, - "eval_steps_per_second": 44.312, - "step": 187 - }, - { - "epoch": 8.545454545454545, - "grad_norm": 0.044632624834775925, - "learning_rate": 2.3013157894736844e-05, - "loss": 0.0048, - "step": 188 - }, - { - "epoch": 8.545454545454545, - "eval_loss": 0.0027756269555538893, - "eval_runtime": 0.2261, - "eval_samples_per_second": 389.244, - "eval_steps_per_second": 48.656, - "step": 188 - }, - { - "epoch": 8.590909090909092, - "grad_norm": 0.039824243634939194, - "learning_rate": 2.297368421052632e-05, - "loss": 0.0044, - "step": 189 - }, - { - "epoch": 8.590909090909092, - "eval_loss": 0.00275724777020514, - "eval_runtime": 0.2454, - "eval_samples_per_second": 358.66, - "eval_steps_per_second": 44.832, - "step": 189 - }, - { - "epoch": 8.636363636363637, - "grad_norm": 0.03765185549855232, - "learning_rate": 2.293421052631579e-05, - "loss": 0.0046, - "step": 190 - }, - { - "epoch": 8.636363636363637, - "eval_loss": 0.002737644361332059, - "eval_runtime": 0.2301, - "eval_samples_per_second": 382.383, - "eval_steps_per_second": 47.798, - "step": 190 - }, - { - "epoch": 8.681818181818182, - "grad_norm": 0.04460470378398895, - "learning_rate": 2.2894736842105263e-05, - "loss": 0.0049, - "step": 191 - }, - { - "epoch": 8.681818181818182, - "eval_loss": 0.002716499613597989, - "eval_runtime": 0.2404, - "eval_samples_per_second": 366.123, - "eval_steps_per_second": 45.765, - "step": 191 - }, - { - "epoch": 8.727272727272727, - "grad_norm": 0.04597329720854759, - "learning_rate": 2.2855263157894737e-05, - "loss": 0.0046, - "step": 192 - }, - { - "epoch": 8.727272727272727, - "eval_loss": 0.002695793053135276, - "eval_runtime": 0.2287, - "eval_samples_per_second": 384.748, - "eval_steps_per_second": 48.093, - "step": 192 - }, - { - "epoch": 8.772727272727273, - "grad_norm": 0.04175286740064621, - "learning_rate": 2.281578947368421e-05, - "loss": 0.0048, - "step": 193 - }, - { - "epoch": 8.772727272727273, - "eval_loss": 0.0026768320240080357, - "eval_runtime": 0.2297, - "eval_samples_per_second": 383.191, - "eval_steps_per_second": 47.899, - "step": 193 - }, - { - "epoch": 8.818181818181818, - "grad_norm": 0.03605563938617706, - "learning_rate": 2.2776315789473682e-05, - "loss": 0.0042, - "step": 194 - }, - { - "epoch": 8.818181818181818, - "eval_loss": 0.0026587164029479027, - "eval_runtime": 0.2319, - "eval_samples_per_second": 379.432, - "eval_steps_per_second": 47.429, - "step": 194 - }, - { - "epoch": 8.863636363636363, - "grad_norm": 0.03600858151912689, - "learning_rate": 2.273684210526316e-05, - "loss": 0.004, - "step": 195 - }, - { - "epoch": 8.863636363636363, - "eval_loss": 0.0026421842630952597, - "eval_runtime": 0.2375, - "eval_samples_per_second": 370.592, - "eval_steps_per_second": 46.324, - "step": 195 - }, - { - "epoch": 8.909090909090908, - "grad_norm": 0.04040640592575073, - "learning_rate": 2.269736842105263e-05, - "loss": 0.0046, - "step": 196 - }, - { - "epoch": 8.909090909090908, - "eval_loss": 0.002626256085932255, - "eval_runtime": 0.5446, - "eval_samples_per_second": 161.597, - "eval_steps_per_second": 20.2, - "step": 196 - }, - { - "epoch": 8.954545454545455, - "grad_norm": 0.04418746754527092, - "learning_rate": 2.2657894736842105e-05, - "loss": 0.0042, - "step": 197 - }, - { - "epoch": 8.954545454545455, - "eval_loss": 0.002609600778669119, - "eval_runtime": 0.233, - "eval_samples_per_second": 377.684, - "eval_steps_per_second": 47.211, - "step": 197 - }, - { - "epoch": 9.0, - "grad_norm": 0.04399528354406357, - "learning_rate": 2.261842105263158e-05, - "loss": 0.0044, - "step": 198 - }, - { - "epoch": 9.0, - "eval_loss": 0.0025943187065422535, - "eval_runtime": 0.3847, - "eval_samples_per_second": 228.728, - "eval_steps_per_second": 28.591, - "step": 198 - }, - { - "epoch": 9.045454545454545, - "grad_norm": 0.04438379034399986, - "learning_rate": 2.2578947368421053e-05, - "loss": 0.0045, - "step": 199 - }, - { - "epoch": 9.045454545454545, - "eval_loss": 0.0025796808768063784, - "eval_runtime": 0.4906, - "eval_samples_per_second": 179.357, - "eval_steps_per_second": 22.42, - "step": 199 - }, - { - "epoch": 9.090909090909092, - "grad_norm": 0.03908194229006767, - "learning_rate": 2.2539473684210524e-05, - "loss": 0.0045, - "step": 200 - }, - { - "epoch": 9.090909090909092, - "eval_loss": 0.002565391594544053, - "eval_runtime": 0.2293, - "eval_samples_per_second": 383.783, - "eval_steps_per_second": 47.973, - "step": 200 - }, - { - "epoch": 9.136363636363637, - "grad_norm": 0.03590917959809303, - "learning_rate": 2.25e-05, - "loss": 0.0044, - "step": 201 - }, - { - "epoch": 9.136363636363637, - "eval_loss": 0.0025517421308904886, - "eval_runtime": 0.2317, - "eval_samples_per_second": 379.855, - "eval_steps_per_second": 47.482, - "step": 201 - }, - { - "epoch": 9.181818181818182, - "grad_norm": 0.0374373197555542, - "learning_rate": 2.2460526315789476e-05, - "loss": 0.0039, - "step": 202 - }, - { - "epoch": 9.181818181818182, - "eval_loss": 0.0025385154876857996, - "eval_runtime": 0.232, - "eval_samples_per_second": 379.247, - "eval_steps_per_second": 47.406, - "step": 202 - }, - { - "epoch": 9.227272727272727, - "grad_norm": 0.03761666640639305, - "learning_rate": 2.2421052631578946e-05, - "loss": 0.004, - "step": 203 - }, - { - "epoch": 9.227272727272727, - "eval_loss": 0.0025266585871577263, - "eval_runtime": 0.2386, - "eval_samples_per_second": 368.745, - "eval_steps_per_second": 46.093, - "step": 203 - }, - { - "epoch": 9.272727272727273, - "grad_norm": 0.033979009836912155, - "learning_rate": 2.2381578947368424e-05, - "loss": 0.004, - "step": 204 - }, - { - "epoch": 9.272727272727273, - "eval_loss": 0.0025138070341199636, - "eval_runtime": 0.2314, - "eval_samples_per_second": 380.28, - "eval_steps_per_second": 47.535, - "step": 204 - }, - { - "epoch": 9.318181818181818, - "grad_norm": 0.054837603121995926, - "learning_rate": 2.2342105263157895e-05, - "loss": 0.0042, - "step": 205 - }, - { - "epoch": 9.318181818181818, - "eval_loss": 0.002499848371371627, - "eval_runtime": 0.227, - "eval_samples_per_second": 387.733, - "eval_steps_per_second": 48.467, - "step": 205 - }, - { - "epoch": 9.363636363636363, - "grad_norm": 0.03884384036064148, - "learning_rate": 2.230263157894737e-05, - "loss": 0.0043, - "step": 206 - }, - { - "epoch": 9.363636363636363, - "eval_loss": 0.0024857125245034695, - "eval_runtime": 0.2294, - "eval_samples_per_second": 383.548, - "eval_steps_per_second": 47.944, - "step": 206 - }, - { - "epoch": 9.409090909090908, - "grad_norm": 0.03517827019095421, - "learning_rate": 2.2263157894736843e-05, - "loss": 0.004, - "step": 207 - }, - { - "epoch": 9.409090909090908, - "eval_loss": 0.00247101578861475, - "eval_runtime": 0.2336, - "eval_samples_per_second": 376.726, - "eval_steps_per_second": 47.091, - "step": 207 - }, - { - "epoch": 9.454545454545455, - "grad_norm": 0.04209022969007492, - "learning_rate": 2.2223684210526317e-05, - "loss": 0.0041, - "step": 208 - }, - { - "epoch": 9.454545454545455, - "eval_loss": 0.0024564675986766815, - "eval_runtime": 0.242, - "eval_samples_per_second": 363.631, - "eval_steps_per_second": 45.454, - "step": 208 - }, - { - "epoch": 9.5, - "grad_norm": 0.04031739383935928, - "learning_rate": 2.2184210526315788e-05, - "loss": 0.0042, - "step": 209 - }, - { - "epoch": 9.5, - "eval_loss": 0.002442182507365942, - "eval_runtime": 0.2384, - "eval_samples_per_second": 369.056, - "eval_steps_per_second": 46.132, - "step": 209 - }, - { - "epoch": 9.545454545454545, - "grad_norm": 0.03341998904943466, - "learning_rate": 2.2144736842105265e-05, - "loss": 0.0038, - "step": 210 - }, - { - "epoch": 9.545454545454545, - "eval_loss": 0.0024283959064632654, - "eval_runtime": 0.2386, - "eval_samples_per_second": 368.766, - "eval_steps_per_second": 46.096, - "step": 210 - }, - { - "epoch": 9.590909090909092, - "grad_norm": 0.033409975469112396, - "learning_rate": 2.2105263157894736e-05, - "loss": 0.0037, - "step": 211 - }, - { - "epoch": 9.590909090909092, - "eval_loss": 0.002414784161373973, - "eval_runtime": 0.2392, - "eval_samples_per_second": 367.843, - "eval_steps_per_second": 45.98, - "step": 211 - }, - { - "epoch": 9.636363636363637, - "grad_norm": 0.038544610142707825, - "learning_rate": 2.206578947368421e-05, - "loss": 0.0042, - "step": 212 - }, - { - "epoch": 9.636363636363637, - "eval_loss": 0.0024007910396903753, - "eval_runtime": 0.2355, - "eval_samples_per_second": 373.655, - "eval_steps_per_second": 46.707, - "step": 212 - }, - { - "epoch": 9.681818181818182, - "grad_norm": 0.031284794211387634, - "learning_rate": 2.2026315789473684e-05, - "loss": 0.0039, - "step": 213 - }, - { - "epoch": 9.681818181818182, - "eval_loss": 0.00238687708042562, - "eval_runtime": 0.2461, - "eval_samples_per_second": 357.651, - "eval_steps_per_second": 44.706, - "step": 213 - }, - { - "epoch": 9.727272727272727, - "grad_norm": 0.03589053079485893, - "learning_rate": 2.198684210526316e-05, - "loss": 0.004, - "step": 214 - }, - { - "epoch": 9.727272727272727, - "eval_loss": 0.002372899791225791, - "eval_runtime": 0.2388, - "eval_samples_per_second": 368.519, - "eval_steps_per_second": 46.065, - "step": 214 - }, - { - "epoch": 9.772727272727273, - "grad_norm": 0.03422442823648453, - "learning_rate": 2.1947368421052633e-05, - "loss": 0.0037, - "step": 215 - }, - { - "epoch": 9.772727272727273, - "eval_loss": 0.0023599357809871435, - "eval_runtime": 0.2324, - "eval_samples_per_second": 378.632, - "eval_steps_per_second": 47.329, - "step": 215 - }, - { - "epoch": 9.818181818181818, - "grad_norm": 0.03365776687860489, - "learning_rate": 2.1907894736842107e-05, - "loss": 0.0035, - "step": 216 - }, - { - "epoch": 9.818181818181818, - "eval_loss": 0.0023472688626497984, - "eval_runtime": 0.231, - "eval_samples_per_second": 380.916, - "eval_steps_per_second": 47.614, - "step": 216 - }, - { - "epoch": 9.863636363636363, - "grad_norm": 0.030327491462230682, - "learning_rate": 2.186842105263158e-05, - "loss": 0.0037, - "step": 217 - }, - { - "epoch": 9.863636363636363, - "eval_loss": 0.0023344962392002344, - "eval_runtime": 0.229, - "eval_samples_per_second": 384.224, - "eval_steps_per_second": 48.028, - "step": 217 - }, - { - "epoch": 9.909090909090908, - "grad_norm": 0.039349548518657684, - "learning_rate": 2.1828947368421052e-05, - "loss": 0.004, - "step": 218 - }, - { - "epoch": 9.909090909090908, - "eval_loss": 0.0023220828734338284, - "eval_runtime": 0.228, - "eval_samples_per_second": 385.959, - "eval_steps_per_second": 48.245, - "step": 218 - }, - { - "epoch": 9.954545454545455, - "grad_norm": 0.03199224919080734, - "learning_rate": 2.178947368421053e-05, - "loss": 0.0034, - "step": 219 - }, - { - "epoch": 9.954545454545455, - "eval_loss": 0.0023102990817278624, - "eval_runtime": 0.2311, - "eval_samples_per_second": 380.788, - "eval_steps_per_second": 47.598, - "step": 219 - }, - { - "epoch": 10.0, - "grad_norm": 0.03278977796435356, - "learning_rate": 2.175e-05, - "loss": 0.0036, - "step": 220 - }, - { - "epoch": 10.0, - "eval_loss": 0.002298795385286212, - "eval_runtime": 0.3275, - "eval_samples_per_second": 268.678, - "eval_steps_per_second": 33.585, - "step": 220 - }, - { - "epoch": 10.045454545454545, - "grad_norm": 0.0341983363032341, - "learning_rate": 2.1710526315789474e-05, - "loss": 0.0039, - "step": 221 - }, - { - "epoch": 10.045454545454545, - "eval_loss": 0.0022870004177093506, - "eval_runtime": 0.3861, - "eval_samples_per_second": 227.931, - "eval_steps_per_second": 28.491, - "step": 221 - }, - { - "epoch": 10.090909090909092, - "grad_norm": 0.03134067356586456, - "learning_rate": 2.167105263157895e-05, - "loss": 0.0038, - "step": 222 - }, - { - "epoch": 10.090909090909092, - "eval_loss": 0.002274780999869108, - "eval_runtime": 0.2973, - "eval_samples_per_second": 296.022, - "eval_steps_per_second": 37.003, - "step": 222 - }, - { - "epoch": 10.136363636363637, - "grad_norm": 0.03246266394853592, - "learning_rate": 2.1631578947368423e-05, - "loss": 0.0035, - "step": 223 - }, - { - "epoch": 10.136363636363637, - "eval_loss": 0.002262603724375367, - "eval_runtime": 0.2788, - "eval_samples_per_second": 315.607, - "eval_steps_per_second": 39.451, - "step": 223 - }, - { - "epoch": 10.181818181818182, - "grad_norm": 0.035311244428157806, - "learning_rate": 2.1592105263157893e-05, - "loss": 0.0036, - "step": 224 - }, - { - "epoch": 10.181818181818182, - "eval_loss": 0.002250505844131112, - "eval_runtime": 0.3259, - "eval_samples_per_second": 270.042, - "eval_steps_per_second": 33.755, - "step": 224 - }, - { - "epoch": 10.227272727272727, - "grad_norm": 0.03288138657808304, - "learning_rate": 2.155263157894737e-05, - "loss": 0.0039, - "step": 225 - }, - { - "epoch": 10.227272727272727, - "eval_loss": 0.0022388615179806948, - "eval_runtime": 0.3627, - "eval_samples_per_second": 242.648, - "eval_steps_per_second": 30.331, - "step": 225 - }, - { - "epoch": 10.272727272727273, - "grad_norm": 0.032804686576128006, - "learning_rate": 2.151315789473684e-05, - "loss": 0.0038, - "step": 226 - }, - { - "epoch": 10.272727272727273, - "eval_loss": 0.0022277701646089554, - "eval_runtime": 0.4861, - "eval_samples_per_second": 181.023, - "eval_steps_per_second": 22.628, - "step": 226 - }, - { - "epoch": 10.318181818181818, - "grad_norm": 0.036528490483760834, - "learning_rate": 2.1473684210526316e-05, - "loss": 0.004, - "step": 227 - }, - { - "epoch": 10.318181818181818, - "eval_loss": 0.0022167994175106287, - "eval_runtime": 0.3048, - "eval_samples_per_second": 288.714, - "eval_steps_per_second": 36.089, - "step": 227 - }, - { - "epoch": 10.363636363636363, - "grad_norm": 0.029931485652923584, - "learning_rate": 2.143421052631579e-05, - "loss": 0.0036, - "step": 228 - }, - { - "epoch": 10.363636363636363, - "eval_loss": 0.002205794909968972, - "eval_runtime": 0.2918, - "eval_samples_per_second": 301.612, - "eval_steps_per_second": 37.701, - "step": 228 - }, - { - "epoch": 10.409090909090908, - "grad_norm": 0.03588961437344551, - "learning_rate": 2.1394736842105264e-05, - "loss": 0.0039, - "step": 229 - }, - { - "epoch": 10.409090909090908, - "eval_loss": 0.0021950947120785713, - "eval_runtime": 0.2407, - "eval_samples_per_second": 365.554, - "eval_steps_per_second": 45.694, - "step": 229 - }, - { - "epoch": 10.454545454545455, - "grad_norm": 0.033503517508506775, - "learning_rate": 2.1355263157894738e-05, - "loss": 0.0036, - "step": 230 - }, - { - "epoch": 10.454545454545455, - "eval_loss": 0.0021843963768333197, - "eval_runtime": 0.2737, - "eval_samples_per_second": 321.531, - "eval_steps_per_second": 40.191, - "step": 230 - }, - { - "epoch": 10.5, - "grad_norm": 0.032428622245788574, - "learning_rate": 2.1315789473684212e-05, - "loss": 0.0035, - "step": 231 - }, - { - "epoch": 10.5, - "eval_loss": 0.002173727611079812, - "eval_runtime": 0.4053, - "eval_samples_per_second": 217.137, - "eval_steps_per_second": 27.142, - "step": 231 - }, - { - "epoch": 10.545454545454545, - "grad_norm": 0.0326942577958107, - "learning_rate": 2.1276315789473687e-05, - "loss": 0.0035, - "step": 232 - }, - { - "epoch": 10.545454545454545, - "eval_loss": 0.0021637016907334328, - "eval_runtime": 0.7117, - "eval_samples_per_second": 123.656, - "eval_steps_per_second": 15.457, - "step": 232 - }, - { - "epoch": 10.590909090909092, - "grad_norm": 0.03240852802991867, - "learning_rate": 2.1236842105263157e-05, - "loss": 0.0034, - "step": 233 - }, - { - "epoch": 10.590909090909092, - "eval_loss": 0.002153951907530427, - "eval_runtime": 0.2454, - "eval_samples_per_second": 358.581, - "eval_steps_per_second": 44.823, - "step": 233 - }, - { - "epoch": 10.636363636363637, - "grad_norm": 0.029470907524228096, - "learning_rate": 2.119736842105263e-05, - "loss": 0.0035, - "step": 234 - }, - { - "epoch": 10.636363636363637, - "eval_loss": 0.002144550671800971, - "eval_runtime": 0.2443, - "eval_samples_per_second": 360.165, - "eval_steps_per_second": 45.021, - "step": 234 - }, - { - "epoch": 10.681818181818182, - "grad_norm": 0.02820722572505474, - "learning_rate": 2.1157894736842106e-05, - "loss": 0.0034, - "step": 235 - }, - { - "epoch": 10.681818181818182, - "eval_loss": 0.002135734772309661, - "eval_runtime": 0.2643, - "eval_samples_per_second": 333.008, - "eval_steps_per_second": 41.626, - "step": 235 - }, - { - "epoch": 10.727272727272727, - "grad_norm": 0.02772766724228859, - "learning_rate": 2.111842105263158e-05, - "loss": 0.0033, - "step": 236 - }, - { - "epoch": 10.727272727272727, - "eval_loss": 0.0021269202698022127, - "eval_runtime": 0.2751, - "eval_samples_per_second": 319.882, - "eval_steps_per_second": 39.985, - "step": 236 - }, - { - "epoch": 10.772727272727273, - "grad_norm": 0.03653711825609207, - "learning_rate": 2.107894736842105e-05, - "loss": 0.0038, - "step": 237 - }, - { - "epoch": 10.772727272727273, - "eval_loss": 0.0021178810857236385, - "eval_runtime": 0.227, - "eval_samples_per_second": 387.716, - "eval_steps_per_second": 48.465, - "step": 237 - }, - { - "epoch": 10.818181818181818, - "grad_norm": 0.03011268563568592, - "learning_rate": 2.1039473684210528e-05, - "loss": 0.0035, - "step": 238 - }, - { - "epoch": 10.818181818181818, - "eval_loss": 0.002109181135892868, - "eval_runtime": 0.2398, - "eval_samples_per_second": 366.897, - "eval_steps_per_second": 45.862, - "step": 238 - }, - { - "epoch": 10.863636363636363, - "grad_norm": 0.025909798219799995, - "learning_rate": 2.1e-05, - "loss": 0.003, - "step": 239 - }, - { - "epoch": 10.863636363636363, - "eval_loss": 0.0021006783936172724, - "eval_runtime": 0.2342, - "eval_samples_per_second": 375.674, - "eval_steps_per_second": 46.959, - "step": 239 - }, - { - "epoch": 10.909090909090908, - "grad_norm": 0.02720109187066555, - "learning_rate": 2.0960526315789473e-05, - "loss": 0.0033, - "step": 240 - }, - { - "epoch": 10.909090909090908, - "eval_loss": 0.002092132344841957, - "eval_runtime": 0.2362, - "eval_samples_per_second": 372.632, - "eval_steps_per_second": 46.579, - "step": 240 - }, - { - "epoch": 10.954545454545455, - "grad_norm": 0.03358568996191025, - "learning_rate": 2.0921052631578947e-05, - "loss": 0.0034, - "step": 241 - }, - { - "epoch": 10.954545454545455, - "eval_loss": 0.0020830982830375433, - "eval_runtime": 0.2268, - "eval_samples_per_second": 387.964, - "eval_steps_per_second": 48.496, - "step": 241 - }, - { - "epoch": 11.0, - "grad_norm": 0.030720144510269165, - "learning_rate": 2.088157894736842e-05, - "loss": 0.0036, - "step": 242 - }, - { - "epoch": 11.0, - "eval_loss": 0.002074107527732849, - "eval_runtime": 0.2253, - "eval_samples_per_second": 390.639, - "eval_steps_per_second": 48.83, - "step": 242 - }, - { - "epoch": 11.045454545454545, - "grad_norm": 0.029408905655145645, - "learning_rate": 2.0842105263157895e-05, - "loss": 0.0035, - "step": 243 - }, - { - "epoch": 11.045454545454545, - "eval_loss": 0.0020653316751122475, - "eval_runtime": 0.234, - "eval_samples_per_second": 376.079, - "eval_steps_per_second": 47.01, - "step": 243 - }, - { - "epoch": 11.090909090909092, - "grad_norm": 0.02971459925174713, - "learning_rate": 2.080263157894737e-05, - "loss": 0.0034, - "step": 244 - }, - { - "epoch": 11.090909090909092, - "eval_loss": 0.0020563837606459856, - "eval_runtime": 0.2306, - "eval_samples_per_second": 381.673, - "eval_steps_per_second": 47.709, - "step": 244 - }, - { - "epoch": 11.136363636363637, - "grad_norm": 0.028164513409137726, - "learning_rate": 2.0763157894736844e-05, - "loss": 0.0034, - "step": 245 - }, - { - "epoch": 11.136363636363637, - "eval_loss": 0.0020477415528148413, - "eval_runtime": 0.2363, - "eval_samples_per_second": 372.455, - "eval_steps_per_second": 46.557, - "step": 245 - }, - { - "epoch": 11.181818181818182, - "grad_norm": 0.027845608070492744, - "learning_rate": 2.0723684210526315e-05, - "loss": 0.0034, - "step": 246 - }, - { - "epoch": 11.181818181818182, - "eval_loss": 0.002039202954620123, - "eval_runtime": 0.2314, - "eval_samples_per_second": 380.293, - "eval_steps_per_second": 47.537, - "step": 246 - }, - { - "epoch": 11.227272727272727, - "grad_norm": 0.03046409972012043, - "learning_rate": 2.0684210526315792e-05, - "loss": 0.0035, - "step": 247 - }, - { - "epoch": 11.227272727272727, - "eval_loss": 0.0020310634281486273, - "eval_runtime": 0.2258, - "eval_samples_per_second": 389.786, - "eval_steps_per_second": 48.723, - "step": 247 - }, - { - "epoch": 11.272727272727273, - "grad_norm": 0.025676798075437546, - "learning_rate": 2.0644736842105263e-05, - "loss": 0.0031, - "step": 248 - }, - { - "epoch": 11.272727272727273, - "eval_loss": 0.0020227304194122553, - "eval_runtime": 0.2266, - "eval_samples_per_second": 388.395, - "eval_steps_per_second": 48.549, - "step": 248 - }, - { - "epoch": 11.318181818181818, - "grad_norm": 0.029285188764333725, - "learning_rate": 2.0605263157894737e-05, - "loss": 0.0036, - "step": 249 - }, - { - "epoch": 11.318181818181818, - "eval_loss": 0.0020139189437031746, - "eval_runtime": 0.2399, - "eval_samples_per_second": 366.874, - "eval_steps_per_second": 45.859, - "step": 249 - }, - { - "epoch": 11.363636363636363, - "grad_norm": 0.03067379631102085, - "learning_rate": 2.056578947368421e-05, - "loss": 0.0033, - "step": 250 - }, - { - "epoch": 11.363636363636363, - "eval_loss": 0.0020049491431564093, - "eval_runtime": 0.2296, - "eval_samples_per_second": 383.216, - "eval_steps_per_second": 47.902, - "step": 250 - }, - { - "epoch": 11.409090909090908, - "grad_norm": 0.030429691076278687, - "learning_rate": 2.0526315789473685e-05, - "loss": 0.0034, - "step": 251 - }, - { - "epoch": 11.409090909090908, - "eval_loss": 0.0019955127499997616, - "eval_runtime": 0.3825, - "eval_samples_per_second": 230.047, - "eval_steps_per_second": 28.756, - "step": 251 - }, - { - "epoch": 11.454545454545455, - "grad_norm": 0.03006516583263874, - "learning_rate": 2.0486842105263156e-05, - "loss": 0.0032, - "step": 252 - }, - { - "epoch": 11.454545454545455, - "eval_loss": 0.001985815353691578, - "eval_runtime": 0.5232, - "eval_samples_per_second": 168.209, - "eval_steps_per_second": 21.026, - "step": 252 - }, - { - "epoch": 11.5, - "grad_norm": 0.03021743707358837, - "learning_rate": 2.0447368421052634e-05, - "loss": 0.0035, - "step": 253 - }, - { - "epoch": 11.5, - "eval_loss": 0.001975873252376914, - "eval_runtime": 0.5816, - "eval_samples_per_second": 151.301, - "eval_steps_per_second": 18.913, - "step": 253 - }, - { - "epoch": 11.545454545454545, - "grad_norm": 0.026514986529946327, - "learning_rate": 2.0407894736842104e-05, - "loss": 0.0032, - "step": 254 - }, - { - "epoch": 11.545454545454545, - "eval_loss": 0.0019660864491015673, - "eval_runtime": 0.2403, - "eval_samples_per_second": 366.164, - "eval_steps_per_second": 45.77, - "step": 254 - }, - { - "epoch": 11.590909090909092, - "grad_norm": 0.028690319508314133, - "learning_rate": 2.036842105263158e-05, - "loss": 0.0033, - "step": 255 - }, - { - "epoch": 11.590909090909092, - "eval_loss": 0.0019563750829547644, - "eval_runtime": 0.2248, - "eval_samples_per_second": 391.417, - "eval_steps_per_second": 48.927, - "step": 255 - }, - { - "epoch": 11.636363636363637, - "grad_norm": 0.03033028170466423, - "learning_rate": 2.0328947368421056e-05, - "loss": 0.0034, - "step": 256 - }, - { - "epoch": 11.636363636363637, - "eval_loss": 0.0019468939863145351, - "eval_runtime": 0.2311, - "eval_samples_per_second": 380.835, - "eval_steps_per_second": 47.604, - "step": 256 - }, - { - "epoch": 11.681818181818182, - "grad_norm": 0.03320786729454994, - "learning_rate": 2.0289473684210527e-05, - "loss": 0.0035, - "step": 257 - }, - { - "epoch": 11.681818181818182, - "eval_loss": 0.0019374135881662369, - "eval_runtime": 0.2307, - "eval_samples_per_second": 381.512, - "eval_steps_per_second": 47.689, - "step": 257 - }, - { - "epoch": 11.727272727272727, - "grad_norm": 0.027468524873256683, - "learning_rate": 2.025e-05, - "loss": 0.0031, - "step": 258 - }, - { - "epoch": 11.727272727272727, - "eval_loss": 0.0019284605514258146, - "eval_runtime": 0.2303, - "eval_samples_per_second": 382.049, - "eval_steps_per_second": 47.756, - "step": 258 - }, - { - "epoch": 11.772727272727273, - "grad_norm": 0.02426382340490818, - "learning_rate": 2.0210526315789475e-05, - "loss": 0.0029, - "step": 259 - }, - { - "epoch": 11.772727272727273, - "eval_loss": 0.0019197481451556087, - "eval_runtime": 0.23, - "eval_samples_per_second": 382.529, - "eval_steps_per_second": 47.816, - "step": 259 - }, - { - "epoch": 11.818181818181818, - "grad_norm": 0.028253108263015747, - "learning_rate": 2.017105263157895e-05, - "loss": 0.003, - "step": 260 - }, - { - "epoch": 11.818181818181818, - "eval_loss": 0.0019117832416668534, - "eval_runtime": 0.2345, - "eval_samples_per_second": 375.238, - "eval_steps_per_second": 46.905, - "step": 260 - }, - { - "epoch": 11.863636363636363, - "grad_norm": 0.03305625915527344, - "learning_rate": 2.013157894736842e-05, - "loss": 0.0034, - "step": 261 - }, - { - "epoch": 11.863636363636363, - "eval_loss": 0.0019041887717321515, - "eval_runtime": 0.2239, - "eval_samples_per_second": 393.025, - "eval_steps_per_second": 49.128, - "step": 261 - }, - { - "epoch": 11.909090909090908, - "grad_norm": 0.027725212275981903, - "learning_rate": 2.0092105263157898e-05, - "loss": 0.0033, - "step": 262 - }, - { - "epoch": 11.909090909090908, - "eval_loss": 0.0018966187490150332, - "eval_runtime": 0.2303, - "eval_samples_per_second": 382.148, - "eval_steps_per_second": 47.769, - "step": 262 - }, - { - "epoch": 11.954545454545455, - "grad_norm": 0.02550244890153408, - "learning_rate": 2.0052631578947368e-05, - "loss": 0.0032, - "step": 263 - }, - { - "epoch": 11.954545454545455, - "eval_loss": 0.0018891972722485662, - "eval_runtime": 0.2274, - "eval_samples_per_second": 386.939, - "eval_steps_per_second": 48.367, - "step": 263 - }, - { - "epoch": 12.0, - "grad_norm": 0.02780972793698311, - "learning_rate": 2.0013157894736842e-05, - "loss": 0.0034, - "step": 264 - }, - { - "epoch": 12.0, - "eval_loss": 0.001881771837361157, - "eval_runtime": 0.2332, - "eval_samples_per_second": 377.388, - "eval_steps_per_second": 47.174, - "step": 264 - }, - { - "epoch": 12.045454545454545, - "grad_norm": 0.03385490924119949, - "learning_rate": 1.9973684210526317e-05, - "loss": 0.0034, - "step": 265 - }, - { - "epoch": 12.045454545454545, - "eval_loss": 0.001874623354524374, - "eval_runtime": 0.2413, - "eval_samples_per_second": 364.627, - "eval_steps_per_second": 45.578, - "step": 265 - }, - { - "epoch": 12.090909090909092, - "grad_norm": 0.029128815978765488, - "learning_rate": 1.993421052631579e-05, - "loss": 0.003, - "step": 266 - }, - { - "epoch": 12.090909090909092, - "eval_loss": 0.0018677938496693969, - "eval_runtime": 0.235, - "eval_samples_per_second": 374.427, - "eval_steps_per_second": 46.803, - "step": 266 - }, - { - "epoch": 12.136363636363637, - "grad_norm": 0.025781184434890747, - "learning_rate": 1.989473684210526e-05, - "loss": 0.0031, - "step": 267 - }, - { - "epoch": 12.136363636363637, - "eval_loss": 0.001861188909970224, - "eval_runtime": 0.2382, - "eval_samples_per_second": 369.363, - "eval_steps_per_second": 46.17, - "step": 267 - }, - { - "epoch": 12.181818181818182, - "grad_norm": 0.0294223353266716, - "learning_rate": 1.985526315789474e-05, - "loss": 0.0033, - "step": 268 - }, - { - "epoch": 12.181818181818182, - "eval_loss": 0.001854045782238245, - "eval_runtime": 0.2289, - "eval_samples_per_second": 384.52, - "eval_steps_per_second": 48.065, - "step": 268 - }, - { - "epoch": 12.227272727272727, - "grad_norm": 0.028326552361249924, - "learning_rate": 1.9815789473684213e-05, - "loss": 0.003, - "step": 269 - }, - { - "epoch": 12.227272727272727, - "eval_loss": 0.0018470593495294452, - "eval_runtime": 0.2289, - "eval_samples_per_second": 384.399, - "eval_steps_per_second": 48.05, - "step": 269 - }, - { - "epoch": 12.272727272727273, - "grad_norm": 0.030360590666532516, - "learning_rate": 1.9776315789473684e-05, - "loss": 0.0031, - "step": 270 - }, - { - "epoch": 12.272727272727273, - "eval_loss": 0.0018398199463263154, - "eval_runtime": 0.2311, - "eval_samples_per_second": 380.8, - "eval_steps_per_second": 47.6, - "step": 270 - }, - { - "epoch": 12.318181818181818, - "grad_norm": 0.02833518758416176, - "learning_rate": 1.9736842105263158e-05, - "loss": 0.0034, - "step": 271 - }, - { - "epoch": 12.318181818181818, - "eval_loss": 0.0018325141863897443, - "eval_runtime": 0.233, - "eval_samples_per_second": 377.758, - "eval_steps_per_second": 47.22, - "step": 271 - }, - { - "epoch": 12.363636363636363, - "grad_norm": 0.029960816726088524, - "learning_rate": 1.9697368421052632e-05, - "loss": 0.0032, - "step": 272 - }, - { - "epoch": 12.363636363636363, - "eval_loss": 0.0018252148292958736, - "eval_runtime": 0.231, - "eval_samples_per_second": 381.016, - "eval_steps_per_second": 47.627, - "step": 272 - }, - { - "epoch": 12.409090909090908, - "grad_norm": 0.027226990088820457, - "learning_rate": 1.9657894736842106e-05, - "loss": 0.0029, - "step": 273 - }, - { - "epoch": 12.409090909090908, - "eval_loss": 0.0018177549354732037, - "eval_runtime": 0.233, - "eval_samples_per_second": 377.605, - "eval_steps_per_second": 47.201, - "step": 273 - }, - { - "epoch": 12.454545454545455, - "grad_norm": 0.02402249164879322, - "learning_rate": 1.9618421052631577e-05, - "loss": 0.0029, - "step": 274 - }, - { - "epoch": 12.454545454545455, - "eval_loss": 0.0018104868941009045, - "eval_runtime": 0.2464, - "eval_samples_per_second": 357.208, - "eval_steps_per_second": 44.651, - "step": 274 - }, - { - "epoch": 12.5, - "grad_norm": 0.025068577378988266, - "learning_rate": 1.9578947368421055e-05, - "loss": 0.003, - "step": 275 - }, - { - "epoch": 12.5, - "eval_loss": 0.0018031727522611618, - "eval_runtime": 0.2561, - "eval_samples_per_second": 343.628, - "eval_steps_per_second": 42.953, - "step": 275 - }, - { - "epoch": 12.545454545454545, - "grad_norm": 0.03290198743343353, - "learning_rate": 1.9539473684210525e-05, - "loss": 0.0032, - "step": 276 - }, - { - "epoch": 12.545454545454545, - "eval_loss": 0.0017959319520741701, - "eval_runtime": 0.2473, - "eval_samples_per_second": 355.844, - "eval_steps_per_second": 44.48, - "step": 276 - }, - { - "epoch": 12.590909090909092, - "grad_norm": 0.025103066116571426, - "learning_rate": 1.95e-05, - "loss": 0.0028, - "step": 277 - }, - { - "epoch": 12.590909090909092, - "eval_loss": 0.0017883635591715574, - "eval_runtime": 0.2312, - "eval_samples_per_second": 380.663, - "eval_steps_per_second": 47.583, - "step": 277 - }, - { - "epoch": 12.636363636363637, - "grad_norm": 0.02768297679722309, - "learning_rate": 1.9460526315789474e-05, - "loss": 0.003, - "step": 278 - }, - { - "epoch": 12.636363636363637, - "eval_loss": 0.0017810885328799486, - "eval_runtime": 0.2411, - "eval_samples_per_second": 365.033, - "eval_steps_per_second": 45.629, - "step": 278 - }, - { - "epoch": 12.681818181818182, - "grad_norm": 0.026979558169841766, - "learning_rate": 1.9421052631578948e-05, - "loss": 0.0033, - "step": 279 - }, - { - "epoch": 12.681818181818182, - "eval_loss": 0.0017738312017172575, - "eval_runtime": 0.2981, - "eval_samples_per_second": 295.202, - "eval_steps_per_second": 36.9, - "step": 279 - }, - { - "epoch": 12.727272727272727, - "grad_norm": 0.025757014751434326, - "learning_rate": 1.938157894736842e-05, - "loss": 0.0031, - "step": 280 - }, - { - "epoch": 12.727272727272727, - "eval_loss": 0.0017666955245658755, - "eval_runtime": 0.2467, - "eval_samples_per_second": 356.773, - "eval_steps_per_second": 44.597, - "step": 280 - }, - { - "epoch": 12.772727272727273, - "grad_norm": 0.026617391034960747, - "learning_rate": 1.9342105263157896e-05, - "loss": 0.003, - "step": 281 - }, - { - "epoch": 12.772727272727273, - "eval_loss": 0.0017593905795365572, - "eval_runtime": 0.2388, - "eval_samples_per_second": 368.469, - "eval_steps_per_second": 46.059, - "step": 281 - }, - { - "epoch": 12.818181818181818, - "grad_norm": 0.027713097631931305, - "learning_rate": 1.9302631578947367e-05, - "loss": 0.0028, - "step": 282 - }, - { - "epoch": 12.818181818181818, - "eval_loss": 0.0017523803981021047, - "eval_runtime": 0.2531, - "eval_samples_per_second": 347.71, - "eval_steps_per_second": 43.464, - "step": 282 - }, - { - "epoch": 12.863636363636363, - "grad_norm": 0.021941719576716423, - "learning_rate": 1.926315789473684e-05, - "loss": 0.0028, - "step": 283 - }, - { - "epoch": 12.863636363636363, - "eval_loss": 0.0017456583445891738, - "eval_runtime": 0.2275, - "eval_samples_per_second": 386.831, - "eval_steps_per_second": 48.354, - "step": 283 - }, - { - "epoch": 12.909090909090908, - "grad_norm": 0.029443973675370216, - "learning_rate": 1.922368421052632e-05, - "loss": 0.0029, - "step": 284 - }, - { - "epoch": 12.909090909090908, - "eval_loss": 0.0017391174333170056, - "eval_runtime": 0.2259, - "eval_samples_per_second": 389.61, - "eval_steps_per_second": 48.701, - "step": 284 - }, - { - "epoch": 12.954545454545455, - "grad_norm": 0.023187711834907532, - "learning_rate": 1.918421052631579e-05, - "loss": 0.0027, - "step": 285 - }, - { - "epoch": 12.954545454545455, - "eval_loss": 0.0017328561516478658, - "eval_runtime": 0.2229, - "eval_samples_per_second": 394.794, - "eval_steps_per_second": 49.349, - "step": 285 - }, - { - "epoch": 13.0, - "grad_norm": 0.02683272212743759, - "learning_rate": 1.9144736842105264e-05, - "loss": 0.0028, - "step": 286 - }, - { - "epoch": 13.0, - "eval_loss": 0.0017264141933992505, - "eval_runtime": 0.2281, - "eval_samples_per_second": 385.759, - "eval_steps_per_second": 48.22, - "step": 286 - }, - { - "epoch": 13.045454545454545, - "grad_norm": 0.026485104113817215, - "learning_rate": 1.9105263157894738e-05, - "loss": 0.0029, - "step": 287 - }, - { - "epoch": 13.045454545454545, - "eval_loss": 0.0017197772394865751, - "eval_runtime": 0.2245, - "eval_samples_per_second": 392.011, - "eval_steps_per_second": 49.001, - "step": 287 - }, - { - "epoch": 13.090909090909092, - "grad_norm": 0.025229312479496002, - "learning_rate": 1.9065789473684212e-05, - "loss": 0.0027, - "step": 288 - }, - { - "epoch": 13.090909090909092, - "eval_loss": 0.0017132211942225695, - "eval_runtime": 0.2288, - "eval_samples_per_second": 384.654, - "eval_steps_per_second": 48.082, - "step": 288 - }, - { - "epoch": 13.136363636363637, - "grad_norm": 0.026387052610516548, - "learning_rate": 1.9026315789473683e-05, - "loss": 0.003, - "step": 289 - }, - { - "epoch": 13.136363636363637, - "eval_loss": 0.001706792158074677, - "eval_runtime": 0.2251, - "eval_samples_per_second": 390.981, - "eval_steps_per_second": 48.873, - "step": 289 - }, - { - "epoch": 13.181818181818182, - "grad_norm": 0.0232387688010931, - "learning_rate": 1.898684210526316e-05, - "loss": 0.0028, - "step": 290 - }, - { - "epoch": 13.181818181818182, - "eval_loss": 0.0017004094552248716, - "eval_runtime": 0.2307, - "eval_samples_per_second": 381.375, - "eval_steps_per_second": 47.672, - "step": 290 - }, - { - "epoch": 13.227272727272727, - "grad_norm": 0.030720511451363564, - "learning_rate": 1.894736842105263e-05, - "loss": 0.003, - "step": 291 - }, - { - "epoch": 13.227272727272727, - "eval_loss": 0.0016942427027970552, - "eval_runtime": 0.2316, - "eval_samples_per_second": 379.934, - "eval_steps_per_second": 47.492, - "step": 291 - }, - { - "epoch": 13.272727272727273, - "grad_norm": 0.023519422858953476, - "learning_rate": 1.8907894736842105e-05, - "loss": 0.0025, - "step": 292 - }, - { - "epoch": 13.272727272727273, - "eval_loss": 0.0016882912022992969, - "eval_runtime": 0.2298, - "eval_samples_per_second": 383.008, - "eval_steps_per_second": 47.876, - "step": 292 - }, - { - "epoch": 13.318181818181818, - "grad_norm": 0.02608366496860981, - "learning_rate": 1.886842105263158e-05, - "loss": 0.003, - "step": 293 - }, - { - "epoch": 13.318181818181818, - "eval_loss": 0.001682400587014854, - "eval_runtime": 0.2333, - "eval_samples_per_second": 377.26, - "eval_steps_per_second": 47.157, - "step": 293 - }, - { - "epoch": 13.363636363636363, - "grad_norm": 0.02541464753448963, - "learning_rate": 1.8828947368421053e-05, - "loss": 0.0028, - "step": 294 - }, - { - "epoch": 13.363636363636363, - "eval_loss": 0.0016764701576903462, - "eval_runtime": 0.2276, - "eval_samples_per_second": 386.598, - "eval_steps_per_second": 48.325, - "step": 294 - }, - { - "epoch": 13.409090909090908, - "grad_norm": 0.026540333405137062, - "learning_rate": 1.8789473684210524e-05, - "loss": 0.0028, - "step": 295 - }, - { - "epoch": 13.409090909090908, - "eval_loss": 0.0016703385626897216, - "eval_runtime": 0.2313, - "eval_samples_per_second": 380.436, - "eval_steps_per_second": 47.554, - "step": 295 - }, - { - "epoch": 13.454545454545455, - "grad_norm": 0.021979449316859245, - "learning_rate": 1.8750000000000002e-05, - "loss": 0.0027, - "step": 296 - }, - { - "epoch": 13.454545454545455, - "eval_loss": 0.0016644434072077274, - "eval_runtime": 0.2267, - "eval_samples_per_second": 388.248, - "eval_steps_per_second": 48.531, - "step": 296 - }, - { - "epoch": 13.5, - "grad_norm": 0.027137625962495804, - "learning_rate": 1.8710526315789476e-05, - "loss": 0.0027, - "step": 297 - }, - { - "epoch": 13.5, - "eval_loss": 0.001658798661082983, - "eval_runtime": 0.2286, - "eval_samples_per_second": 384.972, - "eval_steps_per_second": 48.121, - "step": 297 - }, - { - "epoch": 13.545454545454545, - "grad_norm": 0.02321833185851574, - "learning_rate": 1.8671052631578947e-05, - "loss": 0.0027, - "step": 298 - }, - { - "epoch": 13.545454545454545, - "eval_loss": 0.001653428073041141, - "eval_runtime": 0.227, - "eval_samples_per_second": 387.714, - "eval_steps_per_second": 48.464, - "step": 298 - }, - { - "epoch": 13.590909090909092, - "grad_norm": 0.028996985405683517, - "learning_rate": 1.8631578947368424e-05, - "loss": 0.0029, - "step": 299 - }, - { - "epoch": 13.590909090909092, - "eval_loss": 0.0016476112650707364, - "eval_runtime": 0.2299, - "eval_samples_per_second": 382.812, - "eval_steps_per_second": 47.852, - "step": 299 - }, - { - "epoch": 13.636363636363637, - "grad_norm": 0.028486257418990135, - "learning_rate": 1.8592105263157895e-05, - "loss": 0.0027, - "step": 300 - }, - { - "epoch": 13.636363636363637, - "eval_loss": 0.001642104354687035, - "eval_runtime": 0.2398, - "eval_samples_per_second": 367.028, - "eval_steps_per_second": 45.878, - "step": 300 - }, - { - "epoch": 13.681818181818182, - "grad_norm": 0.022658037021756172, - "learning_rate": 1.855263157894737e-05, - "loss": 0.0025, - "step": 301 - }, - { - "epoch": 13.681818181818182, - "eval_loss": 0.0016368039650842547, - "eval_runtime": 0.2377, - "eval_samples_per_second": 370.172, - "eval_steps_per_second": 46.271, - "step": 301 - }, - { - "epoch": 13.727272727272727, - "grad_norm": 0.024452779442071915, - "learning_rate": 1.8513157894736843e-05, - "loss": 0.0028, - "step": 302 - }, - { - "epoch": 13.727272727272727, - "eval_loss": 0.0016317162662744522, - "eval_runtime": 0.2252, - "eval_samples_per_second": 390.707, - "eval_steps_per_second": 48.838, - "step": 302 - }, - { - "epoch": 13.772727272727273, - "grad_norm": 0.02014131471514702, - "learning_rate": 1.8473684210526317e-05, - "loss": 0.0024, - "step": 303 - }, - { - "epoch": 13.772727272727273, - "eval_loss": 0.001626785146072507, - "eval_runtime": 0.2343, - "eval_samples_per_second": 375.607, - "eval_steps_per_second": 46.951, - "step": 303 - }, - { - "epoch": 13.818181818181818, - "grad_norm": 0.02657116763293743, - "learning_rate": 1.8434210526315788e-05, - "loss": 0.0025, - "step": 304 - }, - { - "epoch": 13.818181818181818, - "eval_loss": 0.001621657982468605, - "eval_runtime": 0.2287, - "eval_samples_per_second": 384.722, - "eval_steps_per_second": 48.09, - "step": 304 - }, - { - "epoch": 13.863636363636363, - "grad_norm": 0.02328609488904476, - "learning_rate": 1.8394736842105266e-05, - "loss": 0.0025, - "step": 305 - }, - { - "epoch": 13.863636363636363, - "eval_loss": 0.001616165740415454, - "eval_runtime": 0.2335, - "eval_samples_per_second": 376.921, - "eval_steps_per_second": 47.115, - "step": 305 - }, - { - "epoch": 13.909090909090908, - "grad_norm": 0.02286568656563759, - "learning_rate": 1.8355263157894736e-05, - "loss": 0.0027, - "step": 306 - }, - { - "epoch": 13.909090909090908, - "eval_loss": 0.001610812614671886, - "eval_runtime": 0.2295, - "eval_samples_per_second": 383.365, - "eval_steps_per_second": 47.921, - "step": 306 - }, - { - "epoch": 13.954545454545455, - "grad_norm": 0.025216739624738693, - "learning_rate": 1.831578947368421e-05, - "loss": 0.0026, - "step": 307 - }, - { - "epoch": 13.954545454545455, - "eval_loss": 0.001605312223546207, - "eval_runtime": 0.2306, - "eval_samples_per_second": 381.68, - "eval_steps_per_second": 47.71, - "step": 307 - }, - { - "epoch": 14.0, - "grad_norm": 0.02698989026248455, - "learning_rate": 1.8276315789473685e-05, - "loss": 0.003, - "step": 308 - }, - { - "epoch": 14.0, - "eval_loss": 0.001599607290700078, - "eval_runtime": 0.234, - "eval_samples_per_second": 376.05, - "eval_steps_per_second": 47.006, - "step": 308 - }, - { - "epoch": 14.045454545454545, - "grad_norm": 0.02121439203619957, - "learning_rate": 1.823684210526316e-05, - "loss": 0.0026, - "step": 309 - }, - { - "epoch": 14.045454545454545, - "eval_loss": 0.0015940162120386958, - "eval_runtime": 0.2335, - "eval_samples_per_second": 376.91, - "eval_steps_per_second": 47.114, - "step": 309 - }, - { - "epoch": 14.090909090909092, - "grad_norm": 0.02412167377769947, - "learning_rate": 1.8197368421052633e-05, - "loss": 0.0028, - "step": 310 - }, - { - "epoch": 14.090909090909092, - "eval_loss": 0.001588392653502524, - "eval_runtime": 0.2321, - "eval_samples_per_second": 379.14, - "eval_steps_per_second": 47.392, - "step": 310 - }, - { - "epoch": 14.136363636363637, - "grad_norm": 0.02534678392112255, - "learning_rate": 1.8157894736842107e-05, - "loss": 0.0027, - "step": 311 - }, - { - "epoch": 14.136363636363637, - "eval_loss": 0.0015829313779249787, - "eval_runtime": 0.2274, - "eval_samples_per_second": 386.989, - "eval_steps_per_second": 48.374, - "step": 311 - }, - { - "epoch": 14.181818181818182, - "grad_norm": 0.021638307720422745, - "learning_rate": 1.811842105263158e-05, - "loss": 0.0025, - "step": 312 - }, - { - "epoch": 14.181818181818182, - "eval_loss": 0.0015773712657392025, - "eval_runtime": 0.2294, - "eval_samples_per_second": 383.682, - "eval_steps_per_second": 47.96, - "step": 312 - }, - { - "epoch": 14.227272727272727, - "grad_norm": 0.024357490241527557, - "learning_rate": 1.8078947368421052e-05, - "loss": 0.0027, - "step": 313 - }, - { - "epoch": 14.227272727272727, - "eval_loss": 0.0015717636561021209, - "eval_runtime": 0.2294, - "eval_samples_per_second": 383.662, - "eval_steps_per_second": 47.958, - "step": 313 - }, - { - "epoch": 14.272727272727273, - "grad_norm": 0.022512707859277725, - "learning_rate": 1.8039473684210526e-05, - "loss": 0.0026, - "step": 314 - }, - { - "epoch": 14.272727272727273, - "eval_loss": 0.001566153485327959, - "eval_runtime": 0.2263, - "eval_samples_per_second": 388.817, - "eval_steps_per_second": 48.602, - "step": 314 - }, - { - "epoch": 14.318181818181818, - "grad_norm": 0.022913463413715363, - "learning_rate": 1.8e-05, - "loss": 0.0026, - "step": 315 - }, - { - "epoch": 14.318181818181818, - "eval_loss": 0.001560671953484416, - "eval_runtime": 0.2319, - "eval_samples_per_second": 379.401, - "eval_steps_per_second": 47.425, - "step": 315 - }, - { - "epoch": 14.363636363636363, - "grad_norm": 0.024906402453780174, - "learning_rate": 1.7960526315789475e-05, - "loss": 0.0026, - "step": 316 - }, - { - "epoch": 14.363636363636363, - "eval_loss": 0.0015550776151940227, - "eval_runtime": 0.2309, - "eval_samples_per_second": 381.176, - "eval_steps_per_second": 47.647, - "step": 316 - }, - { - "epoch": 14.409090909090908, - "grad_norm": 0.020846841856837273, - "learning_rate": 1.7921052631578945e-05, - "loss": 0.0024, - "step": 317 - }, - { - "epoch": 14.409090909090908, - "eval_loss": 0.0015492510283365846, - "eval_runtime": 0.23, - "eval_samples_per_second": 382.625, - "eval_steps_per_second": 47.828, - "step": 317 - }, - { - "epoch": 14.454545454545455, - "grad_norm": 0.020949576050043106, - "learning_rate": 1.7881578947368423e-05, - "loss": 0.0024, - "step": 318 - }, - { - "epoch": 14.454545454545455, - "eval_loss": 0.001543792081065476, - "eval_runtime": 0.2687, - "eval_samples_per_second": 327.535, - "eval_steps_per_second": 40.942, - "step": 318 - }, - { - "epoch": 14.5, - "grad_norm": 0.027320073917508125, - "learning_rate": 1.7842105263157894e-05, - "loss": 0.0029, - "step": 319 - }, - { - "epoch": 14.5, - "eval_loss": 0.0015383724821731448, - "eval_runtime": 0.2378, - "eval_samples_per_second": 369.998, - "eval_steps_per_second": 46.25, - "step": 319 - }, - { - "epoch": 14.545454545454545, - "grad_norm": 0.023768380284309387, - "learning_rate": 1.7802631578947368e-05, - "loss": 0.0024, - "step": 320 - }, - { - "epoch": 14.545454545454545, - "eval_loss": 0.0015328243607655168, - "eval_runtime": 0.2636, - "eval_samples_per_second": 333.891, - "eval_steps_per_second": 41.736, - "step": 320 - }, - { - "epoch": 14.590909090909092, - "grad_norm": 0.023090893402695656, - "learning_rate": 1.7763157894736842e-05, - "loss": 0.0028, - "step": 321 - }, - { - "epoch": 14.590909090909092, - "eval_loss": 0.0015273126773536205, - "eval_runtime": 0.2297, - "eval_samples_per_second": 383.091, - "eval_steps_per_second": 47.886, - "step": 321 - }, - { - "epoch": 14.636363636363637, - "grad_norm": 0.021861301735043526, - "learning_rate": 1.7723684210526316e-05, - "loss": 0.0023, - "step": 322 - }, - { - "epoch": 14.636363636363637, - "eval_loss": 0.0015220079803839326, - "eval_runtime": 0.2395, - "eval_samples_per_second": 367.485, - "eval_steps_per_second": 45.936, - "step": 322 - }, - { - "epoch": 14.681818181818182, - "grad_norm": 0.02089674212038517, - "learning_rate": 1.7684210526315787e-05, - "loss": 0.0025, - "step": 323 - }, - { - "epoch": 14.681818181818182, - "eval_loss": 0.0015169020043686032, - "eval_runtime": 0.2277, - "eval_samples_per_second": 386.55, - "eval_steps_per_second": 48.319, - "step": 323 - }, - { - "epoch": 14.727272727272727, - "grad_norm": 0.026943515986204147, - "learning_rate": 1.7644736842105264e-05, - "loss": 0.0027, - "step": 324 - }, - { - "epoch": 14.727272727272727, - "eval_loss": 0.0015122044133022428, - "eval_runtime": 0.2504, - "eval_samples_per_second": 351.497, - "eval_steps_per_second": 43.937, - "step": 324 - }, - { - "epoch": 14.772727272727273, - "grad_norm": 0.021125871688127518, - "learning_rate": 1.760526315789474e-05, - "loss": 0.0024, - "step": 325 - }, - { - "epoch": 14.772727272727273, - "eval_loss": 0.0015074351103976369, - "eval_runtime": 0.2277, - "eval_samples_per_second": 386.421, - "eval_steps_per_second": 48.303, - "step": 325 - }, - { - "epoch": 14.818181818181818, - "grad_norm": 0.023058133199810982, - "learning_rate": 1.756578947368421e-05, - "loss": 0.0025, - "step": 326 - }, - { - "epoch": 14.818181818181818, - "eval_loss": 0.001502548111602664, - "eval_runtime": 0.2371, - "eval_samples_per_second": 371.118, - "eval_steps_per_second": 46.39, - "step": 326 - }, - { - "epoch": 14.863636363636363, - "grad_norm": 0.020260730758309364, - "learning_rate": 1.7526315789473687e-05, - "loss": 0.0023, - "step": 327 - }, - { - "epoch": 14.863636363636363, - "eval_loss": 0.0014978590188547969, - "eval_runtime": 0.231, - "eval_samples_per_second": 380.935, - "eval_steps_per_second": 47.617, - "step": 327 - }, - { - "epoch": 14.909090909090908, - "grad_norm": 0.021094167605042458, - "learning_rate": 1.7486842105263158e-05, - "loss": 0.0024, - "step": 328 - }, - { - "epoch": 14.909090909090908, - "eval_loss": 0.0014932234771549702, - "eval_runtime": 0.2309, - "eval_samples_per_second": 381.042, - "eval_steps_per_second": 47.63, - "step": 328 - }, - { - "epoch": 14.954545454545455, - "grad_norm": 0.023162171244621277, - "learning_rate": 1.7447368421052632e-05, - "loss": 0.0027, - "step": 329 - }, - { - "epoch": 14.954545454545455, - "eval_loss": 0.0014887653524056077, - "eval_runtime": 0.2298, - "eval_samples_per_second": 382.875, - "eval_steps_per_second": 47.859, - "step": 329 - }, - { - "epoch": 15.0, - "grad_norm": 0.021899493411183357, - "learning_rate": 1.7407894736842106e-05, - "loss": 0.0026, - "step": 330 - }, - { - "epoch": 15.0, - "eval_loss": 0.0014844763791188598, - "eval_runtime": 0.2287, - "eval_samples_per_second": 384.811, - "eval_steps_per_second": 48.101, - "step": 330 - }, - { - "epoch": 15.045454545454545, - "grad_norm": 0.02722894586622715, - "learning_rate": 1.736842105263158e-05, - "loss": 0.0029, - "step": 331 - }, - { - "epoch": 15.045454545454545, - "eval_loss": 0.001479836879298091, - "eval_runtime": 0.2296, - "eval_samples_per_second": 383.331, - "eval_steps_per_second": 47.916, - "step": 331 - }, - { - "epoch": 15.090909090909092, - "grad_norm": 0.0198600422590971, - "learning_rate": 1.732894736842105e-05, - "loss": 0.0023, - "step": 332 - }, - { - "epoch": 15.090909090909092, - "eval_loss": 0.001475546509027481, - "eval_runtime": 0.2293, - "eval_samples_per_second": 383.783, - "eval_steps_per_second": 47.973, - "step": 332 - }, - { - "epoch": 15.136363636363637, - "grad_norm": 0.018213720992207527, - "learning_rate": 1.728947368421053e-05, - "loss": 0.0021, - "step": 333 - }, - { - "epoch": 15.136363636363637, - "eval_loss": 0.0014714114367961884, - "eval_runtime": 0.2229, - "eval_samples_per_second": 394.882, - "eval_steps_per_second": 49.36, - "step": 333 - }, - { - "epoch": 15.181818181818182, - "grad_norm": 0.02195083722472191, - "learning_rate": 1.725e-05, - "loss": 0.0026, - "step": 334 - }, - { - "epoch": 15.181818181818182, - "eval_loss": 0.0014672887045890093, - "eval_runtime": 0.2307, - "eval_samples_per_second": 381.499, - "eval_steps_per_second": 47.687, - "step": 334 - }, - { - "epoch": 15.227272727272727, - "grad_norm": 0.020630402490496635, - "learning_rate": 1.7210526315789473e-05, - "loss": 0.0023, - "step": 335 - }, - { - "epoch": 15.227272727272727, - "eval_loss": 0.0014632240636274219, - "eval_runtime": 0.2345, - "eval_samples_per_second": 375.326, - "eval_steps_per_second": 46.916, - "step": 335 - }, - { - "epoch": 15.272727272727273, - "grad_norm": 0.01985459215939045, - "learning_rate": 1.7171052631578947e-05, - "loss": 0.0024, - "step": 336 - }, - { - "epoch": 15.272727272727273, - "eval_loss": 0.0014591444050893188, - "eval_runtime": 0.2344, - "eval_samples_per_second": 375.401, - "eval_steps_per_second": 46.925, - "step": 336 - }, - { - "epoch": 15.318181818181818, - "grad_norm": 0.02400742471218109, - "learning_rate": 1.713157894736842e-05, - "loss": 0.0024, - "step": 337 - }, - { - "epoch": 15.318181818181818, - "eval_loss": 0.001454763114452362, - "eval_runtime": 0.2401, - "eval_samples_per_second": 366.585, - "eval_steps_per_second": 45.823, - "step": 337 - }, - { - "epoch": 15.363636363636363, - "grad_norm": 0.02545950934290886, - "learning_rate": 1.7092105263157896e-05, - "loss": 0.0026, - "step": 338 - }, - { - "epoch": 15.363636363636363, - "eval_loss": 0.0014504102291539311, - "eval_runtime": 0.2315, - "eval_samples_per_second": 380.122, - "eval_steps_per_second": 47.515, - "step": 338 - }, - { - "epoch": 15.409090909090908, - "grad_norm": 0.02126440778374672, - "learning_rate": 1.705263157894737e-05, - "loss": 0.0024, - "step": 339 - }, - { - "epoch": 15.409090909090908, - "eval_loss": 0.0014461844693869352, - "eval_runtime": 0.2351, - "eval_samples_per_second": 374.294, - "eval_steps_per_second": 46.787, - "step": 339 - }, - { - "epoch": 15.454545454545455, - "grad_norm": 0.025197012349963188, - "learning_rate": 1.7013157894736844e-05, - "loss": 0.0025, - "step": 340 - }, - { - "epoch": 15.454545454545455, - "eval_loss": 0.0014418490463867784, - "eval_runtime": 0.2274, - "eval_samples_per_second": 387.064, - "eval_steps_per_second": 48.383, - "step": 340 - }, - { - "epoch": 15.5, - "grad_norm": 0.022640075534582138, - "learning_rate": 1.6973684210526315e-05, - "loss": 0.0024, - "step": 341 - }, - { - "epoch": 15.5, - "eval_loss": 0.0014375299215316772, - "eval_runtime": 0.2405, - "eval_samples_per_second": 365.83, - "eval_steps_per_second": 45.729, - "step": 341 - }, - { - "epoch": 15.545454545454545, - "grad_norm": 0.021050602197647095, - "learning_rate": 1.6934210526315792e-05, - "loss": 0.0024, - "step": 342 - }, - { - "epoch": 15.545454545454545, - "eval_loss": 0.0014335111482068896, - "eval_runtime": 0.226, - "eval_samples_per_second": 389.393, - "eval_steps_per_second": 48.674, - "step": 342 - }, - { - "epoch": 15.590909090909092, - "grad_norm": 0.0219247005879879, - "learning_rate": 1.6894736842105263e-05, - "loss": 0.0025, - "step": 343 - }, - { - "epoch": 15.590909090909092, - "eval_loss": 0.0014295299770310521, - "eval_runtime": 0.2342, - "eval_samples_per_second": 375.717, - "eval_steps_per_second": 46.965, - "step": 343 - }, - { - "epoch": 15.636363636363637, - "grad_norm": 0.020925231277942657, - "learning_rate": 1.6855263157894737e-05, - "loss": 0.0024, - "step": 344 - }, - { - "epoch": 15.636363636363637, - "eval_loss": 0.0014257035218179226, - "eval_runtime": 0.2368, - "eval_samples_per_second": 371.622, - "eval_steps_per_second": 46.453, - "step": 344 - }, - { - "epoch": 15.681818181818182, - "grad_norm": 0.019099295139312744, - "learning_rate": 1.681578947368421e-05, - "loss": 0.0023, - "step": 345 - }, - { - "epoch": 15.681818181818182, - "eval_loss": 0.0014218251453712583, - "eval_runtime": 0.2291, - "eval_samples_per_second": 384.074, - "eval_steps_per_second": 48.009, - "step": 345 - }, - { - "epoch": 15.727272727272727, - "grad_norm": 0.021133864298462868, - "learning_rate": 1.6776315789473686e-05, - "loss": 0.0023, - "step": 346 - }, - { - "epoch": 15.727272727272727, - "eval_loss": 0.0014178574783727527, - "eval_runtime": 0.2372, - "eval_samples_per_second": 370.96, - "eval_steps_per_second": 46.37, - "step": 346 - }, - { - "epoch": 15.772727272727273, - "grad_norm": 0.0220933947712183, - "learning_rate": 1.6736842105263156e-05, - "loss": 0.0024, - "step": 347 - }, - { - "epoch": 15.772727272727273, - "eval_loss": 0.0014137736288830638, - "eval_runtime": 0.2311, - "eval_samples_per_second": 380.859, - "eval_steps_per_second": 47.607, - "step": 347 - }, - { - "epoch": 15.818181818181818, - "grad_norm": 0.02274385653436184, - "learning_rate": 1.6697368421052634e-05, - "loss": 0.0023, - "step": 348 - }, - { - "epoch": 15.818181818181818, - "eval_loss": 0.0014094788348302245, - "eval_runtime": 0.2489, - "eval_samples_per_second": 353.537, - "eval_steps_per_second": 44.192, - "step": 348 - }, - { - "epoch": 15.863636363636363, - "grad_norm": 0.023772120475769043, - "learning_rate": 1.6657894736842105e-05, - "loss": 0.0025, - "step": 349 - }, - { - "epoch": 15.863636363636363, - "eval_loss": 0.0014053123304620385, - "eval_runtime": 0.2394, - "eval_samples_per_second": 367.516, - "eval_steps_per_second": 45.94, - "step": 349 - }, - { - "epoch": 15.909090909090908, - "grad_norm": 0.023701833561062813, - "learning_rate": 1.661842105263158e-05, - "loss": 0.0026, - "step": 350 - }, - { - "epoch": 15.909090909090908, - "eval_loss": 0.0014007468707859516, - "eval_runtime": 0.2428, - "eval_samples_per_second": 362.454, - "eval_steps_per_second": 45.307, - "step": 350 - }, - { - "epoch": 15.954545454545455, - "grad_norm": 0.020177854225039482, - "learning_rate": 1.6578947368421053e-05, - "loss": 0.0023, - "step": 351 - }, - { - "epoch": 15.954545454545455, - "eval_loss": 0.001396444975398481, - "eval_runtime": 0.2227, - "eval_samples_per_second": 395.086, - "eval_steps_per_second": 49.386, - "step": 351 - }, - { - "epoch": 16.0, - "grad_norm": 0.018302910029888153, - "learning_rate": 1.6539473684210527e-05, - "loss": 0.0022, - "step": 352 - }, - { - "epoch": 16.0, - "eval_loss": 0.0013921987265348434, - "eval_runtime": 0.2255, - "eval_samples_per_second": 390.23, - "eval_steps_per_second": 48.779, - "step": 352 - }, - { - "epoch": 16.045454545454547, - "grad_norm": 0.02006903663277626, - "learning_rate": 1.65e-05, - "loss": 0.0024, - "step": 353 - }, - { - "epoch": 16.045454545454547, - "eval_loss": 0.0013879131292924285, - "eval_runtime": 0.2332, - "eval_samples_per_second": 377.362, - "eval_steps_per_second": 47.17, - "step": 353 - }, - { - "epoch": 16.09090909090909, - "grad_norm": 0.02006879448890686, - "learning_rate": 1.6460526315789472e-05, - "loss": 0.0024, - "step": 354 - }, - { - "epoch": 16.09090909090909, - "eval_loss": 0.0013836818980053067, - "eval_runtime": 0.2294, - "eval_samples_per_second": 383.546, - "eval_steps_per_second": 47.943, - "step": 354 - }, - { - "epoch": 16.136363636363637, - "grad_norm": 0.01927405595779419, - "learning_rate": 1.642105263157895e-05, - "loss": 0.0021, - "step": 355 - }, - { - "epoch": 16.136363636363637, - "eval_loss": 0.001379486988298595, - "eval_runtime": 0.2304, - "eval_samples_per_second": 381.9, - "eval_steps_per_second": 47.738, - "step": 355 - }, - { - "epoch": 16.181818181818183, - "grad_norm": 0.019441615790128708, - "learning_rate": 1.638157894736842e-05, - "loss": 0.0024, - "step": 356 - }, - { - "epoch": 16.181818181818183, - "eval_loss": 0.0013752405066043139, - "eval_runtime": 0.2339, - "eval_samples_per_second": 376.279, - "eval_steps_per_second": 47.035, - "step": 356 - }, - { - "epoch": 16.227272727272727, - "grad_norm": 0.019047444686293602, - "learning_rate": 1.6342105263157894e-05, - "loss": 0.0022, - "step": 357 - }, - { - "epoch": 16.227272727272727, - "eval_loss": 0.0013710103230550885, - "eval_runtime": 0.2296, - "eval_samples_per_second": 383.255, - "eval_steps_per_second": 47.907, - "step": 357 - }, - { - "epoch": 16.272727272727273, - "grad_norm": 0.02004443109035492, - "learning_rate": 1.630263157894737e-05, - "loss": 0.002, - "step": 358 - }, - { - "epoch": 16.272727272727273, - "eval_loss": 0.0013666612794622779, - "eval_runtime": 0.2306, - "eval_samples_per_second": 381.651, - "eval_steps_per_second": 47.706, - "step": 358 - }, - { - "epoch": 16.318181818181817, - "grad_norm": 0.018162380903959274, - "learning_rate": 1.6263157894736843e-05, - "loss": 0.0022, - "step": 359 - }, - { - "epoch": 16.318181818181817, - "eval_loss": 0.0013625015271827579, - "eval_runtime": 0.2336, - "eval_samples_per_second": 376.757, - "eval_steps_per_second": 47.095, - "step": 359 - }, - { - "epoch": 16.363636363636363, - "grad_norm": 0.01866663061082363, - "learning_rate": 1.6223684210526314e-05, - "loss": 0.0023, - "step": 360 - }, - { - "epoch": 16.363636363636363, - "eval_loss": 0.001358471461571753, - "eval_runtime": 0.234, - "eval_samples_per_second": 376.031, - "eval_steps_per_second": 47.004, - "step": 360 - }, - { - "epoch": 16.40909090909091, - "grad_norm": 0.023692943155765533, - "learning_rate": 1.618421052631579e-05, - "loss": 0.0021, - "step": 361 - }, - { - "epoch": 16.40909090909091, - "eval_loss": 0.001354728126898408, - "eval_runtime": 0.236, - "eval_samples_per_second": 372.916, - "eval_steps_per_second": 46.614, - "step": 361 - }, - { - "epoch": 16.454545454545453, - "grad_norm": 0.021557440981268883, - "learning_rate": 1.6144736842105262e-05, - "loss": 0.0025, - "step": 362 - }, - { - "epoch": 16.454545454545453, - "eval_loss": 0.0013508024858310819, - "eval_runtime": 0.2359, - "eval_samples_per_second": 373.118, - "eval_steps_per_second": 46.64, - "step": 362 - }, - { - "epoch": 16.5, - "grad_norm": 0.02110958844423294, - "learning_rate": 1.6105263157894736e-05, - "loss": 0.0023, - "step": 363 - }, - { - "epoch": 16.5, - "eval_loss": 0.0013467645039781928, - "eval_runtime": 0.2299, - "eval_samples_per_second": 382.703, - "eval_steps_per_second": 47.838, - "step": 363 - }, - { - "epoch": 16.545454545454547, - "grad_norm": 0.019328676164150238, - "learning_rate": 1.6065789473684214e-05, - "loss": 0.0024, - "step": 364 - }, - { - "epoch": 16.545454545454547, - "eval_loss": 0.0013428251259028912, - "eval_runtime": 0.2289, - "eval_samples_per_second": 384.389, - "eval_steps_per_second": 48.049, - "step": 364 - }, - { - "epoch": 16.59090909090909, - "grad_norm": 0.022835319861769676, - "learning_rate": 1.6026315789473684e-05, - "loss": 0.0023, - "step": 365 - }, - { - "epoch": 16.59090909090909, - "eval_loss": 0.0013391603715717793, - "eval_runtime": 0.2311, - "eval_samples_per_second": 380.86, - "eval_steps_per_second": 47.607, - "step": 365 - }, - { - "epoch": 16.636363636363637, - "grad_norm": 0.01819239743053913, - "learning_rate": 1.598684210526316e-05, - "loss": 0.0022, - "step": 366 - }, - { - "epoch": 16.636363636363637, - "eval_loss": 0.0013354604598134756, - "eval_runtime": 0.2268, - "eval_samples_per_second": 388.088, - "eval_steps_per_second": 48.511, - "step": 366 - }, - { - "epoch": 16.681818181818183, - "grad_norm": 0.019428908824920654, - "learning_rate": 1.5947368421052633e-05, - "loss": 0.0021, - "step": 367 - }, - { - "epoch": 16.681818181818183, - "eval_loss": 0.001331814331933856, - "eval_runtime": 0.2357, - "eval_samples_per_second": 373.331, - "eval_steps_per_second": 46.666, - "step": 367 - }, - { - "epoch": 16.727272727272727, - "grad_norm": 0.018047934398055077, - "learning_rate": 1.5907894736842107e-05, - "loss": 0.0022, - "step": 368 - }, - { - "epoch": 16.727272727272727, - "eval_loss": 0.0013281968422234058, - "eval_runtime": 0.2353, - "eval_samples_per_second": 374.058, - "eval_steps_per_second": 46.757, - "step": 368 - }, - { - "epoch": 16.772727272727273, - "grad_norm": 0.022303372621536255, - "learning_rate": 1.5868421052631578e-05, - "loss": 0.0022, - "step": 369 - }, - { - "epoch": 16.772727272727273, - "eval_loss": 0.0013246661983430386, - "eval_runtime": 0.2364, - "eval_samples_per_second": 372.233, - "eval_steps_per_second": 46.529, - "step": 369 - }, - { - "epoch": 16.818181818181817, - "grad_norm": 0.017466159537434578, - "learning_rate": 1.5828947368421055e-05, - "loss": 0.0021, - "step": 370 - }, - { - "epoch": 16.818181818181817, - "eval_loss": 0.001321229967288673, - "eval_runtime": 0.2328, - "eval_samples_per_second": 378.075, - "eval_steps_per_second": 47.259, - "step": 370 - }, - { - "epoch": 16.863636363636363, - "grad_norm": 0.018749618902802467, - "learning_rate": 1.5789473684210526e-05, - "loss": 0.0021, - "step": 371 - }, - { - "epoch": 16.863636363636363, - "eval_loss": 0.0013179152738302946, - "eval_runtime": 0.2372, - "eval_samples_per_second": 371.017, - "eval_steps_per_second": 46.377, - "step": 371 - }, - { - "epoch": 16.90909090909091, - "grad_norm": 0.01943541131913662, - "learning_rate": 1.575e-05, - "loss": 0.0021, - "step": 372 - }, - { - "epoch": 16.90909090909091, - "eval_loss": 0.0013147753197699785, - "eval_runtime": 0.2325, - "eval_samples_per_second": 378.47, - "eval_steps_per_second": 47.309, - "step": 372 - }, - { - "epoch": 16.954545454545453, - "grad_norm": 0.018470529466867447, - "learning_rate": 1.5710526315789474e-05, - "loss": 0.0021, - "step": 373 - }, - { - "epoch": 16.954545454545453, - "eval_loss": 0.0013116110349074006, - "eval_runtime": 0.2449, - "eval_samples_per_second": 359.303, - "eval_steps_per_second": 44.913, - "step": 373 - }, - { - "epoch": 17.0, - "grad_norm": 0.02088373526930809, - "learning_rate": 1.5671052631578948e-05, - "loss": 0.0022, - "step": 374 - }, - { - "epoch": 17.0, - "eval_loss": 0.0013083978556096554, - "eval_runtime": 0.2373, - "eval_samples_per_second": 370.786, - "eval_steps_per_second": 46.348, - "step": 374 - }, - { - "epoch": 17.045454545454547, - "grad_norm": 0.02049199491739273, - "learning_rate": 1.563157894736842e-05, - "loss": 0.0021, - "step": 375 - }, - { - "epoch": 17.045454545454547, - "eval_loss": 0.0013052173890173435, - "eval_runtime": 0.2375, - "eval_samples_per_second": 370.52, - "eval_steps_per_second": 46.315, - "step": 375 - }, - { - "epoch": 17.09090909090909, - "grad_norm": 0.022884204983711243, - "learning_rate": 1.5592105263157897e-05, - "loss": 0.0023, - "step": 376 - }, - { - "epoch": 17.09090909090909, - "eval_loss": 0.0013022100320085883, - "eval_runtime": 0.2451, - "eval_samples_per_second": 359.032, - "eval_steps_per_second": 44.879, - "step": 376 - }, - { - "epoch": 17.136363636363637, - "grad_norm": 0.018668444827198982, - "learning_rate": 1.5552631578947367e-05, - "loss": 0.002, - "step": 377 - }, - { - "epoch": 17.136363636363637, - "eval_loss": 0.0012990765972062945, - "eval_runtime": 0.2377, - "eval_samples_per_second": 370.243, - "eval_steps_per_second": 46.28, - "step": 377 - }, - { - "epoch": 17.181818181818183, - "grad_norm": 0.018272867426276207, - "learning_rate": 1.551315789473684e-05, - "loss": 0.002, - "step": 378 - }, - { - "epoch": 17.181818181818183, - "eval_loss": 0.0012959121959283948, - "eval_runtime": 0.2445, - "eval_samples_per_second": 359.966, - "eval_steps_per_second": 44.996, - "step": 378 - }, - { - "epoch": 17.227272727272727, - "grad_norm": 0.018142884597182274, - "learning_rate": 1.547368421052632e-05, - "loss": 0.0023, - "step": 379 - }, - { - "epoch": 17.227272727272727, - "eval_loss": 0.0012926937779411674, - "eval_runtime": 0.2463, - "eval_samples_per_second": 357.295, - "eval_steps_per_second": 44.662, - "step": 379 - }, - { - "epoch": 17.272727272727273, - "grad_norm": 0.019035378471016884, - "learning_rate": 1.543421052631579e-05, - "loss": 0.002, - "step": 380 - }, - { - "epoch": 17.272727272727273, - "eval_loss": 0.0012895982945337892, - "eval_runtime": 0.2335, - "eval_samples_per_second": 376.923, - "eval_steps_per_second": 47.115, - "step": 380 - }, - { - "epoch": 17.318181818181817, - "grad_norm": 0.02087828330695629, - "learning_rate": 1.5394736842105264e-05, - "loss": 0.0023, - "step": 381 - }, - { - "epoch": 17.318181818181817, - "eval_loss": 0.0012864163145422935, - "eval_runtime": 0.2398, - "eval_samples_per_second": 367.034, - "eval_steps_per_second": 45.879, - "step": 381 - }, - { - "epoch": 17.363636363636363, - "grad_norm": 0.019186902791261673, - "learning_rate": 1.5355263157894738e-05, - "loss": 0.0021, - "step": 382 - }, - { - "epoch": 17.363636363636363, - "eval_loss": 0.001283234334550798, - "eval_runtime": 0.2265, - "eval_samples_per_second": 388.504, - "eval_steps_per_second": 48.563, - "step": 382 - }, - { - "epoch": 17.40909090909091, - "grad_norm": 0.01789664290845394, - "learning_rate": 1.5315789473684212e-05, - "loss": 0.002, - "step": 383 - }, - { - "epoch": 17.40909090909091, - "eval_loss": 0.0012801456032320857, - "eval_runtime": 0.229, - "eval_samples_per_second": 384.262, - "eval_steps_per_second": 48.033, - "step": 383 - }, - { - "epoch": 17.454545454545453, - "grad_norm": 0.017828669399023056, - "learning_rate": 1.5276315789473683e-05, - "loss": 0.0021, - "step": 384 - }, - { - "epoch": 17.454545454545453, - "eval_loss": 0.0012770771281793714, - "eval_runtime": 0.2259, - "eval_samples_per_second": 389.598, - "eval_steps_per_second": 48.7, - "step": 384 - }, - { - "epoch": 17.5, - "grad_norm": 0.0225471593439579, - "learning_rate": 1.5236842105263159e-05, - "loss": 0.0022, - "step": 385 - }, - { - "epoch": 17.5, - "eval_loss": 0.0012742335675284266, - "eval_runtime": 0.2398, - "eval_samples_per_second": 366.97, - "eval_steps_per_second": 45.871, - "step": 385 - }, - { - "epoch": 17.545454545454547, - "grad_norm": 0.02024303376674652, - "learning_rate": 1.5197368421052631e-05, - "loss": 0.0021, - "step": 386 - }, - { - "epoch": 17.545454545454547, - "eval_loss": 0.0012715155025944114, - "eval_runtime": 0.2322, - "eval_samples_per_second": 378.914, - "eval_steps_per_second": 47.364, - "step": 386 - }, - { - "epoch": 17.59090909090909, - "grad_norm": 0.021520059555768967, - "learning_rate": 1.5157894736842105e-05, - "loss": 0.0021, - "step": 387 - }, - { - "epoch": 17.59090909090909, - "eval_loss": 0.0012686135014519095, - "eval_runtime": 0.2273, - "eval_samples_per_second": 387.222, - "eval_steps_per_second": 48.403, - "step": 387 - }, - { - "epoch": 17.636363636363637, - "grad_norm": 0.02026878483593464, - "learning_rate": 1.5118421052631578e-05, - "loss": 0.0024, - "step": 388 - }, - { - "epoch": 17.636363636363637, - "eval_loss": 0.0012655220925807953, - "eval_runtime": 0.2345, - "eval_samples_per_second": 375.273, - "eval_steps_per_second": 46.909, - "step": 388 - }, - { - "epoch": 17.681818181818183, - "grad_norm": 0.017312707379460335, - "learning_rate": 1.5078947368421054e-05, - "loss": 0.0019, - "step": 389 - }, - { - "epoch": 17.681818181818183, - "eval_loss": 0.0012624793453142047, - "eval_runtime": 0.2341, - "eval_samples_per_second": 375.953, - "eval_steps_per_second": 46.994, - "step": 389 - }, - { - "epoch": 17.727272727272727, - "grad_norm": 0.014796672388911247, - "learning_rate": 1.5039473684210525e-05, - "loss": 0.0018, - "step": 390 - }, - { - "epoch": 17.727272727272727, - "eval_loss": 0.0012595909647643566, - "eval_runtime": 0.2412, - "eval_samples_per_second": 364.883, - "eval_steps_per_second": 45.61, - "step": 390 - }, - { - "epoch": 17.772727272727273, - "grad_norm": 0.024672966450452805, - "learning_rate": 1.5e-05, - "loss": 0.0024, - "step": 391 - }, - { - "epoch": 17.772727272727273, - "eval_loss": 0.001256533432751894, - "eval_runtime": 0.2394, - "eval_samples_per_second": 367.656, - "eval_steps_per_second": 45.957, - "step": 391 - }, - { - "epoch": 17.818181818181817, - "grad_norm": 0.01785973645746708, - "learning_rate": 1.4960526315789475e-05, - "loss": 0.0021, - "step": 392 - }, - { - "epoch": 17.818181818181817, - "eval_loss": 0.001253555528819561, - "eval_runtime": 0.2448, - "eval_samples_per_second": 359.499, - "eval_steps_per_second": 44.937, - "step": 392 - }, - { - "epoch": 17.863636363636363, - "grad_norm": 0.018725674599409103, - "learning_rate": 1.4921052631578947e-05, - "loss": 0.0022, - "step": 393 - }, - { - "epoch": 17.863636363636363, - "eval_loss": 0.001250546658411622, - "eval_runtime": 0.2295, - "eval_samples_per_second": 383.446, - "eval_steps_per_second": 47.931, - "step": 393 - }, - { - "epoch": 17.90909090909091, - "grad_norm": 0.01906488463282585, - "learning_rate": 1.4881578947368421e-05, - "loss": 0.0019, - "step": 394 - }, - { - "epoch": 17.90909090909091, - "eval_loss": 0.0012476051924750209, - "eval_runtime": 0.2392, - "eval_samples_per_second": 367.955, - "eval_steps_per_second": 45.994, - "step": 394 - }, - { - "epoch": 17.954545454545453, - "grad_norm": 0.01702312007546425, - "learning_rate": 1.4842105263157895e-05, - "loss": 0.0021, - "step": 395 - }, - { - "epoch": 17.954545454545453, - "eval_loss": 0.0012446870096027851, - "eval_runtime": 0.2408, - "eval_samples_per_second": 365.513, - "eval_steps_per_second": 45.689, - "step": 395 - }, - { - "epoch": 18.0, - "grad_norm": 0.018446706235408783, - "learning_rate": 1.4802631578947368e-05, - "loss": 0.0021, - "step": 396 - }, - { - "epoch": 18.0, - "eval_loss": 0.0012417498510330915, - "eval_runtime": 0.2401, - "eval_samples_per_second": 366.532, - "eval_steps_per_second": 45.816, - "step": 396 - }, - { - "epoch": 18.045454545454547, - "grad_norm": 0.017580052837729454, - "learning_rate": 1.4763157894736842e-05, - "loss": 0.002, - "step": 397 - }, - { - "epoch": 18.045454545454547, - "eval_loss": 0.0012387962779030204, - "eval_runtime": 0.2359, - "eval_samples_per_second": 373.019, - "eval_steps_per_second": 46.627, - "step": 397 - }, - { - "epoch": 18.09090909090909, - "grad_norm": 0.018549149855971336, - "learning_rate": 1.4723684210526318e-05, - "loss": 0.002, - "step": 398 - }, - { - "epoch": 18.09090909090909, - "eval_loss": 0.0012358062667772174, - "eval_runtime": 0.2409, - "eval_samples_per_second": 365.331, - "eval_steps_per_second": 45.666, - "step": 398 - }, - { - "epoch": 18.136363636363637, - "grad_norm": 0.021288642659783363, - "learning_rate": 1.468421052631579e-05, - "loss": 0.0021, - "step": 399 - }, - { - "epoch": 18.136363636363637, - "eval_loss": 0.00123285548761487, - "eval_runtime": 0.239, - "eval_samples_per_second": 368.2, - "eval_steps_per_second": 46.025, - "step": 399 - }, - { - "epoch": 18.181818181818183, - "grad_norm": 0.018042676150798798, - "learning_rate": 1.4644736842105264e-05, - "loss": 0.0021, - "step": 400 - }, - { - "epoch": 18.181818181818183, - "eval_loss": 0.0012299600057303905, - "eval_runtime": 0.2368, - "eval_samples_per_second": 371.628, - "eval_steps_per_second": 46.454, - "step": 400 - }, - { - "epoch": 18.227272727272727, - "grad_norm": 0.017950624227523804, - "learning_rate": 1.4605263157894737e-05, - "loss": 0.002, - "step": 401 - }, - { - "epoch": 18.227272727272727, - "eval_loss": 0.0012270959559828043, - "eval_runtime": 0.2217, - "eval_samples_per_second": 396.934, - "eval_steps_per_second": 49.617, - "step": 401 - }, - { - "epoch": 18.272727272727273, - "grad_norm": 0.016649143770337105, - "learning_rate": 1.4565789473684211e-05, - "loss": 0.002, - "step": 402 - }, - { - "epoch": 18.272727272727273, - "eval_loss": 0.0012242384254932404, - "eval_runtime": 0.2287, - "eval_samples_per_second": 384.84, - "eval_steps_per_second": 48.105, - "step": 402 - }, - { - "epoch": 18.318181818181817, - "grad_norm": 0.016468649730086327, - "learning_rate": 1.4526315789473685e-05, - "loss": 0.0018, - "step": 403 - }, - { - "epoch": 18.318181818181817, - "eval_loss": 0.001221520360559225, - "eval_runtime": 0.2271, - "eval_samples_per_second": 387.51, - "eval_steps_per_second": 48.439, - "step": 403 - }, - { - "epoch": 18.363636363636363, - "grad_norm": 0.01778615266084671, - "learning_rate": 1.4486842105263158e-05, - "loss": 0.002, - "step": 404 - }, - { - "epoch": 18.363636363636363, - "eval_loss": 0.0012188454857096076, - "eval_runtime": 0.2323, - "eval_samples_per_second": 378.869, - "eval_steps_per_second": 47.359, - "step": 404 - }, - { - "epoch": 18.40909090909091, - "grad_norm": 0.019096923992037773, - "learning_rate": 1.4447368421052632e-05, - "loss": 0.0021, - "step": 405 - }, - { - "epoch": 18.40909090909091, - "eval_loss": 0.0012163707287982106, - "eval_runtime": 0.2287, - "eval_samples_per_second": 384.807, - "eval_steps_per_second": 48.101, - "step": 405 - }, - { - "epoch": 18.454545454545453, - "grad_norm": 0.020378055050969124, - "learning_rate": 1.4407894736842106e-05, - "loss": 0.0019, - "step": 406 - }, - { - "epoch": 18.454545454545453, - "eval_loss": 0.0012139691971242428, - "eval_runtime": 0.2285, - "eval_samples_per_second": 385.172, - "eval_steps_per_second": 48.146, - "step": 406 - }, - { - "epoch": 18.5, - "grad_norm": 0.01801607571542263, - "learning_rate": 1.4368421052631578e-05, - "loss": 0.0019, - "step": 407 - }, - { - "epoch": 18.5, - "eval_loss": 0.0012113729026168585, - "eval_runtime": 0.2323, - "eval_samples_per_second": 378.867, - "eval_steps_per_second": 47.358, - "step": 407 - }, - { - "epoch": 18.545454545454547, - "grad_norm": 0.016806334257125854, - "learning_rate": 1.4328947368421052e-05, - "loss": 0.0019, - "step": 408 - }, - { - "epoch": 18.545454545454547, - "eval_loss": 0.0012086898786947131, - "eval_runtime": 0.2266, - "eval_samples_per_second": 388.422, - "eval_steps_per_second": 48.553, - "step": 408 - }, - { - "epoch": 18.59090909090909, - "grad_norm": 0.01768423058092594, - "learning_rate": 1.4289473684210527e-05, - "loss": 0.0019, - "step": 409 - }, - { - "epoch": 18.59090909090909, - "eval_loss": 0.001205993234179914, - "eval_runtime": 0.233, - "eval_samples_per_second": 377.712, - "eval_steps_per_second": 47.214, - "step": 409 - }, - { - "epoch": 18.636363636363637, - "grad_norm": 0.016840273514389992, - "learning_rate": 1.4249999999999999e-05, - "loss": 0.0019, - "step": 410 - }, - { - "epoch": 18.636363636363637, - "eval_loss": 0.00120334152597934, - "eval_runtime": 0.2278, - "eval_samples_per_second": 386.255, - "eval_steps_per_second": 48.282, - "step": 410 - }, - { - "epoch": 18.681818181818183, - "grad_norm": 0.019254090264439583, - "learning_rate": 1.4210526315789473e-05, - "loss": 0.0021, - "step": 411 - }, - { - "epoch": 18.681818181818183, - "eval_loss": 0.001200651633553207, - "eval_runtime": 0.2414, - "eval_samples_per_second": 364.529, - "eval_steps_per_second": 45.566, - "step": 411 - }, - { - "epoch": 18.727272727272727, - "grad_norm": 0.018222426995635033, - "learning_rate": 1.4171052631578949e-05, - "loss": 0.0021, - "step": 412 - }, - { - "epoch": 18.727272727272727, - "eval_loss": 0.0011977426474913955, - "eval_runtime": 0.2297, - "eval_samples_per_second": 383.168, - "eval_steps_per_second": 47.896, - "step": 412 - }, - { - "epoch": 18.772727272727273, - "grad_norm": 0.017460381612181664, - "learning_rate": 1.4131578947368422e-05, - "loss": 0.0019, - "step": 413 - }, - { - "epoch": 18.772727272727273, - "eval_loss": 0.0011948675382882357, - "eval_runtime": 0.2295, - "eval_samples_per_second": 383.384, - "eval_steps_per_second": 47.923, - "step": 413 - }, - { - "epoch": 18.818181818181817, - "grad_norm": 0.014636803418397903, - "learning_rate": 1.4092105263157896e-05, - "loss": 0.0018, - "step": 414 - }, - { - "epoch": 18.818181818181817, - "eval_loss": 0.0011919679818674922, - "eval_runtime": 0.2375, - "eval_samples_per_second": 370.502, - "eval_steps_per_second": 46.313, - "step": 414 - }, - { - "epoch": 18.863636363636363, - "grad_norm": 0.01725298911333084, - "learning_rate": 1.405263157894737e-05, - "loss": 0.0019, - "step": 415 - }, - { - "epoch": 18.863636363636363, - "eval_loss": 0.0011888709850609303, - "eval_runtime": 0.2319, - "eval_samples_per_second": 379.492, - "eval_steps_per_second": 47.437, - "step": 415 - }, - { - "epoch": 18.90909090909091, - "grad_norm": 0.017635343596339226, - "learning_rate": 1.4013157894736842e-05, - "loss": 0.0019, - "step": 416 - }, - { - "epoch": 18.90909090909091, - "eval_loss": 0.0011859294027090073, - "eval_runtime": 0.232, - "eval_samples_per_second": 379.329, - "eval_steps_per_second": 47.416, - "step": 416 - }, - { - "epoch": 18.954545454545453, - "grad_norm": 0.017270755022764206, - "learning_rate": 1.3973684210526316e-05, - "loss": 0.002, - "step": 417 - }, - { - "epoch": 18.954545454545453, - "eval_loss": 0.0011831001611426473, - "eval_runtime": 0.2293, - "eval_samples_per_second": 383.786, - "eval_steps_per_second": 47.973, - "step": 417 - }, - { - "epoch": 19.0, - "grad_norm": 0.017159774899482727, - "learning_rate": 1.393421052631579e-05, - "loss": 0.0018, - "step": 418 - }, - { - "epoch": 19.0, - "eval_loss": 0.001180406310595572, - "eval_runtime": 0.2475, - "eval_samples_per_second": 355.577, - "eval_steps_per_second": 44.447, - "step": 418 - }, - { - "epoch": 19.045454545454547, - "grad_norm": 0.015916157513856888, - "learning_rate": 1.3894736842105263e-05, - "loss": 0.0018, - "step": 419 - }, - { - "epoch": 19.045454545454547, - "eval_loss": 0.0011776703177019954, - "eval_runtime": 0.2406, - "eval_samples_per_second": 365.71, - "eval_steps_per_second": 45.714, - "step": 419 - }, - { - "epoch": 19.09090909090909, - "grad_norm": 0.016425369307398796, - "learning_rate": 1.3855263157894737e-05, - "loss": 0.002, - "step": 420 - }, - { - "epoch": 19.09090909090909, - "eval_loss": 0.0011750170961022377, - "eval_runtime": 0.2379, - "eval_samples_per_second": 369.975, - "eval_steps_per_second": 46.247, - "step": 420 - }, - { - "epoch": 19.136363636363637, - "grad_norm": 0.017857089638710022, - "learning_rate": 1.3815789473684211e-05, - "loss": 0.0019, - "step": 421 - }, - { - "epoch": 19.136363636363637, - "eval_loss": 0.0011724097421392798, - "eval_runtime": 0.2504, - "eval_samples_per_second": 351.397, - "eval_steps_per_second": 43.925, - "step": 421 - }, - { - "epoch": 19.181818181818183, - "grad_norm": 0.01837003231048584, - "learning_rate": 1.3776315789473684e-05, - "loss": 0.0022, - "step": 422 - }, - { - "epoch": 19.181818181818183, - "eval_loss": 0.0011697578011080623, - "eval_runtime": 0.2585, - "eval_samples_per_second": 340.422, - "eval_steps_per_second": 42.553, - "step": 422 - }, - { - "epoch": 19.227272727272727, - "grad_norm": 0.019487086683511734, - "learning_rate": 1.3736842105263158e-05, - "loss": 0.0021, - "step": 423 - }, - { - "epoch": 19.227272727272727, - "eval_loss": 0.0011671868851408362, - "eval_runtime": 0.2398, - "eval_samples_per_second": 366.896, - "eval_steps_per_second": 45.862, - "step": 423 - }, - { - "epoch": 19.272727272727273, - "grad_norm": 0.016021518036723137, - "learning_rate": 1.369736842105263e-05, - "loss": 0.0019, - "step": 424 - }, - { - "epoch": 19.272727272727273, - "eval_loss": 0.001164758112281561, - "eval_runtime": 0.2642, - "eval_samples_per_second": 333.083, - "eval_steps_per_second": 41.635, - "step": 424 - }, - { - "epoch": 19.318181818181817, - "grad_norm": 0.018122289329767227, - "learning_rate": 1.3657894736842106e-05, - "loss": 0.0019, - "step": 425 - }, - { - "epoch": 19.318181818181817, - "eval_loss": 0.001162288710474968, - "eval_runtime": 0.2578, - "eval_samples_per_second": 341.316, - "eval_steps_per_second": 42.665, - "step": 425 - }, - { - "epoch": 19.363636363636363, - "grad_norm": 0.015892351046204567, - "learning_rate": 1.361842105263158e-05, - "loss": 0.0018, - "step": 426 - }, - { - "epoch": 19.363636363636363, - "eval_loss": 0.001159931649453938, - "eval_runtime": 0.2409, - "eval_samples_per_second": 365.291, - "eval_steps_per_second": 45.661, - "step": 426 - }, - { - "epoch": 19.40909090909091, - "grad_norm": 0.015699921175837517, - "learning_rate": 1.3578947368421053e-05, - "loss": 0.0019, - "step": 427 - }, - { - "epoch": 19.40909090909091, - "eval_loss": 0.0011575055541470647, - "eval_runtime": 0.2388, - "eval_samples_per_second": 368.523, - "eval_steps_per_second": 46.065, - "step": 427 - }, - { - "epoch": 19.454545454545453, - "grad_norm": 0.01474451832473278, - "learning_rate": 1.3539473684210527e-05, - "loss": 0.0017, - "step": 428 - }, - { - "epoch": 19.454545454545453, - "eval_loss": 0.001155222998932004, - "eval_runtime": 0.2408, - "eval_samples_per_second": 365.449, - "eval_steps_per_second": 45.681, - "step": 428 - }, - { - "epoch": 19.5, - "grad_norm": 0.016437875106930733, - "learning_rate": 1.3500000000000001e-05, - "loss": 0.0018, - "step": 429 - }, - { - "epoch": 19.5, - "eval_loss": 0.0011530268238857388, - "eval_runtime": 0.2325, - "eval_samples_per_second": 378.535, - "eval_steps_per_second": 47.317, - "step": 429 - }, - { - "epoch": 19.545454545454547, - "grad_norm": 0.01538484264165163, - "learning_rate": 1.3460526315789474e-05, - "loss": 0.0018, - "step": 430 - }, - { - "epoch": 19.545454545454547, - "eval_loss": 0.0011508835013955832, - "eval_runtime": 0.2309, - "eval_samples_per_second": 381.166, - "eval_steps_per_second": 47.646, - "step": 430 - }, - { - "epoch": 19.59090909090909, - "grad_norm": 0.017129214480519295, - "learning_rate": 1.3421052631578948e-05, - "loss": 0.0019, - "step": 431 - }, - { - "epoch": 19.59090909090909, - "eval_loss": 0.0011487645097076893, - "eval_runtime": 0.2362, - "eval_samples_per_second": 372.58, - "eval_steps_per_second": 46.573, - "step": 431 - }, - { - "epoch": 19.636363636363637, - "grad_norm": 0.016592320054769516, - "learning_rate": 1.3381578947368422e-05, - "loss": 0.0019, - "step": 432 - }, - { - "epoch": 19.636363636363637, - "eval_loss": 0.0011467835865914822, - "eval_runtime": 0.2418, - "eval_samples_per_second": 364.003, - "eval_steps_per_second": 45.5, - "step": 432 - }, - { - "epoch": 19.681818181818183, - "grad_norm": 0.018111824989318848, - "learning_rate": 1.3342105263157894e-05, - "loss": 0.0019, - "step": 433 - }, - { - "epoch": 19.681818181818183, - "eval_loss": 0.0011448581935837865, - "eval_runtime": 0.2437, - "eval_samples_per_second": 361.142, - "eval_steps_per_second": 45.143, - "step": 433 - }, - { - "epoch": 19.727272727272727, - "grad_norm": 0.01678645797073841, - "learning_rate": 1.3302631578947369e-05, - "loss": 0.0018, - "step": 434 - }, - { - "epoch": 19.727272727272727, - "eval_loss": 0.0011427812278270721, - "eval_runtime": 0.229, - "eval_samples_per_second": 384.254, - "eval_steps_per_second": 48.032, - "step": 434 - }, - { - "epoch": 19.772727272727273, - "grad_norm": 0.01921844109892845, - "learning_rate": 1.3263157894736843e-05, - "loss": 0.0021, - "step": 435 - }, - { - "epoch": 19.772727272727273, - "eval_loss": 0.0011407433776184916, - "eval_runtime": 0.24, - "eval_samples_per_second": 366.62, - "eval_steps_per_second": 45.828, - "step": 435 - }, - { - "epoch": 19.818181818181817, - "grad_norm": 0.01700635813176632, - "learning_rate": 1.3223684210526315e-05, - "loss": 0.0019, - "step": 436 - }, - { - "epoch": 19.818181818181817, - "eval_loss": 0.0011388043640181422, - "eval_runtime": 0.241, - "eval_samples_per_second": 365.22, - "eval_steps_per_second": 45.652, - "step": 436 - }, - { - "epoch": 19.863636363636363, - "grad_norm": 0.02139265649020672, - "learning_rate": 1.318421052631579e-05, - "loss": 0.0021, - "step": 437 - }, - { - "epoch": 19.863636363636363, - "eval_loss": 0.0011367396218702197, - "eval_runtime": 0.2327, - "eval_samples_per_second": 378.128, - "eval_steps_per_second": 47.266, - "step": 437 - }, - { - "epoch": 19.90909090909091, - "grad_norm": 0.016315054148435593, - "learning_rate": 1.3144736842105263e-05, - "loss": 0.0018, - "step": 438 - }, - { - "epoch": 19.90909090909091, - "eval_loss": 0.001134704565629363, - "eval_runtime": 0.243, - "eval_samples_per_second": 362.095, - "eval_steps_per_second": 45.262, - "step": 438 - }, - { - "epoch": 19.954545454545453, - "grad_norm": 0.015357021242380142, - "learning_rate": 1.3105263157894738e-05, - "loss": 0.0019, - "step": 439 - }, - { - "epoch": 19.954545454545453, - "eval_loss": 0.0011326519306749105, - "eval_runtime": 0.238, - "eval_samples_per_second": 369.798, - "eval_steps_per_second": 46.225, - "step": 439 - }, - { - "epoch": 20.0, - "grad_norm": 0.01644103042781353, - "learning_rate": 1.3065789473684212e-05, - "loss": 0.0019, - "step": 440 - }, - { - "epoch": 20.0, - "eval_loss": 0.0011306345695629716, - "eval_runtime": 0.2373, - "eval_samples_per_second": 370.835, - "eval_steps_per_second": 46.354, - "step": 440 - }, - { - "epoch": 20.045454545454547, - "grad_norm": 0.0168069489300251, - "learning_rate": 1.3026315789473684e-05, - "loss": 0.002, - "step": 441 - }, - { - "epoch": 20.045454545454547, - "eval_loss": 0.0011284599313512444, - "eval_runtime": 0.2305, - "eval_samples_per_second": 381.741, - "eval_steps_per_second": 47.718, - "step": 441 - }, - { - "epoch": 20.09090909090909, - "grad_norm": 0.015401924960315228, - "learning_rate": 1.2986842105263158e-05, - "loss": 0.0019, - "step": 442 - }, - { - "epoch": 20.09090909090909, - "eval_loss": 0.0011262963525950909, - "eval_runtime": 0.2364, - "eval_samples_per_second": 372.316, - "eval_steps_per_second": 46.54, - "step": 442 - }, - { - "epoch": 20.136363636363637, - "grad_norm": 0.019058704376220703, - "learning_rate": 1.2947368421052633e-05, - "loss": 0.0019, - "step": 443 - }, - { - "epoch": 20.136363636363637, - "eval_loss": 0.0011239717714488506, - "eval_runtime": 0.2383, - "eval_samples_per_second": 369.214, - "eval_steps_per_second": 46.152, - "step": 443 - }, - { - "epoch": 20.181818181818183, - "grad_norm": 0.018643731251358986, - "learning_rate": 1.2907894736842105e-05, - "loss": 0.0019, - "step": 444 - }, - { - "epoch": 20.181818181818183, - "eval_loss": 0.0011216469574719667, - "eval_runtime": 0.2413, - "eval_samples_per_second": 364.671, - "eval_steps_per_second": 45.584, - "step": 444 - }, - { - "epoch": 20.227272727272727, - "grad_norm": 0.018360739573836327, - "learning_rate": 1.2868421052631579e-05, - "loss": 0.002, - "step": 445 - }, - { - "epoch": 20.227272727272727, - "eval_loss": 0.0011192425154149532, - "eval_runtime": 0.2331, - "eval_samples_per_second": 377.473, - "eval_steps_per_second": 47.184, - "step": 445 - }, - { - "epoch": 20.272727272727273, - "grad_norm": 0.016574162989854813, - "learning_rate": 1.2828947368421053e-05, - "loss": 0.0019, - "step": 446 - }, - { - "epoch": 20.272727272727273, - "eval_loss": 0.001116919214837253, - "eval_runtime": 0.2433, - "eval_samples_per_second": 361.621, - "eval_steps_per_second": 45.203, - "step": 446 - }, - { - "epoch": 20.318181818181817, - "grad_norm": 0.01646783947944641, - "learning_rate": 1.2789473684210526e-05, - "loss": 0.0019, - "step": 447 - }, - { - "epoch": 20.318181818181817, - "eval_loss": 0.0011146310716867447, - "eval_runtime": 0.2514, - "eval_samples_per_second": 349.985, - "eval_steps_per_second": 43.748, - "step": 447 - }, - { - "epoch": 20.363636363636363, - "grad_norm": 0.017044425010681152, - "learning_rate": 1.275e-05, - "loss": 0.0018, - "step": 448 - }, - { - "epoch": 20.363636363636363, - "eval_loss": 0.0011123091680929065, - "eval_runtime": 0.253, - "eval_samples_per_second": 347.827, - "eval_steps_per_second": 43.478, - "step": 448 - }, - { - "epoch": 20.40909090909091, - "grad_norm": 0.017729461193084717, - "learning_rate": 1.2710526315789474e-05, - "loss": 0.0019, - "step": 449 - }, - { - "epoch": 20.40909090909091, - "eval_loss": 0.001110163051635027, - "eval_runtime": 0.2651, - "eval_samples_per_second": 331.944, - "eval_steps_per_second": 41.493, - "step": 449 - }, - { - "epoch": 20.454545454545453, - "grad_norm": 0.014911322854459286, - "learning_rate": 1.2671052631578947e-05, - "loss": 0.0017, - "step": 450 - }, - { - "epoch": 20.454545454545453, - "eval_loss": 0.0011080644326284528, - "eval_runtime": 0.2496, - "eval_samples_per_second": 352.625, - "eval_steps_per_second": 44.078, - "step": 450 - }, - { - "epoch": 20.5, - "grad_norm": 0.016675200313329697, - "learning_rate": 1.263157894736842e-05, - "loss": 0.0019, - "step": 451 - }, - { - "epoch": 20.5, - "eval_loss": 0.0011060454417020082, - "eval_runtime": 0.26, - "eval_samples_per_second": 338.446, - "eval_steps_per_second": 42.306, - "step": 451 - }, - { - "epoch": 20.545454545454547, - "grad_norm": 0.016018547117710114, - "learning_rate": 1.2592105263157895e-05, - "loss": 0.0018, - "step": 452 - }, - { - "epoch": 20.545454545454547, - "eval_loss": 0.0011039102682843804, - "eval_runtime": 0.2399, - "eval_samples_per_second": 366.846, - "eval_steps_per_second": 45.856, - "step": 452 - }, - { - "epoch": 20.59090909090909, - "grad_norm": 0.016912776976823807, - "learning_rate": 1.2552631578947369e-05, - "loss": 0.0019, - "step": 453 - }, - { - "epoch": 20.59090909090909, - "eval_loss": 0.0011017858050763607, - "eval_runtime": 0.2273, - "eval_samples_per_second": 387.134, - "eval_steps_per_second": 48.392, - "step": 453 - }, - { - "epoch": 20.636363636363637, - "grad_norm": 0.015879783779382706, - "learning_rate": 1.2513157894736843e-05, - "loss": 0.0018, - "step": 454 - }, - { - "epoch": 20.636363636363637, - "eval_loss": 0.0010996219934895635, - "eval_runtime": 0.2449, - "eval_samples_per_second": 359.378, - "eval_steps_per_second": 44.922, - "step": 454 - }, - { - "epoch": 20.681818181818183, - "grad_norm": 0.017021868377923965, - "learning_rate": 1.2473684210526317e-05, - "loss": 0.0019, - "step": 455 - }, - { - "epoch": 20.681818181818183, - "eval_loss": 0.0010973933385685086, - "eval_runtime": 0.229, - "eval_samples_per_second": 384.317, - "eval_steps_per_second": 48.04, - "step": 455 - }, - { - "epoch": 20.727272727272727, - "grad_norm": 0.015419513918459415, - "learning_rate": 1.243421052631579e-05, - "loss": 0.0019, - "step": 456 - }, - { - "epoch": 20.727272727272727, - "eval_loss": 0.001095130923204124, - "eval_runtime": 0.2362, - "eval_samples_per_second": 372.489, - "eval_steps_per_second": 46.561, - "step": 456 - }, - { - "epoch": 20.772727272727273, - "grad_norm": 0.01693497784435749, - "learning_rate": 1.2394736842105264e-05, - "loss": 0.0018, - "step": 457 - }, - { - "epoch": 20.772727272727273, - "eval_loss": 0.0010928618721663952, - "eval_runtime": 0.2233, - "eval_samples_per_second": 394.174, - "eval_steps_per_second": 49.272, - "step": 457 - }, - { - "epoch": 20.818181818181817, - "grad_norm": 0.017432473599910736, - "learning_rate": 1.2355263157894738e-05, - "loss": 0.0018, - "step": 458 - }, - { - "epoch": 20.818181818181817, - "eval_loss": 0.0010908265830948949, - "eval_runtime": 0.2275, - "eval_samples_per_second": 386.81, - "eval_steps_per_second": 48.351, - "step": 458 - }, - { - "epoch": 20.863636363636363, - "grad_norm": 0.014237020164728165, - "learning_rate": 1.231578947368421e-05, - "loss": 0.0016, - "step": 459 - }, - { - "epoch": 20.863636363636363, - "eval_loss": 0.0010887522948905826, - "eval_runtime": 0.236, - "eval_samples_per_second": 372.82, - "eval_steps_per_second": 46.603, - "step": 459 - }, - { - "epoch": 20.90909090909091, - "grad_norm": 0.016278453171253204, - "learning_rate": 1.2276315789473685e-05, - "loss": 0.0017, - "step": 460 - }, - { - "epoch": 20.90909090909091, - "eval_loss": 0.0010867157252505422, - "eval_runtime": 0.2288, - "eval_samples_per_second": 384.554, - "eval_steps_per_second": 48.069, - "step": 460 - }, - { - "epoch": 20.954545454545453, - "grad_norm": 0.01595933921635151, - "learning_rate": 1.2236842105263159e-05, - "loss": 0.0019, - "step": 461 - }, - { - "epoch": 20.954545454545453, - "eval_loss": 0.0010847292141988873, - "eval_runtime": 0.2252, - "eval_samples_per_second": 390.754, - "eval_steps_per_second": 48.844, - "step": 461 - }, - { - "epoch": 21.0, - "grad_norm": 0.017483873292803764, - "learning_rate": 1.2197368421052631e-05, - "loss": 0.0018, - "step": 462 - }, - { - "epoch": 21.0, - "eval_loss": 0.0010827549267560244, - "eval_runtime": 0.2236, - "eval_samples_per_second": 393.554, - "eval_steps_per_second": 49.194, - "step": 462 - }, - { - "epoch": 21.045454545454547, - "grad_norm": 0.01537961047142744, - "learning_rate": 1.2157894736842105e-05, - "loss": 0.0018, - "step": 463 - }, - { - "epoch": 21.045454545454547, - "eval_loss": 0.0010808442020788789, - "eval_runtime": 0.2361, - "eval_samples_per_second": 372.729, - "eval_steps_per_second": 46.591, - "step": 463 - }, - { - "epoch": 21.09090909090909, - "grad_norm": 0.015306917950510979, - "learning_rate": 1.2118421052631578e-05, - "loss": 0.0017, - "step": 464 - }, - { - "epoch": 21.09090909090909, - "eval_loss": 0.0010790039086714387, - "eval_runtime": 0.2298, - "eval_samples_per_second": 382.888, - "eval_steps_per_second": 47.861, - "step": 464 - }, - { - "epoch": 21.136363636363637, - "grad_norm": 0.013436819426715374, - "learning_rate": 1.2078947368421052e-05, - "loss": 0.0016, - "step": 465 - }, - { - "epoch": 21.136363636363637, - "eval_loss": 0.0010772122768685222, - "eval_runtime": 0.2421, - "eval_samples_per_second": 363.528, - "eval_steps_per_second": 45.441, - "step": 465 - }, - { - "epoch": 21.181818181818183, - "grad_norm": 0.016245294362306595, - "learning_rate": 1.2039473684210528e-05, - "loss": 0.0018, - "step": 466 - }, - { - "epoch": 21.181818181818183, - "eval_loss": 0.0010752826929092407, - "eval_runtime": 0.2313, - "eval_samples_per_second": 380.386, - "eval_steps_per_second": 47.548, - "step": 466 - }, - { - "epoch": 21.227272727272727, - "grad_norm": 0.015921350568532944, - "learning_rate": 1.2e-05, - "loss": 0.0017, - "step": 467 - }, - { - "epoch": 21.227272727272727, - "eval_loss": 0.0010733003728091717, - "eval_runtime": 0.2302, - "eval_samples_per_second": 382.349, - "eval_steps_per_second": 47.794, - "step": 467 - }, - { - "epoch": 21.272727272727273, - "grad_norm": 0.016333753243088722, - "learning_rate": 1.1960526315789474e-05, - "loss": 0.0018, - "step": 468 - }, - { - "epoch": 21.272727272727273, - "eval_loss": 0.0010712259681895375, - "eval_runtime": 0.2299, - "eval_samples_per_second": 382.824, - "eval_steps_per_second": 47.853, - "step": 468 - }, - { - "epoch": 21.318181818181817, - "grad_norm": 0.015542343258857727, - "learning_rate": 1.1921052631578949e-05, - "loss": 0.0017, - "step": 469 - }, - { - "epoch": 21.318181818181817, - "eval_loss": 0.0010691812494769692, - "eval_runtime": 0.2401, - "eval_samples_per_second": 366.569, - "eval_steps_per_second": 45.821, - "step": 469 - }, - { - "epoch": 21.363636363636363, - "grad_norm": 0.017036397010087967, - "learning_rate": 1.1881578947368421e-05, - "loss": 0.0019, - "step": 470 - }, - { - "epoch": 21.363636363636363, - "eval_loss": 0.0010671325726434588, - "eval_runtime": 0.2367, - "eval_samples_per_second": 371.749, - "eval_steps_per_second": 46.469, - "step": 470 - }, - { - "epoch": 21.40909090909091, - "grad_norm": 0.01621134579181671, - "learning_rate": 1.1842105263157895e-05, - "loss": 0.0018, - "step": 471 - }, - { - "epoch": 21.40909090909091, - "eval_loss": 0.0010652164928615093, - "eval_runtime": 0.2376, - "eval_samples_per_second": 370.382, - "eval_steps_per_second": 46.298, - "step": 471 - }, - { - "epoch": 21.454545454545453, - "grad_norm": 0.013604752719402313, - "learning_rate": 1.180263157894737e-05, - "loss": 0.0017, - "step": 472 - }, - { - "epoch": 21.454545454545453, - "eval_loss": 0.0010633313795551658, - "eval_runtime": 0.2408, - "eval_samples_per_second": 365.399, - "eval_steps_per_second": 45.675, - "step": 472 - }, - { - "epoch": 21.5, - "grad_norm": 0.014795001596212387, - "learning_rate": 1.1763157894736842e-05, - "loss": 0.0016, - "step": 473 - }, - { - "epoch": 21.5, - "eval_loss": 0.001061469316482544, - "eval_runtime": 0.2486, - "eval_samples_per_second": 354.0, - "eval_steps_per_second": 44.25, - "step": 473 - }, - { - "epoch": 21.545454545454547, - "grad_norm": 0.015267064794898033, - "learning_rate": 1.1723684210526316e-05, - "loss": 0.0018, - "step": 474 - }, - { - "epoch": 21.545454545454547, - "eval_loss": 0.0010596156353130937, - "eval_runtime": 0.2421, - "eval_samples_per_second": 363.419, - "eval_steps_per_second": 45.427, - "step": 474 - }, - { - "epoch": 21.59090909090909, - "grad_norm": 0.017209574580192566, - "learning_rate": 1.168421052631579e-05, - "loss": 0.0018, - "step": 475 - }, - { - "epoch": 21.59090909090909, - "eval_loss": 0.0010576344793662429, - "eval_runtime": 0.2464, - "eval_samples_per_second": 357.122, - "eval_steps_per_second": 44.64, - "step": 475 - }, - { - "epoch": 21.636363636363637, - "grad_norm": 0.0154210040345788, - "learning_rate": 1.1644736842105263e-05, - "loss": 0.0018, - "step": 476 - }, - { - "epoch": 21.636363636363637, - "eval_loss": 0.0010555870831012726, - "eval_runtime": 0.2538, - "eval_samples_per_second": 346.671, - "eval_steps_per_second": 43.334, - "step": 476 - }, - { - "epoch": 21.681818181818183, - "grad_norm": 0.017148546874523163, - "learning_rate": 1.1605263157894737e-05, - "loss": 0.0018, - "step": 477 - }, - { - "epoch": 21.681818181818183, - "eval_loss": 0.0010535044129937887, - "eval_runtime": 0.2437, - "eval_samples_per_second": 361.038, - "eval_steps_per_second": 45.13, - "step": 477 - }, - { - "epoch": 21.727272727272727, - "grad_norm": 0.01518462784588337, - "learning_rate": 1.1565789473684211e-05, - "loss": 0.0017, - "step": 478 - }, - { - "epoch": 21.727272727272727, - "eval_loss": 0.0010514232562854886, - "eval_runtime": 0.2402, - "eval_samples_per_second": 366.378, - "eval_steps_per_second": 45.797, - "step": 478 - }, - { - "epoch": 21.772727272727273, - "grad_norm": 0.01500785257667303, - "learning_rate": 1.1526315789473683e-05, - "loss": 0.0016, - "step": 479 - }, - { - "epoch": 21.772727272727273, - "eval_loss": 0.0010493744630366564, - "eval_runtime": 0.2449, - "eval_samples_per_second": 359.362, - "eval_steps_per_second": 44.92, - "step": 479 - }, - { - "epoch": 21.818181818181817, - "grad_norm": 0.015978703275322914, - "learning_rate": 1.148684210526316e-05, - "loss": 0.0018, - "step": 480 - }, - { - "epoch": 21.818181818181817, - "eval_loss": 0.0010474204318597913, - "eval_runtime": 0.2586, - "eval_samples_per_second": 340.345, - "eval_steps_per_second": 42.543, - "step": 480 - }, - { - "epoch": 21.863636363636363, - "grad_norm": 0.01765250600874424, - "learning_rate": 1.1447368421052632e-05, - "loss": 0.0017, - "step": 481 - }, - { - "epoch": 21.863636363636363, - "eval_loss": 0.0010454690782353282, - "eval_runtime": 0.2292, - "eval_samples_per_second": 383.999, - "eval_steps_per_second": 48.0, - "step": 481 - }, - { - "epoch": 21.90909090909091, - "grad_norm": 0.016576098278164864, - "learning_rate": 1.1407894736842106e-05, - "loss": 0.0017, - "step": 482 - }, - { - "epoch": 21.90909090909091, - "eval_loss": 0.0010435826843604445, - "eval_runtime": 0.2414, - "eval_samples_per_second": 364.501, - "eval_steps_per_second": 45.563, - "step": 482 - }, - { - "epoch": 21.954545454545453, - "grad_norm": 0.014276851899921894, - "learning_rate": 1.136842105263158e-05, - "loss": 0.0017, - "step": 483 - }, - { - "epoch": 21.954545454545453, - "eval_loss": 0.0010416691657155752, - "eval_runtime": 0.2241, - "eval_samples_per_second": 392.673, - "eval_steps_per_second": 49.084, - "step": 483 - }, - { - "epoch": 22.0, - "grad_norm": 0.01667684316635132, - "learning_rate": 1.1328947368421052e-05, - "loss": 0.0017, - "step": 484 - }, - { - "epoch": 22.0, - "eval_loss": 0.0010398232843726873, - "eval_runtime": 0.24, - "eval_samples_per_second": 366.592, - "eval_steps_per_second": 45.824, - "step": 484 - }, - { - "epoch": 22.045454545454547, - "grad_norm": 0.016187671571969986, - "learning_rate": 1.1289473684210527e-05, - "loss": 0.0018, - "step": 485 - }, - { - "epoch": 22.045454545454547, - "eval_loss": 0.0010379315353929996, - "eval_runtime": 0.2306, - "eval_samples_per_second": 381.551, - "eval_steps_per_second": 47.694, - "step": 485 - }, - { - "epoch": 22.09090909090909, - "grad_norm": 0.014743163250386715, - "learning_rate": 1.125e-05, - "loss": 0.0018, - "step": 486 - }, - { - "epoch": 22.09090909090909, - "eval_loss": 0.0010359951993450522, - "eval_runtime": 0.227, - "eval_samples_per_second": 387.598, - "eval_steps_per_second": 48.45, - "step": 486 - }, - { - "epoch": 22.136363636363637, - "grad_norm": 0.01694609597325325, - "learning_rate": 1.1210526315789473e-05, - "loss": 0.0017, - "step": 487 - }, - { - "epoch": 22.136363636363637, - "eval_loss": 0.0010341384913772345, - "eval_runtime": 0.2407, - "eval_samples_per_second": 365.633, - "eval_steps_per_second": 45.704, - "step": 487 - }, - { - "epoch": 22.181818181818183, - "grad_norm": 0.014260073192417622, - "learning_rate": 1.1171052631578947e-05, - "loss": 0.0017, - "step": 488 - }, - { - "epoch": 22.181818181818183, - "eval_loss": 0.0010322789894416928, - "eval_runtime": 0.2279, - "eval_samples_per_second": 386.189, - "eval_steps_per_second": 48.274, - "step": 488 - }, - { - "epoch": 22.227272727272727, - "grad_norm": 0.017539717257022858, - "learning_rate": 1.1131578947368421e-05, - "loss": 0.0016, - "step": 489 - }, - { - "epoch": 22.227272727272727, - "eval_loss": 0.001030544051900506, - "eval_runtime": 0.239, - "eval_samples_per_second": 368.276, - "eval_steps_per_second": 46.034, - "step": 489 - }, - { - "epoch": 22.272727272727273, - "grad_norm": 0.013456945307552814, - "learning_rate": 1.1092105263157894e-05, - "loss": 0.0016, - "step": 490 - }, - { - "epoch": 22.272727272727273, - "eval_loss": 0.0010288661578670144, - "eval_runtime": 0.2301, - "eval_samples_per_second": 382.513, - "eval_steps_per_second": 47.814, - "step": 490 - }, - { - "epoch": 22.318181818181817, - "grad_norm": 0.016474781557917595, - "learning_rate": 1.1052631578947368e-05, - "loss": 0.0017, - "step": 491 - }, - { - "epoch": 22.318181818181817, - "eval_loss": 0.0010273018851876259, - "eval_runtime": 0.235, - "eval_samples_per_second": 374.491, - "eval_steps_per_second": 46.811, - "step": 491 - }, - { - "epoch": 22.363636363636363, - "grad_norm": 0.01373574323952198, - "learning_rate": 1.1013157894736842e-05, - "loss": 0.0014, - "step": 492 - }, - { - "epoch": 22.363636363636363, - "eval_loss": 0.00102571165189147, - "eval_runtime": 0.2263, - "eval_samples_per_second": 388.813, - "eval_steps_per_second": 48.602, - "step": 492 - }, - { - "epoch": 22.40909090909091, - "grad_norm": 0.015442097559571266, - "learning_rate": 1.0973684210526316e-05, - "loss": 0.0016, - "step": 493 - }, - { - "epoch": 22.40909090909091, - "eval_loss": 0.0010241527343168855, - "eval_runtime": 0.2352, - "eval_samples_per_second": 374.081, - "eval_steps_per_second": 46.76, - "step": 493 - }, - { - "epoch": 22.454545454545453, - "grad_norm": 0.015592455863952637, - "learning_rate": 1.093421052631579e-05, - "loss": 0.0017, - "step": 494 - }, - { - "epoch": 22.454545454545453, - "eval_loss": 0.0010226276936009526, - "eval_runtime": 0.2373, - "eval_samples_per_second": 370.902, - "eval_steps_per_second": 46.363, - "step": 494 - }, - { - "epoch": 22.5, - "grad_norm": 0.013556539081037045, - "learning_rate": 1.0894736842105265e-05, - "loss": 0.0016, - "step": 495 - }, - { - "epoch": 22.5, - "eval_loss": 0.001021133502945304, - "eval_runtime": 0.2433, - "eval_samples_per_second": 361.732, - "eval_steps_per_second": 45.217, - "step": 495 - }, - { - "epoch": 22.545454545454547, - "grad_norm": 0.012894881889224052, - "learning_rate": 1.0855263157894737e-05, - "loss": 0.0016, - "step": 496 - }, - { - "epoch": 22.545454545454547, - "eval_loss": 0.0010197004303336143, - "eval_runtime": 0.2415, - "eval_samples_per_second": 364.331, - "eval_steps_per_second": 45.541, - "step": 496 - }, - { - "epoch": 22.59090909090909, - "grad_norm": 0.014628540724515915, - "learning_rate": 1.0815789473684211e-05, - "loss": 0.0017, - "step": 497 - }, - { - "epoch": 22.59090909090909, - "eval_loss": 0.0010182132245972753, - "eval_runtime": 0.2417, - "eval_samples_per_second": 364.047, - "eval_steps_per_second": 45.506, - "step": 497 - }, - { - "epoch": 22.636363636363637, - "grad_norm": 0.014721691608428955, - "learning_rate": 1.0776315789473685e-05, - "loss": 0.0017, - "step": 498 - }, - { - "epoch": 22.636363636363637, - "eval_loss": 0.0010166773572564125, - "eval_runtime": 0.2388, - "eval_samples_per_second": 368.522, - "eval_steps_per_second": 46.065, - "step": 498 - }, - { - "epoch": 22.681818181818183, - "grad_norm": 0.01576976478099823, - "learning_rate": 1.0736842105263158e-05, - "loss": 0.0018, - "step": 499 - }, - { - "epoch": 22.681818181818183, - "eval_loss": 0.001015029032714665, - "eval_runtime": 0.2308, - "eval_samples_per_second": 381.26, - "eval_steps_per_second": 47.657, - "step": 499 - }, - { - "epoch": 22.727272727272727, - "grad_norm": 0.015886450186371803, - "learning_rate": 1.0697368421052632e-05, - "loss": 0.0017, - "step": 500 - }, - { - "epoch": 22.727272727272727, - "eval_loss": 0.0010134456679224968, - "eval_runtime": 0.236, - "eval_samples_per_second": 372.817, - "eval_steps_per_second": 46.602, - "step": 500 - }, - { - "epoch": 22.772727272727273, - "grad_norm": 0.01687587983906269, - "learning_rate": 1.0657894736842106e-05, - "loss": 0.0017, - "step": 501 - }, - { - "epoch": 22.772727272727273, - "eval_loss": 0.0010118514765053988, - "eval_runtime": 0.2468, - "eval_samples_per_second": 356.526, - "eval_steps_per_second": 44.566, - "step": 501 - }, - { - "epoch": 22.818181818181817, - "grad_norm": 0.013874330557882786, - "learning_rate": 1.0618421052631579e-05, - "loss": 0.0016, - "step": 502 - }, - { - "epoch": 22.818181818181817, - "eval_loss": 0.0010103358654305339, - "eval_runtime": 0.2231, - "eval_samples_per_second": 394.376, - "eval_steps_per_second": 49.297, - "step": 502 - }, - { - "epoch": 22.863636363636363, - "grad_norm": 0.014864981174468994, - "learning_rate": 1.0578947368421053e-05, - "loss": 0.0017, - "step": 503 - }, - { - "epoch": 22.863636363636363, - "eval_loss": 0.001008835039101541, - "eval_runtime": 0.2399, - "eval_samples_per_second": 366.77, - "eval_steps_per_second": 45.846, - "step": 503 - }, - { - "epoch": 22.90909090909091, - "grad_norm": 0.013614412397146225, - "learning_rate": 1.0539473684210525e-05, - "loss": 0.0016, - "step": 504 - }, - { - "epoch": 22.90909090909091, - "eval_loss": 0.001007361221127212, - "eval_runtime": 0.2267, - "eval_samples_per_second": 388.143, - "eval_steps_per_second": 48.518, - "step": 504 - }, - { - "epoch": 22.954545454545453, - "grad_norm": 0.019395658746361732, - "learning_rate": 1.05e-05, - "loss": 0.0019, - "step": 505 - }, - { - "epoch": 22.954545454545453, - "eval_loss": 0.0010058052139356732, - "eval_runtime": 0.2757, - "eval_samples_per_second": 319.24, - "eval_steps_per_second": 39.905, - "step": 505 - }, - { - "epoch": 23.0, - "grad_norm": 0.017713138833642006, - "learning_rate": 1.0460526315789474e-05, - "loss": 0.0018, - "step": 506 - }, - { - "epoch": 23.0, - "eval_loss": 0.0010041649220511317, - "eval_runtime": 0.2305, - "eval_samples_per_second": 381.809, - "eval_steps_per_second": 47.726, - "step": 506 - }, - { - "epoch": 23.045454545454547, - "grad_norm": 0.014331554993987083, - "learning_rate": 1.0421052631578948e-05, - "loss": 0.0017, - "step": 507 - }, - { - "epoch": 23.045454545454547, - "eval_loss": 0.0010025816736742854, - "eval_runtime": 0.2325, - "eval_samples_per_second": 378.56, - "eval_steps_per_second": 47.32, - "step": 507 - }, - { - "epoch": 23.09090909090909, - "grad_norm": 0.014041769318282604, - "learning_rate": 1.0381578947368422e-05, - "loss": 0.0017, - "step": 508 - }, - { - "epoch": 23.09090909090909, - "eval_loss": 0.001001022639684379, - "eval_runtime": 0.2296, - "eval_samples_per_second": 383.301, - "eval_steps_per_second": 47.913, - "step": 508 - }, - { - "epoch": 23.136363636363637, - "grad_norm": 0.014782671816647053, - "learning_rate": 1.0342105263157896e-05, - "loss": 0.0017, - "step": 509 - }, - { - "epoch": 23.136363636363637, - "eval_loss": 0.0009995178552344441, - "eval_runtime": 0.2324, - "eval_samples_per_second": 378.719, - "eval_steps_per_second": 47.34, - "step": 509 - }, - { - "epoch": 23.181818181818183, - "grad_norm": 0.014820964075624943, - "learning_rate": 1.0302631578947368e-05, - "loss": 0.0017, - "step": 510 - }, - { - "epoch": 23.181818181818183, - "eval_loss": 0.0009979914175346494, - "eval_runtime": 0.2306, - "eval_samples_per_second": 381.609, - "eval_steps_per_second": 47.701, - "step": 510 - }, - { - "epoch": 23.227272727272727, - "grad_norm": 0.014552117325365543, - "learning_rate": 1.0263157894736843e-05, - "loss": 0.0017, - "step": 511 - }, - { - "epoch": 23.227272727272727, - "eval_loss": 0.0009964742930606008, - "eval_runtime": 0.2277, - "eval_samples_per_second": 386.477, - "eval_steps_per_second": 48.31, - "step": 511 - }, - { - "epoch": 23.272727272727273, - "grad_norm": 0.016575666144490242, - "learning_rate": 1.0223684210526317e-05, - "loss": 0.0018, - "step": 512 - }, - { - "epoch": 23.272727272727273, - "eval_loss": 0.0009949287632480264, - "eval_runtime": 0.2408, - "eval_samples_per_second": 365.446, - "eval_steps_per_second": 45.681, - "step": 512 - }, - { - "epoch": 23.318181818181817, - "grad_norm": 0.013247662223875523, - "learning_rate": 1.018421052631579e-05, - "loss": 0.0016, - "step": 513 - }, - { - "epoch": 23.318181818181817, - "eval_loss": 0.0009934090776368976, - "eval_runtime": 0.2312, - "eval_samples_per_second": 380.548, - "eval_steps_per_second": 47.569, - "step": 513 - }, - { - "epoch": 23.363636363636363, - "grad_norm": 0.014102768152952194, - "learning_rate": 1.0144736842105263e-05, - "loss": 0.0017, - "step": 514 - }, - { - "epoch": 23.363636363636363, - "eval_loss": 0.0009918283903971314, - "eval_runtime": 0.2315, - "eval_samples_per_second": 380.202, - "eval_steps_per_second": 47.525, - "step": 514 - }, - { - "epoch": 23.40909090909091, - "grad_norm": 0.015047273598611355, - "learning_rate": 1.0105263157894738e-05, - "loss": 0.0017, - "step": 515 - }, - { - "epoch": 23.40909090909091, - "eval_loss": 0.0009903222089633346, - "eval_runtime": 0.2308, - "eval_samples_per_second": 381.309, - "eval_steps_per_second": 47.664, - "step": 515 - }, - { - "epoch": 23.454545454545453, - "grad_norm": 0.016119079664349556, - "learning_rate": 1.006578947368421e-05, - "loss": 0.0018, - "step": 516 - }, - { - "epoch": 23.454545454545453, - "eval_loss": 0.0009887360502034426, - "eval_runtime": 0.2356, - "eval_samples_per_second": 373.467, - "eval_steps_per_second": 46.683, - "step": 516 - }, - { - "epoch": 23.5, - "grad_norm": 0.013055874034762383, - "learning_rate": 1.0026315789473684e-05, - "loss": 0.0015, - "step": 517 - }, - { - "epoch": 23.5, - "eval_loss": 0.0009872028604149818, - "eval_runtime": 0.2353, - "eval_samples_per_second": 373.918, - "eval_steps_per_second": 46.74, - "step": 517 - }, - { - "epoch": 23.545454545454547, - "grad_norm": 0.014796939678490162, - "learning_rate": 9.986842105263158e-06, - "loss": 0.0017, - "step": 518 - }, - { - "epoch": 23.545454545454547, - "eval_loss": 0.0009856532560661435, - "eval_runtime": 0.236, - "eval_samples_per_second": 372.816, - "eval_steps_per_second": 46.602, - "step": 518 - }, - { - "epoch": 23.59090909090909, - "grad_norm": 0.01749352179467678, - "learning_rate": 9.94736842105263e-06, - "loss": 0.0018, - "step": 519 - }, - { - "epoch": 23.59090909090909, - "eval_loss": 0.000984109123237431, - "eval_runtime": 0.2375, - "eval_samples_per_second": 370.519, - "eval_steps_per_second": 46.315, - "step": 519 - }, - { - "epoch": 23.636363636363637, - "grad_norm": 0.014436857774853706, - "learning_rate": 9.907894736842107e-06, - "loss": 0.0017, - "step": 520 - }, - { - "epoch": 23.636363636363637, - "eval_loss": 0.0009825569577515125, - "eval_runtime": 0.2329, - "eval_samples_per_second": 377.839, - "eval_steps_per_second": 47.23, - "step": 520 - }, - { - "epoch": 23.681818181818183, - "grad_norm": 0.0134369982406497, - "learning_rate": 9.868421052631579e-06, - "loss": 0.0015, - "step": 521 - }, - { - "epoch": 23.681818181818183, - "eval_loss": 0.0009810830233618617, - "eval_runtime": 0.2463, - "eval_samples_per_second": 357.352, - "eval_steps_per_second": 44.669, - "step": 521 - }, - { - "epoch": 23.727272727272727, - "grad_norm": 0.015284021385014057, - "learning_rate": 9.828947368421053e-06, - "loss": 0.0017, - "step": 522 - }, - { - "epoch": 23.727272727272727, - "eval_loss": 0.0009796229423955083, - "eval_runtime": 0.2303, - "eval_samples_per_second": 382.111, - "eval_steps_per_second": 47.764, - "step": 522 - }, - { - "epoch": 23.772727272727273, - "grad_norm": 0.01389851700514555, - "learning_rate": 9.789473684210527e-06, - "loss": 0.0016, - "step": 523 - }, - { - "epoch": 23.772727272727273, - "eval_loss": 0.0009782682172954082, - "eval_runtime": 0.2358, - "eval_samples_per_second": 373.188, - "eval_steps_per_second": 46.649, - "step": 523 - }, - { - "epoch": 23.818181818181817, - "grad_norm": 0.013064984232187271, - "learning_rate": 9.75e-06, - "loss": 0.0016, - "step": 524 - }, - { - "epoch": 23.818181818181817, - "eval_loss": 0.000976921641267836, - "eval_runtime": 0.4347, - "eval_samples_per_second": 202.42, - "eval_steps_per_second": 25.303, - "step": 524 - }, - { - "epoch": 23.863636363636363, - "grad_norm": 0.01853189431130886, - "learning_rate": 9.710526315789474e-06, - "loss": 0.0018, - "step": 525 - }, - { - "epoch": 23.863636363636363, - "eval_loss": 0.0009755737846717238, - "eval_runtime": 0.3094, - "eval_samples_per_second": 284.419, - "eval_steps_per_second": 35.552, - "step": 525 - }, - { - "epoch": 23.90909090909091, - "grad_norm": 0.015431704930961132, - "learning_rate": 9.671052631578948e-06, - "loss": 0.0016, - "step": 526 - }, - { - "epoch": 23.90909090909091, - "eval_loss": 0.0009742649854160845, - "eval_runtime": 0.3285, - "eval_samples_per_second": 267.881, - "eval_steps_per_second": 33.485, - "step": 526 - }, - { - "epoch": 23.954545454545453, - "grad_norm": 0.015396500937640667, - "learning_rate": 9.63157894736842e-06, - "loss": 0.0017, - "step": 527 - }, - { - "epoch": 23.954545454545453, - "eval_loss": 0.0009728847653605044, - "eval_runtime": 0.379, - "eval_samples_per_second": 232.167, - "eval_steps_per_second": 29.021, - "step": 527 - }, - { - "epoch": 24.0, - "grad_norm": 0.018940720707178116, - "learning_rate": 9.592105263157895e-06, - "loss": 0.0018, - "step": 528 - }, - { - "epoch": 24.0, - "eval_loss": 0.0009714543703012168, - "eval_runtime": 0.4121, - "eval_samples_per_second": 213.562, - "eval_steps_per_second": 26.695, - "step": 528 - }, - { - "epoch": 24.045454545454547, - "grad_norm": 0.013447549194097519, - "learning_rate": 9.552631578947369e-06, - "loss": 0.0016, - "step": 529 - }, - { - "epoch": 24.045454545454547, - "eval_loss": 0.0009699968504719436, - "eval_runtime": 0.4871, - "eval_samples_per_second": 180.65, - "eval_steps_per_second": 22.581, - "step": 529 - }, - { - "epoch": 24.09090909090909, - "grad_norm": 0.01361093670129776, - "learning_rate": 9.513157894736841e-06, - "loss": 0.0016, - "step": 530 - }, - { - "epoch": 24.09090909090909, - "eval_loss": 0.0009685555123724043, - "eval_runtime": 0.4944, - "eval_samples_per_second": 178.009, - "eval_steps_per_second": 22.251, - "step": 530 - }, - { - "epoch": 24.136363636363637, - "grad_norm": 0.014719787985086441, - "learning_rate": 9.473684210526315e-06, - "loss": 0.0016, - "step": 531 - }, - { - "epoch": 24.136363636363637, - "eval_loss": 0.0009670979925431311, - "eval_runtime": 0.3472, - "eval_samples_per_second": 253.483, - "eval_steps_per_second": 31.685, - "step": 531 - }, - { - "epoch": 24.181818181818183, - "grad_norm": 0.01682870462536812, - "learning_rate": 9.43421052631579e-06, - "loss": 0.0018, - "step": 532 - }, - { - "epoch": 24.181818181818183, - "eval_loss": 0.0009655930334702134, - "eval_runtime": 0.2295, - "eval_samples_per_second": 383.473, - "eval_steps_per_second": 47.934, - "step": 532 - }, - { - "epoch": 24.227272727272727, - "grad_norm": 0.015661459416151047, - "learning_rate": 9.394736842105262e-06, - "loss": 0.0016, - "step": 533 - }, - { - "epoch": 24.227272727272727, - "eval_loss": 0.0009641083306632936, - "eval_runtime": 0.247, - "eval_samples_per_second": 356.243, - "eval_steps_per_second": 44.53, - "step": 533 - }, - { - "epoch": 24.272727272727273, - "grad_norm": 0.015652479603886604, - "learning_rate": 9.355263157894738e-06, - "loss": 0.0016, - "step": 534 - }, - { - "epoch": 24.272727272727273, - "eval_loss": 0.0009626846294850111, - "eval_runtime": 0.2337, - "eval_samples_per_second": 376.608, - "eval_steps_per_second": 47.076, - "step": 534 - }, - { - "epoch": 24.318181818181817, - "grad_norm": 0.013394070789217949, - "learning_rate": 9.315789473684212e-06, - "loss": 0.0016, - "step": 535 - }, - { - "epoch": 24.318181818181817, - "eval_loss": 0.0009613109868951142, - "eval_runtime": 0.2315, - "eval_samples_per_second": 380.202, - "eval_steps_per_second": 47.525, - "step": 535 - }, - { - "epoch": 24.363636363636363, - "grad_norm": 0.015152989886701107, - "learning_rate": 9.276315789473685e-06, - "loss": 0.0016, - "step": 536 - }, - { - "epoch": 24.363636363636363, - "eval_loss": 0.0009599780314601958, - "eval_runtime": 0.2373, - "eval_samples_per_second": 370.835, - "eval_steps_per_second": 46.354, - "step": 536 - }, - { - "epoch": 24.40909090909091, - "grad_norm": 0.014209273271262646, - "learning_rate": 9.236842105263159e-06, - "loss": 0.0016, - "step": 537 - }, - { - "epoch": 24.40909090909091, - "eval_loss": 0.0009586341911926866, - "eval_runtime": 0.2342, - "eval_samples_per_second": 375.816, - "eval_steps_per_second": 46.977, - "step": 537 - }, - { - "epoch": 24.454545454545453, - "grad_norm": 0.014566083438694477, - "learning_rate": 9.197368421052633e-06, - "loss": 0.0015, - "step": 538 - }, - { - "epoch": 24.454545454545453, - "eval_loss": 0.000957344425842166, - "eval_runtime": 0.2373, - "eval_samples_per_second": 370.82, - "eval_steps_per_second": 46.352, - "step": 538 - }, - { - "epoch": 24.5, - "grad_norm": 0.016195589676499367, - "learning_rate": 9.157894736842105e-06, - "loss": 0.0017, - "step": 539 - }, - { - "epoch": 24.5, - "eval_loss": 0.0009560330072417855, - "eval_runtime": 0.2313, - "eval_samples_per_second": 380.382, - "eval_steps_per_second": 47.548, - "step": 539 - }, - { - "epoch": 24.545454545454547, - "grad_norm": 0.01577996276319027, - "learning_rate": 9.11842105263158e-06, - "loss": 0.0017, - "step": 540 - }, - { - "epoch": 24.545454545454547, - "eval_loss": 0.0009547690278850496, - "eval_runtime": 0.2288, - "eval_samples_per_second": 384.628, - "eval_steps_per_second": 48.079, - "step": 540 - }, - { - "epoch": 24.59090909090909, - "grad_norm": 0.013901899568736553, - "learning_rate": 9.078947368421054e-06, - "loss": 0.0015, - "step": 541 - }, - { - "epoch": 24.59090909090909, - "eval_loss": 0.0009535103454254568, - "eval_runtime": 0.2351, - "eval_samples_per_second": 374.379, - "eval_steps_per_second": 46.797, - "step": 541 - }, - { - "epoch": 24.636363636363637, - "grad_norm": 0.014091338962316513, - "learning_rate": 9.039473684210526e-06, - "loss": 0.0016, - "step": 542 - }, - { - "epoch": 24.636363636363637, - "eval_loss": 0.0009522747131995857, - "eval_runtime": 0.2274, - "eval_samples_per_second": 387.015, - "eval_steps_per_second": 48.377, - "step": 542 - }, - { - "epoch": 24.681818181818183, - "grad_norm": 0.014544407837092876, - "learning_rate": 9e-06, - "loss": 0.0017, - "step": 543 - }, - { - "epoch": 24.681818181818183, - "eval_loss": 0.0009510606760159135, - "eval_runtime": 0.2442, - "eval_samples_per_second": 360.336, - "eval_steps_per_second": 45.042, - "step": 543 - }, - { - "epoch": 24.727272727272727, - "grad_norm": 0.01616845279932022, - "learning_rate": 8.960526315789473e-06, - "loss": 0.0017, - "step": 544 - }, - { - "epoch": 24.727272727272727, - "eval_loss": 0.0009498685249127448, - "eval_runtime": 0.2388, - "eval_samples_per_second": 368.514, - "eval_steps_per_second": 46.064, - "step": 544 - }, - { - "epoch": 24.772727272727273, - "grad_norm": 0.01609298586845398, - "learning_rate": 8.921052631578947e-06, - "loss": 0.0017, - "step": 545 - }, - { - "epoch": 24.772727272727273, - "eval_loss": 0.0009486477356404066, - "eval_runtime": 0.2287, - "eval_samples_per_second": 384.803, - "eval_steps_per_second": 48.1, - "step": 545 - }, - { - "epoch": 24.818181818181817, - "grad_norm": 0.013633071444928646, - "learning_rate": 8.881578947368421e-06, - "loss": 0.0016, - "step": 546 - }, - { - "epoch": 24.818181818181817, - "eval_loss": 0.0009474134421907365, - "eval_runtime": 0.2393, - "eval_samples_per_second": 367.684, - "eval_steps_per_second": 45.96, - "step": 546 - }, - { - "epoch": 24.863636363636363, - "grad_norm": 0.013738269917666912, - "learning_rate": 8.842105263157893e-06, - "loss": 0.0016, - "step": 547 - }, - { - "epoch": 24.863636363636363, - "eval_loss": 0.0009461792069487274, - "eval_runtime": 0.2312, - "eval_samples_per_second": 380.637, - "eval_steps_per_second": 47.58, - "step": 547 - }, - { - "epoch": 24.90909090909091, - "grad_norm": 0.013620936311781406, - "learning_rate": 8.80263157894737e-06, - "loss": 0.0015, - "step": 548 - }, - { - "epoch": 24.90909090909091, - "eval_loss": 0.0009449638891965151, - "eval_runtime": 0.2459, - "eval_samples_per_second": 357.891, - "eval_steps_per_second": 44.736, - "step": 548 - }, - { - "epoch": 24.954545454545453, - "grad_norm": 0.015967663377523422, - "learning_rate": 8.763157894736843e-06, - "loss": 0.0017, - "step": 549 - }, - { - "epoch": 24.954545454545453, - "eval_loss": 0.0009437742992304265, - "eval_runtime": 0.239, - "eval_samples_per_second": 368.142, - "eval_steps_per_second": 46.018, - "step": 549 - }, - { - "epoch": 25.0, - "grad_norm": 0.012870087288320065, - "learning_rate": 8.723684210526316e-06, - "loss": 0.0015, - "step": 550 - }, - { - "epoch": 25.0, - "eval_loss": 0.0009425426251254976, - "eval_runtime": 0.2335, - "eval_samples_per_second": 376.798, - "eval_steps_per_second": 47.1, - "step": 550 - }, - { - "epoch": 25.045454545454547, - "grad_norm": 0.012893461622297764, - "learning_rate": 8.68421052631579e-06, - "loss": 0.0015, - "step": 551 - }, - { - "epoch": 25.045454545454547, - "eval_loss": 0.0009413667139597237, - "eval_runtime": 0.236, - "eval_samples_per_second": 372.836, - "eval_steps_per_second": 46.605, - "step": 551 - }, - { - "epoch": 25.09090909090909, - "grad_norm": 0.014959870837628841, - "learning_rate": 8.644736842105264e-06, - "loss": 0.0016, - "step": 552 - }, - { - "epoch": 25.09090909090909, - "eval_loss": 0.0009402299183420837, - "eval_runtime": 0.2482, - "eval_samples_per_second": 354.624, - "eval_steps_per_second": 44.328, - "step": 552 - }, - { - "epoch": 25.136363636363637, - "grad_norm": 0.01649138703942299, - "learning_rate": 8.605263157894737e-06, - "loss": 0.0017, - "step": 553 - }, - { - "epoch": 25.136363636363637, - "eval_loss": 0.0009390347986482084, - "eval_runtime": 0.2599, - "eval_samples_per_second": 338.554, - "eval_steps_per_second": 42.319, - "step": 553 - }, - { - "epoch": 25.181818181818183, - "grad_norm": 0.01470938976854086, - "learning_rate": 8.56578947368421e-06, - "loss": 0.0016, - "step": 554 - }, - { - "epoch": 25.181818181818183, - "eval_loss": 0.0009378465474583209, - "eval_runtime": 0.2574, - "eval_samples_per_second": 341.926, - "eval_steps_per_second": 42.741, - "step": 554 - }, - { - "epoch": 25.227272727272727, - "grad_norm": 0.011589915491640568, - "learning_rate": 8.526315789473685e-06, - "loss": 0.0014, - "step": 555 - }, - { - "epoch": 25.227272727272727, - "eval_loss": 0.000936675991397351, - "eval_runtime": 0.2348, - "eval_samples_per_second": 374.714, - "eval_steps_per_second": 46.839, - "step": 555 - }, - { - "epoch": 25.272727272727273, - "grad_norm": 0.012033880688250065, - "learning_rate": 8.486842105263157e-06, - "loss": 0.0014, - "step": 556 - }, - { - "epoch": 25.272727272727273, - "eval_loss": 0.0009355823858641088, - "eval_runtime": 0.2479, - "eval_samples_per_second": 354.912, - "eval_steps_per_second": 44.364, - "step": 556 - }, - { - "epoch": 25.318181818181817, - "grad_norm": 0.012967276386916637, - "learning_rate": 8.447368421052632e-06, - "loss": 0.0016, - "step": 557 - }, - { - "epoch": 25.318181818181817, - "eval_loss": 0.0009344658465124667, - "eval_runtime": 0.2455, - "eval_samples_per_second": 358.387, - "eval_steps_per_second": 44.798, - "step": 557 - }, - { - "epoch": 25.363636363636363, - "grad_norm": 0.01223038136959076, - "learning_rate": 8.407894736842106e-06, - "loss": 0.0015, - "step": 558 - }, - { - "epoch": 25.363636363636363, - "eval_loss": 0.0009333452326245606, - "eval_runtime": 0.2906, - "eval_samples_per_second": 302.832, - "eval_steps_per_second": 37.854, - "step": 558 - }, - { - "epoch": 25.40909090909091, - "grad_norm": 0.015218369662761688, - "learning_rate": 8.368421052631578e-06, - "loss": 0.0016, - "step": 559 - }, - { - "epoch": 25.40909090909091, - "eval_loss": 0.0009322408004663885, - "eval_runtime": 0.2272, - "eval_samples_per_second": 387.247, - "eval_steps_per_second": 48.406, - "step": 559 - }, - { - "epoch": 25.454545454545453, - "grad_norm": 0.015988919883966446, - "learning_rate": 8.328947368421052e-06, - "loss": 0.0016, - "step": 560 - }, - { - "epoch": 25.454545454545453, - "eval_loss": 0.0009310647728852928, - "eval_runtime": 0.2299, - "eval_samples_per_second": 382.796, - "eval_steps_per_second": 47.85, - "step": 560 - }, - { - "epoch": 25.5, - "grad_norm": 0.012890150770545006, - "learning_rate": 8.289473684210526e-06, - "loss": 0.0015, - "step": 561 - }, - { - "epoch": 25.5, - "eval_loss": 0.0009298656368628144, - "eval_runtime": 0.2335, - "eval_samples_per_second": 376.874, - "eval_steps_per_second": 47.109, - "step": 561 - }, - { - "epoch": 25.545454545454547, - "grad_norm": 0.013084178790450096, - "learning_rate": 8.25e-06, - "loss": 0.0016, - "step": 562 - }, - { - "epoch": 25.545454545454547, - "eval_loss": 0.0009286908898502588, - "eval_runtime": 0.2286, - "eval_samples_per_second": 384.978, - "eval_steps_per_second": 48.122, - "step": 562 - }, - { - "epoch": 25.59090909090909, - "grad_norm": 0.01568671688437462, - "learning_rate": 8.210526315789475e-06, - "loss": 0.0018, - "step": 563 - }, - { - "epoch": 25.59090909090909, - "eval_loss": 0.0009274999029003084, - "eval_runtime": 0.2258, - "eval_samples_per_second": 389.702, - "eval_steps_per_second": 48.713, - "step": 563 - }, - { - "epoch": 25.636363636363637, - "grad_norm": 0.012654740363359451, - "learning_rate": 8.171052631578947e-06, - "loss": 0.0014, - "step": 564 - }, - { - "epoch": 25.636363636363637, - "eval_loss": 0.0009263442480005324, - "eval_runtime": 0.2297, - "eval_samples_per_second": 383.078, - "eval_steps_per_second": 47.885, - "step": 564 - }, - { - "epoch": 25.681818181818183, - "grad_norm": 0.014308282174170017, - "learning_rate": 8.131578947368421e-06, - "loss": 0.0016, - "step": 565 - }, - { - "epoch": 25.681818181818183, - "eval_loss": 0.0009251585579477251, - "eval_runtime": 0.2407, - "eval_samples_per_second": 365.643, - "eval_steps_per_second": 45.705, - "step": 565 - }, - { - "epoch": 25.727272727272727, - "grad_norm": 0.013645520433783531, - "learning_rate": 8.092105263157896e-06, - "loss": 0.0016, - "step": 566 - }, - { - "epoch": 25.727272727272727, - "eval_loss": 0.000924033869523555, - "eval_runtime": 0.2295, - "eval_samples_per_second": 383.49, - "eval_steps_per_second": 47.936, - "step": 566 - }, - { - "epoch": 25.772727272727273, - "grad_norm": 0.013325618579983711, - "learning_rate": 8.052631578947368e-06, - "loss": 0.0016, - "step": 567 - }, - { - "epoch": 25.772727272727273, - "eval_loss": 0.0009229186689481139, - "eval_runtime": 0.2286, - "eval_samples_per_second": 384.951, - "eval_steps_per_second": 48.119, - "step": 567 - }, - { - "epoch": 25.818181818181817, - "grad_norm": 0.013046055100858212, - "learning_rate": 8.013157894736842e-06, - "loss": 0.0015, - "step": 568 - }, - { - "epoch": 25.818181818181817, - "eval_loss": 0.0009218386840075254, - "eval_runtime": 0.2278, - "eval_samples_per_second": 386.339, - "eval_steps_per_second": 48.292, - "step": 568 - }, - { - "epoch": 25.863636363636363, - "grad_norm": 0.014013804495334625, - "learning_rate": 7.973684210526316e-06, - "loss": 0.0015, - "step": 569 - }, - { - "epoch": 25.863636363636363, - "eval_loss": 0.0009208493283949792, - "eval_runtime": 0.239, - "eval_samples_per_second": 368.218, - "eval_steps_per_second": 46.027, - "step": 569 - }, - { - "epoch": 25.90909090909091, - "grad_norm": 0.014438400976359844, - "learning_rate": 7.934210526315789e-06, - "loss": 0.0016, - "step": 570 - }, - { - "epoch": 25.90909090909091, - "eval_loss": 0.0009198287734761834, - "eval_runtime": 0.2403, - "eval_samples_per_second": 366.205, - "eval_steps_per_second": 45.776, - "step": 570 - }, - { - "epoch": 25.954545454545453, - "grad_norm": 0.013837904669344425, - "learning_rate": 7.894736842105263e-06, - "loss": 0.0016, - "step": 571 - }, - { - "epoch": 25.954545454545453, - "eval_loss": 0.0009188164258375764, - "eval_runtime": 0.2295, - "eval_samples_per_second": 383.499, - "eval_steps_per_second": 47.937, - "step": 571 - }, - { - "epoch": 26.0, - "grad_norm": 0.014442033134400845, - "learning_rate": 7.855263157894737e-06, - "loss": 0.0015, - "step": 572 - }, - { - "epoch": 26.0, - "eval_loss": 0.0009178462787531316, - "eval_runtime": 0.2369, - "eval_samples_per_second": 371.428, - "eval_steps_per_second": 46.428, - "step": 572 - }, - { - "epoch": 26.045454545454547, - "grad_norm": 0.01597905345261097, - "learning_rate": 7.81578947368421e-06, - "loss": 0.0016, - "step": 573 - }, - { - "epoch": 26.045454545454547, - "eval_loss": 0.000916794640943408, - "eval_runtime": 0.2272, - "eval_samples_per_second": 387.243, - "eval_steps_per_second": 48.405, - "step": 573 - }, - { - "epoch": 26.09090909090909, - "grad_norm": 0.014845073223114014, - "learning_rate": 7.776315789473684e-06, - "loss": 0.0016, - "step": 574 - }, - { - "epoch": 26.09090909090909, - "eval_loss": 0.0009157375898212194, - "eval_runtime": 0.2356, - "eval_samples_per_second": 373.503, - "eval_steps_per_second": 46.688, - "step": 574 - }, - { - "epoch": 26.136363636363637, - "grad_norm": 0.016282513737678528, - "learning_rate": 7.73684210526316e-06, - "loss": 0.0016, - "step": 575 - }, - { - "epoch": 26.136363636363637, - "eval_loss": 0.0009147171513177454, - "eval_runtime": 0.232, - "eval_samples_per_second": 379.38, - "eval_steps_per_second": 47.422, - "step": 575 - }, - { - "epoch": 26.181818181818183, - "grad_norm": 0.01518057007342577, - "learning_rate": 7.697368421052632e-06, - "loss": 0.0016, - "step": 576 - }, - { - "epoch": 26.181818181818183, - "eval_loss": 0.0009137062006630003, - "eval_runtime": 0.2426, - "eval_samples_per_second": 362.715, - "eval_steps_per_second": 45.339, - "step": 576 - }, - { - "epoch": 26.227272727272727, - "grad_norm": 0.014094051904976368, - "learning_rate": 7.657894736842106e-06, - "loss": 0.0016, - "step": 577 - }, - { - "epoch": 26.227272727272727, - "eval_loss": 0.0009126991499215364, - "eval_runtime": 0.2293, - "eval_samples_per_second": 383.817, - "eval_steps_per_second": 47.977, - "step": 577 - }, - { - "epoch": 26.272727272727273, - "grad_norm": 0.013502271845936775, - "learning_rate": 7.6184210526315794e-06, - "loss": 0.0015, - "step": 578 - }, - { - "epoch": 26.272727272727273, - "eval_loss": 0.0009116692817769945, - "eval_runtime": 0.2603, - "eval_samples_per_second": 338.068, - "eval_steps_per_second": 42.258, - "step": 578 - }, - { - "epoch": 26.318181818181817, - "grad_norm": 0.01577981747686863, - "learning_rate": 7.578947368421053e-06, - "loss": 0.0016, - "step": 579 - }, - { - "epoch": 26.318181818181817, - "eval_loss": 0.0009106568759307265, - "eval_runtime": 0.2284, - "eval_samples_per_second": 385.212, - "eval_steps_per_second": 48.151, - "step": 579 - }, - { - "epoch": 26.363636363636363, - "grad_norm": 0.013350007124245167, - "learning_rate": 7.539473684210527e-06, - "loss": 0.0016, - "step": 580 - }, - { - "epoch": 26.363636363636363, - "eval_loss": 0.0009096513967961073, - "eval_runtime": 0.251, - "eval_samples_per_second": 350.661, - "eval_steps_per_second": 43.833, - "step": 580 - }, - { - "epoch": 26.40909090909091, - "grad_norm": 0.013078941963613033, - "learning_rate": 7.5e-06, - "loss": 0.0014, - "step": 581 - }, - { - "epoch": 26.40909090909091, - "eval_loss": 0.000908670190256089, - "eval_runtime": 0.2388, - "eval_samples_per_second": 368.458, - "eval_steps_per_second": 46.057, - "step": 581 - }, - { - "epoch": 26.454545454545453, - "grad_norm": 0.013791137374937534, - "learning_rate": 7.4605263157894735e-06, - "loss": 0.0015, - "step": 582 - }, - { - "epoch": 26.454545454545453, - "eval_loss": 0.000907672569155693, - "eval_runtime": 0.242, - "eval_samples_per_second": 363.581, - "eval_steps_per_second": 45.448, - "step": 582 - }, - { - "epoch": 26.5, - "grad_norm": 0.015615719370543957, - "learning_rate": 7.421052631578948e-06, - "loss": 0.0017, - "step": 583 - }, - { - "epoch": 26.5, - "eval_loss": 0.0009066305938176811, - "eval_runtime": 0.2567, - "eval_samples_per_second": 342.844, - "eval_steps_per_second": 42.856, - "step": 583 - }, - { - "epoch": 26.545454545454547, - "grad_norm": 0.015224572271108627, - "learning_rate": 7.381578947368421e-06, - "loss": 0.0016, - "step": 584 - }, - { - "epoch": 26.545454545454547, - "eval_loss": 0.000905528839211911, - "eval_runtime": 0.2456, - "eval_samples_per_second": 358.266, - "eval_steps_per_second": 44.783, - "step": 584 - }, - { - "epoch": 26.59090909090909, - "grad_norm": 0.015507878735661507, - "learning_rate": 7.342105263157895e-06, - "loss": 0.0016, - "step": 585 - }, - { - "epoch": 26.59090909090909, - "eval_loss": 0.0009044149774126709, - "eval_runtime": 0.2492, - "eval_samples_per_second": 353.074, - "eval_steps_per_second": 44.134, - "step": 585 - }, - { - "epoch": 26.636363636363637, - "grad_norm": 0.012780736200511456, - "learning_rate": 7.302631578947368e-06, - "loss": 0.0015, - "step": 586 - }, - { - "epoch": 26.636363636363637, - "eval_loss": 0.0009033335372805595, - "eval_runtime": 0.2468, - "eval_samples_per_second": 356.58, - "eval_steps_per_second": 44.572, - "step": 586 - }, - { - "epoch": 26.681818181818183, - "grad_norm": 0.014048571698367596, - "learning_rate": 7.2631578947368426e-06, - "loss": 0.0015, - "step": 587 - }, - { - "epoch": 26.681818181818183, - "eval_loss": 0.0009022265439853072, - "eval_runtime": 0.2552, - "eval_samples_per_second": 344.851, - "eval_steps_per_second": 43.106, - "step": 587 - }, - { - "epoch": 26.727272727272727, - "grad_norm": 0.015583625994622707, - "learning_rate": 7.223684210526316e-06, - "loss": 0.0017, - "step": 588 - }, - { - "epoch": 26.727272727272727, - "eval_loss": 0.0009011449874378741, - "eval_runtime": 0.2308, - "eval_samples_per_second": 381.278, - "eval_steps_per_second": 47.66, - "step": 588 - }, - { - "epoch": 26.772727272727273, - "grad_norm": 0.01401633583009243, - "learning_rate": 7.184210526315789e-06, - "loss": 0.0015, - "step": 589 - }, - { - "epoch": 26.772727272727273, - "eval_loss": 0.0009001016733236611, - "eval_runtime": 0.2374, - "eval_samples_per_second": 370.679, - "eval_steps_per_second": 46.335, - "step": 589 - }, - { - "epoch": 26.818181818181817, - "grad_norm": 0.01262589916586876, - "learning_rate": 7.144736842105263e-06, - "loss": 0.0015, - "step": 590 - }, - { - "epoch": 26.818181818181817, - "eval_loss": 0.0008990716305561364, - "eval_runtime": 0.2399, - "eval_samples_per_second": 366.822, - "eval_steps_per_second": 45.853, - "step": 590 - }, - { - "epoch": 26.863636363636363, - "grad_norm": 0.015306267887353897, - "learning_rate": 7.105263157894737e-06, - "loss": 0.0016, - "step": 591 - }, - { - "epoch": 26.863636363636363, - "eval_loss": 0.0008980457205325365, - "eval_runtime": 0.2286, - "eval_samples_per_second": 385.033, - "eval_steps_per_second": 48.129, - "step": 591 - }, - { - "epoch": 26.90909090909091, - "grad_norm": 0.014178605750203133, - "learning_rate": 7.065789473684211e-06, - "loss": 0.0016, - "step": 592 - }, - { - "epoch": 26.90909090909091, - "eval_loss": 0.0008970522903837264, - "eval_runtime": 0.2364, - "eval_samples_per_second": 372.229, - "eval_steps_per_second": 46.529, - "step": 592 - }, - { - "epoch": 26.954545454545453, - "grad_norm": 0.013244709931313992, - "learning_rate": 7.026315789473685e-06, - "loss": 0.0016, - "step": 593 - }, - { - "epoch": 26.954545454545453, - "eval_loss": 0.0008960642153397202, - "eval_runtime": 0.2462, - "eval_samples_per_second": 357.44, - "eval_steps_per_second": 44.68, - "step": 593 - }, - { - "epoch": 27.0, - "grad_norm": 0.012383348308503628, - "learning_rate": 6.986842105263158e-06, - "loss": 0.0014, - "step": 594 - }, - { - "epoch": 27.0, - "eval_loss": 0.0008951277122832835, - "eval_runtime": 0.2326, - "eval_samples_per_second": 378.306, - "eval_steps_per_second": 47.288, - "step": 594 - }, - { - "epoch": 27.045454545454547, - "grad_norm": 0.011418252252042294, - "learning_rate": 6.9473684210526315e-06, - "loss": 0.0014, - "step": 595 - }, - { - "epoch": 27.045454545454547, - "eval_loss": 0.0008942168205976486, - "eval_runtime": 0.2431, - "eval_samples_per_second": 362.037, - "eval_steps_per_second": 45.255, - "step": 595 - }, - { - "epoch": 27.09090909090909, - "grad_norm": 0.013398653827607632, - "learning_rate": 6.907894736842106e-06, - "loss": 0.0014, - "step": 596 - }, - { - "epoch": 27.09090909090909, - "eval_loss": 0.0008933371282182634, - "eval_runtime": 0.2375, - "eval_samples_per_second": 370.507, - "eval_steps_per_second": 46.313, - "step": 596 - }, - { - "epoch": 27.136363636363637, - "grad_norm": 0.013324232771992683, - "learning_rate": 6.868421052631579e-06, - "loss": 0.0014, - "step": 597 - }, - { - "epoch": 27.136363636363637, - "eval_loss": 0.0008924913126975298, - "eval_runtime": 0.2409, - "eval_samples_per_second": 365.308, - "eval_steps_per_second": 45.663, - "step": 597 - }, - { - "epoch": 27.181818181818183, - "grad_norm": 0.014774598181247711, - "learning_rate": 6.828947368421053e-06, - "loss": 0.0016, - "step": 598 - }, - { - "epoch": 27.181818181818183, - "eval_loss": 0.0008916006772778928, - "eval_runtime": 0.2374, - "eval_samples_per_second": 370.613, - "eval_steps_per_second": 46.327, - "step": 598 - }, - { - "epoch": 27.227272727272727, - "grad_norm": 0.015260329470038414, - "learning_rate": 6.7894736842105264e-06, - "loss": 0.0016, - "step": 599 - }, - { - "epoch": 27.227272727272727, - "eval_loss": 0.0008907453739084303, - "eval_runtime": 0.2427, - "eval_samples_per_second": 362.645, - "eval_steps_per_second": 45.331, - "step": 599 - }, - { - "epoch": 27.272727272727273, - "grad_norm": 0.01440617348998785, - "learning_rate": 6.750000000000001e-06, - "loss": 0.0016, - "step": 600 - }, - { - "epoch": 27.272727272727273, - "eval_loss": 0.0008899224339984357, - "eval_runtime": 0.2506, - "eval_samples_per_second": 351.14, - "eval_steps_per_second": 43.892, - "step": 600 - }, - { - "epoch": 27.318181818181817, - "grad_norm": 0.0139328483492136, - "learning_rate": 6.710526315789474e-06, - "loss": 0.0015, - "step": 601 - }, - { - "epoch": 27.318181818181817, - "eval_loss": 0.0008891185279935598, - "eval_runtime": 0.223, - "eval_samples_per_second": 394.603, - "eval_steps_per_second": 49.325, - "step": 601 - }, - { - "epoch": 27.363636363636363, - "grad_norm": 0.014009720645844936, - "learning_rate": 6.671052631578947e-06, - "loss": 0.0015, - "step": 602 - }, - { - "epoch": 27.363636363636363, - "eval_loss": 0.0008883295231498778, - "eval_runtime": 0.2262, - "eval_samples_per_second": 389.114, - "eval_steps_per_second": 48.639, - "step": 602 - }, - { - "epoch": 27.40909090909091, - "grad_norm": 0.014640220440924168, - "learning_rate": 6.631578947368421e-06, - "loss": 0.0016, - "step": 603 - }, - { - "epoch": 27.40909090909091, - "eval_loss": 0.0008875647909007967, - "eval_runtime": 0.2259, - "eval_samples_per_second": 389.586, - "eval_steps_per_second": 48.698, - "step": 603 - }, - { - "epoch": 27.454545454545453, - "grad_norm": 0.012875789776444435, - "learning_rate": 6.592105263157895e-06, - "loss": 0.0014, - "step": 604 - }, - { - "epoch": 27.454545454545453, - "eval_loss": 0.0008868000004440546, - "eval_runtime": 0.2267, - "eval_samples_per_second": 388.239, - "eval_steps_per_second": 48.53, - "step": 604 - }, - { - "epoch": 27.5, - "grad_norm": 0.012748241424560547, - "learning_rate": 6.552631578947369e-06, - "loss": 0.0014, - "step": 605 - }, - { - "epoch": 27.5, - "eval_loss": 0.0008860474918037653, - "eval_runtime": 0.2273, - "eval_samples_per_second": 387.108, - "eval_steps_per_second": 48.388, - "step": 605 - }, - { - "epoch": 27.545454545454547, - "grad_norm": 0.015082623809576035, - "learning_rate": 6.513157894736842e-06, - "loss": 0.0016, - "step": 606 - }, - { - "epoch": 27.545454545454547, - "eval_loss": 0.0008852502796798944, - "eval_runtime": 0.2413, - "eval_samples_per_second": 364.656, - "eval_steps_per_second": 45.582, - "step": 606 - }, - { - "epoch": 27.59090909090909, - "grad_norm": 0.012016087770462036, - "learning_rate": 6.473684210526316e-06, - "loss": 0.0014, - "step": 607 - }, - { - "epoch": 27.59090909090909, - "eval_loss": 0.0008844301337376237, - "eval_runtime": 0.2344, - "eval_samples_per_second": 375.37, - "eval_steps_per_second": 46.921, - "step": 607 - }, - { - "epoch": 27.636363636363637, - "grad_norm": 0.013424508273601532, - "learning_rate": 6.4342105263157896e-06, - "loss": 0.0014, - "step": 608 - }, - { - "epoch": 27.636363636363637, - "eval_loss": 0.0008835734915919602, - "eval_runtime": 0.2456, - "eval_samples_per_second": 358.327, - "eval_steps_per_second": 44.791, - "step": 608 - }, - { - "epoch": 27.681818181818183, - "grad_norm": 0.014258569106459618, - "learning_rate": 6.394736842105263e-06, - "loss": 0.0016, - "step": 609 - }, - { - "epoch": 27.681818181818183, - "eval_loss": 0.0008827546262182295, - "eval_runtime": 0.2293, - "eval_samples_per_second": 383.729, - "eval_steps_per_second": 47.966, - "step": 609 - }, - { - "epoch": 27.727272727272727, - "grad_norm": 0.012304065749049187, - "learning_rate": 6.355263157894737e-06, - "loss": 0.0014, - "step": 610 - }, - { - "epoch": 27.727272727272727, - "eval_loss": 0.0008819656213745475, - "eval_runtime": 0.2293, - "eval_samples_per_second": 383.825, - "eval_steps_per_second": 47.978, - "step": 610 - }, - { - "epoch": 27.772727272727273, - "grad_norm": 0.01459804829210043, - "learning_rate": 6.31578947368421e-06, - "loss": 0.0016, - "step": 611 - }, - { - "epoch": 27.772727272727273, - "eval_loss": 0.000881133193615824, - "eval_runtime": 0.2354, - "eval_samples_per_second": 373.888, - "eval_steps_per_second": 46.736, - "step": 611 - }, - { - "epoch": 27.818181818181817, - "grad_norm": 0.013015978038311005, - "learning_rate": 6.2763157894736845e-06, - "loss": 0.0014, - "step": 612 - }, - { - "epoch": 27.818181818181817, - "eval_loss": 0.0008803331875242293, - "eval_runtime": 0.267, - "eval_samples_per_second": 329.599, - "eval_steps_per_second": 41.2, - "step": 612 - }, - { - "epoch": 27.863636363636363, - "grad_norm": 0.013901845552027225, - "learning_rate": 6.236842105263159e-06, - "loss": 0.0016, - "step": 613 - }, - { - "epoch": 27.863636363636363, - "eval_loss": 0.0008795224712230265, - "eval_runtime": 0.2596, - "eval_samples_per_second": 339.008, - "eval_steps_per_second": 42.376, - "step": 613 - }, - { - "epoch": 27.90909090909091, - "grad_norm": 0.012065750546753407, - "learning_rate": 6.197368421052632e-06, - "loss": 0.0014, - "step": 614 - }, - { - "epoch": 27.90909090909091, - "eval_loss": 0.0008787267142906785, - "eval_runtime": 0.2638, - "eval_samples_per_second": 333.543, - "eval_steps_per_second": 41.693, - "step": 614 - }, - { - "epoch": 27.954545454545453, - "grad_norm": 0.013637811876833439, - "learning_rate": 6.157894736842105e-06, - "loss": 0.0016, - "step": 615 - }, - { - "epoch": 27.954545454545453, - "eval_loss": 0.0008779308409430087, - "eval_runtime": 0.2586, - "eval_samples_per_second": 340.291, - "eval_steps_per_second": 42.536, - "step": 615 - }, - { - "epoch": 28.0, - "grad_norm": 0.012989726848900318, - "learning_rate": 6.118421052631579e-06, - "loss": 0.0015, - "step": 616 - }, - { - "epoch": 28.0, - "eval_loss": 0.0008771241991780698, - "eval_runtime": 0.2429, - "eval_samples_per_second": 362.255, - "eval_steps_per_second": 45.282, - "step": 616 - }, - { - "epoch": 28.045454545454547, - "grad_norm": 0.011249346658587456, - "learning_rate": 6.078947368421053e-06, - "loss": 0.0013, - "step": 617 - }, - { - "epoch": 28.045454545454547, - "eval_loss": 0.0008763446821831167, - "eval_runtime": 0.2419, - "eval_samples_per_second": 363.721, - "eval_steps_per_second": 45.465, - "step": 617 - }, - { - "epoch": 28.09090909090909, - "grad_norm": 0.013492336496710777, - "learning_rate": 6.039473684210526e-06, - "loss": 0.0016, - "step": 618 - }, - { - "epoch": 28.09090909090909, - "eval_loss": 0.000875540659762919, - "eval_runtime": 0.2616, - "eval_samples_per_second": 336.357, - "eval_steps_per_second": 42.045, - "step": 618 - }, - { - "epoch": 28.136363636363637, - "grad_norm": 0.013201452791690826, - "learning_rate": 6e-06, - "loss": 0.0014, - "step": 619 - }, - { - "epoch": 28.136363636363637, - "eval_loss": 0.0008747638785280287, - "eval_runtime": 0.2332, - "eval_samples_per_second": 377.308, - "eval_steps_per_second": 47.163, - "step": 619 - }, - { - "epoch": 28.181818181818183, - "grad_norm": 0.012346605770289898, - "learning_rate": 5.960526315789474e-06, - "loss": 0.0015, - "step": 620 - }, - { - "epoch": 28.181818181818183, - "eval_loss": 0.0008740072953514755, - "eval_runtime": 0.2297, - "eval_samples_per_second": 383.134, - "eval_steps_per_second": 47.892, - "step": 620 - }, - { - "epoch": 28.227272727272727, - "grad_norm": 0.013474266044795513, - "learning_rate": 5.921052631578948e-06, - "loss": 0.0015, - "step": 621 - }, - { - "epoch": 28.227272727272727, - "eval_loss": 0.0008732505375519395, - "eval_runtime": 0.2261, - "eval_samples_per_second": 389.249, - "eval_steps_per_second": 48.656, - "step": 621 - }, - { - "epoch": 28.272727272727273, - "grad_norm": 0.011779211461544037, - "learning_rate": 5.881578947368421e-06, - "loss": 0.0013, - "step": 622 - }, - { - "epoch": 28.272727272727273, - "eval_loss": 0.0008725319639779627, - "eval_runtime": 0.2358, - "eval_samples_per_second": 373.257, - "eval_steps_per_second": 46.657, - "step": 622 - }, - { - "epoch": 28.318181818181817, - "grad_norm": 0.01458238996565342, - "learning_rate": 5.842105263157895e-06, - "loss": 0.0015, - "step": 623 - }, - { - "epoch": 28.318181818181817, - "eval_loss": 0.0008718472090549767, - "eval_runtime": 0.2469, - "eval_samples_per_second": 356.442, - "eval_steps_per_second": 44.555, - "step": 623 - }, - { - "epoch": 28.363636363636363, - "grad_norm": 0.013492444530129433, - "learning_rate": 5.802631578947368e-06, - "loss": 0.0015, - "step": 624 - }, - { - "epoch": 28.363636363636363, - "eval_loss": 0.0008711445843800902, - "eval_runtime": 0.2339, - "eval_samples_per_second": 376.299, - "eval_steps_per_second": 47.037, - "step": 624 - }, - { - "epoch": 28.40909090909091, - "grad_norm": 0.016801927238702774, - "learning_rate": 5.763157894736842e-06, - "loss": 0.0016, - "step": 625 - }, - { - "epoch": 28.40909090909091, - "eval_loss": 0.0008704178035259247, - "eval_runtime": 0.2467, - "eval_samples_per_second": 356.761, - "eval_steps_per_second": 44.595, - "step": 625 - }, - { - "epoch": 28.454545454545453, - "grad_norm": 0.01472269557416439, - "learning_rate": 5.723684210526316e-06, - "loss": 0.0015, - "step": 626 - }, - { - "epoch": 28.454545454545453, - "eval_loss": 0.0008697099983692169, - "eval_runtime": 0.2361, - "eval_samples_per_second": 372.695, - "eval_steps_per_second": 46.587, - "step": 626 - }, - { - "epoch": 28.5, - "grad_norm": 0.012456816621124744, - "learning_rate": 5.68421052631579e-06, - "loss": 0.0014, - "step": 627 - }, - { - "epoch": 28.5, - "eval_loss": 0.0008690251270309091, - "eval_runtime": 0.227, - "eval_samples_per_second": 387.675, - "eval_steps_per_second": 48.459, - "step": 627 - }, - { - "epoch": 28.545454545454547, - "grad_norm": 0.010930378921329975, - "learning_rate": 5.644736842105263e-06, - "loss": 0.0013, - "step": 628 - }, - { - "epoch": 28.545454545454547, - "eval_loss": 0.0008683226769790053, - "eval_runtime": 0.2396, - "eval_samples_per_second": 367.217, - "eval_steps_per_second": 45.902, - "step": 628 - }, - { - "epoch": 28.59090909090909, - "grad_norm": 0.013773776590824127, - "learning_rate": 5.605263157894737e-06, - "loss": 0.0016, - "step": 629 - }, - { - "epoch": 28.59090909090909, - "eval_loss": 0.0008676418801769614, - "eval_runtime": 0.2255, - "eval_samples_per_second": 390.204, - "eval_steps_per_second": 48.776, - "step": 629 - }, - { - "epoch": 28.636363636363637, - "grad_norm": 0.01485821045935154, - "learning_rate": 5.565789473684211e-06, - "loss": 0.0015, - "step": 630 - }, - { - "epoch": 28.636363636363637, - "eval_loss": 0.0008669787785038352, - "eval_runtime": 0.238, - "eval_samples_per_second": 369.806, - "eval_steps_per_second": 46.226, - "step": 630 - }, - { - "epoch": 28.681818181818183, - "grad_norm": 0.012882347218692303, - "learning_rate": 5.526315789473684e-06, - "loss": 0.0015, - "step": 631 - }, - { - "epoch": 28.681818181818183, - "eval_loss": 0.0008663006592541933, - "eval_runtime": 0.2392, - "eval_samples_per_second": 367.945, - "eval_steps_per_second": 45.993, - "step": 631 - }, - { - "epoch": 28.727272727272727, - "grad_norm": 0.013756033033132553, - "learning_rate": 5.486842105263158e-06, - "loss": 0.0015, - "step": 632 - }, - { - "epoch": 28.727272727272727, - "eval_loss": 0.0008656617719680071, - "eval_runtime": 0.2392, - "eval_samples_per_second": 367.897, - "eval_steps_per_second": 45.987, - "step": 632 - }, - { - "epoch": 28.772727272727273, - "grad_norm": 0.011964356526732445, - "learning_rate": 5.447368421052632e-06, - "loss": 0.0014, - "step": 633 - }, - { - "epoch": 28.772727272727273, - "eval_loss": 0.0008649809169583023, - "eval_runtime": 0.2416, - "eval_samples_per_second": 364.235, - "eval_steps_per_second": 45.529, - "step": 633 - }, - { - "epoch": 28.818181818181817, - "grad_norm": 0.014426548965275288, - "learning_rate": 5.407894736842106e-06, - "loss": 0.0015, - "step": 634 - }, - { - "epoch": 28.818181818181817, - "eval_loss": 0.0008642975008115172, - "eval_runtime": 0.2426, - "eval_samples_per_second": 362.673, - "eval_steps_per_second": 45.334, - "step": 634 - }, - { - "epoch": 28.863636363636363, - "grad_norm": 0.013472221791744232, - "learning_rate": 5.368421052631579e-06, - "loss": 0.0014, - "step": 635 - }, - { - "epoch": 28.863636363636363, - "eval_loss": 0.0008636031416244805, - "eval_runtime": 0.2517, - "eval_samples_per_second": 349.684, - "eval_steps_per_second": 43.711, - "step": 635 - }, - { - "epoch": 28.90909090909091, - "grad_norm": 0.012157904915511608, - "learning_rate": 5.328947368421053e-06, - "loss": 0.0014, - "step": 636 - }, - { - "epoch": 28.90909090909091, - "eval_loss": 0.000862881715875119, - "eval_runtime": 0.2369, - "eval_samples_per_second": 371.509, - "eval_steps_per_second": 46.439, - "step": 636 - }, - { - "epoch": 28.954545454545453, - "grad_norm": 0.012409983202815056, - "learning_rate": 5.289473684210526e-06, - "loss": 0.0014, - "step": 637 - }, - { - "epoch": 28.954545454545453, - "eval_loss": 0.0008621684974059463, - "eval_runtime": 0.2465, - "eval_samples_per_second": 357.054, - "eval_steps_per_second": 44.632, - "step": 637 - }, - { - "epoch": 29.0, - "grad_norm": 0.013315846212208271, - "learning_rate": 5.25e-06, - "loss": 0.0015, - "step": 638 - }, - { - "epoch": 29.0, - "eval_loss": 0.0008614835678599775, - "eval_runtime": 0.2407, - "eval_samples_per_second": 365.586, - "eval_steps_per_second": 45.698, - "step": 638 - }, - { - "epoch": 29.045454545454547, - "grad_norm": 0.015236815437674522, - "learning_rate": 5.210526315789474e-06, - "loss": 0.0016, - "step": 639 - }, - { - "epoch": 29.045454545454547, - "eval_loss": 0.0008607918862253428, - "eval_runtime": 0.2362, - "eval_samples_per_second": 372.636, - "eval_steps_per_second": 46.579, - "step": 639 - }, - { - "epoch": 29.09090909090909, - "grad_norm": 0.01497814990580082, - "learning_rate": 5.171052631578948e-06, - "loss": 0.0015, - "step": 640 - }, - { - "epoch": 29.09090909090909, - "eval_loss": 0.0008601464214734733, - "eval_runtime": 0.2513, - "eval_samples_per_second": 350.225, - "eval_steps_per_second": 43.778, - "step": 640 - }, - { - "epoch": 29.136363636363637, - "grad_norm": 0.010525020770728588, - "learning_rate": 5.131578947368421e-06, - "loss": 0.0013, - "step": 641 - }, - { - "epoch": 29.136363636363637, - "eval_loss": 0.0008594872197136283, - "eval_runtime": 0.2472, - "eval_samples_per_second": 355.947, - "eval_steps_per_second": 44.493, - "step": 641 - }, - { - "epoch": 29.181818181818183, - "grad_norm": 0.012257490307092667, - "learning_rate": 5.092105263157895e-06, - "loss": 0.0014, - "step": 642 - }, - { - "epoch": 29.181818181818183, - "eval_loss": 0.0008588552009314299, - "eval_runtime": 0.2514, - "eval_samples_per_second": 350.01, - "eval_steps_per_second": 43.751, - "step": 642 - }, - { - "epoch": 29.227272727272727, - "grad_norm": 0.016379721462726593, - "learning_rate": 5.052631578947369e-06, - "loss": 0.0016, - "step": 643 - }, - { - "epoch": 29.227272727272727, - "eval_loss": 0.0008582230657339096, - "eval_runtime": 0.2421, - "eval_samples_per_second": 363.525, - "eval_steps_per_second": 45.441, - "step": 643 - }, - { - "epoch": 29.272727272727273, - "grad_norm": 0.013389473780989647, - "learning_rate": 5.013157894736842e-06, - "loss": 0.0014, - "step": 644 - }, - { - "epoch": 29.272727272727273, - "eval_loss": 0.0008576181135140359, - "eval_runtime": 0.2837, - "eval_samples_per_second": 310.222, - "eval_steps_per_second": 38.778, - "step": 644 - }, - { - "epoch": 29.318181818181817, - "grad_norm": 0.011728441342711449, - "learning_rate": 4.973684210526315e-06, - "loss": 0.0014, - "step": 645 - }, - { - "epoch": 29.318181818181817, - "eval_loss": 0.0008570144418627024, - "eval_runtime": 0.3144, - "eval_samples_per_second": 279.869, - "eval_steps_per_second": 34.984, - "step": 645 - }, - { - "epoch": 29.363636363636363, - "grad_norm": 0.014150052331387997, - "learning_rate": 4.9342105263157895e-06, - "loss": 0.0015, - "step": 646 - }, - { - "epoch": 29.363636363636363, - "eval_loss": 0.0008564500021748245, - "eval_runtime": 0.2427, - "eval_samples_per_second": 362.611, - "eval_steps_per_second": 45.326, - "step": 646 - }, - { - "epoch": 29.40909090909091, - "grad_norm": 0.012562847696244717, - "learning_rate": 4.894736842105264e-06, - "loss": 0.0015, - "step": 647 - }, - { - "epoch": 29.40909090909091, - "eval_loss": 0.0008558626868762076, - "eval_runtime": 0.2385, - "eval_samples_per_second": 368.954, - "eval_steps_per_second": 46.119, - "step": 647 - }, - { - "epoch": 29.454545454545453, - "grad_norm": 0.01115860603749752, - "learning_rate": 4.855263157894737e-06, - "loss": 0.0012, - "step": 648 - }, - { - "epoch": 29.454545454545453, - "eval_loss": 0.000855276535730809, - "eval_runtime": 0.2434, - "eval_samples_per_second": 361.501, - "eval_steps_per_second": 45.188, - "step": 648 - }, - { - "epoch": 29.5, - "grad_norm": 0.014787169173359871, - "learning_rate": 4.81578947368421e-06, - "loss": 0.0015, - "step": 649 - }, - { - "epoch": 29.5, - "eval_loss": 0.0008546687895432115, - "eval_runtime": 0.2404, - "eval_samples_per_second": 366.019, - "eval_steps_per_second": 45.752, - "step": 649 - }, - { - "epoch": 29.545454545454547, - "grad_norm": 0.014013570733368397, - "learning_rate": 4.7763157894736844e-06, - "loss": 0.0014, - "step": 650 - }, - { - "epoch": 29.545454545454547, - "eval_loss": 0.0008540409035049379, - "eval_runtime": 0.2415, - "eval_samples_per_second": 364.376, - "eval_steps_per_second": 45.547, - "step": 650 - }, - { - "epoch": 29.59090909090909, - "grad_norm": 0.013314800336956978, - "learning_rate": 4.736842105263158e-06, - "loss": 0.0015, - "step": 651 - }, - { - "epoch": 29.59090909090909, - "eval_loss": 0.0008533978252671659, - "eval_runtime": 0.2334, - "eval_samples_per_second": 377.055, - "eval_steps_per_second": 47.132, - "step": 651 - }, - { - "epoch": 29.636363636363637, - "grad_norm": 0.011727740988135338, - "learning_rate": 4.697368421052631e-06, - "loss": 0.0014, - "step": 652 - }, - { - "epoch": 29.636363636363637, - "eval_loss": 0.0008527915342710912, - "eval_runtime": 0.2324, - "eval_samples_per_second": 378.693, - "eval_steps_per_second": 47.337, - "step": 652 - }, - { - "epoch": 29.681818181818183, - "grad_norm": 0.014551502652466297, - "learning_rate": 4.657894736842106e-06, - "loss": 0.0016, - "step": 653 - }, - { - "epoch": 29.681818181818183, - "eval_loss": 0.0008521459531039, - "eval_runtime": 0.2274, - "eval_samples_per_second": 387.021, - "eval_steps_per_second": 48.378, - "step": 653 - }, - { - "epoch": 29.727272727272727, - "grad_norm": 0.01226063258945942, - "learning_rate": 4.618421052631579e-06, - "loss": 0.0013, - "step": 654 - }, - { - "epoch": 29.727272727272727, - "eval_loss": 0.0008515057852491736, - "eval_runtime": 0.2277, - "eval_samples_per_second": 386.523, - "eval_steps_per_second": 48.315, - "step": 654 - }, - { - "epoch": 29.772727272727273, - "grad_norm": 0.013769338838756084, - "learning_rate": 4.578947368421053e-06, - "loss": 0.0015, - "step": 655 - }, - { - "epoch": 29.772727272727273, - "eval_loss": 0.0008508588653057814, - "eval_runtime": 0.2246, - "eval_samples_per_second": 391.814, - "eval_steps_per_second": 48.977, - "step": 655 - }, - { - "epoch": 29.818181818181817, - "grad_norm": 0.012221275828778744, - "learning_rate": 4.539473684210527e-06, - "loss": 0.0015, - "step": 656 - }, - { - "epoch": 29.818181818181817, - "eval_loss": 0.0008502537966705859, - "eval_runtime": 0.2288, - "eval_samples_per_second": 384.596, - "eval_steps_per_second": 48.075, - "step": 656 - }, - { - "epoch": 29.863636363636363, - "grad_norm": 0.011863375082612038, - "learning_rate": 4.5e-06, - "loss": 0.0013, - "step": 657 - }, - { - "epoch": 29.863636363636363, - "eval_loss": 0.0008496582740917802, - "eval_runtime": 0.2473, - "eval_samples_per_second": 355.889, - "eval_steps_per_second": 44.486, - "step": 657 - }, - { - "epoch": 29.90909090909091, - "grad_norm": 0.01440768875181675, - "learning_rate": 4.460526315789473e-06, - "loss": 0.0015, - "step": 658 - }, - { - "epoch": 29.90909090909091, - "eval_loss": 0.0008490938926115632, - "eval_runtime": 0.2279, - "eval_samples_per_second": 386.137, - "eval_steps_per_second": 48.267, - "step": 658 - }, - { - "epoch": 29.954545454545453, - "grad_norm": 0.013953134417533875, - "learning_rate": 4.421052631578947e-06, - "loss": 0.0014, - "step": 659 - }, - { - "epoch": 29.954545454545453, - "eval_loss": 0.0008485484286211431, - "eval_runtime": 0.2309, - "eval_samples_per_second": 381.17, - "eval_steps_per_second": 47.646, - "step": 659 - }, - { - "epoch": 30.0, - "grad_norm": 0.012044006027281284, - "learning_rate": 4.381578947368422e-06, - "loss": 0.0014, - "step": 660 - }, - { - "epoch": 30.0, - "eval_loss": 0.0008479988318867981, - "eval_runtime": 0.2301, - "eval_samples_per_second": 382.482, - "eval_steps_per_second": 47.81, - "step": 660 - }, - { - "epoch": 30.045454545454547, - "grad_norm": 0.014352229423820972, - "learning_rate": 4.342105263157895e-06, - "loss": 0.0015, - "step": 661 - }, - { - "epoch": 30.045454545454547, - "eval_loss": 0.0008474763599224389, - "eval_runtime": 0.228, - "eval_samples_per_second": 385.949, - "eval_steps_per_second": 48.244, - "step": 661 - }, - { - "epoch": 30.09090909090909, - "grad_norm": 0.012857983820140362, - "learning_rate": 4.302631578947368e-06, - "loss": 0.0015, - "step": 662 - }, - { - "epoch": 30.09090909090909, - "eval_loss": 0.0008469296153634787, - "eval_runtime": 0.2254, - "eval_samples_per_second": 390.463, - "eval_steps_per_second": 48.808, - "step": 662 - }, - { - "epoch": 30.136363636363637, - "grad_norm": 0.013745253905653954, - "learning_rate": 4.2631578947368425e-06, - "loss": 0.0014, - "step": 663 - }, - { - "epoch": 30.136363636363637, - "eval_loss": 0.0008464112761430442, - "eval_runtime": 0.2615, - "eval_samples_per_second": 336.504, - "eval_steps_per_second": 42.063, - "step": 663 - }, - { - "epoch": 30.181818181818183, - "grad_norm": 0.011542108841240406, - "learning_rate": 4.223684210526316e-06, - "loss": 0.0014, - "step": 664 - }, - { - "epoch": 30.181818181818183, - "eval_loss": 0.0008458928787149489, - "eval_runtime": 0.2363, - "eval_samples_per_second": 372.361, - "eval_steps_per_second": 46.545, - "step": 664 - }, - { - "epoch": 30.227272727272727, - "grad_norm": 0.013680350966751575, - "learning_rate": 4.184210526315789e-06, - "loss": 0.0015, - "step": 665 - }, - { - "epoch": 30.227272727272727, - "eval_loss": 0.0008453825721517205, - "eval_runtime": 0.2422, - "eval_samples_per_second": 363.317, - "eval_steps_per_second": 45.415, - "step": 665 - }, - { - "epoch": 30.272727272727273, - "grad_norm": 0.01278683077543974, - "learning_rate": 4.144736842105263e-06, - "loss": 0.0013, - "step": 666 - }, - { - "epoch": 30.272727272727273, - "eval_loss": 0.0008448913577012718, - "eval_runtime": 0.2274, - "eval_samples_per_second": 386.997, - "eval_steps_per_second": 48.375, - "step": 666 - }, - { - "epoch": 30.318181818181817, - "grad_norm": 0.013793477788567543, - "learning_rate": 4.105263157894737e-06, - "loss": 0.0016, - "step": 667 - }, - { - "epoch": 30.318181818181817, - "eval_loss": 0.0008444040431641042, - "eval_runtime": 0.2383, - "eval_samples_per_second": 369.303, - "eval_steps_per_second": 46.163, - "step": 667 - }, - { - "epoch": 30.363636363636363, - "grad_norm": 0.013766897842288017, - "learning_rate": 4.065789473684211e-06, - "loss": 0.0014, - "step": 668 - }, - { - "epoch": 30.363636363636363, - "eval_loss": 0.0008439045632258058, - "eval_runtime": 0.2472, - "eval_samples_per_second": 355.963, - "eval_steps_per_second": 44.495, - "step": 668 - }, - { - "epoch": 30.40909090909091, - "grad_norm": 0.01388518325984478, - "learning_rate": 4.026315789473684e-06, - "loss": 0.0014, - "step": 669 - }, - { - "epoch": 30.40909090909091, - "eval_loss": 0.0008434146293438971, - "eval_runtime": 0.2555, - "eval_samples_per_second": 344.476, - "eval_steps_per_second": 43.059, - "step": 669 - }, - { - "epoch": 30.454545454545453, - "grad_norm": 0.013302307575941086, - "learning_rate": 3.986842105263158e-06, - "loss": 0.0014, - "step": 670 - }, - { - "epoch": 30.454545454545453, - "eval_loss": 0.0008429314475506544, - "eval_runtime": 0.234, - "eval_samples_per_second": 375.99, - "eval_steps_per_second": 46.999, - "step": 670 - }, - { - "epoch": 30.5, - "grad_norm": 0.015602638944983482, - "learning_rate": 3.9473684210526315e-06, - "loss": 0.0015, - "step": 671 - }, - { - "epoch": 30.5, - "eval_loss": 0.0008424482657574117, - "eval_runtime": 0.2312, - "eval_samples_per_second": 380.69, - "eval_steps_per_second": 47.586, - "step": 671 - }, - { - "epoch": 30.545454545454547, - "grad_norm": 0.012195833958685398, - "learning_rate": 3.907894736842105e-06, - "loss": 0.0014, - "step": 672 - }, - { - "epoch": 30.545454545454547, - "eval_loss": 0.0008419921505264938, - "eval_runtime": 0.2348, - "eval_samples_per_second": 374.848, - "eval_steps_per_second": 46.856, - "step": 672 - }, - { - "epoch": 30.59090909090909, - "grad_norm": 0.012124909088015556, - "learning_rate": 3.86842105263158e-06, - "loss": 0.0014, - "step": 673 - }, - { - "epoch": 30.59090909090909, - "eval_loss": 0.0008415495394729078, - "eval_runtime": 0.2337, - "eval_samples_per_second": 376.514, - "eval_steps_per_second": 47.064, - "step": 673 - }, - { - "epoch": 30.636363636363637, - "grad_norm": 0.012487749569118023, - "learning_rate": 3.828947368421053e-06, - "loss": 0.0014, - "step": 674 - }, - { - "epoch": 30.636363636363637, - "eval_loss": 0.0008411163580603898, - "eval_runtime": 0.2252, - "eval_samples_per_second": 390.686, - "eval_steps_per_second": 48.836, - "step": 674 - }, - { - "epoch": 30.681818181818183, - "grad_norm": 0.013694563880562782, - "learning_rate": 3.7894736842105264e-06, - "loss": 0.0015, - "step": 675 - }, - { - "epoch": 30.681818181818183, - "eval_loss": 0.0008406452834606171, - "eval_runtime": 0.2277, - "eval_samples_per_second": 386.401, - "eval_steps_per_second": 48.3, - "step": 675 - }, - { - "epoch": 30.727272727272727, - "grad_norm": 0.012177863158285618, - "learning_rate": 3.75e-06, - "loss": 0.0015, - "step": 676 - }, - { - "epoch": 30.727272727272727, - "eval_loss": 0.0008401837549172342, - "eval_runtime": 0.2284, - "eval_samples_per_second": 385.297, - "eval_steps_per_second": 48.162, - "step": 676 - }, - { - "epoch": 30.772727272727273, - "grad_norm": 0.011734875850379467, - "learning_rate": 3.710526315789474e-06, - "loss": 0.0013, - "step": 677 - }, - { - "epoch": 30.772727272727273, - "eval_loss": 0.0008397437632083893, - "eval_runtime": 0.2349, - "eval_samples_per_second": 374.661, - "eval_steps_per_second": 46.833, - "step": 677 - }, - { - "epoch": 30.818181818181817, - "grad_norm": 0.012181814759969711, - "learning_rate": 3.6710526315789476e-06, - "loss": 0.0015, - "step": 678 - }, - { - "epoch": 30.818181818181817, - "eval_loss": 0.000839310756418854, - "eval_runtime": 0.2267, - "eval_samples_per_second": 388.249, - "eval_steps_per_second": 48.531, - "step": 678 - }, - { - "epoch": 30.863636363636363, - "grad_norm": 0.014351209625601768, - "learning_rate": 3.6315789473684213e-06, - "loss": 0.0015, - "step": 679 - }, - { - "epoch": 30.863636363636363, - "eval_loss": 0.0008388804271817207, - "eval_runtime": 0.2382, - "eval_samples_per_second": 369.474, - "eval_steps_per_second": 46.184, - "step": 679 - }, - { - "epoch": 30.90909090909091, - "grad_norm": 0.01179533638060093, - "learning_rate": 3.5921052631578946e-06, - "loss": 0.0014, - "step": 680 - }, - { - "epoch": 30.90909090909091, - "eval_loss": 0.0008384499233216047, - "eval_runtime": 0.227, - "eval_samples_per_second": 387.694, - "eval_steps_per_second": 48.462, - "step": 680 - }, - { - "epoch": 30.954545454545453, - "grad_norm": 0.01200299896299839, - "learning_rate": 3.5526315789473683e-06, - "loss": 0.0014, - "step": 681 - }, - { - "epoch": 30.954545454545453, - "eval_loss": 0.0008380439248867333, - "eval_runtime": 0.2357, - "eval_samples_per_second": 373.384, - "eval_steps_per_second": 46.673, - "step": 681 - }, - { - "epoch": 31.0, - "grad_norm": 0.012165653519332409, - "learning_rate": 3.5131578947368425e-06, - "loss": 0.0014, - "step": 682 - }, - { - "epoch": 31.0, - "eval_loss": 0.0008376243058592081, - "eval_runtime": 0.2268, - "eval_samples_per_second": 387.994, - "eval_steps_per_second": 48.499, - "step": 682 - }, - { - "epoch": 31.045454545454547, - "grad_norm": 0.013023504056036472, - "learning_rate": 3.4736842105263158e-06, - "loss": 0.0014, - "step": 683 - }, - { - "epoch": 31.045454545454547, - "eval_loss": 0.0008372208685614169, - "eval_runtime": 0.2408, - "eval_samples_per_second": 365.51, - "eval_steps_per_second": 45.689, - "step": 683 - }, - { - "epoch": 31.09090909090909, - "grad_norm": 0.012478847056627274, - "learning_rate": 3.4342105263157895e-06, - "loss": 0.0015, - "step": 684 - }, - { - "epoch": 31.09090909090909, - "eval_loss": 0.0008367864647880197, - "eval_runtime": 0.2261, - "eval_samples_per_second": 389.221, - "eval_steps_per_second": 48.653, - "step": 684 - }, - { - "epoch": 31.136363636363637, - "grad_norm": 0.011943116784095764, - "learning_rate": 3.3947368421052632e-06, - "loss": 0.0014, - "step": 685 - }, - { - "epoch": 31.136363636363637, - "eval_loss": 0.0008363695815205574, - "eval_runtime": 0.2405, - "eval_samples_per_second": 365.829, - "eval_steps_per_second": 45.729, - "step": 685 - }, - { - "epoch": 31.181818181818183, - "grad_norm": 0.012198768556118011, - "learning_rate": 3.355263157894737e-06, - "loss": 0.0014, - "step": 686 - }, - { - "epoch": 31.181818181818183, - "eval_loss": 0.000835962186101824, - "eval_runtime": 0.2414, - "eval_samples_per_second": 364.526, - "eval_steps_per_second": 45.566, - "step": 686 - }, - { - "epoch": 31.227272727272727, - "grad_norm": 0.012970656156539917, - "learning_rate": 3.3157894736842107e-06, - "loss": 0.0014, - "step": 687 - }, - { - "epoch": 31.227272727272727, - "eval_loss": 0.0008355574682354927, - "eval_runtime": 0.2355, - "eval_samples_per_second": 373.686, - "eval_steps_per_second": 46.711, - "step": 687 - }, - { - "epoch": 31.272727272727273, - "grad_norm": 0.01133756898343563, - "learning_rate": 3.2763157894736844e-06, - "loss": 0.0012, - "step": 688 - }, - { - "epoch": 31.272727272727273, - "eval_loss": 0.0008351581636816263, - "eval_runtime": 0.239, - "eval_samples_per_second": 368.146, - "eval_steps_per_second": 46.018, - "step": 688 - }, - { - "epoch": 31.318181818181817, - "grad_norm": 0.014246292412281036, - "learning_rate": 3.236842105263158e-06, - "loss": 0.0014, - "step": 689 - }, - { - "epoch": 31.318181818181817, - "eval_loss": 0.0008347549010068178, - "eval_runtime": 0.2413, - "eval_samples_per_second": 364.723, - "eval_steps_per_second": 45.59, - "step": 689 - }, - { - "epoch": 31.363636363636363, - "grad_norm": 0.01505040843039751, - "learning_rate": 3.1973684210526314e-06, - "loss": 0.0016, - "step": 690 - }, - { - "epoch": 31.363636363636363, - "eval_loss": 0.0008343501249328256, - "eval_runtime": 0.2321, - "eval_samples_per_second": 379.09, - "eval_steps_per_second": 47.386, - "step": 690 - }, - { - "epoch": 31.40909090909091, - "grad_norm": 0.011749452911317348, - "learning_rate": 3.157894736842105e-06, - "loss": 0.0013, - "step": 691 - }, - { - "epoch": 31.40909090909091, - "eval_loss": 0.0008339481428265572, - "eval_runtime": 0.2656, - "eval_samples_per_second": 331.332, - "eval_steps_per_second": 41.416, - "step": 691 - }, - { - "epoch": 31.454545454545453, - "grad_norm": 0.012921934016048908, - "learning_rate": 3.1184210526315793e-06, - "loss": 0.0015, - "step": 692 - }, - { - "epoch": 31.454545454545453, - "eval_loss": 0.0008335394668392837, - "eval_runtime": 0.2542, - "eval_samples_per_second": 346.242, - "eval_steps_per_second": 43.28, - "step": 692 - }, - { - "epoch": 31.5, - "grad_norm": 0.01331315003335476, - "learning_rate": 3.0789473684210526e-06, - "loss": 0.0014, - "step": 693 - }, - { - "epoch": 31.5, - "eval_loss": 0.0008331468561664224, - "eval_runtime": 0.2417, - "eval_samples_per_second": 364.055, - "eval_steps_per_second": 45.507, - "step": 693 - }, - { - "epoch": 31.545454545454547, - "grad_norm": 0.012770496308803558, - "learning_rate": 3.0394736842105263e-06, - "loss": 0.0015, - "step": 694 - }, - { - "epoch": 31.545454545454547, - "eval_loss": 0.0008327368414029479, - "eval_runtime": 0.2689, - "eval_samples_per_second": 327.265, - "eval_steps_per_second": 40.908, - "step": 694 - }, - { - "epoch": 31.59090909090909, - "grad_norm": 0.012804139405488968, - "learning_rate": 3e-06, - "loss": 0.0014, - "step": 695 - }, - { - "epoch": 31.59090909090909, - "eval_loss": 0.0008323252550326288, - "eval_runtime": 0.2468, - "eval_samples_per_second": 356.61, - "eval_steps_per_second": 44.576, - "step": 695 - }, - { - "epoch": 31.636363636363637, - "grad_norm": 0.014062759466469288, - "learning_rate": 2.960526315789474e-06, - "loss": 0.0015, - "step": 696 - }, - { - "epoch": 31.636363636363637, - "eval_loss": 0.0008318935870192945, - "eval_runtime": 0.2529, - "eval_samples_per_second": 347.95, - "eval_steps_per_second": 43.494, - "step": 696 - }, - { - "epoch": 31.681818181818183, - "grad_norm": 0.013049440458416939, - "learning_rate": 2.9210526315789475e-06, - "loss": 0.0014, - "step": 697 - }, - { - "epoch": 31.681818181818183, - "eval_loss": 0.0008314928272739053, - "eval_runtime": 0.2521, - "eval_samples_per_second": 349.0, - "eval_steps_per_second": 43.625, - "step": 697 - }, - { - "epoch": 31.727272727272727, - "grad_norm": 0.01172225084155798, - "learning_rate": 2.881578947368421e-06, - "loss": 0.0013, - "step": 698 - }, - { - "epoch": 31.727272727272727, - "eval_loss": 0.0008310881094075739, - "eval_runtime": 0.2672, - "eval_samples_per_second": 329.329, - "eval_steps_per_second": 41.166, - "step": 698 - }, - { - "epoch": 31.772727272727273, - "grad_norm": 0.01266531739383936, - "learning_rate": 2.842105263157895e-06, - "loss": 0.0014, - "step": 699 - }, - { - "epoch": 31.772727272727273, - "eval_loss": 0.0008307105163112283, - "eval_runtime": 0.3176, - "eval_samples_per_second": 277.082, - "eval_steps_per_second": 34.635, - "step": 699 - }, - { - "epoch": 31.818181818181817, - "grad_norm": 0.014071842655539513, - "learning_rate": 2.8026315789473683e-06, - "loss": 0.0015, - "step": 700 - }, - { - "epoch": 31.818181818181817, - "eval_loss": 0.0008303424110636115, - "eval_runtime": 0.2648, - "eval_samples_per_second": 332.279, - "eval_steps_per_second": 41.535, - "step": 700 - }, - { - "epoch": 31.863636363636363, - "grad_norm": 0.01333391759544611, - "learning_rate": 2.763157894736842e-06, - "loss": 0.0015, - "step": 701 - }, - { - "epoch": 31.863636363636363, - "eval_loss": 0.0008299809414893389, - "eval_runtime": 0.2429, - "eval_samples_per_second": 362.239, - "eval_steps_per_second": 45.28, - "step": 701 - }, - { - "epoch": 31.90909090909091, - "grad_norm": 0.010583317838609219, - "learning_rate": 2.723684210526316e-06, - "loss": 0.0012, - "step": 702 - }, - { - "epoch": 31.90909090909091, - "eval_loss": 0.0008296439773403108, - "eval_runtime": 0.2463, - "eval_samples_per_second": 357.358, - "eval_steps_per_second": 44.67, - "step": 702 - }, - { - "epoch": 31.954545454545453, - "grad_norm": 0.01122986525297165, - "learning_rate": 2.6842105263157895e-06, - "loss": 0.0013, - "step": 703 - }, - { - "epoch": 31.954545454545453, - "eval_loss": 0.0008293138234876096, - "eval_runtime": 0.2433, - "eval_samples_per_second": 361.652, - "eval_steps_per_second": 45.206, - "step": 703 - }, - { - "epoch": 32.0, - "grad_norm": 0.011437175795435905, - "learning_rate": 2.644736842105263e-06, - "loss": 0.0013, - "step": 704 - }, - { - "epoch": 32.0, - "eval_loss": 0.0008289901888929307, - "eval_runtime": 0.2357, - "eval_samples_per_second": 373.319, - "eval_steps_per_second": 46.665, - "step": 704 - }, - { - "epoch": 32.04545454545455, - "grad_norm": 0.012699670158326626, - "learning_rate": 2.605263157894737e-06, - "loss": 0.0014, - "step": 705 - }, - { - "epoch": 32.04545454545455, - "eval_loss": 0.0008286829688586295, - "eval_runtime": 0.2319, - "eval_samples_per_second": 379.476, - "eval_steps_per_second": 47.435, - "step": 705 - }, - { - "epoch": 32.09090909090909, - "grad_norm": 0.013239861465990543, - "learning_rate": 2.5657894736842107e-06, - "loss": 0.0014, - "step": 706 - }, - { - "epoch": 32.09090909090909, - "eval_loss": 0.0008283716160804033, - "eval_runtime": 0.2319, - "eval_samples_per_second": 379.415, - "eval_steps_per_second": 47.427, - "step": 706 - }, - { - "epoch": 32.13636363636363, - "grad_norm": 0.012133197858929634, - "learning_rate": 2.5263157894736844e-06, - "loss": 0.0013, - "step": 707 - }, - { - "epoch": 32.13636363636363, - "eval_loss": 0.0008280739421024919, - "eval_runtime": 0.2242, - "eval_samples_per_second": 392.558, - "eval_steps_per_second": 49.07, - "step": 707 - }, - { - "epoch": 32.18181818181818, - "grad_norm": 0.011126801371574402, - "learning_rate": 2.4868421052631577e-06, - "loss": 0.0013, - "step": 708 - }, - { - "epoch": 32.18181818181818, - "eval_loss": 0.0008277747547253966, - "eval_runtime": 0.2381, - "eval_samples_per_second": 369.534, - "eval_steps_per_second": 46.192, - "step": 708 - }, - { - "epoch": 32.22727272727273, - "grad_norm": 0.012151258997619152, - "learning_rate": 2.447368421052632e-06, - "loss": 0.0014, - "step": 709 - }, - { - "epoch": 32.22727272727273, - "eval_loss": 0.0008274810388684273, - "eval_runtime": 0.265, - "eval_samples_per_second": 332.045, - "eval_steps_per_second": 41.506, - "step": 709 - }, - { - "epoch": 32.27272727272727, - "grad_norm": 0.013219231739640236, - "learning_rate": 2.407894736842105e-06, - "loss": 0.0014, - "step": 710 - }, - { - "epoch": 32.27272727272727, - "eval_loss": 0.0008271847036667168, - "eval_runtime": 0.2428, - "eval_samples_per_second": 362.463, - "eval_steps_per_second": 45.308, - "step": 710 - }, - { - "epoch": 32.31818181818182, - "grad_norm": 0.010275053791701794, - "learning_rate": 2.368421052631579e-06, - "loss": 0.0012, - "step": 711 - }, - { - "epoch": 32.31818181818182, - "eval_loss": 0.0008268963429145515, - "eval_runtime": 0.2418, - "eval_samples_per_second": 363.953, - "eval_steps_per_second": 45.494, - "step": 711 - }, - { - "epoch": 32.36363636363637, - "grad_norm": 0.013079304248094559, - "learning_rate": 2.328947368421053e-06, - "loss": 0.0014, - "step": 712 - }, - { - "epoch": 32.36363636363637, - "eval_loss": 0.00082661077613011, - "eval_runtime": 0.232, - "eval_samples_per_second": 379.238, - "eval_steps_per_second": 47.405, - "step": 712 - }, - { - "epoch": 32.40909090909091, - "grad_norm": 0.019619744271039963, - "learning_rate": 2.2894736842105263e-06, - "loss": 0.0014, - "step": 713 - }, - { - "epoch": 32.40909090909091, - "eval_loss": 0.0008263156050816178, - "eval_runtime": 0.2626, - "eval_samples_per_second": 335.09, - "eval_steps_per_second": 41.886, - "step": 713 - }, - { - "epoch": 32.45454545454545, - "grad_norm": 0.014103109948337078, - "learning_rate": 2.25e-06, - "loss": 0.0015, - "step": 714 - }, - { - "epoch": 32.45454545454545, - "eval_loss": 0.0008260206668637693, - "eval_runtime": 0.256, - "eval_samples_per_second": 343.813, - "eval_steps_per_second": 42.977, - "step": 714 - }, - { - "epoch": 32.5, - "grad_norm": 0.013360358774662018, - "learning_rate": 2.2105263157894734e-06, - "loss": 0.0015, - "step": 715 - }, - { - "epoch": 32.5, - "eval_loss": 0.0008257552981376648, - "eval_runtime": 0.2719, - "eval_samples_per_second": 323.628, - "eval_steps_per_second": 40.454, - "step": 715 - }, - { - "epoch": 32.54545454545455, - "grad_norm": 0.012335807085037231, - "learning_rate": 2.1710526315789475e-06, - "loss": 0.0014, - "step": 716 - }, - { - "epoch": 32.54545454545455, - "eval_loss": 0.0008254764834418893, - "eval_runtime": 0.257, - "eval_samples_per_second": 342.453, - "eval_steps_per_second": 42.807, - "step": 716 - }, - { - "epoch": 32.59090909090909, - "grad_norm": 0.012738436460494995, - "learning_rate": 2.1315789473684212e-06, - "loss": 0.0014, - "step": 717 - }, - { - "epoch": 32.59090909090909, - "eval_loss": 0.0008252071565948427, - "eval_runtime": 0.2774, - "eval_samples_per_second": 317.284, - "eval_steps_per_second": 39.66, - "step": 717 - }, - { - "epoch": 32.63636363636363, - "grad_norm": 0.011913586407899857, - "learning_rate": 2.0921052631578945e-06, - "loss": 0.0013, - "step": 718 - }, - { - "epoch": 32.63636363636363, - "eval_loss": 0.0008249431848526001, - "eval_runtime": 0.2458, - "eval_samples_per_second": 358.083, - "eval_steps_per_second": 44.76, - "step": 718 - }, - { - "epoch": 32.68181818181818, - "grad_norm": 0.010375920683145523, - "learning_rate": 2.0526315789473687e-06, - "loss": 0.0013, - "step": 719 - }, - { - "epoch": 32.68181818181818, - "eval_loss": 0.0008246820070780814, - "eval_runtime": 0.2548, - "eval_samples_per_second": 345.32, - "eval_steps_per_second": 43.165, - "step": 719 - }, - { - "epoch": 32.72727272727273, - "grad_norm": 0.016080064699053764, - "learning_rate": 2.013157894736842e-06, - "loss": 0.0016, - "step": 720 - }, - { - "epoch": 32.72727272727273, - "eval_loss": 0.0008244179771281779, - "eval_runtime": 0.2695, - "eval_samples_per_second": 326.571, - "eval_steps_per_second": 40.821, - "step": 720 - }, - { - "epoch": 32.77272727272727, - "grad_norm": 0.01252568420022726, - "learning_rate": 1.9736842105263157e-06, - "loss": 0.0013, - "step": 721 - }, - { - "epoch": 32.77272727272727, - "eval_loss": 0.0008241839241236448, - "eval_runtime": 0.2948, - "eval_samples_per_second": 298.515, - "eval_steps_per_second": 37.314, - "step": 721 - }, - { - "epoch": 32.81818181818182, - "grad_norm": 0.012378372251987457, - "learning_rate": 1.93421052631579e-06, - "loss": 0.0014, - "step": 722 - }, - { - "epoch": 32.81818181818182, - "eval_loss": 0.0008239619201049209, - "eval_runtime": 0.2733, - "eval_samples_per_second": 321.958, - "eval_steps_per_second": 40.245, - "step": 722 - }, - { - "epoch": 32.86363636363637, - "grad_norm": 0.013344389386475086, - "learning_rate": 1.8947368421052632e-06, - "loss": 0.0015, - "step": 723 - }, - { - "epoch": 32.86363636363637, - "eval_loss": 0.000823718321043998, - "eval_runtime": 0.2569, - "eval_samples_per_second": 342.55, - "eval_steps_per_second": 42.819, - "step": 723 - }, - { - "epoch": 32.90909090909091, - "grad_norm": 0.012948358431458473, - "learning_rate": 1.855263157894737e-06, - "loss": 0.0015, - "step": 724 - }, - { - "epoch": 32.90909090909091, - "eval_loss": 0.0008235003333538771, - "eval_runtime": 0.2559, - "eval_samples_per_second": 343.901, - "eval_steps_per_second": 42.988, - "step": 724 - }, - { - "epoch": 32.95454545454545, - "grad_norm": 0.011233711615204811, - "learning_rate": 1.8157894736842106e-06, - "loss": 0.0012, - "step": 725 - }, - { - "epoch": 32.95454545454545, - "eval_loss": 0.000823282403871417, - "eval_runtime": 0.2642, - "eval_samples_per_second": 333.059, - "eval_steps_per_second": 41.632, - "step": 725 - }, - { - "epoch": 33.0, - "grad_norm": 0.01327808853238821, - "learning_rate": 1.7763157894736842e-06, - "loss": 0.0015, - "step": 726 - }, - { - "epoch": 33.0, - "eval_loss": 0.0008230686071328819, - "eval_runtime": 0.2841, - "eval_samples_per_second": 309.721, - "eval_steps_per_second": 38.715, - "step": 726 - }, - { - "epoch": 33.04545454545455, - "grad_norm": 0.011662392877042294, - "learning_rate": 1.7368421052631579e-06, - "loss": 0.0014, - "step": 727 - }, - { - "epoch": 33.04545454545455, - "eval_loss": 0.0008228750666603446, - "eval_runtime": 0.249, - "eval_samples_per_second": 353.382, - "eval_steps_per_second": 44.173, - "step": 727 - }, - { - "epoch": 33.09090909090909, - "grad_norm": 0.011290736496448517, - "learning_rate": 1.6973684210526316e-06, - "loss": 0.0013, - "step": 728 - }, - { - "epoch": 33.09090909090909, - "eval_loss": 0.0008226787904277444, - "eval_runtime": 0.2459, - "eval_samples_per_second": 357.906, - "eval_steps_per_second": 44.738, - "step": 728 - }, - { - "epoch": 33.13636363636363, - "grad_norm": 0.011928938329219818, - "learning_rate": 1.6578947368421053e-06, - "loss": 0.0014, - "step": 729 - }, - { - "epoch": 33.13636363636363, - "eval_loss": 0.0008224839111790061, - "eval_runtime": 0.2693, - "eval_samples_per_second": 326.807, - "eval_steps_per_second": 40.851, - "step": 729 - }, - { - "epoch": 33.18181818181818, - "grad_norm": 0.013969271443784237, - "learning_rate": 1.618421052631579e-06, - "loss": 0.0014, - "step": 730 - }, - { - "epoch": 33.18181818181818, - "eval_loss": 0.0008223024778999388, - "eval_runtime": 0.2985, - "eval_samples_per_second": 294.848, - "eval_steps_per_second": 36.856, - "step": 730 - }, - { - "epoch": 33.22727272727273, - "grad_norm": 0.01247771643102169, - "learning_rate": 1.5789473684210526e-06, - "loss": 0.0014, - "step": 731 - }, - { - "epoch": 33.22727272727273, - "eval_loss": 0.0008221129537560046, - "eval_runtime": 0.2564, - "eval_samples_per_second": 343.194, - "eval_steps_per_second": 42.899, - "step": 731 - }, - { - "epoch": 33.27272727272727, - "grad_norm": 0.012111688032746315, - "learning_rate": 1.5394736842105263e-06, - "loss": 0.0013, - "step": 732 - }, - { - "epoch": 33.27272727272727, - "eval_loss": 0.0008219464216381311, - "eval_runtime": 0.2484, - "eval_samples_per_second": 354.308, - "eval_steps_per_second": 44.289, - "step": 732 - }, - { - "epoch": 33.31818181818182, - "grad_norm": 0.01268478948622942, - "learning_rate": 1.5e-06, - "loss": 0.0014, - "step": 733 - }, - { - "epoch": 33.31818181818182, - "eval_loss": 0.0008217745926231146, - "eval_runtime": 0.2938, - "eval_samples_per_second": 299.518, - "eval_steps_per_second": 37.44, - "step": 733 - }, - { - "epoch": 33.36363636363637, - "grad_norm": 0.01151086576282978, - "learning_rate": 1.4605263157894738e-06, - "loss": 0.0012, - "step": 734 - }, - { - "epoch": 33.36363636363637, - "eval_loss": 0.0008215824491344392, - "eval_runtime": 0.2627, - "eval_samples_per_second": 335.021, - "eval_steps_per_second": 41.878, - "step": 734 - }, - { - "epoch": 33.40909090909091, - "grad_norm": 0.012743664905428886, - "learning_rate": 1.4210526315789475e-06, - "loss": 0.0014, - "step": 735 - }, - { - "epoch": 33.40909090909091, - "eval_loss": 0.0008214117842726409, - "eval_runtime": 0.257, - "eval_samples_per_second": 342.469, - "eval_steps_per_second": 42.809, - "step": 735 - }, - { - "epoch": 33.45454545454545, - "grad_norm": 0.014465508982539177, - "learning_rate": 1.381578947368421e-06, - "loss": 0.0015, - "step": 736 - }, - { - "epoch": 33.45454545454545, - "eval_loss": 0.0008212332031689584, - "eval_runtime": 0.2383, - "eval_samples_per_second": 369.294, - "eval_steps_per_second": 46.162, - "step": 736 - }, - { - "epoch": 33.5, - "grad_norm": 0.011136289685964584, - "learning_rate": 1.3421052631578947e-06, - "loss": 0.0013, - "step": 737 - }, - { - "epoch": 33.5, - "eval_loss": 0.0008210748201236129, - "eval_runtime": 0.2515, - "eval_samples_per_second": 349.848, - "eval_steps_per_second": 43.731, - "step": 737 - }, - { - "epoch": 33.54545454545455, - "grad_norm": 0.013279801234602928, - "learning_rate": 1.3026315789473685e-06, - "loss": 0.0014, - "step": 738 - }, - { - "epoch": 33.54545454545455, - "eval_loss": 0.0008209014777094126, - "eval_runtime": 0.2624, - "eval_samples_per_second": 335.412, - "eval_steps_per_second": 41.926, - "step": 738 - }, - { - "epoch": 33.59090909090909, - "grad_norm": 0.011146324686706066, - "learning_rate": 1.2631578947368422e-06, - "loss": 0.0012, - "step": 739 - }, - { - "epoch": 33.59090909090909, - "eval_loss": 0.0008207445498555899, - "eval_runtime": 0.2477, - "eval_samples_per_second": 355.277, - "eval_steps_per_second": 44.41, - "step": 739 - }, - { - "epoch": 33.63636363636363, - "grad_norm": 0.011300037615001202, - "learning_rate": 1.223684210526316e-06, - "loss": 0.0013, - "step": 740 - }, - { - "epoch": 33.63636363636363, - "eval_loss": 0.0008206011261790991, - "eval_runtime": 0.2524, - "eval_samples_per_second": 348.598, - "eval_steps_per_second": 43.575, - "step": 740 - }, - { - "epoch": 33.68181818181818, - "grad_norm": 0.013210857287049294, - "learning_rate": 1.1842105263157894e-06, - "loss": 0.0013, - "step": 741 - }, - { - "epoch": 33.68181818181818, - "eval_loss": 0.0008204494952224195, - "eval_runtime": 0.2769, - "eval_samples_per_second": 317.777, - "eval_steps_per_second": 39.722, - "step": 741 - }, - { - "epoch": 33.72727272727273, - "grad_norm": 0.011201176792383194, - "learning_rate": 1.1447368421052632e-06, - "loss": 0.0013, - "step": 742 - }, - { - "epoch": 33.72727272727273, - "eval_loss": 0.0008203128236345947, - "eval_runtime": 0.259, - "eval_samples_per_second": 339.749, - "eval_steps_per_second": 42.469, - "step": 742 - }, - { - "epoch": 33.77272727272727, - "grad_norm": 0.012550720945000648, - "learning_rate": 1.1052631578947367e-06, - "loss": 0.0013, - "step": 743 - }, - { - "epoch": 33.77272727272727, - "eval_loss": 0.0008201680611819029, - "eval_runtime": 0.2549, - "eval_samples_per_second": 345.3, - "eval_steps_per_second": 43.163, - "step": 743 - }, - { - "epoch": 33.81818181818182, - "grad_norm": 0.011524029076099396, - "learning_rate": 1.0657894736842106e-06, - "loss": 0.0014, - "step": 744 - }, - { - "epoch": 33.81818181818182, - "eval_loss": 0.0008200569427572191, - "eval_runtime": 0.2576, - "eval_samples_per_second": 341.575, - "eval_steps_per_second": 42.697, - "step": 744 - }, - { - "epoch": 33.86363636363637, - "grad_norm": 0.014999749138951302, - "learning_rate": 1.0263157894736843e-06, - "loss": 0.0015, - "step": 745 - }, - { - "epoch": 33.86363636363637, - "eval_loss": 0.0008199459407478571, - "eval_runtime": 0.2573, - "eval_samples_per_second": 342.038, - "eval_steps_per_second": 42.755, - "step": 745 - }, - { - "epoch": 33.90909090909091, - "grad_norm": 0.013432620093226433, - "learning_rate": 9.868421052631579e-07, - "loss": 0.0014, - "step": 746 - }, - { - "epoch": 33.90909090909091, - "eval_loss": 0.0008198119467124343, - "eval_runtime": 0.2794, - "eval_samples_per_second": 314.936, - "eval_steps_per_second": 39.367, - "step": 746 - }, - { - "epoch": 33.95454545454545, - "grad_norm": 0.011333504691720009, - "learning_rate": 9.473684210526316e-07, - "loss": 0.0014, - "step": 747 - }, - { - "epoch": 33.95454545454545, - "eval_loss": 0.0008196914568543434, - "eval_runtime": 0.2549, - "eval_samples_per_second": 345.205, - "eval_steps_per_second": 43.151, - "step": 747 - }, - { - "epoch": 34.0, - "grad_norm": 0.0102554215118289, - "learning_rate": 9.078947368421053e-07, - "loss": 0.0012, - "step": 748 - }, - { - "epoch": 34.0, - "eval_loss": 0.0008195764967240393, - "eval_runtime": 0.2656, - "eval_samples_per_second": 331.349, - "eval_steps_per_second": 41.419, - "step": 748 - }, - { - "epoch": 34.04545454545455, - "grad_norm": 0.011500447988510132, - "learning_rate": 8.684210526315789e-07, - "loss": 0.0013, - "step": 749 - }, - { - "epoch": 34.04545454545455, - "eval_loss": 0.0008194709080271423, - "eval_runtime": 0.2631, - "eval_samples_per_second": 334.415, - "eval_steps_per_second": 41.802, - "step": 749 - }, - { - "epoch": 34.09090909090909, - "grad_norm": 0.011614636518061161, - "learning_rate": 8.289473684210527e-07, - "loss": 0.0014, - "step": 750 - }, - { - "epoch": 34.09090909090909, - "eval_loss": 0.0008193707326427102, - "eval_runtime": 0.2898, - "eval_samples_per_second": 303.667, - "eval_steps_per_second": 37.958, - "step": 750 - }, - { - "epoch": 34.13636363636363, - "grad_norm": 0.010696956887841225, - "learning_rate": 7.894736842105263e-07, - "loss": 0.0013, - "step": 751 - }, - { - "epoch": 34.13636363636363, - "eval_loss": 0.0008192665409296751, - "eval_runtime": 0.2328, - "eval_samples_per_second": 378.006, - "eval_steps_per_second": 47.251, - "step": 751 - }, - { - "epoch": 34.18181818181818, - "grad_norm": 0.011633389629423618, - "learning_rate": 7.5e-07, - "loss": 0.0014, - "step": 752 - }, - { - "epoch": 34.18181818181818, - "eval_loss": 0.0008191689848899841, - "eval_runtime": 0.2379, - "eval_samples_per_second": 369.833, - "eval_steps_per_second": 46.229, - "step": 752 - }, - { - "epoch": 34.22727272727273, - "grad_norm": 0.013071279041469097, - "learning_rate": 7.105263157894737e-07, - "loss": 0.0013, - "step": 753 - }, - { - "epoch": 34.22727272727273, - "eval_loss": 0.0008190743392333388, - "eval_runtime": 0.2373, - "eval_samples_per_second": 370.812, - "eval_steps_per_second": 46.351, - "step": 753 - }, - { - "epoch": 34.27272727272727, - "grad_norm": 0.011386328376829624, - "learning_rate": 6.710526315789474e-07, - "loss": 0.0013, - "step": 754 - }, - { - "epoch": 34.27272727272727, - "eval_loss": 0.0008190052467398345, - "eval_runtime": 0.2237, - "eval_samples_per_second": 393.447, - "eval_steps_per_second": 49.181, - "step": 754 - }, - { - "epoch": 34.31818181818182, - "grad_norm": 0.011327208951115608, - "learning_rate": 6.315789473684211e-07, - "loss": 0.0013, - "step": 755 - }, - { - "epoch": 34.31818181818182, - "eval_loss": 0.0008189321961253881, - "eval_runtime": 0.2293, - "eval_samples_per_second": 383.737, - "eval_steps_per_second": 47.967, - "step": 755 - }, - { - "epoch": 34.36363636363637, - "grad_norm": 0.011524545960128307, - "learning_rate": 5.921052631578947e-07, - "loss": 0.0014, - "step": 756 - }, - { - "epoch": 34.36363636363637, - "eval_loss": 0.0008188713109120727, - "eval_runtime": 0.2355, - "eval_samples_per_second": 373.643, - "eval_steps_per_second": 46.705, - "step": 756 - }, - { - "epoch": 34.40909090909091, - "grad_norm": 0.012313243001699448, - "learning_rate": 5.526315789473683e-07, - "loss": 0.0014, - "step": 757 - }, - { - "epoch": 34.40909090909091, - "eval_loss": 0.0008188103674910963, - "eval_runtime": 0.4518, - "eval_samples_per_second": 194.767, - "eval_steps_per_second": 24.346, - "step": 757 - }, - { - "epoch": 34.45454545454545, - "grad_norm": 0.012687238864600658, - "learning_rate": 5.131578947368422e-07, - "loss": 0.0014, - "step": 758 - }, - { - "epoch": 34.45454545454545, - "eval_loss": 0.0008187480852939188, - "eval_runtime": 0.2941, - "eval_samples_per_second": 299.267, - "eval_steps_per_second": 37.408, - "step": 758 - }, - { - "epoch": 34.5, - "grad_norm": 0.012826275080442429, - "learning_rate": 4.736842105263158e-07, - "loss": 0.0013, - "step": 759 - }, - { - "epoch": 34.5, - "eval_loss": 0.0008186926716007292, - "eval_runtime": 0.3991, - "eval_samples_per_second": 220.5, - "eval_steps_per_second": 27.562, - "step": 759 - }, - { - "epoch": 34.54545454545455, - "grad_norm": 0.012961960397660732, - "learning_rate": 4.3421052631578947e-07, - "loss": 0.0015, - "step": 760 - }, - { - "epoch": 34.54545454545455, - "eval_loss": 0.0008186465711332858, - "eval_runtime": 0.2484, - "eval_samples_per_second": 354.263, - "eval_steps_per_second": 44.283, - "step": 760 - }, - { - "epoch": 34.59090909090909, - "grad_norm": 0.013269671238958836, - "learning_rate": 3.9473684210526315e-07, - "loss": 0.0014, - "step": 761 - }, - { - "epoch": 34.59090909090909, - "eval_loss": 0.0008186018676497042, - "eval_runtime": 0.291, - "eval_samples_per_second": 302.416, - "eval_steps_per_second": 37.802, - "step": 761 - }, - { - "epoch": 34.63636363636363, - "grad_norm": 0.012951558455824852, - "learning_rate": 3.5526315789473687e-07, - "loss": 0.0013, - "step": 762 - }, - { - "epoch": 34.63636363636363, - "eval_loss": 0.0008185504120774567, - "eval_runtime": 0.2594, - "eval_samples_per_second": 339.213, - "eval_steps_per_second": 42.402, - "step": 762 - }, - { - "epoch": 34.68181818181818, - "grad_norm": 0.01040305569767952, - "learning_rate": 3.1578947368421055e-07, - "loss": 0.0013, - "step": 763 - }, - { - "epoch": 34.68181818181818, - "eval_loss": 0.0008185274782590568, - "eval_runtime": 0.3222, - "eval_samples_per_second": 273.081, - "eval_steps_per_second": 34.135, - "step": 763 - }, - { - "epoch": 34.72727272727273, - "grad_norm": 0.013104148209095001, - "learning_rate": 2.7631578947368417e-07, - "loss": 0.0014, - "step": 764 - }, - { - "epoch": 34.72727272727273, - "eval_loss": 0.0008184895268641412, - "eval_runtime": 0.3124, - "eval_samples_per_second": 281.65, - "eval_steps_per_second": 35.206, - "step": 764 - }, - { - "epoch": 34.77272727272727, - "grad_norm": 0.012136269360780716, - "learning_rate": 2.368421052631579e-07, - "loss": 0.0014, - "step": 765 - }, - { - "epoch": 34.77272727272727, - "eval_loss": 0.0008184570469893515, - "eval_runtime": 0.2394, - "eval_samples_per_second": 367.55, - "eval_steps_per_second": 45.944, - "step": 765 - }, - { - "epoch": 34.81818181818182, - "grad_norm": 0.011621113866567612, - "learning_rate": 1.9736842105263157e-07, - "loss": 0.0014, - "step": 766 - }, - { - "epoch": 34.81818181818182, - "eval_loss": 0.0008184341131709516, - "eval_runtime": 0.2627, - "eval_samples_per_second": 334.967, - "eval_steps_per_second": 41.871, - "step": 766 - }, - { - "epoch": 34.86363636363637, - "grad_norm": 0.0140585508197546, - "learning_rate": 1.5789473684210527e-07, - "loss": 0.0014, - "step": 767 - }, - { - "epoch": 34.86363636363637, - "eval_loss": 0.0008184110629372299, - "eval_runtime": 0.2624, - "eval_samples_per_second": 335.359, - "eval_steps_per_second": 41.92, - "step": 767 - }, - { - "epoch": 34.90909090909091, - "grad_norm": 0.0137332146987319, - "learning_rate": 1.1842105263157895e-07, - "loss": 0.0013, - "step": 768 - }, - { - "epoch": 34.90909090909091, - "eval_loss": 0.000818394822999835, - "eval_runtime": 0.3455, - "eval_samples_per_second": 254.714, - "eval_steps_per_second": 31.839, - "step": 768 - }, - { - "epoch": 34.95454545454545, - "grad_norm": 0.013574851676821709, - "learning_rate": 7.894736842105264e-08, - "loss": 0.0015, - "step": 769 - }, - { - "epoch": 34.95454545454545, - "eval_loss": 0.000818385393358767, - "eval_runtime": 0.4011, - "eval_samples_per_second": 219.413, - "eval_steps_per_second": 27.427, - "step": 769 - }, - { - "epoch": 35.0, - "grad_norm": 0.01393211167305708, - "learning_rate": 3.947368421052632e-08, - "loss": 0.0014, - "step": 770 - }, - { - "epoch": 35.0, - "eval_loss": 0.0008183813188225031, - "eval_runtime": 0.2776, - "eval_samples_per_second": 316.984, - "eval_steps_per_second": 39.623, - "step": 770 - } - ], - "logging_steps": 1, - "max_steps": 770, - "num_input_tokens_seen": 0, - "num_train_epochs": 35, - "save_steps": 50, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 335566894333440.0, - "train_batch_size": 4, - "trial_name": null, - "trial_params": null -}